GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/powerpc/aim/mmu_oea.c
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause AND BSD-4-Clause
3
*
4
* Copyright (c) 2001 The NetBSD Foundation, Inc.
5
* All rights reserved.
6
*
7
* This code is derived from software contributed to The NetBSD Foundation
8
* by Matt Thomas <[email protected]> of Allegro Networks, Inc.
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
*
19
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
* POSSIBILITY OF SUCH DAMAGE.
30
*/
31
/*-
32
* Copyright (C) 1995, 1996 Wolfgang Solfrank.
33
* Copyright (C) 1995, 1996 TooLs GmbH.
34
* All rights reserved.
35
*
36
* Redistribution and use in source and binary forms, with or without
37
* modification, are permitted provided that the following conditions
38
* are met:
39
* 1. Redistributions of source code must retain the above copyright
40
* notice, this list of conditions and the following disclaimer.
41
* 2. Redistributions in binary form must reproduce the above copyright
42
* notice, this list of conditions and the following disclaimer in the
43
* documentation and/or other materials provided with the distribution.
44
* 3. All advertising materials mentioning features or use of this software
45
* must display the following acknowledgement:
46
* This product includes software developed by TooLs GmbH.
47
* 4. The name of TooLs GmbH may not be used to endorse or promote products
48
* derived from this software without specific prior written permission.
49
*
50
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
51
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
52
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
53
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
54
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
55
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
56
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
57
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
58
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
59
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60
*
61
* $NetBSD: pmap.c,v 1.28 2000/03/26 20:42:36 kleink Exp $
62
*/
63
/*-
64
* Copyright (C) 2001 Benno Rice.
65
* All rights reserved.
66
*
67
* Redistribution and use in source and binary forms, with or without
68
* modification, are permitted provided that the following conditions
69
* are met:
70
* 1. Redistributions of source code must retain the above copyright
71
* notice, this list of conditions and the following disclaimer.
72
* 2. Redistributions in binary form must reproduce the above copyright
73
* notice, this list of conditions and the following disclaimer in the
74
* documentation and/or other materials provided with the distribution.
75
*
76
* THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
77
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
78
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
79
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
80
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
81
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
82
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
83
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
84
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
85
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
86
*/
87
88
#include <sys/cdefs.h>
89
/*
90
* Manages physical address maps.
91
*
92
* Since the information managed by this module is also stored by the
93
* logical address mapping module, this module may throw away valid virtual
94
* to physical mappings at almost any time. However, invalidations of
95
* mappings must be done as requested.
96
*
97
* In order to cope with hardware architectures which make virtual to
98
* physical map invalidates expensive, this module may delay invalidate
99
* reduced protection operations until such time as they are actually
100
* necessary. This module is given full information as to which processors
101
* are currently using which maps, and to when physical maps must be made
102
* correct.
103
*/
104
105
#include "opt_kstack_pages.h"
106
107
#include <sys/param.h>
108
#include <sys/kernel.h>
109
#include <sys/conf.h>
110
#include <sys/queue.h>
111
#include <sys/cpuset.h>
112
#include <sys/kerneldump.h>
113
#include <sys/ktr.h>
114
#include <sys/lock.h>
115
#include <sys/mman.h>
116
#include <sys/msgbuf.h>
117
#include <sys/mutex.h>
118
#include <sys/proc.h>
119
#include <sys/rwlock.h>
120
#include <sys/sched.h>
121
#include <sys/sysctl.h>
122
#include <sys/systm.h>
123
#include <sys/vmmeter.h>
124
125
#include <dev/ofw/openfirm.h>
126
127
#include <vm/vm.h>
128
#include <vm/pmap.h>
129
#include <vm/vm_param.h>
130
#include <vm/vm_kern.h>
131
#include <vm/vm_page.h>
132
#include <vm/vm_map.h>
133
#include <vm/vm_object.h>
134
#include <vm/vm_extern.h>
136
#include <vm/vm_phys.h>
137
#include <vm/vm_pageout.h>
138
#include <vm/vm_radix.h>
139
#include <vm/uma.h>
140
141
#include <machine/cpu.h>
142
#include <machine/platform.h>
143
#include <machine/bat.h>
144
#include <machine/frame.h>
145
#include <machine/md_var.h>
146
#include <machine/psl.h>
147
#include <machine/pte.h>
148
#include <machine/smp.h>
149
#include <machine/sr.h>
150
#include <machine/mmuvar.h>
151
#include <machine/trap.h>
152
153
#define MOEA_DEBUG
154
155
#define TODO panic("%s: not implemented", __func__);
156
157
#define VSID_MAKE(sr, hash) ((sr) | (((hash) & 0xfffff) << 4))
158
#define VSID_TO_SR(vsid) ((vsid) & 0xf)
159
#define VSID_TO_HASH(vsid) (((vsid) >> 4) & 0xfffff)
160
161
/* Get physical address from PVO. */
162
#define PVO_PADDR(pvo) ((pvo)->pvo_pte.pte.pte_lo & PTE_RPGN)
163
164
struct ofw_map {
165
vm_offset_t om_va;
166
vm_size_t om_len;
167
vm_offset_t om_pa;
168
u_int om_mode;
169
};
170
171
extern unsigned char _etext[];
172
extern unsigned char _end[];
173
174
/*
175
* Map of physical memory regions.
176
*/
177
static struct mem_region *regions;
178
static struct mem_region *pregions;
179
static u_int phys_avail_count;
180
static int regions_sz, pregions_sz;
181
static struct ofw_map *translations;
182
183
/*
184
* Lock for the pteg and pvo tables.
185
*/
186
struct mtx moea_table_mutex;
187
struct mtx moea_vsid_mutex;
188
189
/* tlbie instruction synchronization */
190
static struct mtx tlbie_mtx;
191
192
/*
193
* PTEG data.
194
*/
195
static struct pteg *moea_pteg_table;
196
u_int moea_pteg_count;
197
u_int moea_pteg_mask;
198
199
/*
200
* PVO data.
201
*/
202
struct pvo_head *moea_pvo_table; /* pvo entries by pteg index */
203
struct pvo_head moea_pvo_kunmanaged =
204
LIST_HEAD_INITIALIZER(moea_pvo_kunmanaged); /* list of unmanaged pages */
205
206
static struct rwlock_padalign pvh_global_lock;
207
208
uma_zone_t moea_upvo_zone; /* zone for pvo entries for unmanaged pages */
209
uma_zone_t moea_mpvo_zone; /* zone for pvo entries for managed pages */
210
211
#define BPVO_POOL_SIZE 32768
212
static struct pvo_entry *moea_bpvo_pool;
213
static int moea_bpvo_pool_index = 0;
214
215
#define VSID_NBPW (sizeof(u_int32_t) * 8)
216
static u_int moea_vsid_bitmap[NPMAPS / VSID_NBPW];
217
218
static bool moea_initialized = false;
219
220
/*
221
* Statistics.
222
*/
223
u_int moea_pte_valid = 0;
224
u_int moea_pte_overflow = 0;
225
u_int moea_pte_replacements = 0;
226
u_int moea_pvo_entries = 0;
227
u_int moea_pvo_enter_calls = 0;
228
u_int moea_pvo_remove_calls = 0;
229
u_int moea_pte_spills = 0;
230
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_valid, CTLFLAG_RD, &moea_pte_valid,
231
0, "");
232
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_overflow, CTLFLAG_RD,
233
&moea_pte_overflow, 0, "");
234
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_replacements, CTLFLAG_RD,
235
&moea_pte_replacements, 0, "");
236
SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_entries, CTLFLAG_RD, &moea_pvo_entries,
237
0, "");
238
SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_enter_calls, CTLFLAG_RD,
239
&moea_pvo_enter_calls, 0, "");
240
SYSCTL_INT(_machdep, OID_AUTO, moea_pvo_remove_calls, CTLFLAG_RD,
241
&moea_pvo_remove_calls, 0, "");
242
SYSCTL_INT(_machdep, OID_AUTO, moea_pte_spills, CTLFLAG_RD,
243
&moea_pte_spills, 0, "");
244
245
/*
246
* Allocate physical memory for use in moea_bootstrap.
247
*/
248
static vm_offset_t moea_bootstrap_alloc(vm_size_t, u_int);
249
250
/*
251
* PTE calls.
252
*/
253
static int moea_pte_insert(u_int, struct pte *);
254
255
/*
256
* PVO calls.
257
*/
258
static int moea_pvo_enter(pmap_t, uma_zone_t, struct pvo_head *,
259
vm_offset_t, vm_paddr_t, u_int, int);
260
static void moea_pvo_remove(struct pvo_entry *, int);
261
static struct pvo_entry *moea_pvo_find_va(pmap_t, vm_offset_t, int *);
262
static struct pte *moea_pvo_to_pte(const struct pvo_entry *, int);
263
264
/*
265
* Utility routines.
266
*/
267
static int moea_enter_locked(pmap_t, vm_offset_t, vm_page_t,
268
vm_prot_t, u_int, int8_t);
269
static void moea_syncicache(vm_paddr_t, vm_size_t);
270
static bool moea_query_bit(vm_page_t, int);
271
static u_int moea_clear_bit(vm_page_t, int);
272
static void moea_kremove(vm_offset_t);
273
int moea_pte_spill(vm_offset_t);
274
275
/*
276
* Kernel MMU interface
277
*/
278
void moea_clear_modify(vm_page_t);
279
void moea_copy_page(vm_page_t, vm_page_t);
280
void moea_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
281
vm_page_t *mb, vm_offset_t b_offset, int xfersize);
282
int moea_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int,
283
int8_t);
284
void moea_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
285
vm_prot_t);
286
void moea_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
287
vm_paddr_t moea_extract(pmap_t, vm_offset_t);
288
vm_page_t moea_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
289
void moea_init(void);
290
bool moea_is_modified(vm_page_t);
291
bool moea_is_prefaultable(pmap_t, vm_offset_t);
292
bool moea_is_referenced(vm_page_t);
293
int moea_ts_referenced(vm_page_t);
294
vm_offset_t moea_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
295
static int moea_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
296
bool moea_page_exists_quick(pmap_t, vm_page_t);
297
void moea_page_init(vm_page_t);
298
int moea_page_wired_mappings(vm_page_t);
299
int moea_pinit(pmap_t);
300
void moea_pinit0(pmap_t);
301
void moea_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
302
void moea_qenter(vm_offset_t, vm_page_t *, int);
303
void moea_qremove(vm_offset_t, int);
304
void moea_release(pmap_t);
305
void moea_remove(pmap_t, vm_offset_t, vm_offset_t);
306
void moea_remove_all(vm_page_t);
307
void moea_remove_write(vm_page_t);
308
void moea_unwire(pmap_t, vm_offset_t, vm_offset_t);
309
void moea_zero_page(vm_page_t);
310
void moea_zero_page_area(vm_page_t, int, int);
311
void moea_activate(struct thread *);
312
void moea_deactivate(struct thread *);
313
void moea_cpu_bootstrap(int);
314
void moea_bootstrap(vm_offset_t, vm_offset_t);
315
void *moea_mapdev(vm_paddr_t, vm_size_t);
316
void *moea_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
317
void moea_unmapdev(void *, vm_size_t);
318
vm_paddr_t moea_kextract(vm_offset_t);
319
void moea_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t);
320
void moea_kenter(vm_offset_t, vm_paddr_t);
321
void moea_page_set_memattr(vm_page_t m, vm_memattr_t ma);
322
int moea_dev_direct_mapped(vm_paddr_t, vm_size_t);
323
static void moea_sync_icache(pmap_t, vm_offset_t, vm_size_t);
324
void moea_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
325
void moea_scan_init(void);
326
vm_offset_t moea_quick_enter_page(vm_page_t m);
327
void moea_quick_remove_page(vm_offset_t addr);
328
bool moea_page_is_mapped(vm_page_t m);
329
bool moea_ps_enabled(pmap_t pmap);
330
static int moea_map_user_ptr(pmap_t pm,
331
volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen);
332
static int moea_decode_kernel_ptr(vm_offset_t addr,
333
int *is_user, vm_offset_t *decoded_addr);
334
335
static struct pmap_funcs moea_methods = {
336
.clear_modify = moea_clear_modify,
337
.copy_page = moea_copy_page,
338
.copy_pages = moea_copy_pages,
339
.enter = moea_enter,
340
.enter_object = moea_enter_object,
341
.enter_quick = moea_enter_quick,
342
.extract = moea_extract,
343
.extract_and_hold = moea_extract_and_hold,
344
.init = moea_init,
345
.is_modified = moea_is_modified,
346
.is_prefaultable = moea_is_prefaultable,
347
.is_referenced = moea_is_referenced,
348
.ts_referenced = moea_ts_referenced,
349
.map = moea_map,
350
.page_exists_quick = moea_page_exists_quick,
351
.page_init = moea_page_init,
352
.page_wired_mappings = moea_page_wired_mappings,
353
.pinit = moea_pinit,
354
.pinit0 = moea_pinit0,
355
.protect = moea_protect,
356
.qenter = moea_qenter,
357
.qremove = moea_qremove,
358
.release = moea_release,
359
.remove = moea_remove,
360
.remove_all = moea_remove_all,
361
.mincore = moea_mincore,
362
.remove_write = moea_remove_write,
363
.sync_icache = moea_sync_icache,
364
.unwire = moea_unwire,
365
.zero_page = moea_zero_page,
366
.zero_page_area = moea_zero_page_area,
367
.activate = moea_activate,
368
.deactivate = moea_deactivate,
369
.page_set_memattr = moea_page_set_memattr,
370
.quick_enter_page = moea_quick_enter_page,
371
.quick_remove_page = moea_quick_remove_page,
372
.page_is_mapped = moea_page_is_mapped,
373
.ps_enabled = moea_ps_enabled,
374
375
/* Internal interfaces */
376
.bootstrap = moea_bootstrap,
377
.cpu_bootstrap = moea_cpu_bootstrap,
378
.mapdev_attr = moea_mapdev_attr,
379
.mapdev = moea_mapdev,
380
.unmapdev = moea_unmapdev,
381
.kextract = moea_kextract,
382
.kenter = moea_kenter,
383
.kenter_attr = moea_kenter_attr,
384
.dev_direct_mapped = moea_dev_direct_mapped,
385
.dumpsys_pa_init = moea_scan_init,
386
.dumpsys_map_chunk = moea_dumpsys_map,
387
.map_user_ptr = moea_map_user_ptr,
388
.decode_kernel_ptr = moea_decode_kernel_ptr,
389
};
390
391
MMU_DEF(oea_mmu, MMU_TYPE_OEA, moea_methods);
392
393
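/*
* Compute the WIMG (cache control) bits of PTE_lo for a mapping of the
* given physical address with the requested memory attribute.  Addresses
* outside the known physical memory regions default to cache-inhibited
* and guarded.
*/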
static __inline uint32_t
394
moea_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
395
{
396
uint32_t pte_lo;
397
int i;
398
399
if (ma != VM_MEMATTR_DEFAULT) {
400
switch (ma) {
401
case VM_MEMATTR_UNCACHEABLE:
402
return (PTE_I | PTE_G);
403
case VM_MEMATTR_CACHEABLE:
404
return (PTE_M);
405
case VM_MEMATTR_WRITE_COMBINING:
406
case VM_MEMATTR_WRITE_BACK:
407
case VM_MEMATTR_PREFETCHABLE:
408
return (PTE_I);
409
case VM_MEMATTR_WRITE_THROUGH:
410
return (PTE_W | PTE_M);
411
}
412
}
413
414
/*
415
* Assume the page is cache inhibited and access is guarded unless
416
* it's in our available memory array.
417
*/
418
pte_lo = PTE_I | PTE_G;
419
for (i = 0; i < pregions_sz; i++) {
420
if ((pa >= pregions[i].mr_start) &&
421
(pa < (pregions[i].mr_start + pregions[i].mr_size))) {
422
pte_lo = PTE_M;
423
break;
424
}
425
}
426
427
return pte_lo;
428
}
429
430
/*
431
* Translate OFW translations into VM attributes.
432
*/
433
static __inline vm_memattr_t
434
moea_bootstrap_convert_wimg(uint32_t mode)
435
{
436
437
switch (mode) {
438
case (PTE_I | PTE_G):
439
/* PCI device memory */
440
return VM_MEMATTR_UNCACHEABLE;
441
case (PTE_M):
442
/* Explicitly coherent */
443
return VM_MEMATTR_CACHEABLE;
444
case 0: /* Default claim */
445
case 2: /* Alternate PP bits set by OF for the original payload */
446
/* "Normal" memory. */
447
return VM_MEMATTR_DEFAULT;
448
449
default:
450
/* Err on the side of caution for unknowns */
451
/* XXX should we panic instead? */
452
return VM_MEMATTR_UNCACHEABLE;
453
}
454
}
455
456
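/*
* Invalidate the TLB entry for a single virtual address.  tlbie_mtx
* serializes the tlbie/tlbsync sequence across CPUs.
*/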
static void
457
tlbie(vm_offset_t va)
458
{
459
460
mtx_lock_spin(&tlbie_mtx);
461
__asm __volatile("ptesync");
462
__asm __volatile("tlbie %0" :: "r"(va));
463
__asm __volatile("eieio; tlbsync; ptesync");
464
mtx_unlock_spin(&tlbie_mtx);
465
}
466
467
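/*
* Invalidate the entire TLB by issuing a tlbie for each page in the
* first 256KB of the address space.
*/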
static void
468
tlbia(void)
469
{
470
vm_offset_t va;
471
472
for (va = 0; va < 0x00040000; va += 0x00001000) {
473
__asm __volatile("tlbie %0" :: "r"(va));
474
powerpc_sync();
475
}
476
__asm __volatile("tlbsync");
477
powerpc_sync();
478
}
479
480
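/* Return the segment register contents used to translate the given VA. */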
static __inline int
481
va_to_sr(u_int *sr, vm_offset_t va)
482
{
483
return (sr[(uintptr_t)va >> ADDR_SR_SHFT]);
484
}
485
486
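/* Compute the primary PTEG index for a (segment register, VA) pair. */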
static __inline u_int
487
va_to_pteg(u_int sr, vm_offset_t addr)
488
{
489
u_int hash;
490
491
hash = (sr & SR_VSID_MASK) ^ (((u_int)addr & ADDR_PIDX) >>
492
ADDR_PIDX_SHFT);
493
return (hash & moea_pteg_mask);
494
}
495
496
static __inline struct pvo_head *
497
vm_page_to_pvoh(vm_page_t m)
498
{
499
500
return (&m->md.mdpg_pvoh);
501
}
502
503
static __inline void
504
moea_attr_clear(vm_page_t m, int ptebit)
505
{
506
507
rw_assert(&pvh_global_lock, RA_WLOCKED);
508
m->md.mdpg_attrs &= ~ptebit;
509
}
510
511
static __inline int
512
moea_attr_fetch(vm_page_t m)
513
{
514
515
return (m->md.mdpg_attrs);
516
}
517
518
static __inline void
519
moea_attr_save(vm_page_t m, int ptebit)
520
{
521
522
rw_assert(&pvh_global_lock, RA_WLOCKED);
523
m->md.mdpg_attrs |= ptebit;
524
}
525
526
static __inline int
527
moea_pte_compare(const struct pte *pt, const struct pte *pvo_pt)
528
{
529
if (pt->pte_hi == pvo_pt->pte_hi)
530
return (1);
531
532
return (0);
533
}
534
535
static __inline int
536
moea_pte_match(struct pte *pt, u_int sr, vm_offset_t va, int which)
537
{
538
return (pt->pte_hi & ~PTE_VALID) ==
539
(((sr & SR_VSID_MASK) << PTE_VSID_SHFT) |
540
((va >> ADDR_API_SHFT) & PTE_API) | which);
541
}
542
543
static __inline void
544
moea_pte_create(struct pte *pt, u_int sr, vm_offset_t va, u_int pte_lo)
545
{
546
547
mtx_assert(&moea_table_mutex, MA_OWNED);
548
549
/*
550
* Construct a PTE. Default to IMB initially. Valid bit only gets
551
* set when the real pte is set in memory.
552
*
553
* Note: Don't set the valid bit for correct operation of tlb update.
554
*/
555
pt->pte_hi = ((sr & SR_VSID_MASK) << PTE_VSID_SHFT) |
556
(((va & ADDR_PIDX) >> ADDR_API_SHFT) & PTE_API);
557
pt->pte_lo = pte_lo;
558
}
559
560
static __inline void
561
moea_pte_synch(struct pte *pt, struct pte *pvo_pt)
562
{
563
564
mtx_assert(&moea_table_mutex, MA_OWNED);
565
pvo_pt->pte_lo |= pt->pte_lo & (PTE_REF | PTE_CHG);
566
}
567
568
static __inline void
569
moea_pte_clear(struct pte *pt, vm_offset_t va, int ptebit)
570
{
571
572
mtx_assert(&moea_table_mutex, MA_OWNED);
573
574
/*
575
* As shown in Section 7.6.3.2.3
576
*/
577
pt->pte_lo &= ~ptebit;
578
tlbie(va);
579
}
580
581
static __inline void
582
moea_pte_set(struct pte *pt, struct pte *pvo_pt)
583
{
584
585
mtx_assert(&moea_table_mutex, MA_OWNED);
586
pvo_pt->pte_hi |= PTE_VALID;
587
588
/*
589
* Update the PTE as defined in section 7.6.3.1.
590
* Note that the REF/CHG bits are from pvo_pt and thus should have
591
* been saved so this routine can restore them (if desired).
592
*/
593
pt->pte_lo = pvo_pt->pte_lo;
594
powerpc_sync();
595
pt->pte_hi = pvo_pt->pte_hi;
596
powerpc_sync();
597
moea_pte_valid++;
598
}
599
600
static __inline void
601
moea_pte_unset(struct pte *pt, struct pte *pvo_pt, vm_offset_t va)
602
{
603
604
mtx_assert(&moea_table_mutex, MA_OWNED);
605
pvo_pt->pte_hi &= ~PTE_VALID;
606
607
/*
608
* Force the reg & chg bits back into the PTEs.
609
*/
610
powerpc_sync();
611
612
/*
613
* Invalidate the pte.
614
*/
615
pt->pte_hi &= ~PTE_VALID;
616
617
tlbie(va);
618
619
/*
620
* Save the reg & chg bits.
621
*/
622
moea_pte_synch(pt, pvo_pt);
623
moea_pte_valid--;
624
}
625
626
static __inline void
627
moea_pte_change(struct pte *pt, struct pte *pvo_pt, vm_offset_t va)
628
{
629
630
/*
631
* Invalidate the PTE
632
*/
633
moea_pte_unset(pt, pvo_pt, va);
634
moea_pte_set(pt, pvo_pt);
635
}
636
637
/*
638
* Quick sort callout for comparing memory regions.
639
*/
640
static int om_cmp(const void *a, const void *b);
641
642
static int
643
om_cmp(const void *a, const void *b)
644
{
645
const struct ofw_map *mapa;
646
const struct ofw_map *mapb;
647
648
mapa = a;
649
mapb = b;
650
if (mapa->om_pa < mapb->om_pa)
651
return (-1);
652
else if (mapa->om_pa > mapb->om_pa)
653
return (1);
654
else
655
return (0);
656
}
657
658
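/*
* Per-CPU MMU setup: program the BAT and segment registers, point SDR1
* at the PTEG table, and flush the TLB.  Called with ap != 0 when
* bringing up a secondary processor.
*/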
void
659
moea_cpu_bootstrap(int ap)
660
{
661
u_int sdr;
662
int i;
663
664
if (ap) {
665
powerpc_sync();
666
__asm __volatile("mtdbatu 0,%0" :: "r"(battable[0].batu));
667
__asm __volatile("mtdbatl 0,%0" :: "r"(battable[0].batl));
668
isync();
669
__asm __volatile("mtibatu 0,%0" :: "r"(battable[0].batu));
670
__asm __volatile("mtibatl 0,%0" :: "r"(battable[0].batl));
671
isync();
672
}
673
674
__asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu));
675
__asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl));
676
isync();
677
678
__asm __volatile("mtibatu 1,%0" :: "r"(0));
679
__asm __volatile("mtdbatu 2,%0" :: "r"(0));
680
__asm __volatile("mtibatu 2,%0" :: "r"(0));
681
__asm __volatile("mtdbatu 3,%0" :: "r"(0));
682
__asm __volatile("mtibatu 3,%0" :: "r"(0));
683
isync();
684
685
for (i = 0; i < 16; i++)
686
mtsrin(i << ADDR_SR_SHFT, kernel_pmap->pm_sr[i]);
687
powerpc_sync();
688
689
sdr = (u_int)moea_pteg_table | (moea_pteg_mask >> 10);
690
__asm __volatile("mtsdr1 %0" :: "r"(sdr));
691
isync();
692
693
tlbia();
694
}
695
696
void
697
moea_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
698
{
699
ihandle_t mmui;
700
phandle_t chosen, mmu;
701
int sz;
702
int i, j;
703
vm_size_t size, physsz, hwphyssz;
704
vm_offset_t pa, va, off;
705
void *dpcpu;
706
707
/*
708
* Map PCI memory space.
709
*/
710
battable[0x8].batl = BATL(0x80000000, BAT_I|BAT_G, BAT_PP_RW);
711
battable[0x8].batu = BATU(0x80000000, BAT_BL_256M, BAT_Vs);
712
713
battable[0x9].batl = BATL(0x90000000, BAT_I|BAT_G, BAT_PP_RW);
714
battable[0x9].batu = BATU(0x90000000, BAT_BL_256M, BAT_Vs);
715
716
battable[0xa].batl = BATL(0xa0000000, BAT_I|BAT_G, BAT_PP_RW);
717
battable[0xa].batu = BATU(0xa0000000, BAT_BL_256M, BAT_Vs);
718
719
battable[0xb].batl = BATL(0xb0000000, BAT_I|BAT_G, BAT_PP_RW);
720
battable[0xb].batu = BATU(0xb0000000, BAT_BL_256M, BAT_Vs);
721
722
powerpc_sync();
723
724
/* map pci space */
725
__asm __volatile("mtdbatu 1,%0" :: "r"(battable[8].batu));
726
__asm __volatile("mtdbatl 1,%0" :: "r"(battable[8].batl));
727
isync();
728
729
/* set global direct map flag */
730
hw_direct_map = 1;
731
732
mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
733
CTR0(KTR_PMAP, "moea_bootstrap: physical memory");
734
735
for (i = 0; i < pregions_sz; i++) {
736
vm_offset_t pa;
737
vm_offset_t end;
738
739
CTR3(KTR_PMAP, "physregion: %#x - %#x (%#x)",
740
pregions[i].mr_start,
741
pregions[i].mr_start + pregions[i].mr_size,
742
pregions[i].mr_size);
743
/*
744
* Install entries into the BAT table to allow all
745
* of physmem to be covered by on-demand BAT entries.
746
* The loop will sometimes set the same battable element
747
* twice, but that's fine since they won't be used for
748
* a while yet.
749
*/
750
pa = pregions[i].mr_start & 0xf0000000;
751
end = pregions[i].mr_start + pregions[i].mr_size;
752
do {
753
u_int n = pa >> ADDR_SR_SHFT;
754
755
battable[n].batl = BATL(pa, BAT_M, BAT_PP_RW);
756
battable[n].batu = BATU(pa, BAT_BL_256M, BAT_Vs);
757
pa += SEGMENT_LENGTH;
758
} while (pa < end);
759
}
760
761
if (PHYS_AVAIL_ENTRIES < regions_sz)
762
panic("moea_bootstrap: phys_avail too small");
763
764
phys_avail_count = 0;
765
physsz = 0;
766
hwphyssz = 0;
767
TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
768
for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
769
CTR3(KTR_PMAP, "region: %#x - %#x (%#x)", regions[i].mr_start,
770
regions[i].mr_start + regions[i].mr_size,
771
regions[i].mr_size);
772
if (hwphyssz != 0 &&
773
(physsz + regions[i].mr_size) >= hwphyssz) {
774
if (physsz < hwphyssz) {
775
phys_avail[j] = regions[i].mr_start;
776
phys_avail[j + 1] = regions[i].mr_start +
777
hwphyssz - physsz;
778
physsz = hwphyssz;
779
phys_avail_count++;
780
}
781
break;
782
}
783
phys_avail[j] = regions[i].mr_start;
784
phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
785
phys_avail_count++;
786
physsz += regions[i].mr_size;
787
}
788
789
/* Check for overlap with the kernel and exception vectors */
790
for (j = 0; j < 2*phys_avail_count; j+=2) {
791
if (phys_avail[j] < EXC_LAST)
792
phys_avail[j] += EXC_LAST;
793
794
if (kernelstart >= phys_avail[j] &&
795
kernelstart < phys_avail[j+1]) {
796
if (kernelend < phys_avail[j+1]) {
797
phys_avail[2*phys_avail_count] =
798
(kernelend & ~PAGE_MASK) + PAGE_SIZE;
799
phys_avail[2*phys_avail_count + 1] =
800
phys_avail[j+1];
801
phys_avail_count++;
802
}
803
804
phys_avail[j+1] = kernelstart & ~PAGE_MASK;
805
}
806
807
if (kernelend >= phys_avail[j] &&
808
kernelend < phys_avail[j+1]) {
809
if (kernelstart > phys_avail[j]) {
810
phys_avail[2*phys_avail_count] = phys_avail[j];
811
phys_avail[2*phys_avail_count + 1] =
812
kernelstart & ~PAGE_MASK;
813
phys_avail_count++;
814
}
815
816
phys_avail[j] = (kernelend & ~PAGE_MASK) + PAGE_SIZE;
817
}
818
}
819
820
physmem = btoc(physsz);
821
822
/*
823
* Allocate PTEG table.
824
*/
825
#ifdef PTEGCOUNT
826
moea_pteg_count = PTEGCOUNT;
827
#else
828
moea_pteg_count = 0x1000;
829
830
while (moea_pteg_count < physmem)
831
moea_pteg_count <<= 1;
832
833
moea_pteg_count >>= 1;
834
#endif /* PTEGCOUNT */
835
836
size = moea_pteg_count * sizeof(struct pteg);
837
CTR2(KTR_PMAP, "moea_bootstrap: %d PTEGs, %d bytes", moea_pteg_count,
838
size);
839
moea_pteg_table = (struct pteg *)moea_bootstrap_alloc(size, size);
840
CTR1(KTR_PMAP, "moea_bootstrap: PTEG table at %p", moea_pteg_table);
841
bzero((void *)moea_pteg_table, moea_pteg_count * sizeof(struct pteg));
842
moea_pteg_mask = moea_pteg_count - 1;
843
844
/*
845
* Allocate pv/overflow lists.
846
*/
847
size = sizeof(struct pvo_head) * moea_pteg_count;
848
moea_pvo_table = (struct pvo_head *)moea_bootstrap_alloc(size,
849
PAGE_SIZE);
850
CTR1(KTR_PMAP, "moea_bootstrap: PVO table at %p", moea_pvo_table);
851
for (i = 0; i < moea_pteg_count; i++)
852
LIST_INIT(&moea_pvo_table[i]);
853
854
/*
855
* Initialize the lock that synchronizes access to the pteg and pvo
856
* tables.
857
*/
858
mtx_init(&moea_table_mutex, "pmap table", NULL, MTX_DEF |
859
MTX_RECURSE);
860
mtx_init(&moea_vsid_mutex, "VSID table", NULL, MTX_DEF);
861
862
mtx_init(&tlbie_mtx, "tlbie", NULL, MTX_SPIN);
863
864
/*
865
* Initialise the unmanaged pvo pool.
866
*/
867
moea_bpvo_pool = (struct pvo_entry *)moea_bootstrap_alloc(
868
BPVO_POOL_SIZE*sizeof(struct pvo_entry), 0);
869
moea_bpvo_pool_index = 0;
870
871
/*
872
* Make sure kernel vsid is allocated as well as VSID 0.
873
*/
874
moea_vsid_bitmap[(KERNEL_VSIDBITS & (NPMAPS - 1)) / VSID_NBPW]
875
|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
876
moea_vsid_bitmap[0] |= 1;
877
878
/*
879
* Initialize the kernel pmap (which is statically allocated).
880
*/
881
PMAP_LOCK_INIT(kernel_pmap);
882
for (i = 0; i < 16; i++)
883
kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
884
CPU_FILL(&kernel_pmap->pm_active);
885
RB_INIT(&kernel_pmap->pmap_pvo);
886
887
/*
888
* Initialize the global pv list lock.
889
*/
890
rw_init(&pvh_global_lock, "pmap pv global");
891
892
/*
893
* Set up the Open Firmware mappings
894
*/
895
chosen = OF_finddevice("/chosen");
896
if (chosen != -1 && OF_getprop(chosen, "mmu", &mmui, 4) != -1 &&
897
(mmu = OF_instance_to_package(mmui)) != -1 &&
898
(sz = OF_getproplen(mmu, "translations")) != -1) {
899
translations = NULL;
900
for (i = 0; phys_avail[i] != 0; i += 2) {
901
if (phys_avail[i + 1] >= sz) {
902
translations = (struct ofw_map *)phys_avail[i];
903
break;
904
}
905
}
906
if (translations == NULL)
907
panic("moea_bootstrap: no space to copy translations");
908
bzero(translations, sz);
909
if (OF_getprop(mmu, "translations", translations, sz) == -1)
910
panic("moea_bootstrap: can't get ofw translations");
911
CTR0(KTR_PMAP, "moea_bootstrap: translations");
912
sz /= sizeof(*translations);
913
qsort(translations, sz, sizeof (*translations), om_cmp);
914
for (i = 0; i < sz; i++) {
915
CTR3(KTR_PMAP, "translation: pa=%#x va=%#x len=%#x",
916
translations[i].om_pa, translations[i].om_va,
917
translations[i].om_len);
918
919
/*
920
* If the mapping is 1:1, let the RAM and device
921
* on-demand BAT tables take care of the translation.
922
*
923
* However, always enter mappings for segment 16,
924
* which is mixed-protection and therefore not
925
* compatible with a BAT entry.
926
*/
927
if ((translations[i].om_va >> ADDR_SR_SHFT) != 0xf &&
928
translations[i].om_va == translations[i].om_pa)
929
continue;
930
931
/* Enter the pages */
932
for (off = 0; off < translations[i].om_len;
933
off += PAGE_SIZE)
934
moea_kenter_attr(translations[i].om_va + off,
935
translations[i].om_pa + off,
936
moea_bootstrap_convert_wimg(translations[i].om_mode));
937
}
938
}
939
940
/*
941
* Calculate the last available physical address.
942
*/
943
for (i = 0; phys_avail[i + 2] != 0; i += 2)
944
;
945
Maxmem = powerpc_btop(phys_avail[i + 1]);
946
947
moea_cpu_bootstrap(0);
948
mtmsr(mfmsr() | PSL_DR | PSL_IR);
949
pmap_bootstrapped++;
950
951
/*
952
* Set the start and end of kva.
953
*/
954
virtual_avail = VM_MIN_KERNEL_ADDRESS;
955
virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
956
957
/*
958
* Allocate a kernel stack with a guard page for thread0 and map it
959
* into the kernel page map.
960
*/
961
pa = moea_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
962
va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
963
virtual_avail = va + kstack_pages * PAGE_SIZE;
964
CTR2(KTR_PMAP, "moea_bootstrap: kstack0 at %#x (%#x)", pa, va);
965
thread0.td_kstack = va;
966
thread0.td_kstack_pages = kstack_pages;
967
for (i = 0; i < kstack_pages; i++) {
968
moea_kenter(va, pa);
969
pa += PAGE_SIZE;
970
va += PAGE_SIZE;
971
}
972
973
/*
974
* Allocate virtual address space for the message buffer.
975
*/
976
pa = msgbuf_phys = moea_bootstrap_alloc(msgbufsize, PAGE_SIZE);
977
msgbufp = (struct msgbuf *)virtual_avail;
978
va = virtual_avail;
979
virtual_avail += round_page(msgbufsize);
980
while (va < virtual_avail) {
981
moea_kenter(va, pa);
982
pa += PAGE_SIZE;
983
va += PAGE_SIZE;
984
}
985
986
/*
987
* Allocate virtual address space for the dynamic percpu area.
988
*/
989
pa = moea_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
990
dpcpu = (void *)virtual_avail;
991
va = virtual_avail;
992
virtual_avail += DPCPU_SIZE;
993
while (va < virtual_avail) {
994
moea_kenter(va, pa);
995
pa += PAGE_SIZE;
996
va += PAGE_SIZE;
997
}
998
dpcpu_init(dpcpu, 0);
999
}
1000
1001
/*
1002
* Activate a user pmap. The pmap must be activated before its address
1003
* space can be accessed in any way.
1004
*/
1005
void
1006
moea_activate(struct thread *td)
1007
{
1008
pmap_t pm, pmr;
1009
1010
/*
1011
* Load all the data we need up front to encourage the compiler to
1012
* not issue any loads while we have interrupts disabled below.
1013
*/
1014
pm = &td->td_proc->p_vmspace->vm_pmap;
1015
pmr = pm->pmap_phys;
1016
1017
CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
1018
PCPU_SET(curpmap, pmr);
1019
1020
mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
1021
}
1022
1023
void
1024
moea_deactivate(struct thread *td)
1025
{
1026
pmap_t pm;
1027
1028
pm = &td->td_proc->p_vmspace->vm_pmap;
1029
CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1030
PCPU_SET(curpmap, NULL);
1031
}
1032
1033
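/* Clear the wired attribute from every mapping in the given range. */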
void
1034
moea_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1035
{
1036
struct pvo_entry key, *pvo;
1037
1038
PMAP_LOCK(pm);
1039
key.pvo_vaddr = sva;
1040
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1041
pvo != NULL && PVO_VADDR(pvo) < eva;
1042
pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
1043
if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1044
panic("moea_unwire: pvo %p is missing PVO_WIRED", pvo);
1045
pvo->pvo_vaddr &= ~PVO_WIRED;
1046
pm->pm_stats.wired_count--;
1047
}
1048
PMAP_UNLOCK(pm);
1049
}
1050
1051
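/* Copy a page of physical memory using the 1:1 direct map. */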
void
1052
moea_copy_page(vm_page_t msrc, vm_page_t mdst)
1053
{
1054
vm_offset_t dst;
1055
vm_offset_t src;
1056
1057
dst = VM_PAGE_TO_PHYS(mdst);
1058
src = VM_PAGE_TO_PHYS(msrc);
1059
1060
bcopy((void *)src, (void *)dst, PAGE_SIZE);
1061
}
1062
1063
void
1064
moea_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
1065
vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1066
{
1067
void *a_cp, *b_cp;
1068
vm_offset_t a_pg_offset, b_pg_offset;
1069
int cnt;
1070
1071
while (xfersize > 0) {
1072
a_pg_offset = a_offset & PAGE_MASK;
1073
cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1074
a_cp = (char *)VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]) +
1075
a_pg_offset;
1076
b_pg_offset = b_offset & PAGE_MASK;
1077
cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1078
b_cp = (char *)VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]) +
1079
b_pg_offset;
1080
bcopy(a_cp, b_cp, cnt);
1081
a_offset += cnt;
1082
b_offset += cnt;
1083
xfersize -= cnt;
1084
}
1085
}
1086
1087
/*
1088
* Zero a page of physical memory, one cache line at a time, via the direct map.
1089
*/
1090
void
1091
moea_zero_page(vm_page_t m)
1092
{
1093
vm_offset_t off, pa = VM_PAGE_TO_PHYS(m);
1094
1095
for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1096
__asm __volatile("dcbz 0,%0" :: "r"(pa + off));
1097
}
1098
1099
void
1100
moea_zero_page_area(vm_page_t m, int off, int size)
1101
{
1102
vm_offset_t pa = VM_PAGE_TO_PHYS(m);
1103
void *va = (void *)(pa + off);
1104
1105
bzero(va, size);
1106
}
1107
1108
vm_offset_t
1109
moea_quick_enter_page(vm_page_t m)
1110
{
1111
1112
return (VM_PAGE_TO_PHYS(m));
1113
}
1114
1115
void
1116
moea_quick_remove_page(vm_offset_t addr)
1117
{
1118
}
1119
1120
bool
1121
moea_page_is_mapped(vm_page_t m)
1122
{
1123
return (!LIST_EMPTY(&(m)->md.mdpg_pvoh));
1124
}
1125
1126
bool
1127
moea_ps_enabled(pmap_t pmap __unused)
1128
{
1129
return (false);
1130
}
1131
1132
/*
1133
* Map the given physical page at the specified virtual address in the
1134
* target pmap with the protection requested. If specified the page
1135
* will be wired down.
1136
*/
1137
int
1138
moea_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1139
u_int flags, int8_t psind)
1140
{
1141
int error;
1142
1143
for (;;) {
1144
rw_wlock(&pvh_global_lock);
1145
PMAP_LOCK(pmap);
1146
error = moea_enter_locked(pmap, va, m, prot, flags, psind);
1147
rw_wunlock(&pvh_global_lock);
1148
PMAP_UNLOCK(pmap);
1149
if (error != ENOMEM)
1150
return (KERN_SUCCESS);
1151
if ((flags & PMAP_ENTER_NOSLEEP) != 0)
1152
return (KERN_RESOURCE_SHORTAGE);
1153
VM_OBJECT_ASSERT_UNLOCKED(m->object);
1154
vm_wait(NULL);
1155
}
1156
}
1157
1158
/*
1159
* Map the given physical page at the specified virtual address in the
1160
* target pmap with the protection requested. If specified the page
1161
* will be wired down.
1162
*
1163
* The global pvh and pmap must be locked.
1164
*/
1165
static int
1166
moea_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1167
u_int flags, int8_t psind __unused)
1168
{
1169
struct pvo_head *pvo_head;
1170
uma_zone_t zone;
1171
u_int pte_lo, pvo_flags;
1172
int error;
1173
1174
if (pmap_bootstrapped)
1175
rw_assert(&pvh_global_lock, RA_WLOCKED);
1176
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1177
if ((m->oflags & VPO_UNMANAGED) == 0) {
1178
if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0)
1179
VM_PAGE_OBJECT_BUSY_ASSERT(m);
1180
else
1181
VM_OBJECT_ASSERT_LOCKED(m->object);
1182
}
1183
1184
if ((m->oflags & VPO_UNMANAGED) != 0 || !moea_initialized) {
1185
pvo_head = &moea_pvo_kunmanaged;
1186
zone = moea_upvo_zone;
1187
pvo_flags = 0;
1188
} else {
1189
pvo_head = vm_page_to_pvoh(m);
1190
zone = moea_mpvo_zone;
1191
pvo_flags = PVO_MANAGED;
1192
}
1193
1194
pte_lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m));
1195
1196
if (prot & VM_PROT_WRITE) {
1197
pte_lo |= PTE_BW;
1198
if (pmap_bootstrapped &&
1199
(m->oflags & VPO_UNMANAGED) == 0)
1200
vm_page_aflag_set(m, PGA_WRITEABLE);
1201
} else
1202
pte_lo |= PTE_BR;
1203
1204
if ((flags & PMAP_ENTER_WIRED) != 0)
1205
pvo_flags |= PVO_WIRED;
1206
1207
error = moea_pvo_enter(pmap, zone, pvo_head, va, VM_PAGE_TO_PHYS(m),
1208
pte_lo, pvo_flags);
1209
1210
/*
1211
* Flush the real page from the instruction cache. This has to be done
1212
* for all user mappings to prevent information leakage via the
1213
* instruction cache. moea_pvo_enter() returns ENOENT for the first
1214
* mapping for a page.
1215
*/
1216
if (pmap != kernel_pmap && error == ENOENT &&
1217
(pte_lo & (PTE_I | PTE_G)) == 0)
1218
moea_syncicache(VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1219
1220
return (error);
1221
}
1222
1223
/*
1224
* Maps a sequence of resident pages belonging to the same object.
1225
* The sequence begins with the given page m_start. This page is
1226
* mapped at the given virtual address start. Each subsequent page is
1227
* mapped at a virtual address that is offset from start by the same
1228
* amount as the page is offset from m_start within the object. The
1229
* last page in the sequence is the page with the largest offset from
1230
* m_start that can be mapped at a virtual address less than the given
1231
* virtual address end. Not every virtual page between start and end
1232
* is mapped; only those for which a resident page exists with the
1233
* corresponding offset from m_start are mapped.
1234
*/
1235
void
1236
moea_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
1237
vm_page_t m_start, vm_prot_t prot)
1238
{
1239
struct pctrie_iter pages;
1240
vm_offset_t va;
1241
vm_page_t m;
1242
1243
VM_OBJECT_ASSERT_LOCKED(m_start->object);
1244
1245
vm_page_iter_limit_init(&pages, m_start->object,
1246
m_start->pindex + atop(end - start));
1247
m = vm_radix_iter_lookup(&pages, m_start->pindex);
1248
rw_wlock(&pvh_global_lock);
1249
PMAP_LOCK(pm);
1250
while (m != NULL) {
1251
va = start + ptoa(m->pindex - m_start->pindex);
1252
moea_enter_locked(pm, va, m, prot &
1253
(VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_QUICK_LOCKED,
1254
0);
1255
m = vm_radix_iter_step(&pages);
1256
}
1257
rw_wunlock(&pvh_global_lock);
1258
PMAP_UNLOCK(pm);
1259
}
1260
1261
void
1262
moea_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m,
1263
vm_prot_t prot)
1264
{
1265
1266
rw_wlock(&pvh_global_lock);
1267
PMAP_LOCK(pm);
1268
moea_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1269
PMAP_ENTER_QUICK_LOCKED, 0);
1270
rw_wunlock(&pvh_global_lock);
1271
PMAP_UNLOCK(pm);
1272
}
1273
1274
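/*
* Return the physical address backing the given virtual address in the
* given pmap, or 0 if the address is not mapped.
*/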
vm_paddr_t
1275
moea_extract(pmap_t pm, vm_offset_t va)
1276
{
1277
struct pvo_entry *pvo;
1278
vm_paddr_t pa;
1279
1280
PMAP_LOCK(pm);
1281
pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
1282
if (pvo == NULL)
1283
pa = 0;
1284
else
1285
pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
1286
PMAP_UNLOCK(pm);
1287
return (pa);
1288
}
1289
1290
/*
1291
* Atomically extract and hold the physical page with the given
1292
* pmap and virtual address pair if that mapping permits the given
1293
* protection.
1294
*/
1295
vm_page_t
1296
moea_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1297
{
1298
struct pvo_entry *pvo;
1299
vm_page_t m;
1300
1301
m = NULL;
1302
PMAP_LOCK(pmap);
1303
pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL);
1304
if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID) &&
1305
((pvo->pvo_pte.pte.pte_lo & PTE_PP) == PTE_RW ||
1306
(prot & VM_PROT_WRITE) == 0)) {
1307
m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1308
if (!vm_page_wire_mapped(m))
1309
m = NULL;
1310
}
1311
PMAP_UNLOCK(pmap);
1312
return (m);
1313
}
1314
1315
void
1316
moea_init(void)
1317
{
1318
1319
moea_upvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1320
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1321
UMA_ZONE_VM | UMA_ZONE_NOFREE);
1322
moea_mpvo_zone = uma_zcreate("MPVO entry", sizeof(struct pvo_entry),
1323
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1324
UMA_ZONE_VM | UMA_ZONE_NOFREE);
1325
moea_initialized = true;
1326
}
1327
1328
bool
1329
moea_is_referenced(vm_page_t m)
1330
{
1331
bool rv;
1332
1333
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1334
("moea_is_referenced: page %p is not managed", m));
1335
rw_wlock(&pvh_global_lock);
1336
rv = moea_query_bit(m, PTE_REF);
1337
rw_wunlock(&pvh_global_lock);
1338
return (rv);
1339
}
1340
1341
bool
1342
moea_is_modified(vm_page_t m)
1343
{
1344
bool rv;
1345
1346
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1347
("moea_is_modified: page %p is not managed", m));
1348
1349
/*
1350
* If the page is not busied then this check is racy.
1351
*/
1352
if (!pmap_page_is_write_mapped(m))
1353
return (false);
1354
1355
rw_wlock(&pvh_global_lock);
1356
rv = moea_query_bit(m, PTE_CHG);
1357
rw_wunlock(&pvh_global_lock);
1358
return (rv);
1359
}
1360
1361
bool
1362
moea_is_prefaultable(pmap_t pmap, vm_offset_t va)
1363
{
1364
struct pvo_entry *pvo;
1365
bool rv;
1366
1367
PMAP_LOCK(pmap);
1368
pvo = moea_pvo_find_va(pmap, va & ~ADDR_POFF, NULL);
1369
rv = pvo == NULL || (pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0;
1370
PMAP_UNLOCK(pmap);
1371
return (rv);
1372
}
1373
1374
void
1375
moea_clear_modify(vm_page_t m)
1376
{
1377
1378
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1379
("moea_clear_modify: page %p is not managed", m));
1380
vm_page_assert_busied(m);
1381
1382
if (!pmap_page_is_write_mapped(m))
1383
return;
1384
rw_wlock(&pvh_global_lock);
1385
moea_clear_bit(m, PTE_CHG);
1386
rw_wunlock(&pvh_global_lock);
1387
}
1388
1389
/*
1390
* Clear the write and modified bits in each of the given page's mappings.
1391
*/
1392
void
1393
moea_remove_write(vm_page_t m)
1394
{
1395
struct pvo_entry *pvo;
1396
struct pte *pt;
1397
pmap_t pmap;
1398
u_int lo;
1399
1400
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1401
("moea_remove_write: page %p is not managed", m));
1402
vm_page_assert_busied(m);
1403
1404
if (!pmap_page_is_write_mapped(m))
1405
return;
1406
rw_wlock(&pvh_global_lock);
1407
lo = moea_attr_fetch(m);
1408
powerpc_sync();
1409
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1410
pmap = pvo->pvo_pmap;
1411
PMAP_LOCK(pmap);
1412
if ((pvo->pvo_pte.pte.pte_lo & PTE_PP) != PTE_BR) {
1413
pt = moea_pvo_to_pte(pvo, -1);
1414
pvo->pvo_pte.pte.pte_lo &= ~PTE_PP;
1415
pvo->pvo_pte.pte.pte_lo |= PTE_BR;
1416
if (pt != NULL) {
1417
moea_pte_synch(pt, &pvo->pvo_pte.pte);
1418
lo |= pvo->pvo_pte.pte.pte_lo;
1419
pvo->pvo_pte.pte.pte_lo &= ~PTE_CHG;
1420
moea_pte_change(pt, &pvo->pvo_pte.pte,
1421
pvo->pvo_vaddr);
1422
mtx_unlock(&moea_table_mutex);
1423
}
1424
}
1425
PMAP_UNLOCK(pmap);
1426
}
1427
if ((lo & PTE_CHG) != 0) {
1428
moea_attr_clear(m, PTE_CHG);
1429
vm_page_dirty(m);
1430
}
1431
vm_page_aflag_clear(m, PGA_WRITEABLE);
1432
rw_wunlock(&pvh_global_lock);
1433
}
1434
1435
/*
1436
* moea_ts_referenced:
1437
*
1438
* Return a count of reference bits for a page, clearing those bits.
1439
* It is not necessary for every reference bit to be cleared, but it
1440
* is necessary that 0 only be returned when there are truly no
1441
* reference bits set.
1442
*
1443
* XXX: The exact number of bits to check and clear is a matter that
1444
* should be tested and standardized at some point in the future for
1445
* optimal aging of shared pages.
1446
*/
1447
int
1448
moea_ts_referenced(vm_page_t m)
1449
{
1450
int count;
1451
1452
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1453
("moea_ts_referenced: page %p is not managed", m));
1454
rw_wlock(&pvh_global_lock);
1455
count = moea_clear_bit(m, PTE_REF);
1456
rw_wunlock(&pvh_global_lock);
1457
return (count);
1458
}
1459
1460
/*
1461
* Modify the WIMG settings of all mappings for a page.
1462
*/
1463
void
1464
moea_page_set_memattr(vm_page_t m, vm_memattr_t ma)
1465
{
1466
struct pvo_entry *pvo;
1467
struct pvo_head *pvo_head;
1468
struct pte *pt;
1469
pmap_t pmap;
1470
u_int lo;
1471
1472
if (m->md.mdpg_cache_attrs == ma)
1473
return;
1474
1475
if ((m->oflags & VPO_UNMANAGED) != 0) {
1476
m->md.mdpg_cache_attrs = ma;
1477
return;
1478
}
1479
1480
rw_wlock(&pvh_global_lock);
1481
pvo_head = vm_page_to_pvoh(m);
1482
lo = moea_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
1483
1484
LIST_FOREACH(pvo, pvo_head, pvo_vlink) {
1485
pmap = pvo->pvo_pmap;
1486
PMAP_LOCK(pmap);
1487
pt = moea_pvo_to_pte(pvo, -1);
1488
pvo->pvo_pte.pte.pte_lo &= ~PTE_WIMG;
1489
pvo->pvo_pte.pte.pte_lo |= lo;
1490
if (pt != NULL) {
1491
moea_pte_change(pt, &pvo->pvo_pte.pte,
1492
pvo->pvo_vaddr);
1493
if (pvo->pvo_pmap == kernel_pmap)
1494
isync();
1495
}
1496
mtx_unlock(&moea_table_mutex);
1497
PMAP_UNLOCK(pmap);
1498
}
1499
m->md.mdpg_cache_attrs = ma;
1500
rw_wunlock(&pvh_global_lock);
1501
}
1502
1503
/*
1504
* Map a wired page into kernel virtual address space.
1505
*/
1506
void
1507
moea_kenter(vm_offset_t va, vm_paddr_t pa)
1508
{
1509
1510
moea_kenter_attr(va, pa, VM_MEMATTR_DEFAULT);
1511
}
1512
1513
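/*
* As moea_kenter(), but with an explicit memory attribute for the
* mapping's WIMG bits.
*/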
void
1514
moea_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
1515
{
1516
u_int pte_lo;
1517
int error;
1518
1519
#if 0
1520
if (va < VM_MIN_KERNEL_ADDRESS)
1521
panic("moea_kenter: attempt to enter non-kernel address %#x",
1522
va);
1523
#endif
1524
1525
pte_lo = moea_calc_wimg(pa, ma);
1526
1527
PMAP_LOCK(kernel_pmap);
1528
error = moea_pvo_enter(kernel_pmap, moea_upvo_zone,
1529
&moea_pvo_kunmanaged, va, pa, pte_lo, PVO_WIRED);
1530
1531
if (error != 0 && error != ENOENT)
1532
panic("moea_kenter: failed to enter va %#x pa %#x: %d", va,
1533
pa, error);
1534
1535
PMAP_UNLOCK(kernel_pmap);
1536
}
1537
1538
/*
1539
* Extract the physical page address associated with the given kernel virtual
1540
* address.
1541
*/
1542
vm_paddr_t
1543
moea_kextract(vm_offset_t va)
1544
{
1545
struct pvo_entry *pvo;
1546
vm_paddr_t pa;
1547
1548
/*
1549
* Allow direct mappings on 32-bit OEA
1550
*/
1551
if (va < VM_MIN_KERNEL_ADDRESS) {
1552
return (va);
1553
}
1554
1555
PMAP_LOCK(kernel_pmap);
1556
pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL);
1557
KASSERT(pvo != NULL, ("moea_kextract: no addr found"));
1558
pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
1559
PMAP_UNLOCK(kernel_pmap);
1560
return (pa);
1561
}
1562
1563
/*
1564
* Remove a wired page from kernel virtual address space.
1565
*/
1566
void
1567
moea_kremove(vm_offset_t va)
1568
{
1569
1570
moea_remove(kernel_pmap, va, va + PAGE_SIZE);
1571
}
1572
1573
/*
1574
* Provide a kernel pointer corresponding to a given userland pointer.
1575
* The returned pointer is valid until the next time this function is
1576
* called in this thread. This is used internally in copyin/copyout.
1577
*/
1578
int
1579
moea_map_user_ptr(pmap_t pm, volatile const void *uaddr,
1580
void **kaddr, size_t ulen, size_t *klen)
1581
{
1582
size_t l;
1583
register_t vsid;
1584
1585
*kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK);
1586
l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr);
1587
if (l > ulen)
1588
l = ulen;
1589
if (klen)
1590
*klen = l;
1591
else if (l != ulen)
1592
return (EFAULT);
1593
1594
vsid = va_to_vsid(pm, (vm_offset_t)uaddr);
1595
1596
/* Mark segment no-execute */
1597
vsid |= SR_N;
1598
1599
/* If we have already set this VSID, we can just return */
1600
if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == vsid)
1601
return (0);
1602
1603
__asm __volatile("isync");
1604
curthread->td_pcb->pcb_cpu.aim.usr_segm =
1605
(uintptr_t)uaddr >> ADDR_SR_SHFT;
1606
curthread->td_pcb->pcb_cpu.aim.usr_vsid = vsid;
1607
__asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(vsid));
1608
1609
return (0);
1610
}
1611
1612
/*
1613
* Figure out where a given kernel pointer (usually in a fault) points
1614
* to from the VM's perspective, potentially remapping into userland's
1615
* address space.
1616
*/
1617
static int
1618
moea_decode_kernel_ptr(vm_offset_t addr, int *is_user,
1619
vm_offset_t *decoded_addr)
1620
{
1621
vm_offset_t user_sr;
1622
1623
if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) {
1624
user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm;
1625
addr &= ADDR_PIDX | ADDR_POFF;
1626
addr |= user_sr << ADDR_SR_SHFT;
1627
*decoded_addr = addr;
1628
*is_user = 1;
1629
} else {
1630
*decoded_addr = addr;
1631
*is_user = 0;
1632
}
1633
1634
return (0);
1635
}
1636
1637
/*
1638
* Map a range of physical addresses into kernel virtual address space.
1639
*
1640
* The value passed in *virt is a suggested virtual address for the mapping.
1641
* Architectures which can support a direct-mapped physical to virtual region
1642
* can return the appropriate address within that region, leaving '*virt'
1643
* unchanged. We cannot and therefore do not; *virt is updated with the
1644
* first usable address after the mapped region.
1645
*/
1646
vm_offset_t
1647
moea_map(vm_offset_t *virt, vm_paddr_t pa_start,
1648
vm_paddr_t pa_end, int prot)
1649
{
1650
vm_offset_t sva, va;
1651
1652
sva = *virt;
1653
va = sva;
1654
for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
1655
moea_kenter(va, pa_start);
1656
*virt = va;
1657
return (sva);
1658
}
1659
1660
/*
1661
* Returns true if the pmap's pv is one of the first
1662
* 16 pvs linked to from this page. This count may
1663
* be changed upwards or downwards in the future; it
1664
* is only necessary that true be returned for a small
1665
* subset of pmaps for proper page aging.
1666
*/
1667
bool
1668
moea_page_exists_quick(pmap_t pmap, vm_page_t m)
1669
{
1670
int loops;
1671
struct pvo_entry *pvo;
1672
bool rv;
1673
1674
KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1675
("moea_page_exists_quick: page %p is not managed", m));
1676
loops = 0;
1677
rv = false;
1678
rw_wlock(&pvh_global_lock);
1679
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1680
if (pvo->pvo_pmap == pmap) {
1681
rv = true;
1682
break;
1683
}
1684
if (++loops >= 16)
1685
break;
1686
}
1687
rw_wunlock(&pvh_global_lock);
1688
return (rv);
1689
}
1690
1691
void
1692
moea_page_init(vm_page_t m)
1693
{
1694
1695
m->md.mdpg_attrs = 0;
1696
m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
1697
LIST_INIT(&m->md.mdpg_pvoh);
1698
}
1699
1700
/*
1701
* Return the number of managed mappings to the given physical page
1702
* that are wired.
1703
*/
1704
int
1705
moea_page_wired_mappings(vm_page_t m)
1706
{
1707
struct pvo_entry *pvo;
1708
int count;
1709
1710
count = 0;
1711
if ((m->oflags & VPO_UNMANAGED) != 0)
1712
return (count);
1713
rw_wlock(&pvh_global_lock);
1714
LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
1715
if ((pvo->pvo_vaddr & PVO_WIRED) != 0)
1716
count++;
1717
rw_wunlock(&pvh_global_lock);
1718
return (count);
1719
}
1720
1721
static u_int moea_vsidcontext;
1722
1723
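/*
* Initialize a new pmap: allocate a VSID group for its sixteen segment
* registers and set up its empty PVO tree.
*/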
int
1724
moea_pinit(pmap_t pmap)
1725
{
1726
int i, mask;
1727
u_int entropy;
1728
1729
RB_INIT(&pmap->pmap_pvo);
1730
1731
entropy = 0;
1732
__asm __volatile("mftb %0" : "=r"(entropy));
1733
1734
if ((pmap->pmap_phys = (pmap_t)moea_kextract((vm_offset_t)pmap))
1735
== NULL) {
1736
pmap->pmap_phys = pmap;
1737
}
1738
1739
mtx_lock(&moea_vsid_mutex);
1740
/*
1741
* Allocate some segment registers for this pmap.
1742
*/
1743
for (i = 0; i < NPMAPS; i += VSID_NBPW) {
1744
u_int hash, n;
1745
1746
/*
1747
* Create a new value by multiplying by a prime and adding in
1748
* entropy from the timebase register. This is to make the
1749
* VSID more random so that the PT hash function collides
1750
* less often. (Note that the prime causes gcc to do shifts
1751
* instead of a multiply.)
1752
*/
1753
moea_vsidcontext = (moea_vsidcontext * 0x1105) + entropy;
1754
hash = moea_vsidcontext & (NPMAPS - 1);
1755
if (hash == 0) /* 0 is special, avoid it */
1756
continue;
1757
n = hash >> 5;
1758
mask = 1 << (hash & (VSID_NBPW - 1));
1759
hash = (moea_vsidcontext & 0xfffff);
1760
if (moea_vsid_bitmap[n] & mask) { /* collision? */
1761
/* anything free in this bucket? */
1762
if (moea_vsid_bitmap[n] == 0xffffffff) {
1763
entropy = (moea_vsidcontext >> 20);
1764
continue;
1765
}
1766
i = ffs(~moea_vsid_bitmap[n]) - 1;
1767
mask = 1 << i;
1768
hash &= rounddown2(0xfffff, VSID_NBPW);
1769
hash |= i;
1770
}
1771
KASSERT(!(moea_vsid_bitmap[n] & mask),
1772
("Allocating in-use VSID group %#x\n", hash));
1773
moea_vsid_bitmap[n] |= mask;
1774
for (i = 0; i < 16; i++)
1775
pmap->pm_sr[i] = VSID_MAKE(i, hash);
1776
mtx_unlock(&moea_vsid_mutex);
1777
return (1);
1778
}
1779
1780
mtx_unlock(&moea_vsid_mutex);
1781
panic("moea_pinit: out of segments");
1782
}
1783
1784
/*
1785
* Initialize the pmap associated with process 0.
1786
*/
1787
void
1788
moea_pinit0(pmap_t pm)
1789
{
1790
1791
PMAP_LOCK_INIT(pm);
1792
moea_pinit(pm);
1793
bzero(&pm->pm_stats, sizeof(pm->pm_stats));
1794
}
1795
1796
/*
1797
* Set the physical protection on the specified range of this map as requested.
1798
*/
1799
void
1800
moea_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
1801
vm_prot_t prot)
1802
{
1803
struct pvo_entry *pvo, *tpvo, key;
1804
struct pte *pt;
1805
1806
KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
1807
("moea_protect: non current pmap"));
1808
1809
if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1810
moea_remove(pm, sva, eva);
1811
return;
1812
}
1813
1814
rw_wlock(&pvh_global_lock);
1815
PMAP_LOCK(pm);
1816
key.pvo_vaddr = sva;
1817
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1818
pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
1819
tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
1820
1821
/*
1822
* Grab the PTE pointer before we diddle with the cached PTE
1823
* copy.
1824
*/
1825
pt = moea_pvo_to_pte(pvo, -1);
1826
/*
1827
* Change the protection of the page.
1828
*/
1829
pvo->pvo_pte.pte.pte_lo &= ~PTE_PP;
1830
pvo->pvo_pte.pte.pte_lo |= PTE_BR;
1831
1832
/*
1833
* If the PVO is in the page table, update that pte as well.
1834
*/
1835
if (pt != NULL) {
1836
moea_pte_change(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr);
1837
mtx_unlock(&moea_table_mutex);
1838
}
1839
}
1840
rw_wunlock(&pvh_global_lock);
1841
PMAP_UNLOCK(pm);
1842
}
1843
1844
/*
1845
* Map a list of wired pages into kernel virtual address space. This is
1846
* intended for temporary mappings which do not need page modification or
1847
* references recorded. Existing mappings in the region are overwritten.
1848
*/
1849
void
1850
moea_qenter(vm_offset_t sva, vm_page_t *m, int count)
1851
{
1852
vm_offset_t va;
1853
1854
va = sva;
1855
while (count-- > 0) {
1856
moea_kenter(va, VM_PAGE_TO_PHYS(*m));
1857
va += PAGE_SIZE;
1858
m++;
1859
}
1860
}
1861
1862
/*
1863
* Remove page mappings from kernel virtual address space. Intended for
1864
* temporary mappings entered by moea_qenter.
1865
*/
1866
void
1867
moea_qremove(vm_offset_t sva, int count)
1868
{
1869
vm_offset_t va;
1870
1871
va = sva;
1872
while (count-- > 0) {
1873
moea_kremove(va);
1874
va += PAGE_SIZE;
1875
}
1876
}
1877
1878
void
1879
moea_release(pmap_t pmap)
1880
{
1881
int idx, mask;
1882
1883
/*
1884
* Free segment register's VSID
1885
*/
1886
if (pmap->pm_sr[0] == 0)
1887
panic("moea_release");
1888
1889
mtx_lock(&moea_vsid_mutex);
1890
idx = VSID_TO_HASH(pmap->pm_sr[0]) & (NPMAPS-1);
1891
mask = 1 << (idx % VSID_NBPW);
1892
idx /= VSID_NBPW;
1893
moea_vsid_bitmap[idx] &= ~mask;
1894
mtx_unlock(&moea_vsid_mutex);
1895
}
1896
1897
/*
1898
* Remove the given range of addresses from the specified map.
1899
*/
1900
void
1901
moea_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1902
{
1903
struct pvo_entry *pvo, *tpvo, key;
1904
1905
rw_wlock(&pvh_global_lock);
1906
PMAP_LOCK(pm);
1907
key.pvo_vaddr = sva;
1908
for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1909
pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
1910
tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
1911
moea_pvo_remove(pvo, -1);
1912
}
1913
PMAP_UNLOCK(pm);
1914
rw_wunlock(&pvh_global_lock);
1915
}
1916
1917
/*
 * Remove physical page from all pmaps in which it resides. moea_pvo_remove()
 * will reflect changes in pte's back to the vm_page.
 */
void
moea_remove_all(vm_page_t m)
{
	struct pvo_head *pvo_head;
	struct pvo_entry *pvo, *next_pvo;
	pmap_t pmap;

	rw_wlock(&pvh_global_lock);
	pvo_head = vm_page_to_pvoh(m);
	for (pvo = LIST_FIRST(pvo_head); pvo != NULL; pvo = next_pvo) {
		next_pvo = LIST_NEXT(pvo, pvo_vlink);

		pmap = pvo->pvo_pmap;
		PMAP_LOCK(pmap);
		moea_pvo_remove(pvo, -1);
		PMAP_UNLOCK(pmap);
	}
	if ((m->a.flags & PGA_WRITEABLE) && moea_query_bit(m, PTE_CHG)) {
		moea_attr_clear(m, PTE_CHG);
		vm_page_dirty(m);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(&pvh_global_lock);
}

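/*
 * mincore(2) support: report whether the page mapped at the given virtual
 * address is resident, and whether it has been referenced or modified.
 */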
static int
moea_mincore(pmap_t pm, vm_offset_t va, vm_paddr_t *pap)
{
	struct pvo_entry *pvo;
	vm_paddr_t pa;
	vm_page_t m;
	int val;
	bool managed;

	PMAP_LOCK(pm);

	pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
	if (pvo != NULL) {
		pa = PVO_PADDR(pvo);
		m = PHYS_TO_VM_PAGE(pa);
		managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED;
		val = MINCORE_INCORE;
	} else {
		PMAP_UNLOCK(pm);
		return (0);
	}

	PMAP_UNLOCK(pm);

	if (m == NULL)
		return (0);

	if (managed) {
		if (moea_is_modified(m))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;

		if (moea_is_referenced(m))
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
	}

	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
	    managed) {
		*pap = pa;
	}

	return (val);
}

/*
 * Allocate a physical page of memory directly from the phys_avail map.
 * Can only be called from moea_bootstrap before avail start and end are
 * calculated.
 */
static vm_offset_t
moea_bootstrap_alloc(vm_size_t size, u_int align)
{
	vm_offset_t s, e;
	int i, j;

	size = round_page(size);
	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		if (align != 0)
			s = roundup2(phys_avail[i], align);
		else
			s = phys_avail[i];
		e = s + size;

		if (s < phys_avail[i] || e > phys_avail[i + 1])
			continue;

		if (s == phys_avail[i]) {
			phys_avail[i] += size;
		} else if (e == phys_avail[i + 1]) {
			phys_avail[i + 1] -= size;
		} else {
			for (j = phys_avail_count * 2; j > i; j -= 2) {
				phys_avail[j] = phys_avail[j - 2];
				phys_avail[j + 1] = phys_avail[j - 1];
			}

			phys_avail[i + 3] = phys_avail[i + 1];
			phys_avail[i + 1] = s;
			phys_avail[i + 2] = e;
			phys_avail_count++;
		}

		return (s);
	}
	panic("moea_bootstrap_alloc: could not allocate memory");
}

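/*
 * Flush the instruction cache for a range of physical memory.
 */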
static void
moea_syncicache(vm_paddr_t pa, vm_size_t len)
{
	__syncicache((void *)pa, len);
}

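/*
 * Enter a mapping in the PVO layer: allocate a PVO entry (from the UMA zone,
 * or from the bootstrap pool before the pmap module is initialized), link it
 * into the overflow table, the page's PV list and the pmap's tree, and insert
 * the corresponding PTE into the hardware page table.  Returns ENOENT if this
 * is the first mapping of the page, ENOMEM if no PVO could be allocated, and
 * 0 otherwise.
 */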
static int
moea_pvo_enter(pmap_t pm, uma_zone_t zone, struct pvo_head *pvo_head,
    vm_offset_t va, vm_paddr_t pa, u_int pte_lo, int flags)
{
	struct pvo_entry *pvo;
	u_int sr;
	int first;
	u_int ptegidx;
	int i;
	int bootstrap;

	moea_pvo_enter_calls++;
	first = 0;
	bootstrap = 0;

	/*
	 * Compute the PTE Group index.
	 */
	va &= ~ADDR_POFF;
	sr = va_to_sr(pm->pm_sr, va);
	ptegidx = va_to_pteg(sr, va);

	/*
	 * Remove any existing mapping for this page. Reuse the pvo entry if
	 * there is a mapping.
	 */
	mtx_lock(&moea_table_mutex);
	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
			if (PVO_PADDR(pvo) == pa &&
			    (pvo->pvo_pte.pte.pte_lo & PTE_PP) ==
			    (pte_lo & PTE_PP)) {
				/*
				 * The PTE is not changing. Instead, this may
				 * be a request to change the mapping's wired
				 * attribute.
				 */
				mtx_unlock(&moea_table_mutex);
				if ((flags & PVO_WIRED) != 0 &&
				    (pvo->pvo_vaddr & PVO_WIRED) == 0) {
					pvo->pvo_vaddr |= PVO_WIRED;
					pm->pm_stats.wired_count++;
				} else if ((flags & PVO_WIRED) == 0 &&
				    (pvo->pvo_vaddr & PVO_WIRED) != 0) {
					pvo->pvo_vaddr &= ~PVO_WIRED;
					pm->pm_stats.wired_count--;
				}
				return (0);
			}
			moea_pvo_remove(pvo, -1);
			break;
		}
	}

	/*
	 * If we aren't overwriting a mapping, try to allocate.
	 */
	if (moea_initialized) {
		pvo = uma_zalloc(zone, M_NOWAIT);
	} else {
		if (moea_bpvo_pool_index >= BPVO_POOL_SIZE) {
			panic("moea_enter: bpvo pool exhausted, %d, %d, %d",
			    moea_bpvo_pool_index, BPVO_POOL_SIZE,
			    BPVO_POOL_SIZE * sizeof(struct pvo_entry));
		}
		pvo = &moea_bpvo_pool[moea_bpvo_pool_index];
		moea_bpvo_pool_index++;
		bootstrap = 1;
	}

	if (pvo == NULL) {
		mtx_unlock(&moea_table_mutex);
		return (ENOMEM);
	}

	moea_pvo_entries++;
	pvo->pvo_vaddr = va;
	pvo->pvo_pmap = pm;
	LIST_INSERT_HEAD(&moea_pvo_table[ptegidx], pvo, pvo_olink);
	pvo->pvo_vaddr &= ~ADDR_POFF;
	if (flags & PVO_WIRED)
		pvo->pvo_vaddr |= PVO_WIRED;
	if (pvo_head != &moea_pvo_kunmanaged)
		pvo->pvo_vaddr |= PVO_MANAGED;
	if (bootstrap)
		pvo->pvo_vaddr |= PVO_BOOTSTRAP;

	moea_pte_create(&pvo->pvo_pte.pte, sr, va, pa | pte_lo);

	/*
	 * Add to pmap list
	 */
	RB_INSERT(pvo_tree, &pm->pmap_pvo, pvo);

	/*
	 * Remember if the list was empty and therefore will be the first
	 * item.
	 */
	if (LIST_FIRST(pvo_head) == NULL)
		first = 1;
	LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);

	if (pvo->pvo_vaddr & PVO_WIRED)
		pm->pm_stats.wired_count++;
	pm->pm_stats.resident_count++;

	i = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte);
	KASSERT(i < 8, ("Invalid PTE index"));
	if (i >= 0) {
		PVO_PTEGIDX_SET(pvo, i);
	} else {
		panic("moea_pvo_enter: overflow");
		moea_pte_overflow++;
	}
	mtx_unlock(&moea_table_mutex);

	return (first ? ENOENT : 0);
}

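/*
 * Tear down a PVO entry: invalidate its hardware PTE (saving the REF/CHG
 * bits), update the pmap statistics, unlink it from the PV and pmap lists,
 * and free it unless it came from the bootstrap pool.
 */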
static void
moea_pvo_remove(struct pvo_entry *pvo, int pteidx)
{
	struct pte *pt;

	/*
	 * If there is an active pte entry, we need to deactivate it (and
	 * save the ref & chg bits).
	 */
	pt = moea_pvo_to_pte(pvo, pteidx);
	if (pt != NULL) {
		moea_pte_unset(pt, &pvo->pvo_pte.pte, pvo->pvo_vaddr);
		mtx_unlock(&moea_table_mutex);
		PVO_PTEGIDX_CLR(pvo);
	} else {
		moea_pte_overflow--;
	}

	/*
	 * Update our statistics.
	 */
	pvo->pvo_pmap->pm_stats.resident_count--;
	if (pvo->pvo_vaddr & PVO_WIRED)
		pvo->pvo_pmap->pm_stats.wired_count--;

	/*
	 * Remove this PVO from the PV and pmap lists.
	 */
	LIST_REMOVE(pvo, pvo_vlink);
	RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);

	/*
	 * Save the REF/CHG bits into their cache if the page is managed.
	 * Clear PGA_WRITEABLE if all mappings of the page have been removed.
	 */
	if ((pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED) {
		struct vm_page *pg;

		pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
		if (pg != NULL) {
			moea_attr_save(pg, pvo->pvo_pte.pte.pte_lo &
			    (PTE_REF | PTE_CHG));
			if (LIST_EMPTY(&pg->md.mdpg_pvoh))
				vm_page_aflag_clear(pg, PGA_WRITEABLE);
		}
	}

	/*
	 * Remove this from the overflow list and return it to the pool
	 * if we aren't going to reuse it.
	 */
	LIST_REMOVE(pvo, pvo_olink);
	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
		uma_zfree(pvo->pvo_vaddr & PVO_MANAGED ? moea_mpvo_zone :
		    moea_upvo_zone, pvo);
	moea_pvo_entries--;
	moea_pvo_remove_calls++;
}

static __inline int
moea_pvo_pte_index(const struct pvo_entry *pvo, int ptegidx)
{
	int pteidx;

	/*
	 * We can find the actual pte entry without searching by grabbing
	 * the PTEG index from 3 unused bits in pte_lo[11:9] and by
	 * noticing the HID bit.
	 */
	pteidx = ptegidx * 8 + PVO_PTEGIDX_GET(pvo);
	if (pvo->pvo_pte.pte.pte_hi & PTE_HID)
		pteidx ^= moea_pteg_mask * 8;

	return (pteidx);
}

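/*
 * Find the PVO entry mapping the given virtual address in the given pmap,
 * optionally returning the index of its hardware PTE via pteidx_p.
 */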
static struct pvo_entry *
moea_pvo_find_va(pmap_t pm, vm_offset_t va, int *pteidx_p)
{
	struct pvo_entry *pvo;
	int ptegidx;
	u_int sr;

	va &= ~ADDR_POFF;
	sr = va_to_sr(pm->pm_sr, va);
	ptegidx = va_to_pteg(sr, va);

	mtx_lock(&moea_table_mutex);
	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
		if (pvo->pvo_pmap == pm && PVO_VADDR(pvo) == va) {
			if (pteidx_p)
				*pteidx_p = moea_pvo_pte_index(pvo, ptegidx);
			break;
		}
	}
	mtx_unlock(&moea_table_mutex);

	return (pvo);
}

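/*
 * Return a pointer to the hardware PTE backing a PVO, with moea_table_mutex
 * held, or NULL (with the mutex released) if the PVO has no valid PTE.
 */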
static struct pte *
moea_pvo_to_pte(const struct pvo_entry *pvo, int pteidx)
{
	struct pte *pt;

	/*
	 * If we haven't been supplied the ptegidx, calculate it.
	 */
	if (pteidx == -1) {
		int ptegidx;
		u_int sr;

		sr = va_to_sr(pvo->pvo_pmap->pm_sr, pvo->pvo_vaddr);
		ptegidx = va_to_pteg(sr, pvo->pvo_vaddr);
		pteidx = moea_pvo_pte_index(pvo, ptegidx);
	}

	pt = &moea_pteg_table[pteidx >> 3].pt[pteidx & 7];
	mtx_lock(&moea_table_mutex);

	if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) && !PVO_PTEGIDX_ISSET(pvo)) {
		panic("moea_pvo_to_pte: pvo %p has valid pte in pvo but no "
		    "valid pte index", pvo);
	}

	if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0 && PVO_PTEGIDX_ISSET(pvo)) {
		panic("moea_pvo_to_pte: pvo %p has valid pte index in pvo "
		    "but no valid pte", pvo);
	}

	if ((pt->pte_hi ^ (pvo->pvo_pte.pte.pte_hi & ~PTE_VALID)) == PTE_VALID) {
		if ((pvo->pvo_pte.pte.pte_hi & PTE_VALID) == 0) {
			panic("moea_pvo_to_pte: pvo %p has valid pte in "
			    "moea_pteg_table %p but invalid in pvo", pvo, pt);
		}

		if (((pt->pte_lo ^ pvo->pvo_pte.pte.pte_lo) & ~(PTE_CHG|PTE_REF))
		    != 0) {
			panic("moea_pvo_to_pte: pvo %p pte does not match "
			    "pte %p in moea_pteg_table", pvo, pt);
		}

		mtx_assert(&moea_table_mutex, MA_OWNED);
		return (pt);
	}

	if (pvo->pvo_pte.pte.pte_hi & PTE_VALID) {
		panic("moea_pvo_to_pte: pvo %p has invalid pte %p in "
		    "moea_pteg_table but valid in pvo: %8x, %8x", pvo, pt, pvo->pvo_pte.pte.pte_hi, pt->pte_hi);
	}

	mtx_unlock(&moea_table_mutex);
	return (NULL);
}

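/*
 * Handle a hash-table miss for the given effective address: evict an entry
 * from the relevant PTEG (chosen pseudo-randomly from the timebase) and
 * install the PTE from the matching PVO in its place.  Returns nonzero if a
 * mapping was installed.
 */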
/*
 * XXX: THIS STUFF SHOULD BE IN pte.c?
 */
int
moea_pte_spill(vm_offset_t addr)
{
	struct pvo_entry *source_pvo, *victim_pvo;
	struct pvo_entry *pvo;
	int ptegidx, i, j;
	u_int sr;
	struct pteg *pteg;
	struct pte *pt;

	moea_pte_spills++;

	sr = mfsrin(addr);
	ptegidx = va_to_pteg(sr, addr);

	/*
	 * Have to substitute some entry.  Use the primary hash for this.
	 * Use low bits of timebase as random generator.
	 */
	pteg = &moea_pteg_table[ptegidx];
	mtx_lock(&moea_table_mutex);
	__asm __volatile("mftb %0" : "=r"(i));
	i &= 7;
	pt = &pteg->pt[i];

	source_pvo = NULL;
	victim_pvo = NULL;
	LIST_FOREACH(pvo, &moea_pvo_table[ptegidx], pvo_olink) {
		/*
		 * We need to find a pvo entry for this address.
		 */
		if (source_pvo == NULL &&
		    moea_pte_match(&pvo->pvo_pte.pte, sr, addr,
		    pvo->pvo_pte.pte.pte_hi & PTE_HID)) {
			/*
			 * Now found an entry to be spilled into the pteg.
			 * The PTE is now valid, so we know it's active.
			 */
			j = moea_pte_insert(ptegidx, &pvo->pvo_pte.pte);

			if (j >= 0) {
				PVO_PTEGIDX_SET(pvo, j);
				moea_pte_overflow--;
				mtx_unlock(&moea_table_mutex);
				return (1);
			}

			source_pvo = pvo;

			if (victim_pvo != NULL)
				break;
		}

		/*
		 * We also need the pvo entry of the victim we are replacing
		 * so save the R & C bits of the PTE.
		 */
		if ((pt->pte_hi & PTE_HID) == 0 && victim_pvo == NULL &&
		    moea_pte_compare(pt, &pvo->pvo_pte.pte)) {
			victim_pvo = pvo;
			if (source_pvo != NULL)
				break;
		}
	}

	if (source_pvo == NULL) {
		mtx_unlock(&moea_table_mutex);
		return (0);
	}

	if (victim_pvo == NULL) {
		if ((pt->pte_hi & PTE_HID) == 0)
			panic("moea_pte_spill: victim p-pte (%p) has no pvo "
			    "entry", pt);

		/*
		 * If this is a secondary PTE, we need to search its primary
		 * pvo bucket for the matching PVO.
		 */
		LIST_FOREACH(pvo, &moea_pvo_table[ptegidx ^ moea_pteg_mask],
		    pvo_olink) {
			/*
			 * We also need the pvo entry of the victim we are
			 * replacing so save the R & C bits of the PTE.
			 */
			if (moea_pte_compare(pt, &pvo->pvo_pte.pte)) {
				victim_pvo = pvo;
				break;
			}
		}

		if (victim_pvo == NULL)
			panic("moea_pte_spill: victim s-pte (%p) has no pvo "
			    "entry", pt);
	}

	/*
	 * We are invalidating the TLB entry for the EA we are replacing even
	 * though it's valid.  If we don't, we lose any ref/chg bit changes
	 * contained in the TLB entry.
	 */
	source_pvo->pvo_pte.pte.pte_hi &= ~PTE_HID;

	moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr);
	moea_pte_set(pt, &source_pvo->pvo_pte.pte);

	PVO_PTEGIDX_CLR(victim_pvo);
	PVO_PTEGIDX_SET(source_pvo, i);
	moea_pte_replacements++;

	mtx_unlock(&moea_table_mutex);
	return (1);
}

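/*
 * Choose a spillable (valid and not wired) mapping in the given PTEG,
 * preferring one whose referenced bit is clear.
 */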
static __inline struct pvo_entry *
moea_pte_spillable_ident(u_int ptegidx)
{
	struct pte *pt;
	struct pvo_entry *pvo_walk, *pvo = NULL;

	LIST_FOREACH(pvo_walk, &moea_pvo_table[ptegidx], pvo_olink) {
		if (pvo_walk->pvo_vaddr & PVO_WIRED)
			continue;

		if (!(pvo_walk->pvo_pte.pte.pte_hi & PTE_VALID))
			continue;

		pt = moea_pvo_to_pte(pvo_walk, -1);

		if (pt == NULL)
			continue;

		pvo = pvo_walk;

		mtx_unlock(&moea_table_mutex);
		if (!(pt->pte_lo & PTE_REF))
			return (pvo_walk);
	}

	return (pvo);
}

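/*
 * Insert a PTE into the hardware page table, trying the primary and then the
 * secondary hash.  If both PTEGs are full, evict a spillable entry and take
 * its slot.  Returns the slot index used within the PTEG.
 */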
static int
moea_pte_insert(u_int ptegidx, struct pte *pvo_pt)
{
	struct pte *pt;
	struct pvo_entry *victim_pvo;
	int i;
	int victim_idx;
	u_int pteg_bkpidx = ptegidx;

	mtx_assert(&moea_table_mutex, MA_OWNED);

	/*
	 * First try primary hash.
	 */
	for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
		if ((pt->pte_hi & PTE_VALID) == 0) {
			pvo_pt->pte_hi &= ~PTE_HID;
			moea_pte_set(pt, pvo_pt);
			return (i);
		}
	}

	/*
	 * Now try secondary hash.
	 */
	ptegidx ^= moea_pteg_mask;

	for (pt = moea_pteg_table[ptegidx].pt, i = 0; i < 8; i++, pt++) {
		if ((pt->pte_hi & PTE_VALID) == 0) {
			pvo_pt->pte_hi |= PTE_HID;
			moea_pte_set(pt, pvo_pt);
			return (i);
		}
	}

	/* Try again, but this time try to force a PTE out. */
	ptegidx = pteg_bkpidx;

	victim_pvo = moea_pte_spillable_ident(ptegidx);
	if (victim_pvo == NULL) {
		ptegidx ^= moea_pteg_mask;
		victim_pvo = moea_pte_spillable_ident(ptegidx);
	}

	if (victim_pvo == NULL) {
		panic("moea_pte_insert: overflow");
		return (-1);
	}

	victim_idx = moea_pvo_pte_index(victim_pvo, ptegidx);

	if (pteg_bkpidx == ptegidx)
		pvo_pt->pte_hi &= ~PTE_HID;
	else
		pvo_pt->pte_hi |= PTE_HID;

	/*
	 * Synchronize the sacrifice PTE with its PVO, then mark both
	 * invalid. The PVO will be reused when/if the VM system comes
	 * here after a fault.
	 */
	pt = &moea_pteg_table[victim_idx >> 3].pt[victim_idx & 7];

	if (pt->pte_hi != victim_pvo->pvo_pte.pte.pte_hi)
		panic("Victim PVO doesn't match PTE! PVO: %8x, PTE: %8x", victim_pvo->pvo_pte.pte.pte_hi, pt->pte_hi);

	/*
	 * Set the new PTE.
	 */
	moea_pte_unset(pt, &victim_pvo->pvo_pte.pte, victim_pvo->pvo_vaddr);
	PVO_PTEGIDX_CLR(victim_pvo);
	moea_pte_overflow++;
	moea_pte_set(pt, pvo_pt);

	return (victim_idx & 7);
}

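/*
 * Return true if any mapping of the page has the given REF/CHG bit set,
 * checking the cached attribute bits before falling back to the hardware
 * PTEs.
 */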
static bool
moea_query_bit(vm_page_t m, int ptebit)
{
	struct pvo_entry *pvo;
	struct pte *pt;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	if (moea_attr_fetch(m) & ptebit)
		return (true);

	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
		/*
		 * See if we saved the bit off.  If so, cache it and return
		 * success.
		 */
		if (pvo->pvo_pte.pte.pte_lo & ptebit) {
			moea_attr_save(m, ptebit);
			return (true);
		}
	}

	/*
	 * No luck, now go through the hard part of looking at the PTEs
	 * themselves.  Sync so that any pending REF/CHG bits are flushed to
	 * the PTEs.
	 */
	powerpc_sync();
	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
		/*
		 * See if this pvo has a valid PTE.  if so, fetch the
		 * REF/CHG bits from the valid PTE.  If the appropriate
		 * ptebit is set, cache it and return success.
		 */
		pt = moea_pvo_to_pte(pvo, -1);
		if (pt != NULL) {
			moea_pte_synch(pt, &pvo->pvo_pte.pte);
			mtx_unlock(&moea_table_mutex);
			if (pvo->pvo_pte.pte.pte_lo & ptebit) {
				moea_attr_save(m, ptebit);
				return (true);
			}
		}
	}

	return (false);
}

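/*
 * Clear the given REF/CHG bit in the cached attributes and in every mapping
 * of the page; return the number of mappings in which the bit was set.
 */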
static u_int
moea_clear_bit(vm_page_t m, int ptebit)
{
	u_int count;
	struct pvo_entry *pvo;
	struct pte *pt;

	rw_assert(&pvh_global_lock, RA_WLOCKED);

	/*
	 * Clear the cached value.
	 */
	moea_attr_clear(m, ptebit);

	/*
	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
	 * we can reset the right ones).  note that since the pvo entries and
	 * list heads are accessed via BAT0 and are never placed in the page
	 * table, we don't have to worry about further accesses setting the
	 * REF/CHG bits.
	 */
	powerpc_sync();

	/*
	 * For each pvo entry, clear the pvo's ptebit.  If this pvo has a
	 * valid pte clear the ptebit from the valid pte.
	 */
	count = 0;
	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
		pt = moea_pvo_to_pte(pvo, -1);
		if (pt != NULL) {
			moea_pte_synch(pt, &pvo->pvo_pte.pte);
			if (pvo->pvo_pte.pte.pte_lo & ptebit) {
				count++;
				moea_pte_clear(pt, PVO_VADDR(pvo), ptebit);
			}
			mtx_unlock(&moea_table_mutex);
		}
		pvo->pvo_pte.pte.pte_lo &= ~ptebit;
	}

	return (count);
}

/*
 * Return 0 if the physical range is encompassed by the battable[idx] entry
 * and that entry is usable for I/O; otherwise return an errno indicating why
 * it is not.
 */
static int
moea_bat_mapped(int idx, vm_paddr_t pa, vm_size_t size)
{
	u_int prot;
	u_int32_t start;
	u_int32_t end;
	u_int32_t bat_ble;

	/*
	 * Return immediately if not a valid mapping
	 */
	if (!(battable[idx].batu & BAT_Vs))
		return (EINVAL);

	/*
	 * The BAT entry must be cache-inhibited, guarded, and r/w
	 * so it can function as an i/o page
	 */
	prot = battable[idx].batl & (BAT_I|BAT_G|BAT_PP_RW);
	if (prot != (BAT_I|BAT_G|BAT_PP_RW))
		return (EPERM);

	/*
	 * The address should be within the BAT range. Assume that the
	 * start address in the BAT has the correct alignment (thus
	 * not requiring masking)
	 */
	start = battable[idx].batl & BAT_PBS;
	bat_ble = (battable[idx].batu & ~(BAT_EBS)) | 0x03;
	end = start | (bat_ble << 15) | 0x7fff;

	if ((pa < start) || ((pa + size) > end))
		return (ERANGE);

	return (0);
}

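/*
 * Return 0 if the given physical range is already accessible through one of
 * the BAT mappings, EFAULT otherwise.
 */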
int
moea_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
{
	int i;

	/*
	 * This currently does not work for entries that
	 * overlap 256M BAT segments.
	 */

	for (i = 0; i < 16; i++)
		if (moea_bat_mapped(i, pa, size) == 0)
			return (0);

	return (EFAULT);
}

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space. Return a pointer to where it is mapped. This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
void *
moea_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (moea_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
}

void *
moea_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
{
	vm_offset_t va, tmpva, ppa, offset;
	int i;

	ppa = trunc_page(pa);
	offset = pa & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);

	/*
	 * If the physical address lies within a valid BAT table entry,
	 * return the 1:1 mapping. This currently doesn't work
	 * for regions that overlap 256M BAT segments.
	 */
	for (i = 0; i < 16; i++) {
		if (moea_bat_mapped(i, pa, size) == 0)
			return ((void *) pa);
	}

	va = kva_alloc(size);
	if (!va)
		panic("moea_mapdev: Couldn't alloc kernel virtual memory");

	for (tmpva = va; size > 0;) {
		moea_kenter_attr(tmpva, ppa, ma);
		tlbie(tmpva);
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		ppa += PAGE_SIZE;
	}

	return ((void *)(va + offset));
}

void
moea_unmapdev(void *p, vm_size_t size)
{
	vm_offset_t base, offset, va;

	/*
	 * If this is outside kernel virtual space, then it's a
	 * battable entry and doesn't require unmapping
	 */
	va = (vm_offset_t)p;
	if ((va >= VM_MIN_KERNEL_ADDRESS) && (va <= virtual_end)) {
		base = trunc_page(va);
		offset = va & PAGE_MASK;
		size = roundup(offset + size, PAGE_SIZE);
		moea_qremove(base, atop(size));
		kva_free(base, size);
	}
}

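/*
 * Make the instruction cache coherent with the given range of a pmap's
 * address space, one page at a time, skipping pages that are not mapped.
 */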
static void
moea_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
{
	struct pvo_entry *pvo;
	vm_offset_t lim;
	vm_paddr_t pa;
	vm_size_t len;

	PMAP_LOCK(pm);
	while (sz > 0) {
		lim = round_page(va + 1);
		len = MIN(lim - va, sz);
		pvo = moea_pvo_find_va(pm, va & ~ADDR_POFF, NULL);
		if (pvo != NULL) {
			pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
			moea_syncicache(pa, len);
		}
		va += len;
		sz -= len;
	}
	PMAP_UNLOCK(pm);
}

void
moea_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
{

	*va = (void *)pa;
}

extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];

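/*
 * Set up dump_map[] for dumpsys(): whole physical regions for a full dump,
 * or the kernel data/bss, the message buffer, and the first mapped chunk of
 * kernel VM for a minidump.
 */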
void
moea_scan_init(void)
{
	struct pvo_entry *pvo;
	vm_offset_t va;
	int i;

	if (!do_minidump) {
		/* Initialize phys. segments for dumpsys(). */
		memset(&dump_map, 0, sizeof(dump_map));
		mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
		for (i = 0; i < pregions_sz; i++) {
			dump_map[i].pa_start = pregions[i].mr_start;
			dump_map[i].pa_size = pregions[i].mr_size;
		}
		return;
	}

	/* Virtual segments for minidumps: */
	memset(&dump_map, 0, sizeof(dump_map));

	/* 1st: kernel .data and .bss. */
	dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
	dump_map[0].pa_size =
	    round_page((uintptr_t)_end) - dump_map[0].pa_start;

	/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
	dump_map[1].pa_start = (vm_paddr_t)msgbufp->msg_ptr;
	dump_map[1].pa_size = round_page(msgbufp->msg_size);

	/* 3rd: kernel VM. */
	va = dump_map[1].pa_start + dump_map[1].pa_size;
	/* Find start of next chunk (from va). */
	while (va < virtual_end) {
		/* Don't dump the buffer cache. */
		if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
			va = kmi.buffer_eva;
			continue;
		}
		pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF, NULL);
		if (pvo != NULL && (pvo->pvo_pte.pte.pte_hi & PTE_VALID))
			break;
		va += PAGE_SIZE;
	}
	if (va < virtual_end) {
		dump_map[2].pa_start = va;
		va += PAGE_SIZE;
		/* Find last page in chunk. */
		while (va < virtual_end) {
			/* Don't run into the buffer cache. */
			if (va == kmi.buffer_sva)
				break;
			pvo = moea_pvo_find_va(kernel_pmap, va & ~ADDR_POFF,
			    NULL);
			if (pvo == NULL ||
			    !(pvo->pvo_pte.pte.pte_hi & PTE_VALID))
				break;
			va += PAGE_SIZE;
		}
		dump_map[2].pa_size = va - dump_map[2].pa_start;
	}
}