CoCalc -- mmu.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/xen/mmu.c
¹⁷⁶³³ views
1
/*
2
 * Xen mmu operations
3
 *
4
 * This file contains the various mmu fetch and update operations.
5
 * The most important job they must perform is the mapping between the
6
 * domain's pfn and the overall machine mfns.
7
 *
8
 * Xen allows guests to directly update the pagetable, in a controlled
9
 * fashion.  In other words, the guest modifies the same pagetable
10
 * that the CPU actually uses, which eliminates the overhead of having
11
 * a separate shadow pagetable.
12
 *
13
 * In order to allow this, it falls on the guest domain to map its
14
 * notion of a "physical" pfn - which is just a domain-local linear
15
 * address - into a real "machine address" which the CPU's MMU can
16
 * use.
17
 *
18
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19
 * inserted directly into the pagetable.  When creating a new
20
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22
 * the mfn back into a pfn.
23
 *
24
 * The other constraint is that all pages which make up a pagetable
25
 * must be mapped read-only in the guest.  This prevents uncontrolled
26
 * guest updates to the pagetable.  Xen strictly enforces this, and
27
 * will disallow any pagetable update which will end up mapping a
28
 * pagetable page RW, and will disallow using any writable page as a
29
 * pagetable.
30
 *
31
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32
 * would need to validate the whole pagetable before going on.
33
 * Naturally, this is quite slow.  The solution is to "pin" a
34
 * pagetable, which enforces all the constraints on the pagetable even
35
 * when it is not actively in use.  This means that Xen can be assured
36
 * that it is still valid when you do load it into %cr3, and doesn't
37
 * need to revalidate it.
38
 *
39
 * Jeremy Fitzhardinge <[email protected]>, XenSource Inc, 2007
40
 */
41
#include <linux/sched.h>
42
#include <linux/highmem.h>
43
#include <linux/debugfs.h>
44
#include <linux/bug.h>
45
#include <linux/vmalloc.h>
46
#include <linux/module.h>
47
#include <linux/gfp.h>
48
#include <linux/memblock.h>
49
#include <linux/seq_file.h>
50

51
#include <asm/pgtable.h>
52
#include <asm/tlbflush.h>
53
#include <asm/fixmap.h>
54
#include <asm/mmu_context.h>
55
#include <asm/setup.h>
56
#include <asm/paravirt.h>
57
#include <asm/e820.h>
58
#include <asm/linkage.h>
59
#include <asm/page.h>
60
#include <asm/init.h>
61
#include <asm/pat.h>
62
#include <asm/smp.h>
63

64
#include <asm/xen/hypercall.h>
65
#include <asm/xen/hypervisor.h>
66

67
#include <xen/xen.h>
68
#include <xen/page.h>
69
#include <xen/interface/xen.h>
70
#include <xen/interface/hvm/hvm_op.h>
71
#include <xen/interface/version.h>
72
#include <xen/interface/memory.h>
73
#include <xen/hvc-console.h>
74

75
#include "multicalls.h"
76
#include "mmu.h"
77
#include "debugfs.h"
78

79
/*
80
 * Protects atomic reservation decrease/increase against concurrent increases.
81
 * Also protects non-atomic updates of current_pages and balloon lists.
82
 */
83
DEFINE_SPINLOCK(xen_reservation_lock);	/* not static: visible outside this file */
84

85
/*
86
 * Identity map, in addition to plain kernel map.  This needs to be
87
 * large enough to allocate page table pages to allocate the rest.
88
 * Each page can map 2MB.
89
 */
90
#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)	/* 4 x PTRS_PER_PTE entries */
91
/* pte_t storage for the identity map, LEVEL1_IDENT_ENTRIES entries long. */
static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
92

93
#ifdef CONFIG_X86_64
94
/* l3 pud for userspace vsyscall mapping */
95
/* PTRS_PER_PUD entries, placed with __page_aligned_bss. */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
96
#endif /* CONFIG_X86_64 */
97

98
/*
99
 * Note about cr3 (pagetable base) values:
100
 *
101
 * xen_cr3 contains the current logical cr3 value; it contains the
102
 * last set cr3.  This may not be the current effective cr3, because
103
 * its update may be being lazily deferred.  However, a vcpu looking
104
 * at its own cr3 can use this value knowing that everything will
105
 * be self-consistent.
106
 *
107
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
108
 * hypercall to set the vcpu cr3 is complete (so it may be a little
109
 * out of date, but it will never be set early).  If one vcpu is
110
 * looking at another vcpu's cr3 value, it should use this variable.
111
 */
112
DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* logical (last set) cr3, stored as physaddr */
113
DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3; set once the cr3 hypercall completes */
114

115

116
/*
117
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
118
 * redzone above it, so round it up to a PGD boundary.
119
 */
120
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)	/* STACK_TOP_MAX rounded up to PGDIR_SIZE */
121

122
unsigned long arbitrary_virt_to_mfn(void *vaddr)
123
{
124
	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
125

126
	return PFN_DOWN(maddr.maddr);
127
}
128

129
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
130
{
131
	unsigned long address = (unsigned long)vaddr;
132
	unsigned int level;
133
	pte_t *pte;
134
	unsigned offset;
135

136
	/*
137
	 * if the PFN is in the linear mapped vaddr range, we can just use
138
	 * the (quick) virt_to_machine() p2m lookup
139
	 */
140
	if (virt_addr_valid(vaddr))
141
		return virt_to_machine(vaddr);
142

143
	/* otherwise we have to do a (slower) full page-table walk */
144

145
	pte = lookup_address(address, &level);
146
	BUG_ON(pte == NULL);
147
	offset = address & ~PAGE_MASK;
148
	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
149
}
150
EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
151

152
void make_lowmem_page_readonly(void *vaddr)
153
{
154
	pte_t *pte, ptev;
155
	unsigned long address = (unsigned long)vaddr;
156
	unsigned int level;
157

158
	pte = lookup_address(address, &level);
159
	if (pte == NULL)
160
		return;		/* vaddr missing */
161

162
	ptev = pte_wrprotect(*pte);
163

164
	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
165
		BUG();
166
}
167

168
void make_lowmem_page_readwrite(void *vaddr)
169
{
170
	pte_t *pte, ptev;
171
	unsigned long address = (unsigned long)vaddr;
172
	unsigned int level;
173

174
	pte = lookup_address(address, &level);
175
	if (pte == NULL)
176
		return;		/* vaddr missing */
177

178
	ptev = pte_mkwrite(*pte);
179

180
	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
181
		BUG();
182
}
183

184

185
/* True when the struct page backing @ptr has its Pinned flag set. */
static bool xen_page_pinned(void *ptr)
{
	return PagePinned(virt_to_page(ptr));
}
191

192
/*
 * Set the pte at @ptep to @pteval via a single-entry mmu_update
 * multicall issued on behalf of domain @domid.
 *
 * @ptep:   pte slot to update (may live outside the linear mapping)
 * @pteval: new pte value; converted with pte_val_ma()
 * @domid:  domain id handed to MULTI_mmu_update()
 */
void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_entry(sizeof(*u));
	u = mcs.args;

	/*
	 * ptep might be kmapped when using 32-bit HIGHPTE, in which case
	 * it is not in the linear mapped range and the quick
	 * virt_to_machine() lookup does not apply.
	 * arbitrary_virt_to_machine() takes the quick path when it can
	 * and falls back to a page-table walk otherwise.
	 */
	u->ptr = arbitrary_virt_to_machine(ptep).maddr;
	u->val = pte_val_ma(pteval);

	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
EXPORT_SYMBOL_GPL(xen_set_domain_pte);
209

210
static void xen_extend_mmu_update(const struct mmu_update *update)
211
{
212
	struct multicall_space mcs;
213
	struct mmu_update *u;
214

215
	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
216

217
	if (mcs.mc != NULL) {
218
		mcs.mc->args[1]++;
219
	} else {
220
		mcs = __xen_mc_entry(sizeof(*u));
221
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
222
	}
223

224
	u = mcs.args;
225
	*u = *update;
226
}
227

228
/*
 * Store @val into the pmd entry at @ptr through the hypervisor: one
 * mmu_update is batched and issued with preemption disabled.
 */
static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update req;

	preempt_disable();
	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	req.ptr = arbitrary_virt_to_machine(ptr).maddr;
	req.val = pmd_val_ma(val);
	xen_extend_mmu_update(&req);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
	preempt_enable();
}
245

246
/*
 * Set a pmd entry.  A pinned page must be updated via the hypervisor;
 * an unpinned one is written directly.
 */
static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	if (xen_page_pinned(ptr))
		xen_set_pmd_hyper(ptr, val);
	else
		*ptr = val;	/* not pinned: direct store */
}
257

258
/*
259
 * Associate a virtual page frame with a given physical page frame
260
 * and protection flags for that frame.
261
 */
262
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	/* Build the pte straight from the machine frame, then install it. */
	pte_t entry = mfn_pte(mfn, flags);

	set_pte_vaddr(vaddr, entry);
}
266

267
/*
 * When in PARAVIRT_LAZY_MMU mode, queue the pte update as an
 * MMU_NORMAL_PT_UPDATE request and return true.  Outside lazy MMU mode
 * nothing is done and false is returned, so the caller can fall back.
 */
static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update req;

	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
		xen_mc_batch();

		req.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
		req.val = pte_val_ma(pteval);
		xen_extend_mmu_update(&req);

		xen_mc_issue(PARAVIRT_LAZY_MMU);
		return true;
	}

	return false;
}
284

285
/* Set a pte: batched if possible, otherwise via native_set_pte(). */
static void xen_set_pte(pte_t *ptep, pte_t pteval)
{
	if (xen_batched_set_pte(ptep, pteval))
		return;

	native_set_pte(ptep, pteval);
}
290

291
/* set_pte_at variant: @mm and @addr are unused, the work is xen_set_pte(). */
static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
			   pte_t *ptep, pte_t pteval)
{
	xen_set_pte(ptep, pteval);
}
296

297
/*
 * Begin a protection change: hand back the current pte untouched.
 * @mm and @addr are unused here.
 */
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
				 unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	pte_t current_pte = *ptep;

	return current_pte;
}
303

304
/*
 * Commit a protection change: batch an mmu_update for @ptep tagged
 * MMU_PT_UPDATE_PRESERVE_AD carrying the new value @pte, then issue
 * it.  @mm and @addr are unused here.
 */
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update req;

	xen_mc_batch();

	req.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	req.val = pte_val_ma(pte);
	xen_extend_mmu_update(&req);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
317

318
/* Assume pteval_t is equivalent to all the other *val_t types. */
319
/*
 * Rewrite a present entry so its frame field holds the pfn for the mfn
 * it currently contains; flag bits are kept.  Non-present values are
 * returned unchanged.
 */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	unsigned long mfn;
	pteval_t flags;

	if (!(val & _PAGE_PRESENT))
		return val;

	mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
	flags = val & PTE_FLAGS_MASK;

	return ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
}
329

330
/*
 * Rewrite a present entry so its frame field holds the mfn for the pfn
 * it currently contains.  Non-present values are returned unchanged.
 *
 * With XENFEAT_auto_translated_physmap the pfn is used as the mfn.
 * A pfn without an mfn yields an empty (zero) entry.  The
 * FOREIGN_FRAME_BIT is stripped, and an IDENTITY_FRAME_BIT is turned
 * into the _PAGE_IOMAP flag.
 */
static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	unsigned long pfn, mfn;
	pteval_t flags;

	if (!(val & _PAGE_PRESENT))
		return val;

	pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
	flags = val & PTE_FLAGS_MASK;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		mfn = pfn;
	else
		mfn = get_phys_to_machine(pfn);

	/*
	 * If there's no mfn for the pfn, then just create an empty
	 * non-present pte.  Unfortunately this loses information about
	 * the original pfn, so pte_mfn_to_pfn is asymmetric.
	 */
	if (unlikely(mfn == INVALID_P2M_ENTRY))
		return 0;

	/*
	 * The identity test must come _after_ the INVALID_P2M_ENTRY
	 * test, because INVALID_P2M_ENTRY & IDENTITY_FRAME_BIT is
	 * non-zero.
	 */
	mfn &= ~FOREIGN_FRAME_BIT;
	if (mfn & IDENTITY_FRAME_BIT) {
		mfn &= ~IDENTITY_FRAME_BIT;
		flags |= _PAGE_IOMAP;
	}

	return ((pteval_t)mfn << PAGE_SHIFT) | flags;
}
367

368
/*
 * For a present entry, rebuild the value from its frame number and
 * flags without any pfn/mfn translation.  Non-present values are
 * returned unchanged.
 */
static pteval_t iomap_pte(pteval_t val)
{
	unsigned long frame;
	pteval_t flags;

	if (!(val & _PAGE_PRESENT))
		return val;

	frame = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
	flags = val & PTE_FLAGS_MASK;

	/* We assume the pte frame number is a MFN, so just use it as-is. */
	return ((pteval_t)frame << PAGE_SHIFT) | flags;
}
381

382
/*
 * Produce the Linux view of a pte value: undo the Xen WC encoding and,
 * unless this is an IOMAP pte in the initial domain, translate the
 * mfn back to a pfn.
 */
static pteval_t xen_pte_val(pte_t pte)
{
	const pteval_t cache_bits = _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
	pteval_t raw = pte.pte;

	/* If this is a WC pte, convert back from Xen WC to Linux WC */
	if ((raw & cache_bits) == _PAGE_PAT) {
		WARN_ON(!pat_enabled);
		raw = (raw & ~_PAGE_PAT) | _PAGE_PWT;
	}

	if (!xen_initial_domain() || !(raw & _PAGE_IOMAP))
		return pte_mfn_to_pfn(raw);

	/* IOMAP pte in the initial domain: returned untranslated. */
	return raw;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
398

399
/* Return the pgd's value with its mfn translated back to a pfn. */
static pgdval_t xen_pgd_val(pgd_t pgd)
{
	pgdval_t raw = pgd.pgd;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
404

405
/*
406
 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
407
 * are reserved for now, to correspond to the Intel-reserved PAT
408
 * types.
409
 *
410
 * We expect Linux's PAT set as follows:
411
 *
412
 * Idx  PTE flags        Linux    Xen    Default
413
 * 0                     WB       WB     WB
414
 * 1            PWT      WC       WT     WT
415
 * 2        PCD          UC-      UC-    UC-
416
 * 3        PCD PWT      UC       UC     UC
417
 * 4    PAT              WB       WC     WB
418
 * 5    PAT     PWT      WC       WP     WT
419
 * 6    PAT PCD          UC-      UC     UC-
420
 * 7    PAT PCD PWT      UC       UC     UC
421
 */
422

423
/*
 * Nothing is programmed here; this only warns when the PAT value Linux
 * asks for differs from the one layout this code expects.
 */
void xen_set_pat(u64 pat)
{
	/* We expect Linux to use a PAT setting of
	 * UC UC- WC WB (ignoring the PAT flag) */
	const u64 expected = 0x0007010600070106ull;

	WARN_ON(pat != expected);
}
429

430
/*
 * Build a pte from the Linux value @pte: remap the Linux WC encoding
 * to Xen's, then either pass an IOMAP frame through untranslated or
 * clear _PAGE_IOMAP and translate the pfn to an mfn.
 */
static pte_t xen_make_pte(pteval_t pte)
{
	const pteval_t pcd_pwt = _PAGE_PCD | _PAGE_PWT;
	phys_addr_t addr = (pte & PTE_PFN_MASK);

	/* If Linux is trying to set a WC pte, then map to the Xen WC.
	 * If _PAGE_PAT is set, then it probably means it is really
	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
	 * things work out OK...
	 *
	 * (We should never see kernel mappings with _PAGE_PSE set,
	 * but we could see hugetlbfs mappings, I think.).
	 */
	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT) &&
	    (pte & pcd_pwt) == _PAGE_PWT)
		pte = (pte & ~pcd_pwt) | _PAGE_PAT;

	/*
	 * Unprivileged domains are allowed to do IOMAPpings for
	 * PCI passthrough, but not map ISA space.  The ISA
	 * mappings are just dummy local mappings to keep other
	 * parts of the kernel happy.
	 */
	if (unlikely(pte & _PAGE_IOMAP) &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS))
		pte = iomap_pte(pte);
	else
		pte = pte_pfn_to_mfn(pte & ~_PAGE_IOMAP);

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
464

465
#ifdef CONFIG_XEN_DEBUG
466
/*
 * Debug wrapper around xen_make_pte(): builds the pte as usual and
 * warns (once) about suspicious combinations of address and
 * _PAGE_IOMAP.
 *
 *  - An IOMAP request that will be passed through (initial domain, or
 *    address at/above ISA_END_ADDRESS) is expected to target a frame
 *    whose pfn maps to itself; warn if it does not.
 *  - Otherwise warn when the resulting pte has the same frame address
 *    as the input, was not requested as IOMAP and did not end up with
 *    _PAGE_IOMAP set.
 *
 * Always returns exactly what xen_make_pte() produced; a zero frame
 * address skips the checks.
 */
pte_t xen_make_pte_debug(pteval_t pte)
{
	phys_addr_t addr = (pte & PTE_PFN_MASK);
	phys_addr_t other_addr;
	bool io_page = false;
	pte_t _pte;

	if (pte & _PAGE_IOMAP)
		io_page = true;

	_pte = xen_make_pte(pte);

	if (!addr)
		return _pte;

	if (io_page &&
	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
		/*
		 * Widen to phys_addr_t before shifting: pfn_to_mfn()
		 * yields an unsigned long, and on 32-bit PAE shifting that
		 * by PAGE_SHIFT would drop the high bits of frames at or
		 * above 4GB, making the comparison below misfire.
		 */
		other_addr = (phys_addr_t)pfn_to_mfn(addr >> PAGE_SHIFT)
				<< PAGE_SHIFT;
		WARN_ONCE(addr != other_addr,
			"0x%lx is using VM_IO, but it is 0x%lx!\n",
			(unsigned long)addr, (unsigned long)other_addr);
	} else {
		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
		other_addr = (_pte.pte & PTE_PFN_MASK);
		WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
			(unsigned long)addr);
	}

	return _pte;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
498
#endif
499

500
/* Build a pgd entry from @pgd after translating its pfn to an mfn. */
static pgd_t xen_make_pgd(pgdval_t pgd)
{
	return native_make_pgd(pte_pfn_to_mfn(pgd));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
506

507
/* Return the pmd's value with its mfn translated back to a pfn. */
static pmdval_t xen_pmd_val(pmd_t pmd)
{
	pmdval_t raw = pmd.pmd;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
512

513
/*
 * Store @val into the pud entry at @ptr through the hypervisor: one
 * mmu_update is batched and issued with preemption disabled.
 */
static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update req;

	preempt_disable();
	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	req.ptr = arbitrary_virt_to_machine(ptr).maddr;
	req.val = pud_val_ma(val);
	xen_extend_mmu_update(&req);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
	preempt_enable();
}
530

531
/*
 * Set a pud entry.  A pinned page must be updated via the hypervisor;
 * an unpinned one is written directly.
 */
static void xen_set_pud(pud_t *ptr, pud_t val)
{
	if (xen_page_pinned(ptr))
		xen_set_pud_hyper(ptr, val);
	else
		*ptr = val;	/* not pinned: direct store */
}
542

543
#ifdef CONFIG_X86_PAE
544
/* PAE only: write the whole 64-bit pte with a single set_64bit(). */
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	u64 *slot = (u64 *)ptep;

	set_64bit(slot, native_pte_val(pte));
}
548

549
/*
 * PAE only: clear a pte.  Try a batched update to a zero pte first and
 * fall back to native_pte_clear() when batching is not active.
 */
static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	if (xen_batched_set_pte(ptep, native_make_pte(0)))
		return;

	native_pte_clear(mm, addr, ptep);
}
554

555
/* PAE only: clear a pmd entry by setting it to a zero pmd. */
static void xen_pmd_clear(pmd_t *pmdp)
{
	pmd_t empty = __pmd(0);

	set_pmd(pmdp, empty);
}
559
#endif	/* CONFIG_X86_PAE */
560

561
/* Build a pmd entry from @pmd after translating its pfn to an mfn. */
static pmd_t xen_make_pmd(pmdval_t pmd)
{
	return native_make_pmd(pte_pfn_to_mfn(pmd));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
567

568
#if PAGETABLE_LEVELS == 4
569
/* Return the pud's value with its mfn translated back to a pfn. */
static pudval_t xen_pud_val(pud_t pud)
{
	pudval_t raw = pud.pud;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
574

575
/* Build a pud entry from @pud after translating its pfn to an mfn. */
static pud_t xen_make_pud(pudval_t pud)
{
	return native_make_pud(pte_pfn_to_mfn(pud));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
582

583
/*
 * Given a pointer into a pgd page, return the pointer to the entry at
 * the same index in the user pgd recorded in that page's ->private.
 *
 * Returns NULL when the index is at or beyond pgd_index(USER_LIMIT),
 * or when the page has no ->private pointer.
 */
static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *base = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned idx = pgd - base;
	pgd_t *user;

	if (idx >= pgd_index(USER_LIMIT))
		return NULL;

	user = (pgd_t *)virt_to_page(base)->private;
	if (!user)
		return NULL;

	return user + idx;
}
598

599
/*
 * Queue an mmu_update that stores @val in the pgd entry at @ptr.
 * Nothing is issued here; the callers in this file bracket it with
 * xen_mc_batch() and xen_mc_issue().
 */
static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update req;

	req.ptr = virt_to_machine(ptr).maddr;
	req.val = pgd_val_ma(val);

	xen_extend_mmu_update(&req);
}
607

608
/*
609
 * Raw hypercall-based set_pgd, intended for use in early boot before
610
 * there's a page structure.  This implies:
611
 *  1. The only existing pagetable is the kernel's
612
 *  2. It is always pinned
613
 *  3. It has no user pagetable attached to it
614
 */
615
static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	/* Batch, queue and issue the single update with preemption off. */
	preempt_disable();
	xen_mc_batch();

	__xen_set_pgd_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
	preempt_enable();
}
627

628
/*
 * Set a pgd entry, mirroring it into the matching user pgd entry when
 * xen_get_user_pgd() provides one.
 */
static void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd(ptr);

	if (xen_page_pinned(ptr)) {
		/* If it's pinned, then we can at least batch the kernel
		   and user updates together. */
		xen_mc_batch();

		__xen_set_pgd_hyper(ptr, val);
		if (user_ptr)
			__xen_set_pgd_hyper(user_ptr, val);

		xen_mc_issue(PARAVIRT_LAZY_MMU);
		return;
	}

	/* If page is not pinned, we can just update the entry directly */
	*ptr = val;
	if (user_ptr) {
		WARN_ON(xen_page_pinned(user_ptr));
		*user_ptr = val;
	}
}
653
#endif	/* PAGETABLE_LEVELS == 4 */
654

655
/*
656
 * (Yet another) pagetable walker.  This one is intended for pinning a
657
 * pagetable.  This means that it walks a pagetable and calls the
658
 * callback function on each page it finds making up the page table,
659
 * at every level.  It walks the entire pagetable, but it only bothers
660
 * pinning pte pages which are below limit.  In the normal case this
661
 * will be STACK_TOP_MAX, but at boot we need to pin up to
662
 * FIXADDR_TOP.
663
 *
664
 * For 32-bit the important bit is that we don't pin beyond there,
665
 * because then we start getting into Xen's ptes.
666
 *
667
 * For 64-bit, we must skip the Xen hole in the middle of the address
668
 * space, just after the big x86-64 virtual hole.
669
 */
670
/*
 * Walk the pagetable rooted at @pgd and invoke @func on each page that
 * makes up the table.
 *
 * @mm:    passed through unchanged to @func
 * @pgd:   top-level table to walk
 * @func:  callback, invoked with the page and its level: PT_PUD and
 *         PT_PMD only when that level is not folded, PT_PTE for each
 *         populated pmd entry, and PT_PGD once at the very end
 * @limit: exclusive upper bound; after the decrement below it is the
 *         last byte covered and must lie below FIXADDR_TOP
 *
 * Returns the bitwise OR of every @func return value, or 0 without
 * calling @func at all when XENFEAT_auto_translated_physmap is set.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
671
			  int (*func)(struct mm_struct *mm, struct page *,
672
				      enum pt_level),
673
			  unsigned long limit)
674
{
675
	int flush = 0;
676
	unsigned hole_low, hole_high;
677
	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
678
	unsigned pgdidx, pudidx, pmdidx;
679

680
	/* The limit is the last byte to be touched */
681
	limit--;
682
	BUG_ON(limit >= FIXADDR_TOP);
683

684
	if (xen_feature(XENFEAT_auto_translated_physmap))
685
		return 0;
686

687
	/*
688
	 * 64-bit has a great big hole in the middle of the address
689
	 * space, which contains the Xen mappings.  On 32-bit these
690
	 * will end up making a zero-sized hole and so is a no-op.
691
	 */
692
	hole_low = pgd_index(USER_LIMIT);
693
	hole_high = pgd_index(PAGE_OFFSET);
694

695
	/* Per-level indices of the last byte; folded levels use index 0. */
	pgdidx_limit = pgd_index(limit);
696
#if PTRS_PER_PUD > 1
697
	pudidx_limit = pud_index(limit);
698
#else
699
	pudidx_limit = 0;
700
#endif
701
#if PTRS_PER_PMD > 1
702
	pmdidx_limit = pmd_index(limit);
703
#else
704
	pmdidx_limit = 0;
705
#endif
706

707
	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
708
		pud_t *pud;
709

710
		/* Skip pgd slots inside [hole_low, hole_high). */
		if (pgdidx >= hole_low && pgdidx < hole_high)
711
			continue;
712

713
		/* Skip empty pgd entries. */
		if (!pgd_val(pgd[pgdidx]))
714
			continue;
715

716
		pud = pud_offset(&pgd[pgdidx], 0);
717

718
		if (PTRS_PER_PUD > 1) /* not folded */
719
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
720

721
		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
722
			pmd_t *pmd;
723

724
			/* Past the limit within the final pgd slot: stop. */
			if (pgdidx == pgdidx_limit &&
725
			    pudidx > pudidx_limit)
726
				goto out;
727

728
			if (pud_none(pud[pudidx]))
729
				continue;
730

731
			pmd = pmd_offset(&pud[pudidx], 0);
732

733
			if (PTRS_PER_PMD > 1) /* not folded */
734
				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
735

736
			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
737
				struct page *pte;
738

739
				/* Past the limit within the final pud slot: stop. */
				if (pgdidx == pgdidx_limit &&
740
				    pudidx == pudidx_limit &&
741
				    pmdidx > pmdidx_limit)
742
					goto out;
743

744
				if (pmd_none(pmd[pmdidx]))
745
					continue;
746

747
				pte = pmd_page(pmd[pmdidx]);
748
				flush |= (*func)(mm, pte, PT_PTE);
749
			}
750
		}
751
	}
752

753
out:
754
	/* Do the top level last, so that the callbacks can use it as
755
	   a cue to do final things like tlb flushes. */
756
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
757

758
	return flush;
759
}
760

761
static int xen_pgd_walk(struct mm_struct *mm,
762
			int (*func)(struct mm_struct *mm, struct page *,
763
				    enum pt_level),
764
			unsigned long limit)
765
{
766
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
767
}
768

769
/* If we're using split pte locks, then take the page's lock and
770
   return a pointer to it.  Otherwise return NULL. */
771
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
772
{
773
	spinlock_t *ptl = NULL;
774

775
#if USE_SPLIT_PTLOCKS
776
	ptl = __pte_lockptr(page);
777
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
778
#endif
779

780
	return ptl;
781
}
782

783
static void xen_pte_unlock(void *v)
784
{
785
	spinlock_t *ptl = v;
786
	spin_unlock(ptl);
787
}
788

789
static void xen_do_pin(unsigned level, unsigned long pfn)
790
{
791
	struct mmuext_op *op;
792
	struct multicall_space mcs;
793

794
	mcs = __xen_mc_entry(sizeof(*op));
795
	op = mcs.args;
796
	op->cmd = level;
797
	op->arg1.mfn = pfn_to_mfn(pfn);
798
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
799
}
800

801
static int xen_pin_page(struct mm_struct *mm, struct page *page,
802
			enum pt_level level)
803
{
804
	unsigned pgfl = TestSetPagePinned(page);
805
	int flush;
806

807
	if (pgfl)
808
		flush = 0;		/* already pinned */
809
	else if (PageHighMem(page))
810
		/* kmaps need flushing if we found an unpinned
811
		   highpage */
812
		flush = 1;
813
	else {
814
		void *pt = lowmem_page_address(page);
815
		unsigned long pfn = page_to_pfn(page);
816
		struct multicall_space mcs = __xen_mc_entry(0);
817
		spinlock_t *ptl;
818

819
		flush = 0;
820

821
		/*
822
		 * We need to hold the pagetable lock between the time
823
		 * we make the pagetable RO and when we actually pin
824
		 * it.  If we don't, then other users may come in and
825
		 * attempt to update the pagetable by writing it,
826
		 * which will fail because the memory is RO but not
827
		 * pinned, so Xen won't do the trap'n'emulate.
828
		 *
829
		 * If we're using split pte locks, we can't hold the
830
		 * entire pagetable's worth of locks during the
831
		 * traverse, because we may wrap the preempt count (8
832
		 * bits).  The solution is to mark RO and pin each PTE
833
		 * page while holding the lock.  This means the number
834
		 * of locks we end up holding is never more than a
835
		 * batch size (~32 entries, at present).
836
		 *
837
		 * If we're not using split pte locks, we needn't pin
838
		 * the PTE pages independently, because we're
839
		 * protected by the overall pagetable lock.
840
		 */
841
		ptl = NULL;
842
		if (level == PT_PTE)
843
			ptl = xen_pte_lock(page, mm);
844

845
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
846
					pfn_pte(pfn, PAGE_KERNEL_RO),
847
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
848

849
		if (ptl) {
850
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
851

852
			/* Queue a deferred unlock for when this batch
853
			   is completed. */
854
			xen_mc_callback(xen_pte_unlock, ptl);
855
		}
856
	}
857

858
	return flush;
859
}
860

861
/* This is called just after a mm has been created, but it has not
862
   been used yet.  We need to make sure that its pagetable is all
863
   read-only, and can be pinned. */
864
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	int need_flush;

	xen_mc_batch();

	/* Walk everything below USER_LIMIT, pinning page by page. */
	need_flush = __xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
	if (need_flush) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *upgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		/* The user pgd, when present, is pinned as its own L4. */
		if (upgd != NULL) {
			xen_pin_page(mm, virt_to_page(upgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(upgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */

	xen_mc_issue(0);
}
899

900
static void xen_pgd_pin(struct mm_struct *mm)
901
{
902
	__xen_pgd_pin(mm, mm->pgd);
903
}
904

905
/*
906
 * On save, we need to pin all pagetables to make sure they get their
907
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
908
 * them (unpinned pgds are not currently in use, probably because the
909
 * process is under construction or destruction).
910
 *
911
 * Expected to be called in stop_machine() ("equivalent to taking
912
 * every spinlock in the system"), so the locking doesn't really
913
 * matter all that much.
914
 */
915
void xen_mm_pin_all(void)
916
{
917
	struct page *page;
918

919
	spin_lock(&pgd_lock);
920

921
	list_for_each_entry(page, &pgd_list, lru) {
922
		if (!PagePinned(page)) {
923
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
924
			SetPageSavePinned(page);
925
		}
926
	}
927

928
	spin_unlock(&pgd_lock);
929
}
930

931
/*
932
 * The init_mm pagetable is really pinned as soon as it's created, but
933
 * that's before we have page structures to store the bits.  So do all
934
 * the book-keeping now.
935
 */
936
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	/* Book-keeping only: set the flag, never request a flush. */
	SetPagePinned(page);

	return 0;
}
942

943
/* Mark every init_mm pagetable page below FIXADDR_TOP as pinned. */
static void __init xen_mark_init_mm_pinned(void)
{
	struct mm_struct *mm = &init_mm;

	xen_pgd_walk(mm, xen_mark_pinned, FIXADDR_TOP);
}
947

948
static int xen_unpin_page(struct mm_struct *mm, struct page *page,
949
			  enum pt_level level)
950
{
951
	unsigned pgfl = TestClearPagePinned(page);
952

953
	if (pgfl && !PageHighMem(page)) {
954
		void *pt = lowmem_page_address(page);
955
		unsigned long pfn = page_to_pfn(page);
956
		spinlock_t *ptl = NULL;
957
		struct multicall_space mcs;
958

959
		/*
960
		 * Do the converse to pin_page.  If we're using split
961
		 * pte locks, we must be holding the lock for while
962
		 * the pte page is unpinned but still RO to prevent
963
		 * concurrent updates from seeing it in this
964
		 * partially-pinned state.
965
		 */
966
		if (level == PT_PTE) {
967
			ptl = xen_pte_lock(page, mm);
968

969
			if (ptl)
970
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
971
		}
972

973
		mcs = __xen_mc_entry(0);
974

975
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
976
					pfn_pte(pfn, PAGE_KERNEL),
977
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
978

979
		if (ptl) {
980
			/* unlock when batch completed */
981
			xen_mc_callback(xen_pte_unlock, ptl);
982
		}
983
	}
984

985
	return 0;		/* never need to flush on unpin */
986
}
987

988
/* Release a pagetable's pages back as normal RW */
989
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	xen_mc_batch();

	/* Unpin the top level first, then the pages beneath it. */
	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *upgd = xen_get_user_pgd(pgd);

		if (upgd != NULL) {
			xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(upgd)));
			xen_unpin_page(mm, virt_to_page(upgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}
1017

1018
static void xen_pgd_unpin(struct mm_struct *mm)
1019
{
1020
	__xen_pgd_unpin(mm, mm->pgd);
1021
}
1022

1023
/*
1024
 * On resume, undo any pinning done at save, so that the rest of the
1025
 * kernel doesn't see any unexpected pinned pagetables.
1026
 */
1027
void xen_mm_unpin_all(void)
1028
{
1029
	struct page *page;
1030

1031
	spin_lock(&pgd_lock);
1032

1033
	list_for_each_entry(page, &pgd_list, lru) {
1034
		if (PageSavePinned(page)) {
1035
			BUG_ON(!PagePinned(page));
1036
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1037
			ClearPageSavePinned(page);
1038
		}
1039
	}
1040

1041
	spin_unlock(&pgd_lock);
1042
}
1043

1044
static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1045
{
1046
	spin_lock(&next->page_table_lock);
1047
	xen_pgd_pin(next);
1048
	spin_unlock(&next->page_table_lock);
1049
}
1050

1051
static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1052
{
1053
	spin_lock(&mm->page_table_lock);
1054
	xen_pgd_pin(mm);
1055
	spin_unlock(&mm->page_table_lock);
1056
}
1057

1058

1059
#ifdef CONFIG_SMP
1060
/* Another cpu may still have their %cr3 pointing at the pagetable, so
1061
   we need to repoint it somewhere else before we can unpin it. */
1062
static void drop_other_mm_ref(void *info)
1063
{
1064
	struct mm_struct *mm = info;
1065
	struct mm_struct *active_mm;
1066

1067
	active_mm = percpu_read(cpu_tlbstate.active_mm);
1068

1069
	if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1070
		leave_mm(smp_processor_id());
1071

1072
	/* If this cpu still has a stale cr3 reference, then make sure
1073
	   it has been flushed. */
1074
	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1075
		load_cr3(swapper_pg_dir);
1076
}
1077

1078
static void xen_drop_mm_ref(struct mm_struct *mm)
1079
{
1080
	cpumask_var_t mask;
1081
	unsigned cpu;
1082

1083
	if (current->active_mm == mm) {
1084
		if (current->mm == mm)
1085
			load_cr3(swapper_pg_dir);
1086
		else
1087
			leave_mm(smp_processor_id());
1088
	}
1089

1090
	/* Get the "official" set of cpus referring to our pagetable. */
1091
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1092
		for_each_online_cpu(cpu) {
1093
			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1094
			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1095
				continue;
1096
			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1097
		}
1098
		return;
1099
	}
1100
	cpumask_copy(mask, mm_cpumask(mm));
1101

1102
	/* It's possible that a vcpu may have a stale reference to our
1103
	   cr3, because its in lazy mode, and it hasn't yet flushed
1104
	   its set of pending hypercalls yet.  In this case, we can
1105
	   look at its actual current cr3 value, and force it to flush
1106
	   if needed. */
1107
	for_each_online_cpu(cpu) {
1108
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1109
			cpumask_set_cpu(cpu, mask);
1110
	}
1111

1112
	if (!cpumask_empty(mask))
1113
		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1114
	free_cpumask_var(mask);
1115
}
1116
#else
1117
static void xen_drop_mm_ref(struct mm_struct *mm)
1118
{
1119
	if (current->active_mm == mm)
1120
		load_cr3(swapper_pg_dir);
1121
}
1122
#endif
1123

1124
/*
1125
 * While a process runs, Xen pins its pagetables, which means that the
1126
 * hypervisor forces it to be read-only, and it controls all updates
1127
 * to it.  This means that all pagetable updates have to go via the
1128
 * hypervisor, which is moderately expensive.
1129
 *
1130
 * Since we're pulling the pagetable down, we switch to use init_mm,
1131
 * unpin old process pagetable and mark it all read-write, which
1132
 * allows further operations on it to be simple memory accesses.
1133
 *
1134
 * The only subtle point is that another CPU may be still using the
1135
 * pagetable because of lazy tlb flushing.  This means we need need to
1136
 * switch all CPUs off this pagetable before we can unpin it.
1137
 */
1138
static void xen_exit_mmap(struct mm_struct *mm)
1139
{
1140
	get_cpu();		/* make sure we don't move around */
1141
	xen_drop_mm_ref(mm);
1142
	put_cpu();
1143

1144
	spin_lock(&mm->page_table_lock);
1145

1146
	/* pgd may not be pinned in the error exit path of execve */
1147
	if (xen_page_pinned(mm->pgd))
1148
		xen_pgd_unpin(mm);
1149

1150
	spin_unlock(&mm->page_table_lock);
1151
}
1152

1153
/* Intentionally empty pagetable_setup_start hook; @base is not used. */
static void __init xen_pagetable_setup_start(pgd_t *base)
{
}
1156

1157
/*
 * Reserve [@start, @end) through native_pagetable_reserve(), then make
 * every page from @end up to PFN_PHYS(pgt_buf_top) read-write again.
 */
static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
{
	u64 top;

	/* reserve the range used */
	native_pagetable_reserve(start, end);

	/* set as RW the rest */
	top = PFN_PHYS(pgt_buf_top);
	/* Cast explicitly so the arguments always match %llx. */
	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
	       (unsigned long long)end, (unsigned long long)top);
	for (; end < top; end += PAGE_SIZE)
		make_lowmem_page_readwrite(__va(end));
}
1170

1171
static void xen_post_allocator_init(void);
1172

1173
/*
 * pagetable_setup_done hook: set up the shared info page and then run
 * the post-allocator initialisation.  @base is not used.
 */
static void __init xen_pagetable_setup_done(pgd_t *base)
{
	xen_setup_shared_info();
	xen_post_allocator_init();
}
1178

1179
static void xen_write_cr2(unsigned long cr2)
1180
{
1181
	percpu_read(xen_vcpu)->arch.cr2 = cr2;
1182
}
1183

1184
static unsigned long xen_read_cr2(void)
1185
{
1186
	return percpu_read(xen_vcpu)->arch.cr2;
1187
}
1188

1189
unsigned long xen_read_cr2_direct(void)
1190
{
1191
	return percpu_read(xen_vcpu_info.arch.cr2);
1192
}
1193

1194
static void xen_flush_tlb(void)
1195
{
1196
	struct mmuext_op *op;
1197
	struct multicall_space mcs;
1198

1199
	preempt_disable();
1200

1201
	mcs = xen_mc_entry(sizeof(*op));
1202

1203
	op = mcs.args;
1204
	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1205
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1206

1207
	xen_mc_issue(PARAVIRT_LAZY_MMU);
1208

1209
	preempt_enable();
1210
}
1211

1212
static void xen_flush_tlb_single(unsigned long addr)
1213
{
1214
	struct mmuext_op *op;
1215
	struct multicall_space mcs;
1216

1217
	preempt_disable();
1218

1219
	mcs = xen_mc_entry(sizeof(*op));
1220
	op = mcs.args;
1221
	op->cmd = MMUEXT_INVLPG_LOCAL;
1222
	op->arg1.linear_addr = addr & PAGE_MASK;
1223
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1224

1225
	xen_mc_issue(PARAVIRT_LAZY_MMU);
1226

1227
	preempt_enable();
1228
}
1229

1230
static void xen_flush_tlb_others(const struct cpumask *cpus,
1231
				 struct mm_struct *mm, unsigned long va)
1232
{
1233
	struct {
1234
		struct mmuext_op op;
1235
#ifdef CONFIG_SMP
1236
		DECLARE_BITMAP(mask, num_processors);
1237
#else
1238
		DECLARE_BITMAP(mask, NR_CPUS);
1239
#endif
1240
	} *args;
1241
	struct multicall_space mcs;
1242

1243
	if (cpumask_empty(cpus))
1244
		return;		/* nothing to do */
1245

1246
	mcs = xen_mc_entry(sizeof(*args));
1247
	args = mcs.args;
1248
	args->op.arg2.vcpumask = to_cpumask(args->mask);
1249

1250
	/* Remove us, and any offline CPUS. */
1251
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1252
	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1253

1254
	if (va == TLB_FLUSH_ALL) {
1255
		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1256
	} else {
1257
		args->op.cmd = MMUEXT_INVLPG_MULTI;
1258
		args->op.arg1.linear_addr = va;
1259
	}
1260

1261
	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1262

1263
	xen_mc_issue(PARAVIRT_LAZY_MMU);
1264
}
1265

1266
static unsigned long xen_read_cr3(void)
1267
{
1268
	return percpu_read(xen_cr3);
1269
}
1270

1271
static void set_current_cr3(void *v)
1272
{
1273
	percpu_write(xen_current_cr3, (unsigned long)v);
1274
}
1275

1276
/*
 * Queue a base-pointer switch in the current multicall batch.
 *
 * @kernel selects MMUEXT_NEW_BASEPTR (true) or MMUEXT_NEW_USER_BASEPTR
 * (false).  @cr3 is the physical address of the new pgd; 0 is passed
 * through as mfn 0, which triggers a warning for the kernel pagetable.
 * For the kernel case xen_cr3 is updated right away, while
 * xen_current_cr3 is updated by a multicall callback.
 */
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct multicall_space mcs;
	struct mmuext_op *op;
	unsigned long mfn = 0;

	if (cr3)
		mfn = pfn_to_mfn(PFN_DOWN(cr3));

	WARN_ON(mfn == 0 && kernel);

	mcs = __xen_mc_entry(sizeof(*op));

	op = mcs.args;
	if (kernel)
		op->cmd = MMUEXT_NEW_BASEPTR;
	else
		op->cmd = MMUEXT_NEW_USER_BASEPTR;
	op->arg1.mfn = mfn;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	if (!kernel)
		return;

	percpu_write(xen_cr3, cr3);

	/* Update xen_current_cr3 once the batch has actually
	   been submitted. */
	xen_mc_callback(set_current_cr3, (void *)cr3);
}
1305

1306
static void xen_write_cr3(unsigned long cr3)
1307
{
1308
	BUG_ON(preemptible());
1309

1310
	xen_mc_batch();  /* disables interrupts */
1311

1312
	/* Update while interrupts are disabled, so its atomic with
1313
	   respect to ipis */
1314
	percpu_write(xen_cr3, cr3);
1315

1316
	__xen_write_cr3(true, cr3);
1317

1318
#ifdef CONFIG_X86_64
1319
	{
1320
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1321
		if (user_pgd)
1322
			__xen_write_cr3(false, __pa(user_pgd));
1323
		else
1324
			__xen_write_cr3(false, 0);
1325
	}
1326
#endif
1327

1328
	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1329
}
1330

1331
static int xen_pgd_alloc(struct mm_struct *mm)
1332
{
1333
	pgd_t *pgd = mm->pgd;
1334
	int ret = 0;
1335

1336
	BUG_ON(PagePinned(virt_to_page(pgd)));
1337

1338
#ifdef CONFIG_X86_64
1339
	{
1340
		struct page *page = virt_to_page(pgd);
1341
		pgd_t *user_pgd;
1342

1343
		BUG_ON(page->private != 0);
1344

1345
		ret = -ENOMEM;
1346

1347
		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1348
		page->private = (unsigned long)user_pgd;
1349

1350
		if (user_pgd != NULL) {
1351
			user_pgd[pgd_index(VSYSCALL_START)] =
1352
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1353
			ret = 0;
1354
		}
1355

1356
		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1357
	}
1358
#endif
1359

1360
	return ret;
1361
}
1362

1363
/* pgd_free hook: on 64-bit, free the user pgd page attached to @pgd, if any. */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (!user_pgd)
		return;

	free_page((unsigned long)user_pgd);
#endif
}
1372

1373
#ifdef CONFIG_X86_32
1374
/*
 * If *@ptep already holds a present entry, only let _PAGE_RW through
 * in @pte when the existing entry has it set; all other bits of @pte
 * are kept.  Returns the (possibly adjusted) pte.
 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
	pteval_t old = pte_val_ma(*ptep);

	if (old & _PAGE_PRESENT) {
		/* every non-RW bit, plus RW only if the old entry had it */
		pteval_t keep = (old & _PAGE_RW) | ~_PAGE_RW;

		pte = __pte_ma(keep & pte_val_ma(pte));
	}

	return pte;
}
1383
#else /* CONFIG_X86_64 */
1384
/*
 * Write-protect @pte when it must not be mapped RW during early boot:
 *  - for an ordinary pte slot, when the pfn lies inside the kernel
 *    pagetable buffer [pgt_buf_start, pgt_buf_top);
 *  - for an early_ioremap fixmap slot, when the pfn is anything other
 *    than the most recently allocated pagetable page (pgt_buf_end - 1).
 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);
	bool make_ro;

	if (is_early_ioremap_ptep(ptep))
		make_ro = pfn != (pgt_buf_end - 1);
	else
		make_ro = pfn >= pgt_buf_start && pfn < pgt_buf_top;

	if (make_ro)
		pte = pte_wrprotect(pte);

	return pte;
}
1401
#endif /* CONFIG_X86_64 */
1402

1403
/* Init-time set_pte while constructing initial pagetables, which
1404
   doesn't allow RO pagetable pages to be remapped RW */
1405
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1406
{
1407
	pte = mask_rw_pte(ptep, pte);
1408

1409
	xen_set_pte(ptep, pte);
1410
}
1411

1412
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1413
{
1414
	struct mmuext_op op;
1415
	op.cmd = cmd;
1416
	op.arg1.mfn = pfn_to_mfn(pfn);
1417
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1418
		BUG();
1419
}
1420

1421
/* Early in boot, while setting up the initial pagetable, assume
1422
   everything is pinned. */
1423
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1424
{
1425
#ifdef CONFIG_FLATMEM
1426
	BUG_ON(mem_map);	/* should only be used early */
1427
#endif
1428
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1429
	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1430
}
1431

1432
/* Used for pmd and pud */
1433
static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1434
{
1435
#ifdef CONFIG_FLATMEM
1436
	BUG_ON(mem_map);	/* should only be used early */
1437
#endif
1438
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1439
}
1440

1441
/* Early release_pte assumes that all pts are pinned, since there's
1442
   only init_mm and anything attached to that is pinned. */
1443
static void __init xen_release_pte_init(unsigned long pfn)
1444
{
1445
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1446
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1447
}
1448

1449
/* Early release hook for pmd pages: just make the page writable again. */
static void __init xen_release_pmd_init(unsigned long pfn)
{
	void *ptpage = __va(PFN_PHYS(pfn));

	make_lowmem_page_readwrite(ptpage);
}
1453

1454
/* This needs to make sure the new pte page is pinned iff its being
1455
   attached to a pinned pagetable. */
1456
static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1457
{
1458
	struct page *page = pfn_to_page(pfn);
1459

1460
	if (PagePinned(virt_to_page(mm->pgd))) {
1461
		SetPagePinned(page);
1462

1463
		if (!PageHighMem(page)) {
1464
			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1465
			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1466
				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1467
		} else {
1468
			/* make sure there are no stray mappings of
1469
			   this page */
1470
			kmap_flush_unused();
1471
		}
1472
	}
1473
}
1474

1475
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1476
{
1477
	xen_alloc_ptpage(mm, pfn, PT_PTE);
1478
}
1479

1480
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1481
{
1482
	xen_alloc_ptpage(mm, pfn, PT_PMD);
1483
}
1484

1485
/* This should never happen until we're OK to use struct page */
1486
static void xen_release_ptpage(unsigned long pfn, unsigned level)
1487
{
1488
	struct page *page = pfn_to_page(pfn);
1489

1490
	if (PagePinned(page)) {
1491
		if (!PageHighMem(page)) {
1492
			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1493
				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1494
			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1495
		}
1496
		ClearPagePinned(page);
1497
	}
1498
}
1499

1500
static void xen_release_pte(unsigned long pfn)
1501
{
1502
	xen_release_ptpage(pfn, PT_PTE);
1503
}
1504

1505
static void xen_release_pmd(unsigned long pfn)
1506
{
1507
	xen_release_ptpage(pfn, PT_PMD);
1508
}
1509

1510
#if PAGETABLE_LEVELS == 4
1511
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1512
{
1513
	xen_alloc_ptpage(mm, pfn, PT_PUD);
1514
}
1515

1516
static void xen_release_pud(unsigned long pfn)
1517
{
1518
	xen_release_ptpage(pfn, PT_PUD);
1519
}
1520
#endif
1521

1522
/*
 * 32-bit only: reserve the top of the address space starting at the
 * hypervisor's virt_start, as reported by XENVER_platform_parameters,
 * or at HYPERVISOR_VIRT_START if that query fails.  No-op on 64-bit.
 */
void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
	struct xen_platform_parameters pp;
	unsigned long top;

	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
		top = pp.virt_start;
	else
		top = HYPERVISOR_VIRT_START;

	reserve_top_address(-top);
#endif	/* CONFIG_X86_32 */
}
1534

1535
/*
1536
 * Like __va(), but returns address in the kernel mapping (which is
1537
 * all we have until the physical memory mapping has been set up.
1538
 */
1539
static void *__ka(phys_addr_t paddr)
1540
{
1541
#ifdef CONFIG_X86_64
1542
	return (void *)(paddr + __START_KERNEL_map);
1543
#else
1544
	return __va(paddr);
1545
#endif
1546
}
1547

1548
/* Convert a machine address to physical address */
1549
static unsigned long m2p(phys_addr_t maddr)
1550
{
1551
	phys_addr_t paddr;
1552

1553
	maddr &= PTE_PFN_MASK;
1554
	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1555

1556
	return paddr;
1557
}
1558

1559
/* Convert a machine address to kernel virtual */
1560
static void *m2v(phys_addr_t maddr)
1561
{
1562
	return __ka(m2p(maddr));
1563
}
1564

1565
/* Set the page permissions on an identity-mapped pages */
1566
static void set_page_prot(void *addr, pgprot_t prot)
1567
{
1568
	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1569
	pte_t pte = pfn_pte(pfn, prot);
1570

1571
	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1572
		BUG();
1573
}
1574

1575
/*
 * Fill @pmd with identity mappings for pfns [0, @max_pfn).
 *
 * pmd entries that are already present have their pte page reused;
 * missing ones get a pte page carved out of level1_ident_pgt (which is
 * allocated here from the brk area).  Only empty pte slots are filled
 * in.  Finally the newly used pte pages and @pmd itself are made
 * read-only.
 */
static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned used_ptes = 0;	/* entries consumed from level1_ident_pgt */
	unsigned long pfn = 0;

	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
				      PAGE_SIZE);

	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (!pmd_present(pmd[pmdidx])) {
			/* Check for free pte pages */
			if (used_ptes == LEVEL1_IDENT_ENTRIES)
				break;

			pte_page = &level1_ident_pgt[used_ptes];
			used_ptes += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		} else {
			pte_page = m2v(pmd[pmdidx].pmd);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
#ifdef CONFIG_X86_32
			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;
#endif

			/* leave existing entries alone */
			if (pte_none(pte_page[pteidx]))
				pte_page[pteidx] = pfn_pte(pfn, PAGE_KERNEL_EXEC);
		}
	}

	for (pteidx = 0; pteidx < used_ptes; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	set_page_prot(pmd, PAGE_KERNEL_RO);
}
1625

1626
/*
 * Ask the hypervisor where the machine-to-physical table lives and how
 * many entries it has; if the query fails, keep the existing table
 * pointer and assume MACH2PHYS_NR_ENTRIES entries.  The entry count
 * determines machine_to_phys_order.
 */
void __init xen_setup_machphys_mapping(void)
{
	struct xen_machphys_mapping mapping;
	unsigned long nr_ents = MACH2PHYS_NR_ENTRIES;

	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
		nr_ents = mapping.max_mfn + 1;
	}

	machine_to_phys_order = fls(nr_ents - 1);
}
1639

1640
#ifdef CONFIG_X86_64
1641
static void convert_pfn_mfn(void *v)
1642
{
1643
	pte_t *pte = v;
1644
	int i;
1645

1646
	/* All levels are converted the same way, so just treat them
1647
	   as ptes. */
1648
	for (i = 0; i < PTRS_PER_PTE; i++)
1649
		pte[i] = xen_make_pte(pte[i].pte);
1650
}
1651

1652
/*
1653
 * Set up the initial kernel pagetable.
1654
 *
1655
 * We can construct this by grafting the Xen provided pagetable into
1656
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1657
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1658
 * means that only the kernel has a physical mapping to start with -
1659
 * but that's enough to get __va working.  We need to fill in the rest
1660
 * of the physical mapping once some sort of allocator has been set
1661
 * up.
1662
 */
1663
pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1664
					 unsigned long max_pfn)
1665
{
1666
	pud_t *l3;
1667
	pmd_t *l2;
1668

1669
	/* max_pfn_mapped is the last pfn mapped in the initial memory
1670
	 * mappings. Considering that on Xen after the kernel mappings we
1671
	 * have the mappings of some pages that don't exist in pfn space, we
1672
	 * set max_pfn_mapped to the last real pfn mapped. */
1673
	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1674

1675
	/* Zap identity mapping */
1676
	init_level4_pgt[0] = __pgd(0);
1677

1678
	/* Pre-constructed entries are in pfn, so convert to mfn */
1679
	convert_pfn_mfn(init_level4_pgt);
1680
	convert_pfn_mfn(level3_ident_pgt);
1681
	convert_pfn_mfn(level3_kernel_pgt);
1682

1683
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1684
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1685

1686
	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1687
	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1688

1689
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1690
	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1691
	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1692

1693
	/* Set up identity map */
1694
	xen_map_identity_early(level2_ident_pgt, max_pfn);
1695

1696
	/* Make pagetable pieces RO */
1697
	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1698
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1699
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1700
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1701
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1702
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1703

1704
	/* Pin down new L4 */
1705
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1706
			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1707

1708
	/* Unpin Xen-provided one */
1709
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1710

1711
	/* Switch over */
1712
	pgd = init_level4_pgt;
1713

1714
	/*
1715
	 * At this stage there can be no user pgd, and no page
1716
	 * structure to attach it to, so make sure we just set kernel
1717
	 * pgd.
1718
	 */
1719
	xen_mc_batch();
1720
	__xen_write_cr3(true, __pa(pgd));
1721
	xen_mc_issue(PARAVIRT_LAZY_CPU);
1722

1723
	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1724
		      __pa(xen_start_info->pt_base +
1725
			   xen_start_info->nr_pt_frames * PAGE_SIZE),
1726
		      "XEN PAGETABLES");
1727

1728
	return pgd;
1729
}
1730
#else	/* !CONFIG_X86_64 */
1731
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1732
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1733

1734
/*
 * Boot-time write_cr3 hook for 32-bit.  It only accepts the one switch
 * from initial_page_table to swapper_pg_dir; once that is done it
 * installs xen_write_cr3 as the regular write_cr3 operation.
 */
static void __init xen_write_cr3_init(unsigned long cr3)
{
	const size_t pmd_bytes = sizeof(pmd_t) * PTRS_PER_PMD;
	unsigned long swapper_pfn = PFN_DOWN(__pa(swapper_pg_dir));

	BUG_ON(read_cr3() != __pa(initial_page_table));
	BUG_ON(cr3 != __pa(swapper_pg_dir));

	/*
	 * We are switching to swapper_pg_dir for the first time (from
	 * initial_page_table) and therefore need to mark that page
	 * read-only and then pin it.
	 *
	 * Xen disallows sharing of kernel PMDs for PAE
	 * guests. Therefore we must copy the kernel PMD from
	 * initial_page_table into a new kernel PMD to be used in
	 * swapper_pg_dir.
	 */
	swapper_kernel_pmd = extend_brk(pmd_bytes, PAGE_SIZE);
	memcpy(swapper_kernel_pmd, initial_kernel_pmd, pmd_bytes);
	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);

	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	xen_write_cr3(cr3);
	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, swapper_pfn);

	/* The initial tables are no longer in use: unpin, make writable. */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	set_page_prot(initial_page_table, PAGE_KERNEL);
	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);

	pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
1770

1771
/*
 * 32-bit variant: build initial_page_table from the Xen-provided @pgd,
 * giving it a private copy of the kernel pmd extended with identity
 * mappings up to @max_pfn, then switch to it.  Returns
 * initial_page_table.
 */
pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	const size_t pmd_bytes = sizeof(pmd_t) * PTRS_PER_PMD;
	pmd_t *xen_pmd;

	initial_kernel_pmd = extend_brk(pmd_bytes, PAGE_SIZE);

	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
				  xen_start_info->nr_pt_frames * PAGE_SIZE +
				  512*1024);

	/* Start from a copy of the kernel pmd Xen gave us. */
	xen_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
	memcpy(initial_kernel_pmd, xen_pmd, pmd_bytes);

	xen_map_identity_early(initial_kernel_pmd, max_pfn);

	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
	initial_page_table[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);

	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	/* Unpin the Xen-provided table, pin and load the new one. */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	xen_write_cr3(__pa(initial_page_table));

	/* Keep the memory holding the Xen-built pagetables reserved. */
	memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
		      __pa(xen_start_info->pt_base +
			   xen_start_info->nr_pt_frames * PAGE_SIZE),
		      "XEN PAGETABLES");

	return initial_page_table;
}
1809
#endif	/* CONFIG_X86_64 */
1810

1811
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1812

1813
/*
 * set_fixmap hook.  Builds the pte for fixmap slot @idx according to
 * what kind of slot it is:
 *  - slots that map the domain's own pages take @phys as a pfn;
 *  - the local APIC and IO APIC slots are pointed at dummy_mapping;
 *  - FIX_PARAVIRT_BOOTMAP takes @phys as an mfn;
 *  - anything else is treated as a hardware mapping: @phys is an mfn
 *    and _PAGE_IOMAP is added to @prot.
 */
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	unsigned long frame = phys >> PAGE_SHIFT;
	pte_t pte;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
	case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
	case FIX_TEXT_POKE0:
	case FIX_TEXT_POKE1:
		/* All local page mappings */
		pte = pfn_pte(frame, prot);
		break;

#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

#ifdef CONFIG_X86_IO_APIC
	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
		/*
		 * We just don't map the IO APIC - all access is via
		 * hypercalls.  The slot is pointed at dummy_mapping
		 * and the requested address is ignored.
		 */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

	case FIX_PARAVIRT_BOOTMAP:
		/* This is an MFN, but it isn't an IO mapping from the
		   IO domain */
		pte = mfn_pte(frame, prot);
		break;

	default:
		/* By default, set_fixmap is used for hardware mappings */
		pte = mfn_pte(frame, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_64
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);

		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}
1878

1879
/*
 * If we're dom0, linearly map the ISA machine addresses
 * [ISA_START_ADDRESS, ISA_END_ADDRESS) into the kernel's address space
 * at PAGE_OFFSET + address, then flush the TLB.  Does nothing in any
 * other domain.
 */
void __init xen_ident_map_ISA(void)
{
	unsigned long pa;

	if (!xen_initial_domain())
		return;

	xen_raw_printk("Xen: setup ISA identity maps\n");

	for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
		pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
		int rc;

		rc = HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0);
		BUG_ON(rc);
	}

	xen_flush_tlb();
}
1901

1902
/*
 * Replace the boot-time pv_mmu_ops entries (the *_init and *_hyper
 * variants installed by xen_mmu_ops) with the regular run-time
 * implementations, then mark the init_mm pagetable as pinned.
 */
static void __init xen_post_allocator_init(void)
{
#ifdef CONFIG_XEN_DEBUG
	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
#endif

	/* entry setters */
	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.set_pgd = xen_set_pgd;
#endif

	/* This will work as long as patching hasn't happened yet
	   (which it hasn't) */
	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.release_pmd = xen_release_pmd;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif

#ifdef CONFIG_X86_64
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}
1930

1931
/*
 * lazy_mode.leave hook: flush the pending multicalls and then leave
 * lazy MMU mode, with preemption disabled around both steps.
 */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();

	xen_mc_flush();
	paravirt_leave_lazy_mmu();

	preempt_enable();
}
1938

1939
/*
 * Boot-time MMU operations, copied into pv_mmu_ops by
 * xen_init_mmu_ops().  Several entries are early-boot variants that
 * xen_post_allocator_init() (and, on 32-bit, xen_write_cr3_init())
 * replace later.
 */
static const struct pv_mmu_ops xen_mmu_ops __initconst = {
	/* control registers */
	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
#ifdef CONFIG_X86_32
	.write_cr3 = xen_write_cr3_init,
#else
	.write_cr3 = xen_write_cr3,
#endif

	/* TLB flushing */
	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,

	/* pgd lifetime */
	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	/* pagetable page allocation/release (early variants) */
	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pmd_init,
	.release_pmd = xen_release_pmd_init,

	/* entry setters (early variants) */
	.set_pte = xen_set_pte_init,
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	/* entry value conversion */
	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif	/* CONFIG_X86_PAE */
	.set_pud = xen_set_pud_hyper,

	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if PAGETABLE_LEVELS == 4
	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
	.set_pgd = xen_set_pgd_hyper,

	/* pud pages reuse the early pmd hooks */
	.alloc_pud = xen_alloc_pmd_init,
	.release_pud = xen_release_pmd_init,
#endif	/* PAGETABLE_LEVELS == 4 */

	/* mm lifetime */
	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy_mmu,
	},

	.set_fixmap = xen_set_fixmap,
};
2009

2010
/*
 * Install the Xen pagetable hooks in x86_init, switch pv_mmu_ops over
 * to xen_mmu_ops, and fill dummy_mapping with 0xff bytes.
 */
void __init xen_init_mmu_ops(void)
{
	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;

	pv_mmu_ops = xen_mmu_ops;

	/* dummy_mapping is exactly PAGE_SIZE bytes long */
	memset(dummy_mapping, 0xff, sizeof(dummy_mapping));
}
2019

2020
/* discontig_frames (below) is protected by xen_reservation_lock. */
2021
#define MAX_CONTIG_ORDER 9 /* 2MB */
2022
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2023

2024
#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2025
static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2026
				unsigned long *in_frames,
2027
				unsigned long *out_frames)
2028
{
2029
	int i;
2030
	struct multicall_space mcs;
2031

2032
	xen_mc_batch();
2033
	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2034
		mcs = __xen_mc_entry(0);
2035

2036
		if (in_frames)
2037
			in_frames[i] = virt_to_mfn(vaddr);
2038

2039
		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2040
		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2041

2042
		if (out_frames)
2043
			out_frames[i] = virt_to_pfn(vaddr);
2044
	}
2045
	xen_mc_issue(0);
2046
}
2047

2048
/*
2049
 * Update the pfn-to-mfn mappings for a virtual address range, either to
2050
 * point to an array of mfns, or contiguously from a single starting
2051
 * mfn.
2052
 */
2053
static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2054
				     unsigned long *mfns,
2055
				     unsigned long first_mfn)
2056
{
2057
	unsigned i, limit;
2058
	unsigned long mfn;
2059

2060
	xen_mc_batch();
2061

2062
	limit = 1u << order;
2063
	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2064
		struct multicall_space mcs;
2065
		unsigned flags;
2066

2067
		mcs = __xen_mc_entry(0);
2068
		if (mfns)
2069
			mfn = mfns[i];
2070
		else
2071
			mfn = first_mfn + i;
2072

2073
		if (i < (limit - 1))
2074
			flags = 0;
2075
		else {
2076
			if (order == 0)
2077
				flags = UVMF_INVLPG | UVMF_ALL;
2078
			else
2079
				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2080
		}
2081

2082
		MULTI_update_va_mapping(mcs.mc, vaddr,
2083
				mfn_pte(mfn, PAGE_KERNEL), flags);
2084

2085
		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2086
	}
2087

2088
	xen_mc_issue(0);
2089
}
2090

2091
/*
2092
 * Perform the hypercall to exchange a region of our pfns to point to
2093
 * memory with the required contiguous alignment.  Takes the pfns as
2094
 * input, and populates mfns as output.
2095
 *
2096
 * Returns a success code indicating whether the hypervisor was able to
2097
 * satisfy the request or not.
2098
 */
2099
static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2100
			       unsigned long *pfns_in,
2101
			       unsigned long extents_out,
2102
			       unsigned int order_out,
2103
			       unsigned long *mfns_out,
2104
			       unsigned int address_bits)
2105
{
2106
	long rc;
2107
	int success;
2108

2109
	struct xen_memory_exchange exchange = {
2110
		.in = {
2111
			.nr_extents   = extents_in,
2112
			.extent_order = order_in,
2113
			.extent_start = pfns_in,
2114
			.domid        = DOMID_SELF
2115
		},
2116
		.out = {
2117
			.nr_extents   = extents_out,
2118
			.extent_order = order_out,
2119
			.extent_start = mfns_out,
2120
			.address_bits = address_bits,
2121
			.domid        = DOMID_SELF
2122
		}
2123
	};
2124

2125
	BUG_ON(extents_in << order_in != extents_out << order_out);
2126

2127
	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2128
	success = (exchange.nr_exchanged == extents_in);
2129

2130
	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2131
	BUG_ON(success && (rc != 0));
2132

2133
	return success;
2134
}
2135

2136
int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2137
				 unsigned int address_bits)
2138
{
2139
	unsigned long *in_frames = discontig_frames, out_frame;
2140
	unsigned long  flags;
2141
	int            success;
2142

2143
	/*
2144
	 * Currently an auto-translated guest will not perform I/O, nor will
2145
	 * it require PAE page directories below 4GB. Therefore any calls to
2146
	 * this function are redundant and can be ignored.
2147
	 */
2148

2149
	if (xen_feature(XENFEAT_auto_translated_physmap))
2150
		return 0;
2151

2152
	if (unlikely(order > MAX_CONTIG_ORDER))
2153
		return -ENOMEM;
2154

2155
	memset((void *) vstart, 0, PAGE_SIZE << order);
2156

2157
	spin_lock_irqsave(&xen_reservation_lock, flags);
2158

2159
	/* 1. Zap current PTEs, remembering MFNs. */
2160
	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2161

2162
	/* 2. Get a new contiguous memory extent. */
2163
	out_frame = virt_to_pfn(vstart);
2164
	success = xen_exchange_memory(1UL << order, 0, in_frames,
2165
				      1, order, &out_frame,
2166
				      address_bits);
2167

2168
	/* 3. Map the new extent in place of old pages. */
2169
	if (success)
2170
		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2171
	else
2172
		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2173

2174
	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2175

2176
	return success ? 0 : -ENOMEM;
2177
}
2178
EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2179

2180
void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2181
{
2182
	unsigned long *out_frames = discontig_frames, in_frame;
2183
	unsigned long  flags;
2184
	int success;
2185

2186
	if (xen_feature(XENFEAT_auto_translated_physmap))
2187
		return;
2188

2189
	if (unlikely(order > MAX_CONTIG_ORDER))
2190
		return;
2191

2192
	memset((void *) vstart, 0, PAGE_SIZE << order);
2193

2194
	spin_lock_irqsave(&xen_reservation_lock, flags);
2195

2196
	/* 1. Find start MFN of contiguous extent. */
2197
	in_frame = virt_to_mfn(vstart);
2198

2199
	/* 2. Zap current PTEs. */
2200
	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2201

2202
	/* 3. Do the exchange for non-contiguous MFNs. */
2203
	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2204
					0, out_frames, 0);
2205

2206
	/* 4. Map new pages in place of old pages. */
2207
	if (success)
2208
		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2209
	else
2210
		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2211

2212
	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2213
}
2214
EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2215

2216
#ifdef CONFIG_XEN_PVHVM
2217
static void xen_hvm_exit_mmap(struct mm_struct *mm)
2218
{
2219
	struct xen_hvm_pagetable_dying a;
2220
	int rc;
2221

2222
	a.domid = DOMID_SELF;
2223
	a.gpa = __pa(mm->pgd);
2224
	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2225
	WARN_ON_ONCE(rc < 0);
2226
}
2227

2228
static int is_pagetable_dying_supported(void)
2229
{
2230
	struct xen_hvm_pagetable_dying a;
2231
	int rc = 0;
2232

2233
	a.domid = DOMID_SELF;
2234
	a.gpa = 0x00;
2235
	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2236
	if (rc < 0) {
2237
		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2238
		return 0;
2239
	}
2240
	return 1;
2241
}
2242

2243
/*
 * Install xen_hvm_exit_mmap as the pv_mmu_ops exit_mmap hook, but only
 * if the HVMOP_pagetable_dying probe succeeds.
 */
void __init xen_hvm_init_mmu_ops(void)
{
	if (!is_pagetable_dying_supported())
		return;

	pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
}
2248
#endif
2249

2250
#define REMAP_BATCH_SIZE 16
2251

2252
struct remap_data {
2253
	unsigned long mfn;
2254
	pgprot_t prot;
2255
	struct mmu_update *mmu_update;
2256
};
2257

2258
/*
 * apply_to_page_range() callback: queue one mmu_update request for the
 * pte at @ptep instead of writing it directly.
 *
 * @data is a struct remap_data.  The request targets the machine
 * address of @ptep and carries a special pte built from the current
 * frame number and protection.  Afterwards both the frame number and
 * the request cursor are advanced by one.  @token and @addr are unused.
 *
 * Always returns 0.
 */
static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
				 unsigned long addr, void *data)
{
	struct remap_data *rmd = data;
	struct mmu_update *req = rmd->mmu_update;
	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn, rmd->prot));

	req->ptr = virt_to_machine(ptep).maddr;
	req->val = pte_val_ma(pte);

	/* Step on to the next frame and the next request slot. */
	rmd->mfn++;
	rmd->mmu_update = req + 1;

	return 0;
}
2270

2271
int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2272
			       unsigned long addr,
2273
			       unsigned long mfn, int nr,
2274
			       pgprot_t prot, unsigned domid)
2275
{
2276
	struct remap_data rmd;
2277
	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2278
	int batch;
2279
	unsigned long range;
2280
	int err = 0;
2281

2282
	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2283

2284
	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2285
				(VM_PFNMAP | VM_RESERVED | VM_IO)));
2286

2287
	rmd.mfn = mfn;
2288
	rmd.prot = prot;
2289

2290
	while (nr) {
2291
		batch = min(REMAP_BATCH_SIZE, nr);
2292
		range = (unsigned long)batch << PAGE_SHIFT;
2293

2294
		rmd.mmu_update = mmu_update;
2295
		err = apply_to_page_range(vma->vm_mm, addr, range,
2296
					  remap_area_mfn_pte_fn, &rmd);
2297
		if (err)
2298
			goto out;
2299

2300
		err = -EFAULT;
2301
		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2302
			goto out;
2303

2304
		nr -= batch;
2305
		addr += range;
2306
	}
2307

2308
	err = 0;
2309
out:
2310

2311
	flush_tlb_all();
2312

2313
	return err;
2314
}
2315
EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2316

2317
#ifdef CONFIG_XEN_DEBUG_FS
2318
static int p2m_dump_open(struct inode *inode, struct file *filp)
2319
{
2320
	return single_open(filp, p2m_dump_show, NULL);
2321
}
2322

2323
static const struct file_operations p2m_dump_fops = {
2324
	.open		= p2m_dump_open,
2325
	.read		= seq_read,
2326
	.llseek		= seq_lseek,
2327
	.release	= single_release,
2328
};
2329
#endif /* CONFIG_XEN_DEBUG_FS */
2330

2331
Product

Resources

Company