CoCalc -- mmu

GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/xen/mmu_pv.c
²⁶⁴²⁴ views
1
// SPDX-License-Identifier: GPL-2.0
2

3
/*
4
 * Xen mmu operations
5
 *
6
 * This file contains the various mmu fetch and update operations.
7
 * The most important job they must perform is the mapping between the
8
 * domain's pfn and the overall machine mfns.
9
 *
10
 * Xen allows guests to directly update the pagetable, in a controlled
11
 * fashion.  In other words, the guest modifies the same pagetable
12
 * that the CPU actually uses, which eliminates the overhead of having
13
 * a separate shadow pagetable.
14
 *
15
 * In order to allow this, it falls on the guest domain to map its
16
 * notion of a "physical" pfn - which is just a domain-local linear
17
 * address - into a real "machine address" which the CPU's MMU can
18
 * use.
19
 *
20
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
21
 * inserted directly into the pagetable.  When creating a new
22
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
23
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
24
 * the mfn back into a pfn.
25
 *
26
 * The other constraint is that all pages which make up a pagetable
27
 * must be mapped read-only in the guest.  This prevents uncontrolled
28
 * guest updates to the pagetable.  Xen strictly enforces this, and
29
 * will disallow any pagetable update which will end up mapping a
30
 * pagetable page RW, and will disallow using any writable page as a
31
 * pagetable.
32
 *
33
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
34
 * would need to validate the whole pagetable before going on.
35
 * Naturally, this is quite slow.  The solution is to "pin" a
36
 * pagetable, which enforces all the constraints on the pagetable even
37
 * when it is not actively in use.  This means that Xen can be assured
38
 * that it is still valid when you do load it into %cr3, and doesn't
39
 * need to revalidate it.
40
 *
41
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
42
 */
43
#include <linux/sched/mm.h>
44
#include <linux/debugfs.h>
45
#include <linux/bug.h>
46
#include <linux/vmalloc.h>
47
#include <linux/export.h>
48
#include <linux/init.h>
49
#include <linux/gfp.h>
50
#include <linux/memblock.h>
51
#include <linux/seq_file.h>
52
#include <linux/crash_dump.h>
53
#include <linux/pgtable.h>
54
#ifdef CONFIG_KEXEC_CORE
55
#include <linux/kexec.h>
56
#endif
57

58
#include <trace/events/xen.h>
59

60
#include <asm/tlbflush.h>
61
#include <asm/fixmap.h>
62
#include <asm/mmu_context.h>
63
#include <asm/setup.h>
64
#include <asm/paravirt.h>
65
#include <asm/e820/api.h>
66
#include <asm/linkage.h>
67
#include <asm/page.h>
68
#include <asm/init.h>
69
#include <asm/memtype.h>
70
#include <asm/smp.h>
71
#include <asm/tlb.h>
72

73
#include <asm/xen/hypercall.h>
74
#include <asm/xen/hypervisor.h>
75

76
#include <xen/xen.h>
77
#include <xen/page.h>
78
#include <xen/interface/xen.h>
79
#include <xen/interface/hvm/hvm_op.h>
80
#include <xen/interface/version.h>
81
#include <xen/interface/memory.h>
82
#include <xen/hvc-console.h>
83
#include <xen/swiotlb-xen.h>
84

85
#include "xen-ops.h"
86

87
/*
88
 * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
89
 * to avoid warnings with "-Wmissing-prototypes".
90
 */
91
pteval_t xen_pte_val(pte_t pte);
92
pgdval_t xen_pgd_val(pgd_t pgd);
93
pmdval_t xen_pmd_val(pmd_t pmd);
94
pudval_t xen_pud_val(pud_t pud);
95
p4dval_t xen_p4d_val(p4d_t p4d);
96
pte_t xen_make_pte(pteval_t pte);
97
pgd_t xen_make_pgd(pgdval_t pgd);
98
pmd_t xen_make_pmd(pmdval_t pmd);
99
pud_t xen_make_pud(pudval_t pud);
100
p4d_t xen_make_p4d(p4dval_t p4d);
101
pte_t xen_make_pte_init(pteval_t pte);
102

103
#ifdef CONFIG_X86_VSYSCALL_EMULATION
104
/* l3 pud for userspace vsyscall mapping */
105
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
106
#endif
107

108
/*
109
 * Protects atomic reservation decrease/increase against concurrent increases.
110
 * Also protects non-atomic updates of current_pages and balloon lists.
111
 */
112
static DEFINE_SPINLOCK(xen_reservation_lock);
113

114
/* Protected by xen_reservation_lock. */
115
#define MIN_CONTIG_ORDER 9 /* 2MB */
116
static unsigned int discontig_frames_order = MIN_CONTIG_ORDER;
117
static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata;
118
static unsigned long *discontig_frames __refdata = discontig_frames_early;
119
/*
 * True once discontig_frames points at an array obtained from
 * __get_free_pages() rather than at discontig_frames_early.
 */
static bool discontig_frames_dyn;
120

121
static int alloc_discontig_frames(unsigned int order)
122
{
123
	unsigned long *new_array, *old_array;
124
	unsigned int old_order;
125
	unsigned long flags;
126

127
	BUG_ON(order < MIN_CONTIG_ORDER);
128
	BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE);
129

130
	new_array = (unsigned long *)__get_free_pages(GFP_KERNEL,
131
						      order - MIN_CONTIG_ORDER);
132
	if (!new_array)
133
		return -ENOMEM;
134

135
	spin_lock_irqsave(&xen_reservation_lock, flags);
136

137
	old_order = discontig_frames_order;
138

139
	if (order > discontig_frames_order || !discontig_frames_dyn) {
140
		if (!discontig_frames_dyn)
141
			old_array = NULL;
142
		else
143
			old_array = discontig_frames;
144

145
		discontig_frames = new_array;
146
		discontig_frames_order = order;
147
		discontig_frames_dyn = true;
148
	} else {
149
		old_array = new_array;
150
	}
151

152
	spin_unlock_irqrestore(&xen_reservation_lock, flags);
153

154
	free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER);
155

156
	return 0;
157
}
158

159
/*
160
 * Note about cr3 (pagetable base) values:
161
 *
162
 * xen_cr3 contains the current logical cr3 value; it contains the
163
 * last set cr3.  This may not be the current effective cr3, because
164
 * its update may be being lazily deferred.  However, a vcpu looking
165
 * at its own cr3 can use this value knowing that everything will
166
 * be self-consistent.
167
 *
168
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
169
 * hypercall to set the vcpu cr3 is complete (so it may be a little
170
 * out of date, but it will never be set early).  If one vcpu is
171
 * looking at another vcpu's cr3 value, it should use this variable.
172
 */
173
DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
174
static DEFINE_PER_CPU(unsigned long, xen_current_cr3);	/* actual vcpu cr3 */
175

176
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
177

178
/*
 * Enabled by xen_after_bootmem().  While it is disabled,
 * xen_page_pinned() reports every page as pinned.
 */
static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
179

180
/*
181
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
182
 * redzone above it, so round it up to a PGD boundary.
183
 */
184
#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
185

186
void make_lowmem_page_readonly(void *vaddr)
187
{
188
	pte_t *pte, ptev;
189
	unsigned long address = (unsigned long)vaddr;
190
	unsigned int level;
191

192
	pte = lookup_address(address, &level);
193
	if (pte == NULL)
194
		return;		/* vaddr missing */
195

196
	ptev = pte_wrprotect(*pte);
197

198
	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
199
		BUG();
200
}
201

202
void make_lowmem_page_readwrite(void *vaddr)
203
{
204
	pte_t *pte, ptev;
205
	unsigned long address = (unsigned long)vaddr;
206
	unsigned int level;
207

208
	pte = lookup_address(address, &level);
209
	if (pte == NULL)
210
		return;		/* vaddr missing */
211

212
	ptev = pte_mkwrite_novma(*pte);
213

214
	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
215
		BUG();
216
}
217

218

219
/*
220
 * During early boot all page table pages are pinned, but we do not have struct
221
 * pages, so return true until struct pages are ready.
222
 */
223
static bool xen_page_pinned(void *ptr)
224
{
225
	if (static_branch_likely(&xen_struct_pages_ready)) {
226
		struct page *page = virt_to_page(ptr);
227

228
		return PagePinned(page);
229
	}
230
	return true;
231
}
232

233
static void xen_extend_mmu_update(const struct mmu_update *update)
234
{
235
	struct multicall_space mcs;
236
	struct mmu_update *u;
237

238
	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
239

240
	if (mcs.mc != NULL) {
241
		mcs.mc->args[1]++;
242
	} else {
243
		mcs = __xen_mc_entry(sizeof(*u));
244
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
245
	}
246

247
	u = mcs.args;
248
	*u = *update;
249
}
250

251
static void xen_extend_mmuext_op(const struct mmuext_op *op)
252
{
253
	struct multicall_space mcs;
254
	struct mmuext_op *u;
255

256
	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
257

258
	if (mcs.mc != NULL) {
259
		mcs.mc->args[1]++;
260
	} else {
261
		mcs = __xen_mc_entry(sizeof(*u));
262
		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
263
	}
264

265
	u = mcs.args;
266
	*u = *op;
267
}
268

269
/*
 * Set a pmd entry through a batched mmu_update request, with
 * preemption disabled around the batch.
 */
static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update req;

	preempt_disable();
	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	req.ptr = arbitrary_virt_to_machine(ptr).maddr;
	req.val = pmd_val_ma(val);
	xen_extend_mmu_update(&req);

	xen_mc_issue(XEN_LAZY_MMU);
	preempt_enable();
}
286

287
/*
 * Set a pmd entry: via the hypervisor when the containing page is
 * pinned, by a plain store otherwise.
 */
static void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	trace_xen_mmu_set_pmd(ptr, val);

	if (xen_page_pinned(ptr)) {
		xen_set_pmd_hyper(ptr, val);
		return;
	}

	/* If page is not pinned, we can just update the entry directly */
	*ptr = val;
}
300

301
/*
302
 * Associate a virtual page frame with a given physical page frame
303
 * and protection flags for that frame.
304
 */
305
void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	/* Build the pte from the machine frame and install it for vaddr. */
	pte_t pte = mfn_pte(mfn, flags);
	int rc = HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG);

	/* A failing hypercall is fatal. */
	if (rc)
		BUG();
}
311

312
/*
 * Try to queue a pte update in the current multicall batch.
 *
 * Only does so when the lazy mode is XEN_LAZY_MMU.  Returns true if
 * the update was queued, false if the caller has to perform it.
 */
static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update req;

	if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
		xen_mc_batch();

		req.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
		req.val = pte_val_ma(pteval);
		xen_extend_mmu_update(&req);

		xen_mc_issue(XEN_LAZY_MMU);
		return true;
	}

	return false;
}
329

330
/*
 * Set a pte: batched when possible, otherwise with an immediate
 * mmu_update hypercall.
 */
static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{
	struct mmu_update req;

	if (xen_batched_set_pte(ptep, pteval))
		return;

	/*
	 * Could call native_set_pte() here and trap and
	 * emulate the PTE write, but a hypercall is much cheaper.
	 */
	req.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
	req.val = pte_val_ma(pteval);
	HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
}
344

345
/* Traced entry point for setting a pte; the work is in __xen_set_pte(). */
static void xen_set_pte(pte_t *ptep, pte_t pteval)
{
	trace_xen_mmu_set_pte(ptep, pteval);

	__xen_set_pte(ptep, pteval);
}
350

351
/*
 * Start of a protection-modification transaction on *ptep.  The pte is
 * left untouched and handed back as it is; preserving its bits is the
 * job of the commit step.
 */
static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{
	struct mm_struct *mm = vma->vm_mm;

	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);

	/* Just return the pte as-is.  We preserve the bits on commit */
	return *ptep;
}
358

359
/*
 * Commit a protection-modification transaction: queue an mmu_update
 * for *ptep with the MMU_PT_UPDATE_PRESERVE_AD flag set in the request
 * pointer.
 */
static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma,
					unsigned long addr,
					pte_t *ptep, pte_t pte)
{
	struct mmu_update req;

	trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte);

	xen_mc_batch();

	req.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	req.val = pte_val_ma(pte);
	xen_extend_mmu_update(&req);

	xen_mc_issue(XEN_LAZY_MMU);
}
374

375
/* Assume pteval_t is equivalent to all the other *val_t types. */
376
/*
 * Translate the frame number in a present entry from mfn to pfn.
 *
 * Non-present values are returned unchanged.  If mfn_to_pfn() yields
 * ~0 the result keeps only the flag bits, with _PAGE_PRESENT cleared.
 */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	unsigned long mfn, pfn;
	pteval_t flags;

	if (!(val & _PAGE_PRESENT))
		return val;

	mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
	pfn = mfn_to_pfn(mfn);
	flags = val & PTE_FLAGS_MASK;

	if (unlikely(pfn == ~0))
		return flags & ~_PAGE_PRESENT;

	return ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
391

392
/*
 * Translate the frame number in a present entry from pfn to mfn.
 *
 * Non-present values are returned unchanged.
 */
static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	unsigned long pfn, mfn;
	pteval_t flags;

	if (!(val & _PAGE_PRESENT))
		return val;

	pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
	flags = val & PTE_FLAGS_MASK;
	mfn = __pfn_to_mfn(pfn);

	/*
	 * If there's no mfn for the pfn, then just create an
	 * empty non-present pte (frame 0, no flags).  Unfortunately
	 * this loses information about the original pfn, so
	 * pte_mfn_to_pfn is asymmetric.
	 */
	if (unlikely(mfn == INVALID_P2M_ENTRY))
		return 0;

	mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);

	return ((pteval_t)mfn << PAGE_SHIFT) | flags;
}
417

418
/* Value of @pte with its frame number run through pte_mfn_to_pfn(). */
__visible pteval_t xen_pte_val(pte_t pte)
{
	return pte_mfn_to_pfn(pte.pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
425

426
/* Value of @pgd with its frame number run through pte_mfn_to_pfn(). */
__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
	pgdval_t raw = pgd.pgd;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
431

432
/* Build a pte_t from @pte after running it through pte_pfn_to_mfn(). */
__visible pte_t xen_make_pte(pteval_t pte)
{
	return native_make_pte(pte_pfn_to_mfn(pte));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
439

440
/* Build a pgd_t from @pgd after running it through pte_pfn_to_mfn(). */
__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
	return native_make_pgd(pte_pfn_to_mfn(pgd));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
446

447
/* Value of @pmd with its frame number run through pte_mfn_to_pfn(). */
__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
	pmdval_t raw = pmd.pmd;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
452

453
/*
 * Set a pud entry through a batched mmu_update request, with
 * preemption disabled around the batch.
 */
static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update req;

	preempt_disable();
	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	req.ptr = arbitrary_virt_to_machine(ptr).maddr;
	req.val = pud_val_ma(val);
	xen_extend_mmu_update(&req);

	xen_mc_issue(XEN_LAZY_MMU);
	preempt_enable();
}
470

471
/*
 * Set a pud entry: via the hypervisor when the containing page is
 * pinned, by a plain store otherwise.
 */
static void xen_set_pud(pud_t *ptr, pud_t val)
{
	trace_xen_mmu_set_pud(ptr, val);

	if (xen_page_pinned(ptr)) {
		xen_set_pud_hyper(ptr, val);
		return;
	}

	/* If page is not pinned, we can just update the entry directly */
	*ptr = val;
}
484

485
/* Build a pmd_t from @pmd after running it through pte_pfn_to_mfn(). */
__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
	return native_make_pmd(pte_pfn_to_mfn(pmd));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
491

492
/* Value of @pud with its frame number run through pte_mfn_to_pfn(). */
__visible pudval_t xen_pud_val(pud_t pud)
{
	pudval_t raw = pud.pud;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
497

498
/* Build a pud_t from @pud after running it through pte_pfn_to_mfn(). */
__visible pud_t xen_make_pud(pudval_t pud)
{
	return native_make_pud(pte_pfn_to_mfn(pud));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
505

506
/*
 * Find the user-pagetable counterpart of a kernel pgd entry.
 *
 * @pgd points into a pgd page.  The user pgd page is read from the
 * ->private field of that page's struct page.  Returns the entry at the
 * same index in the user pgd, or NULL if the index is at or above
 * pgd_index(USER_LIMIT) or no user pgd is recorded.
 */
static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *base = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned idx = pgd - base;
	pgd_t *user_base;

	if (idx >= pgd_index(USER_LIMIT))
		return NULL;

	user_base = (pgd_t *)virt_to_page(base)->private;
	if (!user_base)
		return NULL;

	return user_base + idx;
}
521

522
/*
 * Queue an mmu_update request that sets a p4d entry.  The caller is
 * responsible for opening and issuing the multicall batch.
 */
static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
	struct mmu_update req;

	req.ptr = virt_to_machine(ptr).maddr;
	req.val = p4d_val_ma(val);

	xen_extend_mmu_update(&req);
}
530

531
/*
532
 * Raw hypercall-based set_p4d, intended for use in early boot before
533
 * there's a page structure.  This implies:
534
 *  1. The only existing pagetable is the kernel's
535
 *  2. It is always pinned
536
 *  3. It has no user pagetable attached to it
537
 */
538
static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
	/* Run the single update as its own batch, with preemption off. */
	preempt_disable();
	xen_mc_batch();

	__xen_set_p4d_hyper(ptr, val);

	xen_mc_issue(XEN_LAZY_MMU);
	preempt_enable();
}
550

551
/*
 * Set a p4d entry and, if one exists, the matching entry in the user
 * pagetable found by xen_get_user_pgd().
 *
 * Pinned pages are updated through one multicall batch covering both
 * entries; unpinned pages are written directly.
 */
static void xen_set_p4d(p4d_t *ptr, p4d_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
	pgd_t user_val;

	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);

	if (xen_page_pinned(ptr)) {
		/* If it's pinned, then we can at least batch the kernel and
		   user updates together. */
		xen_mc_batch();

		__xen_set_p4d_hyper(ptr, val);
		if (user_ptr)
			__xen_set_p4d_hyper((p4d_t *)user_ptr, val);

		xen_mc_issue(XEN_LAZY_MMU);
		return;
	}

	/* If page is not pinned, we can just update the entry directly */
	*ptr = val;
	if (!user_ptr)
		return;

	WARN_ON(xen_page_pinned(user_ptr));
	user_val.pgd = p4d_val_ma(val);
	*user_ptr = user_val;
}
580

581
/* Value of @p4d with its frame number run through pte_mfn_to_pfn(). */
__visible p4dval_t xen_p4d_val(p4d_t p4d)
{
	p4dval_t raw = p4d.p4d;

	return pte_mfn_to_pfn(raw);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
586

587
/* Build a p4d_t from @p4d after running it through pte_pfn_to_mfn(). */
__visible p4d_t xen_make_p4d(p4dval_t p4d)
{
	return native_make_p4d(pte_pfn_to_mfn(p4d));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
594

595
static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
596
			 void (*func)(struct mm_struct *mm, struct page *,
597
				      enum pt_level),
598
			 bool last, unsigned long limit)
599
{
600
	int i, nr;
601

602
	nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
603
	for (i = 0; i < nr; i++) {
604
		if (!pmd_none(pmd[i]))
605
			(*func)(mm, pmd_page(pmd[i]), PT_PTE);
606
	}
607
}
608

609
static void xen_pud_walk(struct mm_struct *mm, pud_t *pud,
610
			 void (*func)(struct mm_struct *mm, struct page *,
611
				      enum pt_level),
612
			 bool last, unsigned long limit)
613
{
614
	int i, nr;
615

616
	nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
617
	for (i = 0; i < nr; i++) {
618
		pmd_t *pmd;
619

620
		if (pud_none(pud[i]))
621
			continue;
622

623
		pmd = pmd_offset(&pud[i], 0);
624
		if (PTRS_PER_PMD > 1)
625
			(*func)(mm, virt_to_page(pmd), PT_PMD);
626
		xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit);
627
	}
628
}
629

630
static void xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
631
			 void (*func)(struct mm_struct *mm, struct page *,
632
				      enum pt_level),
633
			 bool last, unsigned long limit)
634
{
635
	pud_t *pud;
636

637

638
	if (p4d_none(*p4d))
639
		return;
640

641
	pud = pud_offset(p4d, 0);
642
	if (PTRS_PER_PUD > 1)
643
		(*func)(mm, virt_to_page(pud), PT_PUD);
644
	xen_pud_walk(mm, pud, func, last, limit);
645
}
646

647
/*
648
 * (Yet another) pagetable walker.  This one is intended for pinning a
649
 * pagetable.  This means that it walks a pagetable and calls the
650
 * callback function on each page it finds making up the page table,
651
 * at every level.  It walks the entire pagetable, but it only bothers
652
 * pinning pte pages which are below limit.  In the normal case this
653
 * will be STACK_TOP_MAX, but at boot we need to pin up to
654
 * FIXADDR_TOP.
655
 *
656
 * We must skip the Xen hole in the middle of the address space, just after
657
 * the big x86-64 virtual hole.
658
 */
659
static void __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			   void (*func)(struct mm_struct *mm, struct page *,
					enum pt_level),
			   unsigned long limit)
{
	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  Entries with an index
	 * in [hole_low, hole_high) are skipped.
	 */
	unsigned hole_low = pgd_index(GUARD_HOLE_BASE_ADDR);
	unsigned hole_high = pgd_index(GUARD_HOLE_END_ADDR);
	int idx, count;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	count = pgd_index(limit) + 1;
	for (idx = 0; idx < count; idx++) {
		bool in_hole = idx >= hole_low && idx < hole_high;

		if (in_hole || pgd_none(pgd[idx]))
			continue;

		/* Only the final pgd entry is walked as "last". */
		xen_p4d_walk(mm, p4d_offset(&pgd[idx], 0), func,
			     idx == count - 1, limit);
	}

	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	func(mm, virt_to_page(pgd), PT_PGD);
}
696

697
static void xen_pgd_walk(struct mm_struct *mm,
698
			 void (*func)(struct mm_struct *mm, struct page *,
699
				      enum pt_level),
700
			 unsigned long limit)
701
{
702
	__xen_pgd_walk(mm, mm->pgd, func, limit);
703
}
704

705
/* If we're using split pte locks, then take the page's lock and
706
   return a pointer to it.  Otherwise return NULL. */
707
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
708
{
709
	spinlock_t *ptl = NULL;
710

711
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
712
	ptl = ptlock_ptr(page_ptdesc(page));
713
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
714
#endif
715

716
	return ptl;
717
}
718

719
static void xen_pte_unlock(void *v)
720
{
721
	spinlock_t *ptl = v;
722
	spin_unlock(ptl);
723
}
724

725
static void xen_do_pin(unsigned level, unsigned long pfn)
726
{
727
	struct mmuext_op op;
728

729
	op.cmd = level;
730
	op.arg1.mfn = pfn_to_mfn(pfn);
731

732
	xen_extend_mmuext_op(&op);
733
}
734

735
static void xen_pin_page(struct mm_struct *mm, struct page *page,
736
			 enum pt_level level)
737
{
738
	unsigned pgfl = TestSetPagePinned(page);
739

740
	if (!pgfl) {
741
		void *pt = lowmem_page_address(page);
742
		unsigned long pfn = page_to_pfn(page);
743
		struct multicall_space mcs = __xen_mc_entry(0);
744
		spinlock_t *ptl;
745

746
		/*
747
		 * We need to hold the pagetable lock between the time
748
		 * we make the pagetable RO and when we actually pin
749
		 * it.  If we don't, then other users may come in and
750
		 * attempt to update the pagetable by writing it,
751
		 * which will fail because the memory is RO but not
752
		 * pinned, so Xen won't do the trap'n'emulate.
753
		 *
754
		 * If we're using split pte locks, we can't hold the
755
		 * entire pagetable's worth of locks during the
756
		 * traverse, because we may wrap the preempt count (8
757
		 * bits).  The solution is to mark RO and pin each PTE
758
		 * page while holding the lock.  This means the number
759
		 * of locks we end up holding is never more than a
760
		 * batch size (~32 entries, at present).
761
		 *
762
		 * If we're not using split pte locks, we needn't pin
763
		 * the PTE pages independently, because we're
764
		 * protected by the overall pagetable lock.
765
		 */
766
		ptl = NULL;
767
		if (level == PT_PTE)
768
			ptl = xen_pte_lock(page, mm);
769

770
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
771
					pfn_pte(pfn, PAGE_KERNEL_RO),
772
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
773

774
		if (ptl) {
775
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
776

777
			/* Queue a deferred unlock for when this batch
778
			   is completed. */
779
			xen_mc_callback(xen_pte_unlock, ptl);
780
		}
781
	}
782
}
783

784
/* This is called just after a mm has been created, but it has not
785
   been used yet.  We need to make sure that its pagetable is all
786
   read-only, and can be pinned. */
787
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	trace_xen_mmu_pgd_pin(mm, pgd);

	/* Everything below is queued in one batch and issued at the end. */
	xen_mc_batch();

	/* Walk the tree up to USER_LIMIT, then pin the top level. */
	__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
	xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

	/* The user pgd, if any, is handled the same way. */
	if (user_pgd) {
		struct page *user_page = virt_to_page(user_pgd);

		xen_pin_page(mm, user_page, PT_PGD);
		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
	}

	xen_mc_issue(0);
}
807

808
static void xen_pgd_pin(struct mm_struct *mm)
809
{
810
	__xen_pgd_pin(mm, mm->pgd);
811
}
812

813
/*
814
 * On save, we need to pin all pagetables to make sure they get their
815
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
816
 * them (unpinned pgds are not currently in use, probably because the
817
 * process is under construction or destruction).
818
 *
819
 * Expected to be called in stop_machine() ("equivalent to taking
820
 * every spinlock in the system"), so the locking doesn't really
821
 * matter all that much.
822
 */
823
void xen_mm_pin_all(void)
824
{
825
	struct page *page;
826

827
	spin_lock(&init_mm.page_table_lock);
828
	spin_lock(&pgd_lock);
829

830
	list_for_each_entry(page, &pgd_list, lru) {
831
		if (!PagePinned(page)) {
832
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
833
			SetPageSavePinned(page);
834
		}
835
	}
836

837
	spin_unlock(&pgd_lock);
838
	spin_unlock(&init_mm.page_table_lock);
839
}
840

841
/*
 * Pagetable-walk callback that only sets the Pinned flag on @page;
 * @mm and @level are unused.
 */
static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
				   enum pt_level level)
{
	SetPagePinned(page);
}
846

847
/*
848
 * The init_mm pagetable is really pinned as soon as it's created, but
849
 * that's before we have page structures to store the bits.  So do all
850
 * the book-keeping now once struct pages for allocated pages are
851
 * initialized. This happens only after memblock_free_all() is called.
852
 */
853
static void __init xen_after_bootmem(void)
{
	/* From here on xen_page_pinned() consults struct page. */
	static_branch_enable(&xen_struct_pages_ready);

#ifdef CONFIG_X86_VSYSCALL_EMULATION
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif

	/* Record the Pinned flag on every init_mm pagetable page. */
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);

	/* Swap the static early discontig_frames array for a dynamic one. */
	if (alloc_discontig_frames(MIN_CONTIG_ORDER) != 0)
		BUG();
}
864

865
static void xen_unpin_page(struct mm_struct *mm, struct page *page,
866
			   enum pt_level level)
867
{
868
	unsigned pgfl = TestClearPagePinned(page);
869

870
	if (pgfl) {
871
		void *pt = lowmem_page_address(page);
872
		unsigned long pfn = page_to_pfn(page);
873
		spinlock_t *ptl = NULL;
874
		struct multicall_space mcs;
875

876
		/*
877
		 * Do the converse to pin_page.  If we're using split
878
		 * pte locks, we must be holding the lock for while
879
		 * the pte page is unpinned but still RO to prevent
880
		 * concurrent updates from seeing it in this
881
		 * partially-pinned state.
882
		 */
883
		if (level == PT_PTE) {
884
			ptl = xen_pte_lock(page, mm);
885

886
			if (ptl)
887
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
888
		}
889

890
		mcs = __xen_mc_entry(0);
891

892
		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
893
					pfn_pte(pfn, PAGE_KERNEL),
894
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
895

896
		if (ptl) {
897
			/* unlock when batch completed */
898
			xen_mc_callback(xen_pte_unlock, ptl);
899
		}
900
	}
901
}
902

903
/* Release a pagetable's pages back as normal RW */
904
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	trace_xen_mmu_pgd_unpin(mm, pgd);

	/* Everything below is queued in one batch and issued at the end. */
	xen_mc_batch();

	/* Unpin the top level first ... */
	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	/* ... then the user pgd, if there is one ... */
	if (user_pgd) {
		struct page *user_page = virt_to_page(user_pgd);

		xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
		xen_unpin_page(mm, user_page, PT_PGD);
	}

	/* ... and finally walk the tree up to USER_LIMIT. */
	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}
924

925
static void xen_pgd_unpin(struct mm_struct *mm)
926
{
927
	__xen_pgd_unpin(mm, mm->pgd);
928
}
929

930
/*
931
 * On resume, undo any pinning done at save, so that the rest of the
932
 * kernel doesn't see any unexpected pinned pagetables.
933
 */
934
void xen_mm_unpin_all(void)
935
{
936
	struct page *page;
937

938
	spin_lock(&init_mm.page_table_lock);
939
	spin_lock(&pgd_lock);
940

941
	list_for_each_entry(page, &pgd_list, lru) {
942
		if (PageSavePinned(page)) {
943
			BUG_ON(!PagePinned(page));
944
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
945
			ClearPageSavePinned(page);
946
		}
947
	}
948

949
	spin_unlock(&pgd_lock);
950
	spin_unlock(&init_mm.page_table_lock);
951
}
952

953
static void xen_enter_mmap(struct mm_struct *mm)
954
{
955
	spin_lock(&mm->page_table_lock);
956
	xen_pgd_pin(mm);
957
	spin_unlock(&mm->page_table_lock);
958
}
959

960
static void drop_mm_ref_this_cpu(void *info)
961
{
962
	struct mm_struct *mm = info;
963

964
	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
965
		leave_mm();
966

967
	/*
968
	 * If this cpu still has a stale cr3 reference, then make sure
969
	 * it has been flushed.
970
	 */
971
	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
972
		xen_mc_flush();
973
}
974

975
#ifdef CONFIG_SMP
/*
 * Another cpu may still have their %cr3 pointing at the pagetable, so
 * we need to repoint it somewhere else before we can unpin it.
 */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t mask;
	unsigned cpu;

	drop_mm_ref_this_cpu(mm);

	/* Get the "official" set of cpus referring to our pagetable. */
	if (alloc_cpumask_var(&mask, GFP_ATOMIC)) {
		/*
		 * It's possible that a vcpu may have a stale reference to
		 * our cr3, because it's in lazy mode, and it hasn't yet
		 * flushed its set of pending hypercalls.  In this case, we
		 * can look at its actual current cr3 value, and force it to
		 * flush if needed.
		 */
		cpumask_clear(mask);
		for_each_online_cpu(cpu) {
			if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
				cpumask_set_cpu(cpu, mask);
		}

		smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
		free_cpumask_var(mask);
		return;
	}

	/* No cpumask available: call each affected cpu one at a time. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			smp_call_function_single(cpu, drop_mm_ref_this_cpu,
						 mm, 1);
	}
}
#else
/* Without CONFIG_SMP only the current cpu is handled. */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	drop_mm_ref_this_cpu(mm);
}
#endif
1019

1020
/*
1021
 * While a process runs, Xen pins its pagetables, which means that the
1022
 * hypervisor forces it to be read-only, and it controls all updates
1023
 * to it.  This means that all pagetable updates have to go via the
1024
 * hypervisor, which is moderately expensive.
1025
 *
1026
 * Since we're pulling the pagetable down, we switch to use init_mm,
1027
 * unpin old process pagetable and mark it all read-write, which
1028
 * allows further operations on it to be simple memory accesses.
1029
 *
1030
 * The only subtle point is that another CPU may be still using the
1031
 * pagetable because of lazy tlb flushing.  This means we need to
1032
 * switch all CPUs off this pagetable before we can unpin it.
1033
 */
1034
static void xen_exit_mmap(struct mm_struct *mm)
1035
{
1036
	get_cpu();		/* make sure we don't move around */
1037
	xen_drop_mm_ref(mm);
1038
	put_cpu();
1039

1040
	spin_lock(&mm->page_table_lock);
1041

1042
	/* pgd may not be pinned in the error exit path of execve */
1043
	if (xen_page_pinned(mm->pgd))
1044
		xen_pgd_unpin(mm);
1045

1046
	spin_unlock(&mm->page_table_lock);
1047
}
1048

1049
static void xen_post_allocator_init(void);
1050

1051
/*
 * Issue a single, unbatched mmuext_op with command @cmd for the machine
 * frame backing @pfn.  A failing hypercall is fatal.
 */
static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op req;
	int rc;

	req.cmd = cmd;
	req.arg1.mfn = pfn_to_mfn(pfn);

	rc = HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF);
	if (rc)
		BUG();
}
1060

1061
/*
 * Clear the populated level2_kernel_pgt entries covering
 * [vaddr, vaddr_end] that lie outside the kernel image, i.e. below
 * _text or above _brk_end rounded up to a PMD boundary.
 */
static void __init xen_cleanhighmap(unsigned long vaddr,
				    unsigned long vaddr_end)
{
	unsigned long text_start = (unsigned long) _text;
	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
	pmd_t *pmd_end = level2_kernel_pgt + PTRS_PER_PMD;

	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
	 * We include the PMD passed in on _both_ boundaries. */
	while (vaddr <= vaddr_end && pmd < pmd_end) {
		bool outside = vaddr < text_start || vaddr > kernel_end;

		if (!pmd_none(*pmd) && outside)
			set_pmd(pmd, __pmd(0));

		pmd++;
		vaddr += PMD_SIZE;
	}

	/* In case we did something silly, we should crash in this function
	 * instead of somewhere later and be confusing. */
	xen_mc_flush();
}
1080

1081
/*
1082
 * Make a page range writeable and free it.
1083
 */
1084
static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1085
{
1086
	void *vaddr = __va(paddr);
1087
	void *vaddr_end = vaddr + size;
1088

1089
	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1090
		make_lowmem_page_readwrite(vaddr);
1091

1092
	memblock_phys_free(paddr, size);
1093
}
1094

1095
/*
 * Release the single page backing the page table @pgtbl: optionally unpin
 * it (@unpin), clear its Pinned page flag and free it as a read-only page.
 */
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
{
	unsigned long table_pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;

	if (unpin)
		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(table_pa));

	ClearPagePinned(virt_to_page(__va(table_pa)));

	xen_free_ro_pages(table_pa, PAGE_SIZE);
}
1104

1105
/*
 * Free everything mapped through @pmd. A leaf entry frees the PMD_SIZE
 * range it maps; otherwise every populated pte's page is freed, the pmd
 * entry is cleared and the pte table page itself is released.
 */
static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
{
	unsigned long phys;
	pte_t *ptes;
	int n;

	if (pmd_leaf(*pmd)) {
		phys = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
		xen_free_ro_pages(phys, PMD_SIZE);
		return;
	}

	ptes = pte_offset_kernel(pmd, 0);
	for (n = 0; n < PTRS_PER_PTE; n++) {
		if (!pte_none(ptes[n])) {
			phys = pte_pfn(ptes[n]) << PAGE_SHIFT;
			xen_free_ro_pages(phys, PAGE_SIZE);
		}
	}

	set_pmd(pmd, __pmd(0));
	xen_cleanmfnmap_free_pgtbl(ptes, unpin);
}
1127

1128
/*
 * Free everything mapped through @pud. A leaf entry frees the PUD_SIZE
 * range it maps; otherwise each populated pmd is cleaned recursively, the
 * pud entry is cleared and the pmd table page itself is released.
 */
static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
{
	unsigned long phys;
	pmd_t *pmds;
	int n;

	if (pud_leaf(*pud)) {
		phys = pud_val(*pud) & PHYSICAL_PAGE_MASK;
		xen_free_ro_pages(phys, PUD_SIZE);
		return;
	}

	pmds = pmd_offset(pud, 0);
	for (n = 0; n < PTRS_PER_PMD; n++) {
		if (!pmd_none(pmds[n]))
			xen_cleanmfnmap_pmd(pmds + n, unpin);
	}

	set_pud(pud, __pud(0));
	xen_cleanmfnmap_free_pgtbl(pmds, unpin);
}
1149

1150
/*
 * Free everything mapped through @p4d. A leaf entry frees the P4D_SIZE
 * range it maps; otherwise each populated pud is cleaned recursively, the
 * p4d entry is cleared and the pud table page itself is released.
 */
static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
{
	unsigned long phys;
	pud_t *puds;
	int n;

	if (p4d_leaf(*p4d)) {
		phys = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
		xen_free_ro_pages(phys, P4D_SIZE);
		return;
	}

	puds = pud_offset(p4d, 0);
	for (n = 0; n < PTRS_PER_PUD; n++) {
		if (!pud_none(puds[n]))
			xen_cleanmfnmap_pud(puds + n, unpin);
	}

	set_p4d(p4d, __p4d(0));
	xen_cleanmfnmap_free_pgtbl(puds, unpin);
}
1171

1172
/*
1173
 * Since it is well isolated we can (and since it is perhaps large we should)
1174
 * also free the page tables mapping the initial P->M table.
1175
 */
1176
static void __init xen_cleanmfnmap(unsigned long vaddr)
1177
{
1178
	pgd_t *pgd;
1179
	p4d_t *p4d;
1180
	bool unpin;
1181

1182
	unpin = (vaddr == 2 * PGDIR_SIZE);
1183
	vaddr &= PMD_MASK;
1184
	pgd = pgd_offset_k(vaddr);
1185
	p4d = p4d_offset(pgd, 0);
1186
	if (!p4d_none(*p4d))
1187
		xen_cleanmfnmap_p4d(p4d, unpin);
1188
}
1189

1190
/*
 * Poison and release the p2m list originally supplied in
 * xen_start_info->mfn_list, unless xen_p2m_addr still points at it.
 */
static void __init xen_pagetable_p2m_free(void)
{
	unsigned long list = xen_start_info->mfn_list;
	unsigned long list_bytes;

	/* No memory or already called. */
	if ((unsigned long)xen_p2m_addr == list)
		return;

	list_bytes = PAGE_ALIGN(xen_start_info->nr_pages *
				sizeof(unsigned long));

	/* using __ka address and sticking INVALID_P2M_ENTRY! */
	memset((void *)list, 0xff, list_bytes);

	if (list < __START_KERNEL_map) {
		xen_cleanmfnmap(list);
		return;
	}

	/*
	 * The list lives in __ka space. The range to clean is rounded up to
	 * the PMD, so anybody still using the __ka address of xen_start_info
	 * or xen_start_info->shared_info at this stage is going to crash.
	 * Fortunately those were already revectored in
	 * xen_setup_kernel_pagetable.
	 */
	xen_cleanhighmap(list, list + roundup(list_bytes, PMD_SIZE));
	memblock_free((void *)list, list_bytes);
}
1223

1224
/*
 * Remove the __ka mappings of the Xen-provided page tables and switch
 * xen_start_info->pt_base over to its __va address.
 */
static void __init xen_pagetable_cleanhighmap(void)
{
	unsigned long pt_start = xen_start_info->pt_base;
	unsigned long pt_bytes = xen_start_info->nr_pt_frames * PAGE_SIZE;

	/*
	 * By now cleanup_highmap has already cleaned __ka space from
	 * _brk_limit up to max_pfn_mapped (the end of the ramdisk). Carry on
	 * by erasing the PMD entries that point to page tables - they remain
	 * accessible via __va at this stage.
	 * Xen aligns the memory end to a 4MB boundary, so for good measure
	 * the end is rounded up to PMD_SIZE * 2 as well. Anybody still using
	 * a __ka address of the initial boot-stack is going to crash. The
	 * xen_start_info has been taken care of already in
	 * xen_setup_kernel_pagetable.
	 */
	xen_cleanhighmap(pt_start, roundup(pt_start + pt_bytes, PMD_SIZE * 2));

	xen_start_info->pt_base = (unsigned long)__va(__pa(pt_start));
}
1244

1245
/*
 * Set up the p2m tree via xen_vmalloc_p2m_tree(), release the old p2m
 * list and its high mappings, then point xen_start_info->mfn_list at
 * xen_p2m_addr.
 */
static void __init xen_pagetable_p2m_setup(void)
{
	xen_vmalloc_p2m_tree();

	/* The old list is no longer needed: free it and clean up __ka space. */
	xen_pagetable_p2m_free();
	xen_pagetable_cleanhighmap();

	/* And revector! Bye bye old array */
	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
}
1256

1257
/*
 * Paging initialisation: switch set_pte to the hypercall variant, run
 * paging_init() and the post-allocator setup, then finish the p2m and
 * mfn-list bookkeeping.
 */
static void __init xen_pagetable_init(void)
{
	/*
	 * From here on most PTE writes target pagetables that have already
	 * been announced to Xen as such, which makes hypercalls the more
	 * efficient way to perform these updates.
	 */
	pv_ops.mmu.set_pte = __xen_set_pte;

	paging_init();
	xen_post_allocator_init();

	xen_pagetable_p2m_setup();

	/* Allocate and initialize top and mid mfn levels for p2m structure */
	xen_build_mfn_list_list();

	/* Remap memory freed due to conflicts with E820 map */
	xen_remap_memory();
	xen_setup_mfn_list_list();
}
1278

1279
static noinstr void xen_write_cr2(unsigned long cr2)
1280
{
1281
	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1282
}
1283

1284
static noinline void xen_flush_tlb(void)
1285
{
1286
	struct mmuext_op *op;
1287
	struct multicall_space mcs;
1288

1289
	preempt_disable();
1290

1291
	mcs = xen_mc_entry(sizeof(*op));
1292

1293
	op = mcs.args;
1294
	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1295
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1296

1297
	xen_mc_issue(XEN_LAZY_MMU);
1298

1299
	preempt_enable();
1300
}
1301

1302
static void xen_flush_tlb_one_user(unsigned long addr)
1303
{
1304
	struct mmuext_op *op;
1305
	struct multicall_space mcs;
1306

1307
	trace_xen_mmu_flush_tlb_one_user(addr);
1308

1309
	preempt_disable();
1310

1311
	mcs = xen_mc_entry(sizeof(*op));
1312
	op = mcs.args;
1313
	op->cmd = MMUEXT_INVLPG_LOCAL;
1314
	op->arg1.linear_addr = addr & PAGE_MASK;
1315
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1316

1317
	xen_mc_issue(XEN_LAZY_MMU);
1318

1319
	preempt_enable();
1320
}
1321

1322
static void xen_flush_tlb_multi(const struct cpumask *cpus,
1323
				const struct flush_tlb_info *info)
1324
{
1325
	struct {
1326
		struct mmuext_op op;
1327
		DECLARE_BITMAP(mask, NR_CPUS);
1328
	} *args;
1329
	struct multicall_space mcs;
1330
	const size_t mc_entry_size = sizeof(args->op) +
1331
		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
1332

1333
	trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
1334

1335
	if (cpumask_empty(cpus))
1336
		return;		/* nothing to do */
1337

1338
	mcs = xen_mc_entry(mc_entry_size);
1339
	args = mcs.args;
1340
	args->op.arg2.vcpumask = to_cpumask(args->mask);
1341

1342
	/* Remove any offline CPUs */
1343
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1344

1345
	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1346
	if (info->end != TLB_FLUSH_ALL &&
1347
	    (info->end - info->start) <= PAGE_SIZE) {
1348
		args->op.cmd = MMUEXT_INVLPG_MULTI;
1349
		args->op.arg1.linear_addr = info->start;
1350
	}
1351

1352
	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1353

1354
	xen_mc_issue(XEN_LAZY_MMU);
1355
}
1356

1357
static unsigned long xen_read_cr3(void)
1358
{
1359
	return this_cpu_read(xen_cr3);
1360
}
1361

1362
static void set_current_cr3(void *v)
1363
{
1364
	this_cpu_write(xen_current_cr3, (unsigned long)v);
1365
}
1366

1367
/*
 * Queue a base-pointer switch to @cr3: MMUEXT_NEW_BASEPTR when @kernel is
 * true, MMUEXT_NEW_USER_BASEPTR otherwise. A @cr3 of 0 is passed on as
 * mfn 0, which triggers a warning for the kernel variant.
 */
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct mmuext_op op;
	unsigned long mfn;

	trace_xen_mmu_write_cr3(kernel, cr3);

	mfn = cr3 ? pfn_to_mfn(PFN_DOWN(cr3)) : 0;

	WARN_ON(mfn == 0 && kernel);

	if (kernel)
		op.cmd = MMUEXT_NEW_BASEPTR;
	else
		op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = mfn;

	xen_extend_mmuext_op(&op);

	if (!kernel)
		return;

	this_cpu_write(xen_cr3, cr3);

	/*
	 * xen_current_cr3 is only updated once the batch has actually been
	 * submitted.
	 */
	xen_mc_callback(set_current_cr3, (void *)cr3);
}
1394
static void xen_write_cr3(unsigned long cr3)
1395
{
1396
	pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1397

1398
	BUG_ON(preemptible());
1399

1400
	xen_mc_batch();  /* disables interrupts */
1401

1402
	/* Update while interrupts are disabled, so its atomic with
1403
	   respect to ipis */
1404
	this_cpu_write(xen_cr3, cr3);
1405

1406
	__xen_write_cr3(true, cr3);
1407

1408
	if (user_pgd)
1409
		__xen_write_cr3(false, __pa(user_pgd));
1410
	else
1411
		__xen_write_cr3(false, 0);
1412

1413
	xen_mc_issue(XEN_LAZY_CPU);  /* interrupts restored */
1414
}
1415

1416
/*
1417
 * At the start of the day - when Xen launches a guest - it has already
 * built pagetables for the guest. We diligently look over them in
 * xen_setup_kernel_pagetable and graft them, as appropriate, into
 * init_top_pgt and its friends. Then, when we are happy, we load
 * the new init_top_pgt - and continue on.
1422
 *
1423
 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1424
 * up the rest of the pagetables. When it has completed it loads the cr3.
1425
 * N.B. that baremetal would start at 'start_kernel' (and the early
1426
 * #PF handler would create bootstrap pagetables) - so we are running
1427
 * with the same assumptions as what to do when write_cr3 is executed
1428
 * at this point.
1429
 *
1430
 * Since there are no user-page tables at all, we have two variants
1431
 * of xen_write_cr3 - the early bootup (this one), and the late one
1432
 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1433
 * the Linux kernel and user-space are both in ring 3 while the
1434
 * hypervisor is in ring 0.
1435
 */
1436
static void __init xen_write_cr3_init(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();  /* disables interrupts */

	/*
	 * Done while interrupts are off so the update is atomic with
	 * respect to IPIs.
	 */
	this_cpu_write(xen_cr3, cr3);

	/* Only the kernel base pointer is switched here. */
	__xen_write_cr3(true, cr3);

	xen_mc_issue(XEN_LAZY_CPU);  /* interrupts restored */
}
1450

1451
static int xen_pgd_alloc(struct mm_struct *mm)
1452
{
1453
	pgd_t *pgd = mm->pgd;
1454
	struct page *page = virt_to_page(pgd);
1455
	pgd_t *user_pgd;
1456
	int ret = -ENOMEM;
1457

1458
	BUG_ON(PagePinned(virt_to_page(pgd)));
1459
	BUG_ON(page->private != 0);
1460

1461
	user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1462
	page->private = (unsigned long)user_pgd;
1463

1464
	if (user_pgd != NULL) {
1465
#ifdef CONFIG_X86_VSYSCALL_EMULATION
1466
		user_pgd[pgd_index(VSYSCALL_ADDR)] =
1467
			__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1468
#endif
1469
		ret = 0;
1470
	}
1471

1472
	BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1473

1474
	return ret;
1475
}
1476

1477
/* Free the user pgd page attached to @pgd, if there is one. */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (!user_pgd)
		return;

	free_page((unsigned long)user_pgd);
}
1484

1485
/*
1486
 * Init-time set_pte while constructing initial pagetables, which
1487
 * doesn't allow RO page table pages to be remapped RW.
1488
 *
1489
 * If there is no MFN for this PFN then this page is initially
1490
 * ballooned out so clear the PTE (as in decrease_reservation() in
1491
 * drivers/xen/balloon.c).
1492
 *
1493
 * Many of these PTE updates are done on unpinned and writable pages
1494
 * and doing a hypercall for these is unnecessary and expensive.  At
1495
 * this point it is rarely possible to tell if a page is pinned, so
1496
 * mostly write the PTE directly and rely on Xen trapping and
1497
 * emulating any updates as necessary.
1498
 */
1499
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	/* Early-ioremap ptes take the __xen_set_pte() path. */
	if (unlikely(is_early_ioremap_ptep(ptep))) {
		__xen_set_pte(ptep, pte);
		return;
	}

	/* Everything else is written directly. */
	native_set_pte(ptep, pte);
}
1506

1507
/*
 * Init-time make_pte: build a pte from the pfn-based value @pte, converting
 * the pfn to an mfn and write-protecting frames of the initial p2m list.
 */
__visible pte_t xen_make_pte_init(pteval_t pte)
{
	unsigned long pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;

	/*
	 * Pages belonging to the initial p2m list mapped outside the default
	 * address range must be mapped read-only. That region also holds the
	 * page tables mapping the p2m list, and page tables MUST be mapped
	 * read-only.
	 */
	if (xen_start_info->mfn_list < __START_KERNEL_map) {
		unsigned long first = xen_start_info->first_p2m_pfn;

		if (pfn >= first &&
		    pfn < first + xen_start_info->nr_p2m_frames)
			pte &= ~_PAGE_RW;
	}

	return native_make_pte(pte_pfn_to_mfn(pte));
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
1527

1528
/* Early in boot, while setting up the initial pagetable, assume
1529
   everything is pinned. */
1530
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1531
{
1532
#ifdef CONFIG_FLATMEM
1533
	BUG_ON(mem_map);	/* should only be used early */
1534
#endif
1535
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1536
	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1537
}
1538

1539
/* Used for pmd and pud */
1540
static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1541
{
1542
#ifdef CONFIG_FLATMEM
1543
	BUG_ON(mem_map);	/* should only be used early */
1544
#endif
1545
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1546
}
1547

1548
/* Early release_pte assumes that all pts are pinned, since there's
1549
   only init_mm and anything attached to that is pinned. */
1550
static void __init xen_release_pte_init(unsigned long pfn)
1551
{
1552
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1553
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1554
}
1555

1556
/* Early release of a pmd page: simply make it writable again. */
static void __init xen_release_pmd_init(unsigned long pfn)
{
	void *table = __va(PFN_PHYS(pfn));

	make_lowmem_page_readwrite(table);
}
1560

1561
static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1562
{
1563
	struct multicall_space mcs;
1564
	struct mmuext_op *op;
1565

1566
	mcs = __xen_mc_entry(sizeof(*op));
1567
	op = mcs.args;
1568
	op->cmd = cmd;
1569
	op->arg1.mfn = pfn_to_mfn(pfn);
1570

1571
	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1572
}
1573

1574
/*
 * Add an update_va_mapping entry to the current multicall batch that
 * remaps the __va() address of @pfn with protection @prot.
 */
static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
{
	unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT);
	struct multicall_space slot = __xen_mc_entry(0);

	MULTI_update_va_mapping(slot.mc, va, pfn_pte(pfn, prot), 0);
}
1583

1584
/* This needs to make sure the new pte page is pinned iff its being
1585
   attached to a pinned pagetable. */
1586
static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1587
				    unsigned level)
1588
{
1589
	bool pinned = xen_page_pinned(mm->pgd);
1590

1591
	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1592

1593
	if (pinned) {
1594
		struct page *page = pfn_to_page(pfn);
1595

1596
		pinned = false;
1597
		if (static_branch_likely(&xen_struct_pages_ready)) {
1598
			pinned = PagePinned(page);
1599
			SetPagePinned(page);
1600
		}
1601

1602
		xen_mc_batch();
1603

1604
		__set_pfn_prot(pfn, PAGE_KERNEL_RO);
1605

1606
		if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) &&
1607
		    !pinned)
1608
			__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1609

1610
		xen_mc_issue(XEN_LAZY_MMU);
1611
	}
1612
}
1613

1614
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1615
{
1616
	xen_alloc_ptpage(mm, pfn, PT_PTE);
1617
}
1618

1619
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1620
{
1621
	xen_alloc_ptpage(mm, pfn, PT_PMD);
1622
}
1623

1624
/* This should never happen until we're OK to use struct page */
1625
static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1626
{
1627
	struct page *page = pfn_to_page(pfn);
1628
	bool pinned = PagePinned(page);
1629

1630
	trace_xen_mmu_release_ptpage(pfn, level, pinned);
1631

1632
	if (pinned) {
1633
		xen_mc_batch();
1634

1635
		if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS))
1636
			__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1637

1638
		__set_pfn_prot(pfn, PAGE_KERNEL);
1639

1640
		xen_mc_issue(XEN_LAZY_MMU);
1641

1642
		ClearPagePinned(page);
1643
	}
1644
}
1645

1646
static void xen_release_pte(unsigned long pfn)
1647
{
1648
	xen_release_ptpage(pfn, PT_PTE);
1649
}
1650

1651
static void xen_release_pmd(unsigned long pfn)
1652
{
1653
	xen_release_ptpage(pfn, PT_PMD);
1654
}
1655

1656
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1657
{
1658
	xen_alloc_ptpage(mm, pfn, PT_PUD);
1659
}
1660

1661
static void xen_release_pud(unsigned long pfn)
1662
{
1663
	xen_release_ptpage(pfn, PT_PUD);
1664
}
1665

1666
/*
1667
 * Like __va(), but returns address in the kernel mapping (which is
1668
 * all we have until the physical memory mapping has been set up.
1669
 */
1670
static void * __init __ka(phys_addr_t paddr)
1671
{
1672
	return (void *)(paddr + __START_KERNEL_map);
1673
}
1674

1675
/* Convert a machine address to physical address */
1676
static unsigned long __init m2p(phys_addr_t maddr)
1677
{
1678
	phys_addr_t paddr;
1679

1680
	maddr &= XEN_PTE_MFN_MASK;
1681
	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1682

1683
	return paddr;
1684
}
1685

1686
/* Convert a machine address to kernel virtual */
1687
static void * __init m2v(phys_addr_t maddr)
1688
{
1689
	return __ka(m2p(maddr));
1690
}
1691

1692
/* Set the page permissions on an identity-mapped pages */
1693
static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1694
				       unsigned long flags)
1695
{
1696
	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1697
	pte_t pte = pfn_pte(pfn, prot);
1698

1699
	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1700
		BUG();
1701
}
1702
/*
 * Set the protection of the page at @addr to @prot without requesting any
 * TLB flush (UVMF_NONE).
 */
static void __init set_page_prot(void *addr, pgprot_t prot)
{
	/* A void function must not "return" a value, not even a void one. */
	set_page_prot_flags(addr, prot, UVMF_NONE);
}
1706

1707
/*
 * Query the hypervisor for the machine-to-physical table. On success its
 * start address and entry count (max_mfn + 1) are recorded; otherwise only
 * the entry count is set, to MACH2PHYS_NR_ENTRIES.
 */
void __init xen_setup_machphys_mapping(void)
{
	struct xen_machphys_mapping mapping;

	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) != 0) {
		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
		return;
	}

	machine_to_phys_mapping = (unsigned long *)mapping.v_start;
	machine_to_phys_nr = mapping.max_mfn + 1;
}
1718

1719
/*
 * Rewrite every entry of the pagetable page @v through xen_make_pte().
 * All levels are converted the same way, so the page is simply treated as
 * an array of ptes.
 */
static void __init convert_pfn_mfn(void *v)
{
	pte_t *entry = v;
	pte_t *const end = entry + PTRS_PER_PTE;

	for (; entry < end; entry++)
		*entry = xen_make_pte(entry->pte);
}
1729
/*
 * If the page at @addr is the frame *@pt_base or the frame *@pt_end, make
 * it writable, clear it and move that boundary inwards by one frame.
 */
static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
				 unsigned long addr)
{
	void *page = (void *)addr;
	unsigned long pfn = PFN_DOWN(__pa(addr));

	if (*pt_base == pfn) {
		set_page_prot_flags(page, PAGE_KERNEL, UVMF_INVLPG);
		clear_page(page);
		*pt_base += 1;
	}

	if (*pt_end == pfn) {
		set_page_prot_flags(page, PAGE_KERNEL, UVMF_INVLPG);
		clear_page(page);
		*pt_end -= 1;
	}
}
1743
/*
1744
 * Set up the initial kernel pagetable.
1745
 *
1746
 * We can construct this by grafting the Xen provided pagetable into
1747
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1748
 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
1749
 * kernel has a physical mapping to start with - but that's enough to
1750
 * get __va working.  We need to fill in the rest of the physical
1751
 * mapping once some sort of allocator has been set up.
1752
 */
1753
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1754
{
1755
	pud_t *l3;
1756
	pmd_t *l2;
1757
	unsigned long addr[3];
1758
	unsigned long pt_base, pt_end;
1759
	unsigned i;
1760

1761
	/* max_pfn_mapped is the last pfn mapped in the initial memory
1762
	 * mappings. Considering that on Xen after the kernel mappings we
1763
	 * have the mappings of some pages that don't exist in pfn space, we
1764
	 * set max_pfn_mapped to the last real pfn mapped. */
1765
	if (xen_start_info->mfn_list < __START_KERNEL_map)
1766
		max_pfn_mapped = xen_start_info->first_p2m_pfn;
1767
	else
1768
		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1769

1770
	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1771
	pt_end = pt_base + xen_start_info->nr_pt_frames;
1772

1773
	/* Zap identity mapping */
1774
	init_top_pgt[0] = __pgd(0);
1775

1776
	/* Pre-constructed entries are in pfn, so convert to mfn */
1777
	/* L4[273] -> level3_ident_pgt  */
1778
	/* L4[511] -> level3_kernel_pgt */
1779
	convert_pfn_mfn(init_top_pgt);
1780

1781
	/* L3_i[0] -> level2_ident_pgt */
1782
	convert_pfn_mfn(level3_ident_pgt);
1783
	/* L3_k[510] -> level2_kernel_pgt */
1784
	/* L3_k[511] -> level2_fixmap_pgt */
1785
	convert_pfn_mfn(level3_kernel_pgt);
1786

1787
	/* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
1788
	convert_pfn_mfn(level2_fixmap_pgt);
1789

1790
	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
1791
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1792
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1793

1794
	addr[0] = (unsigned long)pgd;
1795
	addr[1] = (unsigned long)l3;
1796
	addr[2] = (unsigned long)l2;
1797
	/* Graft it onto L4[273][0]. Note that we creating an aliasing problem:
1798
	 * Both L4[273][0] and L4[511][510] have entries that point to the same
1799
	 * L2 (PMD) tables. Meaning that if you modify it in __va space
1800
	 * it will be also modified in the __ka space! (But if you just
1801
	 * modify the PMD table to point to other PTE's or none, then you
1802
	 * are OK - which is what cleanup_highmap does) */
1803
	copy_page(level2_ident_pgt, l2);
1804
	/* Graft it onto L4[511][510] */
1805
	copy_page(level2_kernel_pgt, l2);
1806

1807
	/*
1808
	 * Zap execute permission from the ident map. Due to the sharing of
1809
	 * L1 entries we need to do this in the L2.
1810
	 */
1811
	if (__supported_pte_mask & _PAGE_NX) {
1812
		for (i = 0; i < PTRS_PER_PMD; ++i) {
1813
			if (pmd_none(level2_ident_pgt[i]))
1814
				continue;
1815
			level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
1816
		}
1817
	}
1818

1819
	/* Copy the initial P->M table mappings if necessary. */
1820
	i = pgd_index(xen_start_info->mfn_list);
1821
	if (i && i < pgd_index(__START_KERNEL_map))
1822
		init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1823

1824
	/* Make pagetable pieces RO */
1825
	set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
1826
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1827
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1828
	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1829
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1830
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1831

1832
	for (i = 0; i < FIXMAP_PMD_NUM; i++) {
1833
		set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
1834
			      PAGE_KERNEL_RO);
1835
	}
1836

1837
	/* Pin down new L4 */
1838
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1839
			  PFN_DOWN(__pa_symbol(init_top_pgt)));
1840

1841
	/* Unpin Xen-provided one */
1842
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1843

1844
#ifdef CONFIG_X86_VSYSCALL_EMULATION
1845
	/* Pin user vsyscall L3 */
1846
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1847
	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1848
			  PFN_DOWN(__pa_symbol(level3_user_vsyscall)));
1849
#endif
1850

1851
	/*
1852
	 * At this stage there can be no user pgd, and no page structure to
1853
	 * attach it to, so make sure we just set kernel pgd.
1854
	 */
1855
	xen_mc_batch();
1856
	__xen_write_cr3(true, __pa(init_top_pgt));
1857
	xen_mc_issue(XEN_LAZY_CPU);
1858

1859
	/* We can't that easily rip out L3 and L2, as the Xen pagetables are
1860
	 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
1861
	 * the initial domain. For guests using the toolstack, they are in:
1862
	 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
1863
	 * rip out the [L4] (pgd), but for guests we shave off three pages.
1864
	 */
1865
	for (i = 0; i < ARRAY_SIZE(addr); i++)
1866
		check_pt_base(&pt_base, &pt_end, addr[i]);
1867

1868
	/* Our (by three pages) smaller Xen pagetable that we are using */
1869
	xen_pt_base = PFN_PHYS(pt_base);
1870
	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
1871
	memblock_reserve(xen_pt_base, xen_pt_size);
1872

1873
	/* Revector the xen_start_info */
1874
	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
1875
}
1876

1877
/*
1878
 * Read a value from a physical address.
1879
 */
1880
static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
1881
{
1882
	unsigned long *vaddr;
1883
	unsigned long val;
1884

1885
	vaddr = early_memremap_ro(addr, sizeof(val));
1886
	val = *vaddr;
1887
	early_memunmap(vaddr, sizeof(val));
1888
	return val;
1889
}
1890

1891
/*
1892
 * Translate a virtual address to a physical one without relying on mapped
1893
 * page tables. Don't rely on big pages being aligned in (guest) physical
1894
 * space!
1895
 */
1896
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
1897
{
1898
	phys_addr_t pa;
1899
	pgd_t pgd;
1900
	pud_t pud;
1901
	pmd_t pmd;
1902
	pte_t pte;
1903

1904
	pa = read_cr3_pa();
1905
	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
1906
						       sizeof(pgd)));
1907
	if (!pgd_present(pgd))
1908
		return 0;
1909

1910
	pa = pgd_val(pgd) & PTE_PFN_MASK;
1911
	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
1912
						       sizeof(pud)));
1913
	if (!pud_present(pud))
1914
		return 0;
1915
	pa = pud_val(pud) & PTE_PFN_MASK;
1916
	if (pud_leaf(pud))
1917
		return pa + (vaddr & ~PUD_MASK);
1918

1919
	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
1920
						       sizeof(pmd)));
1921
	if (!pmd_present(pmd))
1922
		return 0;
1923
	pa = pmd_val(pmd) & PTE_PFN_MASK;
1924
	if (pmd_leaf(pmd))
1925
		return pa + (vaddr & ~PMD_MASK);
1926

1927
	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
1928
						       sizeof(pte)));
1929
	if (!pte_present(pte))
1930
		return 0;
1931
	pa = pte_pfn(pte) << PAGE_SHIFT;
1932

1933
	return pa | (vaddr & ~PAGE_MASK);
1934
}
1935

1936
/*
1937
 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
1938
 * this area.
1939
 */
1940
void __init xen_relocate_p2m(void)
1941
{
1942
	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
1943
	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
1944
	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
1945
	pte_t *pt;
1946
	pmd_t *pmd;
1947
	pud_t *pud;
1948
	pgd_t *pgd;
1949
	unsigned long *new_p2m;
1950

1951
	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1952
	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
1953
	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
1954
	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
1955
	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
1956
	n_frames = n_pte + n_pt + n_pmd + n_pud;
1957

1958
	new_area = xen_find_free_area(PFN_PHYS(n_frames));
1959
	if (!new_area) {
1960
		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
1961
		BUG();
1962
	}
1963

1964
	/*
1965
	 * Setup the page tables for addressing the new p2m list.
1966
	 * We have asked the hypervisor to map the p2m list at the user address
1967
	 * PUD_SIZE. It may have done so, or it may have used a kernel space
1968
	 * address depending on the Xen version.
1969
	 * To avoid any possible virtual address collision, just use
1970
	 * 2 * PUD_SIZE for the new area.
1971
	 */
1972
	pud_phys = new_area;
1973
	pmd_phys = pud_phys + PFN_PHYS(n_pud);
1974
	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
1975
	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
1976

1977
	pgd = __va(read_cr3_pa());
1978
	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
1979
	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
1980
		pud = early_memremap(pud_phys, PAGE_SIZE);
1981
		clear_page(pud);
1982
		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
1983
				idx_pmd++) {
1984
			pmd = early_memremap(pmd_phys, PAGE_SIZE);
1985
			clear_page(pmd);
1986
			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
1987
					idx_pt++) {
1988
				pt = early_memremap(pt_phys, PAGE_SIZE);
1989
				clear_page(pt);
1990
				for (idx_pte = 0;
1991
				     idx_pte < min(n_pte, PTRS_PER_PTE);
1992
				     idx_pte++) {
1993
					pt[idx_pte] = pfn_pte(p2m_pfn,
1994
							      PAGE_KERNEL);
1995
					p2m_pfn++;
1996
				}
1997
				n_pte -= PTRS_PER_PTE;
1998
				early_memunmap(pt, PAGE_SIZE);
1999
				make_lowmem_page_readonly(__va(pt_phys));
2000
				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2001
						PFN_DOWN(pt_phys));
2002
				pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys);
2003
				pt_phys += PAGE_SIZE;
2004
			}
2005
			n_pt -= PTRS_PER_PMD;
2006
			early_memunmap(pmd, PAGE_SIZE);
2007
			make_lowmem_page_readonly(__va(pmd_phys));
2008
			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2009
					PFN_DOWN(pmd_phys));
2010
			pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys);
2011
			pmd_phys += PAGE_SIZE;
2012
		}
2013
		n_pmd -= PTRS_PER_PUD;
2014
		early_memunmap(pud, PAGE_SIZE);
2015
		make_lowmem_page_readonly(__va(pud_phys));
2016
		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2017
		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2018
		pud_phys += PAGE_SIZE;
2019
	}
2020

2021
	/* Now copy the old p2m info to the new area. */
2022
	memcpy(new_p2m, xen_p2m_addr, size);
2023
	xen_p2m_addr = new_p2m;
2024

2025
	/* Release the old p2m list and set new list info. */
2026
	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2027
	BUG_ON(!p2m_pfn);
2028
	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2029

2030
	if (xen_start_info->mfn_list < __START_KERNEL_map) {
2031
		pfn = xen_start_info->first_p2m_pfn;
2032
		pfn_end = xen_start_info->first_p2m_pfn +
2033
			  xen_start_info->nr_p2m_frames;
2034
		set_pgd(pgd + 1, __pgd(0));
2035
	} else {
2036
		pfn = p2m_pfn;
2037
		pfn_end = p2m_pfn_end;
2038
	}
2039

2040
	memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2041
	while (pfn < pfn_end) {
2042
		if (pfn == p2m_pfn) {
2043
			pfn = p2m_pfn_end;
2044
			continue;
2045
		}
2046
		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2047
		pfn++;
2048
	}
2049

2050
	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2051
	xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
2052
	xen_start_info->nr_p2m_frames = n_frames;
2053
}
2054

2055
/*
 * Reserve in memblock the xen_start_info page, the page of store_mfn when
 * one is set and, for anything but the initial domain, the domU console
 * page.
 */
void __init xen_reserve_special_pages(void)
{
	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);

	if (xen_start_info->store_mfn)
		memblock_reserve(PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)),
				 PAGE_SIZE);

	if (!xen_initial_domain())
		memblock_reserve(PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)),
				 PAGE_SIZE);
}
2069

2070
/*
 * Pass the region described by xen_pt_base / xen_pt_size to
 * xen_chk_is_e820_usable(), labelled "page table".
 */
void __init xen_pt_check_e820(void)
{
	xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table");
}
2074

2075
/*
 * One PAGE_SIZE buffer, placed via __page_aligned_bss.  xen_set_fixmap()
 * maps it behind the local APIC and IO APIC fixmap slots, and
 * xen_init_mmu_ops() fills it with 0xff bytes.
 */
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2076

2077
/*
 * Install the mapping for fixmap slot @idx (the .set_fixmap hook in
 * xen_mmu_ops).
 *
 * @phys is reduced to a frame number and, depending on @idx, used as a
 * pfn (FIX_BTMAP_* slots and the vsyscall page) or as an mfn
 * (FIX_PARAVIRT_BOOTMAP and every slot not listed explicitly).  The
 * local APIC and IO APIC slots ignore @phys and @prot and map
 * dummy_mapping with PAGE_KERNEL instead.
 *
 * The pte is written with HYPERVISOR_update_va_mapping(); a non-zero
 * result is fatal (BUG()).
 */
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	const phys_addr_t frame = phys >> PAGE_SHIFT;
	unsigned long vaddr;
	pte_t pte;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_VSYSCALL_EMULATION
	case VSYSCALL_PAGE:
#endif
		/* Local page mappings: the frame number is a pfn. */
		pte = pfn_pte(frame, prot);
		break;

#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:
		/* The local APIC slot is backed by the dummy page. */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

#ifdef CONFIG_X86_IO_APIC
	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
		/*
		 * The IO APIC is not mapped - all access is via
		 * hypercalls - so these slots get the dummy page too.
		 */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

	case FIX_PARAVIRT_BOOTMAP:
		/* An MFN, but not an IO mapping from the IO domain. */
	default:
		/* Otherwise set_fixmap is used for hardware mappings. */
		pte = mfn_pte(frame, prot);
		break;
	}

	vaddr = __fix_to_virt(idx);
	if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG) != 0)
		BUG();

#ifdef CONFIG_X86_VSYSCALL_EMULATION
	/*
	 * Mirror a vsyscall page update into the user pagetable's
	 * vsyscall mapping.
	 */
	if (idx == VSYSCALL_PAGE)
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
#endif
}
2132

2133
static void xen_enter_lazy_mmu(void)
2134
{
2135
	enter_lazy(XEN_LAZY_MMU);
2136
}
2137

2138
static void xen_flush_lazy_mmu(void)
2139
{
2140
	preempt_disable();
2141

2142
	if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
2143
		arch_leave_lazy_mmu_mode();
2144
		arch_enter_lazy_mmu_mode();
2145
	}
2146

2147
	preempt_enable();
2148
}
2149

2150
/*
 * Replace the pv_ops.mmu hooks that xen_mmu_ops initially points at
 * *_init / *_hyper variants (set_*, alloc_*, release_*, make_pte and
 * write_cr3) with their regular counterparts.
 *
 * Assigning the pointers directly only works as long as patching has
 * not happened yet (which it hasn't).
 */
static void __init xen_post_allocator_init(void)
{
	/* pte level */
	pv_ops.mmu.set_pte = xen_set_pte;
	pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte);
	pv_ops.mmu.alloc_pte = xen_alloc_pte;
	pv_ops.mmu.release_pte = xen_release_pte;

	/* pmd level */
	pv_ops.mmu.set_pmd = xen_set_pmd;
	pv_ops.mmu.alloc_pmd = xen_alloc_pmd;
	pv_ops.mmu.release_pmd = xen_release_pmd;

	/* pud level */
	pv_ops.mmu.set_pud = xen_set_pud;
	pv_ops.mmu.alloc_pud = xen_alloc_pud;
	pv_ops.mmu.release_pud = xen_release_pud;

	/* p4d level */
	pv_ops.mmu.set_p4d = xen_set_p4d;

	pv_ops.mmu.write_cr3 = xen_write_cr3;
}
2169

2170
static void xen_leave_lazy_mmu(void)
2171
{
2172
	preempt_disable();
2173
	xen_mc_flush();
2174
	leave_lazy(XEN_LAZY_MMU);
2175
	preempt_enable();
2176
}
2177

2178
static const typeof(pv_ops) xen_mmu_ops __initconst = {
2179
	.mmu = {
2180
		.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
2181
		.write_cr2 = xen_write_cr2,
2182

2183
		.read_cr3 = xen_read_cr3,
2184
		.write_cr3 = xen_write_cr3_init,
2185

2186
		.flush_tlb_user = xen_flush_tlb,
2187
		.flush_tlb_kernel = xen_flush_tlb,
2188
		.flush_tlb_one_user = xen_flush_tlb_one_user,
2189
		.flush_tlb_multi = xen_flush_tlb_multi,
2190

2191
		.pgd_alloc = xen_pgd_alloc,
2192
		.pgd_free = xen_pgd_free,
2193

2194
		.alloc_pte = xen_alloc_pte_init,
2195
		.release_pte = xen_release_pte_init,
2196
		.alloc_pmd = xen_alloc_pmd_init,
2197
		.release_pmd = xen_release_pmd_init,
2198

2199
		.set_pte = xen_set_pte_init,
2200
		.set_pmd = xen_set_pmd_hyper,
2201

2202
		.ptep_modify_prot_start = xen_ptep_modify_prot_start,
2203
		.ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
2204

2205
		.pte_val = PV_CALLEE_SAVE(xen_pte_val),
2206
		.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2207

2208
		.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
2209
		.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2210

2211
		.set_pud = xen_set_pud_hyper,
2212

2213
		.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2214
		.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2215

2216
		.pud_val = PV_CALLEE_SAVE(xen_pud_val),
2217
		.make_pud = PV_CALLEE_SAVE(xen_make_pud),
2218
		.set_p4d = xen_set_p4d_hyper,
2219

2220
		.alloc_pud = xen_alloc_pmd_init,
2221
		.release_pud = xen_release_pmd_init,
2222

2223
		.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
2224
		.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
2225

2226
		.enter_mmap = xen_enter_mmap,
2227
		.exit_mmap = xen_exit_mmap,
2228

2229
		.lazy_mode = {
2230
			.enter = xen_enter_lazy_mmu,
2231
			.leave = xen_leave_lazy_mmu,
2232
			.flush = xen_flush_lazy_mmu,
2233
		},
2234

2235
		.set_fixmap = xen_set_fixmap,
2236
	},
2237
};
2238

2239
/*
 * Hook the Xen MMU implementation into the kernel: install the x86_init
 * pagetable_init / init_after_bootmem callbacks, copy xen_mmu_ops.mmu
 * into pv_ops.mmu, and fill dummy_mapping with 0xff bytes.
 */
void __init xen_init_mmu_ops(void)
{
	memset(dummy_mapping, 0xff, sizeof(dummy_mapping));

	pv_ops.mmu = xen_mmu_ops.mmu;

	x86_init.hyper.init_after_bootmem = xen_after_bootmem;
	x86_init.paging.pagetable_init = xen_pagetable_init;
}
2248

2249
#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2250
static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2251
				unsigned long *in_frames,
2252
				unsigned long *out_frames)
2253
{
2254
	int i;
2255
	struct multicall_space mcs;
2256

2257
	xen_mc_batch();
2258
	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2259
		mcs = __xen_mc_entry(0);
2260

2261
		if (in_frames)
2262
			in_frames[i] = virt_to_mfn((void *)vaddr);
2263

2264
		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2265
		__set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY);
2266

2267
		if (out_frames)
2268
			out_frames[i] = virt_to_pfn((void *)vaddr);
2269
	}
2270
	xen_mc_issue(0);
2271
}
2272

2273
/*
2274
 * Update the pfn-to-mfn mappings for a virtual address range, either to
2275
 * point to an array of mfns, or contiguously from a single starting
2276
 * mfn.
2277
 */
2278
static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2279
				     unsigned long *mfns,
2280
				     unsigned long first_mfn)
2281
{
2282
	unsigned i, limit;
2283
	unsigned long mfn;
2284

2285
	xen_mc_batch();
2286

2287
	limit = 1u << order;
2288
	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2289
		struct multicall_space mcs;
2290
		unsigned flags;
2291

2292
		mcs = __xen_mc_entry(0);
2293
		if (mfns)
2294
			mfn = mfns[i];
2295
		else
2296
			mfn = first_mfn + i;
2297

2298
		if (i < (limit - 1))
2299
			flags = 0;
2300
		else {
2301
			if (order == 0)
2302
				flags = UVMF_INVLPG | UVMF_ALL;
2303
			else
2304
				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2305
		}
2306

2307
		MULTI_update_va_mapping(mcs.mc, vaddr,
2308
				mfn_pte(mfn, PAGE_KERNEL), flags);
2309

2310
		set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn);
2311
	}
2312

2313
	xen_mc_issue(0);
2314
}
2315

2316
/*
2317
 * Perform the hypercall to exchange a region of our pfns to point to
2318
 * memory with the required contiguous alignment.  Takes the pfns as
2319
 * input, and populates mfns as output.
2320
 *
2321
 * Returns a success code indicating whether the hypervisor was able to
2322
 * satisfy the request or not.
2323
 */
2324
static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2325
			       unsigned long *pfns_in,
2326
			       unsigned long extents_out,
2327
			       unsigned int order_out,
2328
			       unsigned long *mfns_out,
2329
			       unsigned int address_bits)
2330
{
2331
	long rc;
2332
	int success;
2333

2334
	struct xen_memory_exchange exchange = {
2335
		.in = {
2336
			.nr_extents   = extents_in,
2337
			.extent_order = order_in,
2338
			.extent_start = pfns_in,
2339
			.domid        = DOMID_SELF
2340
		},
2341
		.out = {
2342
			.nr_extents   = extents_out,
2343
			.extent_order = order_out,
2344
			.extent_start = mfns_out,
2345
			.address_bits = address_bits,
2346
			.domid        = DOMID_SELF
2347
		}
2348
	};
2349

2350
	BUG_ON(extents_in << order_in != extents_out << order_out);
2351

2352
	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2353
	success = (exchange.nr_exchanged == extents_in);
2354

2355
	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2356
	BUG_ON(success && (rc != 0));
2357

2358
	return success;
2359
}
2360

2361
int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2362
				 unsigned int address_bits,
2363
				 dma_addr_t *dma_handle)
2364
{
2365
	unsigned long *in_frames, out_frame;
2366
	unsigned long  flags;
2367
	int            success;
2368
	unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2369

2370
	if (unlikely(order > discontig_frames_order)) {
2371
		if (!discontig_frames_dyn)
2372
			return -ENOMEM;
2373

2374
		if (alloc_discontig_frames(order))
2375
			return -ENOMEM;
2376
	}
2377

2378
	memset((void *) vstart, 0, PAGE_SIZE << order);
2379

2380
	spin_lock_irqsave(&xen_reservation_lock, flags);
2381

2382
	in_frames = discontig_frames;
2383

2384
	/* 1. Zap current PTEs, remembering MFNs. */
2385
	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2386

2387
	/* 2. Get a new contiguous memory extent. */
2388
	out_frame = virt_to_pfn((void *)vstart);
2389
	success = xen_exchange_memory(1UL << order, 0, in_frames,
2390
				      1, order, &out_frame,
2391
				      address_bits);
2392

2393
	/* 3. Map the new extent in place of old pages. */
2394
	if (success)
2395
		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2396
	else
2397
		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2398

2399
	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2400

2401
	*dma_handle = virt_to_machine(vstart).maddr;
2402
	return success ? 0 : -ENOMEM;
2403
}
2404

2405
/*
 * Undo xen_create_contiguous_region(): exchange the single extent of
 * 2^@order pages at physical address @pstart for 2^@order individual
 * pages.
 *
 * Does nothing when @order exceeds discontig_frames_order.  Otherwise
 * the region is zeroed and, under xen_reservation_lock, its ptes are
 * zapped, the exchange is attempted, and the range is remapped - onto
 * the new individual frames on success, back onto the original extent
 * on failure.
 */
void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
{
	unsigned long *frames;
	unsigned long contig_mfn;
	unsigned long irq_flags;
	unsigned long vstart;
	int exchanged;

	if (unlikely(order > discontig_frames_order))
		return;

	vstart = (unsigned long)phys_to_virt(pstart);
	memset((void *)vstart, 0, PAGE_SIZE << order);

	spin_lock_irqsave(&xen_reservation_lock, irq_flags);

	frames = discontig_frames;

	/* 1. Find start MFN of contiguous extent. */
	contig_mfn = virt_to_mfn((void *)vstart);

	/* 2. Zap current PTEs. */
	xen_zap_pfn_range(vstart, order, NULL, frames);

	/* 3. Do the exchange for non-contiguous MFNs. */
	exchanged = xen_exchange_memory(1, order, &contig_mfn, 1UL << order,
					0, frames, 0);

	/* 4. Map the new pages, or restore the old extent on failure. */
	xen_remap_exchanged_ptes(vstart, order,
				 exchanged ? frames : NULL,
				 exchanged ? 0 : contig_mfn);

	spin_unlock_irqrestore(&xen_reservation_lock, irq_flags);
}
2440

2441
static noinline void xen_flush_tlb_all(void)
2442
{
2443
	struct mmuext_op *op;
2444
	struct multicall_space mcs;
2445

2446
	preempt_disable();
2447

2448
	mcs = xen_mc_entry(sizeof(*op));
2449

2450
	op = mcs.args;
2451
	op->cmd = MMUEXT_TLB_FLUSH_ALL;
2452
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
2453

2454
	xen_mc_issue(XEN_LAZY_MMU);
2455

2456
	preempt_enable();
2457
}
2458

2459
/*
 * Size of the on-stack mmu_update array in xen_remap_pfn(), and so the
 * largest number of pages it prepares and submits per pass.
 */
#define REMAP_BATCH_SIZE 16
2460

2461
struct remap_data {
2462
	xen_pfn_t *pfn;
2463
	bool contiguous;
2464
	bool no_translate;
2465
	pgprot_t prot;
2466
	struct mmu_update *mmu_update;
2467
};
2468

2469
static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
2470
{
2471
	struct remap_data *rmd = data;
2472
	pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
2473

2474
	/*
2475
	 * If we have a contiguous range, just update the pfn itself,
2476
	 * else update pointer to be "next pfn".
2477
	 */
2478
	if (rmd->contiguous)
2479
		(*rmd->pfn)++;
2480
	else
2481
		rmd->pfn++;
2482

2483
	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2484
	rmd->mmu_update->ptr |= rmd->no_translate ?
2485
		MMU_PT_UPDATE_NO_TRANSLATE :
2486
		MMU_NORMAL_PT_UPDATE;
2487
	rmd->mmu_update->val = pte_val_ma(pte);
2488
	rmd->mmu_update++;
2489

2490
	return 0;
2491
}
2492

2493
int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
2494
		  xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
2495
		  unsigned int domid, bool no_translate)
2496
{
2497
	int err = 0;
2498
	struct remap_data rmd;
2499
	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2500
	unsigned long range;
2501
	int mapped = 0;
2502

2503
	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2504

2505
	rmd.pfn = pfn;
2506
	rmd.prot = prot;
2507
	/*
2508
	 * We use the err_ptr to indicate if there we are doing a contiguous
2509
	 * mapping or a discontiguous mapping.
2510
	 */
2511
	rmd.contiguous = !err_ptr;
2512
	rmd.no_translate = no_translate;
2513

2514
	while (nr) {
2515
		int index = 0;
2516
		int done = 0;
2517
		int batch = min(REMAP_BATCH_SIZE, nr);
2518
		int batch_left = batch;
2519

2520
		range = (unsigned long)batch << PAGE_SHIFT;
2521

2522
		rmd.mmu_update = mmu_update;
2523
		err = apply_to_page_range(vma->vm_mm, addr, range,
2524
					  remap_area_pfn_pte_fn, &rmd);
2525
		if (err)
2526
			goto out;
2527

2528
		/*
2529
		 * We record the error for each page that gives an error, but
2530
		 * continue mapping until the whole set is done
2531
		 */
2532
		do {
2533
			int i;
2534

2535
			err = HYPERVISOR_mmu_update(&mmu_update[index],
2536
						    batch_left, &done, domid);
2537

2538
			/*
2539
			 * @err_ptr may be the same buffer as @gfn, so
2540
			 * only clear it after each chunk of @gfn is
2541
			 * used.
2542
			 */
2543
			if (err_ptr) {
2544
				for (i = index; i < index + done; i++)
2545
					err_ptr[i] = 0;
2546
			}
2547
			if (err < 0) {
2548
				if (!err_ptr)
2549
					goto out;
2550
				err_ptr[i] = err;
2551
				done++; /* Skip failed frame. */
2552
			} else
2553
				mapped += done;
2554
			batch_left -= done;
2555
			index += done;
2556
		} while (batch_left);
2557

2558
		nr -= batch;
2559
		addr += range;
2560
		if (err_ptr)
2561
			err_ptr += batch;
2562
		cond_resched();
2563
	}
2564
out:
2565

2566
	xen_flush_tlb_all();
2567

2568
	return err < 0 ? err : mapped;
2569
}
2570
EXPORT_SYMBOL_GPL(xen_remap_pfn);
2571

2572
#ifdef CONFIG_VMCORE_INFO
/*
 * Return the address of vmcoreinfo_note: its machine address when
 * running as a Xen PV domain, its __pa() physical address otherwise.
 */
phys_addr_t paddr_vmcoreinfo_note(void)
{
	if (xen_pv_domain())
		return virt_to_machine(vmcoreinfo_note).maddr;

	return __pa(vmcoreinfo_note);
}
#endif /* CONFIG_VMCORE_INFO */
2581

2582
Product

Resources

Company