CoCalc -- mmu.c

GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/kvm/mmu.c
¹⁷⁵⁷⁹ views
1
/*
2
 * Kernel-based Virtual Machine driver for Linux
3
 *
4
 * This module enables machines with Intel VT-x extensions to run virtual
5
 * machines without emulation or binary translation.
6
 *
7
 * MMU support
8
 *
9
 * Copyright (C) 2006 Qumranet, Inc.
10
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11
 *
12
 * Authors:
13
 *   Yaniv Kamay  <[email protected]>
14
 *   Avi Kivity   <[email protected]>
15
 *
16
 * This work is licensed under the terms of the GNU GPL, version 2.  See
17
 * the COPYING file in the top-level directory.
18
 *
19
 */
20

21
#include "irq.h"
22
#include "mmu.h"
23
#include "x86.h"
24
#include "kvm_cache_regs.h"
25
#include "x86.h"
26

27
#include <linux/kvm_host.h>
28
#include <linux/types.h>
29
#include <linux/string.h>
30
#include <linux/mm.h>
31
#include <linux/highmem.h>
32
#include <linux/module.h>
33
#include <linux/swap.h>
34
#include <linux/hugetlb.h>
35
#include <linux/compiler.h>
36
#include <linux/srcu.h>
37
#include <linux/slab.h>
38
#include <linux/uaccess.h>
39

40
#include <asm/page.h>
41
#include <asm/cmpxchg.h>
42
#include <asm/io.h>
43
#include <asm/vmx.h>
44

45
/*
46
 * When setting this variable to true it enables Two-Dimensional-Paging
47
 * where the hardware walks 2 page tables:
48
 * 1. the guest-virtual to guest-physical
49
 * 2. while doing 1. it walks guest-physical to host-physical
50
 * If the hardware supports that we don't need to do shadow paging.
51
 */
52
bool tdp_enabled = false;
53

54
/*
 * Identifiers of the MMU audit points.  The numeric values double as
 * indices into audit_point_name[], so the two lists must stay in step.
 */
enum {
	AUDIT_PRE_PAGE_FAULT	= 0,
	AUDIT_POST_PAGE_FAULT	= 1,
	AUDIT_PRE_PTE_WRITE	= 2,
	AUDIT_POST_PTE_WRITE	= 3,
	AUDIT_PRE_SYNC		= 4,
	AUDIT_POST_SYNC		= 5,
};
62

63
char *audit_point_name[] = {
64
	"pre page fault",
65
	"post page fault",
66
	"pre pte write",
67
	"post pte write",
68
	"pre sync",
69
	"post sync"
70
};
71

72
#undef MMU_DEBUG
73

74
/*
 * Debug print helpers.
 *
 * With MMU_DEBUG defined, pgprintk() and rmap_printk() forward their
 * arguments to printk() while the "dbg" module parameter is non-zero.
 * Without MMU_DEBUG both expand to an empty statement and their arguments
 * are not evaluated.
 */
#ifdef MMU_DEBUG

static int dbg = 0;
module_param(dbg, bool, 0644);

#define pgprintk(...)		do { if (dbg) printk(__VA_ARGS__); } while (0)
#define rmap_printk(...)	do { if (dbg) printk(__VA_ARGS__); } while (0)

#else

#define pgprintk(...)		do { } while (0)
#define rmap_printk(...)	do { } while (0)

#endif
90

91
static int oos_shadow = 1;
92
module_param(oos_shadow, bool, 0644);
93

94
/*
 * ASSERT(x) - debug-only sanity check.
 *
 * Without MMU_DEBUG the macro expands to an empty statement and @x is not
 * evaluated.  With MMU_DEBUG a failed check prints a warning naming the
 * file, line and expression; execution continues afterwards.
 *
 * Both variants are wrapped in do { } while (0) so that ASSERT(...); is a
 * single statement and can sit in an unbraced if/else without capturing
 * the else branch.
 */
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	do {								\
		if (!(x)) {						\
			printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
			       __FILE__, __LINE__, #x);			\
		}							\
	} while (0)
#endif
103

104
#define PTE_PREFETCH_NUM		8
105

106
#define PT_FIRST_AVAIL_BITS_SHIFT 9
107
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
108

109
#define PT64_LEVEL_BITS 9
110

111
/*
 * Bit offset, within an address, of the index field used at paging level
 * @level (level 1 starts at PAGE_SHIFT, each further level adds
 * PT64_LEVEL_BITS).  @level is parenthesized so that any expression can be
 * passed safely.
 */
#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)
113

114
#define PT64_INDEX(address, level)\
115
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
116

117

118
#define PT32_LEVEL_BITS 10
119

120
/*
 * Bit offset, within an address, of the index field used at 32-bit paging
 * level @level (level 1 starts at PAGE_SHIFT, each further level adds
 * PT32_LEVEL_BITS).  @level is parenthesized so that any expression can be
 * passed safely.
 */
#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)
122

123
#define PT32_LVL_OFFSET_MASK(level) \
124
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
125
						* PT32_LEVEL_BITS))) - 1))
126

127
#define PT32_INDEX(address, level)\
128
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
129

130

131
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
132
#define PT64_DIR_BASE_ADDR_MASK \
133
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
134
#define PT64_LVL_ADDR_MASK(level) \
135
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
136
						* PT64_LEVEL_BITS))) - 1))
137
#define PT64_LVL_OFFSET_MASK(level) \
138
	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
139
						* PT64_LEVEL_BITS))) - 1))
140

141
#define PT32_BASE_ADDR_MASK PAGE_MASK
142
#define PT32_DIR_BASE_ADDR_MASK \
143
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
144
#define PT32_LVL_ADDR_MASK(level) \
145
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
146
					    * PT32_LEVEL_BITS))) - 1))
147

148
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149
			| PT64_NX_MASK)
150

151
#define RMAP_EXT 4
152

153
#define ACC_EXEC_MASK    1
154
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
155
#define ACC_USER_MASK    PT_USER_MASK
156
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
157

158
#include <trace/events/kvm.h>
159

160
#define CREATE_TRACE_POINTS
161
#include "mmutrace.h"
162

163
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
164

165
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166

167
/*
 * Overflow node of a reverse-map chain: stores up to RMAP_EXT spte pointers
 * and links to a further node through ->more.
 */
struct kvm_rmap_desc {
168
	u64 *sptes[RMAP_EXT];		/* used from index 0 upward; a NULL slot ends the list */
169
	struct kvm_rmap_desc *more;	/* next descriptor, or NULL at the end of the chain */
170
};
171

172
/*
 * Cursor handed to shadow_walk_init()/shadow_walk_okay()/shadow_walk_next()
 * by for_each_shadow_entry().  Those helpers are defined outside this part
 * of the file, so the field notes below are assumptions to confirm there.
 */
struct kvm_shadow_walk_iterator {
173
	u64 addr;		/* presumably the address being walked -- TODO confirm */
174
	hpa_t shadow_addr;	/* presumably the current shadow table's host address -- TODO confirm */
175
	int level;		/* presumably the current paging level -- TODO confirm */
176
	u64 *sptep;		/* presumably the spte at (shadow_addr, index) -- TODO confirm */
177
	unsigned index;		/* presumably the entry index at this level -- TODO confirm */
178
};
179

180
#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
181
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
182
	     shadow_walk_okay(&(_walker));			\
183
	     shadow_walk_next(&(_walker)))
184

185
typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
186

187
static struct kmem_cache *pte_chain_cache;
188
static struct kmem_cache *rmap_desc_cache;
189
static struct kmem_cache *mmu_page_header_cache;
190
static struct percpu_counter kvm_total_used_mmu_pages;
191

192
static u64 __read_mostly shadow_trap_nonpresent_pte;
193
static u64 __read_mostly shadow_notrap_nonpresent_pte;
194
static u64 __read_mostly shadow_nx_mask;
195
static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
196
static u64 __read_mostly shadow_user_mask;
197
static u64 __read_mostly shadow_accessed_mask;
198
static u64 __read_mostly shadow_dirty_mask;
199

200
/*
 * rsvd_bits - build a mask with bits @s through @e (inclusive) set.
 *
 * Returns 0 when @e < @s (an empty range).  The mask is built with
 * 2ULL << (e - s) rather than 1ULL << (e - s + 1) so that a full 64-bit
 * range (s = 0, e = 63) never shifts by the width of the type.
 */
static inline u64 rsvd_bits(int s, int e)
{
	if (e < s)
		return 0;

	return ((2ULL << (e - s)) - 1) << s;
}
204

205
/*
 * Record the two spte values that is_shadow_present_pte() treats as
 * "not present": @trap_pte and @notrap_pte.
 */
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_notrap_nonpresent_pte = notrap_pte;
	shadow_trap_nonpresent_pte = trap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
211

212
/*
 * Store the bit masks used in shadow ptes for the user, accessed, dirty,
 * no-execute and execute attributes into the file-scope shadow_*_mask
 * variables.
 */
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
		u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
	shadow_x_mask = x_mask;
	shadow_nx_mask = nx_mask;
	shadow_dirty_mask = dirty_mask;
	shadow_accessed_mask = accessed_mask;
	shadow_user_mask = user_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
222

223
static bool is_write_protection(struct kvm_vcpu *vcpu)
224
{
225
	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
226
}
227

228
/* PSE-36 query; this implementation unconditionally answers 1. */
static int is_cpuid_PSE36(void)
{
	const int supported = 1;

	return supported;
}
232

233
static int is_nx(struct kvm_vcpu *vcpu)
234
{
235
	return vcpu->arch.efer & EFER_NX;
236
}
237

238
/*
 * A shadow pte counts as present unless it equals one of the two
 * "non-present" marker values.  Returns 1 if present, 0 otherwise.
 */
static int is_shadow_present_pte(u64 pte)
{
	if (pte == shadow_trap_nonpresent_pte)
		return 0;
	if (pte == shadow_notrap_nonpresent_pte)
		return 0;
	return 1;
}
243

244
/* Non-zero when the PT_PAGE_SIZE_MASK bit is set in @pte. */
static int is_large_pte(u64 pte)
{
	u64 page_size_bit = pte & PT_PAGE_SIZE_MASK;

	return page_size_bit;
}
248

249
static int is_writable_pte(unsigned long pte)
250
{
251
	return pte & PT_WRITABLE_MASK;
252
}
253

254
static int is_dirty_gpte(unsigned long pte)
255
{
256
	return pte & PT_DIRTY_MASK;
257
}
258

259
/*
 * Tells whether @pte is a shadow pte that takes part in reverse mapping;
 * that is exactly the set of present shadow ptes.
 */
static int is_rmap_spte(u64 pte)
{
	int present = is_shadow_present_pte(pte);

	return present;
}
263

264
/*
 * Returns 1 when @pte terminates the walk at @level: either @level is the
 * page-table level, or @pte has the large-page bit set.  Returns 0 otherwise.
 */
static int is_last_spte(u64 pte, int level)
{
	return level == PT_PAGE_TABLE_LEVEL || is_large_pte(pte);
}
272

273
/* Extract the page frame number from the address bits of a shadow pte. */
static pfn_t spte_to_pfn(u64 pte)
{
	u64 addr = pte & PT64_BASE_ADDR_MASK;

	return addr >> PAGE_SHIFT;
}
277

278
/*
 * Take the PT32_DIR_PSE36_MASK bits of a 32-bit guest pte and shift them
 * left by (32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT) to form a gfn offset.
 */
static gfn_t pse36_gfn_delta(u32 gpte)
{
	return (gpte & PT32_DIR_PSE36_MASK) <<
		(32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT);
}
284

285
/* Store @spte into *@sptep through set_64bit(). */
static void __set_spte(u64 *sptep, u64 spte)
{
	set_64bit(sptep, spte);
	return;
}
289

290
/*
 * Replace *@sptep with @new_spte and return the value that was there.
 * With CONFIG_X86_64 this is a single xchg(); otherwise the swap is
 * retried with cmpxchg64() until the value read is the value replaced.
 */
static u64 __xchg_spte(u64 *sptep, u64 new_spte)
{
#ifdef CONFIG_X86_64
	return xchg(sptep, new_spte);
#else
	u64 seen;

	for (;;) {
		seen = *sptep;
		if (cmpxchg64(sptep, seen, new_spte) == seen)
			break;
	}

	return seen;
#endif
}
304

305
/*
 * Returns true when @spte is present, an accessed mask is configured, and
 * @spte has not yet reached the state "accessed bit set, and (read-only or
 * dirty bit set)".  Returns false in every other case.
 */
static bool spte_has_volatile_bits(u64 spte)
{
	bool accessed, dirty_settled;

	if (!shadow_accessed_mask || !is_shadow_present_pte(spte))
		return false;

	accessed = !!(spte & shadow_accessed_mask);
	dirty_settled = !is_writable_pte(spte) || (spte & shadow_dirty_mask);

	return !(accessed && dirty_settled);
}
319

320
/*
 * True when some bit of @bit_mask is set in @old_spte while no bit of
 * @bit_mask is set in @new_spte.
 */
static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
{
	if (!(old_spte & bit_mask))
		return false;

	return !(new_spte & bit_mask);
}
324

325
/*
 * Install @new_spte in *@sptep.
 *
 * The dirty bit already present in the old value is carried over.  An
 * atomic exchange is used when the old spte has volatile bits and the new
 * value does not already contain every bit being tracked; otherwise a
 * plain store is enough.  If an accessed mask is configured, accessed or
 * dirty state lost by the update is reported for the old spte's pfn.
 */
static void update_spte(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;
	u64 mask = shadow_accessed_mask;

	WARN_ON(!is_rmap_spte(new_spte));

	new_spte |= old_spte & shadow_dirty_mask;

	/* The dirty bit is only tracked for sptes that were writable. */
	if (is_writable_pte(old_spte))
		mask |= shadow_dirty_mask;

	if (spte_has_volatile_bits(old_spte) && (new_spte & mask) != mask)
		old_spte = __xchg_spte(sptep, new_spte);
	else
		__set_spte(sptep, new_spte);

	if (shadow_accessed_mask) {
		if (spte_is_bit_cleared(old_spte, new_spte,
					shadow_accessed_mask))
			kvm_set_pfn_accessed(spte_to_pfn(old_spte));
		if (spte_is_bit_cleared(old_spte, new_spte,
					shadow_dirty_mask))
			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}
}
350

351
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352
				  struct kmem_cache *base_cache, int min)
353
{
354
	void *obj;
355

356
	if (cache->nobjs >= min)
357
		return 0;
358
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
359
		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
360
		if (!obj)
361
			return -ENOMEM;
362
		cache->objects[cache->nobjs++] = obj;
363
	}
364
	return 0;
365
}
366

367
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
368
				  struct kmem_cache *cache)
369
{
370
	while (mc->nobjs)
371
		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
372
}
373

374
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
375
				       int min)
376
{
377
	void *page;
378

379
	if (cache->nobjs >= min)
380
		return 0;
381
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
382
		page = (void *)__get_free_page(GFP_KERNEL);
383
		if (!page)
384
			return -ENOMEM;
385
		cache->objects[cache->nobjs++] = page;
386
	}
387
	return 0;
388
}
389

390
static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
391
{
392
	while (mc->nobjs)
393
		free_page((unsigned long)mc->objects[--mc->nobjs]);
394
}
395

396
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
397
{
398
	int r;
399

400
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
401
				   pte_chain_cache, 4);
402
	if (r)
403
		goto out;
404
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405
				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
406
	if (r)
407
		goto out;
408
	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
409
	if (r)
410
		goto out;
411
	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
412
				   mmu_page_header_cache, 4);
413
out:
414
	return r;
415
}
416

417
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
418
{
419
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
420
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
421
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423
				mmu_page_header_cache);
424
}
425

426
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
427
				    size_t size)
428
{
429
	void *p;
430

431
	BUG_ON(!mc->nobjs);
432
	p = mc->objects[--mc->nobjs];
433
	return p;
434
}
435

436
static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
437
{
438
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
439
				      sizeof(struct kvm_pte_chain));
440
}
441

442
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
443
{
444
	kmem_cache_free(pte_chain_cache, pc);
445
}
446

447
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
448
{
449
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
450
				      sizeof(struct kvm_rmap_desc));
451
}
452

453
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
454
{
455
	kmem_cache_free(rmap_desc_cache, rd);
456
}
457

458
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
459
{
460
	if (!sp->role.direct)
461
		return sp->gfns[index];
462

463
	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
464
}
465

466
/*
 * Record @gfn for entry @index of @sp.  Non-direct pages store it in
 * sp->gfns[]; direct pages store nothing and only check (BUG_ON) that
 * @gfn matches the value kvm_mmu_page_get_gfn() computes.
 */
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (!sp->role.direct) {
		sp->gfns[index] = gfn;
		return;
	}

	BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
}
473

474
/*
475
 * Return the pointer to the large page information for a given gfn,
476
 * handling slots that are not large page aligned.
477
 */
478
/*
 * Locate the kvm_lpage_info entry covering @gfn at large-page @level in
 * @slot.  The index is the distance, in large-page units, between @gfn and
 * the slot's base gfn; lpage_info[] rows start at level 2.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
					      struct kvm_memory_slot *slot,
					      int level)
{
	const int shift = KVM_HPAGE_GFN_SHIFT(level);
	unsigned long idx = (gfn >> shift) - (slot->base_gfn >> shift);

	return &slot->lpage_info[level - 2][idx];
}
488

489
/*
 * Increment write_count in the large-page info entries that cover @gfn,
 * for every level from PT_DIRECTORY_LEVEL upward.
 */
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
	int level = PT_DIRECTORY_LEVEL;

	while (level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES) {
		struct kvm_lpage_info *linfo;

		linfo = lpage_info_slot(gfn, slot, level);
		linfo->write_count++;
		++level;
	}
}
502

503
/*
 * Undo account_shadowed(): decrement write_count in the large-page info
 * entries that cover @gfn, warning if a counter drops below zero.
 */
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
	int level = PT_DIRECTORY_LEVEL;

	while (level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES) {
		struct kvm_lpage_info *linfo;

		linfo = lpage_info_slot(gfn, slot, level);
		linfo->write_count--;
		WARN_ON(linfo->write_count < 0);
		++level;
	}
}
517

518
static int has_wrprotected_page(struct kvm *kvm,
519
				gfn_t gfn,
520
				int level)
521
{
522
	struct kvm_memory_slot *slot;
523
	struct kvm_lpage_info *linfo;
524

525
	slot = gfn_to_memslot(kvm, gfn);
526
	if (slot) {
527
		linfo = lpage_info_slot(gfn, slot, level);
528
		return linfo->write_count;
529
	}
530

531
	return 1;
532
}
533

534
/*
 * Return the highest page-table level whose page size does not exceed the
 * host page size backing @gfn, or 0 if even the smallest level is larger.
 */
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
{
	unsigned long page_size = kvm_host_page_size(kvm, gfn);
	int best = 0;
	int level;

	for (level = PT_PAGE_TABLE_LEVEL;
	     level < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++level) {
		if (page_size < KVM_HPAGE_SIZE(level))
			break;
		best = level;
	}

	return best;
}
551

552
static struct kvm_memory_slot *
553
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554
			    bool no_dirty_log)
555
{
556
	struct kvm_memory_slot *slot;
557

558
	slot = gfn_to_memslot(vcpu->kvm, gfn);
559
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
560
	      (no_dirty_log && slot->dirty_bitmap))
561
		slot = NULL;
562

563
	return slot;
564
}
565

566
/*
 * True when @large_gfn has no usable memslot under the "no dirty log"
 * rule of gfn_to_memslot_dirty_bitmap().
 */
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
	return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true) == NULL;
}
570

571
/*
 * Choose the mapping level for @large_gfn.
 *
 * Starts from the host mapping level, capped by the level reported by
 * kvm_x86_ops->get_lpage_level(), and returns the level just below the
 * first one (from PT_DIRECTORY_LEVEL up) that has a write-protected page.
 * If the host level is PT_PAGE_TABLE_LEVEL it is returned unchanged.
 */
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
	int host_level, level, max_level;

	host_level = host_mapping_level(vcpu->kvm, large_gfn);
	if (host_level == PT_PAGE_TABLE_LEVEL)
		return host_level;

	if (kvm_x86_ops->get_lpage_level() < host_level)
		max_level = kvm_x86_ops->get_lpage_level();
	else
		max_level = host_level;

	level = PT_DIRECTORY_LEVEL;
	while (level <= max_level &&
	       !has_wrprotected_page(vcpu->kvm, large_gfn, level))
		++level;

	return level - 1;
}
589

590
/*
591
 * Take gfn and return the reverse mapping to it.
592
 */
593

594
/*
 * Return a pointer to the reverse-map head for @gfn at @level: the slot's
 * per-page rmap[] entry for the page-table level, or the rmap_pde field of
 * the matching large-page info entry for higher levels.
 */
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	if (unlikely(level != PT_PAGE_TABLE_LEVEL))
		return &lpage_info_slot(gfn, slot, level)->rmap_pde;

	return &slot->rmap[gfn - slot->base_gfn];
}
607

608
/*
609
 * Reverse mapping data structures:
610
 *
611
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
612
 * that points to page_address(page).
613
 *
614
 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
615
 * containing more mappings.
616
 *
617
 * Returns the number of rmap entries before the spte was added or zero if
618
 * the spte was not added.
619
 *
620
 */
621
/*
 * Add @spte to the reverse map of @gfn and record @gfn in the owning
 * shadow page.  Nothing is added when *@spte is not an rmap spte.
 *
 * An empty head stores the spte pointer directly.  A head holding one
 * pointer is converted to a descriptor holding both.  Otherwise the
 * descriptor chain is followed to the first descriptor with a free slot,
 * extending the chain when the last descriptor is full.
 *
 * Returns the entry count gathered while walking the chain (0 when the
 * map was empty or nothing was added).
 */
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_desc *desc;
	unsigned long *rmapp;
	int count = 0;
	int i;

	if (!is_rmap_spte(*spte))
		return 0;

	sp = page_header(__pa(spte));
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);

	if (*rmapp == 0) {
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
		*rmapp = (unsigned long)spte;
		return 0;
	}

	if ((*rmapp & 1) == 0) {
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->sptes[0] = (u64 *)*rmapp;
		desc->sptes[1] = spte;
		*rmapp = (unsigned long)desc | 1;
		return 1;
	}

	rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);

	/* Skip over full descriptors that have a successor. */
	while (desc->sptes[RMAP_EXT-1] && desc->more) {
		desc = desc->more;
		count += RMAP_EXT;
	}

	/* Last descriptor is full: chain a fresh one behind it. */
	if (desc->sptes[RMAP_EXT-1]) {
		desc->more = mmu_alloc_rmap_desc(vcpu);
		desc = desc->more;
	}

	for (i = 0; desc->sptes[i]; ++i)
		++count;
	desc->sptes[i] = spte;

	return count;
}
660

661
static void rmap_desc_remove_entry(unsigned long *rmapp,
662
				   struct kvm_rmap_desc *desc,
663
				   int i,
664
				   struct kvm_rmap_desc *prev_desc)
665
{
666
	int j;
667

668
	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
669
		;
670
	desc->sptes[i] = desc->sptes[j];
671
	desc->sptes[j] = NULL;
672
	if (j != 0)
673
		return;
674
	if (!prev_desc && !desc->more)
675
		*rmapp = (unsigned long)desc->sptes[0];
676
	else
677
		if (prev_desc)
678
			prev_desc->more = desc->more;
679
		else
680
			*rmapp = (unsigned long)desc->more | 1;
681
	mmu_free_rmap_desc(desc);
682
}
683

684
/*
 * Remove @spte from the reverse map of the gfn it maps.  The spte must be
 * present in the map; any inconsistency is reported and triggers BUG().
 */
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_rmap_desc *desc, *prev_desc;
	struct kvm_mmu_page *sp;
	unsigned long *rmapp;
	gfn_t gfn;
	int i;

	sp = page_header(__pa(spte));
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);

	if (!*rmapp) {
		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
		BUG();
		return;
	}

	if (!(*rmapp & 1)) {
		/* The head holds a single spte pointer. */
		rmap_printk("rmap_remove:  %p 1->0\n", spte);
		if ((u64 *)*rmapp != spte) {
			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
			BUG();
		}
		*rmapp = 0;
		return;
	}

	rmap_printk("rmap_remove:  %p many->many\n", spte);
	prev_desc = NULL;
	for (desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); desc;
	     prev_desc = desc, desc = desc->more) {
		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
			if (desc->sptes[i] != spte)
				continue;
			rmap_desc_remove_entry(rmapp, desc, i, prev_desc);
			return;
		}
	}

	pr_err("rmap_remove: %p many->many\n", spte);
	BUG();
}
725

726
/*
 * Write @new_spte to *@sptep (atomically if the old value has volatile
 * bits) and pass the old spte's accessed/dirty state on to its pfn.
 *
 * Returns 0 if the old value was not an rmap spte, 1 otherwise.  When no
 * accessed (or dirty) mask is configured, the pfn is marked accessed (or
 * dirty) unconditionally.
 */
static int set_spte_track_bits(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;
	pfn_t pfn;

	if (spte_has_volatile_bits(old_spte))
		old_spte = __xchg_spte(sptep, new_spte);
	else
		__set_spte(sptep, new_spte);

	if (!is_rmap_spte(old_spte))
		return 0;

	pfn = spte_to_pfn(old_spte);

	if (!shadow_accessed_mask || (old_spte & shadow_accessed_mask))
		kvm_set_pfn_accessed(pfn);

	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
		kvm_set_pfn_dirty(pfn);

	return 1;
}
746

747
/*
 * Replace *@sptep with @new_spte and, if the previous value was an rmap
 * spte, remove @sptep from the reverse map.
 */
static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
	if (!set_spte_track_bits(sptep, new_spte))
		return;

	rmap_remove(kvm, sptep);
}
752

753
/*
 * Iterate over a reverse map.  With @spte == NULL the first entry is
 * returned; otherwise the entry following @spte.  Returns NULL when the
 * map is empty or no further entry exists.
 */
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	u64 *prev_spte = NULL;
	int i;

	if (!*rmapp)
		return NULL;

	/* Single-entry map: the head is the only spte pointer. */
	if (!(*rmapp & 1))
		return spte ? NULL : (u64 *)*rmapp;

	for (desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); desc;
	     desc = desc->more) {
		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
			if (prev_spte == spte)
				return desc->sptes[i];
			prev_spte = desc->sptes[i];
		}
	}

	return NULL;
}
778

779
/*
 * Remove write access from every shadow mapping of @gfn.
 *
 * Page-table-level sptes simply lose their writable bit.  Writable
 * large-page sptes are dropped altogether; after a drop the rmap walk
 * restarts from the head because the chain has changed.
 *
 * Returns 1 if any spte was modified, 0 otherwise.
 */
static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
	unsigned long *rmapp;
	u64 *spte;
	int i, write_protected = 0;

	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);

	spte = rmap_next(kvm, rmapp, NULL);
	while (spte) {
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
		if (is_writable_pte(*spte)) {
			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
			write_protected = 1;
		}
		spte = rmap_next(kvm, rmapp, spte);
	}

	/* check for huge page mappings */
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
		rmapp = gfn_to_rmap(kvm, gfn, i);
		spte = rmap_next(kvm, rmapp, NULL);
		while (spte) {
			BUG_ON(!(*spte & PT_PRESENT_MASK));
			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
			if (is_writable_pte(*spte)) {
				drop_spte(kvm, spte,
					  shadow_trap_nonpresent_pte);
				--kvm->stat.lpages;
				/* entry removed: restart from the head */
				spte = NULL;
				write_protected = 1;
			}
			spte = rmap_next(kvm, rmapp, spte);
		}
	}

	return write_protected;
}
822

823
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
824
			   unsigned long data)
825
{
826
	u64 *spte;
827
	int need_tlb_flush = 0;
828

829
	while ((spte = rmap_next(kvm, rmapp, NULL))) {
830
		BUG_ON(!(*spte & PT_PRESENT_MASK));
831
		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
832
		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
833
		need_tlb_flush = 1;
834
	}
835
	return need_tlb_flush;
836
}
837

838
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
839
			     unsigned long data)
840
{
841
	int need_flush = 0;
842
	u64 *spte, new_spte;
843
	pte_t *ptep = (pte_t *)data;
844
	pfn_t new_pfn;
845

846
	WARN_ON(pte_huge(*ptep));
847
	new_pfn = pte_pfn(*ptep);
848
	spte = rmap_next(kvm, rmapp, NULL);
849
	while (spte) {
850
		BUG_ON(!is_shadow_present_pte(*spte));
851
		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
852
		need_flush = 1;
853
		if (pte_write(*ptep)) {
854
			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
855
			spte = rmap_next(kvm, rmapp, NULL);
856
		} else {
857
			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
858
			new_spte |= (u64)new_pfn << PAGE_SHIFT;
859

860
			new_spte &= ~PT_WRITABLE_MASK;
861
			new_spte &= ~SPTE_HOST_WRITEABLE;
862
			new_spte &= ~shadow_accessed_mask;
863
			set_spte_track_bits(spte, new_spte);
864
			spte = rmap_next(kvm, rmapp, spte);
865
		}
866
	}
867
	if (need_flush)
868
		kvm_flush_remote_tlbs(kvm);
869

870
	return 0;
871
}
872

873
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
874
			  unsigned long data,
875
			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
876
					 unsigned long data))
877
{
878
	int i, j;
879
	int ret;
880
	int retval = 0;
881
	struct kvm_memslots *slots;
882

883
	slots = kvm_memslots(kvm);
884

885
	for (i = 0; i < slots->nmemslots; i++) {
886
		struct kvm_memory_slot *memslot = &slots->memslots[i];
887
		unsigned long start = memslot->userspace_addr;
888
		unsigned long end;
889

890
		end = start + (memslot->npages << PAGE_SHIFT);
891
		if (hva >= start && hva < end) {
892
			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
893
			gfn_t gfn = memslot->base_gfn + gfn_offset;
894

895
			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
896

897
			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
898
				struct kvm_lpage_info *linfo;
899

900
				linfo = lpage_info_slot(gfn, memslot,
901
							PT_DIRECTORY_LEVEL + j);
902
				ret |= handler(kvm, &linfo->rmap_pde, data);
903
			}
904
			trace_kvm_age_page(hva, memslot, ret);
905
			retval |= ret;
906
		}
907
	}
908

909
	return retval;
910
}
911

912
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
913
{
914
	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
915
}
916

917
/*
 * Apply the new host pte @pte to all shadow mappings of @hva.  The pte is
 * handed to kvm_set_pte_rmapp() by address through the data argument.
 */
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	unsigned long data = (unsigned long)&pte;

	kvm_handle_hva(kvm, hva, data, kvm_set_pte_rmapp);
}
921

922
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
923
			 unsigned long data)
924
{
925
	u64 *spte;
926
	int young = 0;
927

928
	/*
929
	 * Emulate the accessed bit for EPT, by checking if this page has
930
	 * an EPT mapping, and clearing it if it does. On the next access,
931
	 * a new EPT mapping will be established.
932
	 * This has some overhead, but not as much as the cost of swapping
933
	 * out actively used pages or breaking up actively used hugepages.
934
	 */
935
	if (!shadow_accessed_mask)
936
		return kvm_unmap_rmapp(kvm, rmapp, data);
937

938
	spte = rmap_next(kvm, rmapp, NULL);
939
	while (spte) {
940
		int _young;
941
		u64 _spte = *spte;
942
		BUG_ON(!(_spte & PT_PRESENT_MASK));
943
		_young = _spte & PT_ACCESSED_MASK;
944
		if (_young) {
945
			young = 1;
946
			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
947
		}
948
		spte = rmap_next(kvm, rmapp, spte);
949
	}
950
	return young;
951
}
952

953
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
954
			      unsigned long data)
955
{
956
	u64 *spte;
957
	int young = 0;
958

959
	/*
960
	 * If there's no access bit in the secondary pte set by the
961
	 * hardware it's up to gup-fast/gup to set the access bit in
962
	 * the primary pte or in the page structure.
963
	 */
964
	if (!shadow_accessed_mask)
965
		goto out;
966

967
	spte = rmap_next(kvm, rmapp, NULL);
968
	while (spte) {
969
		u64 _spte = *spte;
970
		BUG_ON(!(_spte & PT_PRESENT_MASK));
971
		young = _spte & PT_ACCESSED_MASK;
972
		if (young) {
973
			young = 1;
974
			break;
975
		}
976
		spte = rmap_next(kvm, rmapp, spte);
977
	}
978
out:
979
	return young;
980
}
981

982
#define RMAP_RECYCLE_THRESHOLD 1000
983

984
/*
 * Drop every spte on the reverse map that @spte's shadow page level and
 * @gfn select, then flush remote TLBs.
 */
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp = page_header(__pa(spte));
	unsigned long *rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);

	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
	kvm_flush_remote_tlbs(vcpu->kvm);
}
996

997
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
998
{
999
	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
1000
}
1001

1002
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1003
{
1004
	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1005
}
1006

1007
#ifdef MMU_DEBUG
/*
 * Debug helper: returns 1 if the page at @spt contains no present shadow
 * pte.  Otherwise logs the first present entry and returns 0.
 */
static int is_empty_shadow_page(u64 *spt)
{
	unsigned long i;

	for (i = 0; i < PAGE_SIZE / sizeof(u64); i++) {
		if (!is_shadow_present_pte(spt[i]))
			continue;
		printk(KERN_ERR "%s: %p %llx\n", __func__,
		       &spt[i], spt[i]);
		return 0;
	}

	return 1;
}
#endif
1022

1023
/*
1024
 * This value is the sum of all of the kvm instances'
1025
 * kvm->arch.n_used_mmu_pages values.  We need a global,
1026
 * aggregate version in order to make the slab shrinker
1027
 * faster
1028
 */
1029
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1030
{
1031
	kvm->arch.n_used_mmu_pages += nr;
1032
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033
}
1034

1035
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1036
{
1037
	ASSERT(is_empty_shadow_page(sp->spt));
1038
	hlist_del(&sp->hash_link);
1039
	list_del(&sp->link);
1040
	free_page((unsigned long)sp->spt);
1041
	if (!sp->role.direct)
1042
		free_page((unsigned long)sp->gfns);
1043
	kmem_cache_free(mmu_page_header_cache, sp);
1044
	kvm_mod_used_mmu_pages(kvm, -1);
1045
}
1046

1047
/* Hash a gfn to a bucket index: its low KVM_MMU_HASH_SHIFT bits. */
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	const int nr_buckets = 1 << KVM_MMU_HASH_SHIFT;

	return gfn & (nr_buckets - 1);
}
1051

1052
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1053
					       u64 *parent_pte, int direct)
1054
{
1055
	struct kvm_mmu_page *sp;
1056

1057
	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1058
	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1059
	if (!direct)
1060
		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1061
						  PAGE_SIZE);
1062
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1063
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1064
	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1065
	sp->multimapped = 0;
1066
	sp->parent_pte = parent_pte;
1067
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1068
	return sp;
1069
}
1070

1071
/*
 * Register @parent_pte as a parent of shadow page @sp.
 *
 * A page with no parent stores the pointer directly in sp->parent_pte.
 * When a second parent arrives the page switches to multimapped mode: the
 * existing parent moves into a newly allocated pte chain and all parents
 * are kept in chains from then on.  A NULL @parent_pte is ignored.
 */
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;

	if (!sp->multimapped && !sp->parent_pte) {
		sp->parent_pte = parent_pte;
		return;
	}

	if (!sp->multimapped) {
		/* Save the single parent before the list head is set up. */
		u64 *old = sp->parent_pte;

		sp->multimapped = 1;
		pte_chain = mmu_alloc_pte_chain(vcpu);
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
		pte_chain->parent_ptes[0] = old;
	}

	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;

		/* The last slot is free, so a free slot is always found. */
		i = 0;
		while (pte_chain->parent_ptes[i])
			++i;
		pte_chain->parent_ptes[i] = parent_pte;
		return;
	}

	/* Every chain is full: start a new one. */
	pte_chain = mmu_alloc_pte_chain(vcpu);
	BUG_ON(!pte_chain);
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
	pte_chain->parent_ptes[0] = parent_pte;
}
1107

1108
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1109
				       u64 *parent_pte)
1110
{
1111
	struct kvm_pte_chain *pte_chain;
1112
	struct hlist_node *node;
1113
	int i;
1114

1115
	if (!sp->multimapped) {
1116
		BUG_ON(sp->parent_pte != parent_pte);
1117
		sp->parent_pte = NULL;
1118
		return;
1119
	}
1120
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1121
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1122
			if (!pte_chain->parent_ptes[i])
1123
				break;
1124
			if (pte_chain->parent_ptes[i] != parent_pte)
1125
				continue;
1126
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
1127
				&& pte_chain->parent_ptes[i + 1]) {
1128
				pte_chain->parent_ptes[i]
1129
					= pte_chain->parent_ptes[i + 1];
1130
				++i;
1131
			}
1132
			pte_chain->parent_ptes[i] = NULL;
1133
			if (i == 0) {
1134
				hlist_del(&pte_chain->link);
1135
				mmu_free_pte_chain(pte_chain);
1136
				if (hlist_empty(&sp->parent_ptes)) {
1137
					sp->multimapped = 0;
1138
					sp->parent_pte = NULL;
1139
				}
1140
			}
1141
			return;
1142
		}
1143
	BUG();
1144
}
1145

1146
/*
 * Invoke @fn(parent_sp, spte) for every parent spte recorded for @sp,
 * where parent_sp is the shadow page that contains that spte.
 */
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
	struct kvm_pte_chain *chain;
	struct hlist_node *pos;
	int slot;

	/* Single parent: the pointer is kept directly in sp->parent_pte. */
	if (!sp->multimapped && sp->parent_pte) {
		u64 *only = sp->parent_pte;

		fn(page_header(__pa(only)), only);
		return;
	}

	hlist_for_each_entry(chain, pos, &sp->parent_ptes, link) {
		for (slot = 0; slot < NR_PTE_CHAIN_ENTRIES; ++slot) {
			u64 *spte = chain->parent_ptes[slot];

			/* A NULL slot terminates the used part of a chain. */
			if (!spte)
				break;
			fn(page_header(__pa(spte)), spte);
		}
	}
}
1169

1170
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
1171
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1172
{
1173
	mmu_parent_walk(sp, mark_unsync);
1174
}
1175

1176
/*
 * Record in @sp that the child referenced by @spte is (or contains) an
 * unsync page.  The marking is propagated to @sp's own parents only when
 * @sp goes from zero to one unsync child.
 */
static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
	unsigned int slot = spte - sp->spt;

	/* Bit already set: this child was accounted for earlier. */
	if (__test_and_set_bit(slot, sp->unsync_child_bitmap))
		return;

	if (sp->unsync_children++ == 0)
		kvm_mmu_mark_parents_unsync(sp);
}
1187

1188
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1189
				    struct kvm_mmu_page *sp)
1190
{
1191
	int i;
1192

1193
	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1194
		sp->spt[i] = shadow_trap_nonpresent_pte;
1195
}
1196

1197
/*
 * sync_page callback for the non-paging case.  Always returns 1; callers
 * in this file zap the page when sync_page() returns nonzero.
 */
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return 1;
}
1202

1203
/* invlpg callback for the non-paging case: intentionally does nothing. */
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}
1206

1207
/*
 * update_pte callback for the non-paging case.  It only emits a warning,
 * i.e. it is not expected to be reached.
 */
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp, u64 *spte,
				 const void *pte)
{
	WARN_ON(1);
}
1213

1214
/* Capacity of one kvm_mmu_pages batch. */
#define KVM_PAGE_ARRAY_NR 16

/* One collected shadow page plus the index passed to mmu_pages_add(). */
struct mmu_page_and_offset {
	struct kvm_mmu_page *sp;
	unsigned int idx;
};

/* Fixed-size batch of shadow pages gathered by the unsync walk. */
struct kvm_mmu_pages {
	struct mmu_page_and_offset page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;	/* number of valid entries in page[] */
};
1223

1224
/* Iterate @idx over every set bit of the 512-bit @bitmap. */
#define for_each_unsync_children(bitmap, idx)			\
	for ((idx) = find_first_bit((bitmap), 512);		\
	     (idx) < 512;					\
	     (idx) = find_next_bit((bitmap), 512, (idx) + 1))
1228

1229
static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1230
			 int idx)
1231
{
1232
	int i;
1233

1234
	if (sp->unsync)
1235
		for (i=0; i < pvec->nr; i++)
1236
			if (pvec->page[i].sp == sp)
1237
				return 0;
1238

1239
	pvec->page[pvec->nr].sp = sp;
1240
	pvec->page[pvec->nr].idx = idx;
1241
	pvec->nr++;
1242
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
1243
}
1244

1245
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1246
			   struct kvm_mmu_pages *pvec)
1247
{
1248
	int i, ret, nr_unsync_leaf = 0;
1249

1250
	for_each_unsync_children(sp->unsync_child_bitmap, i) {
1251
		struct kvm_mmu_page *child;
1252
		u64 ent = sp->spt[i];
1253

1254
		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
1255
			goto clear_child_bitmap;
1256

1257
		child = page_header(ent & PT64_BASE_ADDR_MASK);
1258

1259
		if (child->unsync_children) {
1260
			if (mmu_pages_add(pvec, child, i))
1261
				return -ENOSPC;
1262

1263
			ret = __mmu_unsync_walk(child, pvec);
1264
			if (!ret)
1265
				goto clear_child_bitmap;
1266
			else if (ret > 0)
1267
				nr_unsync_leaf += ret;
1268
			else
1269
				return ret;
1270
		} else if (child->unsync) {
1271
			nr_unsync_leaf++;
1272
			if (mmu_pages_add(pvec, child, i))
1273
				return -ENOSPC;
1274
		} else
1275
			 goto clear_child_bitmap;
1276

1277
		continue;
1278

1279
clear_child_bitmap:
1280
		__clear_bit(i, sp->unsync_child_bitmap);
1281
		sp->unsync_children--;
1282
		WARN_ON((int)sp->unsync_children < 0);
1283
	}
1284

1285

1286
	return nr_unsync_leaf;
1287
}
1288

1289
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1290
			   struct kvm_mmu_pages *pvec)
1291
{
1292
	if (!sp->unsync_children)
1293
		return 0;
1294

1295
	mmu_pages_add(pvec, sp, 0);
1296
	return __mmu_unsync_walk(sp, pvec);
1297
}
1298

1299
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1300
{
1301
	WARN_ON(!sp->unsync);
1302
	trace_kvm_mmu_sync_page(sp);
1303
	sp->unsync = 0;
1304
	--kvm->stat.mmu_unsync;
1305
}
1306

1307
/*
 * Zapping is done in two steps; both functions are defined further down
 * but are needed by the sync code that follows.
 */
static int kvm_mmu_prepare_zap_page(struct kvm *kvm,
				    struct kvm_mmu_page *sp,
				    struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);
1311

1312
/*
 * Iterate over the shadow pages hashed for a guest frame number.
 *
 * The parameters carry a leading underscore so that they cannot collide
 * with the struct members used in the expansion: a parameter literally
 * named "gfn" would also be substituted into "(_sp)->gfn", which made the
 * macros work only when the caller's argument happened to be spelled "gfn".
 *
 * The "if (...) {} else" tail makes the statement following the macro the
 * loop body for matching pages only, without creating a dangling-else
 * hazard at the call site.
 */
#define for_each_gfn_sp(_kvm, _sp, _gfn, _pos)				\
  hlist_for_each_entry(_sp, _pos,					\
   &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link)	\
	if ((_sp)->gfn != (_gfn)) {} else

/* As for_each_gfn_sp(), but skips direct and invalid shadow pages. */
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn, _pos)		\
  hlist_for_each_entry(_sp, _pos,					\
   &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link)	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct ||	\
			(_sp)->role.invalid) {} else
1322

1323
/* @sp->gfn should be write-protected at the call site */
1324
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1325
			   struct list_head *invalid_list, bool clear_unsync)
1326
{
1327
	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1328
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1329
		return 1;
1330
	}
1331

1332
	if (clear_unsync)
1333
		kvm_unlink_unsync_page(vcpu->kvm, sp);
1334

1335
	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1336
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1337
		return 1;
1338
	}
1339

1340
	kvm_mmu_flush_tlb(vcpu);
1341
	return 0;
1342
}
1343

1344
static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
1345
				   struct kvm_mmu_page *sp)
1346
{
1347
	LIST_HEAD(invalid_list);
1348
	int ret;
1349

1350
	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
1351
	if (ret)
1352
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1353

1354
	return ret;
1355
}
1356

1357
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1358
			 struct list_head *invalid_list)
1359
{
1360
	return __kvm_sync_page(vcpu, sp, invalid_list, true);
1361
}
1362

1363
/* @gfn should be write-protected at the call site */
1364
static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
1365
{
1366
	struct kvm_mmu_page *s;
1367
	struct hlist_node *node;
1368
	LIST_HEAD(invalid_list);
1369
	bool flush = false;
1370

1371
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1372
		if (!s->unsync)
1373
			continue;
1374

1375
		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1376
		kvm_unlink_unsync_page(vcpu->kvm, s);
1377
		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1378
			(vcpu->arch.mmu.sync_page(vcpu, s))) {
1379
			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1380
			continue;
1381
		}
1382
		flush = true;
1383
	}
1384

1385
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1386
	if (flush)
1387
		kvm_mmu_flush_tlb(vcpu);
1388
}
1389

1390
struct mmu_page_path {
1391
	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1392
	unsigned int idx[PT64_ROOT_LEVEL-1];
1393
};
1394

1395
/*
 * Visit every leaf shadow page stored in batch @_pvec, keeping @_parents
 * up to date through mmu_pages_next().
 *
 * @_sp is assigned only after the "@_i < nr" check has passed; the previous
 * form also read page[_i].sp in the init clause, before any bounds check,
 * which could read one element past the used part of the array.  The
 * parameters are underscore-prefixed so that "_sp" cannot be substituted
 * into the ".sp" member name.
 */
#define for_each_sp(_pvec, _sp, _parents, _i)				\
		for (_i = mmu_pages_next(&(_pvec), &(_parents), -1);	\
			_i < (_pvec).nr &&				\
				({ _sp = (_pvec).page[_i].sp; 1;});	\
			_i = mmu_pages_next(&(_pvec), &(_parents), _i))
1400

1401
static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1402
			  struct mmu_page_path *parents,
1403
			  int i)
1404
{
1405
	int n;
1406

1407
	for (n = i+1; n < pvec->nr; n++) {
1408
		struct kvm_mmu_page *sp = pvec->page[n].sp;
1409

1410
		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1411
			parents->idx[0] = pvec->page[n].idx;
1412
			return n;
1413
		}
1414

1415
		parents->parent[sp->role.level-2] = sp;
1416
		parents->idx[sp->role.level-1] = pvec->page[n].idx;
1417
	}
1418

1419
	return n;
1420
}
1421

1422
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1423
{
1424
	struct kvm_mmu_page *sp;
1425
	unsigned int level = 0;
1426

1427
	do {
1428
		unsigned int idx = parents->idx[level];
1429

1430
		sp = parents->parent[level];
1431
		if (!sp)
1432
			return;
1433

1434
		--sp->unsync_children;
1435
		WARN_ON((int)sp->unsync_children < 0);
1436
		__clear_bit(idx, sp->unsync_child_bitmap);
1437
		level++;
1438
	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1439
}
1440

1441
static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
1442
			       struct mmu_page_path *parents,
1443
			       struct kvm_mmu_pages *pvec)
1444
{
1445
	parents->parent[parent->role.level-1] = NULL;
1446
	pvec->nr = 0;
1447
}
1448

1449
static void mmu_sync_children(struct kvm_vcpu *vcpu,
1450
			      struct kvm_mmu_page *parent)
1451
{
1452
	int i;
1453
	struct kvm_mmu_page *sp;
1454
	struct mmu_page_path parents;
1455
	struct kvm_mmu_pages pages;
1456
	LIST_HEAD(invalid_list);
1457

1458
	kvm_mmu_pages_init(parent, &parents, &pages);
1459
	while (mmu_unsync_walk(parent, &pages)) {
1460
		int protected = 0;
1461

1462
		for_each_sp(pages, sp, parents, i)
1463
			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1464

1465
		if (protected)
1466
			kvm_flush_remote_tlbs(vcpu->kvm);
1467

1468
		for_each_sp(pages, sp, parents, i) {
1469
			kvm_sync_page(vcpu, sp, &invalid_list);
1470
			mmu_pages_clear_parents(&parents);
1471
		}
1472
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
1473
		cond_resched_lock(&vcpu->kvm->mmu_lock);
1474
		kvm_mmu_pages_init(parent, &parents, &pages);
1475
	}
1476
}
1477

1478
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1479
					     gfn_t gfn,
1480
					     gva_t gaddr,
1481
					     unsigned level,
1482
					     int direct,
1483
					     unsigned access,
1484
					     u64 *parent_pte)
1485
{
1486
	union kvm_mmu_page_role role;
1487
	unsigned quadrant;
1488
	struct kvm_mmu_page *sp;
1489
	struct hlist_node *node;
1490
	bool need_sync = false;
1491

1492
	role = vcpu->arch.mmu.base_role;
1493
	role.level = level;
1494
	role.direct = direct;
1495
	if (role.direct)
1496
		role.cr4_pae = 0;
1497
	role.access = access;
1498
	if (!vcpu->arch.mmu.direct_map
1499
	    && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1500
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1501
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1502
		role.quadrant = quadrant;
1503
	}
1504
	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
1505
		if (!need_sync && sp->unsync)
1506
			need_sync = true;
1507

1508
		if (sp->role.word != role.word)
1509
			continue;
1510

1511
		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
1512
			break;
1513

1514
		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1515
		if (sp->unsync_children) {
1516
			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1517
			kvm_mmu_mark_parents_unsync(sp);
1518
		} else if (sp->unsync)
1519
			kvm_mmu_mark_parents_unsync(sp);
1520

1521
		trace_kvm_mmu_get_page(sp, false);
1522
		return sp;
1523
	}
1524
	++vcpu->kvm->stat.mmu_cache_miss;
1525
	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
1526
	if (!sp)
1527
		return sp;
1528
	sp->gfn = gfn;
1529
	sp->role = role;
1530
	hlist_add_head(&sp->hash_link,
1531
		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
1532
	if (!direct) {
1533
		if (rmap_write_protect(vcpu->kvm, gfn))
1534
			kvm_flush_remote_tlbs(vcpu->kvm);
1535
		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
1536
			kvm_sync_pages(vcpu, gfn);
1537

1538
		account_shadowed(vcpu->kvm, gfn);
1539
	}
1540
	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1541
		vcpu->arch.mmu.prefetch_page(vcpu, sp);
1542
	else
1543
		nonpaging_prefetch_page(vcpu, sp);
1544
	trace_kvm_mmu_get_page(sp, true);
1545
	return sp;
1546
}
1547

1548
/*
 * Position @iterator at the top of the shadow page table for @addr.
 *
 * When the walk starts at PT32E_ROOT_LEVEL the table address comes from the
 * pae_root entry selected by bits 31:30 of @addr; if that entry is empty the
 * level is set to 0 so that shadow_walk_okay() fails immediately.
 */
static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	int level = vcpu->arch.mmu.shadow_root_level;

	iterator->addr = addr;
	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;

	if (level == PT64_ROOT_LEVEL &&
	    vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
	    !vcpu->arch.mmu.direct_map)
		--level;

	if (level == PT32E_ROOT_LEVEL) {
		iterator->shadow_addr =
			vcpu->arch.mmu.pae_root[(addr >> 30) & 3] &
			PT64_BASE_ADDR_MASK;
		level = iterator->shadow_addr ? level - 1 : 0;
	}

	iterator->level = level;
}
1569

1570
static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1571
{
1572
	if (iterator->level < PT_PAGE_TABLE_LEVEL)
1573
		return false;
1574

1575
	if (iterator->level == PT_PAGE_TABLE_LEVEL)
1576
		if (is_large_pte(*iterator->sptep))
1577
			return false;
1578

1579
	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1580
	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1581
	return true;
1582
}
1583

1584
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1585
{
1586
	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1587
	--iterator->level;
1588
}
1589

1590
/*
 * Make @sptep point at the table of shadow page @sp, with the present,
 * accessed, writable and user bits set.
 */
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
	u64 spte = __pa(sp->spt);

	spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK;
	spte |= PT_WRITABLE_MASK | PT_USER_MASK;
	__set_spte(sptep, spte);
}
1599

1600
/* If @sptep is a large pte, drop it and flush remote TLBs. */
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (!is_large_pte(*sptep))
		return;

	drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
	kvm_flush_remote_tlbs(vcpu->kvm);
}
1607

1608
/*
 * Unlink the child page behind @sptep if its access bits no longer match
 * @direct_access.
 *
 * For a direct sp, if the guest pte's dirty bit changed from clean to
 * dirty, the sp's access would be wrong (writes allowed through a
 * read-only sp), so the spte is reset here and a new sp with the correct
 * access is obtained later.
 */
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				   unsigned direct_access)
{
	struct kvm_mmu_page *child;

	if (!is_shadow_present_pte(*sptep) || is_large_pte(*sptep))
		return;

	child = page_header(*sptep & PT64_BASE_ADDR_MASK);
	if (child->role.access == direct_access)
		return;

	mmu_page_remove_parent_pte(child, sptep);
	__set_spte(sptep, shadow_trap_nonpresent_pte);
	kvm_flush_remote_tlbs(vcpu->kvm);
}
1630

1631
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1632
					 struct kvm_mmu_page *sp)
1633
{
1634
	unsigned i;
1635
	u64 *pt;
1636
	u64 ent;
1637

1638
	pt = sp->spt;
1639

1640
	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1641
		ent = pt[i];
1642

1643
		if (is_shadow_present_pte(ent)) {
1644
			if (!is_last_spte(ent, sp->role.level)) {
1645
				ent &= PT64_BASE_ADDR_MASK;
1646
				mmu_page_remove_parent_pte(page_header(ent),
1647
							   &pt[i]);
1648
			} else {
1649
				if (is_large_pte(ent))
1650
					--kvm->stat.lpages;
1651
				drop_spte(kvm, &pt[i],
1652
					  shadow_trap_nonpresent_pte);
1653
			}
1654
		}
1655
		pt[i] = shadow_trap_nonpresent_pte;
1656
	}
1657
}
1658

1659
/* Drop @parent_pte from the set of parents recorded for @sp. */
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
}
1663

1664
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1665
{
1666
	int i;
1667
	struct kvm_vcpu *vcpu;
1668

1669
	kvm_for_each_vcpu(i, vcpu, kvm)
1670
		vcpu->arch.last_pte_updated = NULL;
1671
}
1672

1673
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1674
{
1675
	u64 *parent_pte;
1676

1677
	while (sp->multimapped || sp->parent_pte) {
1678
		if (!sp->multimapped)
1679
			parent_pte = sp->parent_pte;
1680
		else {
1681
			struct kvm_pte_chain *chain;
1682

1683
			chain = container_of(sp->parent_ptes.first,
1684
					     struct kvm_pte_chain, link);
1685
			parent_pte = chain->parent_ptes[0];
1686
		}
1687
		BUG_ON(!parent_pte);
1688
		kvm_mmu_put_page(sp, parent_pte);
1689
		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1690
	}
1691
}
1692

1693
static int mmu_zap_unsync_children(struct kvm *kvm,
1694
				   struct kvm_mmu_page *parent,
1695
				   struct list_head *invalid_list)
1696
{
1697
	int i, zapped = 0;
1698
	struct mmu_page_path parents;
1699
	struct kvm_mmu_pages pages;
1700

1701
	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1702
		return 0;
1703

1704
	kvm_mmu_pages_init(parent, &parents, &pages);
1705
	while (mmu_unsync_walk(parent, &pages)) {
1706
		struct kvm_mmu_page *sp;
1707

1708
		for_each_sp(pages, sp, parents, i) {
1709
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
1710
			mmu_pages_clear_parents(&parents);
1711
			zapped++;
1712
		}
1713
		kvm_mmu_pages_init(parent, &parents, &pages);
1714
	}
1715

1716
	return zapped;
1717
}
1718

1719
static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1720
				    struct list_head *invalid_list)
1721
{
1722
	int ret;
1723

1724
	trace_kvm_mmu_prepare_zap_page(sp);
1725
	++kvm->stat.mmu_shadow_zapped;
1726
	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
1727
	kvm_mmu_page_unlink_children(kvm, sp);
1728
	kvm_mmu_unlink_parents(kvm, sp);
1729
	if (!sp->role.invalid && !sp->role.direct)
1730
		unaccount_shadowed(kvm, sp->gfn);
1731
	if (sp->unsync)
1732
		kvm_unlink_unsync_page(kvm, sp);
1733
	if (!sp->root_count) {
1734
		/* Count self */
1735
		ret++;
1736
		list_move(&sp->link, invalid_list);
1737
	} else {
1738
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
1739
		kvm_reload_remote_mmus(kvm);
1740
	}
1741

1742
	sp->role.invalid = 1;
1743
	kvm_mmu_reset_last_pte_updated(kvm);
1744
	return ret;
1745
}
1746

1747
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1748
				    struct list_head *invalid_list)
1749
{
1750
	struct kvm_mmu_page *sp;
1751

1752
	if (list_empty(invalid_list))
1753
		return;
1754

1755
	kvm_flush_remote_tlbs(kvm);
1756

1757
	do {
1758
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1759
		WARN_ON(!sp->role.invalid || sp->root_count);
1760
		kvm_mmu_free_page(kvm, sp);
1761
	} while (!list_empty(invalid_list));
1762

1763
}
1764

1765
/*
1766
 * Changing the number of mmu pages allocated to the vm
1767
 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1768
 */
1769
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1770
{
1771
	LIST_HEAD(invalid_list);
1772
	/*
1773
	 * If we set the number of mmu pages to be smaller be than the
1774
	 * number of actived pages , we must to free some mmu pages before we
1775
	 * change the value
1776
	 */
1777

1778
	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1779
		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1780
			!list_empty(&kvm->arch.active_mmu_pages)) {
1781
			struct kvm_mmu_page *page;
1782

1783
			page = container_of(kvm->arch.active_mmu_pages.prev,
1784
					    struct kvm_mmu_page, link);
1785
			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1786
			kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787
		}
1788
		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1789
	}
1790

1791
	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1792
}
1793

1794
/*
 * Zap every valid indirect shadow page of @gfn.
 * Returns 1 if at least one such page existed, 0 otherwise.
 */
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct hlist_node *pos;
	LIST_HEAD(invalid_list);
	int found = 0;

	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) {
		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
			 sp->role.word);
		found = 1;
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	return found;
}
1813

1814
/* Zap every valid indirect shadow page of @gfn. */
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct hlist_node *pos;
	LIST_HEAD(invalid_list);

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) {
		pgprintk("%s: zap %llx %x\n",
			 __func__, gfn, sp->role.word);
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
1827

1828
/*
 * Set, in the slot bitmap of the shadow page containing @pte, the bit of
 * the memslot that @gfn belongs to.
 */
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
	struct kvm_mmu_page *sp = page_header(__pa(pte));
	int slot = memslot_id(kvm, gfn);

	__set_bit(slot, sp->slot_bitmap);
}
1835

1836
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1837
{
1838
	int i;
1839
	u64 *pt = sp->spt;
1840

1841
	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1842
		return;
1843

1844
	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1845
		if (pt[i] == shadow_notrap_nonpresent_pte)
1846
			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1847
	}
1848
}
1849

1850
/*
1851
 * The function is based on mtrr_type_lookup() in
1852
 * arch/x86/kernel/cpu/mtrr/generic.c
1853
 */
1854
static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
1855
			 u64 start, u64 end)
1856
{
1857
	int i;
1858
	u64 base, mask;
1859
	u8 prev_match, curr_match;
1860
	int num_var_ranges = KVM_NR_VAR_MTRR;
1861

1862
	if (!mtrr_state->enabled)
1863
		return 0xFF;
1864

1865
	/* Make end inclusive end, instead of exclusive */
1866
	end--;
1867

1868
	/* Look in fixed ranges. Just return the type as per start */
1869
	if (mtrr_state->have_fixed && (start < 0x100000)) {
1870
		int idx;
1871

1872
		if (start < 0x80000) {
1873
			idx = 0;
1874
			idx += (start >> 16);
1875
			return mtrr_state->fixed_ranges[idx];
1876
		} else if (start < 0xC0000) {
1877
			idx = 1 * 8;
1878
			idx += ((start - 0x80000) >> 14);
1879
			return mtrr_state->fixed_ranges[idx];
1880
		} else if (start < 0x1000000) {
1881
			idx = 3 * 8;
1882
			idx += ((start - 0xC0000) >> 12);
1883
			return mtrr_state->fixed_ranges[idx];
1884
		}
1885
	}
1886

1887
	/*
1888
	 * Look in variable ranges
1889
	 * Look of multiple ranges matching this address and pick type
1890
	 * as per MTRR precedence
1891
	 */
1892
	if (!(mtrr_state->enabled & 2))
1893
		return mtrr_state->def_type;
1894

1895
	prev_match = 0xFF;
1896
	for (i = 0; i < num_var_ranges; ++i) {
1897
		unsigned short start_state, end_state;
1898

1899
		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1900
			continue;
1901

1902
		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
1903
		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
1904
		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
1905
		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
1906

1907
		start_state = ((start & mask) == (base & mask));
1908
		end_state = ((end & mask) == (base & mask));
1909
		if (start_state != end_state)
1910
			return 0xFE;
1911

1912
		if ((start & mask) != (base & mask))
1913
			continue;
1914

1915
		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1916
		if (prev_match == 0xFF) {
1917
			prev_match = curr_match;
1918
			continue;
1919
		}
1920

1921
		if (prev_match == MTRR_TYPE_UNCACHABLE ||
1922
		    curr_match == MTRR_TYPE_UNCACHABLE)
1923
			return MTRR_TYPE_UNCACHABLE;
1924

1925
		if ((prev_match == MTRR_TYPE_WRBACK &&
1926
		     curr_match == MTRR_TYPE_WRTHROUGH) ||
1927
		    (prev_match == MTRR_TYPE_WRTHROUGH &&
1928
		     curr_match == MTRR_TYPE_WRBACK)) {
1929
			prev_match = MTRR_TYPE_WRTHROUGH;
1930
			curr_match = MTRR_TYPE_WRTHROUGH;
1931
		}
1932

1933
		if (prev_match != curr_match)
1934
			return MTRR_TYPE_UNCACHABLE;
1935
	}
1936

1937
	if (prev_match != 0xFF)
1938
		return prev_match;
1939

1940
	return mtrr_state->def_type;
1941
}
1942

1943
/*
 * Return the guest MTRR memory type of the page at @gfn.  The special
 * results 0xfe and 0xff of get_mtrr_type() are mapped to write-back.
 */
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u64 start = gfn << PAGE_SHIFT;
	u8 type = get_mtrr_type(&vcpu->arch.mtrr_state, start,
				start + PAGE_SIZE);

	if (type == 0xfe || type == 0xff)
		return MTRR_TYPE_WRBACK;
	return type;
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
1954

1955
static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1956
{
1957
	trace_kvm_mmu_unsync_page(sp);
1958
	++vcpu->kvm->stat.mmu_unsync;
1959
	sp->unsync = 1;
1960

1961
	kvm_mmu_mark_parents_unsync(sp);
1962
	mmu_convert_notrap(sp);
1963
}
1964

1965
/* Mark every valid indirect shadow page of @gfn unsync, unless it already is. */
static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
{
	struct kvm_mmu_page *page;
	struct hlist_node *pos;

	for_each_gfn_indirect_valid_sp(vcpu->kvm, page, gfn, pos) {
		if (!page->unsync) {
			WARN_ON(page->role.level != PT_PAGE_TABLE_LEVEL);
			__kvm_unsync_page(vcpu, page);
		}
	}
}
1977

1978
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1979
				  bool can_unsync)
1980
{
1981
	struct kvm_mmu_page *s;
1982
	struct hlist_node *node;
1983
	bool need_unsync = false;
1984

1985
	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
1986
		if (!can_unsync)
1987
			return 1;
1988

1989
		if (s->role.level != PT_PAGE_TABLE_LEVEL)
1990
			return 1;
1991

1992
		if (!need_unsync && !s->unsync) {
1993
			if (!oos_shadow)
1994
				return 1;
1995
			need_unsync = true;
1996
		}
1997
	}
1998
	if (need_unsync)
1999
		kvm_unsync_pages(vcpu, gfn);
2000
	return 0;
2001
}
2002

2003
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004
		    unsigned pte_access, int user_fault,
2005
		    int write_fault, int dirty, int level,
2006
		    gfn_t gfn, pfn_t pfn, bool speculative,
2007
		    bool can_unsync, bool host_writable)
2008
{
2009
	u64 spte, entry = *sptep;
2010
	int ret = 0;
2011

2012
	/*
2013
	 * We don't set the accessed bit, since we sometimes want to see
2014
	 * whether the guest actually used the pte (in order to detect
2015
	 * demand paging).
2016
	 */
2017
	spte = PT_PRESENT_MASK;
2018
	if (!speculative)
2019
		spte |= shadow_accessed_mask;
2020
	if (!dirty)
2021
		pte_access &= ~ACC_WRITE_MASK;
2022
	if (pte_access & ACC_EXEC_MASK)
2023
		spte |= shadow_x_mask;
2024
	else
2025
		spte |= shadow_nx_mask;
2026
	if (pte_access & ACC_USER_MASK)
2027
		spte |= shadow_user_mask;
2028
	if (level > PT_PAGE_TABLE_LEVEL)
2029
		spte |= PT_PAGE_SIZE_MASK;
2030
	if (tdp_enabled)
2031
		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2032
			kvm_is_mmio_pfn(pfn));
2033

2034
	if (host_writable)
2035
		spte |= SPTE_HOST_WRITEABLE;
2036
	else
2037
		pte_access &= ~ACC_WRITE_MASK;
2038

2039
	spte |= (u64)pfn << PAGE_SHIFT;
2040

2041
	if ((pte_access & ACC_WRITE_MASK)
2042
	    || (!vcpu->arch.mmu.direct_map && write_fault
2043
		&& !is_write_protection(vcpu) && !user_fault)) {
2044

2045
		if (level > PT_PAGE_TABLE_LEVEL &&
2046
		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
2047
			ret = 1;
2048
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2049
			goto done;
2050
		}
2051

2052
		spte |= PT_WRITABLE_MASK;
2053

2054
		if (!vcpu->arch.mmu.direct_map
2055
		    && !(pte_access & ACC_WRITE_MASK))
2056
			spte &= ~PT_USER_MASK;
2057

2058
		/*
2059
		 * Optimization: for pte sync, if spte was writable the hash
2060
		 * lookup is unnecessary (and expensive). Write protection
2061
		 * is responsibility of mmu_get_page / kvm_sync_page.
2062
		 * Same reasoning can be applied to dirty page accounting.
2063
		 */
2064
		if (!can_unsync && is_writable_pte(*sptep))
2065
			goto set_pte;
2066

2067
		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
2068
			pgprintk("%s: found shadow page for %llx, marking ro\n",
2069
				 __func__, gfn);
2070
			ret = 1;
2071
			pte_access &= ~ACC_WRITE_MASK;
2072
			if (is_writable_pte(spte))
2073
				spte &= ~PT_WRITABLE_MASK;
2074
		}
2075
	}
2076

2077
	if (pte_access & ACC_WRITE_MASK)
2078
		mark_page_dirty(vcpu->kvm, gfn);
2079

2080
set_pte:
2081
	update_spte(sptep, spte);
2082
	/*
2083
	 * If we overwrite a writable spte with a read-only one we
2084
	 * should flush remote TLBs. Otherwise rmap_write_protect
2085
	 * will find a read-only spte, even though the writable spte
2086
	 * might be cached on a CPU's TLB.
2087
	 */
2088
	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2089
		kvm_flush_remote_tlbs(vcpu->kvm);
2090
done:
2091
	return ret;
2092
}
2093

2094
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2095
			 unsigned pt_access, unsigned pte_access,
2096
			 int user_fault, int write_fault, int dirty,
2097
			 int *ptwrite, int level, gfn_t gfn,
2098
			 pfn_t pfn, bool speculative,
2099
			 bool host_writable)
2100
{
2101
	int was_rmapped = 0;
2102
	int rmap_count;
2103

2104
	pgprintk("%s: spte %llx access %x write_fault %d"
2105
		 " user_fault %d gfn %llx\n",
2106
		 __func__, *sptep, pt_access,
2107
		 write_fault, user_fault, gfn);
2108

2109
	if (is_rmap_spte(*sptep)) {
2110
		/*
2111
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2112
		 * the parent of the now unreachable PTE.
2113
		 */
2114
		if (level > PT_PAGE_TABLE_LEVEL &&
2115
		    !is_large_pte(*sptep)) {
2116
			struct kvm_mmu_page *child;
2117
			u64 pte = *sptep;
2118

2119
			child = page_header(pte & PT64_BASE_ADDR_MASK);
2120
			mmu_page_remove_parent_pte(child, sptep);
2121
			__set_spte(sptep, shadow_trap_nonpresent_pte);
2122
			kvm_flush_remote_tlbs(vcpu->kvm);
2123
		} else if (pfn != spte_to_pfn(*sptep)) {
2124
			pgprintk("hfn old %llx new %llx\n",
2125
				 spte_to_pfn(*sptep), pfn);
2126
			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2127
			kvm_flush_remote_tlbs(vcpu->kvm);
2128
		} else
2129
			was_rmapped = 1;
2130
	}
2131

2132
	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2133
		      dirty, level, gfn, pfn, speculative, true,
2134
		      host_writable)) {
2135
		if (write_fault)
2136
			*ptwrite = 1;
2137
		kvm_mmu_flush_tlb(vcpu);
2138
	}
2139

2140
	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2141
	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2142
		 is_large_pte(*sptep)? "2MB" : "4kB",
2143
		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2144
		 *sptep, sptep);
2145
	if (!was_rmapped && is_large_pte(*sptep))
2146
		++vcpu->kvm->stat.lpages;
2147

2148
	page_header_update_slot(vcpu->kvm, sptep, gfn);
2149
	if (!was_rmapped) {
2150
		rmap_count = rmap_add(vcpu, sptep, gfn);
2151
		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2152
			rmap_recycle(vcpu, sptep, gfn);
2153
	}
2154
	kvm_release_pfn_clean(pfn);
2155
	if (speculative) {
2156
		vcpu->arch.last_pte_updated = sptep;
2157
		vcpu->arch.last_pte_gfn = gfn;
2158
	}
2159
}
2160

2161
/* new_cr3 callback for the non-paging case: intentionally does nothing. */
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
2164

2165
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2166
				     bool no_dirty_log)
2167
{
2168
	struct kvm_memory_slot *slot;
2169
	unsigned long hva;
2170

2171
	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172
	if (!slot) {
2173
		get_page(bad_page);
2174
		return page_to_pfn(bad_page);
2175
	}
2176

2177
	hva = gfn_to_hva_memslot(slot, gfn);
2178

2179
	return hva_to_pfn_atomic(vcpu->kvm, hva);
2180
}
2181

2182
/*
 * Speculatively map the consecutive sptes [@start, @end) of direct page
 * @sp.  Returns 0 when at least one page could be looked up and mapped,
 * -1 when the memslot check or the atomic page lookup fails.
 */
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp,
				    u64 *start, u64 *end)
{
	struct page *pages[PTE_PREFETCH_NUM];
	unsigned access = sp->role.access;
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
	int nr, n;

	if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
		return -1;

	nr = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
	if (nr <= 0)
		return -1;

	/* Map as many pages as the lookup returned, as speculative sptes. */
	for (n = 0; n < nr; n++)
		mmu_set_spte(vcpu, start + n, ACC_ALL,
			     access, 0, 0, 1, NULL,
			     sp->role.level, gfn + n,
			     page_to_pfn(pages[n]), true, true);

	return 0;
}
2207

2208
/*
 * Scan the PTE_PREFETCH_NUM-aligned window of @sp that contains @sptep and
 * prefetch every run of consecutive trap-nonpresent sptes in it.
 *
 * @sptep itself is never part of a run.  A run is flushed to
 * direct_pte_prefetch_many() when the first non-candidate entry after it
 * is reached; a run that is still open when the window ends is left alone.
 * The scan stops at the first failed batch.
 */
static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *sptep)
{
	u64 *first, *cur, *run = NULL;

	WARN_ON(!sp->role.direct);

	first = sp->spt + ((sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1));

	for (cur = first; cur < first + PTE_PREFETCH_NUM; cur++) {
		if (*cur == shadow_trap_nonpresent_pte && cur != sptep) {
			/* Candidate: open a run if none is in progress. */
			if (!run)
				run = cur;
			continue;
		}

		/* Non-candidate: closes the current run, if any. */
		if (!run)
			continue;
		if (direct_pte_prefetch_many(vcpu, sp, run, cur) < 0)
			break;
		run = NULL;
	}
}
2230

2231
/*
 * Entry point for spte prefetching around @sptep on a direct-mapped page.
 * Does nothing unless the spte lives in a last-level page table.
 */
static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
	struct kvm_mmu_page *sp;

	/*
	 * EPT has no accessed bit, so a prefetched translation cannot be
	 * told apart from one that was really accessed.  Prefetching is
	 * therefore disabled whenever shadow_accessed_mask is zero.
	 */
	if (shadow_accessed_mask) {
		sp = page_header(__pa(sptep));
		if (sp->role.level <= PT_PAGE_TABLE_LEVEL)
			__direct_pte_prefetch(vcpu, sp, sptep);
	}
}
2250

2251
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2252
			int map_writable, int level, gfn_t gfn, pfn_t pfn,
2253
			bool prefault)
2254
{
2255
	struct kvm_shadow_walk_iterator iterator;
2256
	struct kvm_mmu_page *sp;
2257
	int pt_write = 0;
2258
	gfn_t pseudo_gfn;
2259

2260
	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2261
		if (iterator.level == level) {
2262
			unsigned pte_access = ACC_ALL;
2263

2264
			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2265
				     0, write, 1, &pt_write,
2266
				     level, gfn, pfn, prefault, map_writable);
2267
			direct_pte_prefetch(vcpu, iterator.sptep);
2268
			++vcpu->stat.pf_fixed;
2269
			break;
2270
		}
2271

2272
		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
2273
			u64 base_addr = iterator.addr;
2274

2275
			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
2276
			pseudo_gfn = base_addr >> PAGE_SHIFT;
2277
			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2278
					      iterator.level - 1,
2279
					      1, ACC_ALL, iterator.sptep);
2280
			if (!sp) {
2281
				pgprintk("nonpaging_map: ENOMEM\n");
2282
				kvm_release_pfn_clean(pfn);
2283
				return -ENOMEM;
2284
			}
2285

2286
			__set_spte(iterator.sptep,
2287
				   __pa(sp->spt)
2288
				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
2289
				   | shadow_user_mask | shadow_x_mask
2290
				   | shadow_accessed_mask);
2291
		}
2292
	}
2293
	return pt_write;
2294
}
2295

2296
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2297
{
2298
	siginfo_t info;
2299

2300
	info.si_signo	= SIGBUS;
2301
	info.si_errno	= 0;
2302
	info.si_code	= BUS_MCEERR_AR;
2303
	info.si_addr	= (void __user *)address;
2304
	info.si_addr_lsb = PAGE_SHIFT;
2305

2306
	send_sig_info(SIGBUS, &info, tsk);
2307
}
2308

2309
/*
 * Handle an error pfn returned for @gfn.  The reference on @pfn is always
 * dropped first.
 *
 * Returns:
 *   0       - hwpoison pfn; SIGBUS has been sent to the current task
 *   -EFAULT - fault pfn
 *   1       - any other pfn
 */
static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
{
	kvm_release_pfn_clean(pfn);

	if (is_hwpoison_pfn(pfn)) {
		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
		return 0;
	}

	return is_fault_pfn(pfn) ? -EFAULT : 1;
}
2320

2321
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2322
					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2323
{
2324
	pfn_t pfn = *pfnp;
2325
	gfn_t gfn = *gfnp;
2326
	int level = *levelp;
2327

2328
	/*
2329
	 * Check if it's a transparent hugepage. If this would be an
2330
	 * hugetlbfs page, level wouldn't be set to
2331
	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2332
	 * here.
2333
	 */
2334
	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2335
	    level == PT_PAGE_TABLE_LEVEL &&
2336
	    PageTransCompound(pfn_to_page(pfn)) &&
2337
	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2338
		unsigned long mask;
2339
		/*
2340
		 * mmu_notifier_retry was successful and we hold the
2341
		 * mmu_lock here, so the pmd can't become splitting
2342
		 * from under us, and in turn
2343
		 * __split_huge_page_refcount() can't run from under
2344
		 * us and we can safely transfer the refcount from
2345
		 * PG_tail to PG_head as we switch the pfn to tail to
2346
		 * head.
2347
		 */
2348
		*levelp = level = PT_DIRECTORY_LEVEL;
2349
		mask = KVM_PAGES_PER_HPAGE(level) - 1;
2350
		VM_BUG_ON((gfn & mask) != (pfn & mask));
2351
		if (pfn & mask) {
2352
			gfn &= ~mask;
2353
			*gfnp = gfn;
2354
			kvm_release_pfn_clean(pfn);
2355
			pfn &= ~mask;
2356
			if (!get_page_unless_zero(pfn_to_page(pfn)))
2357
				BUG();
2358
			*pfnp = pfn;
2359
		}
2360
	}
2361
}
2362

2363
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364
			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365

2366
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2367
			 bool prefault)
2368
{
2369
	int r;
2370
	int level;
2371
	int force_pt_level;
2372
	pfn_t pfn;
2373
	unsigned long mmu_seq;
2374
	bool map_writable;
2375

2376
	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2377
	if (likely(!force_pt_level)) {
2378
		level = mapping_level(vcpu, gfn);
2379
		/*
2380
		 * This path builds a PAE pagetable - so we can map
2381
		 * 2mb pages at maximum. Therefore check if the level
2382
		 * is larger than that.
2383
		 */
2384
		if (level > PT_DIRECTORY_LEVEL)
2385
			level = PT_DIRECTORY_LEVEL;
2386

2387
		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2388
	} else
2389
		level = PT_PAGE_TABLE_LEVEL;
2390

2391
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
2392
	smp_rmb();
2393

2394
	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395
		return 0;
2396

2397
	/* mmio */
2398
	if (is_error_pfn(pfn))
2399
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2400

2401
	spin_lock(&vcpu->kvm->mmu_lock);
2402
	if (mmu_notifier_retry(vcpu, mmu_seq))
2403
		goto out_unlock;
2404
	kvm_mmu_free_some_pages(vcpu);
2405
	if (likely(!force_pt_level))
2406
		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2407
	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2408
			 prefault);
2409
	spin_unlock(&vcpu->kvm->mmu_lock);
2410

2411

2412
	return r;
2413

2414
out_unlock:
2415
	spin_unlock(&vcpu->kvm->mmu_lock);
2416
	kvm_release_pfn_clean(pfn);
2417
	return 0;
2418
}
2419

2420

2421
static void mmu_free_roots(struct kvm_vcpu *vcpu)
2422
{
2423
	int i;
2424
	struct kvm_mmu_page *sp;
2425
	LIST_HEAD(invalid_list);
2426

2427
	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2428
		return;
2429
	spin_lock(&vcpu->kvm->mmu_lock);
2430
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2431
	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2432
	     vcpu->arch.mmu.direct_map)) {
2433
		hpa_t root = vcpu->arch.mmu.root_hpa;
2434

2435
		sp = page_header(root);
2436
		--sp->root_count;
2437
		if (!sp->root_count && sp->role.invalid) {
2438
			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2439
			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2440
		}
2441
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2442
		spin_unlock(&vcpu->kvm->mmu_lock);
2443
		return;
2444
	}
2445
	for (i = 0; i < 4; ++i) {
2446
		hpa_t root = vcpu->arch.mmu.pae_root[i];
2447

2448
		if (root) {
2449
			root &= PT64_BASE_ADDR_MASK;
2450
			sp = page_header(root);
2451
			--sp->root_count;
2452
			if (!sp->root_count && sp->role.invalid)
2453
				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
2454
							 &invalid_list);
2455
		}
2456
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2457
	}
2458
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2459
	spin_unlock(&vcpu->kvm->mmu_lock);
2460
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2461
}
2462

2463
/*
 * Check that @root_gfn is a visible guest frame.
 *
 * Returns 0 if it is; otherwise requests a triple fault on the vcpu and
 * returns 1.
 */
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
	if (kvm_is_visible_gfn(vcpu->kvm, root_gfn))
		return 0;

	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	return 1;
}
2474

2475
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2476
{
2477
	struct kvm_mmu_page *sp;
2478
	unsigned i;
2479

2480
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2481
		spin_lock(&vcpu->kvm->mmu_lock);
2482
		kvm_mmu_free_some_pages(vcpu);
2483
		sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2484
				      1, ACC_ALL, NULL);
2485
		++sp->root_count;
2486
		spin_unlock(&vcpu->kvm->mmu_lock);
2487
		vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2488
	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2489
		for (i = 0; i < 4; ++i) {
2490
			hpa_t root = vcpu->arch.mmu.pae_root[i];
2491

2492
			ASSERT(!VALID_PAGE(root));
2493
			spin_lock(&vcpu->kvm->mmu_lock);
2494
			kvm_mmu_free_some_pages(vcpu);
2495
			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2496
					      i << 30,
2497
					      PT32_ROOT_LEVEL, 1, ACC_ALL,
2498
					      NULL);
2499
			root = __pa(sp->spt);
2500
			++sp->root_count;
2501
			spin_unlock(&vcpu->kvm->mmu_lock);
2502
			vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2503
		}
2504
		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2505
	} else
2506
		BUG();
2507

2508
	return 0;
2509
}
2510

2511
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2512
{
2513
	struct kvm_mmu_page *sp;
2514
	u64 pdptr, pm_mask;
2515
	gfn_t root_gfn;
2516
	int i;
2517

2518
	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2519

2520
	if (mmu_check_root(vcpu, root_gfn))
2521
		return 1;
2522

2523
	/*
2524
	 * Do we shadow a long mode page table? If so we need to
2525
	 * write-protect the guests page table root.
2526
	 */
2527
	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2528
		hpa_t root = vcpu->arch.mmu.root_hpa;
2529

2530
		ASSERT(!VALID_PAGE(root));
2531

2532
		spin_lock(&vcpu->kvm->mmu_lock);
2533
		kvm_mmu_free_some_pages(vcpu);
2534
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2535
				      0, ACC_ALL, NULL);
2536
		root = __pa(sp->spt);
2537
		++sp->root_count;
2538
		spin_unlock(&vcpu->kvm->mmu_lock);
2539
		vcpu->arch.mmu.root_hpa = root;
2540
		return 0;
2541
	}
2542

2543
	/*
2544
	 * We shadow a 32 bit page table. This may be a legacy 2-level
2545
	 * or a PAE 3-level page table. In either case we need to be aware that
2546
	 * the shadow page table may be a PAE or a long mode page table.
2547
	 */
2548
	pm_mask = PT_PRESENT_MASK;
2549
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2550
		pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2551

2552
	for (i = 0; i < 4; ++i) {
2553
		hpa_t root = vcpu->arch.mmu.pae_root[i];
2554

2555
		ASSERT(!VALID_PAGE(root));
2556
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2557
			pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2558
			if (!is_present_gpte(pdptr)) {
2559
				vcpu->arch.mmu.pae_root[i] = 0;
2560
				continue;
2561
			}
2562
			root_gfn = pdptr >> PAGE_SHIFT;
2563
			if (mmu_check_root(vcpu, root_gfn))
2564
				return 1;
2565
		}
2566
		spin_lock(&vcpu->kvm->mmu_lock);
2567
		kvm_mmu_free_some_pages(vcpu);
2568
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2569
				      PT32_ROOT_LEVEL, 0,
2570
				      ACC_ALL, NULL);
2571
		root = __pa(sp->spt);
2572
		++sp->root_count;
2573
		spin_unlock(&vcpu->kvm->mmu_lock);
2574

2575
		vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2576
	}
2577
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2578

2579
	/*
2580
	 * If we shadow a 32 bit page table with a long mode page
2581
	 * table we enter this path.
2582
	 */
2583
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2584
		if (vcpu->arch.mmu.lm_root == NULL) {
2585
			/*
2586
			 * The additional page necessary for this is only
2587
			 * allocated on demand.
2588
			 */
2589

2590
			u64 *lm_root;
2591

2592
			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2593
			if (lm_root == NULL)
2594
				return 1;
2595

2596
			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2597

2598
			vcpu->arch.mmu.lm_root = lm_root;
2599
		}
2600

2601
		vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2602
	}
2603

2604
	return 0;
2605
}
2606

2607
static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2608
{
2609
	if (vcpu->arch.mmu.direct_map)
2610
		return mmu_alloc_direct_roots(vcpu);
2611
	else
2612
		return mmu_alloc_shadow_roots(vcpu);
2613
}
2614

2615
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2616
{
2617
	int i;
2618
	struct kvm_mmu_page *sp;
2619

2620
	if (vcpu->arch.mmu.direct_map)
2621
		return;
2622

2623
	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2624
		return;
2625

2626
	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627
	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2628
		hpa_t root = vcpu->arch.mmu.root_hpa;
2629
		sp = page_header(root);
2630
		mmu_sync_children(vcpu, sp);
2631
		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2632
		return;
2633
	}
2634
	for (i = 0; i < 4; ++i) {
2635
		hpa_t root = vcpu->arch.mmu.pae_root[i];
2636

2637
		if (root && VALID_PAGE(root)) {
2638
			root &= PT64_BASE_ADDR_MASK;
2639
			sp = page_header(root);
2640
			mmu_sync_children(vcpu, sp);
2641
		}
2642
	}
2643
	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2644
}
2645

2646
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2647
{
2648
	spin_lock(&vcpu->kvm->mmu_lock);
2649
	mmu_sync_roots(vcpu);
2650
	spin_unlock(&vcpu->kvm->mmu_lock);
2651
}
2652

2653
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2654
				  u32 access, struct x86_exception *exception)
2655
{
2656
	if (exception)
2657
		exception->error_code = 0;
2658
	return vaddr;
2659
}
2660

2661
static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2662
					 u32 access,
2663
					 struct x86_exception *exception)
2664
{
2665
	if (exception)
2666
		exception->error_code = 0;
2667
	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668
}
2669

2670
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2671
				u32 error_code, bool prefault)
2672
{
2673
	gfn_t gfn;
2674
	int r;
2675

2676
	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2677
	r = mmu_topup_memory_caches(vcpu);
2678
	if (r)
2679
		return r;
2680

2681
	ASSERT(vcpu);
2682
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2683

2684
	gfn = gva >> PAGE_SHIFT;
2685

2686
	return nonpaging_map(vcpu, gva & PAGE_MASK,
2687
			     error_code & PFERR_WRITE_MASK, gfn, prefault);
2688
}
2689

2690
/*
 * Fill in the arch-specific part of an async page fault request for
 * @gva/@gfn and queue it via kvm_setup_async_pf(), whose result is
 * returned.
 *
 * The token is the post-incremented per-vcpu apf.id shifted left by 12,
 * OR-ed with the vcpu id.
 */
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
	struct kvm_arch_async_pf apf;

	apf.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
	apf.gfn = gfn;
	apf.direct_map = vcpu->arch.mmu.direct_map;
	apf.cr3 = vcpu->arch.mmu.get_cr3(vcpu);

	return kvm_setup_async_pf(vcpu, gva, gfn, &apf);
}
2701

2702
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2703
{
2704
	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2705
		     kvm_event_needs_reinjection(vcpu)))
2706
		return false;
2707

2708
	return kvm_x86_ops->interrupt_allowed(vcpu);
2709
}
2710

2711
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2712
			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2713
{
2714
	bool async;
2715

2716
	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2717

2718
	if (!async)
2719
		return false; /* *pfn has correct page already */
2720

2721
	put_page(pfn_to_page(*pfn));
2722

2723
	if (!prefault && can_do_async_pf(vcpu)) {
2724
		trace_kvm_try_async_get_page(gva, gfn);
2725
		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2726
			trace_kvm_async_pf_doublefault(gva, gfn);
2727
			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2728
			return true;
2729
		} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2730
			return true;
2731
	}
2732

2733
	*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2734

2735
	return false;
2736
}
2737

2738
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2739
			  bool prefault)
2740
{
2741
	pfn_t pfn;
2742
	int r;
2743
	int level;
2744
	int force_pt_level;
2745
	gfn_t gfn = gpa >> PAGE_SHIFT;
2746
	unsigned long mmu_seq;
2747
	int write = error_code & PFERR_WRITE_MASK;
2748
	bool map_writable;
2749

2750
	ASSERT(vcpu);
2751
	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2752

2753
	r = mmu_topup_memory_caches(vcpu);
2754
	if (r)
2755
		return r;
2756

2757
	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2758
	if (likely(!force_pt_level)) {
2759
		level = mapping_level(vcpu, gfn);
2760
		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2761
	} else
2762
		level = PT_PAGE_TABLE_LEVEL;
2763

2764
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
2765
	smp_rmb();
2766

2767
	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768
		return 0;
2769

2770
	/* mmio */
2771
	if (is_error_pfn(pfn))
2772
		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2773
	spin_lock(&vcpu->kvm->mmu_lock);
2774
	if (mmu_notifier_retry(vcpu, mmu_seq))
2775
		goto out_unlock;
2776
	kvm_mmu_free_some_pages(vcpu);
2777
	if (likely(!force_pt_level))
2778
		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2779
	r = __direct_map(vcpu, gpa, write, map_writable,
2780
			 level, gfn, pfn, prefault);
2781
	spin_unlock(&vcpu->kvm->mmu_lock);
2782

2783
	return r;
2784

2785
out_unlock:
2786
	spin_unlock(&vcpu->kvm->mmu_lock);
2787
	kvm_release_pfn_clean(pfn);
2788
	return 0;
2789
}
2790

2791
/* free callback: releases the vcpu's shadow roots via mmu_free_roots(). */
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
2795

2796
static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2797
				  struct kvm_mmu *context)
2798
{
2799
	context->new_cr3 = nonpaging_new_cr3;
2800
	context->page_fault = nonpaging_page_fault;
2801
	context->gva_to_gpa = nonpaging_gva_to_gpa;
2802
	context->free = nonpaging_free;
2803
	context->prefetch_page = nonpaging_prefetch_page;
2804
	context->sync_page = nonpaging_sync_page;
2805
	context->invlpg = nonpaging_invlpg;
2806
	context->update_pte = nonpaging_update_pte;
2807
	context->root_level = 0;
2808
	context->shadow_root_level = PT32E_ROOT_LEVEL;
2809
	context->root_hpa = INVALID_PAGE;
2810
	context->direct_map = true;
2811
	context->nx = false;
2812
	return 0;
2813
}
2814

2815
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2816
{
2817
	++vcpu->stat.tlb_flush;
2818
	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2819
}
2820

2821
/*
 * new_cr3 callback for paging guests: the current shadow roots are
 * released so that they are rebuilt for the new CR3.
 */
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));

	mmu_free_roots(vcpu);
}
2826

2827
/* get_cr3 callback: returns the vcpu's CR3 as read by kvm_read_cr3(). */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	return cr3;
}
2831

2832
static void inject_page_fault(struct kvm_vcpu *vcpu,
2833
			      struct x86_exception *fault)
2834
{
2835
	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2836
}
2837

2838
/*
 * free callback for paging contexts.  Same work as nonpaging_free():
 * release the vcpu's shadow roots.
 */
static void paging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
2842

2843
/*
 * Return true if guest pte @gpte, found at paging level @level, has any
 * bit set that @mmu treats as reserved.  Bit 7 of the gpte selects which
 * of the two rsvd_bits_mask rows applies.
 */
static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
{
	int row = (gpte >> 7) & 1;
	u64 rsvd = mmu->rsvd_bits_mask[row][level - 1];

	return (gpte & rsvd) != 0;
}
2850

2851
#define PTTYPE 64
2852
#include "paging_tmpl.h"
2853
#undef PTTYPE
2854

2855
#define PTTYPE 32
2856
#include "paging_tmpl.h"
2857
#undef PTTYPE
2858

2859
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2860
				  struct kvm_mmu *context,
2861
				  int level)
2862
{
2863
	int maxphyaddr = cpuid_maxphyaddr(vcpu);
2864
	u64 exb_bit_rsvd = 0;
2865

2866
	if (!context->nx)
2867
		exb_bit_rsvd = rsvd_bits(63, 63);
2868
	switch (level) {
2869
	case PT32_ROOT_LEVEL:
2870
		/* no rsvd bits for 2 level 4K page table entries */
2871
		context->rsvd_bits_mask[0][1] = 0;
2872
		context->rsvd_bits_mask[0][0] = 0;
2873
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2874

2875
		if (!is_pse(vcpu)) {
2876
			context->rsvd_bits_mask[1][1] = 0;
2877
			break;
2878
		}
2879

2880
		if (is_cpuid_PSE36())
2881
			/* 36bits PSE 4MB page */
2882
			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2883
		else
2884
			/* 32 bits PSE 4MB page */
2885
			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2886
		break;
2887
	case PT32E_ROOT_LEVEL:
2888
		context->rsvd_bits_mask[0][2] =
2889
			rsvd_bits(maxphyaddr, 63) |
2890
			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
2891
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2892
			rsvd_bits(maxphyaddr, 62);	/* PDE */
2893
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2894
			rsvd_bits(maxphyaddr, 62); 	/* PTE */
2895
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2896
			rsvd_bits(maxphyaddr, 62) |
2897
			rsvd_bits(13, 20);		/* large page */
2898
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2899
		break;
2900
	case PT64_ROOT_LEVEL:
2901
		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2902
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2903
		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2904
			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2905
		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2906
			rsvd_bits(maxphyaddr, 51);
2907
		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2908
			rsvd_bits(maxphyaddr, 51);
2909
		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2910
		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2911
			rsvd_bits(maxphyaddr, 51) |
2912
			rsvd_bits(13, 29);
2913
		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2914
			rsvd_bits(maxphyaddr, 51) |
2915
			rsvd_bits(13, 20);		/* large page */
2916
		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2917
		break;
2918
	}
2919
}
2920

2921
static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2922
					struct kvm_mmu *context,
2923
					int level)
2924
{
2925
	context->nx = is_nx(vcpu);
2926

2927
	reset_rsvds_bits_mask(vcpu, context, level);
2928

2929
	ASSERT(is_pae(vcpu));
2930
	context->new_cr3 = paging_new_cr3;
2931
	context->page_fault = paging64_page_fault;
2932
	context->gva_to_gpa = paging64_gva_to_gpa;
2933
	context->prefetch_page = paging64_prefetch_page;
2934
	context->sync_page = paging64_sync_page;
2935
	context->invlpg = paging64_invlpg;
2936
	context->update_pte = paging64_update_pte;
2937
	context->free = paging_free;
2938
	context->root_level = level;
2939
	context->shadow_root_level = level;
2940
	context->root_hpa = INVALID_PAGE;
2941
	context->direct_map = false;
2942
	return 0;
2943
}
2944

2945
static int paging64_init_context(struct kvm_vcpu *vcpu,
2946
				 struct kvm_mmu *context)
2947
{
2948
	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2949
}
2950

2951
static int paging32_init_context(struct kvm_vcpu *vcpu,
2952
				 struct kvm_mmu *context)
2953
{
2954
	context->nx = false;
2955

2956
	reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2957

2958
	context->new_cr3 = paging_new_cr3;
2959
	context->page_fault = paging32_page_fault;
2960
	context->gva_to_gpa = paging32_gva_to_gpa;
2961
	context->free = paging_free;
2962
	context->prefetch_page = paging32_prefetch_page;
2963
	context->sync_page = paging32_sync_page;
2964
	context->invlpg = paging32_invlpg;
2965
	context->update_pte = paging32_update_pte;
2966
	context->root_level = PT32_ROOT_LEVEL;
2967
	context->shadow_root_level = PT32E_ROOT_LEVEL;
2968
	context->root_hpa = INVALID_PAGE;
2969
	context->direct_map = false;
2970
	return 0;
2971
}
2972

2973
static int paging32E_init_context(struct kvm_vcpu *vcpu,
2974
				  struct kvm_mmu *context)
2975
{
2976
	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2977
}
2978

2979
static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2980
{
2981
	struct kvm_mmu *context = vcpu->arch.walk_mmu;
2982

2983
	context->base_role.word = 0;
2984
	context->new_cr3 = nonpaging_new_cr3;
2985
	context->page_fault = tdp_page_fault;
2986
	context->free = nonpaging_free;
2987
	context->prefetch_page = nonpaging_prefetch_page;
2988
	context->sync_page = nonpaging_sync_page;
2989
	context->invlpg = nonpaging_invlpg;
2990
	context->update_pte = nonpaging_update_pte;
2991
	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2992
	context->root_hpa = INVALID_PAGE;
2993
	context->direct_map = true;
2994
	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2995
	context->get_cr3 = get_cr3;
2996
	context->inject_page_fault = kvm_inject_page_fault;
2997
	context->nx = is_nx(vcpu);
2998

2999
	if (!is_paging(vcpu)) {
3000
		context->nx = false;
3001
		context->gva_to_gpa = nonpaging_gva_to_gpa;
3002
		context->root_level = 0;
3003
	} else if (is_long_mode(vcpu)) {
3004
		context->nx = is_nx(vcpu);
3005
		reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
3006
		context->gva_to_gpa = paging64_gva_to_gpa;
3007
		context->root_level = PT64_ROOT_LEVEL;
3008
	} else if (is_pae(vcpu)) {
3009
		context->nx = is_nx(vcpu);
3010
		reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
3011
		context->gva_to_gpa = paging64_gva_to_gpa;
3012
		context->root_level = PT32E_ROOT_LEVEL;
3013
	} else {
3014
		context->nx = false;
3015
		reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
3016
		context->gva_to_gpa = paging32_gva_to_gpa;
3017
		context->root_level = PT32_ROOT_LEVEL;
3018
	}
3019

3020
	return 0;
3021
}
3022

3023
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3024
{
3025
	int r;
3026
	ASSERT(vcpu);
3027
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3028

3029
	if (!is_paging(vcpu))
3030
		r = nonpaging_init_context(vcpu, context);
3031
	else if (is_long_mode(vcpu))
3032
		r = paging64_init_context(vcpu, context);
3033
	else if (is_pae(vcpu))
3034
		r = paging32E_init_context(vcpu, context);
3035
	else
3036
		r = paging32_init_context(vcpu, context);
3037

3038
	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3039
	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
3040

3041
	return r;
3042
}
3043
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3044

3045
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3046
{
3047
	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
3048

3049
	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
3050
	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
3051
	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3052

3053
	return r;
3054
}
3055

3056
static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3057
{
3058
	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3059

3060
	g_context->get_cr3           = get_cr3;
3061
	g_context->inject_page_fault = kvm_inject_page_fault;
3062

3063
	/*
3064
	 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3065
	 * translation of l2_gpa to l1_gpa addresses is done using the
3066
	 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3067
	 * functions between mmu and nested_mmu are swapped.
3068
	 */
3069
	if (!is_paging(vcpu)) {
3070
		g_context->nx = false;
3071
		g_context->root_level = 0;
3072
		g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3073
	} else if (is_long_mode(vcpu)) {
3074
		g_context->nx = is_nx(vcpu);
3075
		reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3076
		g_context->root_level = PT64_ROOT_LEVEL;
3077
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3078
	} else if (is_pae(vcpu)) {
3079
		g_context->nx = is_nx(vcpu);
3080
		reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3081
		g_context->root_level = PT32E_ROOT_LEVEL;
3082
		g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3083
	} else {
3084
		g_context->nx = false;
3085
		reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3086
		g_context->root_level = PT32_ROOT_LEVEL;
3087
		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3088
	}
3089

3090
	return 0;
3091
}
3092

3093
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3094
{
3095
	if (mmu_is_nested(vcpu))
3096
		return init_kvm_nested_mmu(vcpu);
3097
	else if (tdp_enabled)
3098
		return init_kvm_tdp_mmu(vcpu);
3099
	else
3100
		return init_kvm_softmmu(vcpu);
3101
}
3102

3103
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
3104
{
3105
	ASSERT(vcpu);
3106
	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
3107
		/* mmu.free() should set root_hpa = INVALID_PAGE */
3108
		vcpu->arch.mmu.free(vcpu);
3109
}
3110

3111
/*
 * Throw away the vcpu's current MMU context and build a fresh one.
 * Returns the result of init_kvm_mmu().
 */
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	int r;

	destroy_kvm_mmu(vcpu);
	r = init_kvm_mmu(vcpu);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3117

3118
int kvm_mmu_load(struct kvm_vcpu *vcpu)
3119
{
3120
	int r;
3121

3122
	r = mmu_topup_memory_caches(vcpu);
3123
	if (r)
3124
		goto out;
3125
	r = mmu_alloc_roots(vcpu);
3126
	spin_lock(&vcpu->kvm->mmu_lock);
3127
	mmu_sync_roots(vcpu);
3128
	spin_unlock(&vcpu->kvm->mmu_lock);
3129
	if (r)
3130
		goto out;
3131
	/* set_cr3() should ensure TLB has been flushed */
3132
	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
3133
out:
3134
	return r;
3135
}
3136
EXPORT_SYMBOL_GPL(kvm_mmu_load);
3137

3138
/* Counterpart of kvm_mmu_load(): release the vcpu's shadow roots. */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3143

3144
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3145
				  struct kvm_mmu_page *sp,
3146
				  u64 *spte)
3147
{
3148
	u64 pte;
3149
	struct kvm_mmu_page *child;
3150

3151
	pte = *spte;
3152
	if (is_shadow_present_pte(pte)) {
3153
		if (is_last_spte(pte, sp->role.level))
3154
			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3155
		else {
3156
			child = page_header(pte & PT64_BASE_ADDR_MASK);
3157
			mmu_page_remove_parent_pte(child, spte);
3158
		}
3159
	}
3160
	__set_spte(spte, shadow_trap_nonpresent_pte);
3161
	if (is_large_pte(pte))
3162
		--vcpu->kvm->stat.lpages;
3163
}
3164

3165
/*
 * Propagate a guest pte write into shadow pte @spte of @sp.
 *
 * Only last-level page tables are updated, through the mmu's update_pte
 * hook.  For any other level just the mmu_pde_zapped statistic is bumped.
 */
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp, u64 *spte,
				  const void *new)
{
	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
		++vcpu->kvm->stat.mmu_pte_updated;
		vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
	} else {
		++vcpu->kvm->stat.mmu_pde_zapped;
	}
}
3177

3178
/*
 * Decide whether replacing spte value @old with @new requires a remote
 * TLB flush.
 *
 * No flush if @old was not present.  Otherwise a flush is needed when
 * @new is not present, when the base address changed, or when @new lacks
 * a permission that @old granted.  The NX bit is inverted on both values
 * so that it compares like the other permission bits.
 */
static bool need_remote_flush(u64 old, u64 new)
{
	u64 revoked;

	if (!is_shadow_present_pte(old))
		return false;

	if (!is_shadow_present_pte(new) ||
	    ((old ^ new) & PT64_BASE_ADDR_MASK))
		return true;

	revoked = (old ^ PT64_NX_MASK) & ~(new ^ PT64_NX_MASK)
		  & PT64_PERM_MASK;

	return revoked != 0;
}
3190

3191
/*
 * Issue the TLB flush needed after a pte write.  Nothing is done when
 * @zap_page is set; otherwise a remote flush takes precedence over a
 * local one.
 */
static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
				    bool remote_flush, bool local_flush)
{
	if (zap_page)
		return;

	if (remote_flush) {
		kvm_flush_remote_tlbs(vcpu->kvm);
		return;
	}

	if (local_flush)
		kvm_mmu_flush_tlb(vcpu);
}
3202

3203
static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
3204
{
3205
	u64 *spte = vcpu->arch.last_pte_updated;
3206

3207
	return !!(spte && (*spte & shadow_accessed_mask));
3208
}
3209

3210
/*
 * Mark the vcpu's last updated spte as accessed if it was recorded for
 * @gfn.
 *
 * The accessed bit is only set when an accessed mask exists, the bit is
 * not set yet and the spte is still present.
 */
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u64 *spte = vcpu->arch.last_pte_updated;

	if (!spte || vcpu->arch.last_pte_gfn != gfn)
		return;

	if (!shadow_accessed_mask || (*spte & shadow_accessed_mask))
		return;

	if (is_shadow_present_pte(*spte))
		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
}
3221

3222
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3223
		       const u8 *new, int bytes,
3224
		       bool guest_initiated)
3225
{
3226
	gfn_t gfn = gpa >> PAGE_SHIFT;
3227
	union kvm_mmu_page_role mask = { .word = 0 };
3228
	struct kvm_mmu_page *sp;
3229
	struct hlist_node *node;
3230
	LIST_HEAD(invalid_list);
3231
	u64 entry, gentry, *spte;
3232
	unsigned pte_size, page_offset, misaligned, quadrant, offset;
3233
	int level, npte, invlpg_counter, r, flooded = 0;
3234
	bool remote_flush, local_flush, zap_page;
3235

3236
	zap_page = remote_flush = local_flush = false;
3237
	offset = offset_in_page(gpa);
3238

3239
	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
3240

3241
	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
3242

3243
	/*
3244
	 * Assume that the pte write on a page table of the same type
3245
	 * as the current vcpu paging mode since we update the sptes only
3246
	 * when they have the same mode.
3247
	 */
3248
	if ((is_pae(vcpu) && bytes == 4) || !new) {
3249
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3250
		if (is_pae(vcpu)) {
3251
			gpa &= ~(gpa_t)7;
3252
			bytes = 8;
3253
		}
3254
		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
3255
		if (r)
3256
			gentry = 0;
3257
		new = (const u8 *)&gentry;
3258
	}
3259

3260
	switch (bytes) {
3261
	case 4:
3262
		gentry = *(const u32 *)new;
3263
		break;
3264
	case 8:
3265
		gentry = *(const u64 *)new;
3266
		break;
3267
	default:
3268
		gentry = 0;
3269
		break;
3270
	}
3271

3272
	spin_lock(&vcpu->kvm->mmu_lock);
3273
	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3274
		gentry = 0;
3275
	kvm_mmu_free_some_pages(vcpu);
3276
	++vcpu->kvm->stat.mmu_pte_write;
3277
	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
3278
	if (guest_initiated) {
3279
		kvm_mmu_access_page(vcpu, gfn);
3280
		if (gfn == vcpu->arch.last_pt_write_gfn
3281
		    && !last_updated_pte_accessed(vcpu)) {
3282
			++vcpu->arch.last_pt_write_count;
3283
			if (vcpu->arch.last_pt_write_count >= 3)
3284
				flooded = 1;
3285
		} else {
3286
			vcpu->arch.last_pt_write_gfn = gfn;
3287
			vcpu->arch.last_pt_write_count = 1;
3288
			vcpu->arch.last_pte_updated = NULL;
3289
		}
3290
	}
3291

3292
	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
3293
	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
3294
		pte_size = sp->role.cr4_pae ? 8 : 4;
3295
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
3296
		misaligned |= bytes < 4;
3297
		if (misaligned || flooded) {
3298
			/*
3299
			 * Misaligned accesses are too much trouble to fix
3300
			 * up; also, they usually indicate a page is not used
3301
			 * as a page table.
3302
			 *
3303
			 * If we're seeing too many writes to a page,
3304
			 * it may no longer be a page table, or we may be
3305
			 * forking, in which case it is better to unmap the
3306
			 * page.
3307
			 */
3308
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
3309
				 gpa, bytes, sp->role.word);
3310
			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
3311
						     &invalid_list);
3312
			++vcpu->kvm->stat.mmu_flooded;
3313
			continue;
3314
		}
3315
		page_offset = offset;
3316
		level = sp->role.level;
3317
		npte = 1;
3318
		if (!sp->role.cr4_pae) {
3319
			page_offset <<= 1;	/* 32->64 */
3320
			/*
3321
			 * A 32-bit pde maps 4MB while the shadow pdes map
3322
			 * only 2MB.  So we need to double the offset again
3323
			 * and zap two pdes instead of one.
3324
			 */
3325
			if (level == PT32_ROOT_LEVEL) {
3326
				page_offset &= ~7; /* kill rounding error */
3327
				page_offset <<= 1;
3328
				npte = 2;
3329
			}
3330
			quadrant = page_offset >> PAGE_SHIFT;
3331
			page_offset &= ~PAGE_MASK;
3332
			if (quadrant != sp->role.quadrant)
3333
				continue;
3334
		}
3335
		local_flush = true;
3336
		spte = &sp->spt[page_offset / sizeof(*spte)];
3337
		while (npte--) {
3338
			entry = *spte;
3339
			mmu_pte_write_zap_pte(vcpu, sp, spte);
3340
			if (gentry &&
3341
			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3342
			      & mask.word))
3343
				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3344
			if (!remote_flush && need_remote_flush(entry, *spte))
3345
				remote_flush = true;
3346
			++spte;
3347
		}
3348
	}
3349
	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
3350
	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3351
	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
3352
	spin_unlock(&vcpu->kvm->mmu_lock);
3353
}
3354

3355
/*
 * Translate @gva with kvm_mmu_gva_to_gpa_read() and call
 * kvm_mmu_unprotect_page() on the resulting guest frame while holding
 * mmu_lock.  Returns 0 without doing anything when the mmu uses a direct
 * map, otherwise the result of kvm_mmu_unprotect_page().
 */
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	struct kvm *kvm = vcpu->kvm;
	gfn_t gfn;
	int ret;

	if (vcpu->arch.mmu.direct_map)
		return 0;

	gfn = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL) >> PAGE_SHIFT;

	spin_lock(&kvm->mmu_lock);
	ret = kvm_mmu_unprotect_page(kvm, gfn);
	spin_unlock(&kvm->mmu_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
3371

3372
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3373
{
3374
	LIST_HEAD(invalid_list);
3375

3376
	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
3377
	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
3378
		struct kvm_mmu_page *sp;
3379

3380
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3381
				  struct kvm_mmu_page, link);
3382
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3383
		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3384
		++vcpu->kvm->stat.mmu_recycled;
3385
	}
3386
}
3387

3388
/*
 * Run the mmu's page_fault handler for @cr2 and, if that handler
 * returns a positive value, emulate the faulting instruction.
 *
 * Returns:
 *   - the handler's negative return value, or a non-zero result of
 *     mmu_topup_memory_caches(), unchanged;
 *   - 1 when the handler returned 0 or emulation reported EMULATE_DONE;
 *   - 0 when emulation reported EMULATE_DO_MMIO (also counted in
 *     mmio_exits) or EMULATE_FAIL.
 * Any other emulation result hits BUG().
 */
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
		       void *insn, int insn_len)
{
	enum emulation_result er;
	int ret;

	ret = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
	if (ret < 0)
		return ret;
	if (ret == 0)
		return 1;

	ret = mmu_topup_memory_caches(vcpu);
	if (ret)
		return ret;

	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		++vcpu->stat.mmio_exits;
		return 0;
	case EMULATE_FAIL:
		return 0;
	default:
		BUG();
	}

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
3424

3425
/*
 * Invalidate the translation for @gva: call the mmu's invlpg callback,
 * flush the vcpu's TLB and account the event in the invlpg statistic.
 */
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	vcpu->arch.mmu.invlpg(vcpu, gva);
	kvm_mmu_flush_tlb(vcpu);

	vcpu->stat.invlpg++;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
3432

3433
void kvm_enable_tdp(void)
3434
{
3435
	tdp_enabled = true;
3436
}
3437
EXPORT_SYMBOL_GPL(kvm_enable_tdp);
3438

3439
void kvm_disable_tdp(void)
3440
{
3441
	tdp_enabled = false;
3442
}
3443
EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3444

3445
static void free_mmu_pages(struct kvm_vcpu *vcpu)
3446
{
3447
	free_page((unsigned long)vcpu->arch.mmu.pae_root);
3448
	if (vcpu->arch.mmu.lm_root != NULL)
3449
		free_page((unsigned long)vcpu->arch.mmu.lm_root);
3450
}
3451

3452
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
3453
{
3454
	struct page *page;
3455
	int i;
3456

3457
	ASSERT(vcpu);
3458

3459
	/*
3460
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
3461
	 * Therefore we need to allocate shadow page tables in the first
3462
	 * 4GB of memory, which happens to fit the DMA32 zone.
3463
	 */
3464
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
3465
	if (!page)
3466
		return -ENOMEM;
3467

3468
	vcpu->arch.mmu.pae_root = page_address(page);
3469
	for (i = 0; i < 4; ++i)
3470
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
3471

3472
	return 0;
3473
}
3474

3475
int kvm_mmu_create(struct kvm_vcpu *vcpu)
3476
{
3477
	ASSERT(vcpu);
3478
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3479

3480
	return alloc_mmu_pages(vcpu);
3481
}
3482

3483
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3484
{
3485
	ASSERT(vcpu);
3486
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3487

3488
	return init_kvm_mmu(vcpu);
3489
}
3490

3491
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3492
{
3493
	struct kvm_mmu_page *sp;
3494

3495
	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
3496
		int i;
3497
		u64 *pt;
3498

3499
		if (!test_bit(slot, sp->slot_bitmap))
3500
			continue;
3501

3502
		pt = sp->spt;
3503
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3504
			if (!is_shadow_present_pte(pt[i]) ||
3505
			      !is_last_spte(pt[i], sp->role.level))
3506
				continue;
3507

3508
			if (is_large_pte(pt[i])) {
3509
				drop_spte(kvm, &pt[i],
3510
					  shadow_trap_nonpresent_pte);
3511
				--kvm->stat.lpages;
3512
				continue;
3513
			}
3514

3515
			/* avoid RMW */
3516
			if (is_writable_pte(pt[i]))
3517
				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3518
		}
3519
	}
3520
	kvm_flush_remote_tlbs(kvm);
3521
}
3522

3523
void kvm_mmu_zap_all(struct kvm *kvm)
3524
{
3525
	struct kvm_mmu_page *sp, *node;
3526
	LIST_HEAD(invalid_list);
3527

3528
	spin_lock(&kvm->mmu_lock);
3529
restart:
3530
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
3531
		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
3532
			goto restart;
3533

3534
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
3535
	spin_unlock(&kvm->mmu_lock);
3536
}
3537

3538
static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3539
					       struct list_head *invalid_list)
3540
{
3541
	struct kvm_mmu_page *page;
3542

3543
	page = container_of(kvm->arch.active_mmu_pages.prev,
3544
			    struct kvm_mmu_page, link);
3545
	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3546
}
3547

3548
static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3549
{
3550
	struct kvm *kvm;
3551
	struct kvm *kvm_freed = NULL;
3552
	int nr_to_scan = sc->nr_to_scan;
3553

3554
	if (nr_to_scan == 0)
3555
		goto out;
3556

3557
	raw_spin_lock(&kvm_lock);
3558

3559
	list_for_each_entry(kvm, &vm_list, vm_list) {
3560
		int idx, freed_pages;
3561
		LIST_HEAD(invalid_list);
3562

3563
		idx = srcu_read_lock(&kvm->srcu);
3564
		spin_lock(&kvm->mmu_lock);
3565
		if (!kvm_freed && nr_to_scan > 0 &&
3566
		    kvm->arch.n_used_mmu_pages > 0) {
3567
			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3568
							  &invalid_list);
3569
			kvm_freed = kvm;
3570
		}
3571
		nr_to_scan--;
3572

3573
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
3574
		spin_unlock(&kvm->mmu_lock);
3575
		srcu_read_unlock(&kvm->srcu, idx);
3576
	}
3577
	if (kvm_freed)
3578
		list_move_tail(&kvm_freed->vm_list, &vm_list);
3579

3580
	raw_spin_unlock(&kvm_lock);
3581

3582
out:
3583
	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3584
}
3585

3586
static struct shrinker mmu_shrinker = {
3587
	.shrink = mmu_shrink,
3588
	.seeks = DEFAULT_SEEKS * 10,
3589
};
3590

3591
static void mmu_destroy_caches(void)
3592
{
3593
	if (pte_chain_cache)
3594
		kmem_cache_destroy(pte_chain_cache);
3595
	if (rmap_desc_cache)
3596
		kmem_cache_destroy(rmap_desc_cache);
3597
	if (mmu_page_header_cache)
3598
		kmem_cache_destroy(mmu_page_header_cache);
3599
}
3600

3601
int kvm_mmu_module_init(void)
3602
{
3603
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
3604
					    sizeof(struct kvm_pte_chain),
3605
					    0, 0, NULL);
3606
	if (!pte_chain_cache)
3607
		goto nomem;
3608
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3609
					    sizeof(struct kvm_rmap_desc),
3610
					    0, 0, NULL);
3611
	if (!rmap_desc_cache)
3612
		goto nomem;
3613

3614
	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
3615
						  sizeof(struct kvm_mmu_page),
3616
						  0, 0, NULL);
3617
	if (!mmu_page_header_cache)
3618
		goto nomem;
3619

3620
	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3621
		goto nomem;
3622

3623
	register_shrinker(&mmu_shrinker);
3624

3625
	return 0;
3626

3627
nomem:
3628
	mmu_destroy_caches();
3629
	return -ENOMEM;
3630
}
3631

3632
/*
3633
 * Caculate mmu pages needed for kvm.
3634
 */
3635
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3636
{
3637
	int i;
3638
	unsigned int nr_mmu_pages;
3639
	unsigned int  nr_pages = 0;
3640
	struct kvm_memslots *slots;
3641

3642
	slots = kvm_memslots(kvm);
3643

3644
	for (i = 0; i < slots->nmemslots; i++)
3645
		nr_pages += slots->memslots[i].npages;
3646

3647
	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3648
	nr_mmu_pages = max(nr_mmu_pages,
3649
			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
3650

3651
	return nr_mmu_pages;
3652
}
3653

3654
static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3655
				unsigned len)
3656
{
3657
	if (len > buffer->len)
3658
		return NULL;
3659
	return buffer->ptr;
3660
}
3661

3662
static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
3663
				unsigned len)
3664
{
3665
	void *ret;
3666

3667
	ret = pv_mmu_peek_buffer(buffer, len);
3668
	if (!ret)
3669
		return ret;
3670
	buffer->ptr += len;
3671
	buffer->len -= len;
3672
	buffer->processed += len;
3673
	return ret;
3674
}
3675

3676
/*
 * Write @value to guest physical address @addr through
 * emulator_write_phys().  The write is 8 bytes wide when the vcpu is in
 * long mode or uses PAE, 4 bytes otherwise.
 *
 * Returns 1 on success, a non-zero mmu_topup_memory_caches() result
 * unchanged, or -EFAULT when emulator_write_phys() returns 0.
 */
static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
			     gpa_t addr, gpa_t value)
{
	int bytes = (is_long_mode(vcpu) || is_pae(vcpu)) ? 8 : 4;
	int err;

	err = mmu_topup_memory_caches(vcpu);
	if (err)
		return err;

	return emulator_write_phys(vcpu, addr, &value, bytes) ? 1 : -EFAULT;
}
3694

3695
/*
 * Reload the vcpu's current cr3 value through kvm_set_cr3(), ignoring
 * its result.  Always returns 1.
 */
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	(void)kvm_set_cr3(vcpu, cr3);

	return 1;
}
3700

3701
/*
 * Unshadow the guest frame containing @addr while holding mmu_lock.
 * Always returns 1.
 */
static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
{
	struct kvm *kvm = vcpu->kvm;

	spin_lock(&kvm->mmu_lock);
	mmu_unshadow(kvm, addr >> PAGE_SHIFT);
	spin_unlock(&kvm->mmu_lock);

	return 1;
}
3708

3709
static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
3710
			     struct kvm_pv_mmu_op_buffer *buffer)
3711
{
3712
	struct kvm_mmu_op_header *header;
3713

3714
	header = pv_mmu_peek_buffer(buffer, sizeof *header);
3715
	if (!header)
3716
		return 0;
3717
	switch (header->op) {
3718
	case KVM_MMU_OP_WRITE_PTE: {
3719
		struct kvm_mmu_op_write_pte *wpte;
3720

3721
		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
3722
		if (!wpte)
3723
			return 0;
3724
		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
3725
					wpte->pte_val);
3726
	}
3727
	case KVM_MMU_OP_FLUSH_TLB: {
3728
		struct kvm_mmu_op_flush_tlb *ftlb;
3729

3730
		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
3731
		if (!ftlb)
3732
			return 0;
3733
		return kvm_pv_mmu_flush_tlb(vcpu);
3734
	}
3735
	case KVM_MMU_OP_RELEASE_PT: {
3736
		struct kvm_mmu_op_release_pt *rpt;
3737

3738
		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
3739
		if (!rpt)
3740
			return 0;
3741
		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
3742
	}
3743
	default: return 0;
3744
	}
3745
}
3746

3747
int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
3748
		  gpa_t addr, unsigned long *ret)
3749
{
3750
	int r;
3751
	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
3752

3753
	buffer->ptr = buffer->buf;
3754
	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
3755
	buffer->processed = 0;
3756

3757
	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
3758
	if (r)
3759
		goto out;
3760

3761
	while (buffer->len) {
3762
		r = kvm_pv_mmu_op_one(vcpu, buffer);
3763
		if (r < 0)
3764
			goto out;
3765
		if (r == 0)
3766
			break;
3767
	}
3768

3769
	r = 1;
3770
out:
3771
	*ret = buffer->processed;
3772
	return r;
3773
}
3774

3775
/*
 * Walk the shadow page table for @addr under mmu_lock and copy each
 * spte visited into @sptes, indexed by (level - 1).  The walk stops
 * after the first spte that is not shadow-present.
 *
 * Returns the number of sptes stored.
 */
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
	struct kvm_shadow_walk_iterator iterator;
	int count = 0;

	spin_lock(&vcpu->kvm->mmu_lock);

	for_each_shadow_entry(vcpu, addr, iterator) {
		u64 *sptep = iterator.sptep;

		sptes[iterator.level - 1] = *sptep;
		count++;

		if (!is_shadow_present_pte(*sptep))
			break;
	}

	spin_unlock(&vcpu->kvm->mmu_lock);

	return count;
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3792

3793
/*
 * Tear down the vcpu's mmu: destroy the mmu context, then free the
 * root pages and the per-vcpu memory caches.
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);

	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}
3801

3802
/*
 * Without CONFIG_KVM_MMU_AUDIT, provide an empty mmu_audit_disable()
 * stub for kvm_mmu_module_exit(); otherwise pull in mmu_audit.c.
 */
#ifndef CONFIG_KVM_MMU_AUDIT
static void mmu_audit_disable(void)
{
}
#else
#include "mmu_audit.c"
#endif
3807

3808
void kvm_mmu_module_exit(void)
3809
{
3810
	mmu_destroy_caches();
3811
	percpu_counter_destroy(&kvm_total_used_mmu_pages);
3812
	unregister_shrinker(&mmu_shrinker);
3813
	mmu_audit_disable();
3814
}
3815

3816
Product

Resources

Company