GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/kvm/paging_tmpl.h
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <[email protected]>
 *   Avi Kivity   <[email protected]>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */
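/*
 * For reference: arch/x86/kvm/mmu.c is expected to instantiate this template
 * twice, defining PTTYPE before each inclusion, roughly as follows:
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 */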
#if PTTYPE == 64
        #define pt_element_t u64
        #define guest_walker guest_walker64
        #define FNAME(name) paging##64_##name
        #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
        #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
        #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
        #ifdef CONFIG_X86_64
        #define PT_MAX_FULL_LEVELS 4
        #define CMPXCHG cmpxchg
        #else
        #define CMPXCHG cmpxchg64
        #define PT_MAX_FULL_LEVELS 2
        #endif
#elif PTTYPE == 32
        #define pt_element_t u32
        #define guest_walker guest_walker32
        #define FNAME(name) paging##32_##name
        #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
        #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
        #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
        #define PT_LEVEL_BITS PT32_LEVEL_BITS
        #define PT_MAX_FULL_LEVELS 2
        #define CMPXCHG cmpxchg
#else
        #error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
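/*
 * pt_access is the access allowed by the page-table entries above the final
 * one; pte_access additionally folds in the final pte (see walk_addr_generic
 * below, where pte_access = pt_access & gpte_access(pte)).
 */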
struct guest_walker {
        int level;
        gfn_t table_gfn[PT_MAX_FULL_LEVELS];
        pt_element_t ptes[PT_MAX_FULL_LEVELS];
        pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
        gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
        unsigned pt_access;
        unsigned pte_access;
        gfn_t gfn;
        struct x86_exception fault;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
        return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}
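/*
 * Atomically update a guest pte in guest memory, mirroring the way the
 * hardware walker sets the accessed/dirty bits.  Returns a negative errno if
 * the gpte cannot be reached, non-zero if the pte changed under us (the
 * caller restarts the walk), and zero if the update succeeded.
 */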
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                               pt_element_t __user *ptep_user, unsigned index,
                               pt_element_t orig_pte, pt_element_t new_pte)
{
        int npages;
        pt_element_t ret;
        pt_element_t *table;
        struct page *page;

        npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
        /* Check if the user is doing something meaningless. */
        if (unlikely(npages != 1))
                return -EFAULT;

        table = kmap_atomic(page, KM_USER0);
        ret = CMPXCHG(&table[index], orig_pte, new_pte);
        kunmap_atomic(table, KM_USER0);

        kvm_release_page_dirty(page);

        return (ret != orig_pte);
}

static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
{
        unsigned access;

        access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
        if (vcpu->arch.mmu.nx)
                access &= ~(gpte >> PT64_NX_SHIFT);
#endif
        return access;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
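/*
 * walk_addr_generic() walks the guest page tables rooted at mmu->get_cr3()
 * for @addr, recording each table gfn and pte in @walker and setting the
 * accessed bit (and, for write faults, the dirty bit) as a hardware walker
 * would.  It returns 1 on success; on failure it returns 0 and fills in
 * walker->fault so the fault can be injected into the guest.
 */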
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
                                    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
                                    gva_t addr, u32 access)
{
        pt_element_t pte;
        pt_element_t __user *uninitialized_var(ptep_user);
        gfn_t table_gfn;
        unsigned index, pt_access, uninitialized_var(pte_access);
        gpa_t pte_gpa;
        bool eperm, present, rsvd_fault;
        int offset, write_fault, user_fault, fetch_fault;

        write_fault = access & PFERR_WRITE_MASK;
        user_fault = access & PFERR_USER_MASK;
        fetch_fault = access & PFERR_FETCH_MASK;

        trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
                                     fetch_fault);
walk:
        present = true;
        eperm = rsvd_fault = false;
        walker->level = mmu->root_level;
        pte = mmu->get_cr3(vcpu);

#if PTTYPE == 64
        if (walker->level == PT32E_ROOT_LEVEL) {
                pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
                trace_kvm_mmu_paging_element(pte, walker->level);
                if (!is_present_gpte(pte)) {
                        present = false;
                        goto error;
                }
                --walker->level;
        }
#endif
        ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
               (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);

        pt_access = ACC_ALL;

        for (;;) {
                gfn_t real_gfn;
                unsigned long host_addr;

                index = PT_INDEX(addr, walker->level);

                table_gfn = gpte_to_gfn(pte);
                offset = index * sizeof(pt_element_t);
                pte_gpa = gfn_to_gpa(table_gfn) + offset;
                walker->table_gfn[walker->level - 1] = table_gfn;
                walker->pte_gpa[walker->level - 1] = pte_gpa;

                real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
                                              PFERR_USER_MASK|PFERR_WRITE_MASK);
                if (unlikely(real_gfn == UNMAPPED_GVA)) {
                        present = false;
                        break;
                }
                real_gfn = gpa_to_gfn(real_gfn);

                host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
                if (unlikely(kvm_is_error_hva(host_addr))) {
                        present = false;
                        break;
                }

                ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
                if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
                        present = false;
                        break;
                }

                trace_kvm_mmu_paging_element(pte, walker->level);

                if (unlikely(!is_present_gpte(pte))) {
                        present = false;
                        break;
                }

                if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
                                              walker->level))) {
                        rsvd_fault = true;
                        break;
                }

                if (unlikely(write_fault && !is_writable_pte(pte)
                             && (user_fault || is_write_protection(vcpu))))
                        eperm = true;

                if (unlikely(user_fault && !(pte & PT_USER_MASK)))
                        eperm = true;

#if PTTYPE == 64
                if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
                        eperm = true;
#endif
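                /*
                 * Update the accessed bit as a hardware walker would: only
                 * while the walk is still permitted, and via cmpxchg so a
                 * racing guest update forces a re-walk.
                 */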
                if (!eperm && !rsvd_fault
                    && unlikely(!(pte & PT_ACCESSED_MASK))) {
                        int ret;
                        trace_kvm_mmu_set_accessed_bit(table_gfn, index,
                                                       sizeof(pte));
                        ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
                                                  pte, pte|PT_ACCESSED_MASK);
                        if (unlikely(ret < 0)) {
                                present = false;
                                break;
                        } else if (ret)
                                goto walk;

                        mark_page_dirty(vcpu->kvm, table_gfn);
                        pte |= PT_ACCESSED_MASK;
                }

                pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

                walker->ptes[walker->level - 1] = pte;

                if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
                    ((walker->level == PT_DIRECTORY_LEVEL) &&
                     is_large_pte(pte) &&
                     (PTTYPE == 64 || is_pse(vcpu))) ||
                    ((walker->level == PT_PDPE_LEVEL) &&
                     is_large_pte(pte) &&
                     mmu->root_level == PT64_ROOT_LEVEL)) {
                        int lvl = walker->level;
                        gpa_t real_gpa;
                        gfn_t gfn;
                        u32 ac;

                        gfn = gpte_to_gfn_lvl(pte, lvl);
                        gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;

                        if (PTTYPE == 32 &&
                            walker->level == PT_DIRECTORY_LEVEL &&
                            is_cpuid_PSE36())
                                gfn += pse36_gfn_delta(pte);

                        ac = write_fault | fetch_fault | user_fault;

                        real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
                                                      ac);
                        if (real_gpa == UNMAPPED_GVA)
                                return 0;

                        walker->gfn = real_gpa >> PAGE_SHIFT;

                        break;
                }

                pt_access = pte_access;
                --walker->level;
        }

        if (unlikely(!present || eperm || rsvd_fault))
                goto error;
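        /*
         * For a successful write, set the dirty bit in the final gpte, again
         * via cmpxchg so that a concurrent guest modification restarts the
         * walk.
         */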
        if (write_fault && unlikely(!is_dirty_gpte(pte))) {
                int ret;

                trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
                ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
                                          pte, pte|PT_DIRTY_MASK);
                if (unlikely(ret < 0)) {
                        present = false;
                        goto error;
                } else if (ret)
                        goto walk;

                mark_page_dirty(vcpu->kvm, table_gfn);
                pte |= PT_DIRTY_MASK;
                walker->ptes[walker->level - 1] = pte;
        }

        walker->pt_access = pt_access;
        walker->pte_access = pte_access;
        pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
                 __func__, (u64)pte, pte_access, pt_access);
        return 1;

error:
        walker->fault.vector = PF_VECTOR;
        walker->fault.error_code_valid = true;
        walker->fault.error_code = 0;
        if (present)
                walker->fault.error_code |= PFERR_PRESENT_MASK;

        walker->fault.error_code |= write_fault | user_fault;

        if (fetch_fault && mmu->nx)
                walker->fault.error_code |= PFERR_FETCH_MASK;
        if (rsvd_fault)
                walker->fault.error_code |= PFERR_RSVD_MASK;

        walker->fault.address = addr;
        walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;

        trace_kvm_mmu_walker_error(walker->fault.error_code);
        return 0;
}
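/*
 * walk_addr() walks the vcpu's own page tables (vcpu->arch.mmu);
 * walk_addr_nested() performs the same walk through vcpu->arch.nested_mmu,
 * which is used for translating a nested guest's addresses.
 */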
static int FNAME(walk_addr)(struct guest_walker *walker,
                            struct kvm_vcpu *vcpu, gva_t addr, u32 access)
{
        return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
                                        access);
}

static int FNAME(walk_addr_nested)(struct guest_walker *walker,
                                   struct kvm_vcpu *vcpu, gva_t addr,
                                   u32 access)
{
        return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
                                        addr, access);
}

static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
                                         struct kvm_mmu_page *sp, u64 *spte,
                                         pt_element_t gpte)
{
        u64 nonpresent = shadow_trap_nonpresent_pte;

        if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
                goto no_present;

        if (!is_present_gpte(gpte)) {
                if (!sp->unsync)
                        nonpresent = shadow_notrap_nonpresent_pte;
                goto no_present;
        }

        if (!(gpte & PT_ACCESSED_MASK))
                goto no_present;

        return false;

no_present:
        drop_spte(vcpu->kvm, spte, nonpresent);
        return true;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                              u64 *spte, const void *pte)
{
        pt_element_t gpte;
        unsigned pte_access;
        pfn_t pfn;

        gpte = *(const pt_element_t *)pte;
        if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                return;

        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
        pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
        pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
        if (is_error_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                return;
        }

        /*
         * We call mmu_set_spte() with host_writable = true because
         * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
         */
        mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
                     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
                     gpte_to_gfn(gpte), pfn, true, true);
}

static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
                                struct guest_walker *gw, int level)
{
        pt_element_t curr_pte;
        gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
        u64 mask;
        int r, index;

        if (level == PT_PAGE_TABLE_LEVEL) {
                mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
                base_gpa = pte_gpa & ~mask;
                index = (pte_gpa - base_gpa) / sizeof(pt_element_t);

                r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
                                          gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
                curr_pte = gw->prefetch_ptes[index];
        } else
                r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
                                          &curr_pte, sizeof(curr_pte));

        return r || curr_pte != gw->ptes[level - 1];
}

static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
                                u64 *sptep)
{
        struct kvm_mmu_page *sp;
        pt_element_t *gptep = gw->prefetch_ptes;
        u64 *spte;
        int i;

        sp = page_header(__pa(sptep));

        if (sp->role.level > PT_PAGE_TABLE_LEVEL)
                return;

        if (sp->role.direct)
                return __direct_pte_prefetch(vcpu, sp, sptep);

        i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
        spte = sp->spt + i;

        for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
                pt_element_t gpte;
                unsigned pte_access;
                gfn_t gfn;
                pfn_t pfn;
                bool dirty;

                if (spte == sptep)
                        continue;

                if (*spte != shadow_trap_nonpresent_pte)
                        continue;

                gpte = gptep[i];

                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
                        continue;

                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                gfn = gpte_to_gfn(gpte);
                dirty = is_dirty_gpte(gpte);
                pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
                                              (pte_access & ACC_WRITE_MASK) && dirty);
                if (is_error_pfn(pfn)) {
                        kvm_release_pfn_clean(pfn);
                        break;
                }

                mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
                             dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
                             pfn, true, true);
        }
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
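/*
 * fetch() mirrors the guest walk recorded in @gw into the shadow page tables:
 * it creates shadow pages down to @hlevel, re-checking at each step (via
 * gpte_changed()) that the guest entries it depends on have not been
 * modified, and finally installs the leaf spte with mmu_set_spte().  It
 * returns the leaf sptep, or NULL if the guest ptes changed under us or are
 * no longer present.
 */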
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *gw,
                         int user_fault, int write_fault, int hlevel,
                         int *ptwrite, pfn_t pfn, bool map_writable,
                         bool prefault)
{
        unsigned access = gw->pt_access;
        struct kvm_mmu_page *sp = NULL;
        bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
        int top_level;
        unsigned direct_access;
        struct kvm_shadow_walk_iterator it;

        if (!is_present_gpte(gw->ptes[gw->level - 1]))
                return NULL;

        direct_access = gw->pt_access & gw->pte_access;
        if (!dirty)
                direct_access &= ~ACC_WRITE_MASK;

        top_level = vcpu->arch.mmu.root_level;
        if (top_level == PT32E_ROOT_LEVEL)
                top_level = PT32_ROOT_LEVEL;
        /*
         * Verify that the top-level gpte is still there. Since the page
         * is a root page, it is either write protected (and cannot be
         * changed from now on) or it is invalid (in which case, we don't
         * really care if it changes underneath us after this point).
         */
        if (FNAME(gpte_changed)(vcpu, gw, top_level))
                goto out_gpte_changed;

        for (shadow_walk_init(&it, vcpu, addr);
             shadow_walk_okay(&it) && it.level > gw->level;
             shadow_walk_next(&it)) {
                gfn_t table_gfn;

                drop_large_spte(vcpu, it.sptep);

                sp = NULL;
                if (!is_shadow_present_pte(*it.sptep)) {
                        table_gfn = gw->table_gfn[it.level - 2];
                        sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
                                              false, access, it.sptep);
                }

                /*
                 * Verify that the gpte in the page we've just write
                 * protected is still there.
                 */
                if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
                        goto out_gpte_changed;

                if (sp)
                        link_shadow_page(it.sptep, sp);
        }

        for (;
             shadow_walk_okay(&it) && it.level > hlevel;
             shadow_walk_next(&it)) {
                gfn_t direct_gfn;

                validate_direct_spte(vcpu, it.sptep, direct_access);

                drop_large_spte(vcpu, it.sptep);

                if (is_shadow_present_pte(*it.sptep))
                        continue;

                direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);

                sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
                                      true, direct_access, it.sptep);
                link_shadow_page(it.sptep, sp);
        }

        mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
                     user_fault, write_fault, dirty, ptwrite, it.level,
                     gw->gfn, pfn, prefault, map_writable);
        FNAME(pte_prefetch)(vcpu, gw, it.sptep);

        return it.sptep;

out_gpte_changed:
        if (sp)
                kvm_mmu_put_page(sp, it.sptep);
        kvm_release_pfn_clean(pfn);
        return NULL;
}

/*
 * Page fault handler. There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                             bool prefault)
{
        int write_fault = error_code & PFERR_WRITE_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
        struct guest_walker walker;
        u64 *sptep;
        int write_pt = 0;
        int r;
        pfn_t pfn;
        int level = PT_PAGE_TABLE_LEVEL;
        int force_pt_level;
        unsigned long mmu_seq;
        bool map_writable;

        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;

        /*
         * Look up the guest pte for the faulting address.
         */
        r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);

        /*
         * The page is not mapped by the guest. Let the guest handle it.
         */
        if (!r) {
                pgprintk("%s: guest page fault\n", __func__);
                if (!prefault) {
                        inject_page_fault(vcpu, &walker.fault);
                        /* reset fork detector */
                        vcpu->arch.last_pt_write_count = 0;
                }
                return 0;
        }

        if (walker.level >= PT_DIRECTORY_LEVEL)
                force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
        else
                force_pt_level = 1;
        if (!force_pt_level) {
                level = min(walker.level, mapping_level(vcpu, walker.gfn));
                walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
        }
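        /*
         * Snapshot the mmu_notifier sequence count before resolving the pfn;
         * mmu_notifier_retry() below rechecks it under mmu_lock so that a
         * concurrent invalidation makes us bail out instead of installing a
         * stale spte.
         */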
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();

        if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
                         &map_writable))
                return 0;

        /* mmio */
        if (is_error_pfn(pfn))
                return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);

        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;

        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
        kvm_mmu_free_some_pages(vcpu);
        if (!force_pt_level)
                transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
        sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                             level, &write_pt, pfn, map_writable, prefault);
        (void)sptep;
        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
                 sptep, *sptep, write_pt);

        if (!write_pt)
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

        ++vcpu->stat.pf_fixed;
        trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
        spin_unlock(&vcpu->kvm->mmu_lock);

        return write_pt;

out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}
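/*
 * Handle a guest invlpg: locate the last-level spte for @gva; if it belongs
 * to an unsync shadow page, drop it (flushing remote TLBs if needed) and let
 * kvm_mmu_pte_write() re-read the guest pte so the entry can be rebuilt.
 */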
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
        gpa_t pte_gpa = -1;
        int level;
        u64 *sptep;
        int need_flush = 0;

        spin_lock(&vcpu->kvm->mmu_lock);

        for_each_shadow_entry(vcpu, gva, iterator) {
                level = iterator.level;
                sptep = iterator.sptep;

                sp = page_header(__pa(sptep));
                if (is_last_spte(*sptep, level)) {
                        int offset, shift;

                        if (!sp->unsync)
                                break;

                        shift = PAGE_SHIFT -
                                (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
                        offset = sp->role.quadrant << shift;

                        pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
                        pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

                        if (is_shadow_present_pte(*sptep)) {
                                if (is_large_pte(*sptep))
                                        --vcpu->kvm->stat.lpages;
                                drop_spte(vcpu->kvm, sptep,
                                          shadow_trap_nonpresent_pte);
                                need_flush = 1;
                        } else
                                __set_spte(sptep, shadow_trap_nonpresent_pte);
                        break;
                }

                if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
                        break;
        }

        if (need_flush)
                kvm_flush_remote_tlbs(vcpu->kvm);

        atomic_inc(&vcpu->kvm->arch.invlpg_counter);

        spin_unlock(&vcpu->kvm->mmu_lock);

        if (pte_gpa == -1)
                return;

        if (mmu_topup_memory_caches(vcpu))
                return;
        kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}
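/*
 * Translate a guest virtual address to a guest physical address by walking
 * the guest page tables; on failure, report the resulting fault through
 * @exception.
 */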
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
                               struct x86_exception *exception)
{
        struct guest_walker walker;
        gpa_t gpa = UNMAPPED_GVA;
        int r;

        r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);

        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
        } else if (exception)
                *exception = walker.fault;

        return gpa;
}

static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
                                      u32 access,
                                      struct x86_exception *exception)
{
        struct guest_walker walker;
        gpa_t gpa = UNMAPPED_GVA;
        int r;

        r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);

        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
        } else if (exception)
                *exception = walker.fault;

        return gpa;
}
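/*
 * Pre-fill the shadow page from the guest page table backing @sp: entries
 * whose guest pte is not present are marked shadow_notrap_nonpresent_pte
 * (so the fault can be reflected to the guest without a full walk), while
 * everything else, including entries we failed to read, is left as a
 * trapping non-present spte.
 */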
static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
                                 struct kvm_mmu_page *sp)
{
        int i, j, offset, r;
        pt_element_t pt[256 / sizeof(pt_element_t)];
        gpa_t pte_gpa;

        if (sp->role.direct
            || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
                nonpaging_prefetch_page(vcpu, sp);
                return;
        }

        pte_gpa = gfn_to_gpa(sp->gfn);
        if (PTTYPE == 32) {
                offset = sp->role.quadrant << PT64_LEVEL_BITS;
                pte_gpa += offset * sizeof(pt_element_t);
        }

        for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
                r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
                pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
                for (j = 0; j < ARRAY_SIZE(pt); ++j)
                        if (r || is_present_gpte(pt[j]))
                                sp->spt[i+j] = shadow_trap_nonpresent_pte;
                        else
                                sp->spt[i+j] = shadow_notrap_nonpresent_pte;
        }
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 *
 * Note:
 * We should flush all tlbs if a spte is dropped, even though the guest is
 * responsible for it. If we don't, kvm_mmu_notifier_invalidate_page and
 * kvm_mmu_notifier_invalidate_range_start may see that the mapped page is no
 * longer used by the guest and skip the tlb flush, allowing the guest to keep
 * accessing the freed pages.
 * We increase kvm->tlbs_dirty to delay the tlb flush in this case.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
        int i, offset, nr_present;
        bool host_writable;
        gpa_t first_pte_gpa;

        offset = nr_present = 0;

        /* A direct kvm_mmu_page cannot be unsync. */
        BUG_ON(sp->role.direct);

        if (PTTYPE == 32)
                offset = sp->role.quadrant << PT64_LEVEL_BITS;

        first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);

        for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                unsigned pte_access;
                pt_element_t gpte;
                gpa_t pte_gpa;
                gfn_t gfn;

                if (!is_shadow_present_pte(sp->spt[i]))
                        continue;

                pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

                if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
                                          sizeof(pt_element_t)))
                        return -EINVAL;

                gfn = gpte_to_gfn(gpte);

                if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
                        vcpu->kvm->tlbs_dirty++;
                        continue;
                }

                if (gfn != sp->gfns[i]) {
                        drop_spte(vcpu->kvm, &sp->spt[i],
                                  shadow_trap_nonpresent_pte);
                        vcpu->kvm->tlbs_dirty++;
                        continue;
                }

                nr_present++;
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
                         is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
                         spte_to_pfn(sp->spt[i]), true, false,
                         host_writable);
        }

        return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG