Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/mmu/tdp_mmu.c
26481 views
1
// SPDX-License-Identifier: GPL-2.0
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4
#include "mmu.h"
5
#include "mmu_internal.h"
6
#include "mmutrace.h"
7
#include "tdp_iter.h"
8
#include "tdp_mmu.h"
9
#include "spte.h"
10
11
#include <asm/cmpxchg.h>
12
#include <trace/events/kvm.h>
13
14
/* Initializes the TDP MMU for the VM, if enabled. */
15
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16
{
17
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19
}
20
21
/* Arbitrarily returns true so that this may be used in if statements. */
22
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
23
bool shared)
24
{
25
if (shared)
26
lockdep_assert_held_read(&kvm->mmu_lock);
27
else
28
lockdep_assert_held_write(&kvm->mmu_lock);
29
30
return true;
31
}
32
33
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34
{
35
/*
36
* Invalidate all roots, which besides the obvious, schedules all roots
37
* for zapping and thus puts the TDP MMU's reference to each root, i.e.
38
* ultimately frees all roots.
39
*/
40
kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
41
kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
42
43
#ifdef CONFIG_KVM_PROVE_MMU
44
KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
45
#endif
46
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47
48
/*
49
* Ensure that all the outstanding RCU callbacks to free shadow pages
50
* can run before the VM is torn down. Putting the last reference to
51
* zapped roots will create new callbacks.
52
*/
53
rcu_barrier();
54
}
55
56
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
57
{
58
free_page((unsigned long)sp->external_spt);
59
free_page((unsigned long)sp->spt);
60
kmem_cache_free(mmu_page_header_cache, sp);
61
}
62
63
/*
64
* This is called through call_rcu in order to free TDP page table memory
65
* safely with respect to other kernel threads that may be operating on
66
* the memory.
67
* By only accessing TDP MMU page table memory in an RCU read critical
68
* section, and freeing it after a grace period, lockless access to that
69
* memory won't use it after it is freed.
70
*/
71
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72
{
73
struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74
rcu_head);
75
76
tdp_mmu_free_sp(sp);
77
}
78
79
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
80
{
81
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
82
return;
83
84
/*
85
* The TDP MMU itself holds a reference to each root until the root is
86
* explicitly invalidated, i.e. the final reference should be never be
87
* put for a valid root.
88
*/
89
KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
90
91
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92
list_del_rcu(&root->link);
93
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
95
}
96
97
static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
98
enum kvm_tdp_mmu_root_types types)
99
{
100
if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
101
return false;
102
103
if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
104
return false;
105
106
if (likely(!is_mirror_sp(root)))
107
return types & KVM_DIRECT_ROOTS;
108
return types & KVM_MIRROR_ROOTS;
109
}
110
111
/*
112
* Returns the next root after @prev_root (or the first root if @prev_root is
113
* NULL) that matches with @types. A reference to the returned root is
114
* acquired, and the reference to @prev_root is released (the caller obviously
115
* must hold a reference to @prev_root if it's non-NULL).
116
*
117
* Roots that doesn't match with @types are skipped.
118
*
119
* Returns NULL if the end of tdp_mmu_roots was reached.
120
*/
121
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
122
struct kvm_mmu_page *prev_root,
123
enum kvm_tdp_mmu_root_types types)
124
{
125
struct kvm_mmu_page *next_root;
126
127
/*
128
* While the roots themselves are RCU-protected, fields such as
129
* role.invalid are protected by mmu_lock.
130
*/
131
lockdep_assert_held(&kvm->mmu_lock);
132
133
rcu_read_lock();
134
135
if (prev_root)
136
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
137
&prev_root->link,
138
typeof(*prev_root), link);
139
else
140
next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
141
typeof(*next_root), link);
142
143
while (next_root) {
144
if (tdp_mmu_root_match(next_root, types) &&
145
kvm_tdp_mmu_get_root(next_root))
146
break;
147
148
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
149
&next_root->link, typeof(*next_root), link);
150
}
151
152
rcu_read_unlock();
153
154
if (prev_root)
155
kvm_tdp_mmu_put_root(kvm, prev_root);
156
157
return next_root;
158
}
159
160
/*
 * Note: these iterators get and put references to the roots they iterate
 * over.  This makes it safe to release the MMU lock and yield within the
 * loop, but if exiting the loop early, the caller must drop the reference to
 * the most recent root.  (Unless keeping a live reference is desirable.)
 *
 * mmu_lock must be held on every step; the iterators only assert that the
 * lock is held and do not distinguish between read and write mode.
 *
 * __for_each_tdp_mmu_root_yield_safe() visits roots matching @_types and, if
 * @_as_id is non-negative, belonging to that address space.
 * for_each_tdp_mmu_root_yield_safe() visits all roots (KVM_ALL_ROOTS)
 * regardless of address space.
 *
 * @_root must be an lvalue; @_as_id is parenthesized in the expansion so
 * that arbitrary expressions can be passed safely.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
	for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
	     _root = tdp_mmu_next_root(_kvm, _root, _types)) \
		if ((_as_id) >= 0 && kvm_mmu_page_as_id(_root) != (_as_id)) { \
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
183
184
/*
 * Iterate over all TDP MMU roots matching @_types and, if @_as_id is
 * non-negative, belonging to that address space.
 *
 * Requires that mmu_lock be held for write (asserted on every iteration),
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant instead.
 * Holding mmu_lock for write obviates the need for RCU protection as the
 * list is guaranteed to be stable.
 *
 * Macro arguments are parenthesized where they are used inside expressions
 * so that callers may pass arbitrary expressions for @_kvm and @_as_id.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
	list_for_each_entry(_root, &(_kvm)->arch.tdp_mmu_roots, link) \
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
		    (((_as_id) >= 0 && kvm_mmu_page_as_id(_root) != (_as_id)) || \
		     !tdp_mmu_root_match((_root), (_types)))) { \
		} else
197
198
/*
 * Iterate over all TDP MMU roots in an RCU read-side critical section,
 * visiting roots that match @_types and, if @_as_id is non-negative, belong
 * to that address space.
 *
 * It is safe to iterate over the SPTEs under the root, but their values will
 * be unstable, so all writes must be atomic.  As this routine is meant to be
 * used without holding the mmu_lock at all, any bits that are flipped must
 * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
 *
 * Macro arguments are parenthesized where they are used inside expressions
 * so that callers may pass arbitrary expressions for @_kvm and @_as_id.
 */
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
	list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link) \
		if (((_as_id) >= 0 && kvm_mmu_page_as_id(_root) != (_as_id)) || \
		    !tdp_mmu_root_match((_root), (_types))) { \
		} else

/* Write-locked iteration restricted to valid roots, see __for_each_tdp_mmu_root(). */
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
213
214
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
215
{
216
struct kvm_mmu_page *sp;
217
218
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
219
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
220
221
return sp;
222
}
223
224
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
225
gfn_t gfn, union kvm_mmu_page_role role)
226
{
227
INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
228
229
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
230
231
sp->role = role;
232
sp->gfn = gfn;
233
sp->ptep = sptep;
234
sp->tdp_mmu_page = true;
235
236
trace_kvm_mmu_get_page(sp, true);
237
}
238
239
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
240
struct tdp_iter *iter)
241
{
242
struct kvm_mmu_page *parent_sp;
243
union kvm_mmu_page_role role;
244
245
parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
246
247
role = parent_sp->role;
248
role.level--;
249
250
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
251
}
252
253
/*
 * Find or create a TDP MMU root whose role matches @vcpu's MMU (with
 * is_mirror set when @mirror is true), take a reference to it on behalf of
 * the vCPU, and record its physical address in the vCPU's MMU.
 */
void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role role = mmu->root_role;
	int as_id = kvm_mmu_role_as_id(role);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	if (mirror)
		role.is_mirror = true;

	/*
	 * Look for an existing root before taking the pages lock so that
	 * multiple vCPUs loading a new root aren't serialized needlessly.
	 * E.g. when secondary vCPUs are brought up, KVM will already have
	 * created a valid root on behalf of the primary vCPU.
	 */
	read_lock(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		/* The iterator's reference is kept as the vCPU's reference. */
		if (root->role.word == role.word)
			goto out_read_unlock;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * Check again now that the pages lock is held, as another vCPU may
	 * have raced ahead and created a usable root.  The list is walked
	 * manually because the standard macros assume that the pages lock is
	 * *not* held.  WARN if grabbing a reference to a usable root fails:
	 * the last reference to a root can only be put *after* the root has
	 * been invalidated, which requires holding mmu_lock for write.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word != role.word)
			continue;

		if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			goto out_spin_unlock;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots live until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Start the
	 * refcount at two: one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
	read_unlock(&kvm->mmu_lock);

	/*
	 * Note, if the root is invalidated after mmu_lock is dropped,
	 * KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest and
	 * actually consuming the root, and the root can't be freed as this
	 * vCPU holds a reference.
	 */
	if (!mirror) {
		mmu->root.hpa = __pa(root->spt);
		mmu->root.pgd = 0;
	} else {
		mmu->mirror_root_hpa = __pa(root->spt);
	}
}
322
323
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
324
u64 old_spte, u64 new_spte, int level,
325
bool shared);
326
327
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
328
{
329
kvm_account_pgtable_pages((void *)sp->spt, +1);
330
#ifdef CONFIG_KVM_PROVE_MMU
331
atomic64_inc(&kvm->arch.tdp_mmu_pages);
332
#endif
333
}
334
335
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
336
{
337
kvm_account_pgtable_pages((void *)sp->spt, -1);
338
#ifdef CONFIG_KVM_PROVE_MMU
339
atomic64_dec(&kvm->arch.tdp_mmu_pages);
340
#endif
341
}
342
343
/**
344
* tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
345
*
346
* @kvm: kvm instance
347
* @sp: the page to be removed
348
*/
349
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
350
{
351
tdp_unaccount_mmu_page(kvm, sp);
352
353
if (!sp->nx_huge_page_disallowed)
354
return;
355
356
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
357
sp->nx_huge_page_disallowed = false;
358
untrack_possible_nx_huge_page(kvm, sp);
359
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
360
}
361
362
/*
 * Propagate the removal of a leaf SPTE to the external page table via the
 * remove_external_spte() hook.  Non-leaf SPTEs are ignored.  A failure of the
 * hook bugs the VM.
 */
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
				 int level)
{
	int ret;

	/*
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external PTs
	 * are removed in a special order that involves free_external_spt().
	 * This function is nevertheless reached for non-leaf PTEs by way of
	 * __tdp_mmu_zap_root(), so bail early to avoid the error that the
	 * hook would return in that case.
	 */
	if (!is_last_spte(old_spte, level))
		return;

	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
	lockdep_assert_held_write(&kvm->mmu_lock);

	/* Because the write lock is held, the operation should succeed. */
	ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level,
						 spte_to_pfn(old_spte));
	KVM_BUG_ON(ret, kvm);
}
384
385
/**
386
* handle_removed_pt() - handle a page table removed from the TDP structure
387
*
388
* @kvm: kvm instance
389
* @pt: the page removed from the paging structure
390
* @shared: This operation may not be running under the exclusive use
391
* of the MMU lock and the operation must synchronize with other
392
* threads that might be modifying SPTEs.
393
*
394
* Given a page table that has been removed from the TDP paging structure,
395
* iterates through the page table to clear SPTEs and free child page tables.
396
*
397
* Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
398
* protection. Since this thread removed it from the paging structure,
399
* this thread will be responsible for ensuring the page is freed. Hence the
400
* early rcu_dereferences in the function.
401
*/
402
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
403
{
404
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
405
int level = sp->role.level;
406
gfn_t base_gfn = sp->gfn;
407
int i;
408
409
trace_kvm_mmu_prepare_zap_page(sp);
410
411
tdp_mmu_unlink_sp(kvm, sp);
412
413
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
414
tdp_ptep_t sptep = pt + i;
415
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
416
u64 old_spte;
417
418
if (shared) {
419
/*
420
* Set the SPTE to a nonpresent value that other
421
* threads will not overwrite. If the SPTE was
422
* already marked as frozen then another thread
423
* handling a page fault could overwrite it, so
424
* set the SPTE until it is set from some other
425
* value to the frozen SPTE value.
426
*/
427
for (;;) {
428
old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
429
if (!is_frozen_spte(old_spte))
430
break;
431
cpu_relax();
432
}
433
} else {
434
/*
435
* If the SPTE is not MMU-present, there is no backing
436
* page associated with the SPTE and so no side effects
437
* that need to be recorded, and exclusive ownership of
438
* mmu_lock ensures the SPTE can't be made present.
439
* Note, zapping MMIO SPTEs is also unnecessary as they
440
* are guarded by the memslots generation, not by being
441
* unreachable.
442
*/
443
old_spte = kvm_tdp_mmu_read_spte(sptep);
444
if (!is_shadow_present_pte(old_spte))
445
continue;
446
447
/*
448
* Use the common helper instead of a raw WRITE_ONCE as
449
* the SPTE needs to be updated atomically if it can be
450
* modified by a different vCPU outside of mmu_lock.
451
* Even though the parent SPTE is !PRESENT, the TLB
452
* hasn't yet been flushed, and both Intel and AMD
453
* document that A/D assists can use upper-level PxE
454
* entries that are cached in the TLB, i.e. the CPU can
455
* still access the page and mark it dirty.
456
*
457
* No retry is needed in the atomic update path as the
458
* sole concern is dropping a Dirty bit, i.e. no other
459
* task can zap/remove the SPTE as mmu_lock is held for
460
* write. Marking the SPTE as a frozen SPTE is not
461
* strictly necessary for the same reason, but using
462
* the frozen SPTE value keeps the shared/exclusive
463
* paths consistent and allows the handle_changed_spte()
464
* call below to hardcode the new value to FROZEN_SPTE.
465
*
466
* Note, even though dropping a Dirty bit is the only
467
* scenario where a non-atomic update could result in a
468
* functional bug, simply checking the Dirty bit isn't
469
* sufficient as a fast page fault could read the upper
470
* level SPTE before it is zapped, and then make this
471
* target SPTE writable, resume the guest, and set the
472
* Dirty bit between reading the SPTE above and writing
473
* it here.
474
*/
475
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
476
FROZEN_SPTE, level);
477
}
478
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
479
old_spte, FROZEN_SPTE, level, shared);
480
481
if (is_mirror_sp(sp)) {
482
KVM_BUG_ON(shared, kvm);
483
remove_external_spte(kvm, gfn, old_spte, level);
484
}
485
}
486
487
if (is_mirror_sp(sp) &&
488
WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
489
sp->external_spt))) {
490
/*
491
* Failed to free page table page in mirror page table and
492
* there is nothing to do further.
493
* Intentionally leak the page to prevent the kernel from
494
* accessing the encrypted page.
495
*/
496
sp->external_spt = NULL;
497
}
498
499
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
500
}
501
502
/*
 * Return the external page table of the child shadow page that @new_spte
 * points at, or NULL if @new_spte is not a present, non-leaf SPTE.  WARNs if
 * the child's level or gfn is inconsistent with @level and @gfn.
 */
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
	struct kvm_mmu_page *sp;

	/* Only present, non-leaf SPTEs reference a child page table. */
	if (!is_shadow_present_pte(new_spte) || is_last_spte(new_spte, level))
		return NULL;

	sp = spte_to_child_sp(new_spte);

	WARN_ON_ONCE(sp->role.level + 1 != level);
	WARN_ON_ONCE(sp->gfn != gfn);

	return sp->external_spt;
}
514
515
/*
 * Install a present SPTE in a mirror page table and propagate the change to
 * the external page table.
 *
 * The SPTE is frozen while the external page table is updated.  If the
 * external update succeeds, @new_spte is written; otherwise the original
 * value is restored.
 *
 * Returns 0 on success, -EBUSY if the SPTE no longer holds @old_spte, or the
 * error returned by the external page table hook.
 */
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
						 gfn_t gfn, u64 old_spte,
						 u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_leaf = is_shadow_present_pte(new_spte) &&
		       is_last_spte(new_spte, level);
	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
	int ret;

	KVM_BUG_ON(was_present, kvm);

	lockdep_assert_held(&kvm->mmu_lock);

	/*
	 * Other updates to the SPTE must be locked out until the external
	 * page table has been modified.  Use FROZEN_SPTE for that, similar
	 * to the zapping case.
	 */
	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
		return -EBUSY;

	/*
	 * Leaf entries and middle level external page tables are set up
	 * through different hooks.
	 */
	if (!is_leaf) {
		void *external_spt = get_external_spt(gfn, new_spte, level);

		KVM_BUG_ON(!external_spt, kvm);
		ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
	} else {
		ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
	}

	/* Unfreeze: commit the new value on success, roll back on failure. */
	__kvm_tdp_mmu_write_spte(sptep, ret ? old_spte : new_spte);

	return ret;
}
554
555
/**
556
* handle_changed_spte - handle bookkeeping associated with an SPTE change
557
* @kvm: kvm instance
558
* @as_id: the address space of the paging structure the SPTE was a part of
559
* @gfn: the base GFN that was mapped by the SPTE
560
* @old_spte: The value of the SPTE before the change
561
* @new_spte: The value of the SPTE after the change
562
* @level: the level of the PT the SPTE is part of in the paging structure
563
* @shared: This operation may not be running under the exclusive use of
564
* the MMU lock and the operation must synchronize with other
565
* threads that might be modifying SPTEs.
566
*
567
* Handle bookkeeping that might result from the modification of a SPTE. Note,
568
* dirty logging updates are handled in common code, not here (see make_spte()
569
* and fast_pf_fix_direct_spte()).
570
*/
571
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
572
u64 old_spte, u64 new_spte, int level,
573
bool shared)
574
{
575
bool was_present = is_shadow_present_pte(old_spte);
576
bool is_present = is_shadow_present_pte(new_spte);
577
bool was_leaf = was_present && is_last_spte(old_spte, level);
578
bool is_leaf = is_present && is_last_spte(new_spte, level);
579
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
580
581
WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
582
WARN_ON_ONCE(level < PG_LEVEL_4K);
583
WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
584
585
/*
586
* If this warning were to trigger it would indicate that there was a
587
* missing MMU notifier or a race with some notifier handler.
588
* A present, leaf SPTE should never be directly replaced with another
589
* present leaf SPTE pointing to a different PFN. A notifier handler
590
* should be zapping the SPTE before the main MM's page table is
591
* changed, or the SPTE should be zeroed, and the TLBs flushed by the
592
* thread before replacement.
593
*/
594
if (was_leaf && is_leaf && pfn_changed) {
595
pr_err("Invalid SPTE change: cannot replace a present leaf\n"
596
"SPTE with another present leaf SPTE mapping a\n"
597
"different PFN!\n"
598
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
599
as_id, gfn, old_spte, new_spte, level);
600
601
/*
602
* Crash the host to prevent error propagation and guest data
603
* corruption.
604
*/
605
BUG();
606
}
607
608
if (old_spte == new_spte)
609
return;
610
611
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
612
613
if (is_leaf)
614
check_spte_writable_invariants(new_spte);
615
616
/*
617
* The only times a SPTE should be changed from a non-present to
618
* non-present state is when an MMIO entry is installed/modified/
619
* removed. In that case, there is nothing to do here.
620
*/
621
if (!was_present && !is_present) {
622
/*
623
* If this change does not involve a MMIO SPTE or frozen SPTE,
624
* it is unexpected. Log the change, though it should not
625
* impact the guest since both the former and current SPTEs
626
* are nonpresent.
627
*/
628
if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
629
!is_mmio_spte(kvm, new_spte) &&
630
!is_frozen_spte(new_spte)))
631
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
632
"should not be replaced with another,\n"
633
"different nonpresent SPTE, unless one or both\n"
634
"are MMIO SPTEs, or the new SPTE is\n"
635
"a temporary frozen SPTE.\n"
636
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
637
as_id, gfn, old_spte, new_spte, level);
638
return;
639
}
640
641
if (is_leaf != was_leaf)
642
kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
643
644
/*
645
* Recursively handle child PTs if the change removed a subtree from
646
* the paging structure. Note the WARN on the PFN changing without the
647
* SPTE being converted to a hugepage (leaf) or being zapped. Shadow
648
* pages are kernel allocations and should never be migrated.
649
*/
650
if (was_present && !was_leaf &&
651
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
652
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
653
}
654
655
/*
 * Atomically replace the SPTE @iter points at with @new_spte, without doing
 * any of the associated bookkeeping.
 *
 * Returns 0 on success.  Returns -EBUSY if the SPTE no longer holds
 * iter->old_spte, or an error from the external page table update for a
 * mirror SPTE.
 */
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
							 struct tdp_iter *iter,
							 u64 new_spte)
{
	/*
	 * It is the caller's job to ensure the old SPTE is not a FROZEN SPTE.
	 * KVM should never attempt to zap or manipulate a FROZEN SPTE, and
	 * pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

	if (!is_mirror_sptep(iter->sptep) || is_frozen_spte(new_spte)) {
		u64 *sptep = rcu_dereference(iter->sptep);

		/*
		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU
		 * SPTEs and does not hold the mmu_lock.  On failure, i.e. if
		 * a different logical CPU modified the SPTE, try_cmpxchg64()
		 * updates iter->old_spte with the current value, so the
		 * caller operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
			return -EBUSY;

		return 0;
	}

	/*
	 * Users of atomic zapping don't operate on mirror roots, so don't
	 * handle that case and bug the VM if it's seen.
	 */
	if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
		return -EBUSY;

	return set_external_spte_present(kvm, iter->sptep, iter->gfn,
					 iter->old_spte, new_spte, iter->level);
}
698
699
/*
700
* tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
701
* and handle the associated bookkeeping. Do not mark the page dirty
702
* in KVM's dirty bitmaps.
703
*
704
* If setting the SPTE fails because it has changed, iter->old_spte will be
705
* refreshed to the current value of the spte.
706
*
707
* @kvm: kvm instance
708
* @iter: a tdp_iter instance currently on the SPTE that should be set
709
* @new_spte: The value the SPTE should be set to
710
* Return:
711
* * 0 - If the SPTE was set.
712
* * -EBUSY - If the SPTE cannot be set. In this case this function will have
713
* no side-effects other than setting iter->old_spte to the last
714
* known value of the spte.
715
*/
716
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
717
struct tdp_iter *iter,
718
u64 new_spte)
719
{
720
int ret;
721
722
lockdep_assert_held_read(&kvm->mmu_lock);
723
724
ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
725
if (ret)
726
return ret;
727
728
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
729
new_spte, iter->level, true);
730
731
return 0;
732
}
733
734
/*
735
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
736
* @kvm: KVM instance
737
* @as_id: Address space ID, i.e. regular vs. SMM
738
* @sptep: Pointer to the SPTE
739
* @old_spte: The current value of the SPTE
740
* @new_spte: The new value that will be set for the SPTE
741
* @gfn: The base GFN that was (or will be) mapped by the SPTE
742
* @level: The level _containing_ the SPTE (its parent PT's level)
743
*
744
* Returns the old SPTE value, which _may_ be different than @old_spte if the
745
* SPTE had voldatile bits.
746
*/
747
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
748
u64 old_spte, u64 new_spte, gfn_t gfn, int level)
749
{
750
lockdep_assert_held_write(&kvm->mmu_lock);
751
752
/*
753
* No thread should be using this function to set SPTEs to or from the
754
* temporary frozen SPTE value.
755
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
756
* should be used. If operating under the MMU lock in write mode, the
757
* use of the frozen SPTE should not be necessary.
758
*/
759
WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
760
761
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
762
763
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
764
765
/*
766
* Users that do non-atomic setting of PTEs don't operate on mirror
767
* roots, so don't handle it and bug the VM if it's seen.
768
*/
769
if (is_mirror_sptep(sptep)) {
770
KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
771
remove_external_spte(kvm, gfn, old_spte, level);
772
}
773
774
return old_spte;
775
}
776
777
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
778
u64 new_spte)
779
{
780
WARN_ON_ONCE(iter->yielded);
781
iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
782
iter->old_spte, new_spte,
783
iter->gfn, iter->level);
784
}
785
786
/* Iterate over every SPTE under @_root that covers a GFN in [@_start, @_end). */
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

/*
 * Same as tdp_root_for_each_pte(), but skip SPTEs that are not present or
 * are not leaf SPTEs.  @_iter is parenthesized before member access so that
 * the expansion is safe for any lvalue expression.
 */
#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
		if (!is_shadow_present_pte((_iter).old_spte) || \
		    !is_last_spte((_iter).old_spte, (_iter).level)) \
			continue; \
		else
795
796
/*
 * Return true if the walk should yield: either a reschedule is pending or
 * mmu_lock is contended, and the iterator has moved on since it last yielded.
 */
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
							  struct tdp_iter *iter)
{
	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		/* Ensure forward progress has been made before yielding. */
		return iter->next_last_level_gfn != iter->yielded_gfn;
	}

	return false;
}
805
806
/*
807
* Yield if the MMU lock is contended or this thread needs to return control
808
* to the scheduler.
809
*
810
* If this function should yield and flush is set, it will perform a remote
811
* TLB flush before yielding.
812
*
813
* If this function yields, iter->yielded is set and the caller must skip to
814
* the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
815
* over the paging structures to allow the iterator to continue its traversal
816
* from the paging structure root.
817
*
818
* Returns true if this function yielded.
819
*/
820
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
821
struct tdp_iter *iter,
822
bool flush, bool shared)
823
{
824
KVM_MMU_WARN_ON(iter->yielded);
825
826
if (!tdp_mmu_iter_need_resched(kvm, iter))
827
return false;
828
829
if (flush)
830
kvm_flush_remote_tlbs(kvm);
831
832
rcu_read_unlock();
833
834
if (shared)
835
cond_resched_rwlock_read(&kvm->mmu_lock);
836
else
837
cond_resched_rwlock_write(&kvm->mmu_lock);
838
839
rcu_read_lock();
840
841
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
842
843
iter->yielded = true;
844
return true;
845
}
846
847
/* Return the exclusive upper bound, in GFNs, for TDP MMU walks. */
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * TDP MMU walks are bounded at host.MAXPHYADDR.  KVM disallows
	 * memslots with a gpa range that would exceed the max gfn, and KVM
	 * does not create MMIO SPTEs for "impossible" gfns, instead sending
	 * such accesses down the slow emulation path every time.
	 */
	const gfn_t max_gfn = kvm_mmu_max_gfn();

	return max_gfn + 1;
}
857
858
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
859
bool shared, int zap_level)
860
{
861
struct tdp_iter iter;
862
863
for_each_tdp_pte_min_level_all(iter, root, zap_level) {
864
retry:
865
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
866
continue;
867
868
if (!is_shadow_present_pte(iter.old_spte))
869
continue;
870
871
if (iter.level > zap_level)
872
continue;
873
874
if (!shared)
875
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
876
else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
877
goto retry;
878
}
879
}
880
881
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
882
bool shared)
883
{
884
885
/*
886
* The root must have an elevated refcount so that it's reachable via
887
* mmu_notifier callbacks, which allows this path to yield and drop
888
* mmu_lock. When handling an unmap/release mmu_notifier command, KVM
889
* must drop all references to relevant pages prior to completing the
890
* callback. Dropping mmu_lock with an unreachable root would result
891
* in zapping SPTEs after a relevant mmu_notifier callback completes
892
* and lead to use-after-free as zapping a SPTE triggers "writeback" of
893
* dirty accessed bits to the SPTE's associated struct page.
894
*/
895
WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
896
897
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
898
899
rcu_read_lock();
900
901
/*
902
* Zap roots in multiple passes of decreasing granularity, i.e. zap at
903
* 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
904
* preempt models) or mmu_lock contention (full or real-time models).
905
* Zapping at finer granularity marginally increases the total time of
906
* the zap, but in most cases the zap itself isn't latency sensitive.
907
*
908
* If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
909
* in order to mimic the page fault path, which can replace a 1GiB page
910
* table with an equivalent 1GiB hugepage, i.e. can get saddled with
911
* zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
912
* allows verifying that KVM can safely zap 1GiB regions, e.g. without
913
* inducing RCU stalls, without relying on a relatively rare event
914
* (zapping roots is orders of magnitude more common). Note, because
915
* zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
916
* in the iterator itself is unnecessary.
917
*/
918
if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
919
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
920
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
921
}
922
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
923
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
924
925
rcu_read_unlock();
926
}
927
928
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
929
{
930
u64 old_spte;
931
932
/*
933
* This helper intentionally doesn't allow zapping a root shadow page,
934
* which doesn't have a parent page table and thus no associated entry.
935
*/
936
if (WARN_ON_ONCE(!sp->ptep))
937
return false;
938
939
old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
940
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
941
return false;
942
943
tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
944
SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
945
946
return true;
947
}
948
949
/*
950
* If can_yield is true, will release the MMU lock and reschedule if the
951
* scheduler needs the CPU or there is contention on the MMU lock. If this
952
* function cannot yield, it will not release the MMU lock or reschedule and
953
* the caller must ensure it does not supply too large a GFN range, or the
954
* operation can cause a soft lockup.
955
*/
956
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
957
gfn_t start, gfn_t end, bool can_yield, bool flush)
958
{
959
struct tdp_iter iter;
960
961
end = min(end, tdp_mmu_max_gfn_exclusive());
962
963
lockdep_assert_held_write(&kvm->mmu_lock);
964
965
rcu_read_lock();
966
967
for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
968
if (can_yield &&
969
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
970
flush = false;
971
continue;
972
}
973
974
if (!is_shadow_present_pte(iter.old_spte) ||
975
!is_last_spte(iter.old_spte, iter.level))
976
continue;
977
978
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
979
980
/*
981
* Zappings SPTEs in invalid roots doesn't require a TLB flush,
982
* see kvm_tdp_mmu_zap_invalidated_roots() for details.
983
*/
984
if (!root->role.invalid)
985
flush = true;
986
}
987
988
rcu_read_unlock();
989
990
/*
991
* Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
992
* to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
993
*/
994
return flush;
995
}
996
997
/*
998
* Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots.
999
* Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
1000
* one or more SPTEs were zapped since the MMU lock was last acquired.
1001
*/
1002
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
1003
{
1004
struct kvm_mmu_page *root;
1005
1006
lockdep_assert_held_write(&kvm->mmu_lock);
1007
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
1008
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
1009
1010
return flush;
1011
}
1012
1013
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
1014
{
1015
struct kvm_mmu_page *root;
1016
1017
/*
1018
* Zap all direct roots, including invalid direct roots, as all direct
1019
* SPTEs must be dropped before returning to the caller. For TDX, mirror
1020
* roots don't need handling in response to the mmu notifier (the caller).
1021
*
1022
* Zap directly even if the root is also being zapped by a concurrent
1023
* "fast zap". Walking zapped top-level SPTEs isn't all that expensive
1024
* and mmu_lock is already held, which means the other thread has yielded.
1025
*
1026
* A TLB flush is unnecessary, KVM zaps everything if and only the VM
1027
* is being destroyed or the userspace VMM has exited. In both cases,
1028
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1029
*/
1030
lockdep_assert_held_write(&kvm->mmu_lock);
1031
__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
1032
KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
1033
tdp_mmu_zap_root(kvm, root, false);
1034
}
1035
1036
/*
1037
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1038
* zap" completes.
1039
*/
1040
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
1041
{
1042
struct kvm_mmu_page *root;
1043
1044
if (shared)
1045
read_lock(&kvm->mmu_lock);
1046
else
1047
write_lock(&kvm->mmu_lock);
1048
1049
for_each_tdp_mmu_root_yield_safe(kvm, root) {
1050
if (!root->tdp_mmu_scheduled_root_to_zap)
1051
continue;
1052
1053
root->tdp_mmu_scheduled_root_to_zap = false;
1054
KVM_BUG_ON(!root->role.invalid, kvm);
1055
1056
/*
1057
* A TLB flush is not necessary as KVM performs a local TLB
1058
* flush when allocating a new root (see kvm_mmu_load()), and
1059
* when migrating a vCPU to a different pCPU. Note, the local
1060
* TLB flush on reuse also invalidates paging-structure-cache
1061
* entries, i.e. TLB entries for intermediate paging structures,
1062
* that may be zapped, as such entries are associated with the
1063
* ASID on both VMX and SVM.
1064
*/
1065
tdp_mmu_zap_root(kvm, root, shared);
1066
1067
/*
1068
* The referenced needs to be put *after* zapping the root, as
1069
* the root must be reachable by mmu_notifiers while it's being
1070
* zapped
1071
*/
1072
kvm_tdp_mmu_put_root(kvm, root);
1073
}
1074
1075
if (shared)
1076
read_unlock(&kvm->mmu_lock);
1077
else
1078
write_unlock(&kvm->mmu_lock);
1079
}
1080
1081
/*
1082
* Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1083
* is about to be zapped, e.g. in response to a memslots update. The actual
1084
* zapping is done separately so that it happens with mmu_lock with read,
1085
* whereas invalidating roots must be done with mmu_lock held for write (unless
1086
* the VM is being destroyed).
1087
*
1088
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
1089
* See kvm_tdp_mmu_alloc_root().
1090
*/
1091
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
1092
enum kvm_tdp_mmu_root_types root_types)
1093
{
1094
struct kvm_mmu_page *root;
1095
1096
/*
1097
* Invalidating invalid roots doesn't make sense, prevent developers from
1098
* having to think about it.
1099
*/
1100
if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
1101
root_types &= ~KVM_INVALID_ROOTS;
1102
1103
/*
1104
* mmu_lock must be held for write to ensure that a root doesn't become
1105
* invalid while there are active readers (invalidating a root while
1106
* there are active readers may or may not be problematic in practice,
1107
* but it's uncharted territory and not supported).
1108
*
1109
* Waive the assertion if there are no users of @kvm, i.e. the VM is
1110
* being destroyed after all references have been put, or if no vCPUs
1111
* have been created (which means there are no roots), i.e. the VM is
1112
* being destroyed in an error path of KVM_CREATE_VM.
1113
*/
1114
if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
1115
refcount_read(&kvm->users_count) && kvm->created_vcpus)
1116
lockdep_assert_held_write(&kvm->mmu_lock);
1117
1118
/*
1119
* As above, mmu_lock isn't held when destroying the VM! There can't
1120
* be other references to @kvm, i.e. nothing else can invalidate roots
1121
* or get/put references to roots.
1122
*/
1123
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1124
if (!tdp_mmu_root_match(root, root_types))
1125
continue;
1126
1127
/*
1128
* Note, invalid roots can outlive a memslot update! Invalid
1129
* roots must be *zapped* before the memslot update completes,
1130
* but a different task can acquire a reference and keep the
1131
* root alive after its been zapped.
1132
*/
1133
if (!root->role.invalid) {
1134
root->tdp_mmu_scheduled_root_to_zap = true;
1135
root->role.invalid = true;
1136
}
1137
}
1138
}
1139
1140
/*
1141
* Installs a last-level SPTE to handle a TDP page fault.
1142
* (NPT/EPT violation/misconfiguration)
1143
*/
1144
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1145
struct kvm_page_fault *fault,
1146
struct tdp_iter *iter)
1147
{
1148
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1149
u64 new_spte;
1150
int ret = RET_PF_FIXED;
1151
bool wrprot = false;
1152
1153
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1154
return RET_PF_RETRY;
1155
1156
if (is_shadow_present_pte(iter->old_spte) &&
1157
(fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
1158
is_last_spte(iter->old_spte, iter->level)) {
1159
WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
1160
return RET_PF_SPURIOUS;
1161
}
1162
1163
if (unlikely(!fault->slot))
1164
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1165
else
1166
wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1167
fault->pfn, iter->old_spte, fault->prefetch,
1168
false, fault->map_writable, &new_spte);
1169
1170
if (new_spte == iter->old_spte)
1171
ret = RET_PF_SPURIOUS;
1172
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1173
return RET_PF_RETRY;
1174
else if (is_shadow_present_pte(iter->old_spte) &&
1175
(!is_last_spte(iter->old_spte, iter->level) ||
1176
WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
1177
kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1178
1179
/*
1180
* If the page fault was caused by a write but the page is write
1181
* protected, emulation is needed. If the emulation was skipped,
1182
* the vCPU would have the same fault again.
1183
*/
1184
if (wrprot && fault->write)
1185
ret = RET_PF_WRITE_PROTECTED;
1186
1187
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1188
if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
1189
vcpu->stat.pf_mmio_spte_created++;
1190
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1191
new_spte);
1192
ret = RET_PF_EMULATE;
1193
} else {
1194
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1195
rcu_dereference(iter->sptep));
1196
}
1197
1198
return ret;
1199
}
1200
1201
/*
1202
* tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1203
* provided page table.
1204
*
1205
* @kvm: kvm instance
1206
* @iter: a tdp_iter instance currently on the SPTE that should be set
1207
* @sp: The new TDP page table to install.
1208
* @shared: This operation is running under the MMU lock in read mode.
1209
*
1210
* Returns: 0 if the new page table was installed. Non-0 if the page table
1211
* could not be installed (e.g. the atomic compare-exchange failed).
1212
*/
1213
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1214
struct kvm_mmu_page *sp, bool shared)
1215
{
1216
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
1217
int ret = 0;
1218
1219
if (shared) {
1220
ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1221
if (ret)
1222
return ret;
1223
} else {
1224
tdp_mmu_iter_set_spte(kvm, iter, spte);
1225
}
1226
1227
tdp_account_mmu_page(kvm, sp);
1228
1229
return 0;
1230
}
1231
1232
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1233
struct kvm_mmu_page *sp, bool shared);
1234
1235
/*
1236
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1237
* page tables and SPTEs to translate the faulting guest physical address.
1238
*/
1239
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1240
{
1241
struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
1242
struct kvm *kvm = vcpu->kvm;
1243
struct tdp_iter iter;
1244
struct kvm_mmu_page *sp;
1245
int ret = RET_PF_RETRY;
1246
1247
kvm_mmu_hugepage_adjust(vcpu, fault);
1248
1249
trace_kvm_mmu_spte_requested(fault);
1250
1251
rcu_read_lock();
1252
1253
for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
1254
int r;
1255
1256
if (fault->nx_huge_page_workaround_enabled)
1257
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1258
1259
/*
1260
* If SPTE has been frozen by another thread, just give up and
1261
* retry, avoiding unnecessary page table allocation and free.
1262
*/
1263
if (is_frozen_spte(iter.old_spte))
1264
goto retry;
1265
1266
if (iter.level == fault->goal_level)
1267
goto map_target_level;
1268
1269
/* Step down into the lower level page table if it exists. */
1270
if (is_shadow_present_pte(iter.old_spte) &&
1271
!is_large_pte(iter.old_spte))
1272
continue;
1273
1274
/*
1275
* The SPTE is either non-present or points to a huge page that
1276
* needs to be split.
1277
*/
1278
sp = tdp_mmu_alloc_sp(vcpu);
1279
tdp_mmu_init_child_sp(sp, &iter);
1280
if (is_mirror_sp(sp))
1281
kvm_mmu_alloc_external_spt(vcpu, sp);
1282
1283
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1284
1285
if (is_shadow_present_pte(iter.old_spte)) {
1286
/* Don't support large page for mirrored roots (TDX) */
1287
KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
1288
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1289
} else {
1290
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1291
}
1292
1293
/*
1294
* Force the guest to retry if installing an upper level SPTE
1295
* failed, e.g. because a different task modified the SPTE.
1296
*/
1297
if (r) {
1298
tdp_mmu_free_sp(sp);
1299
goto retry;
1300
}
1301
1302
if (fault->huge_page_disallowed &&
1303
fault->req_level >= iter.level) {
1304
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1305
if (sp->nx_huge_page_disallowed)
1306
track_possible_nx_huge_page(kvm, sp);
1307
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1308
}
1309
}
1310
1311
/*
1312
* The walk aborted before reaching the target level, e.g. because the
1313
* iterator detected an upper level SPTE was frozen during traversal.
1314
*/
1315
WARN_ON_ONCE(iter.level == fault->goal_level);
1316
goto retry;
1317
1318
map_target_level:
1319
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1320
1321
retry:
1322
rcu_read_unlock();
1323
return ret;
1324
}
1325
1326
/* Used by mmu notifier via kvm_unmap_gfn_range() */
1327
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1328
bool flush)
1329
{
1330
enum kvm_tdp_mmu_root_types types;
1331
struct kvm_mmu_page *root;
1332
1333
types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
1334
1335
__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
1336
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1337
range->may_block, flush);
1338
1339
return flush;
1340
}
1341
1342
/*
1343
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1344
* if any of the GFNs in the range have been accessed.
1345
*
1346
* No need to mark the corresponding PFN as accessed as this call is coming
1347
* from the clear_young() or clear_flush_young() notifier, which uses the
1348
* return value to determine if the page has been accessed.
1349
*/
1350
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
1351
{
1352
u64 new_spte;
1353
1354
if (spte_ad_enabled(iter->old_spte)) {
1355
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
1356
shadow_accessed_mask);
1357
new_spte = iter->old_spte & ~shadow_accessed_mask;
1358
} else {
1359
new_spte = mark_spte_for_access_track(iter->old_spte);
1360
/*
1361
* It is safe for the following cmpxchg to fail. Leave the
1362
* Accessed bit set, as the spte is most likely young anyway.
1363
*/
1364
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
1365
return;
1366
}
1367
1368
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1369
iter->old_spte, new_spte);
1370
}
1371
1372
/*
 * Walk the leaf SPTEs covering @range in the roots selected by the range's
 * attribute filter.  Returns true if any of them is accessed.  With
 * @test_only set, return on the first accessed SPTE without modifying
 * anything; otherwise age every accessed SPTE.
 */
static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
					struct kvm_gfn_range *range,
					bool test_only)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool young = false;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);

	/*
	 * Rescheduling is not supported: none of the MMU notifiers that
	 * funnel into this helper allow blocking, so it'd be dead, wasteful
	 * code.  Note, this helper must NOT be used to unmap GFNs, as it
	 * processes only valid roots!
	 */
	WARN_ON(types & ~KVM_VALID_ROOTS);

	guard(rcu)();
	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
			if (!is_accessed_spte(iter.old_spte))
				continue;

			/* One accessed SPTE is enough to answer a query. */
			if (test_only)
				return true;

			young = true;
			kvm_tdp_mmu_age_spte(kvm, &iter);
		}
	}

	return young;
}
1407
1408
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1409
{
1410
return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
1411
}
1412
1413
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1414
{
1415
return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
1416
}
1417
1418
/*
1419
* Remove write access from all SPTEs at or above min_level that map GFNs
1420
* [start, end). Returns true if an SPTE has been changed and the TLBs need to
1421
* be flushed.
1422
*/
1423
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1424
gfn_t start, gfn_t end, int min_level)
1425
{
1426
struct tdp_iter iter;
1427
u64 new_spte;
1428
bool spte_set = false;
1429
1430
rcu_read_lock();
1431
1432
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1433
1434
for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
1435
retry:
1436
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1437
continue;
1438
1439
if (!is_shadow_present_pte(iter.old_spte) ||
1440
!is_last_spte(iter.old_spte, iter.level) ||
1441
!(iter.old_spte & PT_WRITABLE_MASK))
1442
continue;
1443
1444
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1445
1446
if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1447
goto retry;
1448
1449
spte_set = true;
1450
}
1451
1452
rcu_read_unlock();
1453
return spte_set;
1454
}
1455
1456
/*
1457
* Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1458
* only affect leaf SPTEs down to min_level.
1459
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
1460
*/
1461
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1462
const struct kvm_memory_slot *slot, int min_level)
1463
{
1464
struct kvm_mmu_page *root;
1465
bool spte_set = false;
1466
1467
lockdep_assert_held_read(&kvm->mmu_lock);
1468
1469
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1470
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1471
slot->base_gfn + slot->npages, min_level);
1472
1473
return spte_set;
1474
}
1475
1476
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
1477
{
1478
struct kvm_mmu_page *sp;
1479
1480
sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
1481
if (!sp)
1482
return NULL;
1483
1484
sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
1485
if (!sp->spt) {
1486
kmem_cache_free(mmu_page_header_cache, sp);
1487
return NULL;
1488
}
1489
1490
return sp;
1491
}
1492
1493
/* Note, the caller is responsible for initializing @sp. */
1494
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1495
struct kvm_mmu_page *sp, bool shared)
1496
{
1497
const u64 huge_spte = iter->old_spte;
1498
const int level = iter->level;
1499
int ret, i;
1500
1501
/*
1502
* No need for atomics when writing to sp->spt since the page table has
1503
* not been linked in yet and thus is not reachable from any other CPU.
1504
*/
1505
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1506
sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
1507
1508
/*
1509
* Replace the huge spte with a pointer to the populated lower level
1510
* page table. Since we are making this change without a TLB flush vCPUs
1511
* will see a mix of the split mappings and the original huge mapping,
1512
* depending on what's currently in their TLB. This is fine from a
1513
* correctness standpoint since the translation will be the same either
1514
* way.
1515
*/
1516
ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1517
if (ret)
1518
goto out;
1519
1520
/*
1521
* tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1522
* are overwriting from the page stats. But we have to manually update
1523
* the page stats with the new present child pages.
1524
*/
1525
kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1526
1527
out:
1528
trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1529
return ret;
1530
}
1531
1532
/*
 * Split every huge page of @root above @target_level that maps GFNs in
 * [start, end).  Returns 0 on success, -ENOMEM if a page table could not be
 * allocated.  mmu_lock is held on return, but may have been dropped and
 * reacquired along the way.
 */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;

	rcu_read_lock();

	/*
	 * Walk the page tables and split each huge page above the target
	 * level into one lower level, e.g. a 1GB page becomes 512 2MB pages.
	 *
	 * The TDP iterator performs a pre-order traversal, so an SPTE is
	 * always visited before its children.  Huge pages more than one level
	 * above the target level are therefore split recursively (e.g. 1GB
	 * into 512 2MB pages, then each of those into 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!(is_shadow_present_pte(iter.old_spte) &&
		      is_large_pte(iter.old_spte)))
			continue;

		if (!sp) {
			/*
			 * Drop RCU and mmu_lock around the allocation, then
			 * retake them in the reverse order.
			 */
			rcu_read_unlock();

			if (shared)
				read_unlock(&kvm->mmu_lock);
			else
				write_unlock(&kvm->mmu_lock);

			sp = tdp_mmu_alloc_sp_for_split();

			if (shared)
				read_lock(&kvm->mmu_lock);
			else
				write_lock(&kvm->mmu_lock);

			if (!sp) {
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, -ENOMEM);
				return -ENOMEM;
			}

			rcu_read_lock();

			/*
			 * The locks were dropped, so flag the iterator as
			 * having yielded and go around again instead of
			 * acting on the now possibly stale SPTE.
			 */
			iter.yielded = true;
			continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		/* Retry the same entry if the split could not be installed. */
		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		/* @sp is now linked into the page tables; allocate anew. */
		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * The loop can finish with an unused sp, e.g. if a vCPU doing
	 * HugePage NX splitting won the race and installed its own sp in
	 * place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return 0;
}
1609
1610
1611
/*
1612
* Try to split all huge pages mapped by the TDP MMU down to the target level.
1613
*/
1614
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1615
const struct kvm_memory_slot *slot,
1616
gfn_t start, gfn_t end,
1617
int target_level, bool shared)
1618
{
1619
struct kvm_mmu_page *root;
1620
int r = 0;
1621
1622
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1623
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1624
r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1625
if (r) {
1626
kvm_tdp_mmu_put_root(kvm, root);
1627
break;
1628
}
1629
}
1630
}
1631
1632
static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
1633
{
1634
/*
1635
* All TDP MMU shadow pages share the same role as their root, aside
1636
* from level, so it is valid to key off any shadow page to determine if
1637
* write protection is needed for an entire tree.
1638
*/
1639
return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
1640
}
1641
1642
/*
 * Clear the dirty status of every leaf SPTE of @root that maps GFNs in
 * [start, end).  The dirty status is the writable bit if the tree needs
 * write protection, the dirty bit otherwise.
 */
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	const bool wrprot = tdp_mmu_need_write_protect(kvm, root);
	const u64 dbit = wrprot ? PT_WRITABLE_MASK : shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
		/* Only present leaf SPTEs carry a dirty status. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		/* Already clean, nothing to do. */
		if (!(iter.old_spte & dbit))
			continue;

		/* Re-evaluate the same entry if the atomic update fails. */
		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;
	}

	rcu_read_unlock();
}
1672
1673
/*
1674
* Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
1675
* memslot.
1676
*/
1677
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1678
const struct kvm_memory_slot *slot)
1679
{
1680
struct kvm_mmu_page *root;
1681
1682
lockdep_assert_held_read(&kvm->mmu_lock);
1683
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1684
clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1685
slot->base_gfn + slot->npages);
1686
}
1687
1688
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1689
gfn_t gfn, unsigned long mask, bool wrprot)
1690
{
1691
const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
1692
PT_WRITABLE_MASK : shadow_dirty_mask;
1693
struct tdp_iter iter;
1694
1695
lockdep_assert_held_write(&kvm->mmu_lock);
1696
1697
rcu_read_lock();
1698
1699
tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
1700
gfn + BITS_PER_LONG) {
1701
if (!mask)
1702
break;
1703
1704
KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1705
spte_ad_need_write_protect(iter.old_spte));
1706
1707
if (iter.level > PG_LEVEL_4K ||
1708
!(mask & (1UL << (iter.gfn - gfn))))
1709
continue;
1710
1711
mask &= ~(1UL << (iter.gfn - gfn));
1712
1713
if (!(iter.old_spte & dbit))
1714
continue;
1715
1716
iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1717
iter.old_spte, dbit,
1718
iter.level);
1719
1720
trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1721
iter.old_spte,
1722
iter.old_spte & ~dbit);
1723
}
1724
1725
rcu_read_unlock();
1726
}
1727
1728
/*
1729
* Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
1730
* which a bit is set in mask, starting at gfn. The given memslot is expected to
1731
* contain all the GFNs represented by set bits in the mask.
1732
*/
1733
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1734
struct kvm_memory_slot *slot,
1735
gfn_t gfn, unsigned long mask,
1736
bool wrprot)
1737
{
1738
struct kvm_mmu_page *root;
1739
1740
for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1741
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1742
}
1743
1744
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
1745
struct tdp_iter *parent,
1746
u64 *huge_spte)
1747
{
1748
struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
1749
gfn_t start = parent->gfn;
1750
gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
1751
struct tdp_iter iter;
1752
1753
tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
1754
/*
1755
* Use the parent iterator when checking for forward progress so
1756
* that KVM doesn't get stuck continuously trying to yield (i.e.
1757
* returning -EAGAIN here and then failing the forward progress
1758
* check in the caller ad nauseam).
1759
*/
1760
if (tdp_mmu_iter_need_resched(kvm, parent))
1761
return -EAGAIN;
1762
1763
*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
1764
return 0;
1765
}
1766
1767
return -ENOENT;
1768
}
1769
1770
static void recover_huge_pages_range(struct kvm *kvm,
1771
struct kvm_mmu_page *root,
1772
const struct kvm_memory_slot *slot)
1773
{
1774
gfn_t start = slot->base_gfn;
1775
gfn_t end = start + slot->npages;
1776
struct tdp_iter iter;
1777
int max_mapping_level;
1778
bool flush = false;
1779
u64 huge_spte;
1780
int r;
1781
1782
if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
1783
return;
1784
1785
rcu_read_lock();
1786
1787
for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
1788
retry:
1789
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1790
flush = false;
1791
continue;
1792
}
1793
1794
if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1795
!is_shadow_present_pte(iter.old_spte))
1796
continue;
1797
1798
/*
1799
* Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1800
* a large page size, then its parent would have been zapped
1801
* instead of stepping down.
1802
*/
1803
if (is_last_spte(iter.old_spte, iter.level))
1804
continue;
1805
1806
/*
1807
* If iter.gfn resides outside of the slot, i.e. the page for
1808
* the current level overlaps but is not contained by the slot,
1809
* then the SPTE can't be made huge. More importantly, trying
1810
* to query that info from slot->arch.lpage_info will cause an
1811
* out-of-bounds access.
1812
*/
1813
if (iter.gfn < start || iter.gfn >= end)
1814
continue;
1815
1816
max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
1817
if (max_mapping_level < iter.level)
1818
continue;
1819
1820
r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
1821
if (r == -EAGAIN)
1822
goto retry;
1823
else if (r)
1824
continue;
1825
1826
if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
1827
goto retry;
1828
1829
flush = true;
1830
}
1831
1832
if (flush)
1833
kvm_flush_remote_tlbs_memslot(kvm, slot);
1834
1835
rcu_read_unlock();
1836
}
1837
1838
/*
1839
* Recover huge page mappings within the slot by replacing non-leaf SPTEs with
1840
* huge SPTEs where possible.
1841
*/
1842
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
1843
const struct kvm_memory_slot *slot)
1844
{
1845
struct kvm_mmu_page *root;
1846
1847
lockdep_assert_held_read(&kvm->mmu_lock);
1848
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1849
recover_huge_pages_range(kvm, root, slot);
1850
}
1851
1852
/*
1853
* Removes write access on the last level SPTE mapping this GFN and unsets the
1854
* MMU-writable bit to ensure future writes continue to be intercepted.
1855
* Returns true if an SPTE was set and a TLB flush is needed.
1856
*/
1857
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1858
gfn_t gfn, int min_level)
1859
{
1860
struct tdp_iter iter;
1861
u64 new_spte;
1862
bool spte_set = false;
1863
1864
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1865
1866
rcu_read_lock();
1867
1868
for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
1869
if (!is_shadow_present_pte(iter.old_spte) ||
1870
!is_last_spte(iter.old_spte, iter.level))
1871
continue;
1872
1873
new_spte = iter.old_spte &
1874
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1875
1876
if (new_spte == iter.old_spte)
1877
break;
1878
1879
tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1880
spte_set = true;
1881
}
1882
1883
rcu_read_unlock();
1884
1885
return spte_set;
1886
}
1887
1888
/*
1889
* Removes write access on the last level SPTE mapping this GFN and unsets the
1890
* MMU-writable bit to ensure future writes continue to be intercepted.
1891
* Returns true if an SPTE was set and a TLB flush is needed.
1892
*/
1893
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1894
struct kvm_memory_slot *slot, gfn_t gfn,
1895
int min_level)
1896
{
1897
struct kvm_mmu_page *root;
1898
bool spte_set = false;
1899
1900
lockdep_assert_held_write(&kvm->mmu_lock);
1901
for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1902
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1903
1904
return spte_set;
1905
}
1906
1907
/*
1908
* Return the level of the lowest level SPTE added to sptes.
1909
* That SPTE may be non-present.
1910
*
1911
* Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1912
*/
1913
static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1914
struct kvm_mmu_page *root)
1915
{
1916
struct tdp_iter iter;
1917
gfn_t gfn = addr >> PAGE_SHIFT;
1918
int leaf = -1;
1919
1920
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1921
leaf = iter.level;
1922
sptes[leaf] = iter.old_spte;
1923
}
1924
1925
return leaf;
1926
}
1927
1928
/*
 * Walk the vCPU's current root towards @addr, storing the visited SPTEs in
 * @sptes (indexed by level) and the root's level in *@root_level.  Returns
 * the level of the lowest SPTE recorded.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct kvm_mmu_page *root;

	root = root_to_sp(vcpu->arch.mmu->root.hpa);
	*root_level = vcpu->arch.mmu->root_role.level;

	return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root);
}
1936
1937
/*
 * Returns true if @gpa is mapped by a present leaf SPTE in the root that is
 * responsible for it, i.e. the vCPU's direct root for direct addresses and
 * its mirror root otherwise.  mmu_lock must be held.
 */
bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa)
{
	struct kvm *kvm = vcpu->kvm;
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte;
	hpa_t root;
	int leaf;

	if (kvm_is_addr_direct(kvm, gpa))
		root = vcpu->arch.mmu->root.hpa;
	else
		root = vcpu->arch.mmu->mirror_root_hpa;

	lockdep_assert_held(&kvm->mmu_lock);

	rcu_read_lock();
	leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root));
	rcu_read_unlock();

	/* A negative level means the walk recorded no SPTE at all. */
	if (leaf < 0)
		return false;

	spte = sptes[leaf];
	return is_shadow_present_pte(spte) && is_last_spte(spte, leaf);
}
EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped);
1957
1958
/*
1959
* Returns the last level spte pointer of the shadow page walk for the given
1960
* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1961
* walk could be performed, returns NULL and *spte does not contain valid data.
1962
*
1963
* Contract:
1964
* - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1965
* - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1966
*
1967
* WARNING: This function is only intended to be called during fast_page_fault.
1968
*/
1969
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
1970
u64 *spte)
1971
{
1972
/* Fast pf is not supported for mirrored roots */
1973
struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
1974
struct tdp_iter iter;
1975
tdp_ptep_t sptep = NULL;
1976
1977
for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1978
*spte = iter.old_spte;
1979
sptep = iter.sptep;
1980
}
1981
1982
/*
1983
* Perform the rcu_dereference to get the raw spte pointer value since
1984
* we are passing it up to fast_page_fault, which is shared with the
1985
* legacy MMU and thus does not retain the TDP MMU-specific __rcu
1986
* annotation.
1987
*
1988
* This is safe since fast_page_fault obeys the contracts of this
1989
* function as well as all TDP MMU contracts around modifying SPTEs
1990
* outside of mmu_lock.
1991
*/
1992
return rcu_dereference(sptep);
1993
}
1994
1995