GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/mmu/tdp_mmu.c
1
// SPDX-License-Identifier: GPL-2.0
2
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3
4
#include "mmu.h"
5
#include "mmu_internal.h"
6
#include "mmutrace.h"
7
#include "tdp_iter.h"
8
#include "tdp_mmu.h"
9
#include "spte.h"
10
11
#include <asm/cmpxchg.h>
12
#include <trace/events/kvm.h>
13
14
/* Initializes the TDP MMU for the VM, if enabled. */
15
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16
{
17
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19
}
20
21
/* Arbitrarily returns true so that this may be used in if statements. */
22
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
23
bool shared)
24
{
25
if (shared)
26
lockdep_assert_held_read(&kvm->mmu_lock);
27
else
28
lockdep_assert_held_write(&kvm->mmu_lock);
29
30
return true;
31
}
32
33
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34
{
35
/*
36
* Invalidate all roots, which besides the obvious, schedules all roots
37
* for zapping and thus puts the TDP MMU's reference to each root, i.e.
38
* ultimately frees all roots.
39
*/
40
kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
41
kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
42
43
#ifdef CONFIG_KVM_PROVE_MMU
44
KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
45
#endif
46
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
47
48
/*
49
* Ensure that all the outstanding RCU callbacks to free shadow pages
50
* can run before the VM is torn down. Putting the last reference to
51
* zapped roots will create new callbacks.
52
*/
53
rcu_barrier();
54
}
55
56
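/*
* Free a TDP MMU shadow page: release the external page table (if any),
* the shadow page table itself, and finally the page header.
*/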
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
57
{
58
free_page((unsigned long)sp->external_spt);
59
free_page((unsigned long)sp->spt);
60
kmem_cache_free(mmu_page_header_cache, sp);
61
}
62
63
/*
64
* This is called through call_rcu in order to free TDP page table memory
65
* safely with respect to other kernel threads that may be operating on
66
* the memory.
67
* By only accessing TDP MMU page table memory in an RCU read critical
68
* section, and freeing it after a grace period, lockless access to that
69
* memory won't use it after it is freed.
70
*/
71
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
72
{
73
struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
74
rcu_head);
75
76
tdp_mmu_free_sp(sp);
77
}
78
79
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
80
{
81
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
82
return;
83
84
/*
85
* The TDP MMU itself holds a reference to each root until the root is
86
* explicitly invalidated, i.e. the final reference should never be
87
* put for a valid root.
88
*/
89
KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
90
91
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
92
list_del_rcu(&root->link);
93
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
94
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
95
}
96
97
static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
98
enum kvm_tdp_mmu_root_types types)
99
{
100
if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
101
return false;
102
103
if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
104
return false;
105
106
if (likely(!is_mirror_sp(root)))
107
return types & KVM_DIRECT_ROOTS;
108
return types & KVM_MIRROR_ROOTS;
109
}
110
111
/*
112
* Returns the next root after @prev_root (or the first root if @prev_root is
113
* NULL) that matches @types. A reference to the returned root is
114
* acquired, and the reference to @prev_root is released (the caller obviously
115
* must hold a reference to @prev_root if it's non-NULL).
116
*
117
* Roots that don't match @types are skipped.
118
*
119
* Returns NULL if the end of tdp_mmu_roots was reached.
120
*/
121
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
122
struct kvm_mmu_page *prev_root,
123
enum kvm_tdp_mmu_root_types types)
124
{
125
struct kvm_mmu_page *next_root;
126
127
/*
128
* While the roots themselves are RCU-protected, fields such as
129
* role.invalid are protected by mmu_lock.
130
*/
131
lockdep_assert_held(&kvm->mmu_lock);
132
133
rcu_read_lock();
134
135
if (prev_root)
136
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
137
&prev_root->link,
138
typeof(*prev_root), link);
139
else
140
next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
141
typeof(*next_root), link);
142
143
while (next_root) {
144
if (tdp_mmu_root_match(next_root, types) &&
145
kvm_tdp_mmu_get_root(next_root))
146
break;
147
148
next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
149
&next_root->link, typeof(*next_root), link);
150
}
151
152
rcu_read_unlock();
153
154
if (prev_root)
155
kvm_tdp_mmu_put_root(kvm, prev_root);
156
157
return next_root;
158
}
159
160
/*
161
* Note: this iterator gets and puts references to the roots it iterates over.
162
* This makes it safe to release the MMU lock and yield within the loop, but
163
* if exiting the loop early, the caller must drop the reference to the most
164
* recent root. (Unless keeping a live reference is desirable.)
165
*
166
* If shared is set, this function is operating under the MMU lock in read
167
* mode.
168
*/
169
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
170
for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
171
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
172
_root = tdp_mmu_next_root(_kvm, _root, _types)) \
173
if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
174
} else
175
176
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
177
__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
178
179
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
180
for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
181
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
182
_root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
183
184
/*
185
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
186
* the implication being that any flow that holds mmu_lock for read is
187
* inherently yield-friendly and should use the yield-safe variant above.
188
* Holding mmu_lock for write obviates the need for RCU protection as the list
189
* is guaranteed to be stable.
190
*/
191
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
192
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
193
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
194
((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
195
!tdp_mmu_root_match((_root), (_types)))) { \
196
} else
197
198
/*
199
* Iterate over all TDP MMU roots in an RCU read-side critical section.
200
* It is safe to iterate over the SPTEs under the root, but their values will
201
* be unstable, so all writes must be atomic. As this routine is meant to be
202
* used without holding the mmu_lock at all, any bits that are flipped must
203
* be reflected in kvm_tdp_mmu_spte_need_atomic_write().
204
*/
205
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
206
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
207
if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
208
!tdp_mmu_root_match((_root), (_types))) { \
209
} else
210
211
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
212
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
213
214
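/*
* Allocate a shadow page and its page table from the vCPU's pre-filled
* memory caches.
*/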
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
215
{
216
struct kvm_mmu_page *sp;
217
218
sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
219
sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
220
221
return sp;
222
}
223
224
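/* Initialize a newly allocated shadow page for the given gfn and role. */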
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
225
gfn_t gfn, union kvm_mmu_page_role role)
226
{
227
INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
228
229
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
230
231
sp->role = role;
232
sp->gfn = gfn;
233
sp->ptep = sptep;
234
sp->tdp_mmu_page = true;
235
236
trace_kvm_mmu_get_page(sp, true);
237
}
238
239
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
240
struct tdp_iter *iter)
241
{
242
struct kvm_mmu_page *parent_sp;
243
union kvm_mmu_page_role role;
244
245
parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
246
247
role = parent_sp->role;
248
role.level--;
249
250
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
251
}
252
253
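/*
* Load a TDP MMU root for the vCPU: reuse an existing root with a matching
* role if one exists, otherwise allocate a new root and add it to the list
* of roots.
*/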
void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
254
{
255
struct kvm_mmu *mmu = vcpu->arch.mmu;
256
union kvm_mmu_page_role role = mmu->root_role;
257
int as_id = kvm_mmu_role_as_id(role);
258
struct kvm *kvm = vcpu->kvm;
259
struct kvm_mmu_page *root;
260
261
if (mirror)
262
role.is_mirror = true;
263
264
/*
265
* Check for an existing root before acquiring the pages lock to avoid
266
* unnecessary serialization if multiple vCPUs are loading a new root.
267
* E.g. when bringing up secondary vCPUs, KVM will already have created
268
* a valid root on behalf of the primary vCPU.
269
*/
270
read_lock(&kvm->mmu_lock);
271
272
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
273
if (root->role.word == role.word)
274
goto out_read_unlock;
275
}
276
277
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
278
279
/*
280
* Recheck for an existing root after acquiring the pages lock, another
281
* vCPU may have raced ahead and created a new usable root. Manually
282
* walk the list of roots as the standard macros assume that the pages
283
* lock is *not* held. WARN if grabbing a reference to a usable root
284
* fails, as the last reference to a root can only be put *after* the
285
* root has been invalidated, which requires holding mmu_lock for write.
286
*/
287
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
288
if (root->role.word == role.word &&
289
!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
290
goto out_spin_unlock;
291
}
292
293
root = tdp_mmu_alloc_sp(vcpu);
294
tdp_mmu_init_sp(root, NULL, 0, role);
295
296
/*
297
* TDP MMU roots are kept until they are explicitly invalidated, either
298
* by a memslot update or by the destruction of the VM. Initialize the
299
* refcount to two; one reference for the vCPU, and one reference for
300
* the TDP MMU itself, which is held until the root is invalidated and
301
* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
302
*/
303
refcount_set(&root->tdp_mmu_root_count, 2);
304
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
305
306
out_spin_unlock:
307
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
308
out_read_unlock:
309
read_unlock(&kvm->mmu_lock);
310
/*
311
* Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
312
* and actually consuming the root if it's invalidated after dropping
313
* mmu_lock, and the root can't be freed as this vCPU holds a reference.
314
*/
315
if (mirror) {
316
mmu->mirror_root_hpa = __pa(root->spt);
317
} else {
318
mmu->root.hpa = __pa(root->spt);
319
mmu->root.pgd = 0;
320
}
321
}
322
323
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
324
u64 old_spte, u64 new_spte, int level,
325
bool shared);
326
327
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
328
{
329
kvm_account_pgtable_pages((void *)sp->spt, +1);
330
#ifdef CONFIG_KVM_PROVE_MMU
331
atomic64_inc(&kvm->arch.tdp_mmu_pages);
332
#endif
333
}
334
335
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
336
{
337
kvm_account_pgtable_pages((void *)sp->spt, -1);
338
#ifdef CONFIG_KVM_PROVE_MMU
339
atomic64_dec(&kvm->arch.tdp_mmu_pages);
340
#endif
341
}
342
343
/**
344
* tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
345
*
346
* @kvm: kvm instance
347
* @sp: the page to be removed
348
*/
349
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
350
{
351
tdp_unaccount_mmu_page(kvm, sp);
352
353
if (!sp->nx_huge_page_disallowed)
354
return;
355
356
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
357
sp->nx_huge_page_disallowed = false;
358
untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
359
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
360
}
361
362
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
363
int level)
364
{
365
/*
366
* External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
367
* PTs are removed in a special order, involving free_external_spt().
368
* But remove_external_spte() will be called on non-leaf PTEs via
369
* __tdp_mmu_zap_root(), so avoid the error the former would return
370
* in this case.
371
*/
372
if (!is_last_spte(old_spte, level))
373
return;
374
375
/* Zapping a leaf SPTE is allowed only when the write lock is held. */
376
lockdep_assert_held_write(&kvm->mmu_lock);
377
378
kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
379
}
380
381
/**
382
* handle_removed_pt() - handle a page table removed from the TDP structure
383
*
384
* @kvm: kvm instance
385
* @pt: the page removed from the paging structure
386
* @shared: This operation may not be running under the exclusive use
387
* of the MMU lock and the operation must synchronize with other
388
* threads that might be modifying SPTEs.
389
*
390
* Given a page table that has been removed from the TDP paging structure,
391
* iterates through the page table to clear SPTEs and free child page tables.
392
*
393
* Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
394
* protection. Since this thread removed it from the paging structure,
395
* this thread will be responsible for ensuring the page is freed. Hence the
396
* early rcu_dereferences in the function.
397
*/
398
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
399
{
400
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
401
int level = sp->role.level;
402
gfn_t base_gfn = sp->gfn;
403
int i;
404
405
trace_kvm_mmu_prepare_zap_page(sp);
406
407
tdp_mmu_unlink_sp(kvm, sp);
408
409
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
410
tdp_ptep_t sptep = pt + i;
411
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
412
u64 old_spte;
413
414
if (shared) {
415
/*
416
* Set the SPTE to a nonpresent value that other
417
* threads will not overwrite. If the SPTE was
418
* already marked as frozen then another thread
419
* handling a page fault could overwrite it, so
420
* set the SPTE until it is set from some other
421
* value to the frozen SPTE value.
422
*/
423
for (;;) {
424
old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
425
if (!is_frozen_spte(old_spte))
426
break;
427
cpu_relax();
428
}
429
} else {
430
/*
431
* If the SPTE is not MMU-present, there is no backing
432
* page associated with the SPTE and so no side effects
433
* that need to be recorded, and exclusive ownership of
434
* mmu_lock ensures the SPTE can't be made present.
435
* Note, zapping MMIO SPTEs is also unnecessary as they
436
* are guarded by the memslots generation, not by being
437
* unreachable.
438
*/
439
old_spte = kvm_tdp_mmu_read_spte(sptep);
440
if (!is_shadow_present_pte(old_spte))
441
continue;
442
443
/*
444
* Use the common helper instead of a raw WRITE_ONCE as
445
* the SPTE needs to be updated atomically if it can be
446
* modified by a different vCPU outside of mmu_lock.
447
* Even though the parent SPTE is !PRESENT, the TLB
448
* hasn't yet been flushed, and both Intel and AMD
449
* document that A/D assists can use upper-level PxE
450
* entries that are cached in the TLB, i.e. the CPU can
451
* still access the page and mark it dirty.
452
*
453
* No retry is needed in the atomic update path as the
454
* sole concern is dropping a Dirty bit, i.e. no other
455
* task can zap/remove the SPTE as mmu_lock is held for
456
* write. Marking the SPTE as a frozen SPTE is not
457
* strictly necessary for the same reason, but using
458
* the frozen SPTE value keeps the shared/exclusive
459
* paths consistent and allows the handle_changed_spte()
460
* call below to hardcode the new value to FROZEN_SPTE.
461
*
462
* Note, even though dropping a Dirty bit is the only
463
* scenario where a non-atomic update could result in a
464
* functional bug, simply checking the Dirty bit isn't
465
* sufficient as a fast page fault could read the upper
466
* level SPTE before it is zapped, and then make this
467
* target SPTE writable, resume the guest, and set the
468
* Dirty bit between reading the SPTE above and writing
469
* it here.
470
*/
471
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
472
FROZEN_SPTE, level);
473
}
474
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
475
old_spte, FROZEN_SPTE, level, shared);
476
477
if (is_mirror_sp(sp)) {
478
KVM_BUG_ON(shared, kvm);
479
remove_external_spte(kvm, gfn, old_spte, level);
480
}
481
}
482
483
if (is_mirror_sp(sp) &&
484
WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
485
sp->external_spt))) {
486
/*
487
* Failed to free the page table page in the mirror page table;
488
* there is nothing more that can be done.
489
* Intentionally leak the page to prevent the kernel from
490
* accessing the encrypted page.
491
*/
492
sp->external_spt = NULL;
493
}
494
495
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
496
}
497
498
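/*
* Return the external page table backing a shadow-present, non-leaf SPTE,
* or NULL if the SPTE is a leaf or not present.
*/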
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
499
{
500
if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
501
struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
502
503
WARN_ON_ONCE(sp->role.level + 1 != level);
504
WARN_ON_ONCE(sp->gfn != gfn);
505
return sp->external_spt;
506
}
507
508
return NULL;
509
}
510
511
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
512
gfn_t gfn, u64 old_spte,
513
u64 new_spte, int level)
514
{
515
bool was_present = is_shadow_present_pte(old_spte);
516
bool is_present = is_shadow_present_pte(new_spte);
517
bool is_leaf = is_present && is_last_spte(new_spte, level);
518
int ret = 0;
519
520
KVM_BUG_ON(was_present, kvm);
521
522
lockdep_assert_held(&kvm->mmu_lock);
523
/*
524
* We need to lock out other updates to the SPTE until the external
525
* page table has been modified. Use FROZEN_SPTE similar to
526
* the zapping case.
527
*/
528
if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
529
return -EBUSY;
530
531
/*
532
* Use a different call to set up either a middle-level
533
* external page table or a leaf entry.
534
*/
535
if (is_leaf) {
536
ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
537
} else {
538
void *external_spt = get_external_spt(gfn, new_spte, level);
539
540
KVM_BUG_ON(!external_spt, kvm);
541
ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
542
}
543
if (ret)
544
__kvm_tdp_mmu_write_spte(sptep, old_spte);
545
else
546
__kvm_tdp_mmu_write_spte(sptep, new_spte);
547
return ret;
548
}
549
550
/**
551
* handle_changed_spte - handle bookkeeping associated with an SPTE change
552
* @kvm: kvm instance
553
* @as_id: the address space of the paging structure the SPTE was a part of
554
* @gfn: the base GFN that was mapped by the SPTE
555
* @old_spte: The value of the SPTE before the change
556
* @new_spte: The value of the SPTE after the change
557
* @level: the level of the PT the SPTE is part of in the paging structure
558
* @shared: This operation may not be running under the exclusive use of
559
* the MMU lock and the operation must synchronize with other
560
* threads that might be modifying SPTEs.
561
*
562
* Handle bookkeeping that might result from the modification of a SPTE. Note,
563
* dirty logging updates are handled in common code, not here (see make_spte()
564
* and fast_pf_fix_direct_spte()).
565
*/
566
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
567
u64 old_spte, u64 new_spte, int level,
568
bool shared)
569
{
570
bool was_present = is_shadow_present_pte(old_spte);
571
bool is_present = is_shadow_present_pte(new_spte);
572
bool was_leaf = was_present && is_last_spte(old_spte, level);
573
bool is_leaf = is_present && is_last_spte(new_spte, level);
574
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
575
576
WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
577
WARN_ON_ONCE(level < PG_LEVEL_4K);
578
WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
579
580
/*
581
* If this warning were to trigger it would indicate that there was a
582
* missing MMU notifier or a race with some notifier handler.
583
* A present, leaf SPTE should never be directly replaced with another
584
* present leaf SPTE pointing to a different PFN. A notifier handler
585
* should be zapping the SPTE before the main MM's page table is
586
* changed, or the SPTE should be zeroed, and the TLBs flushed by the
587
* thread before replacement.
588
*/
589
if (was_leaf && is_leaf && pfn_changed) {
590
pr_err("Invalid SPTE change: cannot replace a present leaf\n"
591
"SPTE with another present leaf SPTE mapping a\n"
592
"different PFN!\n"
593
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
594
as_id, gfn, old_spte, new_spte, level);
595
596
/*
597
* Crash the host to prevent error propagation and guest data
598
* corruption.
599
*/
600
BUG();
601
}
602
603
if (old_spte == new_spte)
604
return;
605
606
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
607
608
if (is_leaf)
609
check_spte_writable_invariants(new_spte);
610
611
/*
612
* The only times a SPTE should be changed from a non-present to
613
* non-present state is when an MMIO entry is installed/modified/
614
* removed. In that case, there is nothing to do here.
615
*/
616
if (!was_present && !is_present) {
617
/*
618
* If this change does not involve a MMIO SPTE or frozen SPTE,
619
* it is unexpected. Log the change, though it should not
620
* impact the guest since both the former and current SPTEs
621
* are nonpresent.
622
*/
623
if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
624
!is_mmio_spte(kvm, new_spte) &&
625
!is_frozen_spte(new_spte)))
626
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
627
"should not be replaced with another,\n"
628
"different nonpresent SPTE, unless one or both\n"
629
"are MMIO SPTEs, or the new SPTE is\n"
630
"a temporary frozen SPTE.\n"
631
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
632
as_id, gfn, old_spte, new_spte, level);
633
return;
634
}
635
636
if (is_leaf != was_leaf)
637
kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
638
639
/*
640
* Recursively handle child PTs if the change removed a subtree from
641
* the paging structure. Note the WARN on the PFN changing without the
642
* SPTE being converted to a hugepage (leaf) or being zapped. Shadow
643
* pages are kernel allocations and should never be migrated.
644
*/
645
if (was_present && !was_leaf &&
646
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
647
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
648
}
649
650
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
651
struct tdp_iter *iter,
652
u64 new_spte)
653
{
654
/*
655
* The caller is responsible for ensuring the old SPTE is not a FROZEN
656
* SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
657
* and pre-checking before inserting a new SPTE is advantageous as it
658
* avoids unnecessary work.
659
*/
660
WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
661
662
if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
663
int ret;
664
665
/*
666
* Users of atomic zapping don't operate on mirror roots,
667
* so don't handle it and bug the VM if it's seen.
668
*/
669
if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
670
return -EBUSY;
671
672
ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
673
iter->old_spte, new_spte, iter->level);
674
if (ret)
675
return ret;
676
} else {
677
u64 *sptep = rcu_dereference(iter->sptep);
678
679
/*
680
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
681
* and does not hold the mmu_lock. On failure, i.e. if a
682
* different logical CPU modified the SPTE, try_cmpxchg64()
683
* updates iter->old_spte with the current value, so the caller
684
* operates on fresh data, e.g. if it retries
685
* tdp_mmu_set_spte_atomic()
686
*/
687
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
688
return -EBUSY;
689
}
690
691
return 0;
692
}
693
694
/*
695
* tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
696
* and handle the associated bookkeeping. Do not mark the page dirty
697
* in KVM's dirty bitmaps.
698
*
699
* If setting the SPTE fails because it has changed, iter->old_spte will be
700
* refreshed to the current value of the spte.
701
*
702
* @kvm: kvm instance
703
* @iter: a tdp_iter instance currently on the SPTE that should be set
704
* @new_spte: The value the SPTE should be set to
705
* Return:
706
* * 0 - If the SPTE was set.
707
* * -EBUSY - If the SPTE cannot be set. In this case this function will have
708
* no side-effects other than setting iter->old_spte to the last
709
* known value of the spte.
710
*/
711
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
712
struct tdp_iter *iter,
713
u64 new_spte)
714
{
715
int ret;
716
717
lockdep_assert_held_read(&kvm->mmu_lock);
718
719
ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
720
if (ret)
721
return ret;
722
723
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
724
new_spte, iter->level, true);
725
726
return 0;
727
}
728
729
/*
730
* tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
731
* @kvm: KVM instance
732
* @as_id: Address space ID, i.e. regular vs. SMM
733
* @sptep: Pointer to the SPTE
734
* @old_spte: The current value of the SPTE
735
* @new_spte: The new value that will be set for the SPTE
736
* @gfn: The base GFN that was (or will be) mapped by the SPTE
737
* @level: The level _containing_ the SPTE (its parent PT's level)
738
*
739
* Returns the old SPTE value, which _may_ be different than @old_spte if the
740
* SPTE had volatile bits.
741
*/
742
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
743
u64 old_spte, u64 new_spte, gfn_t gfn, int level)
744
{
745
lockdep_assert_held_write(&kvm->mmu_lock);
746
747
/*
748
* No thread should be using this function to set SPTEs to or from the
749
* temporary frozen SPTE value.
750
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
751
* should be used. If operating under the MMU lock in write mode, the
752
* use of the frozen SPTE should not be necessary.
753
*/
754
WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
755
756
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
757
758
handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
759
760
/*
761
* Users that do non-atomic setting of PTEs don't operate on mirror
762
* roots, so don't handle it and bug the VM if it's seen.
763
*/
764
if (is_mirror_sptep(sptep)) {
765
KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
766
remove_external_spte(kvm, gfn, old_spte, level);
767
}
768
769
return old_spte;
770
}
771
772
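/*
* Wrapper around tdp_mmu_set_spte() that operates on a tdp_iter and
* refreshes iter->old_spte with the old value that was actually observed.
*/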
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
773
u64 new_spte)
774
{
775
WARN_ON_ONCE(iter->yielded);
776
iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
777
iter->old_spte, new_spte,
778
iter->gfn, iter->level);
779
}
780
781
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
782
for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
783
784
#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
785
tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
786
if (!is_shadow_present_pte(_iter.old_spte) || \
787
!is_last_spte(_iter.old_spte, _iter.level)) \
788
continue; \
789
else
790
791
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
792
struct tdp_iter *iter)
793
{
794
if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
795
return false;
796
797
/* Ensure forward progress has been made before yielding. */
798
return iter->next_last_level_gfn != iter->yielded_gfn;
799
}
800
801
/*
802
* Yield if the MMU lock is contended or this thread needs to return control
803
* to the scheduler.
804
*
805
* If this function should yield and flush is set, it will perform a remote
806
* TLB flush before yielding.
807
*
808
* If this function yields, iter->yielded is set and the caller must skip to
809
* the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
810
* over the paging structures to allow the iterator to continue its traversal
811
* from the paging structure root.
812
*
813
* Returns true if this function yielded.
814
*/
815
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
816
struct tdp_iter *iter,
817
bool flush, bool shared)
818
{
819
KVM_MMU_WARN_ON(iter->yielded);
820
821
if (!tdp_mmu_iter_need_resched(kvm, iter))
822
return false;
823
824
if (flush)
825
kvm_flush_remote_tlbs(kvm);
826
827
rcu_read_unlock();
828
829
if (shared)
830
cond_resched_rwlock_read(&kvm->mmu_lock);
831
else
832
cond_resched_rwlock_write(&kvm->mmu_lock);
833
834
rcu_read_lock();
835
836
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
837
838
iter->yielded = true;
839
return true;
840
}
841
842
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
843
{
844
/*
845
* Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
846
* a gpa range that would exceed the max gfn, and KVM does not create
847
* MMIO SPTEs for "impossible" gfns, instead sending such accesses down
848
* the slow emulation path every time.
849
*/
850
return kvm_mmu_max_gfn() + 1;
851
}
852
853
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
854
bool shared, int zap_level)
855
{
856
struct tdp_iter iter;
857
858
for_each_tdp_pte_min_level_all(iter, root, zap_level) {
859
retry:
860
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
861
continue;
862
863
if (!is_shadow_present_pte(iter.old_spte))
864
continue;
865
866
if (iter.level > zap_level)
867
continue;
868
869
if (!shared)
870
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
871
else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
872
goto retry;
873
}
874
}
875
876
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
877
bool shared)
878
{
879
880
/*
881
* The root must have an elevated refcount so that it's reachable via
882
* mmu_notifier callbacks, which allows this path to yield and drop
883
* mmu_lock. When handling an unmap/release mmu_notifier command, KVM
884
* must drop all references to relevant pages prior to completing the
885
* callback. Dropping mmu_lock with an unreachable root would result
886
* in zapping SPTEs after a relevant mmu_notifier callback completes
887
* and lead to use-after-free as zapping a SPTE triggers "writeback" of
888
* dirty accessed bits to the SPTE's associated struct page.
889
*/
890
WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
891
892
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
893
894
rcu_read_lock();
895
896
/*
897
* Zap roots in multiple passes of decreasing granularity, i.e. zap at
898
* 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
899
* preempt models) or mmu_lock contention (full or real-time models).
900
* Zapping at finer granularity marginally increases the total time of
901
* the zap, but in most cases the zap itself isn't latency sensitive.
902
*
903
* If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
904
* in order to mimic the page fault path, which can replace a 1GiB page
905
* table with an equivalent 1GiB hugepage, i.e. can get saddled with
906
* zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
907
* allows verifying that KVM can safely zap 1GiB regions, e.g. without
908
* inducing RCU stalls, without relying on a relatively rare event
909
* (zapping roots is orders of magnitude more common). Note, because
910
* zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
911
* in the iterator itself is unnecessary.
912
*/
913
if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
914
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
915
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
916
}
917
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
918
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
919
920
rcu_read_unlock();
921
}
922
923
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
924
struct kvm_mmu_page *sp)
925
{
926
struct tdp_iter iter = {
927
.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
928
.sptep = sp->ptep,
929
.level = sp->role.level + 1,
930
.gfn = sp->gfn,
931
.as_id = kvm_mmu_page_as_id(sp),
932
};
933
934
lockdep_assert_held_read(&kvm->mmu_lock);
935
936
if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
937
return false;
938
939
/*
940
* Root shadow pages don't have a parent page table and thus no
941
* associated entry, but they can never be possible NX huge pages.
942
*/
943
if (WARN_ON_ONCE(!sp->ptep))
944
return false;
945
946
/*
947
* Since mmu_lock is held in read mode, it's possible another task has
948
* already modified the SPTE. Zap the SPTE if and only if the SPTE
949
* points at the SP's page table, as checking shadow-present isn't
950
* sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
951
* another SP. Note, spte_to_child_pt() also checks that the SPTE is
952
* shadow-present, i.e. guards against zapping a frozen SPTE.
953
*/
954
if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
955
return false;
956
957
/*
958
* If a different task modified the SPTE, then it should be impossible
959
* for the SPTE to still be used for the to-be-zapped SP. Non-leaf
960
* SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
961
* creating non-leaf SPTEs, and all other bits are immutable for non-
962
* leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
963
* zapping and replacement.
964
*/
965
if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
966
WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
967
return false;
968
}
969
970
return true;
971
}
972
973
/*
974
* If can_yield is true, will release the MMU lock and reschedule if the
975
* scheduler needs the CPU or there is contention on the MMU lock. If this
976
* function cannot yield, it will not release the MMU lock or reschedule and
977
* the caller must ensure it does not supply too large a GFN range, or the
978
* operation can cause a soft lockup.
979
*/
980
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
981
gfn_t start, gfn_t end, bool can_yield, bool flush)
982
{
983
struct tdp_iter iter;
984
985
end = min(end, tdp_mmu_max_gfn_exclusive());
986
987
lockdep_assert_held_write(&kvm->mmu_lock);
988
989
rcu_read_lock();
990
991
for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
992
if (can_yield &&
993
tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
994
flush = false;
995
continue;
996
}
997
998
if (!is_shadow_present_pte(iter.old_spte) ||
999
!is_last_spte(iter.old_spte, iter.level))
1000
continue;
1001
1002
tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
1003
1004
/*
1005
* Zapping SPTEs in invalid roots doesn't require a TLB flush,
1006
* see kvm_tdp_mmu_zap_invalidated_roots() for details.
1007
*/
1008
if (!root->role.invalid)
1009
flush = true;
1010
}
1011
1012
rcu_read_unlock();
1013
1014
/*
1015
* Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
1016
* to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
1017
*/
1018
return flush;
1019
}
1020
1021
/*
1022
* Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
1023
* Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
1024
* one or more SPTEs were zapped since the MMU lock was last acquired.
1025
*/
1026
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
1027
{
1028
struct kvm_mmu_page *root;
1029
1030
lockdep_assert_held_write(&kvm->mmu_lock);
1031
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
1032
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
1033
1034
return flush;
1035
}
1036
1037
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
1038
{
1039
struct kvm_mmu_page *root;
1040
1041
/*
1042
* Zap all direct roots, including invalid direct roots, as all direct
1043
* SPTEs must be dropped before returning to the caller. For TDX, mirror
1044
* roots don't need handling in response to the mmu notifier (the caller).
1045
*
1046
* Zap directly even if the root is also being zapped by a concurrent
1047
* "fast zap". Walking zapped top-level SPTEs isn't all that expensive
1048
* and mmu_lock is already held, which means the other thread has yielded.
1049
*
1050
* A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1051
* is being destroyed or the userspace VMM has exited. In both cases,
1052
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1053
*/
1054
lockdep_assert_held_write(&kvm->mmu_lock);
1055
__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
1056
KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
1057
tdp_mmu_zap_root(kvm, root, false);
1058
}
1059
1060
/*
1061
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1062
* zap" completes.
1063
*/
1064
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
1065
{
1066
struct kvm_mmu_page *root;
1067
1068
if (shared)
1069
read_lock(&kvm->mmu_lock);
1070
else
1071
write_lock(&kvm->mmu_lock);
1072
1073
for_each_tdp_mmu_root_yield_safe(kvm, root) {
1074
if (!root->tdp_mmu_scheduled_root_to_zap)
1075
continue;
1076
1077
root->tdp_mmu_scheduled_root_to_zap = false;
1078
KVM_BUG_ON(!root->role.invalid, kvm);
1079
1080
/*
1081
* A TLB flush is not necessary as KVM performs a local TLB
1082
* flush when allocating a new root (see kvm_mmu_load()), and
1083
* when migrating a vCPU to a different pCPU. Note, the local
1084
* TLB flush on reuse also invalidates paging-structure-cache
1085
* entries, i.e. TLB entries for intermediate paging structures,
1086
* that may be zapped, as such entries are associated with the
1087
* ASID on both VMX and SVM.
1088
*/
1089
tdp_mmu_zap_root(kvm, root, shared);
1090
1091
/*
1092
* The reference needs to be put *after* zapping the root, as
1093
* the root must be reachable by mmu_notifiers while it's being
1094
* zapped.
1095
*/
1096
kvm_tdp_mmu_put_root(kvm, root);
1097
}
1098
1099
if (shared)
1100
read_unlock(&kvm->mmu_lock);
1101
else
1102
write_unlock(&kvm->mmu_lock);
1103
}
1104
1105
/*
1106
* Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1107
* is about to be zapped, e.g. in response to a memslots update. The actual
1108
* zapping is done separately so that it happens with mmu_lock held for read,
1109
* whereas invalidating roots must be done with mmu_lock held for write (unless
1110
* the VM is being destroyed).
1111
*
1112
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
1113
* See kvm_tdp_mmu_alloc_root().
1114
*/
1115
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
1116
enum kvm_tdp_mmu_root_types root_types)
1117
{
1118
struct kvm_mmu_page *root;
1119
1120
/*
1121
* Invalidating invalid roots doesn't make sense, prevent developers from
1122
* having to think about it.
1123
*/
1124
if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
1125
root_types &= ~KVM_INVALID_ROOTS;
1126
1127
/*
1128
* mmu_lock must be held for write to ensure that a root doesn't become
1129
* invalid while there are active readers (invalidating a root while
1130
* there are active readers may or may not be problematic in practice,
1131
* but it's uncharted territory and not supported).
1132
*
1133
* Waive the assertion if there are no users of @kvm, i.e. the VM is
1134
* being destroyed after all references have been put, or if no vCPUs
1135
* have been created (which means there are no roots), i.e. the VM is
1136
* being destroyed in an error path of KVM_CREATE_VM.
1137
*/
1138
if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
1139
refcount_read(&kvm->users_count) && kvm->created_vcpus)
1140
lockdep_assert_held_write(&kvm->mmu_lock);
1141
1142
/*
1143
* As above, mmu_lock isn't held when destroying the VM! There can't
1144
* be other references to @kvm, i.e. nothing else can invalidate roots
1145
* or get/put references to roots.
1146
*/
1147
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1148
if (!tdp_mmu_root_match(root, root_types))
1149
continue;
1150
1151
/*
1152
* Note, invalid roots can outlive a memslot update! Invalid
1153
* roots must be *zapped* before the memslot update completes,
1154
* but a different task can acquire a reference and keep the
1155
* root alive after it's been zapped.
1156
*/
1157
if (!root->role.invalid) {
1158
root->tdp_mmu_scheduled_root_to_zap = true;
1159
root->role.invalid = true;
1160
}
1161
}
1162
}
1163
1164
/*
1165
* Installs a last-level SPTE to handle a TDP page fault.
1166
* (NPT/EPT violation/misconfiguration)
1167
*/
1168
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1169
struct kvm_page_fault *fault,
1170
struct tdp_iter *iter)
1171
{
1172
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1173
u64 new_spte;
1174
int ret = RET_PF_FIXED;
1175
bool wrprot = false;
1176
1177
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1178
return RET_PF_RETRY;
1179
1180
if (is_shadow_present_pte(iter->old_spte) &&
1181
(fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
1182
is_last_spte(iter->old_spte, iter->level)) {
1183
WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
1184
return RET_PF_SPURIOUS;
1185
}
1186
1187
if (unlikely(!fault->slot))
1188
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1189
else
1190
wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1191
fault->pfn, iter->old_spte, fault->prefetch,
1192
false, fault->map_writable, &new_spte);
1193
1194
if (new_spte == iter->old_spte)
1195
ret = RET_PF_SPURIOUS;
1196
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1197
return RET_PF_RETRY;
1198
else if (is_shadow_present_pte(iter->old_spte) &&
1199
(!is_last_spte(iter->old_spte, iter->level) ||
1200
WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
1201
kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1202
1203
/*
1204
* If the page fault was caused by a write but the page is write
1205
* protected, emulation is needed. If the emulation was skipped,
1206
* the vCPU would have the same fault again.
1207
*/
1208
if (wrprot && fault->write)
1209
ret = RET_PF_WRITE_PROTECTED;
1210
1211
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1212
if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
1213
vcpu->stat.pf_mmio_spte_created++;
1214
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1215
new_spte);
1216
ret = RET_PF_EMULATE;
1217
} else {
1218
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1219
rcu_dereference(iter->sptep));
1220
}
1221
1222
return ret;
1223
}
1224
1225
/*
1226
* tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1227
* provided page table.
1228
*
1229
* @kvm: kvm instance
1230
* @iter: a tdp_iter instance currently on the SPTE that should be set
1231
* @sp: The new TDP page table to install.
1232
* @shared: This operation is running under the MMU lock in read mode.
1233
*
1234
* Returns: 0 if the new page table was installed. Non-0 if the page table
1235
* could not be installed (e.g. the atomic compare-exchange failed).
1236
*/
1237
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1238
struct kvm_mmu_page *sp, bool shared)
1239
{
1240
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
1241
int ret = 0;
1242
1243
if (shared) {
1244
ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1245
if (ret)
1246
return ret;
1247
} else {
1248
tdp_mmu_iter_set_spte(kvm, iter, spte);
1249
}
1250
1251
tdp_account_mmu_page(kvm, sp);
1252
1253
return 0;
1254
}
1255
1256
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1257
struct kvm_mmu_page *sp, bool shared);
1258
1259
/*
1260
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1261
* page tables and SPTEs to translate the faulting guest physical address.
1262
*/
1263
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1264
{
1265
struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
1266
struct kvm *kvm = vcpu->kvm;
1267
struct tdp_iter iter;
1268
struct kvm_mmu_page *sp;
1269
int ret = RET_PF_RETRY;
1270
1271
KVM_MMU_WARN_ON(!root || root->role.invalid);
1272
1273
kvm_mmu_hugepage_adjust(vcpu, fault);
1274
1275
trace_kvm_mmu_spte_requested(fault);
1276
1277
rcu_read_lock();
1278
1279
for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
1280
int r;
1281
1282
if (fault->nx_huge_page_workaround_enabled)
1283
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1284
1285
/*
1286
* If SPTE has been frozen by another thread, just give up and
1287
* retry, avoiding unnecessary page table allocation and free.
1288
*/
1289
if (is_frozen_spte(iter.old_spte))
1290
goto retry;
1291
1292
if (iter.level == fault->goal_level)
1293
goto map_target_level;
1294
1295
/* Step down into the lower level page table if it exists. */
1296
if (is_shadow_present_pte(iter.old_spte) &&
1297
!is_large_pte(iter.old_spte))
1298
continue;
1299
1300
/*
1301
* The SPTE is either non-present or points to a huge page that
1302
* needs to be split.
1303
*/
1304
sp = tdp_mmu_alloc_sp(vcpu);
1305
tdp_mmu_init_child_sp(sp, &iter);
1306
if (is_mirror_sp(sp))
1307
kvm_mmu_alloc_external_spt(vcpu, sp);
1308
1309
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1310
1311
if (is_shadow_present_pte(iter.old_spte)) {
1312
/* Don't support large page for mirrored roots (TDX) */
1313
KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
1314
r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1315
} else {
1316
r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1317
}
1318
1319
/*
1320
* Force the guest to retry if installing an upper level SPTE
1321
* failed, e.g. because a different task modified the SPTE.
1322
*/
1323
if (r) {
1324
tdp_mmu_free_sp(sp);
1325
goto retry;
1326
}
1327
1328
if (fault->huge_page_disallowed &&
1329
fault->req_level >= iter.level) {
1330
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1331
if (sp->nx_huge_page_disallowed)
1332
track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
1333
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1334
}
1335
}
1336
1337
/*
1338
* The walk aborted before reaching the target level, e.g. because the
1339
* iterator detected an upper level SPTE was frozen during traversal.
1340
*/
1341
WARN_ON_ONCE(iter.level == fault->goal_level);
1342
goto retry;
1343
1344
map_target_level:
1345
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1346
1347
retry:
1348
rcu_read_unlock();
1349
return ret;
1350
}
1351
1352
/* Used by mmu notifier via kvm_unmap_gfn_range() */
1353
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1354
bool flush)
1355
{
1356
enum kvm_tdp_mmu_root_types types;
1357
struct kvm_mmu_page *root;
1358
1359
types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
1360
1361
__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
1362
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1363
range->may_block, flush);
1364
1365
return flush;
1366
}
1367
1368
/*
1369
* Mark the SPTEs for the range of GFNs [start, end) unaccessed and return non-zero
1370
* if any of the GFNs in the range have been accessed.
1371
*
1372
* No need to mark the corresponding PFN as accessed as this call is coming
1373
* from the clear_young() or clear_flush_young() notifier, which uses the
1374
* return value to determine if the page has been accessed.
1375
*/
1376
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
1377
{
1378
u64 new_spte;
1379
1380
if (spte_ad_enabled(iter->old_spte)) {
1381
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
1382
shadow_accessed_mask);
1383
new_spte = iter->old_spte & ~shadow_accessed_mask;
1384
} else {
1385
new_spte = mark_spte_for_access_track(iter->old_spte);
1386
/*
1387
* It is safe for the following cmpxchg to fail. Leave the
1388
* Accessed bit set, as the spte is most likely young anyway.
1389
*/
1390
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
1391
return;
1392
}
1393
1394
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1395
iter->old_spte, new_spte);
1396
}
1397
1398
static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
1399
struct kvm_gfn_range *range,
1400
bool test_only)
1401
{
1402
enum kvm_tdp_mmu_root_types types;
1403
struct kvm_mmu_page *root;
1404
struct tdp_iter iter;
1405
bool ret = false;
1406
1407
types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
1408
1409
/*
1410
* Don't support rescheduling, none of the MMU notifiers that funnel
1411
* into this helper allow blocking; it'd be dead, wasteful code. Note,
1412
* this helper must NOT be used to unmap GFNs, as it processes only
1413
* valid roots!
1414
*/
1415
WARN_ON(types & ~KVM_VALID_ROOTS);
1416
1417
guard(rcu)();
1418
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
1419
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
1420
if (!is_accessed_spte(iter.old_spte))
1421
continue;
1422
1423
if (test_only)
1424
return true;
1425
1426
ret = true;
1427
kvm_tdp_mmu_age_spte(kvm, &iter);
1428
}
1429
}
1430
1431
return ret;
1432
}
1433
1434
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1435
{
1436
return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
1437
}
1438
1439
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1440
{
1441
return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
1442
}
1443
1444
/*
1445
* Remove write access from all SPTEs at or above min_level that map GFNs
1446
* [start, end). Returns true if an SPTE has been changed and the TLBs need to
1447
* be flushed.
1448
*/
1449
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1450
gfn_t start, gfn_t end, int min_level)
1451
{
1452
struct tdp_iter iter;
1453
u64 new_spte;
1454
bool spte_set = false;
1455
1456
rcu_read_lock();
1457
1458
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1459
1460
for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
1461
retry:
1462
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1463
continue;
1464
1465
if (!is_shadow_present_pte(iter.old_spte) ||
1466
!is_last_spte(iter.old_spte, iter.level) ||
1467
!(iter.old_spte & PT_WRITABLE_MASK))
1468
continue;
1469
1470
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1471
1472
if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1473
goto retry;
1474
1475
spte_set = true;
1476
}
1477
1478
rcu_read_unlock();
1479
return spte_set;
1480
}
1481
1482
/*
1483
* Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1484
* only affect leaf SPTEs down to min_level.
1485
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
1486
*/
1487
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1488
const struct kvm_memory_slot *slot, int min_level)
1489
{
1490
struct kvm_mmu_page *root;
1491
bool spte_set = false;
1492
1493
lockdep_assert_held_read(&kvm->mmu_lock);
1494
1495
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1496
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1497
slot->base_gfn + slot->npages, min_level);
1498
1499
return spte_set;
1500
}
1501
1502
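/*
* Allocate a shadow page for huge page splitting using regular kernel
* allocations, i.e. without going through a vCPU's memory caches, as
* splitting may run from VM-scoped ioctls with no vCPU context.
*/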
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
1503
{
1504
struct kvm_mmu_page *sp;
1505
1506
sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
1507
if (!sp)
1508
return NULL;
1509
1510
sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
1511
if (!sp->spt) {
1512
kmem_cache_free(mmu_page_header_cache, sp);
1513
return NULL;
1514
}
1515
1516
return sp;
1517
}
1518
1519
/* Note, the caller is responsible for initializing @sp. */
1520
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1521
struct kvm_mmu_page *sp, bool shared)
1522
{
1523
const u64 huge_spte = iter->old_spte;
1524
const int level = iter->level;
1525
int ret, i;
1526
1527
/*
1528
* No need for atomics when writing to sp->spt since the page table has
1529
* not been linked in yet and thus is not reachable from any other CPU.
1530
*/
1531
for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1532
sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
1533
1534
/*
1535
* Replace the huge spte with a pointer to the populated lower level
1536
* page table. Since we are making this change without a TLB flush vCPUs
1537
* will see a mix of the split mappings and the original huge mapping,
1538
* depending on what's currently in their TLB. This is fine from a
1539
* correctness standpoint since the translation will be the same either
1540
* way.
1541
*/
1542
ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1543
if (ret)
1544
goto out;
1545
1546
/*
1547
* tdp_mmu_link_sp() will handle subtracting the huge page we
1548
* are overwriting from the page stats. But we have to manually update
1549
* the page stats with the new present child pages.
1550
*/
1551
kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1552
1553
out:
1554
trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1555
return ret;
1556
}
1557
1558
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1559
struct kvm_mmu_page *root,
1560
gfn_t start, gfn_t end,
1561
int target_level, bool shared)
1562
{
1563
struct kvm_mmu_page *sp = NULL;
1564
struct tdp_iter iter;
1565
1566
rcu_read_lock();
1567
1568
/*
1569
* Traverse the page table splitting all huge pages above the target
1570
* level into one lower level. For example, if we encounter a 1GB page
1571
* we split it into 512 2MB pages.
1572
*
1573
* Since the TDP iterator uses a pre-order traversal, we are guaranteed
1574
* to visit an SPTE before ever visiting its children, which means we
1575
* will correctly recursively split huge pages that are more than one
1576
* level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1577
* and then splitting each of those to 512 4KB pages).
1578
*/
1579
for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
1580
retry:
1581
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1582
continue;
1583
1584
if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1585
continue;
1586
1587
if (!sp) {
1588
rcu_read_unlock();
1589
1590
if (shared)
1591
read_unlock(&kvm->mmu_lock);
1592
else
1593
write_unlock(&kvm->mmu_lock);
1594
1595
sp = tdp_mmu_alloc_sp_for_split();
1596
1597
if (shared)
1598
read_lock(&kvm->mmu_lock);
1599
else
1600
write_lock(&kvm->mmu_lock);
1601
1602
if (!sp) {
1603
trace_kvm_mmu_split_huge_page(iter.gfn,
1604
iter.old_spte,
1605
iter.level, -ENOMEM);
1606
return -ENOMEM;
1607
}
1608
1609
rcu_read_lock();
1610
1611
iter.yielded = true;
1612
continue;
1613
}
1614
1615
tdp_mmu_init_child_sp(sp, &iter);
1616
1617
if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1618
goto retry;
1619
1620
sp = NULL;
1621
}
1622
1623
rcu_read_unlock();
1624
1625
/*
1626
* It's possible to exit the loop having never used the last sp if, for
1627
* example, a vCPU doing HugePage NX splitting wins the race and
1628
* installs its own sp in place of the last sp we tried to split.
1629
*/
1630
if (sp)
1631
tdp_mmu_free_sp(sp);
1632
1633
return 0;
1634
}
1635
1636
1637
/*
1638
* Try to split all huge pages mapped by the TDP MMU down to the target level.
1639
*/
1640
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1641
const struct kvm_memory_slot *slot,
1642
gfn_t start, gfn_t end,
1643
int target_level, bool shared)
1644
{
1645
struct kvm_mmu_page *root;
1646
int r = 0;
1647
1648
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1649
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1650
r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1651
if (r) {
1652
kvm_tdp_mmu_put_root(kvm, root);
1653
break;
1654
}
1655
}
1656
}
1657
1658
static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
1659
{
1660
/*
1661
* All TDP MMU shadow pages share the same role as their root, aside
1662
* from level, so it is valid to key off any shadow page to determine if
1663
* write protection is needed for an entire tree.
1664
*/
1665
return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
1666
}

static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
			 PT_WRITABLE_MASK : shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (!(iter.old_spte & dbit))
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;
	}

	rcu_read_unlock();
}
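
/*
 * A standalone sketch (not part of this file) of the bit selection used
 * above: when A/D bits cannot be used, dirty tracking falls back to write
 * protection and the W-bit is cleared; otherwise only the D-bit is cleared
 * so the page stays writable. The bit positions below are hypothetical
 * placeholders, not the real PT_WRITABLE_MASK/shadow_dirty_mask values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_WRITABLE_BIT	(1ULL << 1)
#define EX_DIRTY_BIT	(1ULL << 6)

static uint64_t example_clear_dirty_status(uint64_t spte, bool need_write_protect)
{
	/* Mirrors the dbit selection in clear_dirty_gfn_range() above. */
	uint64_t dbit = need_write_protect ? EX_WRITABLE_BIT : EX_DIRTY_BIT;

	return spte & ~dbit;
}

int main(void)
{
	uint64_t spte = EX_WRITABLE_BIT | EX_DIRTY_BIT;

	printf("A/D enabled:     %#llx\n",
	       (unsigned long long)example_clear_dirty_status(spte, false));
	printf("write-protected: %#llx\n",
	       (unsigned long long)example_clear_dirty_status(spte, true));
	return 0;
}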

/*
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
 * memslot.
 */
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				      slot->base_gfn + slot->npages);
}

static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
			 PT_WRITABLE_MASK : shadow_dirty_mask;
	struct tdp_iter iter;

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
	}

	rcu_read_unlock();
}
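
/*
 * A standalone sketch (not part of this file) of the mask handling above: the
 * walk starts at gfn + __ffs(mask) (the lowest set bit), each visited 4K GFN
 * maps to bit (visited_gfn - gfn), cleared bits are dropped from the mask, and
 * the loop can stop as soon as the mask is exhausted. __builtin_ctzl() stands
 * in for the kernel's __ffs(); the GFN values are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long base_gfn = 0x1000;
	unsigned long mask = 0x15UL;	/* GFNs +0, +2, +4 are dirty */

	unsigned long first = __builtin_ctzl(mask);

	for (unsigned long gfn = base_gfn + first; mask; gfn++) {
		unsigned long bit = 1UL << (gfn - base_gfn);

		if (!(mask & bit))
			continue;

		mask &= ~bit;
		printf("clear dirty status of gfn %#lx\n", gfn);
	}
	return 0;
}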

/*
 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
 * which a bit is set in mask, starting at gfn. The given memslot is expected to
 * contain all the GFNs represented by set bits in the mask.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static int tdp_mmu_make_huge_spte(struct kvm *kvm,
				  struct tdp_iter *parent,
				  u64 *huge_spte)
{
	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
	gfn_t start = parent->gfn;
	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
		/*
		 * Use the parent iterator when checking for forward progress so
		 * that KVM doesn't get stuck continuously trying to yield (i.e.
		 * returning -EAGAIN here and then failing the forward progress
		 * check in the caller ad nauseam).
		 */
		if (tdp_mmu_iter_need_resched(kvm, parent))
			return -EAGAIN;

		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
		return 0;
	}

	return -ENOENT;
}
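
/*
 * A standalone sketch (not part of this file) of the range computed above: on
 * x86 each page-table level multiplies the mapping size by 512, so a level-2
 * (2MB) parent covers 512 4K GFNs and a level-3 (1GB) parent covers 512 * 512.
 * example_pages_per_hpage() is a hypothetical rendering of that arithmetic,
 * not the kernel's KVM_PAGES_PER_HPAGE macro.
 */
#include <stdio.h>

static unsigned long example_pages_per_hpage(int level)
{
	/* 4K pages spanned by a huge mapping at the given level. */
	return 1UL << ((level - 1) * 9);
}

int main(void)
{
	printf("2MB (level 2): %lu 4K pages\n", example_pages_per_hpage(2));
	printf("1GB (level 3): %lu 4K pages\n", example_pages_per_hpage(3));
	return 0;
}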

static void recover_huge_pages_range(struct kvm *kvm,
				     struct kvm_mmu_page *root,
				     const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;
	bool flush = false;
	u64 huge_spte;
	int r;

	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
		return;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge. More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
		if (max_mapping_level < iter.level)
			continue;

		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
		if (r == -EAGAIN)
			goto retry;
		else if (r)
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
			goto retry;

		flush = true;
	}

	if (flush)
		kvm_flush_remote_tlbs_memslot(kvm, slot);

	rcu_read_unlock();
}
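
/*
 * A standalone sketch (not part of this file) of the containment check above:
 * the iterator's gfn for a non-leaf entry is aligned to that entry's level, so
 * it can land outside [base_gfn, base_gfn + npages) even though the entry
 * overlaps the slot; such entries must be skipped rather than looked up in the
 * slot's metadata. Both helpers and the GFN values are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long example_pages_at_level(int level)
{
	return 1UL << ((level - 1) * 9);
}

static bool example_huge_base_in_slot(unsigned long gfn, int level,
				      unsigned long start, unsigned long end)
{
	/* Mirrors the "iter.gfn < start || iter.gfn >= end" check above. */
	unsigned long base = gfn & ~(example_pages_at_level(level) - 1);

	return base >= start && base < end;
}

int main(void)
{
	/* A slot starting mid-way through a 2MB-aligned region. */
	printf("%d\n", example_huge_base_in_slot(0x100, 2, 0x80, 0x10080)); /* 0 */
	printf("%d\n", example_huge_base_in_slot(0x200, 2, 0x80, 0x10080)); /* 1 */
	return 0;
}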

/*
 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
 * huge SPTEs where possible.
 */
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
				    const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		recover_huge_pages_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}
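
/*
 * A standalone sketch (not part of this file) of the write-protection step
 * above: the hardware-writable and KVM-internal MMU-writable bits are cleared
 * together, and if the SPTE is already in that state there is nothing to do
 * and no TLB flush is needed. The bit positions are hypothetical placeholders,
 * not the real PT_WRITABLE_MASK/shadow_mmu_writable_mask values.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_PT_WRITABLE	(1ULL << 1)
#define EX_MMU_WRITABLE	(1ULL << 58)

static bool example_write_protect(uint64_t *spte)
{
	uint64_t new_spte = *spte & ~(EX_PT_WRITABLE | EX_MMU_WRITABLE);

	if (new_spte == *spte)
		return false;	/* already write-protected, no flush needed */

	*spte = new_spte;
	return true;		/* caller should flush TLBs */
}

int main(void)
{
	uint64_t spte = EX_PT_WRITABLE | EX_MMU_WRITABLE | 0x1;

	printf("%d %#llx\n", example_write_protect(&spte),
	       (unsigned long long)spte);
	printf("%d %#llx\n", example_write_protect(&spte),
	       (unsigned long long)spte);
	return 0;
}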

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
	struct tdp_iter iter;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}
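
/*
 * A standalone sketch (not part of this file) of how a caller consumes the
 * output above: sptes[] is indexed by level, the walk records one entry per
 * level visited from the root downward, and the return value is the lowest
 * level reached (or -1 if nothing was visited). All SPTE values below are
 * made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long sptes[6] = { 0 };
	int root_level = 4, leaf = -1;

	/* Pretend walk: one fake SPTE recorded per level visited. */
	for (int level = root_level; level >= 2; level--) {
		leaf = level;
		sptes[leaf] = 0x1000ULL * level | 0x7;
	}

	for (int level = root_level; level >= leaf; level--)
		printf("level %d: %#llx\n", level, sptes[level]);

	printf("lowest level reached: %d\n", leaf);
	return 0;
}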

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
					u64 *spte)
{
	/* Fast pf is not supported for mirrored roots */
	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
	struct tdp_iter iter;
	tdp_ptep_t sptep = NULL;

	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}