Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/s390/kvm/gmap.c
121832 views
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Guest memory management for KVM/s390
4
*
5
* Copyright IBM Corp. 2008, 2020, 2024
6
*
7
* Author(s): Claudio Imbrenda <[email protected]>
8
* Martin Schwidefsky <[email protected]>
9
* David Hildenbrand <[email protected]>
10
* Janosch Frank <[email protected]>
11
*/
12
13
#include <linux/compiler.h>
14
#include <linux/kvm.h>
15
#include <linux/kvm_host.h>
16
#include <linux/pgtable.h>
17
#include <linux/pagemap.h>
18
#include <asm/lowcore.h>
19
#include <asm/uv.h>
20
#include <asm/gmap_helpers.h>
21
22
#include "dat.h"
23
#include "gmap.h"
24
#include "kvm-s390.h"
25
#include "faultin.h"
26
27
static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
28
{
29
return vcpu->arch.sie_block->prog0c & PROG_IN_SIE;
30
}
31
32
/*
 * Pick the smallest DAT table type able to map addresses up to @limit.
 * A @limit of 0 means "no limit" and selects the largest (region 1) type.
 */
static int gmap_limit_to_type(gfn_t limit)
{
	if (limit) {
		if (limit <= _REGION3_SIZE >> PAGE_SHIFT)
			return TABLE_TYPE_SEGMENT;
		if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
			return TABLE_TYPE_REGION3;
		if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
			return TABLE_TYPE_REGION2;
	}
	return TABLE_TYPE_REGION1;
}
44
45
/**
46
* gmap_new() - Allocate and initialize a guest address space.
47
* @kvm: The kvm owning the guest.
48
* @limit: Maximum address of the gmap address space.
49
*
50
* Return: A guest address space structure.
51
*/
52
struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
53
{
54
struct crst_table *table;
55
struct gmap *gmap;
56
int type;
57
58
type = gmap_limit_to_type(limit);
59
60
gmap = kzalloc_obj(*gmap, GFP_KERNEL_ACCOUNT);
61
if (!gmap)
62
return NULL;
63
INIT_LIST_HEAD(&gmap->children);
64
INIT_LIST_HEAD(&gmap->list);
65
INIT_LIST_HEAD(&gmap->scb_users);
66
INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
67
spin_lock_init(&gmap->children_lock);
68
spin_lock_init(&gmap->host_to_rmap_lock);
69
refcount_set(&gmap->refcount, 1);
70
71
table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
72
if (!table) {
73
kfree(gmap);
74
return NULL;
75
}
76
77
gmap->asce.val = __pa(table);
78
gmap->asce.dt = type;
79
gmap->asce.tl = _ASCE_TABLE_LENGTH;
80
gmap->asce.x = 1;
81
gmap->asce.p = 1;
82
gmap->asce.s = 1;
83
gmap->kvm = kvm;
84
set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
85
86
return gmap;
87
}
88
89
static void gmap_add_child(struct gmap *parent, struct gmap *child)
90
{
91
KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
92
KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
93
KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
94
lockdep_assert_held(&parent->children_lock);
95
96
child->parent = parent;
97
98
if (is_ucontrol(parent))
99
set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
100
else
101
clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
102
103
if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
104
set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
105
else
106
clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
107
108
if (kvm_is_ucontrol(parent->kvm))
109
clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
110
list_add(&child->list, &parent->children);
111
}
112
113
/* Allocate a new gmap and register it as a child of @parent. */
struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
{
	struct gmap *child;

	lockdep_assert_not_held(&parent->children_lock);
	child = gmap_new(parent->kvm, limit);
	if (!child)
		return NULL;
	scoped_guard(spinlock, &parent->children_lock)
		gmap_add_child(parent, child);
	return child;
}
125
126
/**
 * gmap_set_limit() - Change the limit (and thus the table type) of a gmap.
 * @gmap: The gmap to act on.
 * @limit: The new maximum address of the gmap address space.
 *
 * The operation is retried as long as the DAT layer reports -ENOMEM after a
 * successful MMU cache topup.
 *
 * Return: 0 on success, a negative error value otherwise.
 */
int gmap_set_limit(struct gmap *gmap, gfn_t limit)
{
	struct kvm_s390_mmu_cache *mc;
	int rc, type;

	type = gmap_limit_to_type(limit);

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	do {
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			break;	/* break, not return: @mc must be freed below */
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = dat_set_asce_limit(mc, &gmap->asce, type);
	} while (rc == -ENOMEM);

	kvm_s390_free_mmu_cache(mc);
	/* Propagate the final result instead of unconditionally returning 0. */
	return rc;
}
148
149
/*
 * Free all rmap chains stored in a host_to_rmap radix tree.
 *
 * Indices are collected in batches of up to 16 per pass, since entries
 * cannot safely be deleted while iterating with radix_tree_for_each_slot().
 * Each deleted entry is the head of an rmap chain, which is freed link by
 * link.  The loop terminates once a pass finds no more slots.
 */
static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
{
	struct vsie_rmap *rmap, *rnext, *head;
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void __rcu **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		/* Gather up to 16 populated indices starting at @index. */
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		/* Delete the gathered entries and free their rmap chains. */
		for (i = 0; i < nr; i++) {
			index = indices[i];
			head = radix_tree_delete(root, index);
			gmap_for_each_rmap_safe(rmap, rnext, head)
				kfree(rmap);
		}
	} while (nr > 0);
}
175
176
void gmap_remove_child(struct gmap *child)
177
{
178
if (KVM_BUG_ON(!child->parent, child->kvm))
179
return;
180
lockdep_assert_held(&child->parent->children_lock);
181
182
list_del(&child->list);
183
child->parent = NULL;
184
}
185
186
/**
 * gmap_dispose() - Remove and free a guest address space and its children.
 * @gmap: Pointer to the guest address space structure.
 *
 * Context: The gmap must already be fully unlinked: no parent, no children,
 * no VSIE users, and a refcount of 0.
 */
void gmap_dispose(struct gmap *gmap)
{
	/* The gmap must have been removed from the parent beforehand */
	KVM_BUG_ON(gmap->parent, gmap->kvm);
	/* All children of this gmap must have been removed beforehand */
	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
	/* No VSIE shadow block is allowed to use this gmap */
	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
	/* The ASCE must be valid */
	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
	/* The refcount must be 0 */
	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);

	/* Flush tlb of all gmaps */
	asce_flush_tlb(gmap->asce);

	/* Free all DAT tables; page tables only if this gmap owns them. */
	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));

	/* Free additional data for a shadow gmap */
	if (is_shadow(gmap))
		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);

	kfree(gmap);
}
215
216
/**
 * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
 * @gmap: The gmap whose ASCE needs to be replaced.
 *
 * If the ASCE is a SEGMENT type then this function will return -EINVAL,
 * otherwise the pointers in the host_to_guest radix tree will keep pointing
 * to the wrong pages, causing use-after-free and memory corruption.
 * If the allocation of the new top level page table fails, the ASCE is not
 * replaced.
 * In any case, the old ASCE is always removed from the gmap CRST list.
 * Therefore the caller has to make sure to save a pointer to it
 * beforehand, unless a leak is actually intended.
 *
 * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
 * -ENOMEM if running out of memory.
 */
int s390_replace_asce(struct gmap *gmap)
{
	struct crst_table *table;
	union asce asce;

	/* Replacing segment type ASCEs would cause serious issues */
	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
		return -EINVAL;

	table = dat_alloc_crst_sleepable(0);
	if (!table)
		return -ENOMEM;
	/* Duplicate the current top level table into the new one. */
	memcpy(table, dereference_asce(gmap->asce), sizeof(*table));

	/* Set new table origin while preserving existing ASCE control bits */
	asce = gmap->asce;
	asce.rsto = virt_to_pfn(table);
	WRITE_ONCE(gmap->asce, asce);

	return 0;
}
253
254
/*
 * Request a prefix refresh for every vCPU whose prefix pages intersect
 * [@gfn, @end).  With @hint set, bail out (returning false) as soon as an
 * affected vCPU is found running in SIE.  Shadow gmaps have no prefix
 * pages, so false is returned for them immediately.
 */
bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
{
	struct kvm *kvm = gmap->kvm;
	struct kvm_vcpu *vcpu;
	unsigned long i;

	if (is_shadow(gmap))
		return false;
	kvm_for_each_vcpu(i, vcpu, kvm) {
		gfn_t prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));

		/* The prefix area spans two consecutive pages. */
		if (prefix_gfn >= end || gfn > prefix_gfn + 1)
			continue;
		if (hint && kvm_s390_is_in_sie(vcpu))
			return false;
		VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
			   gfn_to_gpa(gfn), gfn_to_gpa(end));
		kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
	}
	return true;
}
276
277
/* Walk context for gmap_age_gfn(): the gmap and the accumulated result. */
struct clear_young_pte_priv {
	struct gmap *gmap;
	/* set to true if any visited entry was young */
	bool young;
};
281
282
/*
 * Walk callback: test and clear the young (referenced) state of one PTE.
 *
 * Entries that are not present, or already both old and invalid, are
 * skipped.  Otherwise the entry is invalidated with young/dirty cleared and
 * protection set — unless the page holds the prefix of a vCPU that could
 * not be notified (gmap_mkold_prefix() returned false), in which case the
 * entry is left untouched.  Either way the page counts as young.
 */
static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *p = walk->priv;
	union pgste pgste;
	union pte pte, new;

	pte = READ_ONCE(*ptep);

	if (!pte.s.pr || (!pte.s.y && pte.h.i))
		return 0;

	pgste = pgste_get_lock(ptep);
	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
		new = pte;
		new.h.i = 1;
		new.s.y = 0;
		/* Transfer a pending dirty state to the folio before clearing it. */
		if ((new.s.d || !new.h.p) && !new.s.s)
			folio_set_dirty(pfn_folio(pte.h.pfra));
		new.s.d = 0;
		new.h.p = 1;

		pgste.prefix_notif = 0;
		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
	}
	p->young = 1;
	pgste_set_unlock(ptep, pgste);
	return 0;
}
310
311
/*
 * Walk callback: test and clear the young state of a leaf (large) CRSTE.
 *
 * Non-leaf or already old+invalid entries are skipped.  The update is
 * retried until the atomic exchange succeeds; if the large page holds a
 * prefix that cannot be notified, the loop breaks out without updating the
 * entry, but the range still counts as young.
 */
static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
{
	struct clear_young_pte_priv *priv = walk->priv;
	union crste crste, new;

	do {
		crste = READ_ONCE(*crstep);

		if (!crste.h.fc)
			return 0;
		if (!crste.s.fc1.y && crste.h.i)
			return 0;
		if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
			break;

		new = crste;
		new.h.i = 1;
		new.s.fc1.y = 0;
		new.s.fc1.prefix_notif = 0;
		/* Transfer a pending dirty state to the folio before clearing it. */
		if (new.s.fc1.d || !new.h.p)
			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
		new.s.fc1.d = 0;
		new.h.p = 1;
	} while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));

	priv->young = 1;
	return 0;
}
339
340
/**
341
* gmap_age_gfn() - Clear young.
342
* @gmap: The guest gmap.
343
* @start: The first gfn to test.
344
* @end: The gfn after the last one to test.
345
*
346
* Context: Called with the kvm mmu write lock held.
347
* Return: 1 if any page in the given range was young, otherwise 0.
348
*/
349
bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
350
{
351
const struct dat_walk_ops ops = {
352
.pte_entry = gmap_clear_young_pte,
353
.pmd_entry = gmap_clear_young_crste,
354
.pud_entry = gmap_clear_young_crste,
355
};
356
struct clear_young_pte_priv priv = {
357
.gmap = gmap,
358
.young = false,
359
};
360
361
_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
362
363
return priv.young;
364
}
365
366
/* Walk context for gmap_unmap_gfn_range(). */
struct gmap_unmap_priv {
	struct gmap *gmap;
	/* memslot containing the range being unmapped */
	struct kvm_memory_slot *slot;
};
370
371
/*
 * Walk callback: unmap one PTE.
 *
 * If the guest declared the page unused (CMMA), propagate that hint to the
 * host PTE so the backing page can be discarded.  If the gmap requests
 * export-on-unmap, the backing folio is converted from secure — after the
 * entry has been cleared and the pgste lock dropped.
 */
static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
{
	struct gmap_unmap_priv *priv = w->priv;
	struct folio *folio = NULL;
	unsigned long vmaddr;
	union pgste pgste;

	pgste = pgste_get_lock(ptep);
	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
	}
	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = pfn_folio(ptep->h.pfra);
	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
	pgste_set_unlock(ptep, pgste);
	/* Conversion may sleep; do it outside the pgste lock. */
	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}
392
393
/*
 * Walk callback: unmap one leaf (large) CRSTE.
 *
 * Only leaf entries (fc set) are handled here.  As with the PTE variant,
 * an export-on-unmap gmap gets its backing folio converted from secure
 * after the entry has been cleared.
 */
static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap_unmap_priv *priv = walk->priv;
	struct folio *folio = NULL;
	union crste old = *crstep;

	if (!old.h.fc)
		return 0;

	if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
		folio = phys_to_folio(crste_origin_large(old));
	/* No races should happen because kvm->mmu_lock is held in write mode */
	KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
		   priv->gmap->kvm);
	if (folio)
		uv_convert_from_secure_folio(folio);

	return 0;
}
412
413
/**
414
* gmap_unmap_gfn_range() - Unmap a range of guest addresses.
415
* @gmap: The gmap to act on.
416
* @slot: The memslot in which the range is located.
417
* @start: The first gfn to unmap.
418
* @end: The gfn after the last one to unmap.
419
*
420
* Context: Called with the kvm mmu write lock held.
421
* Return: false
422
*/
423
bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
424
{
425
const struct dat_walk_ops ops = {
426
.pte_entry = _gmap_unmap_pte,
427
.pmd_entry = _gmap_unmap_crste,
428
.pud_entry = _gmap_unmap_crste,
429
};
430
struct gmap_unmap_priv priv = {
431
.gmap = gmap,
432
.slot = slot,
433
};
434
435
lockdep_assert_held_write(&gmap->kvm->mmu_lock);
436
437
_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
438
return false;
439
}
440
441
/*
 * Test and clear the softdirty state of one PTE, recording the gfn in the
 * KVM dirty log when it was dirty.
 *
 * Context: Called with the pgste already locked; returns the (possibly
 * updated) pgste for the caller to unlock.
 */
static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
						  struct gmap *gmap)
{
	union pte pte = READ_ONCE(*ptep);

	/* Not present, or protected and not softdirty: nothing to log. */
	if (!pte.s.pr || (pte.h.p && !pte.s.sd))
		return pgste;

	/*
	 * If this page contains one or more prefixes of vCPUS that are currently
	 * running, do not reset the protection, leave it marked as dirty.
	 */
	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
		pte.h.p = 1;
		pte.s.sd = 0;
		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
	}

	mark_page_dirty(gmap->kvm, gfn);

	return pgste;
}
463
464
/* Walk callback: clear one PTE's softdirty state under the pgste lock. */
static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
					  struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union pgste pgste = pgste_get_lock(ptep);

	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
	pgste_set_unlock(ptep, pgste);
	return 0;
}
475
476
/*
 * Walk callback: test and clear the softdirty state of a leaf CRSTE and
 * mark every gfn it covers dirty in the KVM dirty log.
 *
 * Returning 1 aborts the walk early when a fatal signal is pending.
 */
static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
					    struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste crste, new;

	if (fatal_signal_pending(current))
		return 1;
	do {
		crste = READ_ONCE(*table);
		if (!crste.h.fc)
			return 0;
		if (crste.h.p && !crste.s.fc1.sd)
			return 0;

		/*
		 * If this large page contains one or more prefixes of vCPUs that are
		 * currently running, do not reset the protection, leave it marked as
		 * dirty.
		 */
		if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
			break;
		new = crste;
		new.h.p = 1;
		new.s.fc1.sd = 0;
	} while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));

	/* Log every page of the large mapping as dirty. */
	for ( ; gfn < end; gfn++)
		mark_page_dirty(gmap->kvm, gfn);

	return 0;
}
508
509
/* Harvest softdirty state for [@start, @end) into the KVM dirty log. */
void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
{
	const struct dat_walk_ops ops = {
		.pte_entry = _pte_test_and_clear_softdirty,
		.pmd_entry = _crste_test_and_clear_softdirty,
		.pud_entry = _crste_test_and_clear_softdirty,
	};

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, gmap);
}
521
522
/*
 * Try to resolve a minor fault on a leaf (large) CRSTE: re-validate the
 * entry and update young/dirty tracking without touching the mapping.
 *
 * Return: 0 if the fault was resolved, 1 if the slow path must handle it.
 */
static int gmap_handle_minor_crste_fault(struct gmap *gmap, struct guest_fault *f)
{
	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);

	/* Somehow the crste is not large anymore, let the slow path deal with it. */
	if (!oldcrste.h.fc)
		return 1;

	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
	f->writable = oldcrste.s.fc1.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
		return 0;

	if (!f->write_attempt || oldcrste.s.fc1.w) {
		/*
		 * A mapping that is writable and already dirty can be made
		 * fully writable right away, even on a read fault.
		 */
		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d;
		newcrste = oldcrste;
		newcrste.h.i = 0;
		newcrste.s.fc1.y = 1;
		if (f->write_attempt) {
			newcrste.h.p = 0;
			newcrste.s.fc1.d = 1;
			newcrste.s.fc1.sd = 1;
		}
		/* In case of races, let the slow path deal with it. */
		return !gmap_crstep_xchg_atomic(gmap, f->crstep, oldcrste, newcrste, f->gfn);
	}
	/* Trying to write on a read-only page, let the slow path deal with it. */
	return 1;
}
553
554
/*
 * Try to resolve a minor fault on a PTE: re-validate the entry and update
 * young/dirty tracking without changing the mapping.
 *
 * Context: Called with the pgste of @f->ptep locked.
 * Return: 0 if the fault was resolved, 1 if the slow path must handle it.
 */
static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
					struct guest_fault *f)
{
	union pte newpte, oldpte = READ_ONCE(*f->ptep);

	f->pfn = oldpte.h.pfra;
	f->writable = oldpte.s.w;

	/* Appropriate permissions already (race with another handler), nothing to do. */
	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
		return 0;
	/* Trying to write on a read-only page, let the slow path deal with it. */
	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
		return 1;

	newpte = oldpte;
	newpte.h.i = 0;
	newpte.s.y = 1;
	if (f->write_attempt) {
		newpte.h.p = 0;
		newpte.s.d = 1;
		newpte.s.sd = 1;
	}
	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);

	return 0;
}
581
582
/**
 * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault.
 * @gmap: The gmap whose fault needs to be resolved.
 * @fault: Describes the fault that is being resolved.
 *
 * A minor fault is a fault that can be resolved quickly within gmap.
 * The page is already mapped, the fault is only due to dirty/young tracking.
 *
 * Context: Called with the kvm mmu lock held.
 * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
 * not be resolved and needs to go through the slow path.
 */
int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
{
	union pgste pgste;
	int rc;

	lockdep_assert_held(&gmap->kvm->mmu_lock);

	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
			    &fault->crstep, &fault->ptep);
	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
	if (rc)
		return 1;

	if (fault->ptep) {
		/* PTE mapping: fix up under the pgste lock. */
		pgste = pgste_get_lock(fault->ptep);
		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
		if (!rc && fault->callback)
			fault->callback(fault);
		pgste_set_unlock(fault->ptep, pgste);
	} else {
		/* Large mapping: fixed up with atomic exchanges, no pgste. */
		rc = gmap_handle_minor_crste_fault(gmap, fault);
		if (!rc && fault->callback)
			fault->callback(fault);
	}
	return rc;
}
619
620
/* 2G (region 3) mappings are currently never used for guest backing. */
static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn)
{
	return false;
}
624
625
/* Whether 1M (segment) mappings may be used for this gmap. */
static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
{
	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
}
629
630
/*
 * Install the mapping described by @f into @gmap at the given table @level.
 *
 * Walks (allocating as needed) down to @level; a PTE is installed under the
 * pgste lock, while large mappings are installed with an atomic exchange
 * retry loop.  An entry that is already populated with a different frame
 * yields -EAGAIN.  The fault callback, if any, is invoked after a
 * successful installation.
 *
 * Return: 0 on success, -ENOMEM/-EINVAL on walk errors, -EAGAIN on races.
 */
static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
		      struct guest_fault *f)
{
	union crste oldval, newval;
	union pte newpte, oldpte;
	union pgste pgste;
	int rc = 0;

	rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
			    &f->crstep, &f->ptep);
	if (rc == -ENOMEM)
		return rc;
	if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
		return rc;
	if (rc)
		return -EAGAIN;
	/* The walk may legitimately stop at a larger mapping, never deeper. */
	if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
		return -EINVAL;

	if (f->ptep) {
		pgste = pgste_get_lock(f->ptep);
		oldpte = *f->ptep;
		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
		/* Preserve softdirty; ignore it in the equality check below. */
		newpte.s.sd = oldpte.s.sd;
		oldpte.s.sd = 0;
		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
			pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
			if (f->callback)
				f->callback(f);
		} else {
			rc = -EAGAIN;
		}
		pgste_set_unlock(f->ptep, pgste);
	} else {
		do {
			oldval = READ_ONCE(*f->crstep);
			newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
					    f->write_attempt | oldval.s.fc1.d);
			newval.s.fc1.s = !f->page;
			newval.s.fc1.sd = oldval.s.fc1.sd;
			/* A different frame is already mapped here: back off. */
			if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
			    crste_origin_large(oldval) != crste_origin_large(newval))
				return -EAGAIN;
		} while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
		if (f->callback)
			f->callback(f);
	}

	return rc;
}
680
681
int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
682
{
683
unsigned int order;
684
int level;
685
686
lockdep_assert_held(&gmap->kvm->mmu_lock);
687
688
level = TABLE_TYPE_PAGE_TABLE;
689
if (f->page) {
690
order = folio_order(page_folio(f->page));
691
if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn))
692
level = TABLE_TYPE_REGION3;
693
else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
694
level = TABLE_TYPE_SEGMENT;
695
}
696
return _gmap_link(mc, gmap, level, f);
697
}
698
699
/*
 * Map one segment of the parent (per-VM) gmap into a ucontrol child gmap.
 *
 * The segment entry installed in the child either points to the parent's
 * page table for @p_gfn or, if none exists (and !@force_alloc), is a
 * special invalid entry carrying @p_gfn so it can be resolved later.
 * The parent's page table remembers its own vmaddr tag via PTVAL_VMADDR
 * for reverse translation.
 *
 * Return: 0 on success, a negative error value from the walks otherwise.
 */
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
	union crste newcrste, oldcrste;
	struct page_table *pt;
	union crste *crstep;
	union pte *ptep;
	int rc;

	if (force_alloc)
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	else
		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return rc;
	if (!ptep) {
		/* No parent page table: leave an invalid marker entry. */
		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
		newcrste.h.i = 1;
		newcrste.h.fc0.tl = 1;
	} else {
		/* Share the parent's page table with the child segment. */
		pt = pte_table_start(ptep);
		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
	}
	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
			    &crstep, &ptep);
	if (rc)
		return rc;
	do {
		oldcrste = READ_ONCE(*crstep);
		if (oldcrste.val == newcrste.val)
			break;
	} while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
	return 0;
}
736
737
/*
 * Fast-path ucontrol translation: if the child segment already shares a
 * parent page table, recover the parent address from the PTVAL_VMADDR tag.
 *
 * Return: 0 with *@gaddr translated, 1 if only a marker entry exists (a
 * mapping still has to be created), -EREMOTE when the walk fails or the
 * entry is not a ucas segment.
 */
static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
{
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
		return -EREMOTE;
	if (!ptep)
		return 1;
	/* Replace the segment part of the address with the stored vmaddr. */
	*gaddr &= ~_SEGMENT_MASK;
	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
	return 0;
}
752
753
/**
 * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
 * @mc: The memory cache to be used for allocations.
 * @gmap: The per-cpu gmap.
 * @gaddr: Pointer to the address to be translated, will get overwritten with
 * the translated address in case of success.
 * Translates the per-vCPU guest address into a fake guest address, which can
 * then be used with the fake memslots that are identity mapping userspace.
 * This allows ucontrol VMs to use the normal fault resolution path, like
 * normal VMs.
 *
 * Return: %0 in case of success, otherwise %-EREMOTE.
 */
int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
{
	gpa_t translated_address;
	union crste *crstep;
	gfn_t gfn;
	int rc;

	gfn = gpa_to_gfn(*gaddr);

	/* Fast path: try under the read lock first. */
	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
		if (rc <= 0)
			return rc;
	}
	do {
		/* Slow path: establish the shared mapping under the write lock. */
		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
			if (rc <= 0)
				return rc;
			/* The marker entry carries the parent segment address. */
			translated_address = (*gaddr & ~_SEGMENT_MASK) |
					     (crstep->val & _SEGMENT_MASK);
			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
		}
		if (!rc) {
			*gaddr = translated_address;
			return 0;
		}
		if (rc != -ENOMEM)
			return -EREMOTE;
		/* Retry after topping up the allocation cache. */
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
	} while (1);
	/* Not reached: the loop above always returns. */
	return 0;
}
801
802
/**
 * gmap_ucas_map() - Map segments of the parent gmap into a ucontrol gmap.
 * @gmap: The ucontrol child gmap.
 * @p_gfn: First gfn in the parent gmap.
 * @c_gfn: First gfn in the child gmap.
 * @count: Number of segments (page-table spans) to map.
 *
 * Allocation failures are handled by topping up the MMU cache and retrying
 * the same segment.
 *
 * Return: 0 on success, a negative error value otherwise.
 */
int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
{
	struct kvm_s390_mmu_cache *mc;
	int rc = 0;	/* was uninitialized: returned as garbage for count == 0 */

	mc = kvm_s390_new_mmu_cache();
	if (!mc)
		return -ENOMEM;

	while (count) {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
		if (rc == -ENOMEM) {
			/* Top up the cache and retry the same segment. */
			rc = kvm_s390_mmu_cache_topup(mc);
			if (rc)
				break;
			continue;
		}
		if (rc)
			break;

		count--;
		c_gfn += _PAGE_ENTRIES;
		p_gfn += _PAGE_ENTRIES;
	}
	/* The cache was previously leaked on every exit path. */
	kvm_s390_free_mmu_cache(mc);
	return rc;
}
829
830
/*
 * Remove one shared segment mapping from a ucontrol child gmap.
 * A missing entry is silently ignored; the exchange is retried until the
 * atomic update succeeds.
 */
static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
{
	union crste *crstep;
	union pte *ptep;
	int rc;

	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
	if (rc)
		return;
	while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
		;
}
842
843
/* Remove @count segment mappings from a ucontrol gmap, starting at @c_gfn. */
void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
{
	guard(read_lock)(&gmap->kvm->mmu_lock);

	while (count) {
		gmap_ucas_unmap_one(gmap, c_gfn);
		c_gfn += _PAGE_ENTRIES;
		count--;
	}
}
850
851
/*
 * Walk callback: invalidate one leaf (huge) CRSTE so the range gets
 * re-faulted with small pages.
 *
 * Prefix and VSIE notifications are raised before the entry is cleared;
 * the exchange is retried until it succeeds or the entry stops being a
 * leaf.  Returning @next asks the walker to stop so the caller can
 * reschedule and resume from there.
 */
static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	struct gmap *gmap = walk->priv;
	union crste crste, newcrste;

	crste = READ_ONCE(*crstep);
	newcrste = _CRSTE_EMPTY(crste.h.tt);

	while (crste_leaf(crste)) {
		if (crste_prefix(crste))
			gmap_unmap_prefix(gmap, gfn, next);
		if (crste.s.fc1.vsie_notif)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
			break;
		crste = READ_ONCE(*crstep);
	}

	if (need_resched())
		return next;

	return 0;
}
874
875
void gmap_split_huge_pages(struct gmap *gmap)
876
{
877
const struct dat_walk_ops ops = {
878
.pmd_entry = _gmap_split_crste,
879
.pud_entry = _gmap_split_crste,
880
};
881
gfn_t start = 0;
882
883
do {
884
scoped_guard(read_lock, &gmap->kvm->mmu_lock)
885
start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
886
&ops, DAT_WALK_IGN_HOLES, gmap);
887
cond_resched();
888
} while (start);
889
}
890
891
/*
 * Enable storage key handling for @gmap.
 *
 * The USES_SKEYS flag is set before disabling COW sharing and rolled back
 * on failure.  All existing storage keys are then reset in chunks, with
 * rescheduling between chunks.
 *
 * Context: Called with the host mmap lock held for writing.
 * Return: 0 on success, a negative error value otherwise.
 */
static int _gmap_enable_skeys(struct gmap *gmap)
{
	gfn_t start = 0;
	int rc;

	/* Nothing to do if storage keys are already in use. */
	if (uses_skeys(gmap))
		return 0;

	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
	rc = gmap_helper_disable_cow_sharing();
	if (rc) {
		/* Roll back the flag if COW sharing could not be disabled. */
		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
		return rc;
	}

	do {
		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
			start = dat_reset_skeys(gmap->asce, start);
		cond_resched();
	} while (start);
	return 0;
}
913
914
int gmap_enable_skeys(struct gmap *gmap)
915
{
916
int rc;
917
918
mmap_write_lock(gmap->kvm->mm);
919
rc = _gmap_enable_skeys(gmap);
920
mmap_write_unlock(gmap->kvm->mm);
921
return rc;
922
}
923
924
/* Walk callback: securely destroy the page backing a present PTE. */
static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	if (ptep->s.pr) {
		__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
		/* Ask the walker to stop here so the caller can reschedule. */
		if (need_resched())
			return next;
	}
	return 0;
}
933
934
/*
 * Walk callback: securely destroy the pages backing a present leaf CRSTE,
 * restricted to the overlap of the mapping with the walk's range.
 */
static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	phys_addr_t origin, cur, end;

	if (!crstep->h.fc || !crstep->s.fc1.pr)
		return 0;

	/* Clamp [gfn, next) to [walk->start, walk->end) in physical terms. */
	origin = crste_origin_large(*crstep);
	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
	for ( ; cur < end; cur += PAGE_SIZE)
		__kvm_s390_pv_destroy_page(phys_to_page(cur));
	/* Ask the walker to stop here so the caller can reschedule. */
	if (need_resched())
		return next;
	return 0;
}
950
951
/*
 * Securely destroy all guest pages in [@start, @end), rescheduling between
 * partial walks.  If @interruptible, abort with -EINTR on a fatal signal.
 */
int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
{
	const struct dat_walk_ops ops = {
		.pte_entry = _destroy_pages_pte,
		.pmd_entry = _destroy_pages_crste,
		.pud_entry = _destroy_pages_crste,
	};

	while (true) {
		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
						    DAT_WALK_IGN_HOLES, NULL);
		if (interruptible && fatal_signal_pending(current))
			return -EINTR;
		cond_resched();
		if (!start || start >= end)
			return 0;
	}
}
969
970
/*
 * Record a reverse mapping (parent gfn -> shadow gfn/level) for a shadow
 * gmap.  Duplicate entries are silently ignored.  The new rmap is either
 * prepended to an existing chain or inserted as a new radix tree entry.
 * The __free(kvfree) cleanup releases @rmap automatically on every path
 * where ownership was not transferred to the tree (rmap set to NULL).
 *
 * Context: Called with sg->host_to_rmap_lock held (GFP_ATOMIC allocation).
 * Return: 0 on success, a negative error value otherwise.
 */
int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
{
	struct vsie_rmap *rmap __free(kvfree) = NULL;
	struct vsie_rmap *temp;
	void __rcu **slot;
	int rc = 0;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->host_to_rmap_lock);

	rmap = kzalloc_obj(*rmap, GFP_ATOMIC);
	if (!rmap)
		return -ENOMEM;

	rmap->r_gfn = r_gfn;
	rmap->level = level;
	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
	if (slot) {
		/* Prepend to the existing chain, unless already present. */
		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
		for (temp = rmap->next; temp; temp = temp->next) {
			if (temp->val == rmap->val)
				return 0;
		}
		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
	} else {
		rmap->next = NULL;
		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
		if (rc)
			return rc;
	}
	/* Ownership transferred to the tree; disarm the cleanup. */
	rmap = NULL;

	return 0;
}
1004
1005
/*
 * Write-protect a parent gmap page on behalf of a shadow gmap and record
 * the reverse mapping so the shadow can be invalidated on changes.
 *
 * The parent entry is split down to PTE level if needed, the rmap is
 * inserted, and the PTE is (re)written with protection and the VSIE
 * notification bit set in its pgste.
 *
 * Context: Called with sg->parent->children_lock held.
 * Return: 0 on success, -EAGAIN if the pgste lock could not be taken,
 * another negative error value otherwise.
 */
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
		      kvm_pfn_t pfn, int level, bool wr)
{
	union crste *crstep;
	union pgste pgste;
	union pte *ptep;
	union pte pte;
	int flags, rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	lockdep_assert_held(&sg->parent->children_lock);

	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
	if (rc)
		return rc;
	if (level <= TABLE_TYPE_REGION1) {
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
	}
	if (rc)
		return rc;

	if (!pgste_get_trylock(ptep, &pgste))
		return -EAGAIN;
	/* Keep an existing mapping; otherwise build a fresh PTE for @pfn. */
	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
	pte.h.p = 1;
	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
	/* Get notified when this page changes, to unshadow it. */
	pgste.vsie_notif = 1;
	pgste_set_unlock(ptep, pgste);

	return 0;
}
1039
1040
/* Walk callback: mark the CMMA state of one page dirty in its PGSTE. */
static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val);
	return need_resched() ? next : 0;
}
1047
1048
void gmap_set_cmma_all_dirty(struct gmap *gmap)
1049
{
1050
const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
1051
gfn_t gfn = 0;
1052
1053
do {
1054
scoped_guard(read_lock, &gmap->kvm->mmu_lock)
1055
gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
1056
DAT_WALK_IGN_HOLES, NULL);
1057
cond_resched();
1058
} while (gfn);
1059
}
1060
1061
/*
 * Remove the shadow DAT entry for @r_gfn at the given table @level,
 * notifying VSIE of the whole aligned range it covers and freeing any
 * subordinate tables the cleared entry pointed to.
 */
static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
{
	unsigned long align = PAGE_SIZE;
	gpa_t gaddr = gfn_to_gpa(r_gfn);
	union crste *crstep;
	union crste crste;
	union pte *ptep;

	/* Each level above the page table covers 11 more address bits. */
	if (level > TABLE_TYPE_PAGE_TABLE)
		align = 1UL << (11 * level + _SEGMENT_SHIFT);
	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
		return;
	if (ptep) {
		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
		return;
	}

	crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
	/* Leaf or invalid entries have no lower-level tables to free. */
	if (crste_leaf(crste) || crste.h.i)
		return;
	if (is_pmd(crste))
		dat_free_pt(dereference_pmd(crste.pmd));
	else
		dat_free_level(dereference_crste(crste), true);
}
1088
1089
/*
 * Tear down a shadow gmap: detach it from its parent, notify VSIE of the
 * whole address range, unlink all shadow control blocks using it, and drop
 * the reference held by the parent.
 *
 * Context: Called with sg->parent->children_lock held.
 */
static void gmap_unshadow(struct gmap *sg)
{
	struct gmap_cache *gmap_cache, *next;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
	KVM_BUG_ON(!sg->parent, sg->kvm);

	lockdep_assert_held(&sg->parent->children_lock);

	gmap_remove_child(sg);
	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);

	/* Detach every VSIE shadow control block still using this gmap. */
	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
		gmap_cache->gmap = NULL;
		list_del(&gmap_cache->list);
	}

	gmap_put(sg);
}
1108
1109
/*
 * _gmap_handle_vsie_unshadow_event() - react to a change of a parent page.
 * @parent: the parent guest address space.
 * @gfn: the guest frame of @parent that changed.
 *
 * For every shadow of @parent: if @gfn lies within the guest's top level
 * DAT table (non-real-space shadows only), the whole shadow is removed;
 * otherwise, every shadow table entry that was derived from @gfn, as
 * recorded in the host_to_rmap radix tree, is invalidated.
 *
 * Context: called with parent->children_lock held (gmap_unshadow()
 * asserts it).
 */
void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
{
	struct vsie_rmap *rmap, *rnext, *head;
	struct gmap *sg, *next;
	gfn_t start, end;

	list_for_each_entry_safe(sg, next, &parent->children, list) {
		/* Frames occupied by the guest's top level DAT table. */
		start = sg->guest_asce.rsto;
		end = start + sg->guest_asce.tl + 1;
		if (!sg->guest_asce.r && gfn >= start && gfn < end) {
			/* The guest ASCE table itself changed: drop the shadow. */
			gmap_unshadow(sg);
			continue;
		}
		/* Detach the rmap chain for @gfn, then tear down each entry. */
		scoped_guard(spinlock, &sg->host_to_rmap_lock)
			head = radix_tree_delete(&sg->host_to_rmap, gfn);
		gmap_for_each_rmap_safe(rmap, rnext, head)
			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
	}
}
1128
1129
/**
1130
* gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
1131
* @parent: Pointer to the parent gmap.
1132
* @asce: ASCE for which the shadow table is created.
1133
* @edat_level: Edat level to be used for the shadow translation.
1134
*
1135
* Context: Called with parent->children_lock held.
1136
*
1137
* Return: The pointer to a gmap if a shadow table with the given asce is
1138
* already available, ERR_PTR(-EAGAIN) if another one is just being created,
1139
* otherwise NULL.
1140
*/
1141
static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
1142
{
1143
struct gmap *sg;
1144
1145
lockdep_assert_held(&parent->children_lock);
1146
list_for_each_entry(sg, &parent->children, list) {
1147
if (!gmap_is_shadow_valid(sg, asce, edat_level))
1148
continue;
1149
return sg;
1150
}
1151
return NULL;
1152
}
1153
1154
/* Number of pages occupied by one crst (region/segment) table. */
#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE)

/*
 * State carried through the retry loop that write-protects the pages of a
 * guest's top level DAT table when creating a shadow gmap.
 */
struct gmap_protect_asce_top_level {
	/* Snapshot of kvm->mmu_invalidate_seq, used to detect races. */
	unsigned long seq;
	/* One faulted-in guest page per page of the top level table. */
	struct guest_fault f[CRST_TABLE_PAGES];
};
1159
1160
/*
 * __gmap_protect_asce_top_level() - protect the guest top level table pages.
 * @mc: the mmu cache to allocate from.
 * @sg: the shadow guest address space being created.
 * @context: the faulted-in table pages plus the mmu_invalidate_seq snapshot.
 *
 * With the mmu write lock held, re-checks that no mmu invalidation raced
 * with faulting in the pages, then records a protection rmap for each valid
 * page of the guest's top level table and links @sg into its parent's
 * children list.  On success the faultin array is released.
 *
 * Return: 0 on success, -EAGAIN if the caller needs to retry from scratch,
 * or a negative error code from gmap_protect_rmap().
 */
static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
						struct gmap_protect_asce_top_level *context)
{
	struct gmap *parent;
	int rc, i;

	guard(write_lock)(&sg->kvm->mmu_lock);

	/* An invalidation ran since the snapshot: the pages may be stale. */
	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
		return -EAGAIN;

	/* Bail out if the shadow lost its parent in the meantime. */
	parent = READ_ONCE(sg->parent);
	if (!parent)
		return -EAGAIN;
	scoped_guard(spinlock, &parent->children_lock) {
		/* Re-check under the lock that the parent is unchanged. */
		if (READ_ONCE(sg->parent) != parent)
			return -EAGAIN;
		for (i = 0; i < CRST_TABLE_PAGES; i++) {
			if (!context->f[i].valid)
				continue;
			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
			if (rc)
				return rc;
		}
		gmap_add_child(sg->parent, sg);
	}

	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
	return 0;
}
1191
1192
/*
 * _gmap_protect_asce_top_level() - retry wrapper around the locked protect.
 * @mc: the mmu cache to allocate from.
 * @sg: the shadow guest address space being created.
 * @context: the faulted-in table pages plus the mmu_invalidate_seq snapshot.
 *
 * Performs a cheap lockless staleness check first, then tops up the mmu
 * cache and the radix tree preload before calling the locked worker; the
 * whole sequence is repeated if the worker runs out of preallocated
 * memory (-ENOMEM).
 *
 * Return: 0 on success, -EAGAIN to retry from scratch, or another negative
 * error code.
 */
static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
					       struct gmap_protect_asce_top_level *context)
{
	int rc;

	/* Lockless early-out; the definitive check happens under the lock. */
	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
		return -EAGAIN;
	do {
		/* Pre-fill the allocation caches before taking the mmu lock. */
		rc = kvm_s390_mmu_cache_topup(mc);
		if (rc)
			return rc;
		rc = radix_tree_preload(GFP_KERNEL);
		if (rc)
			return rc;
		rc = __gmap_protect_asce_top_level(mc, sg, context);
		radix_tree_preload_end();
	} while (rc == -ENOMEM);

	return rc;
}
1212
1213
/*
 * gmap_protect_asce_top_level() - fault in and protect the guest top level table.
 * @mc: the mmu cache to allocate from.
 * @sg: the shadow guest address space being created.
 *
 * Snapshots the mmu invalidate sequence, faults in all pages of the guest
 * ASCE's top level table, and then write-protects them under the mmu lock.
 * On any failure the faulted-in pages are released again.
 *
 * Return: 0 on success, a negative error code otherwise (-EFAULT if the
 * guest pages could not all be obtained).
 */
static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
{
	struct gmap_protect_asce_top_level context = {};
	union asce asce = sg->guest_asce;
	int rc;

	KVM_BUG_ON(!is_shadow(sg), sg->kvm);

	context.seq = sg->kvm->mmu_invalidate_seq;
	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
	smp_rmb();

	/* Fault in the asce.dt + 1 pages of the guest top level table. */
	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
	/* A positive rc means some pages were not obtained; map to -EFAULT. */
	if (rc > 0)
		rc = -EFAULT;
	if (!rc)
		rc = _gmap_protect_asce_top_level(mc, sg, &context);
	if (rc)
		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
	return rc;
}
1234
1235
/**
 * gmap_create_shadow() - Create/find a shadow guest address space.
 * @mc: The cache to use to allocate dat tables.
 * @parent: Pointer to the parent gmap.
 * @asce: ASCE for which the shadow table is created.
 * @edat_level: Edat level to be used for the shadow translation.
 *
 * The pages of the top level page table referred by the asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * The returned shadow gmap will be returned with one extra reference.
 *
 * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
 * parent gmap table could not be protected.
 */
struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
				union asce asce, int edat_level)
{
	struct gmap *sg, *new;
	int rc;

	if (WARN_ON(!parent))
		return ERR_PTR(-EINVAL);

	/* Fast path: a matching shadow may exist already. */
	scoped_guard(spinlock, &parent->children_lock) {
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			gmap_get(sg);
			return sg;
		}
	}
	/* Create a new shadow gmap. */
	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->guest_asce = asce;
	new->edat_level = edat_level;
	set_bit(GMAP_FLAG_SHADOW, &new->flags);

	scoped_guard(spinlock, &parent->children_lock) {
		/* Recheck if another CPU created the same shadow. */
		sg = gmap_find_shadow(parent, asce, edat_level);
		if (sg) {
			/* Lost the race: discard ours, return the winner. */
			gmap_put(new);
			gmap_get(sg);
			return sg;
		}
		if (asce.r) {
			/* Only allow one real-space gmap shadow. */
			list_for_each_entry(sg, &parent->children, list) {
				if (sg->guest_asce.r) {
					scoped_guard(write_lock, &parent->kvm->mmu_lock)
						gmap_unshadow(sg);
					break;
				}
			}
			gmap_add_child(parent, new);
			/* Nothing to protect, return right away. */
			gmap_get(new);
			return new;
		}
	}

	gmap_get(new);
	new->parent = parent;
	/* Protect while inserting, protects against invalidation races. */
	rc = gmap_protect_asce_top_level(mc, new);
	if (rc) {
		/* Undo the parent link and drop both references. */
		new->parent = NULL;
		gmap_put(new);
		gmap_put(new);
		return ERR_PTR(rc);
	}
	return new;
}
1313
1314