Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/sgx/encl.c
51031 views
1
// SPDX-License-Identifier: GPL-2.0
2
/* Copyright(c) 2016-20 Intel Corporation. */
3
4
#include <linux/lockdep.h>
5
#include <linux/mm.h>
6
#include <linux/mman.h>
7
#include <linux/shmem_fs.h>
8
#include <linux/suspend.h>
9
#include <linux/sched/mm.h>
10
#include <asm/sgx.h>
11
#include "encl.h"
12
#include "encls.h"
13
#include "sgx.h"
14
15
/* Forward declaration: defined further down, needed by __sgx_encl_eldu(). */
static int sgx_encl_lookup_backing(struct sgx_encl *encl,
				   unsigned long page_index,
				   struct sgx_backing *backing);

/* Number of PCMD entries that fit into one backing-store page. */
#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))

/*
 * One PCMD page is shared by 32 PCMD entries. Clearing the PCMD_FIRST_MASK
 * bits of a page index yields the page index associated with the first PCMD
 * entry within that PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)
25
26
/**
27
* reclaimer_writing_to_pcmd() - Query if any enclave page associated with
28
* a PCMD page is in process of being reclaimed.
29
* @encl: Enclave to which PCMD page belongs
30
* @start_addr: Address of enclave page using first entry within the PCMD page
31
*
32
* When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
33
* stored. The PCMD data of a reclaimed enclave page contains enough
34
* information for the processor to verify the page at the time
35
* it is loaded back into the Enclave Page Cache (EPC).
36
*
37
* The backing storage to which enclave pages are reclaimed is laid out as
38
* follows:
39
* Encrypted enclave pages:SECS page:PCMD pages
40
*
41
* Each PCMD page contains the PCMD metadata of
42
* PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
43
*
44
* A PCMD page can only be truncated if it is (a) empty, and (b) not in the
45
* process of getting data (and thus soon being non-empty). (b) is tested with
46
* a check if an enclave page sharing the PCMD page is in the process of being
47
* reclaimed.
48
*
49
* The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
50
* intends to reclaim that enclave page - it means that the PCMD page
51
* associated with that enclave page is about to get some data and thus
52
* even if the PCMD page is empty, it should not be truncated.
53
*
54
* Context: Enclave mutex (&sgx_encl->lock) must be held.
55
* Return: 1 if the reclaimer is about to write to the PCMD page
56
* 0 if the reclaimer has no intention to write to the PCMD page
57
*/
58
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
59
unsigned long start_addr)
60
{
61
int reclaimed = 0;
62
int i;
63
64
/*
65
* PCMD_FIRST_MASK is based on number of PCMD entries within
66
* PCMD page being 32.
67
*/
68
BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
69
70
for (i = 0; i < PCMDS_PER_PAGE; i++) {
71
struct sgx_encl_page *entry;
72
unsigned long addr;
73
74
addr = start_addr + i * PAGE_SIZE;
75
76
/*
77
* Stop when reaching the SECS page - it does not
78
* have a page_array entry and its reclaim is
79
* started and completed with enclave mutex held so
80
* it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
81
* flag.
82
*/
83
if (addr == encl->base + encl->size)
84
break;
85
86
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
87
if (!entry)
88
continue;
89
90
/*
91
* VA page slot ID uses same bit as the flag so it is important
92
* to ensure that the page is not already in backing store.
93
*/
94
if (entry->epc_page &&
95
(entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
96
reclaimed = 1;
97
break;
98
}
99
}
100
101
return reclaimed;
102
}
103
104
/*
105
* Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
106
* follow right after the EPC data in the backing storage. In addition to the
107
* visible enclave pages, there's one extra page slot for SECS, before PCMD
108
* structs.
109
*/
110
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
111
unsigned long page_index)
112
{
113
pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
114
115
return epc_end_off + page_index * sizeof(struct sgx_pcmd);
116
}
117
118
/*
119
* Free a page from the backing storage in the given page index.
120
*/
121
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
122
{
123
struct inode *inode = file_inode(encl->backing);
124
125
shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
126
}
127
128
/*
129
* ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
130
* Pages" in the SDM.
131
*/
132
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
133
struct sgx_epc_page *epc_page,
134
struct sgx_epc_page *secs_page)
135
{
136
unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
137
struct sgx_encl *encl = encl_page->encl;
138
pgoff_t page_index, page_pcmd_off;
139
unsigned long pcmd_first_page;
140
struct sgx_pageinfo pginfo;
141
struct sgx_backing b;
142
bool pcmd_page_empty;
143
u8 *pcmd_page;
144
int ret;
145
146
if (secs_page)
147
page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
148
else
149
page_index = PFN_DOWN(encl->size);
150
151
/*
152
* Address of enclave page using the first entry within the PCMD page.
153
*/
154
pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
155
156
page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
157
158
ret = sgx_encl_lookup_backing(encl, page_index, &b);
159
if (ret)
160
return ret;
161
162
pginfo.addr = encl_page->desc & PAGE_MASK;
163
pginfo.contents = (unsigned long)kmap_local_page(b.contents);
164
pcmd_page = kmap_local_page(b.pcmd);
165
pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
166
167
if (secs_page)
168
pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
169
else
170
pginfo.secs = 0;
171
172
ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
173
sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
174
if (ret) {
175
if (encls_failed(ret))
176
ENCLS_WARN(ret, "ELDU");
177
178
ret = -EFAULT;
179
}
180
181
memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
182
set_page_dirty(b.pcmd);
183
184
/*
185
* The area for the PCMD in the page was zeroed above. Check if the
186
* whole page is now empty meaning that all PCMD's have been zeroed:
187
*/
188
pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
189
190
kunmap_local(pcmd_page);
191
kunmap_local((void *)(unsigned long)pginfo.contents);
192
193
get_page(b.pcmd);
194
sgx_encl_put_backing(&b);
195
196
sgx_encl_truncate_backing_page(encl, page_index);
197
198
if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
199
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
200
pcmd_page = kmap_local_page(b.pcmd);
201
if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
202
pr_warn("PCMD page not empty after truncate.\n");
203
kunmap_local(pcmd_page);
204
}
205
206
put_page(b.pcmd);
207
208
return ret;
209
}
210
211
static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
212
struct sgx_epc_page *secs_page)
213
{
214
215
unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
216
struct sgx_encl *encl = encl_page->encl;
217
struct sgx_epc_page *epc_page;
218
int ret;
219
220
epc_page = sgx_alloc_epc_page(encl_page, false);
221
if (IS_ERR(epc_page))
222
return epc_page;
223
224
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
225
if (ret) {
226
sgx_encl_free_epc_page(epc_page);
227
return ERR_PTR(ret);
228
}
229
230
sgx_free_va_slot(encl_page->va_page, va_offset);
231
list_move(&encl_page->va_page->list, &encl->va_pages);
232
encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
233
encl_page->epc_page = epc_page;
234
235
return epc_page;
236
}
237
238
/*
239
* Ensure the SECS page is not swapped out. Must be called with encl->lock
240
* to protect the enclave states including SECS and ensure the SECS page is
241
* not swapped out again while being used.
242
*/
243
static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
244
{
245
struct sgx_epc_page *epc_page = encl->secs.epc_page;
246
247
if (!epc_page)
248
epc_page = sgx_encl_eldu(&encl->secs, NULL);
249
250
return epc_page;
251
}
252
253
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
254
struct sgx_encl_page *entry)
255
{
256
struct sgx_epc_page *epc_page;
257
258
/* Entry successfully located. */
259
if (entry->epc_page) {
260
if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
261
return ERR_PTR(-EBUSY);
262
263
return entry;
264
}
265
266
epc_page = sgx_encl_load_secs(encl);
267
if (IS_ERR(epc_page))
268
return ERR_CAST(epc_page);
269
270
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
271
if (IS_ERR(epc_page))
272
return ERR_CAST(epc_page);
273
274
encl->secs_child_cnt++;
275
sgx_mark_page_reclaimable(entry->epc_page);
276
277
return entry;
278
}
279
280
static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
281
unsigned long addr,
282
vm_flags_t vm_flags)
283
{
284
unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
285
struct sgx_encl_page *entry;
286
287
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
288
if (!entry)
289
return ERR_PTR(-EFAULT);
290
291
/*
292
* Verify that the page has equal or higher build time
293
* permissions than the VMA permissions (i.e. the subset of {VM_READ,
294
* VM_WRITE, VM_EXECUTE} in vma->vm_flags).
295
*/
296
if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
297
return ERR_PTR(-EFAULT);
298
299
return __sgx_encl_load_page(encl, entry);
300
}
301
302
struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
303
unsigned long addr)
304
{
305
struct sgx_encl_page *entry;
306
307
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
308
if (!entry)
309
return ERR_PTR(-EFAULT);
310
311
return __sgx_encl_load_page(encl, entry);
312
}
313
314
/**
315
* sgx_encl_eaug_page() - Dynamically add page to initialized enclave
316
* @vma: VMA obtained from fault info from where page is accessed
317
* @encl: enclave accessing the page
318
* @addr: address that triggered the page fault
319
*
320
* When an initialized enclave accesses a page with no backing EPC page
321
* on a SGX2 system then the EPC can be added dynamically via the SGX2
322
* ENCLS[EAUG] instruction.
323
*
324
* Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
325
* successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
326
*/
327
static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
328
struct sgx_encl *encl, unsigned long addr)
329
{
330
vm_fault_t vmret = VM_FAULT_SIGBUS;
331
struct sgx_pageinfo pginfo = {0};
332
struct sgx_encl_page *encl_page;
333
struct sgx_epc_page *epc_page;
334
struct sgx_va_page *va_page;
335
unsigned long phys_addr;
336
u64 secinfo_flags;
337
int ret;
338
339
if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
340
return VM_FAULT_SIGBUS;
341
342
/*
343
* Ignore internal permission checking for dynamically added pages.
344
* They matter only for data added during the pre-initialization
345
* phase. The enclave decides the permissions by the means of
346
* EACCEPT, EACCEPTCOPY and EMODPE.
347
*/
348
secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
349
encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
350
if (IS_ERR(encl_page))
351
return VM_FAULT_OOM;
352
353
mutex_lock(&encl->lock);
354
355
epc_page = sgx_encl_load_secs(encl);
356
if (IS_ERR(epc_page)) {
357
if (PTR_ERR(epc_page) == -EBUSY)
358
vmret = VM_FAULT_NOPAGE;
359
goto err_out_unlock;
360
}
361
362
epc_page = sgx_alloc_epc_page(encl_page, false);
363
if (IS_ERR(epc_page)) {
364
if (PTR_ERR(epc_page) == -EBUSY)
365
vmret = VM_FAULT_NOPAGE;
366
goto err_out_unlock;
367
}
368
369
va_page = sgx_encl_grow(encl, false);
370
if (IS_ERR(va_page)) {
371
if (PTR_ERR(va_page) == -EBUSY)
372
vmret = VM_FAULT_NOPAGE;
373
goto err_out_epc;
374
}
375
376
if (va_page)
377
list_add(&va_page->list, &encl->va_pages);
378
379
ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
380
encl_page, GFP_KERNEL);
381
/*
382
* If ret == -EBUSY then page was created in another flow while
383
* running without encl->lock
384
*/
385
if (ret)
386
goto err_out_shrink;
387
388
pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
389
pginfo.addr = encl_page->desc & PAGE_MASK;
390
pginfo.metadata = 0;
391
392
ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
393
if (ret)
394
goto err_out;
395
396
encl_page->encl = encl;
397
encl_page->epc_page = epc_page;
398
encl_page->type = SGX_PAGE_TYPE_REG;
399
encl->secs_child_cnt++;
400
401
sgx_mark_page_reclaimable(encl_page->epc_page);
402
403
phys_addr = sgx_get_epc_phys_addr(epc_page);
404
/*
405
* Do not undo everything when creating PTE entry fails - next #PF
406
* would find page ready for a PTE.
407
*/
408
vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
409
if (vmret != VM_FAULT_NOPAGE) {
410
mutex_unlock(&encl->lock);
411
return VM_FAULT_SIGBUS;
412
}
413
mutex_unlock(&encl->lock);
414
return VM_FAULT_NOPAGE;
415
416
err_out:
417
xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
418
419
err_out_shrink:
420
sgx_encl_shrink(encl, va_page);
421
err_out_epc:
422
sgx_encl_free_epc_page(epc_page);
423
err_out_unlock:
424
mutex_unlock(&encl->lock);
425
kfree(encl_page);
426
427
return vmret;
428
}
429
430
static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
431
{
432
unsigned long addr = (unsigned long)vmf->address;
433
struct vm_area_struct *vma = vmf->vma;
434
struct sgx_encl_page *entry;
435
unsigned long phys_addr;
436
struct sgx_encl *encl;
437
vm_fault_t ret;
438
439
encl = vma->vm_private_data;
440
441
/*
442
* It's very unlikely but possible that allocating memory for the
443
* mm_list entry of a forked process failed in sgx_vma_open(). When
444
* this happens, vm_private_data is set to NULL.
445
*/
446
if (unlikely(!encl))
447
return VM_FAULT_SIGBUS;
448
449
/*
450
* The page_array keeps track of all enclave pages, whether they
451
* are swapped out or not. If there is no entry for this page and
452
* the system supports SGX2 then it is possible to dynamically add
453
* a new enclave page. This is only possible for an initialized
454
* enclave that will be checked for right away.
455
*/
456
if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
457
(!xa_load(&encl->page_array, PFN_DOWN(addr))))
458
return sgx_encl_eaug_page(vma, encl, addr);
459
460
mutex_lock(&encl->lock);
461
462
entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
463
if (IS_ERR(entry)) {
464
mutex_unlock(&encl->lock);
465
466
if (PTR_ERR(entry) == -EBUSY)
467
return VM_FAULT_NOPAGE;
468
469
return VM_FAULT_SIGBUS;
470
}
471
472
phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
473
474
ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
475
if (ret != VM_FAULT_NOPAGE) {
476
mutex_unlock(&encl->lock);
477
478
return VM_FAULT_SIGBUS;
479
}
480
481
sgx_encl_test_and_clear_young(vma->vm_mm, entry);
482
mutex_unlock(&encl->lock);
483
484
return VM_FAULT_NOPAGE;
485
}
486
487
static void sgx_vma_open(struct vm_area_struct *vma)
488
{
489
struct sgx_encl *encl = vma->vm_private_data;
490
491
/*
492
* It's possible but unlikely that vm_private_data is NULL. This can
493
* happen in a grandchild of a process, when sgx_encl_mm_add() had
494
* failed to allocate memory in this callback.
495
*/
496
if (unlikely(!encl))
497
return;
498
499
if (sgx_encl_mm_add(encl, vma->vm_mm))
500
vma->vm_private_data = NULL;
501
}
502
503
504
/**
505
* sgx_encl_may_map() - Check if a requested VMA mapping is allowed
506
* @encl: an enclave pointer
507
* @start: lower bound of the address range, inclusive
508
* @end: upper bound of the address range, exclusive
509
* @vm_flags: VMA flags
510
*
511
* Iterate through the enclave pages contained within [@start, @end) to verify
512
* that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
513
* do not contain any permissions that are not contained in the build time
514
* permissions of any of the enclave pages within the given address range.
515
*
516
* An enclave creator must declare the strongest permissions that will be
517
* needed for each enclave page. This ensures that mappings have the identical
518
* or weaker permissions than the earlier declared permissions.
519
*
520
* Return: 0 on success, -EACCES otherwise
521
*/
522
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
523
unsigned long end, vm_flags_t vm_flags)
524
{
525
vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
526
struct sgx_encl_page *page;
527
unsigned long count = 0;
528
int ret = 0;
529
530
XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
531
532
/* Disallow mapping outside enclave's address range. */
533
if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
534
(start < encl->base || end > encl->base + encl->size))
535
return -EACCES;
536
537
/*
538
* Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
539
* conflict with the enclave page permissions.
540
*/
541
if (current->personality & READ_IMPLIES_EXEC)
542
return -EACCES;
543
544
mutex_lock(&encl->lock);
545
xas_lock(&xas);
546
xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
547
if (~page->vm_max_prot_bits & vm_prot_bits) {
548
ret = -EACCES;
549
break;
550
}
551
552
/* Reschedule on every XA_CHECK_SCHED iteration. */
553
if (!(++count % XA_CHECK_SCHED)) {
554
xas_pause(&xas);
555
xas_unlock(&xas);
556
mutex_unlock(&encl->lock);
557
558
cond_resched();
559
560
mutex_lock(&encl->lock);
561
xas_lock(&xas);
562
}
563
}
564
xas_unlock(&xas);
565
mutex_unlock(&encl->lock);
566
567
return ret;
568
}
569
570
static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
571
unsigned long end, unsigned long newflags)
572
{
573
return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
574
}
575
576
static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
577
unsigned long addr, void *data)
578
{
579
unsigned long offset = addr & ~PAGE_MASK;
580
int ret;
581
582
583
ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
584
if (ret)
585
return -EIO;
586
587
return 0;
588
}
589
590
static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
591
unsigned long addr, void *data)
592
{
593
unsigned long offset = addr & ~PAGE_MASK;
594
int ret;
595
596
ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
597
if (ret)
598
return -EIO;
599
600
return 0;
601
}
602
603
/*
604
* Load an enclave page to EPC if required, and take encl->lock.
605
*/
606
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
607
unsigned long addr,
608
vm_flags_t vm_flags)
609
{
610
struct sgx_encl_page *entry;
611
612
for ( ; ; ) {
613
mutex_lock(&encl->lock);
614
615
entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
616
if (PTR_ERR(entry) != -EBUSY)
617
break;
618
619
mutex_unlock(&encl->lock);
620
}
621
622
if (IS_ERR(entry))
623
mutex_unlock(&encl->lock);
624
625
return entry;
626
}
627
628
static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
629
void *buf, int len, int write)
630
{
631
struct sgx_encl *encl = vma->vm_private_data;
632
struct sgx_encl_page *entry = NULL;
633
char data[sizeof(unsigned long)];
634
unsigned long align;
635
int offset;
636
int cnt;
637
int ret = 0;
638
int i;
639
640
/*
641
* If process was forked, VMA is still there but vm_private_data is set
642
* to NULL.
643
*/
644
if (!encl)
645
return -EFAULT;
646
647
if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
648
return -EFAULT;
649
650
for (i = 0; i < len; i += cnt) {
651
entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
652
vma->vm_flags);
653
if (IS_ERR(entry)) {
654
ret = PTR_ERR(entry);
655
break;
656
}
657
658
align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
659
offset = (addr + i) & (sizeof(unsigned long) - 1);
660
cnt = sizeof(unsigned long) - offset;
661
cnt = min(cnt, len - i);
662
663
ret = sgx_encl_debug_read(encl, entry, align, data);
664
if (ret)
665
goto out;
666
667
if (write) {
668
memcpy(data + offset, buf + i, cnt);
669
ret = sgx_encl_debug_write(encl, entry, align, data);
670
if (ret)
671
goto out;
672
} else {
673
memcpy(buf + i, data + offset, cnt);
674
}
675
676
out:
677
mutex_unlock(&encl->lock);
678
679
if (ret)
680
break;
681
}
682
683
return ret < 0 ? ret : i;
684
}
685
686
const struct vm_operations_struct sgx_vm_ops = {
687
.fault = sgx_vma_fault,
688
.mprotect = sgx_vma_mprotect,
689
.open = sgx_vma_open,
690
.access = sgx_vma_access,
691
};
692
693
/**
694
* sgx_encl_release - Destroy an enclave instance
695
* @ref: address of a kref inside &sgx_encl
696
*
697
* Used together with kref_put(). Frees all the resources associated with the
698
* enclave and the instance itself.
699
*/
700
void sgx_encl_release(struct kref *ref)
701
{
702
struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
703
unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
704
struct sgx_va_page *va_page;
705
struct sgx_encl_page *entry;
706
unsigned long count = 0;
707
708
XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
709
710
xas_lock(&xas);
711
xas_for_each(&xas, entry, max_page_index) {
712
if (entry->epc_page) {
713
/*
714
* The page and its radix tree entry cannot be freed
715
* if the page is being held by the reclaimer.
716
*/
717
if (sgx_unmark_page_reclaimable(entry->epc_page))
718
continue;
719
720
sgx_encl_free_epc_page(entry->epc_page);
721
encl->secs_child_cnt--;
722
entry->epc_page = NULL;
723
}
724
725
kfree(entry);
726
/*
727
* Invoke scheduler on every XA_CHECK_SCHED iteration
728
* to prevent soft lockups.
729
*/
730
if (!(++count % XA_CHECK_SCHED)) {
731
xas_pause(&xas);
732
xas_unlock(&xas);
733
734
cond_resched();
735
736
xas_lock(&xas);
737
}
738
}
739
xas_unlock(&xas);
740
741
xa_destroy(&encl->page_array);
742
743
if (!encl->secs_child_cnt && encl->secs.epc_page) {
744
sgx_encl_free_epc_page(encl->secs.epc_page);
745
encl->secs.epc_page = NULL;
746
}
747
748
while (!list_empty(&encl->va_pages)) {
749
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
750
list);
751
list_del(&va_page->list);
752
sgx_encl_free_epc_page(va_page->epc_page);
753
kfree(va_page);
754
}
755
756
if (encl->backing)
757
fput(encl->backing);
758
759
cleanup_srcu_struct(&encl->srcu);
760
761
WARN_ON_ONCE(!list_empty(&encl->mm_list));
762
763
/* Detect EPC page leak's. */
764
WARN_ON_ONCE(encl->secs_child_cnt);
765
WARN_ON_ONCE(encl->secs.epc_page);
766
767
kfree(encl);
768
sgx_dec_usage_count();
769
}
770
771
/*
772
* 'mm' is exiting and no longer needs mmu notifications.
773
*/
774
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
775
struct mm_struct *mm)
776
{
777
struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
778
struct sgx_encl_mm *tmp = NULL;
779
bool found = false;
780
781
/*
782
* The enclave itself can remove encl_mm. Note, objects can't be moved
783
* off an RCU protected list, but deletion is ok.
784
*/
785
spin_lock(&encl_mm->encl->mm_lock);
786
list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
787
if (tmp == encl_mm) {
788
list_del_rcu(&encl_mm->list);
789
found = true;
790
break;
791
}
792
}
793
spin_unlock(&encl_mm->encl->mm_lock);
794
795
if (found) {
796
synchronize_srcu(&encl_mm->encl->srcu);
797
mmu_notifier_put(mn);
798
}
799
}
800
801
static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
802
{
803
struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
804
805
/* 'encl_mm' is going away, put encl_mm->encl reference: */
806
kref_put(&encl_mm->encl->refcount, sgx_encl_release);
807
808
kfree(encl_mm);
809
}
810
811
static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
812
.release = sgx_mmu_notifier_release,
813
.free_notifier = sgx_mmu_notifier_free,
814
};
815
816
static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
817
struct mm_struct *mm)
818
{
819
struct sgx_encl_mm *encl_mm = NULL;
820
struct sgx_encl_mm *tmp;
821
int idx;
822
823
idx = srcu_read_lock(&encl->srcu);
824
825
list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
826
if (tmp->mm == mm) {
827
encl_mm = tmp;
828
break;
829
}
830
}
831
832
srcu_read_unlock(&encl->srcu, idx);
833
834
return encl_mm;
835
}
836
837
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
838
{
839
struct sgx_encl_mm *encl_mm;
840
int ret;
841
842
/*
843
* Even though a single enclave may be mapped into an mm more than once,
844
* each 'mm' only appears once on encl->mm_list. This is guaranteed by
845
* holding the mm's mmap lock for write before an mm can be added or
846
* remove to an encl->mm_list.
847
*/
848
mmap_assert_write_locked(mm);
849
850
/*
851
* It's possible that an entry already exists in the mm_list, because it
852
* is removed only on VFS release or process exit.
853
*/
854
if (sgx_encl_find_mm(encl, mm))
855
return 0;
856
857
encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
858
if (!encl_mm)
859
return -ENOMEM;
860
861
/* Grab a refcount for the encl_mm->encl reference: */
862
kref_get(&encl->refcount);
863
encl_mm->encl = encl;
864
encl_mm->mm = mm;
865
encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
866
867
ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
868
if (ret) {
869
kfree(encl_mm);
870
return ret;
871
}
872
873
spin_lock(&encl->mm_lock);
874
list_add_rcu(&encl_mm->list, &encl->mm_list);
875
/* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
876
smp_wmb();
877
encl->mm_list_version++;
878
spin_unlock(&encl->mm_lock);
879
880
return 0;
881
}
882
883
/**
884
* sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
885
* @encl: the enclave
886
*
887
* Some SGX functions require that no cached linear-to-physical address
888
* mappings are present before they can succeed. For example, ENCLS[EWB]
889
* copies a page from the enclave page cache to regular main memory but
890
* it fails if it cannot ensure that there are no cached
891
* linear-to-physical address mappings referring to the page.
892
*
893
* SGX hardware flushes all cached linear-to-physical mappings on a CPU
894
* when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
895
* Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
896
* address mappings are cleared but coordination with the tracking done within
897
* the SGX hardware is needed to support the SGX functions that depend on this
898
* cache clearing.
899
*
900
* When the ENCLS[ETRACK] function is issued on an enclave the hardware
901
* tracks threads operating inside the enclave at that time. The SGX
902
* hardware tracking require that all the identified threads must have
903
* exited the enclave in order to flush the mappings before a function such
904
* as ENCLS[EWB] will be permitted
905
*
906
* The following flow is used to support SGX functions that require that
907
* no cached linear-to-physical address mappings are present:
908
* 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
909
* 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
910
* accessing the enclave.
911
* 3) Send IPI to identified CPUs, kicking them out of the enclave and
912
* thus flushing all locally cached linear-to-physical address mappings.
913
* 4) Execute SGX function.
914
*
915
* Context: It is required to call this function after ENCLS[ETRACK].
916
* This will ensure that if any new mm appears (racing with
917
* sgx_encl_mm_add()) then the new mm will enter into the
918
* enclave with fresh linear-to-physical address mappings.
919
*
920
* It is required that all IPIs are completed before a new
921
* ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
922
* of the above flow with the enclave's mutex.
923
*
924
* Return: cpumask of CPUs that might be accessing @encl
925
*/
926
const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
927
{
928
cpumask_t *cpumask = &encl->cpumask;
929
struct sgx_encl_mm *encl_mm;
930
int idx;
931
932
cpumask_clear(cpumask);
933
934
idx = srcu_read_lock(&encl->srcu);
935
936
list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
937
if (!mmget_not_zero(encl_mm->mm))
938
continue;
939
940
cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
941
942
mmput_async(encl_mm->mm);
943
}
944
945
srcu_read_unlock(&encl->srcu, idx);
946
947
return cpumask;
948
}
949
950
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
951
pgoff_t index)
952
{
953
struct address_space *mapping = encl->backing->f_mapping;
954
gfp_t gfpmask = mapping_gfp_mask(mapping);
955
956
return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
957
}
958
959
/**
960
* __sgx_encl_get_backing() - Pin the backing storage
961
* @encl: an enclave pointer
962
* @page_index: enclave page index
963
* @backing: data for accessing backing storage for the page
964
*
965
* Pin the backing storage pages for storing the encrypted contents and Paging
966
* Crypto MetaData (PCMD) of an enclave page.
967
*
968
* Return:
969
* 0 on success,
970
* -errno otherwise.
971
*/
972
static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
973
struct sgx_backing *backing)
974
{
975
pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
976
struct page *contents;
977
struct page *pcmd;
978
979
contents = sgx_encl_get_backing_page(encl, page_index);
980
if (IS_ERR(contents))
981
return PTR_ERR(contents);
982
983
pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
984
if (IS_ERR(pcmd)) {
985
put_page(contents);
986
return PTR_ERR(pcmd);
987
}
988
989
backing->contents = contents;
990
backing->pcmd = pcmd;
991
backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
992
993
return 0;
994
}
995
996
/*
997
* When called from ksgxd, returns the mem_cgroup of a struct mm stored
998
* in the enclave's mm_list. When not called from ksgxd, just returns
999
* the mem_cgroup of the current task.
1000
*/
1001
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
1002
{
1003
struct mem_cgroup *memcg = NULL;
1004
struct sgx_encl_mm *encl_mm;
1005
int idx;
1006
1007
/*
1008
* If called from normal task context, return the mem_cgroup
1009
* of the current task's mm. The remainder of the handling is for
1010
* ksgxd.
1011
*/
1012
if (!current_is_ksgxd())
1013
return get_mem_cgroup_from_mm(current->mm);
1014
1015
/*
1016
* Search the enclave's mm_list to find an mm associated with
1017
* this enclave to charge the allocation to.
1018
*/
1019
idx = srcu_read_lock(&encl->srcu);
1020
1021
list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
1022
if (!mmget_not_zero(encl_mm->mm))
1023
continue;
1024
1025
memcg = get_mem_cgroup_from_mm(encl_mm->mm);
1026
1027
mmput_async(encl_mm->mm);
1028
1029
break;
1030
}
1031
1032
srcu_read_unlock(&encl->srcu, idx);
1033
1034
/*
1035
* In the rare case that there isn't an mm associated with
1036
* the enclave, set memcg to the current active mem_cgroup.
1037
* This will be the root mem_cgroup if there is no active
1038
* mem_cgroup.
1039
*/
1040
if (!memcg)
1041
return get_mem_cgroup_from_mm(NULL);
1042
1043
return memcg;
1044
}
1045
1046
/**
 * sgx_encl_alloc_backing() - create a new backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * When called from ksgxd, the active memcg is set from one of the mms in
 * the enclave's mm_list before any backing page is allocated, so that shmem
 * page allocations are charged to the enclave. Creates a backing page for
 * loading data back into an EPC page with ELDU. The reference taken on the
 * new backing page must be dropped with a matching call to
 * sgx_encl_put_backing().
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
			   struct sgx_backing *backing)
{
	struct mem_cgroup *charge_to = sgx_encl_get_mem_cgroup(encl);
	struct mem_cgroup *saved = set_active_memcg(charge_to);
	int err = __sgx_encl_get_backing(encl, page_index, backing);

	/* Restore the previous active memcg and drop our memcg reference. */
	set_active_memcg(saved);
	mem_cgroup_put(charge_to);

	return err;
}
1077
1078
/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:	an enclave pointer
 * @page_index:	enclave page index
 * @backing:	data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 *
 * Unlike sgx_encl_alloc_backing(), no memcg is set up here. The caller is
 * responsible for making sure that a lookup, rather than an allocation, is
 * the appropriate operation: a misplaced lookup causes an allocation which
 * is not accounted for.
 *
 * This function takes a reference on an existing backing page which must be
 * dropped with a corresponding call to sgx_encl_put_backing().
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
				   struct sgx_backing *backing)
{
	int err = __sgx_encl_get_backing(encl, page_index, backing);

	return err;
}
1100
1101
/**
1102
* sgx_encl_put_backing() - Unpin the backing storage
1103
* @backing: data for accessing backing storage for the page
1104
*/
1105
void sgx_encl_put_backing(struct sgx_backing *backing)
1106
{
1107
put_page(backing->pcmd);
1108
put_page(backing->contents);
1109
}
1110
1111
/*
 * Per-PTE callback used by sgx_encl_test_and_clear_young(): reports whether
 * the PTE is marked young and, if so, rewrites it as old. @data carries the
 * mm_struct the PTE belongs to.
 */
static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
					    void *data)
{
	struct mm_struct *mm = data;
	pte_t old_pte;
	int young;

	young = pte_young(*ptep);
	if (!young)
		return young;

	old_pte = pte_mkold(*ptep);
	set_pte_at(mm, addr, ptep, old_pte);

	return young;
}
1125
1126
/**
1127
* sgx_encl_test_and_clear_young() - Test and reset the accessed bit
1128
* @mm: mm_struct that is checked
1129
* @page: enclave page to be tested for recent access
1130
*
1131
* Checks the Access (A) bit from the PTE corresponding to the enclave page and
1132
* clears it.
1133
*
1134
* Return: 1 if the page has been recently accessed and 0 if not.
1135
*/
1136
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
1137
struct sgx_encl_page *page)
1138
{
1139
unsigned long addr = page->desc & PAGE_MASK;
1140
struct sgx_encl *encl = page->encl;
1141
struct vm_area_struct *vma;
1142
int ret;
1143
1144
ret = sgx_encl_find(mm, addr, &vma);
1145
if (ret)
1146
return 0;
1147
1148
if (encl != vma->vm_private_data)
1149
return 0;
1150
1151
ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
1152
sgx_encl_test_and_clear_young_cb, vma->vm_mm);
1153
if (ret < 0)
1154
return 0;
1155
1156
return ret;
1157
}
1158
1159
struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
1160
unsigned long offset,
1161
u64 secinfo_flags)
1162
{
1163
struct sgx_encl_page *encl_page;
1164
unsigned long prot;
1165
1166
encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
1167
if (!encl_page)
1168
return ERR_PTR(-ENOMEM);
1169
1170
encl_page->desc = encl->base + offset;
1171
encl_page->encl = encl;
1172
1173
prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
1174
_calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
1175
_calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
1176
1177
/*
1178
* TCS pages must always RW set for CPU access while the SECINFO
1179
* permissions are *always* zero - the CPU ignores the user provided
1180
* values and silently overwrites them with zero permissions.
1181
*/
1182
if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
1183
prot |= PROT_READ | PROT_WRITE;
1184
1185
/* Calculate maximum of the VM flags for the page. */
1186
encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
1187
1188
return encl_page;
1189
}
1190
1191
/**
1192
* sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
1193
* @encl: the enclave
1194
* @addr: page aligned pointer to single page for which PTEs will be removed
1195
*
1196
* Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
1197
* @addr from each VMA. Ensure that page fault handler is ready to handle
1198
* new mappings of @addr before calling this function.
1199
*/
1200
void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
1201
{
1202
unsigned long mm_list_version;
1203
struct sgx_encl_mm *encl_mm;
1204
struct vm_area_struct *vma;
1205
int idx, ret;
1206
1207
do {
1208
mm_list_version = encl->mm_list_version;
1209
1210
/* Pairs with smp_wmb() in sgx_encl_mm_add(). */
1211
smp_rmb();
1212
1213
idx = srcu_read_lock(&encl->srcu);
1214
1215
list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
1216
if (!mmget_not_zero(encl_mm->mm))
1217
continue;
1218
1219
mmap_read_lock(encl_mm->mm);
1220
1221
ret = sgx_encl_find(encl_mm->mm, addr, &vma);
1222
if (!ret && encl == vma->vm_private_data)
1223
zap_vma_ptes(vma, addr, PAGE_SIZE);
1224
1225
mmap_read_unlock(encl_mm->mm);
1226
1227
mmput_async(encl_mm->mm);
1228
}
1229
1230
srcu_read_unlock(&encl->srcu, idx);
1231
} while (unlikely(encl->mm_list_version != mm_list_version));
1232
}
1233
1234
/**
1235
* sgx_alloc_va_page() - Allocate a Version Array (VA) page
1236
* @reclaim: Reclaim EPC pages directly if none available. Enclave
1237
* mutex should not be held if this is set.
1238
*
1239
* Allocate a free EPC page and convert it to a Version Array (VA) page.
1240
*
1241
* Return:
1242
* a VA page,
1243
* -errno otherwise
1244
*/
1245
struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
1246
{
1247
struct sgx_epc_page *epc_page;
1248
int ret;
1249
1250
epc_page = sgx_alloc_epc_page(NULL, reclaim);
1251
if (IS_ERR(epc_page))
1252
return ERR_CAST(epc_page);
1253
1254
ret = __epa(sgx_get_epc_virt_addr(epc_page));
1255
if (ret) {
1256
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
1257
sgx_encl_free_epc_page(epc_page);
1258
return ERR_PTR(-EFAULT);
1259
}
1260
1261
return epc_page;
1262
}
1263
1264
/**
1265
* sgx_alloc_va_slot - allocate a VA slot
1266
* @va_page: a &struct sgx_va_page instance
1267
*
1268
* Allocates a slot from a &struct sgx_va_page instance.
1269
*
1270
* Return: offset of the slot inside the VA page
1271
*/
1272
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
1273
{
1274
int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
1275
1276
if (slot < SGX_VA_SLOT_COUNT)
1277
set_bit(slot, va_page->slots);
1278
1279
return slot << 3;
1280
}
1281
1282
/**
1283
* sgx_free_va_slot - free a VA slot
1284
* @va_page: a &struct sgx_va_page instance
1285
* @offset: offset of the slot inside the VA page
1286
*
1287
* Frees a slot from a &struct sgx_va_page instance.
1288
*/
1289
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
1290
{
1291
clear_bit(offset >> 3, va_page->slots);
1292
}
1293
1294
/**
1295
* sgx_va_page_full - is the VA page full?
1296
* @va_page: a &struct sgx_va_page instance
1297
*
1298
* Return: true if all slots have been taken
1299
*/
1300
bool sgx_va_page_full(struct sgx_va_page *va_page)
1301
{
1302
int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
1303
1304
return slot == SGX_VA_SLOT_COUNT;
1305
}
1306
1307
/**
1308
* sgx_encl_free_epc_page - free an EPC page assigned to an enclave
1309
* @page: EPC page to be freed
1310
*
1311
* Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
1312
* only upon success, it puts the page back to free page list. Otherwise, it
1313
* gives a WARNING to indicate page is leaked.
1314
*/
1315
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
1316
{
1317
int ret;
1318
1319
WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
1320
1321
ret = __eremove(sgx_get_epc_virt_addr(page));
1322
if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
1323
return;
1324
1325
sgx_free_epc_page(page);
1326
}
1327
1328