/* GitHub repository: torvalds/linux — path: io_uring/register.c */
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
                           unsigned nr_args)
{
        struct io_uring_probe *p;
        size_t size;
        int i, ret;

        if (nr_args > IORING_OP_LAST)
                nr_args = IORING_OP_LAST;

        size = struct_size(p, ops, nr_args);
        p = memdup_user(arg, size);
        if (IS_ERR(p))
                return PTR_ERR(p);
        ret = -EINVAL;
        if (memchr_inv(p, 0, size))
                goto out;

        p->last_op = IORING_OP_LAST - 1;

        for (i = 0; i < nr_args; i++) {
                p->ops[i].op = i;
                if (io_uring_op_supported(i))
                        p->ops[i].flags = IO_URING_OP_SUPPORTED;
        }
        p->ops_len = i;

        ret = 0;
        if (copy_to_user(arg, p, size))
                ret = -EFAULT;
out:
        kfree(p);
        return ret;
}

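/*
 * Illustrative userspace sketch (not part of this file) of how a probe is
 * typically consumed. It assumes the uapi io_uring header, the usual libc
 * headers, and a ring_fd obtained from io_uring_setup(); the buffer must be
 * zeroed, which calloc() provides:
 *
 *	struct io_uring_probe *p;
 *	size_t len = sizeof(*p) + 256 * sizeof(struct io_uring_probe_op);
 *	int i;
 *
 *	p = calloc(1, len);
 *	if (p && syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PROBE, p, 256) == 0) {
 *		for (i = 0; i < p->ops_len; i++)
 *			if (p->ops[i].flags & IO_URING_OP_SUPPORTED)
 *				printf("opcode %u supported\n", p->ops[i].op);
 *	}
 *	free(p);
 */
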
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
        const struct cred *creds;

        creds = xa_erase(&ctx->personalities, id);
        if (creds) {
                put_cred(creds);
                return 0;
        }

        return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
        const struct cred *creds;
        u32 id;
        int ret;

        creds = get_current_cred();

        ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
                        XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
        if (ret < 0) {
                put_cred(creds);
                return ret;
        }
        return id;
}

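/*
 * Illustrative userspace sketch (not part of this file): the id returned by
 * IORING_REGISTER_PERSONALITY can later be placed in sqe->personality so the
 * request is issued with the credentials captured at registration time. It
 * assumes a ring_fd from io_uring_setup() and an already-prepared sqe:
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	if (id >= 0)
 *		sqe->personality = id;
 */
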
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
                                        struct io_restriction *restrictions)
{
        struct io_uring_restriction *res;
        size_t size;
        int i, ret;

        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
                return -EINVAL;

        size = array_size(nr_args, sizeof(*res));
        if (size == SIZE_MAX)
                return -EOVERFLOW;

        res = memdup_user(arg, size);
        if (IS_ERR(res))
                return PTR_ERR(res);

        ret = -EINVAL;

        for (i = 0; i < nr_args; i++) {
                switch (res[i].opcode) {
                case IORING_RESTRICTION_REGISTER_OP:
                        if (res[i].register_op >= IORING_REGISTER_LAST)
                                goto err;
                        __set_bit(res[i].register_op, restrictions->register_op);
                        break;
                case IORING_RESTRICTION_SQE_OP:
                        if (res[i].sqe_op >= IORING_OP_LAST)
                                goto err;
                        __set_bit(res[i].sqe_op, restrictions->sqe_op);
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
                        restrictions->sqe_flags_allowed = res[i].sqe_flags;
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
                        restrictions->sqe_flags_required = res[i].sqe_flags;
                        break;
                default:
                        goto err;
                }
        }

        ret = 0;

err:
        kfree(res);
        return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
                                           void __user *arg, unsigned int nr_args)
{
        int ret;

        /* Restrictions allowed only if rings started disabled */
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* We allow only a single restrictions registration */
        if (ctx->restrictions.registered)
                return -EBUSY;

        ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
        /* Reset all restrictions if an error happened */
        if (ret != 0)
                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
        else
                ctx->restrictions.registered = true;
        return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
                WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
                /*
                 * Lazy activation attempts would fail if it was polled before
                 * submitter_task is set.
                 */
                if (wq_has_sleeper(&ctx->poll_wq))
                        io_activate_pollwq(ctx);
        }

        if (ctx->restrictions.registered)
                ctx->restricted = 1;

        ctx->flags &= ~IORING_SETUP_R_DISABLED;
        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        return 0;
}

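/*
 * Illustrative userspace sketch (not part of this file): restrictions can only
 * be installed while the ring is still disabled, i.e. it was created with
 * IORING_SETUP_R_DISABLED, and they take effect once the ring is enabled with
 * IORING_REGISTER_ENABLE_RINGS. It assumes a ring_fd from io_uring_setup():
 *
 *	struct io_uring_restriction res[2] = {};
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READ;
 *	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[1].register_op = IORING_REGISTER_BUFFERS;
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */
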
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
                                         cpumask_var_t new_mask)
{
        int ret;

        if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
                ret = io_wq_cpu_affinity(current->io_uring, new_mask);
        } else {
                mutex_unlock(&ctx->uring_lock);
                ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
                mutex_lock(&ctx->uring_lock);
        }

        return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
                                       void __user *arg, unsigned len)
{
        cpumask_var_t new_mask;
        int ret;

        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_clear(new_mask);
        if (len > cpumask_size())
                len = cpumask_size();

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(new_mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8 /* CHAR_BIT */);
        else
#endif
                ret = copy_from_user(new_mask, arg, len);

        if (ret) {
                free_cpumask_var(new_mask);
                return -EFAULT;
        }

        ret = __io_register_iowq_aff(ctx, new_mask);
        free_cpumask_var(new_mask);
        return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
        return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
                                               void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_tctx_node *node;
        struct io_uring_task *tctx = NULL;
        struct io_sq_data *sqd = NULL;
        __u32 new_count[2];
        int i, ret;

        if (copy_from_user(new_count, arg, sizeof(new_count)))
                return -EFAULT;
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i] > INT_MAX)
                        return -EINVAL;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
                        struct task_struct *tsk;

                        /*
                         * Observe the correct sqd->lock -> ctx->uring_lock
                         * ordering. Fine to drop uring_lock here, we hold
                         * a ref to the ctx.
                         */
                        refcount_inc(&sqd->refs);
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
                        mutex_lock(&ctx->uring_lock);
                        tsk = sqpoll_task_locked(sqd);
                        if (tsk)
                                tctx = tsk->io_uring;
                }
        } else {
                tctx = current->io_uring;
        }

        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i])
                        ctx->iowq_limits[i] = new_count[i];
        ctx->iowq_limits_set = true;

        if (tctx && tctx->io_wq) {
                ret = io_wq_max_workers(tctx->io_wq, new_count);
                if (ret)
                        goto err;
        } else {
                memset(new_count, 0, sizeof(new_count));
        }

        if (sqd) {
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }

        if (copy_to_user(arg, new_count, sizeof(new_count)))
                return -EFAULT;

        /* that's it for SQPOLL, only the SQPOLL task creates requests */
        if (sqd)
                return 0;

        /* now propagate the restriction to all registered users */
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                tctx = node->task->io_uring;
                if (WARN_ON_ONCE(!tctx->io_wq))
                        continue;

                for (i = 0; i < ARRAY_SIZE(new_count); i++)
                        new_count[i] = ctx->iowq_limits[i];
                /* ignore errors, it always returns zero anyway */
                (void)io_wq_max_workers(tctx->io_wq, new_count);
        }
        return 0;
err:
        if (sqd) {
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }
        return ret;
}

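/*
 * Illustrative userspace sketch (not part of this file): the argument is a
 * two-element array, element 0 limiting bounded and element 1 limiting
 * unbounded io-wq workers; a zero entry leaves that limit untouched, and the
 * previous values are copied back on return. It assumes a ring_fd from
 * io_uring_setup():
 *
 *	unsigned int limits[2] = { 8, 64 };
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_IOWQ_MAX_WORKERS, limits, 2);
 */
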
static int io_register_clock(struct io_ring_ctx *ctx,
                             struct io_uring_clock_register __user *arg)
{
        struct io_uring_clock_register reg;

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;
        if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
                return -EINVAL;

        switch (reg.clockid) {
        case CLOCK_MONOTONIC:
                ctx->clock_offset = 0;
                break;
        case CLOCK_BOOTTIME:
                ctx->clock_offset = TK_OFFS_BOOT;
                break;
        default:
                return -EINVAL;
        }

        ctx->clockid = reg.clockid;
        return 0;
}

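/*
 * Illustrative userspace sketch (not part of this file): selects the clock
 * used when waiting for completions on this ring; only CLOCK_MONOTONIC and
 * CLOCK_BOOTTIME are accepted, and nr_args must be 0. It assumes a ring_fd
 * from io_uring_setup():
 *
 *	struct io_uring_clock_register reg = { .clockid = CLOCK_BOOTTIME };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
 *		&reg, 0);
 */
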
/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
        struct io_rings *rings;
        struct io_uring_sqe *sq_sqes;

        struct io_mapped_region sq_region;
        struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
                                   struct io_ring_ctx_rings *r)
{
        io_free_region(ctx->user, &r->sq_region);
        io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)              \
        do {                                    \
                (o).field = (ctx)->field;       \
                (ctx)->field = (n).field;       \
        } while (0)

#define RESIZE_FLAGS    (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS      (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
                         IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
                         IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_ctx_config config;
        struct io_uring_region_desc rd;
        struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
        unsigned i, tail, old_head;
        struct io_uring_params *p = &config.p;
        struct io_rings_layout *rl = &config.layout;
        int ret;

        memset(&config, 0, sizeof(config));

        /* limited to DEFER_TASKRUN for now */
        if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
                return -EINVAL;
        if (copy_from_user(p, arg, sizeof(*p)))
                return -EFAULT;
        if (p->flags & ~RESIZE_FLAGS)
                return -EINVAL;

        /* properties that are always inherited */
        p->flags |= (ctx->flags & COPY_FLAGS);

        ret = io_prepare_config(&config);
        if (unlikely(ret))
                return ret;

        memset(&rd, 0, sizeof(rd));
        rd.size = PAGE_ALIGN(rl->rings_size);
        if (p->flags & IORING_SETUP_NO_MMAP) {
                rd.user_addr = p->cq_off.user_addr;
                rd.flags |= IORING_MEM_REGION_TYPE_USER;
        }
        ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
        if (ret)
                return ret;

        n.rings = io_region_get_ptr(&n.ring_region);

        /*
         * At this point n.rings is shared with userspace, just like o.rings
         * is as well. While we don't expect userspace to modify it while
         * a resize is in progress, and it's most likely that userspace will
         * shoot itself in the foot if it does, we can't always assume good
         * intent... Use read/write once helpers from here on to indicate the
         * shared nature of it.
         */
        WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
        WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
        WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
        WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

        if (copy_to_user(arg, p, sizeof(*p))) {
                io_register_free_rings(ctx, &n);
                return -EFAULT;
        }

        memset(&rd, 0, sizeof(rd));
        rd.size = PAGE_ALIGN(rl->sq_size);
        if (p->flags & IORING_SETUP_NO_MMAP) {
                rd.user_addr = p->sq_off.user_addr;
                rd.flags |= IORING_MEM_REGION_TYPE_USER;
        }
        ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
        if (ret) {
                io_register_free_rings(ctx, &n);
                return ret;
        }
        n.sq_sqes = io_region_get_ptr(&n.sq_region);

        /*
         * If using SQPOLL, park the thread
         */
        if (ctx->sq_data) {
                mutex_unlock(&ctx->uring_lock);
                io_sq_thread_park(ctx->sq_data);
                mutex_lock(&ctx->uring_lock);
        }

        /*
         * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
         * any new mmap's on the ring fd. Clear out existing mappings to prevent
         * mmap from seeing them, as we'll unmap them. Any attempt to mmap
         * existing rings beyond this point will fail. Not that it could proceed
         * at this point anyway, as the io_uring mmap side needs to go grab the
         * ctx->mmap_lock as well. Likewise, hold the completion lock over the
         * duration of the actual swap.
         */
        mutex_lock(&ctx->mmap_lock);
        spin_lock(&ctx->completion_lock);
        o.rings = ctx->rings;
        ctx->rings = NULL;
        o.sq_sqes = ctx->sq_sqes;
        ctx->sq_sqes = NULL;

        /*
         * Now copy SQ and CQ entries, if any. If either of the destination
         * rings can't hold what is already there, then fail the operation.
         */
        tail = READ_ONCE(o.rings->sq.tail);
        old_head = READ_ONCE(o.rings->sq.head);
        if (tail - old_head > p->sq_entries)
                goto overflow;
        for (i = old_head; i < tail; i++) {
                unsigned src_head = i & (ctx->sq_entries - 1);
                unsigned dst_head = i & (p->sq_entries - 1);

                n.sq_sqes[dst_head] = o.sq_sqes[src_head];
        }
        WRITE_ONCE(n.rings->sq.head, old_head);
        WRITE_ONCE(n.rings->sq.tail, tail);

        tail = READ_ONCE(o.rings->cq.tail);
        old_head = READ_ONCE(o.rings->cq.head);
        if (tail - old_head > p->cq_entries) {
overflow:
                /* restore old rings, and return -EOVERFLOW via cleanup path */
                ctx->rings = o.rings;
                ctx->sq_sqes = o.sq_sqes;
                to_free = &n;
                ret = -EOVERFLOW;
                goto out;
        }
        for (i = old_head; i < tail; i++) {
                unsigned src_head = i & (ctx->cq_entries - 1);
                unsigned dst_head = i & (p->cq_entries - 1);

                n.rings->cqes[dst_head] = o.rings->cqes[src_head];
        }
        WRITE_ONCE(n.rings->cq.head, old_head);
        WRITE_ONCE(n.rings->cq.tail, tail);
        /* invalidate cached cqe refill */
        ctx->cqe_cached = ctx->cqe_sentinel = NULL;

        WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
        atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
        WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
        WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

        /* all done, store old pointers and assign new ones */
        if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
                ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

        ctx->sq_entries = p->sq_entries;
        ctx->cq_entries = p->cq_entries;

        ctx->rings = n.rings;
        ctx->sq_sqes = n.sq_sqes;
        swap_old(ctx, o, n, ring_region);
        swap_old(ctx, o, n, sq_region);
        to_free = &o;
        ret = 0;
out:
        spin_unlock(&ctx->completion_lock);
        mutex_unlock(&ctx->mmap_lock);
        io_register_free_rings(ctx, to_free);

        if (ctx->sq_data)
                io_sq_thread_unpark(ctx->sq_data);

        return ret;
}

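/*
 * Illustrative userspace sketch (not part of this file): a resize request
 * reuses struct io_uring_params, with only IORING_SETUP_CQSIZE and
 * IORING_SETUP_CLAMP accepted in flags, and requires a ring created with
 * IORING_SETUP_DEFER_TASKRUN. The chosen sizes and the new ring offsets are
 * written back into the same struct for re-mmap()ing. The exact sq_entries /
 * cq_entries usage below is an assumption for illustration; ring_fd is
 * assumed to come from io_uring_setup():
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 1024,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */
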
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
        struct io_uring_mem_region_reg __user *reg_uptr = uarg;
        struct io_uring_mem_region_reg reg;
        struct io_uring_region_desc __user *rd_uptr;
        struct io_uring_region_desc rd;
        struct io_mapped_region region = {};
        int ret;

        if (io_region_is_set(&ctx->param_region))
                return -EBUSY;
        if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
                return -EFAULT;
        rd_uptr = u64_to_user_ptr(reg.region_uptr);
        if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
                return -EFAULT;
        if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
                return -EINVAL;
        if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
                return -EINVAL;

        /*
         * This ensures there are no waiters. Waiters are unlocked and it's
         * hard to synchronise with them, especially if we need to initialise
         * the region.
         */
        if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
            !(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EINVAL;

        ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
        if (ret)
                return ret;
        if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
                io_free_region(ctx->user, &region);
                return -EFAULT;
        }

        if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
                ctx->cq_wait_arg = io_region_get_ptr(&region);
                ctx->cq_wait_size = rd.size;
        }

        io_region_publish(ctx, &region, &ctx->param_region);
        return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We don't quiesce the refs for register anymore and so it can't be
         * dying as we're holding a file ref here.
         */
        if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
                return -ENXIO;

        if (ctx->submitter_task && ctx->submitter_task != current)
                return -EEXIST;

        if (ctx->restricted) {
                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
                if (!test_bit(opcode, ctx->restrictions.register_op))
                        return -EACCES;
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffers_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_register_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 0);
                break;
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 1);
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                ret = io_unregister_personality(ctx, nr_args);
                break;
        case IORING_REGISTER_ENABLE_RINGS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_enable_rings(ctx);
                break;
        case IORING_REGISTER_RESTRICTIONS:
                ret = io_register_restrictions(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_FILES2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_FILES_UPDATE2:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_BUFFERS2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_BUFFERS_UPDATE:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (!arg || !nr_args)
                        break;
                ret = io_register_iowq_aff(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_unregister_iowq_aff(ctx);
                break;
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                ret = -EINVAL;
                if (!arg || nr_args != 2)
                        break;
                ret = io_register_iowq_max_workers(ctx, arg);
                break;
        case IORING_REGISTER_RING_FDS:
                ret = io_ringfd_register(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_RING_FDS:
                ret = io_ringfd_unregister(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_ring(ctx, arg);
                break;
        case IORING_UNREGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_unregister_pbuf_ring(ctx, arg);
                break;
        case IORING_REGISTER_SYNC_CANCEL:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_sync_cancel(ctx, arg);
                break;
        case IORING_REGISTER_FILE_ALLOC_RANGE:
                ret = -EINVAL;
                if (!arg || nr_args)
                        break;
                ret = io_register_file_alloc_range(ctx, arg);
                break;
        case IORING_REGISTER_PBUF_STATUS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_status(ctx, arg);
                break;
        case IORING_REGISTER_NAPI:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_napi(ctx, arg);
                break;
        case IORING_UNREGISTER_NAPI:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_unregister_napi(ctx, arg);
                break;
        case IORING_REGISTER_CLOCK:
                ret = -EINVAL;
                if (!arg || nr_args)
                        break;
                ret = io_register_clock(ctx, arg);
                break;
        case IORING_REGISTER_CLONE_BUFFERS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_clone_buffers(ctx, arg);
                break;
        case IORING_REGISTER_ZCRX_IFQ:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_zcrx_ifq(ctx, arg);
                break;
        case IORING_REGISTER_RESIZE_RINGS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_resize_rings(ctx, arg);
                break;
        case IORING_REGISTER_MEM_REGION:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_mem_region(ctx, arg);
                break;
        case IORING_REGISTER_QUERY:
                ret = io_query(arg, nr_args);
                break;
        case IORING_REGISTER_ZCRX_CTRL:
                ret = io_zcrx_ctrl(ctx, arg, nr_args);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
        struct file *file;

        if (registered) {
                /*
                 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
                 * need only dereference our task private array to find it.
                 */
                struct io_uring_task *tctx = current->io_uring;

                if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
                        return ERR_PTR(-EINVAL);
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                file = tctx->registered_rings[fd];
                if (file)
                        get_file(file);
        } else {
                file = fget(fd);
        }

        if (unlikely(!file))
                return ERR_PTR(-EBADF);
        if (io_is_uring_fops(file))
                return file;
        fput(file);
        return ERR_PTR(-EOPNOTSUPP);
}

static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
        struct io_uring_sqe sqe;

        if (!arg || nr_args != 1)
                return -EINVAL;
        if (copy_from_user(&sqe, arg, sizeof(sqe)))
                return -EFAULT;
        /* no flags supported */
        if (sqe.flags)
                return -EINVAL;
        if (sqe.opcode != IORING_OP_MSG_RING)
                return -EINVAL;

        return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
                                   unsigned int nr_args)
{
        switch (opcode) {
        case IORING_REGISTER_SEND_MSG_RING:
                return io_uring_register_send_msg_ring(arg, nr_args);
        case IORING_REGISTER_QUERY:
                return io_query(arg, nr_args);
        }
        return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                void __user *, arg, unsigned int, nr_args)
{
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        struct file *file;
        bool use_registered_ring;

        use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
        opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

        if (opcode >= IORING_REGISTER_LAST)
                return -EINVAL;

        if (fd == -1)
                return io_uring_register_blind(opcode, arg, nr_args);

        file = io_uring_register_get_file(fd, use_registered_ring);
        if (IS_ERR(file))
                return PTR_ERR(file);
        ctx = file->private_data;

        mutex_lock(&ctx->uring_lock);
        ret = __io_uring_register(ctx, opcode, arg, nr_args);

        trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
                                ctx->buf_table.nr, ret);
        mutex_unlock(&ctx->uring_lock);

        fput(file);
        return ret;
}

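/*
 * Illustrative userspace sketch (not part of this file): once a ring fd has
 * been registered with IORING_REGISTER_RING_FDS, later registration calls can
 * pass the registered index instead of the real fd by OR'ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode. It assumes a ring_fd
 * from io_uring_setup():
 *
 *	struct io_uring_rsrc_update upd = { .offset = -1U, .data = ring_fd };
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_RING_FDS, &upd, 1) == 1)
 *		syscall(__NR_io_uring_register, upd.offset,
 *			IORING_REGISTER_PERSONALITY |
 *			IORING_REGISTER_USE_REGISTERED_RING, NULL, 0);
 */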