GitHub Repository: torvalds/linux
Path: blob/master/io_uring/rsrc.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
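
/*
 * Illustrative note (not from the original sources): the buffer registration
 * path below pairs these helpers around page pinning, e.g.
 *
 *	if (io_account_mem(ctx, nr_pages))
 *		return -ENOMEM;
 *	...pin the pages...
 *	io_unaccount_mem(ctx, nr_pages);	(on unregister or error)
 *
 * __io_account_mem() charges against RLIMIT_MEMLOCK with a cmpxchg loop, so
 * concurrent registrations cannot overshoot the limit.
 */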

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
					  iov->iov_len);
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
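
/*
 * Userspace sketch (illustrative, not part of this file): the
 * IORING_REGISTER_FILES2 / IORING_REGISTER_BUFFERS2 opcodes funnel into
 * io_register_rsrc() above. A sparse 64-slot file table might be set up as
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 64,
 *		.flags	= IORING_RSRC_REGISTER_SPARSE,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES2, &rr, sizeof(rr));
 *
 * With the SPARSE flag set, rr.data must stay zero; individual slots are
 * then filled in later through the update paths above.
 */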

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}
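
/*
 * Illustrative summary (not from the original sources): for
 * IORING_OP_FILES_UPDATE, io_files_update_prep() above consumes
 *
 *	sqe->addr	user array of __s32 file descriptors
 *	sqe->len	number of entries in that array
 *	sqe->off	first fixed-file slot to update, or
 *			IORING_FILE_INDEX_ALLOC to let the kernel pick free
 *			slots and write them back into the array
 *
 * so a submitter only needs to fill those three fields; the result is the
 * number of slots updated, or a negative error.
 */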

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}
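
/*
 * Illustrative note (not from the original sources): each registered slot may
 * carry a u64 tag. A caller passing
 *
 *	u64 tags[2] = { 0, 0xdeadbeef };
 *
 * gets no notification for slot 0, while slot 1 later posts a CQE with
 * user_data == 0xdeadbeef from io_free_rsrc_node() once the old file (or
 * buffer) is fully dropped, which is how userspace learns the resource is no
 * longer in use.
 */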

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this only happens at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
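
/*
 * Worked example (illustrative, assuming 4KiB pages): a 4MB buffer backed by
 * two 2MB huge-page folios arrives here as 1024 PAGE_SIZE pages.
 * io_check_coalesce_buffer() reports nr_folios == 2 with nr_pages_mid == 512,
 * and io_coalesce_buffer() shrinks the page array to the two folio head
 * pages, so the registration ends up with 2 bvec entries at folio_shift == 21
 * instead of 1024 entries at PAGE_SHIFT.
 */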

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
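
/*
 * Driver-side sketch (illustrative, hypothetical callback name): a uring_cmd
 * handler can expose the pages of an in-flight request as a fixed buffer for
 * the submitting task, roughly
 *
 *	static void my_rq_release(void *priv)
 *	{
 *		struct request *rq = priv;
 *		...drop whatever reference keeps the request's pages alive...
 *	}
 *
 *	ret = io_buffer_register_bvec(cmd, rq, my_rq_release, index, issue_flags);
 *	...
 *	ret = io_buffer_unregister_bvec(cmd, index, issue_flags);
 *
 * The release callback is invoked from io_buffer_unmap() once the last
 * reference to the node's io_mapped_ubuf goes away.
 */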

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow when using
	 * the latter parts of a big fixed buffer - it iterates over each
	 * segment manually. We can cheat a bit here for user registered
	 * nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}
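
/*
 * Worked example (illustrative, assuming an uncoalesced page-aligned
 * registration: folio_shift == 12, every bvec 4096 bytes, bv_offset == 0):
 * importing at buf_addr == imu->ubuf + 10000 with len == 100 gives
 *
 *	offset   = 10000 - 4096 = 5904		after skipping the first bvec
 *	seg_skip = 1 + (5904 >> 12) = 2		start at the third bvec (byte 8192)
 *	offset  &= 4095				-> 1808, and 8192 + 1808 == 10000
 *	nr_segs  = (1808 + 100 + 0 + 4095) >> 12 = 1
 *
 * so the iterator lands on the right bvec directly instead of walking every
 * preceding segment, which is the point of avoiding iov_iter_advance().
 */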

inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}
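
/*
 * Userspace sketch (illustrative, not part of this file): cloning the first
 * 8 registered buffers of another ring into slots 0..7 of this one could
 * look roughly like
 *
 *	struct io_uring_clone_buffers buf = {
 *		.src_fd	= src_ring_fd,
 *		.nr	= 8,
 *	};
 *	io_uring_register(ring_fd, IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
 *
 * The clones share the source's io_mapped_ubuf (refcount_inc() in
 * io_clone_buffers() above), so no pages are re-pinned or re-accounted.
 */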

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}
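
/*
 * Illustrative reasoning (not from the original sources): an iovec of length
 * len can span at most (len >> folio_shift) full folios plus one partially
 * used folio at each end, hence the "+ 2" above. E.g. with 4KiB folios a
 * 10000-byte iovec gives 10000 >> 12 = 2, so at most 4 segments, which is
 * exactly what a range starting near the end of a page can touch.
 */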

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}