GitHub Repository: torvalds/linux
Path: blob/master/io_uring/rsrc.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
		struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)

#define IO_CACHED_BVECS_SEGS 32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
			&cur_pages, new_pages));
	return 0;
}

void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
		unsigned long nr_pages)
{
	if (user)
		__io_unaccount_mem(user, nr_pages);

	if (mm_account)
		atomic64_sub(nr_pages, &mm_account->pinned_vm);
}

int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
		unsigned long nr_pages)
{
	int ret;

	if (user) {
		ret = __io_account_mem(user, nr_pages);
		if (ret)
			return ret;
	}

	if (mm_account)
		atomic64_add(nr_pages, &mm_account->pinned_vm);

	return 0;
}

int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
	unsigned long tmp, base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);

	/* arbitrary limit, but we need something */
	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (check_add_overflow(base, acct_len, &tmp))
		return -EOVERFLOW;
	return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;

	return io_validate_user_buf_range((unsigned long)iov->iov_base,
			iov->iov_len);
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++) {
		struct folio *folio = page_folio(imu->bvec[i].bv_page);

		unpin_user_folio(folio, 1);
	}
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
		int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (unlikely(refcount_read(&imu->refs) > 1)) {
		if (!refcount_dec_and_test(&imu->refs))
			return;
	}

	if (imu->acct_pages)
		io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
			IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
			node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
			imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
		struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
			GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

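/*
 * Apply an IORING_REGISTER_FILES_UPDATE to the fixed file table: for each
 * entry, read the new fd (and optional tag), drop whatever node currently
 * occupies that slot, and install the new file unless the fd is -1 or the
 * SKIP sentinel.
 */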
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
		struct io_uring_rsrc_update2 *up,
		unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
		struct io_uring_rsrc_update2 *up,
		unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
		struct io_uring_rsrc_update2 *up,
		unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
		unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
		unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
		unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
				rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
				rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
		unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (get_user(fd, &fds[done])) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
				IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (put_user(ret, &fds[done])) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
				&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
		unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just a registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we don't
 * match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
		int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
		int nr_pages, struct io_mapped_ubuf *imu,
		struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

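/*
 * Collapse the pinned page array down to one head page per folio. All but
 * one pin is dropped for each folio, and the remaining entries then describe
 * whole folios rather than individual pages.
 */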
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
		struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	unsigned nr_pages_left = *nr_pages;
	unsigned nr_folios = data->nr_folios;
	unsigned i, j;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
	if (!new_array)
		return false;

	for (i = 0, j = 0; i < nr_folios; i++) {
		struct page *p = compound_head(page_array[j]);
		struct folio *folio = page_folio(p);
		unsigned int nr;

		WARN_ON_ONCE(i > 0 && p != page_array[j]);

		nr = i ? data->nr_pages_mid : data->nr_pages_head;
		nr = min(nr, nr_pages_left);
		/* Drop all but one ref, the entire folio will remain pinned. */
		if (nr > 1)
			unpin_user_folio(folio, nr - 1);
		j += nr;
		nr_pages_left -= nr;
		new_array[i] = p;
	}

	WARN_ON_ONCE(j != *nr_pages);

	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
		struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);
	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

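/*
 * Register a single user buffer: pin its pages, coalesce them into
 * folio-sized bvecs where possible, account the pinned memory, and hang the
 * resulting io_mapped_ubuf off a freshly allocated rsrc node.
 */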
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
		struct iovec *iov,
		struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			&nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret)
		goto done;

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);

	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
	if (coalesced)
		off += data.first_folio_page_idx << PAGE_SHIFT;

	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		if (pages) {
			for (i = 0; i < nr_pages; i++)
				unpin_user_folio(page_folio(pages[i]), 1);
		}
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
		unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
		void (*release)(void *), unsigned int index,
		unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv;
	unsigned int nr_bvecs = 0;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	/*
	 * blk_rq_nr_phys_segments() may overestimate the number of bvecs
	 * but avoids needing to iterate over the bvecs
	 */
	imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	rq_for_each_bvec(bv, rq, rq_iter)
		imu->bvec[nr_bvecs++] = bv;
	imu->nr_bvecs = nr_bvecs;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
		unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

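/*
 * Check that [buf_addr, buf_addr + len) lies entirely inside the registered
 * buffer and does not exceed MAX_RW_COUNT.
 */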
static int validate_fixed_range(u64 buf_addr, size_t len,
		const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

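/*
 * Set up a bvec iterator over a kernel-registered (is_kbuf) buffer. If the
 * request covers less than the full buffer, trim nr_segs so the iterator
 * stops at the last bvec that is actually used.
 */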
static int io_import_kbuf(int ddir, struct iov_iter *iter,
		struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		len += iter->iov_offset;
		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
		struct io_mapped_ubuf *imu,
		u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
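	/*
	 * Worked example, assuming 4KiB folios (folio_shift == 12) and a
	 * first bvec with bv_len == 4096: for offset == 10000, the first
	 * bvec is skipped (offset becomes 5904), seg_skip == 1 + (5904 >> 12)
	 * == 2, and the remaining in-folio offset is 5904 & 4095 == 1808.
	 */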
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

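/*
 * Look up the registered buffer node for req->buf_index, take a reference
 * on it and cache it in req->buf_node so later imports can skip the table
 * lookup.
 */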
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
		unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;
	req->flags |= REQ_F_BUF_NODE;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node) {
		node->refs++;
		req->buf_node = node;
		io_ring_submit_unlock(ctx, issue_flags);
		return node;
	}
	req->flags &= ~REQ_F_BUF_NODE;
	io_ring_submit_unlock(ctx, issue_flags);
	return NULL;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		u64 buf_addr, size_t len, int ddir,
		unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
		struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		return -ENXIO;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
		return -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;
	if (nbufs > IORING_MAX_REG_BUFFERS)
		return -EINVAL;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Copy original dst nodes from before the cloned range */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				io_rsrc_data_free(ctx, &data);
				return -ENOMEM;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/* Copy original dst nodes from after the cloned range */
	for (i = nbufs; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

		if (node) {
			data.nodes[i] = node;
			node->refs++;
		}
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);

		if (src_ctx->submitter_task &&
		    src_ctx->submitter_task != current) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

out:
	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

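/*
 * Translate user iovecs that point into a registered user buffer into
 * bio_vecs, validating each range against the buffer and splitting at folio
 * boundaries.
 */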
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
		struct io_mapped_ubuf *imu,
		struct iovec *iovec, unsigned nr_iovs,
		struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		offset = buf_addr - imu->ubuf;
		/*
		 * Only the first bvec can have non zero bv_offset, account it
		 * here and work with full folios below.
		 */
		offset += imu->bvec[0].bv_offset;

		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
					folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
					src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
		struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++) {
		max_segs += (iov[i].iov_len >> shift) + 2;
		if (max_segs > INT_MAX)
			return -EOVERFLOW;
	}
	return max_segs;
}

static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
		struct io_mapped_ubuf *imu,
		struct iovec *iovec, unsigned nr_iovs,
		struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
		const struct io_mapped_ubuf *imu,
		unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
		struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
				&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

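/*
 * Import a registered buffer for a vectored request: size the bvec array
 * (reallocating the iou_vec if it is too small), then fill it from either
 * the kernel bvecs or the pinned user pages.
 */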
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		struct io_kiocb *req, struct iou_vec *vec,
		unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		int ret = io_estimate_bvec_size(iov, nr_iovs, imu);

		if (ret < 0)
			return ret;
		nr_segs = ret;
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}