GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved. Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

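/*
 * Copy "n" bytes between the buffer "p" and a bvec-backed uio by mapping
 * each backing page with zfs_kmap_local() and advancing uio_bvec/uio_skip
 * as segments are consumed.
 */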
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		size_t offset = bv->bv_offset + skip;
		cnt = MIN(PAGE_SIZE - (offset & ~PAGE_MASK),
		    MIN(bv->bv_len - skip, n));

		paddr = zfs_kmap_local(bv->bv_page + (offset >> PAGE_SHIFT));
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + (offset & ~PAGE_MASK), p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + (offset & ~PAGE_MASK), cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

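/*
 * Copy "cnt" bytes between the buffer "p" and a single bio_vec, starting
 * "skip" bytes into the bvec's data.
 */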
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to be
			 * copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment. Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

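/*
 * Dispatch a bvec-backed uio either to the struct request based copy
 * routine (when uio->rq is set) or to the plain bvec implementation.
 */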
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

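/*
 * Copy "n" bytes between the buffer "p" and an iov_iter-backed uio using
 * copy_to_iter()/copy_from_iter(). When "revert" is set, the iov_iter is
 * rewound afterwards so the caller's iterator is left untouched.
 */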
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.
	 * In that case return EFAULT, which is converted to EAGAIN
	 * by the kernel's generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. The revert flag is set by
	 * zfs_uiocopy() to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}

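/*
 * zfs_uiomove() is the common entry point: it dispatches on uio_segflg to
 * one of the copy routines above. A hypothetical caller sketch (everything
 * other than zfs_uiomove() itself is illustrative only):
 *
 *	error = zfs_uiomove(buf, nbytes, UIO_READ, uio);
 *	if (error != 0)
 *		return (error);
 */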
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error aborts the attempt, as this is only a best effort to get the
 * pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages. Direct I/O write pages have already been
		 * pinned, so a fault can never occur on them.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * Since at least the Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove(), but doesn't modify the uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request. We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

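/*
 * Page-marking helpers for Direct I/O writes. On kernels where ZERO_PAGE()
 * is GPL-only, or on non-LP64 builds, the helpers below compile to no-ops.
 */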
#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

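/*
 * Scan the Direct I/O page array and replace any reference to the kernel's
 * shared ZERO_PAGE() with a private, marked, zero-filled page.
 */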
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page will just be allocated so the
			 * contents of the page cannot be changed by the user
			 * while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

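/*
 * Release the Direct I/O page array: unpin pinned user pages, free any pages
 * allocated to stand in for ZERO_PAGE(), drop the remaining page references,
 * and free the array itself.
 */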
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{

	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

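/*
 * Pin user-backed iov_iter pages with pin_user_pages_unlocked(), handling
 * both ubuf and iovec style iterators. A partially pinned request returns
 * EFAULT so the caller can unwind.
 */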
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_iter->iov_offset;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	ASSERT3U(uio->uio_segflg, ==, UIO_ITER);

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
	 * possibly be used here in the future to allow for P2P operations with
	 * user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif

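/*
 * Acquire page references for an iov_iter via iov_iter_get_pages() or
 * iov_iter_get_pages2(), then rewind the iterator so the uio is left
 * positioned where it started.
 */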
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does not
		 * advance the iov_iter on success, so advance it manually.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif

	}
	ASSERT3U(rollback, ==, uio->uio_resid);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}

/*
 * This function pins user pages. In the event that the user pages were not
 * successfully pinned, an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */