GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data that was moved. Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

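/*
 * Copy "n" bytes between the buffer "p" and the bio_vec segments of the
 * uio, mapping each page with zfs_kmap_local() for the duration of the
 * copy and advancing the uio as data is moved.
 */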
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

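/*
 * Copy "cnt" bytes at offset "skip" between the buffer "p" and a single
 * bio_vec, mapping the page with zfs_kmap_local() around the copy.
 */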
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Look up what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to be
			 * copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment. Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

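/*
 * Dispatch a bvec-backed uio either to the struct request based copy
 * routine or to the plain bio_vec copy routine.
 */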
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

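/*
 * Copy "n" bytes between the buffer "p" and an iov_iter-backed uio using
 * copy_to_iter()/copy_from_iter(). When "revert" is set, the iov_iter is
 * rewound afterwards so the caller's iterator is left untouched.
 */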
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.
	 * In that case return EFAULT, which is converted to EAGAIN
	 * by the kernel's generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}

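/*
 * Copy "n" bytes between the buffer "p" and the uio, dispatching on the
 * uio segment type (bvec, iov_iter, or kernel iovec).
 */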
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error terminates the operation, as this is only a best-effort attempt
 * to get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages. Direct I/O write pages have already been
		 * pinned, so a fault can never occur for them.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * On Linux 4.18 and newer kernels, iov_iter_fault_in_readable()
		 * can be relied on to fault in user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove(), but doesn't modify the uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request. We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

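/*
 * Walk the Direct I/O write pages and replace any reference to the
 * kernel's ZERO_PAGE() with a newly allocated zero-filled page, marking
 * the replacement so it can be freed later.
 */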
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page will be allocated so the
			 * contents of the page cannot be changed by the user
			 * while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

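/*
 * Release the pages backing a Direct I/O request: unpin them if they were
 * pinned, otherwise drop the references (freeing any marked replacement
 * zero pages), then free the page array itself.
 */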
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
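/*
 * Pin the user pages described by the uio's iov_iter with
 * pin_user_pages_unlocked(), recording them in uio->uio_dio.pages.
 * Returns 0 on success, or a positive errno if the pages could not all
 * be pinned.
 */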
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_iter->iov_offset;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	ASSERT3U(uio->uio_segflg, ==, UIO_ITER);

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
	 * possibly be used here in the future to allow for P2P operations with
	 * user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif

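/*
 * Grab references on the pages backing the uio's iov_iter with
 * iov_iter_get_pages()/iov_iter_get_pages2(), leaving the iterator in its
 * original position once all pages have been collected.
 */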
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * iov_iter_get_pages2() advances the iov_iter on success;
		 * iov_iter_get_pages() does not, so advance it manually here.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif

	}
	ASSERT3U(rollback, ==, uio->uio_resid);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}

/*
 * This function pins user pages. In the event that the user pages are not
 * successfully pinned, an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */