Path: blob/main/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c

// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
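
/*
 * Overview: a zfs_uio_t on Linux wraps one of three segment types,
 * selected by uio_segflg:
 *
 *	UIO_SYSSPACE	a kernel iovec array (uio_iov / uio_iovcnt)
 *	UIO_BVEC	block-layer bio_vecs, optionally described by a
 *			struct request (uio->rq)
 *	UIO_ITER	a native Linux iov_iter
 *
 * zfs_uiomove() below dispatches to a per-type copy routine based on
 * this flag.
 */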
/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		size_t offset = bv->bv_offset + skip;
		cnt = MIN(PAGE_SIZE - (offset & ~PAGE_MASK),
		    MIN(bv->bv_len - skip, n));

		paddr = zfs_kmap_local(bv->bv_page + (offset >> PAGE_SHIFT));
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + (offset & ~PAGE_MASK), p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + (offset & ~PAGE_MASK), cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
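
/*
 * Note on the arithmetic above: with offset = bv_offset + skip, the
 * shift (offset >> PAGE_SHIFT) selects which page of the bio_vec holds
 * the next byte and (offset & ~PAGE_MASK) is the byte's offset within
 * that page, so each memcpy() is clamped to never cross a boundary of
 * the kmap'd page.  For example, with 4 KiB pages, offset = 5000 maps
 * to page index 1 at in-page offset 904, allowing at most
 * 4096 - 904 = 3192 bytes to be copied in that iteration.
 */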
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs
			 * to be copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment.  Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.
	 * In which case return EFAULT which is converted to EAGAIN
	 * by the kernel's generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}

int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);
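
/*
 * Usage sketch (hypothetical caller, not part of this file): a routine
 * staging data through a kernel buffer could drive the uio like this,
 * with zfs_uiomove() advancing uio_loffset and uio_resid as it copies:
 *
 *	static int
 *	stage_write_chunk(void *buf, size_t len, zfs_uio_t *uio)
 *	{
 *		// UIO_WRITE copies from the uio's segments into buf;
 *		// UIO_READ would copy from buf out to the segments.
 *		return (zfs_uiomove(buf, MIN(len, uio->uio_resid),
 *		    UIO_WRITE, uio));
 *	}
 */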
/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error will terminate the process as this is only a best attempt to get
 * the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages.  Direct I/O write pages have been pinned, so
		 * a fault can never occur for them.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * Since at least the Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify the uio structure.
 * Return in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request. We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p)	((_p) == ZERO_PAGE(0))
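
/*
 * zfs_mark_page() takes an extra reference on the page and stores the
 * magic value above in page_private() so that zfs_uio_free_dio_pages()
 * can later tell pages this code allocated (to stand in for ZERO_PAGE())
 * apart from user pages, and free them with __free_page() rather than
 * a plain put_page().
 */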
static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's
			 * ZERO_PAGE(), a new zero-filled page is allocated
			 * in its place so the contents of the page cannot
			 * be changed by the user while a Direct I/O write
			 * is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}
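
/*
 * Direct I/O pages arrive through one of two kernel interfaces, and
 * uio_dio.pinned records which release function applies: pages obtained
 * with pin_user_pages_unlocked() must be released with
 * unpin_user_pages(), while pages obtained through iov_iter_get_pages()
 * or iov_iter_get_pages2() hold ordinary references dropped with
 * put_page().
 */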
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_iter->iov_offset;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	ASSERT3U(uio->uio_segflg, ==, UIO_ITER);

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag.  This flag could
	 * possibly be used here in the future to allow for P2P operations
	 * with user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif

static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does
		 * not advance the iov_iter on success, so advance it here.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}
	ASSERT3U(rollback, ==, uio->uio_resid);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}
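
/*
 * Compatibility note: iov_iter_get_pages2() on newer kernels advances
 * the iov_iter itself, while the older iov_iter_get_pages() does not,
 * which is why the !HAVE_IOV_ITER_GET_PAGES2 branch above calls
 * iov_iter_advance() by hand.  In both cases the iterator is rewound
 * with iov_iter_revert() once all pages are collected, so the uio still
 * describes the full I/O range afterwards.
 */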
/*
 * This function pins user pages.  In the event that the user pages are
 * not successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */