Path: sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
        const struct iovec *iov = uio->uio_iov;
        size_t skip = uio->uio_skip;
        ulong_t cnt;

        ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
        while (n && uio->uio_resid) {
                cnt = MIN(iov->iov_len - skip, n);
                if (rw == UIO_READ)
                        /* Copy from buffer 'p' to the iovec data */
                        memcpy(iov->iov_base + skip, p, cnt);
                else
                        /* Copy from iovec data to buffer 'p' */
                        memcpy(p, iov->iov_base + skip, cnt);
                skip += cnt;
                if (skip == iov->iov_len) {
                        skip = 0;
                        uio->uio_iov = (++iov);
                        uio->uio_iovcnt--;
                }
                uio->uio_skip = skip;
                uio->uio_resid -= cnt;
                uio->uio_loffset += cnt;
                p = (caddr_t)p + cnt;
                n -= cnt;
        }
        return (0);
}
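
/*
 * Illustration only (not compiled): a hypothetical caller draining six
 * bytes into a uio built over two 4-byte kernel iovecs.  This sketch
 * assumes the zfs_uio_iovec_init() initializer from the matching
 * zfs_uio.h header and the exported zfs_uiomove() defined below.
 * UIO_READ copies out of the ZFS buffer into the uio's segments, so
 * after the call the first iovec is fully consumed (uio_iovcnt drops to
 * 1), uio_skip sits 2 bytes into the second iovec, and uio_resid and
 * uio_loffset have each advanced by 6.
 */
#if 0
        char a[4], b[4];
        char src[6] = { 'a', 'b', 'c', 'd', 'e', 'f' };
        struct iovec iovs[2] = {
                { .iov_base = a, .iov_len = sizeof (a) },
                { .iov_base = b, .iov_len = sizeof (b) },
        };
        zfs_uio_t uio;

        zfs_uio_iovec_init(&uio, iovs, 2, 0, UIO_SYSSPACE, sizeof (src), 0);
        VERIFY0(zfs_uiomove(src, sizeof (src), UIO_READ, &uio));
        /* Now: uio.uio_iovcnt == 1, uio.uio_skip == 2, uio.uio_resid == 0 */
#endif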

static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
        const struct bio_vec *bv = uio->uio_bvec;
        size_t skip = uio->uio_skip;
        ulong_t cnt;

        while (n && uio->uio_resid) {
                void *paddr;
                cnt = MIN(bv->bv_len - skip, n);

                paddr = zfs_kmap_local(bv->bv_page);
                if (rw == UIO_READ) {
                        /* Copy from buffer 'p' to the bvec data */
                        memcpy(paddr + bv->bv_offset + skip, p, cnt);
                } else {
                        /* Copy from bvec data to buffer 'p' */
                        memcpy(p, paddr + bv->bv_offset + skip, cnt);
                }
                zfs_kunmap_local(paddr);

                skip += cnt;
                if (skip == bv->bv_len) {
                        skip = 0;
                        uio->uio_bvec = (++bv);
                        uio->uio_iovcnt--;
                }
                uio->uio_skip = skip;
                uio->uio_resid -= cnt;
                uio->uio_loffset += cnt;
                p = (caddr_t)p + cnt;
                n -= cnt;
        }
        return (0);
}

static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
        void *paddr;

        paddr = zfs_kmap_local(bv->bv_page);
        if (rw == UIO_READ) {
                /* Copy from buffer 'p' to the bvec data */
                memcpy(paddr + bv->bv_offset + skip, p, cnt);
        } else {
                /* Copy from bvec data to buffer 'p' */
                memcpy(p, paddr + bv->bv_offset + skip, cnt);
        }
        zfs_kunmap_local(paddr);
}
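
/*
 * Note that bvec pages may live in high memory, which is why both bvec
 * copy paths above bracket each memcpy() with zfs_kmap_local() and
 * zfs_kunmap_local(): the resulting kernel mapping is temporary and only
 * valid in the context of the calling thread.
 */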

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
        struct request *rq = uio->rq;
        struct bio_vec bv;
        struct req_iterator iter;
        size_t this_seg_start;  /* logical offset */
        size_t this_seg_end;    /* logical offset */
        size_t skip_in_seg;
        size_t copy_from_seg;
        size_t orig_loffset;
        int copied = 0;

        /*
         * Get the original logical offset of this entire request (because
         * uio->uio_loffset will be modified over time).
         */
        orig_loffset = io_offset(NULL, rq);
        this_seg_start = orig_loffset;

        rq_for_each_segment(bv, rq, iter) {
                /*
                 * Lookup what the logical offset of the last byte of this
                 * segment is.
                 */
                this_seg_end = this_seg_start + bv.bv_len - 1;

                /*
                 * We only need to operate on segments that have data we're
                 * copying.
                 */
                if (uio->uio_loffset >= this_seg_start &&
                    uio->uio_loffset <= this_seg_end) {
                        /*
                         * Some, or all, of the data in this segment needs
                         * to be copied.
                         */

                        /*
                         * We may not be copying from the first byte in the
                         * segment.  Figure out how many bytes to skip
                         * copying from the beginning of this segment.
                         */
                        skip_in_seg = uio->uio_loffset - this_seg_start;

                        /*
                         * Calculate the total number of bytes from this
                         * segment that we will be copying.
                         */
                        copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

                        /* Copy the bytes */
                        zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
                        p = ((char *)p) + copy_from_seg;

                        n -= copy_from_seg;
                        uio->uio_resid -= copy_from_seg;
                        uio->uio_loffset += copy_from_seg;
                        copied = 1;     /* We copied some data */
                }

                this_seg_start = this_seg_end + 1;
        }

        if (!copied) {
                /* Didn't copy anything */
                uio->uio_resid = 0;
        }
        return (0);
}

static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
        if (uio->rq != NULL)
                return (zfs_uiomove_bvec_rq(p, n, rw, uio));
        return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
        size_t cnt = MIN(n, uio->uio_resid);

        if (rw == UIO_READ)
                cnt = copy_to_iter(p, cnt, uio->uio_iter);
        else
                cnt = copy_from_iter(p, cnt, uio->uio_iter);

        /*
         * When operating on a full pipe no bytes are processed.
         * In which case return EFAULT which is converted to EAGAIN
         * by the kernel's generic_file_splice_read() function.
         */
        if (cnt == 0)
                return (EFAULT);

        /*
         * Revert advancing the uio_iter.  This is set by zfs_uiocopy()
         * to avoid consuming the uio and its iov_iter structure.
         */
        if (revert)
                iov_iter_revert(uio->uio_iter, cnt);

        uio->uio_resid -= cnt;
        uio->uio_loffset += cnt;

        return (0);
}

int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
        if (uio->uio_segflg == UIO_BVEC)
                return (zfs_uiomove_bvec(p, n, rw, uio));
        else if (uio->uio_segflg == UIO_ITER)
                return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
        else
                return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);
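
/*
 * zfs_uiomove() is the common entry point for all three uio flavors:
 * UIO_BVEC uios carry block-layer bio_vec segments (or an entire struct
 * request in the zvol path), UIO_ITER uios wrap a kernel iov_iter, and
 * everything else is treated as an array of kernel iovecs (UIO_SYSSPACE).
 */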

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.  Any
 * error will terminate the prefaulting, as this is only a best-effort
 * attempt to get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
        if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
            (uio->uio_extflg & UIO_DIRECT)) {
                /*
                 * There's never a need to fault in kernel pages or Direct
                 * I/O write pages.  Direct I/O write pages have already
                 * been pinned, so a fault can never occur for them.
                 */
                return (0);
        } else {
                ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
                /*
                 * Since at least the Linux 4.18 kernel,
                 * iov_iter_fault_in_readable() can be relied on to fault
                 * in user pages when referenced.
                 */
                if (iov_iter_fault_in_readable(uio->uio_iter, n))
                        return (EFAULT);
        }

        return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
        zfs_uio_t uio_copy;
        int ret;

        memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

        if (uio->uio_segflg == UIO_BVEC)
                ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
        else if (uio->uio_segflg == UIO_ITER)
                ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
        else
                ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

        *cbytes = uio->uio_resid - uio_copy.uio_resid;

        return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n bytes out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
        if (n > uio->uio_resid)
                return;
        /*
         * When using a uio with a struct request, we simply use
         * uio_loffset to track the next logical byte to copy in the
         * request.  We don't have to do any fancy accounting with
         * uio_bvec/uio_iovcnt since we don't use them.
         */
        if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
                uio->uio_skip += n;
                while (uio->uio_iovcnt &&
                    uio->uio_skip >= uio->uio_bvec->bv_len) {
                        uio->uio_skip -= uio->uio_bvec->bv_len;
                        uio->uio_bvec++;
                        uio->uio_iovcnt--;
                }
        } else if (uio->uio_segflg == UIO_ITER) {
                iov_iter_advance(uio->uio_iter, n);
        } else {
                ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
                uio->uio_skip += n;
                while (uio->uio_iovcnt &&
                    uio->uio_skip >= uio->uio_iov->iov_len) {
                        uio->uio_skip -= uio->uio_iov->iov_len;
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                }
        }

        uio->uio_loffset += n;
        uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
        boolean_t aligned = B_TRUE;

        if (uio->uio_segflg == UIO_SYSSPACE) {
                const struct iovec *iov = uio->uio_iov;
                size_t skip = uio->uio_skip;

                for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
                        uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
                        size_t size = iov->iov_len - skip;
                        if ((addr & (PAGE_SIZE - 1)) ||
                            (size & (PAGE_SIZE - 1))) {
                                aligned = B_FALSE;
                                break;
                        }
                        skip = 0;
                }
        } else if (uio->uio_segflg == UIO_ITER) {
                unsigned long alignment =
                    iov_iter_alignment(uio->uio_iter);
                aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
        } else {
                /* Currently not supported */
                aligned = B_FALSE;
        }

        return (aligned);
}
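
/*
 * Illustration only: with 4 KiB pages, an iovec such as
 * { .iov_base = (void *)0x7f0000001000, .iov_len = 8192 } is page aligned
 * (both the address and the length are multiples of PAGE_SIZE), while
 * { .iov_base = (void *)0x7f0000001010, .iov_len = 8192 } is not, so
 * zfs_uio_page_aligned() returns B_FALSE for the latter.
 */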

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define ZFS_MARKED_PAGE         0x0
#define IS_ZFS_MARKED_PAGE(_p)  0
#define zfs_mark_page(_p)
#define zfs_unmark_page(_p)
#define IS_ZERO_PAGE(_p)        0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define ZFS_MARKED_PAGE         0x5a465350414745 /* ASCII: ZFSPAGE */
#define IS_ZFS_MARKED_PAGE(_p) \
        (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define IS_ZERO_PAGE(_p)        ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
        ASSERT3P(page, !=, NULL);
        get_page(page);
        SetPagePrivate(page);
        set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
        ASSERT3P(page, !=, NULL);
        set_page_private(page, 0UL);
        ClearPagePrivate(page);
        put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
        ASSERT3P(uio->uio_dio.pages, !=, NULL);

        for (long i = 0; i < uio->uio_dio.npages; i++) {
                struct page *p = uio->uio_dio.pages[i];
                lock_page(p);

                if (IS_ZERO_PAGE(p)) {
                        /*
                         * If the user page points to the kernel's
                         * ZERO_PAGE(), a new zero-filled page will just be
                         * allocated so the contents of the page cannot be
                         * changed by the user while a Direct I/O write is
                         * taking place.
                         */
                        gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
                            __GFP_ZERO | GFP_KERNEL;

                        ASSERT0(IS_ZFS_MARKED_PAGE(p));
                        unlock_page(p);
                        put_page(p);

                        uio->uio_dio.pages[i] =
                            __page_cache_alloc(gfp_zero_page);
                        zfs_mark_page(uio->uio_dio.pages[i]);
                } else {
                        unlock_page(p);
                }
        }
}

void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
        ASSERT(uio->uio_extflg & UIO_DIRECT);
        ASSERT3P(uio->uio_dio.pages, !=, NULL);

        if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
                unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
        } else {
                for (long i = 0; i < uio->uio_dio.npages; i++) {
                        struct page *p = uio->uio_dio.pages[i];

                        if (IS_ZFS_MARKED_PAGE(p)) {
                                zfs_unmark_page(p);
                                __free_page(p);
                                continue;
                        }

                        put_page(p);
                }
        }

        vmem_free(uio->uio_dio.pages,
            uio->uio_dio.npages * sizeof (struct page *));
}
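
/*
 * The teardown in zfs_uio_free_dio_pages() mirrors how each page was
 * obtained: pages pinned via pin_user_pages_unlocked() are released with
 * unpin_user_pages(); pages acquired through iov_iter_get_pages() hold a
 * plain reference that is dropped with put_page(); and marked ZERO_PAGE()
 * replacements are unmarked (dropping the extra reference taken by
 * zfs_mark_page()) and then freed with __free_page().
 */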

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
        long res;
        size_t skip = uio->uio_iter->iov_offset;
        size_t len = uio->uio_resid - skip;
        unsigned int gup_flags = 0;
        unsigned long addr;
        unsigned long nr_pages;

        ASSERT3U(uio->uio_segflg, ==, UIO_ITER);

        /*
         * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag.  This flag
         * could possibly be used here in the future to allow for P2P
         * operations with user pages.
         */
        if (rw == UIO_READ)
                gup_flags = FOLL_WRITE;

        if (len == 0)
                return (0);

        uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
        if (iter_is_ubuf(uio->uio_iter)) {
                nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
                addr = (unsigned long)uio->uio_iter->ubuf + skip;
                res = pin_user_pages_unlocked(addr, nr_pages,
                    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
                if (res < 0) {
                        return (SET_ERROR(-res));
                } else if (len != (res * PAGE_SIZE)) {
                        uio->uio_dio.npages += res;
                        return (SET_ERROR(EFAULT));
                }
                uio->uio_dio.npages += res;
                return (0);
        }
#endif
        const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
        for (int i = 0; i < uio->uio_iovcnt; i++) {
                size_t amt = iovp->iov_len - skip;
                if (amt == 0) {
                        iovp++;
                        skip = 0;
                        continue;
                }

                addr = (unsigned long)iovp->iov_base + skip;
                nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
                res = pin_user_pages_unlocked(addr, nr_pages,
                    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
                if (res < 0) {
                        return (SET_ERROR(-res));
                } else if (amt != (res * PAGE_SIZE)) {
                        uio->uio_dio.npages += res;
                        return (SET_ERROR(EFAULT));
                }

                len -= amt;
                uio->uio_dio.npages += res;
                skip = 0;
                iovp++;
        }

        ASSERT0(len);

        return (0);
}
#endif

static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
        size_t start;
        size_t wanted = uio->uio_resid;
        ssize_t rollback = 0;
        ssize_t cnt;
        unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

        while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
                cnt = iov_iter_get_pages2(uio->uio_iter,
                    &uio->uio_dio.pages[uio->uio_dio.npages],
                    wanted, maxpages, &start);
#else
                cnt = iov_iter_get_pages(uio->uio_iter,
                    &uio->uio_dio.pages[uio->uio_dio.npages],
                    wanted, maxpages, &start);
#endif
                if (cnt < 0) {
                        iov_iter_revert(uio->uio_iter, rollback);
                        return (SET_ERROR(-cnt));
                }
                /*
                 * All Direct I/O operations must be page aligned.
                 */
                ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
                uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
                rollback += cnt;
                wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
                /*
                 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does
                 * not advance the iov_iter on success, so advance it here.
                 */
                iov_iter_advance(uio->uio_iter, cnt);
#endif
        }
        ASSERT3U(rollback, ==, uio->uio_resid);
        iov_iter_revert(uio->uio_iter, rollback);

        return (0);
}
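
/*
 * Together the two helpers above cover both page-acquisition paths for
 * Direct I/O: user-backed iov_iters are pinned with
 * pin_user_pages_unlocked() (FOLL_PIN semantics, adding FOLL_WRITE when a
 * read will store into the pages), while kernel-backed iov_iters fall
 * back to iov_iter_get_pages()/iov_iter_get_pages2(), which take plain
 * page references instead.
 */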

/*
 * This function pins user pages.  In the event that the user pages were
 * not successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
        int error = 0;
        long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
        size_t size = npages * sizeof (struct page *);

        if (uio->uio_segflg == UIO_ITER) {
                uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
                if (zfs_user_backed_iov_iter(uio->uio_iter))
                        error = zfs_uio_pin_user_pages(uio, rw);
                else
                        error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
                error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
        } else {
                return (SET_ERROR(EOPNOTSUPP));
        }

        ASSERT3S(uio->uio_dio.npages, >=, 0);

        if (error) {
                if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
                        unpin_user_pages(uio->uio_dio.pages,
                            uio->uio_dio.npages);
#endif
                } else {
                        for (long i = 0; i < uio->uio_dio.npages; i++)
                                put_page(uio->uio_dio.pages[i]);
                }

                vmem_free(uio->uio_dio.pages, size);
                return (error);
        } else {
                ASSERT3S(uio->uio_dio.npages, ==, npages);
        }

        if (rw == UIO_WRITE && !uio->uio_dio.pinned)
                zfs_uio_dio_check_for_zero_page(uio);

        uio->uio_extflg |= UIO_DIRECT;

        return (0);
}

#endif /* _KERNEL */
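
/*
 * Illustration only (not compiled): the Direct I/O page lifecycle implied
 * by the functions above, for a page-aligned UIO_ITER write.  Error
 * handling and the actual I/O are elided.
 */
#if 0
        if (zfs_uio_page_aligned(uio) &&
            zfs_uio_get_dio_pages_alloc(uio, UIO_WRITE) == 0) {
                /* ... issue the write against uio->uio_dio.pages ... */
                zfs_uio_free_dio_pages(uio, UIO_WRITE);
        }
#endif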