#include <sys/types.h>
#include <sys/param.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/unistd.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vnode_pager.h>
#include "fuse.h"
#include "fuse_file.h"
#include "fuse_node.h"
#include "fuse_internal.h"
#include "fuse_ipc.h"
#include "fuse_io.h"
#define B_FUSEFS_WRITE_CACHE B_FS_FLAG1
SDT_PROVIDER_DECLARE(fusefs);
SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*");
SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int");
SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*");
SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int",
"struct buf*");
int
fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag,
struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid)
{
struct buf *bp;
struct mount *mp;
struct fuse_data *data;
daddr_t lbn, nextlbn;
int bcount, nextsize;
int err, n = 0, on = 0, seqcount;
off_t filesize;
const int biosize = fuse_iosize(vp);
mp = vnode_mount(vp);
data = fuse_get_mpdata(mp);
if (uio->uio_offset < 0)
return (EINVAL);
seqcount = ioflag >> IO_SEQSHIFT;
err = fuse_vnode_size(vp, &filesize, cred, curthread);
if (err)
		return (err);
for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if (fuse_isdeadfs(vp)) {
err = ENXIO;
break;
}
if (filesize - uio->uio_offset <= 0)
break;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
if ((off_t)lbn * biosize >= filesize) {
bcount = 0;
} else if ((off_t)(lbn + 1) * biosize > filesize) {
			bcount = filesize - (off_t)lbn * biosize;
} else {
bcount = biosize;
}
nextlbn = lbn + 1;
nextsize = MIN(biosize, filesize - nextlbn * biosize);
SDT_PROBE4(fusefs, , io, read_bio_backend_start,
biosize, (int)lbn, on, bcount);
if (bcount < biosize) {
err = bread(vp, lbn, bcount, NOCRED, &bp);
} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
long totread = uio->uio_resid + on;
seqcount = MIN(seqcount,
data->max_readahead_blocks + 1);
err = cluster_read(vp, filesize, lbn, bcount, NOCRED,
totread, seqcount, 0, &bp);
} else if (seqcount > 1 && data->max_readahead_blocks >= 1) {
err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1,
NOCRED, &bp);
} else {
err = bread(vp, lbn, bcount, NOCRED, &bp);
}
if (err) {
brelse(bp);
bp = NULL;
break;
}
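		/*
		 * Copy out whatever the buffer holds beyond the caller's
		 * offset within this block, capped by the remaining request.
		 */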
n = 0;
if (on < bcount - bp->b_resid)
n = MIN((unsigned)(bcount - bp->b_resid - on),
uio->uio_resid);
if (n > 0) {
SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp);
err = uiomove(bp->b_data + on, n, uio);
}
		/* Remember the residual before the buffer is released. */
		long short_resid = bp->b_resid;
		vfs_bio_brelse(bp, ioflag);
		SDT_PROBE4(fusefs, , io, read_bio_backend_end, err,
		    uio->uio_resid, n, bp);
		if (short_resid > 0) {
			/* A short read from the server indicates EOF. */
			break;
		}
}
return (err);
}
SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start,
"struct fuse_read_in*");
SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete,
"struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*");
int
fuse_read_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh)
{
struct fuse_data *data;
struct fuse_dispatcher fdi;
struct fuse_read_in *fri;
int err = 0;
data = fuse_get_mpdata(vp->v_mount);
if (uio->uio_resid == 0)
return (0);
fdisp_init(&fdi, 0);
while (uio->uio_resid > 0) {
fdi.iosize = sizeof(*fri);
fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred);
fri = fdi.indata;
fri->fh = fufh->fh_id;
fri->offset = uio->uio_offset;
		fri->size = MIN(uio->uio_resid, data->max_read);
if (fuse_libabi_geq(data, 7, 9)) {
fri->read_flags = 0;
fri->flags = fufh_type_2_fflags(fufh->fufh_type);
}
SDT_PROBE1(fusefs, , io, read_directbackend_start, fri);
if ((err = fdisp_wait_answ(&fdi)))
goto out;
SDT_PROBE3(fusefs, , io, read_directbackend_complete,
&fdi, fri, uio);
if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio)))
break;
if (fdi.iosize < fri->size) {
break;
}
}
out:
fdisp_destroy(&fdi);
return (err);
}
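
/*
 * Write directly to the FUSE server in FUSE_WRITE chunks of at most
 * max_write bytes.  "pages" indicates that the data originates in the
 * buffer/page cache and influences whether FUSE_WRITE_CACHE is set.
 */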
int
fuse_write_directbackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize,
int ioflag, bool pages)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_data *data;
struct fuse_write_in *fwi;
struct fuse_write_out *fwo;
struct fuse_dispatcher fdi;
size_t chunksize;
ssize_t r;
void *fwi_data;
off_t as_written_offset;
int diff;
int err = 0;
bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO;
bool wrote_anything = false;
uint32_t write_flags;
data = fuse_get_mpdata(vp->v_mount);
write_flags = !pages && (
(ioflag & IO_DIRECT) ||
!fsess_opt_datacache(vnode_mount(vp)) ||
!fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE;
if (uio->uio_resid == 0)
return (0);
if (ioflag & IO_APPEND)
uio_setoffset(uio, filesize);
err = vn_rlimit_fsizex(vp, uio, 0, &r, uio->uio_td);
if (err != 0) {
vn_rlimit_fsizex_res(uio, r);
return (err);
}
fdisp_init(&fdi, 0);
while (uio->uio_resid > 0) {
size_t sizeof_fwi;
if (fuse_libabi_geq(data, 7, 9)) {
sizeof_fwi = sizeof(*fwi);
} else {
sizeof_fwi = FUSE_COMPAT_WRITE_IN_SIZE;
}
chunksize = MIN(uio->uio_resid, data->max_write);
fdi.iosize = sizeof_fwi + chunksize;
fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred);
fwi = fdi.indata;
fwi->fh = fufh->fh_id;
fwi->offset = uio->uio_offset;
fwi->size = chunksize;
fwi->write_flags = write_flags;
if (fuse_libabi_geq(data, 7, 9)) {
fwi->flags = fufh_type_2_fflags(fufh->fufh_type);
}
fwi_data = (char *)fdi.indata + sizeof_fwi;
if ((err = uiomove(fwi_data, chunksize, uio)))
break;
retry:
err = fdisp_wait_answ(&fdi);
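		/*
		 * If the request was interrupted before the daemon processed
		 * it, rewind the uio so the caller sees an accurate count of
		 * bytes written.
		 */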
if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) {
uio->uio_resid += fwi->size;
uio->uio_offset -= fwi->size;
if (err == ERESTART)
err = EINTR;
break;
} else if (err) {
break;
} else {
wrote_anything = true;
}
fwo = ((struct fuse_write_out *)fdi.answ);
if (fwo->size > fwi->size) {
fuse_warn(data, FSESS_WARN_WROTE_LONG,
"wrote more data than we provided it.");
fvdat->flag &= ~FN_SIZECHANGE;
fuse_vnode_clear_attr_cache(vp);
err = EINVAL;
break;
}
diff = fwi->size - fwo->size;
as_written_offset = uio->uio_offset - diff;
if (as_written_offset - diff > filesize) {
fuse_vnode_setsize(vp, as_written_offset, false);
getnanouptime(&fvdat->last_local_modify);
}
if (as_written_offset - diff >= filesize)
fvdat->flag &= ~FN_SIZECHANGE;
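		/*
		 * Handle a short write: warn unless the handle uses
		 * FOPEN_DIRECT_IO, then either give the unwritten bytes back
		 * to the caller (IO_DIRECT) or shift the remainder to the
		 * front of the request and resend it.
		 */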
if (diff > 0) {
if (!direct_io) {
fuse_warn(data, FSESS_WARN_SHORT_WRITE,
"short writes are only allowed with "
"direct_io.");
}
if (ioflag & IO_DIRECT) {
uio->uio_resid += diff;
uio->uio_offset -= diff;
break;
} else {
fdi.iosize = sizeof_fwi + diff;
fdisp_refresh_vp(&fdi, FUSE_WRITE, vp,
uio->uio_td, cred);
fwi = fdi.indata;
MPASS2(fwi == fdi.indata, "FUSE dispatcher "
"reallocated despite no increase in "
"size?");
				void *src = (char *)fwi_data + fwo->size;
memmove(fwi_data, src, diff);
fwi->fh = fufh->fh_id;
fwi->offset = as_written_offset;
fwi->size = diff;
fwi->write_flags = write_flags;
goto retry;
}
}
}
fdisp_destroy(&fdi);
if (wrote_anything)
fuse_vnode_undirty_cached_timestamps(vp, false);
vn_rlimit_fsizex_res(uio, r);
return (err);
}
SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int",
"struct uio*", "int", "bool");
SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int");
SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*");
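
/*
 * Write a range of a regular file through the buffer cache, doing
 * read-modify-write on partially valid blocks and choosing among
 * synchronous, asynchronous, clustered, and delayed writes based on the
 * I/O flags and memory pressure.
 */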
int
fuse_write_biobackend(struct vnode *vp, struct uio *uio,
struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct buf *bp;
daddr_t lbn;
off_t filesize;
ssize_t r;
int bcount;
int n, on, seqcount, err = 0;
const int biosize = fuse_iosize(vp);
seqcount = ioflag >> IO_SEQSHIFT;
KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode"));
if (vp->v_type != VREG)
return (EIO);
if (uio->uio_offset < 0)
return (EINVAL);
if (uio->uio_resid == 0)
return (0);
err = fuse_vnode_size(vp, &filesize, cred, curthread);
if (err)
		return (err);
if (ioflag & IO_APPEND)
uio_setoffset(uio, filesize);
err = vn_rlimit_fsizex(vp, uio, 0, &r, uio->uio_td);
if (err != 0) {
vn_rlimit_fsizex_res(uio, r);
return (err);
}
do {
bool direct_append, extending;
if (fuse_isdeadfs(vp)) {
err = ENXIO;
break;
}
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
n = MIN((unsigned)(biosize - on), uio->uio_resid);
again:
direct_append = uio->uio_offset == filesize && n;
if (uio->uio_offset + n < filesize) {
extending = false;
if ((off_t)(lbn + 1) * biosize < filesize) {
bcount = biosize;
} else {
bcount = filesize - (off_t)lbn * biosize;
}
} else {
extending = true;
bcount = on + n;
}
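		/*
		 * When appending exactly at EOF the old block contents don't
		 * matter, so grow the buffer in place with allocbuf() rather
		 * than reading it back from the server.
		 */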
if (direct_append) {
bp = getblk(vp, lbn, on, PCATCH, 0, 0);
if (bp != NULL) {
uint32_t save = bp->b_flags & B_CACHE;
allocbuf(bp, bcount);
bp->b_flags |= save;
}
} else {
bp = getblk(vp, lbn, bcount, PCATCH, 0, 0);
}
if (!bp) {
err = EINTR;
break;
}
if (extending) {
err = fuse_vnode_setsize(vp, uio->uio_offset + n, false);
filesize = uio->uio_offset + n;
getnanouptime(&fvdat->last_local_modify);
fvdat->flag |= FN_SIZECHANGE;
if (err) {
brelse(bp);
break;
}
}
SDT_PROBE6(fusefs, , io, write_biobackend_start,
lbn, on, n, uio, bcount, direct_append);
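		/*
		 * A write covering the whole buffer doesn't need the old
		 * contents; mark the buffer valid so no read is triggered.
		 */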
if (on == 0 && n == bcount) {
bp->b_flags |= B_CACHE;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
}
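		/*
		 * If the block isn't cached, read it in first so the bytes
		 * surrounding the new data survive (read-modify-write).
		 */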
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_iocmd = BIO_READ;
vfs_busy_pages(bp, 0);
fuse_io_strategy(vp, bp);
if ((err = bp->b_error)) {
brelse(bp);
break;
}
if (bp->b_resid > 0) {
SDT_PROBE2(fusefs, , io, trace, 1,
"Short read during a RMW");
brelse(bp);
err = fuse_vnode_size(vp, &filesize, cred,
curthread);
if (err)
break;
else
goto again;
}
}
if (bp->b_wcred == NOCRED)
bp->b_wcred = crhold(cred);
if (bp->b_dirtyend > bcount) {
SDT_PROBE2(fusefs, , io, write_biobackend_append_race,
(long)bp->b_blkno * biosize,
bp->b_dirtyend - bcount);
bp->b_dirtyend = bcount;
}
if (bp->b_dirtyoff >= bp->b_dirtyend)
bp->b_dirtyoff = bp->b_dirtyend = 0;
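		/*
		 * A struct buf can only track a single dirty region.  If the
		 * new write isn't contiguous with the existing one, push the
		 * buffer out first and start over.
		 */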
if (bp->b_dirtyend > 0 &&
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp);
bwrite(bp);
if (bp->b_error == EINTR) {
err = EINTR;
break;
}
goto again;
}
err = uiomove((char *)bp->b_data + on, n, uio);
if (err) {
bp->b_ioflags |= BIO_ERROR;
bp->b_error = err;
brelse(bp);
break;
}
if (n) {
if (bp->b_dirtyend > 0) {
bp->b_dirtyoff = MIN(on, bp->b_dirtyoff);
bp->b_dirtyend = MAX((on + n), bp->b_dirtyend);
} else {
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
vfs_bio_set_valid(bp, on, n);
}
vfs_bio_set_flags(bp, ioflag);
bp->b_flags |= B_FUSEFS_WRITE_CACHE;
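		/*
		 * Issue the write: synchronously for IO_SYNC, asynchronously
		 * under memory or dirty-buffer pressure, clustered for full
		 * blocks when the mount allows it, and delayed otherwise.
		 */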
if (ioflag & IO_SYNC) {
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp);
if (!(ioflag & IO_VMIO))
bp->b_flags &= ~B_FUSEFS_WRITE_CACHE;
err = bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
(ioflag & IO_ASYNC)) {
bp->b_flags |= B_CLUSTEROK;
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp);
bawrite(bp);
} else if (on == 0 && n == bcount) {
if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
bp->b_flags |= B_CLUSTEROK;
SDT_PROBE2(fusefs, , io, write_biobackend_issue,
4, bp);
cluster_write(vp, &fvdat->clusterw, bp,
filesize, seqcount, 0);
} else {
SDT_PROBE2(fusefs, , io, write_biobackend_issue,
5, bp);
bawrite(bp);
}
} else if (ioflag & IO_DIRECT) {
bp->b_flags |= B_CLUSTEROK;
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp);
bawrite(bp);
} else {
bp->b_flags &= ~B_CLUSTEROK;
SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp);
bdwrite(bp);
}
if (err)
break;
} while (uio->uio_resid > 0 && n > 0);
vn_rlimit_fsizex_res(uio, r);
return (err);
}
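
/*
 * Buffer-cache strategy routine: translate a BIO_READ or BIO_WRITE on a
 * buffer into direct FUSE_READ/FUSE_WRITE traffic with the server.
 */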
int
fuse_io_strategy(struct vnode *vp, struct buf *bp)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
struct fuse_filehandle *fufh;
struct ucred *cred;
struct uio *uiop;
struct uio uio;
struct iovec io;
off_t filesize;
int error = 0;
int fflag;
pid_t pid = 0;
const int biosize = fuse_iosize(vp);
MPASS(vp->v_type == VREG || vp->v_type == VDIR);
MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE);
fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE;
cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred;
error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
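	/*
	 * No read handle is available; this can happen when a cached file
	 * that was opened write-only needs a read-modify-write, so fall back
	 * to a write handle.
	 */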
if (bp->b_iocmd == BIO_READ && error == EBADF) {
error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid);
}
if (error) {
printf("FUSE: strategy: filehandles are closed\n");
bp->b_ioflags |= BIO_ERROR;
bp->b_error = error;
bufdone(bp);
return (error);
}
uiop = &uio;
uiop->uio_iov = &io;
uiop->uio_iovcnt = 1;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = curthread;
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
KASSERT(!(bp->b_flags & B_DONE),
("fuse_io_strategy: bp %p already marked done", bp));
if (bp->b_iocmd == BIO_READ) {
ssize_t left;
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize;
error = fuse_read_directbackend(vp, uiop, cred, fufh);
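		/*
		 * A short read without an error means the server's EOF is
		 * earlier than ours.  Zero the rest of the buffer and, unless
		 * a local size change is pending, drop the cached attributes
		 * so the size will be refetched.
		 */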
if (!error && uiop->uio_resid) {
int nread = bp->b_bcount - uiop->uio_resid;
left = uiop->uio_resid;
bzero((char *)bp->b_data + nread, left);
if ((fvdat->flag & FN_SIZECHANGE) == 0) {
SDT_PROBE2(fusefs, , io, trace, 1,
"Short read of a clean file");
fuse_vnode_clear_attr_cache(vp);
} else {
SDT_PROBE2(fusefs, , io, trace, 1,
"Short read of a dirty file");
uiop->uio_resid = 0;
}
}
if (error) {
bp->b_ioflags |= BIO_ERROR;
bp->b_error = error;
}
} else {
filesize = fvdat->cached_attrs.va_size;
KASSERT(filesize != VNOVAL, ("filesize should've been cached"));
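		/*
		 * Don't write past EOF; the file may have been truncated
		 * since this buffer was dirtied.
		 */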
if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize)
bp->b_dirtyend = filesize -
(off_t)bp->b_lblkno * biosize;
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
- bp->b_dirtyoff;
uiop->uio_offset = (off_t)bp->b_lblkno * biosize
+ bp->b_dirtyoff;
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE;
error = fuse_write_directbackend(vp, uiop, cred, fufh,
filesize, 0, pages);
if (error == EINTR || error == ETIMEDOUT) {
bp->b_flags &= ~(B_INVAL | B_NOCACHE);
if ((bp->b_flags & B_PAGING) == 0) {
bdirty(bp);
bp->b_flags &= ~B_DONE;
}
				if ((bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
} else {
if (error) {
bp->b_ioflags |= BIO_ERROR;
bp->b_flags |= B_INVAL;
bp->b_error = error;
}
bp->b_dirtyoff = bp->b_dirtyend = 0;
}
} else {
bp->b_resid = 0;
bufdone(bp);
return (0);
}
}
bp->b_resid = uiop->uio_resid;
bufdone(bp);
return (error);
}
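
/*
 * Flush the vnode's dirty buffers to the server.
 */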
int
fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td)
{
return (vn_fsync_buf(vp, waitfor));
}
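
/*
 * Flush and invalidate all cached buffers of the vnode, serializing
 * concurrent invalidations with the FN_FLUSHINPROG flag.
 */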
int
fuse_io_invalbuf(struct vnode *vp, struct thread *td)
{
struct fuse_vnode_data *fvdat = VTOFUD(vp);
int error = 0;
if (VN_IS_DOOMED(vp))
		return (0);
ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf");
while (fvdat->flag & FN_FLUSHINPROG) {
struct proc *p = td->td_proc;
if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF)
			return (EIO);
fvdat->flag |= FN_FLUSHWANT;
tsleep(&fvdat->flag, PRIBIO, "fusevinv", 2 * hz);
error = 0;
if (p != NULL) {
PROC_LOCK(p);
if (SIGNOTEMPTY(p->p_siglist) ||
SIGNOTEMPTY(td->td_siglist))
error = EINTR;
PROC_UNLOCK(p);
}
if (error == EINTR)
			return (EINTR);
}
fvdat->flag |= FN_FLUSHINPROG;
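	/*
	 * Push dirty pages into the buffer cache, then flush and invalidate
	 * the buffers, retrying unless the flush is interrupted.
	 */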
vnode_pager_clean_sync(vp);
error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
while (error) {
if (error == ERESTART || error == EINTR) {
fvdat->flag &= ~FN_FLUSHINPROG;
if (fvdat->flag & FN_FLUSHWANT) {
fvdat->flag &= ~FN_FLUSHWANT;
wakeup(&fvdat->flag);
}
			return (EINTR);
}
error = vinvalbuf(vp, V_SAVE, PCATCH, 0);
}
fvdat->flag &= ~FN_FLUSHINPROG;
if (fvdat->flag & FN_FLUSHWANT) {
fvdat->flag &= ~FN_FLUSHWANT;
wakeup(&fvdat->flag);
}
return (error);
}