GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/fs/nfsclient/nfs_clbio.c
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1989, 1993
5
* The Regents of the University of California. All rights reserved.
6
*
7
* This code is derived from software contributed to Berkeley by
8
* Rick Macklem at The University of Guelph.
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
* 3. Neither the name of the University nor the names of its contributors
19
* may be used to endorse or promote products derived from this software
20
* without specific prior written permission.
21
*
22
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
* SUCH DAMAGE.
33
*/
34
35
#include <sys/param.h>
36
#include <sys/systm.h>
37
#include <sys/bio.h>
38
#include <sys/buf.h>
39
#include <sys/kernel.h>
40
#include <sys/mount.h>
41
#include <sys/rwlock.h>
42
#include <sys/vmmeter.h>
43
#include <sys/vnode.h>
44
45
#include <vm/vm.h>
46
#include <vm/vm_param.h>
47
#include <vm/vm_extern.h>
48
#include <vm/vm_page.h>
49
#include <vm/vm_object.h>
50
#include <vm/vm_pager.h>
51
#include <vm/vnode_pager.h>
52
53
#include <fs/nfs/nfsport.h>
54
#include <fs/nfsclient/nfsmount.h>
55
#include <fs/nfsclient/nfs.h>
56
#include <fs/nfsclient/nfsnode.h>
57
#include <fs/nfsclient/nfs_kdtrace.h>
58
59
extern int newnfs_directio_allow_mmap;
60
extern struct nfsstatsv1 nfsstatsv1;
61
extern struct mtx ncl_iod_mutex;
62
extern int ncl_numasync;
63
extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON];
64
extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON];
65
extern int newnfs_directio_enable;
66
extern int nfs_keep_dirty_on_error;
67
68
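/*
 * Editorial note (not in the original source): ncl_pbuf_zone is the UMA zone
 * from which the legacy (non-buf-pager) getpages path below allocates a
 * temporary pager buf; it is presumably created during NFS client
 * initialization elsewhere in the client code.
 */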
uma_zone_t ncl_pbuf_zone;
69
70
static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
71
struct thread *td);
72
static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
73
struct ucred *cred, int ioflag);
74
75
/*
76
* Vnode op for VM getpages.
77
*/
78
SYSCTL_DECL(_vfs_nfs);
79
static int use_buf_pager = 1;
80
SYSCTL_INT(_vfs_nfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
81
&use_buf_pager, 0,
82
"Use buffer pager instead of direct readrpc call");
83
84
static daddr_t
85
ncl_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
86
{
87
88
return (off / vp->v_bufobj.bo_bsize);
89
}
90
91
static int
92
ncl_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
93
{
94
struct nfsnode *np;
95
u_quad_t nsize;
96
int biosize, bcount;
97
98
np = VTONFS(vp);
99
NFSLOCKNODE(np);
100
nsize = np->n_size;
101
NFSUNLOCKNODE(np);
102
103
biosize = vp->v_bufobj.bo_bsize;
104
bcount = biosize;
105
if ((off_t)lbn * biosize >= nsize)
106
bcount = 0;
107
else if ((off_t)(lbn + 1) * biosize > nsize)
108
bcount = nsize - (off_t)lbn * biosize;
109
*sz = bcount;
110
return (0);
111
}
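/*
 * Editorial worked example (hypothetical numbers, not in the original
 * source): with bo_bsize = 32768 and n_size = 102400 bytes (100 KB),
 * ncl_gbp_getblkno() maps offset 70000 to logical block 2, and
 * ncl_gbp_getblksz() returns 32768 for lbn 0..2 but only
 * 102400 - 3 * 32768 = 4096 for lbn 3, the partial block at EOF.
 */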
112
113
int
114
ncl_getpages(struct vop_getpages_args *ap)
115
{
116
int i, error, nextoff, size, toff, count, npages;
117
struct uio uio;
118
struct iovec iov;
119
vm_offset_t kva;
120
struct buf *bp;
121
struct vnode *vp;
122
struct thread *td;
123
struct ucred *cred;
124
struct nfsmount *nmp;
125
vm_object_t object;
126
vm_page_t *pages;
127
struct nfsnode *np;
128
129
vp = ap->a_vp;
130
np = VTONFS(vp);
131
td = curthread;
132
cred = curthread->td_ucred;
133
nmp = VFSTONFS(vp->v_mount);
134
pages = ap->a_m;
135
npages = ap->a_count;
136
137
if ((object = vp->v_object) == NULL) {
138
printf("ncl_getpages: called with non-merged cache vnode\n");
139
return (VM_PAGER_ERROR);
140
}
141
142
if (newnfs_directio_enable && !newnfs_directio_allow_mmap) {
143
NFSLOCKNODE(np);
144
if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
145
NFSUNLOCKNODE(np);
146
printf("ncl_getpages: called on non-cacheable vnode\n");
147
return (VM_PAGER_ERROR);
148
} else
149
NFSUNLOCKNODE(np);
150
}
151
152
mtx_lock(&nmp->nm_mtx);
153
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
154
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
155
mtx_unlock(&nmp->nm_mtx);
156
/* We'll never get here for v4, because we always have fsinfo */
157
(void)ncl_fsinfo(nmp, vp, cred, td);
158
} else
159
mtx_unlock(&nmp->nm_mtx);
160
161
if (use_buf_pager)
162
return (vfs_bio_getpages(vp, pages, npages, ap->a_rbehind,
163
ap->a_rahead, ncl_gbp_getblkno, ncl_gbp_getblksz));
164
165
/*
166
* If the requested page is partially valid, just return it and
167
* allow the pager to zero-out the blanks. Partially valid pages
168
* can only occur at the file EOF.
169
*
170
* XXXGL: is that true for NFS, where a short read can occur???
171
*/
172
VM_OBJECT_WLOCK(object);
173
if (!vm_page_none_valid(pages[npages - 1]) && --npages == 0)
174
goto out;
175
VM_OBJECT_WUNLOCK(object);
176
177
/*
178
* We use only the kva address for the buffer, but this is extremely
179
* convenient and fast.
180
*/
181
bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK);
182
183
kva = (vm_offset_t) bp->b_data;
184
pmap_qenter(kva, pages, npages);
185
VM_CNT_INC(v_vnodein);
186
VM_CNT_ADD(v_vnodepgsin, npages);
187
188
count = npages << PAGE_SHIFT;
189
iov.iov_base = (caddr_t) kva;
190
iov.iov_len = count;
191
uio.uio_iov = &iov;
192
uio.uio_iovcnt = 1;
193
uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
194
uio.uio_resid = count;
195
uio.uio_segflg = UIO_SYSSPACE;
196
uio.uio_rw = UIO_READ;
197
uio.uio_td = td;
198
199
error = ncl_readrpc(vp, &uio, cred);
200
pmap_qremove(kva, npages);
201
202
uma_zfree(ncl_pbuf_zone, bp);
203
204
if (error && (uio.uio_resid == count)) {
205
printf("ncl_getpages: error %d\n", error);
206
return (VM_PAGER_ERROR);
207
}
208
209
/*
210
* Calculate the number of bytes read and validate only that number
211
* of bytes. Note that due to pending writes, size may be 0. This
212
* does not mean that the remaining data is invalid!
213
*/
214
215
size = count - uio.uio_resid;
216
VM_OBJECT_WLOCK(object);
217
for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
218
vm_page_t m;
219
nextoff = toff + PAGE_SIZE;
220
m = pages[i];
221
222
if (nextoff <= size) {
223
/*
224
* Read operation filled an entire page
225
*/
226
vm_page_valid(m);
227
KASSERT(m->dirty == 0,
228
("nfs_getpages: page %p is dirty", m));
229
} else if (size > toff) {
230
/*
231
* Read operation filled a partial page.
232
*/
233
vm_page_invalid(m);
234
vm_page_set_valid_range(m, 0, size - toff);
235
KASSERT(m->dirty == 0,
236
("nfs_getpages: page %p is dirty", m));
237
} else {
238
/*
239
* Read operation was short. If no error
240
* occurred we may have hit a zero-fill
241
* section. We leave valid set to 0, and page
242
* is freed by vm_page_readahead_finish() if
243
* its index is not equal to requested, or
244
* page is zeroed and set valid by
245
* vm_pager_get_pages() for requested page.
246
*/
247
;
248
}
249
}
250
out:
251
VM_OBJECT_WUNLOCK(object);
252
if (ap->a_rbehind)
253
*ap->a_rbehind = 0;
254
if (ap->a_rahead)
255
*ap->a_rahead = 0;
256
return (VM_PAGER_OK);
257
}
258
259
/*
260
* Vnode op for VM putpages.
261
*/
262
int
263
ncl_putpages(struct vop_putpages_args *ap)
264
{
265
struct uio uio;
266
struct iovec iov;
267
int i, error, npages, count;
268
off_t offset;
269
int *rtvals;
270
struct vnode *vp;
271
struct thread *td;
272
struct ucred *cred;
273
struct nfsmount *nmp;
274
struct nfsnode *np;
275
vm_page_t *pages;
276
277
vp = ap->a_vp;
278
np = VTONFS(vp);
279
td = curthread; /* XXX */
280
/* Set the cred to n_writecred for the write rpcs. */
281
if (np->n_writecred != NULL)
282
cred = crhold(np->n_writecred);
283
else
284
cred = crhold(curthread->td_ucred); /* XXX */
285
nmp = VFSTONFS(vp->v_mount);
286
pages = ap->a_m;
287
count = ap->a_count;
288
rtvals = ap->a_rtvals;
289
npages = btoc(count);
290
offset = IDX_TO_OFF(pages[0]->pindex);
291
292
mtx_lock(&nmp->nm_mtx);
293
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
294
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
295
mtx_unlock(&nmp->nm_mtx);
296
(void)ncl_fsinfo(nmp, vp, cred, td);
297
} else
298
mtx_unlock(&nmp->nm_mtx);
299
300
NFSLOCKNODE(np);
301
if (newnfs_directio_enable && !newnfs_directio_allow_mmap &&
302
(np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
303
NFSUNLOCKNODE(np);
304
printf("ncl_putpages: called on noncache-able vnode\n");
305
NFSLOCKNODE(np);
306
}
307
/*
308
* When putting pages, do not extend file past EOF.
309
*/
310
if (offset + count > np->n_size) {
311
count = np->n_size - offset;
312
if (count < 0)
313
count = 0;
314
}
315
NFSUNLOCKNODE(np);
316
317
for (i = 0; i < npages; i++)
318
rtvals[i] = VM_PAGER_ERROR;
319
320
VM_CNT_INC(v_vnodeout);
321
VM_CNT_ADD(v_vnodepgsout, count);
322
323
iov.iov_base = unmapped_buf;
324
iov.iov_len = count;
325
uio.uio_iov = &iov;
326
uio.uio_iovcnt = 1;
327
uio.uio_offset = offset;
328
uio.uio_resid = count;
329
uio.uio_segflg = UIO_NOCOPY;
330
uio.uio_rw = UIO_WRITE;
331
uio.uio_td = td;
332
333
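/*
 * Editorial note (not in the original source): UIO_NOCOPY together with
 * unmapped_buf means no data is actually copied by this write; the dirty
 * data already lives in the vnode's VM pages, and the buffer-cache write
 * path picks it up from there.
 */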
error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync),
334
cred);
335
crfree(cred);
336
337
if (error == 0 || !nfs_keep_dirty_on_error) {
338
vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid,
339
np->n_size - offset, npages * PAGE_SIZE);
340
}
341
return (rtvals[0]);
342
}
343
344
/*
345
* For nfs, cache consistency can only be maintained approximately.
346
* Although RFC1094 does not specify the criteria, the following is
347
* believed to be compatible with the reference port.
348
* For nfs:
349
* If the file's modify time on the server has changed since the
350
* last read rpc or you have written to the file,
351
* you may have lost data cache consistency with the
352
* server, so flush all of the file's data out of the cache.
353
* Then force a getattr rpc to ensure that you have up to date
354
* attributes.
355
* NB: This implies that cache data can be read when up to
356
* NFS_ATTRTIMEO seconds out of date. If you find that you need current
357
* attributes, this could be forced by setting n_attrstamp to 0 before
358
* the VOP_GETATTR() call.
359
*/
360
static inline int
361
nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
362
{
363
int error = 0;
364
struct vattr vattr;
365
struct nfsnode *np = VTONFS(vp);
366
bool old_lock;
367
368
/*
369
* Ensure the exclusive access to the node before checking
370
* whether the cache is consistent.
371
*/
372
old_lock = ncl_excl_start(vp);
373
NFSLOCKNODE(np);
374
if (np->n_flag & NMODIFIED) {
375
NFSUNLOCKNODE(np);
376
if (vp->v_type != VREG) {
377
if (vp->v_type != VDIR)
378
panic("nfs: bioread, not dir");
379
ncl_invaldir(vp);
380
error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1);
381
if (error != 0)
382
goto out;
383
}
384
np->n_attrstamp = 0;
385
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
386
error = VOP_GETATTR(vp, &vattr, cred);
387
if (error)
388
goto out;
389
NFSLOCKNODE(np);
390
np->n_mtime = vattr.va_mtime;
391
NFSUNLOCKNODE(np);
392
} else {
393
NFSUNLOCKNODE(np);
394
error = VOP_GETATTR(vp, &vattr, cred);
395
if (error)
396
goto out;
397
NFSLOCKNODE(np);
398
if ((np->n_flag & NSIZECHANGED)
399
|| (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
400
NFSUNLOCKNODE(np);
401
if (vp->v_type == VDIR)
402
ncl_invaldir(vp);
403
error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1);
404
if (error != 0)
405
goto out;
406
NFSLOCKNODE(np);
407
np->n_mtime = vattr.va_mtime;
408
np->n_flag &= ~NSIZECHANGED;
409
}
410
NFSUNLOCKNODE(np);
411
}
412
out:
413
ncl_excl_finish(vp, old_lock);
414
return (error);
415
}
416
417
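/*
 * Editorial comment (not in the original source): decide whether read-ahead
 * (presumably "do read-ahead", hence "dora") is safe for this vnode.
 * Read-ahead is skipped when the VM object might hold dirty pages or has
 * active write mappings, since an unlocked nfsiod read could otherwise
 * overwrite changes made through a mapping (see the comment in the VREG
 * case of ncl_bioread() below).
 */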
static bool
418
ncl_bioread_dora(struct vnode *vp)
419
{
420
vm_object_t obj;
421
422
obj = vp->v_object;
423
if (obj == NULL)
424
return (true);
425
return (!vm_object_mightbedirty(vp->v_object) &&
426
vp->v_object->un_pager.vnp.writemappings == 0);
427
}
428
429
/*
430
* Vnode op for read using bio
431
*/
432
int
433
ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
434
{
435
struct nfsnode *np = VTONFS(vp);
436
struct buf *bp, *rabp;
437
struct thread *td;
438
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
439
daddr_t lbn, rabn;
440
int biosize, bcount, error, i, n, nra, on, save2, seqcount;
441
off_t tmp_off;
442
443
KASSERT(uio->uio_rw == UIO_READ, ("ncl_read mode"));
444
if (uio->uio_resid == 0)
445
return (0);
446
if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
447
return (EINVAL);
448
td = uio->uio_td;
449
450
mtx_lock(&nmp->nm_mtx);
451
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
452
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
453
mtx_unlock(&nmp->nm_mtx);
454
(void)ncl_fsinfo(nmp, vp, cred, td);
455
mtx_lock(&nmp->nm_mtx);
456
}
457
if (nmp->nm_rsize == 0 || nmp->nm_readdirsize == 0)
458
(void) newnfs_iosize(nmp);
459
460
tmp_off = uio->uio_offset + uio->uio_resid;
461
if (vp->v_type != VDIR &&
462
(tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)) {
463
mtx_unlock(&nmp->nm_mtx);
464
return (EFBIG);
465
}
466
mtx_unlock(&nmp->nm_mtx);
467
468
if (newnfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
469
/* No caching / no readaheads. Just read data into the user buffer. */
470
return ncl_readrpc(vp, uio, cred);
471
472
n = 0;
473
on = 0;
474
biosize = vp->v_bufobj.bo_bsize;
475
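/*
 * Editorial note (not in the original source): the upper bits of ioflag carry
 * the sequential-access heuristic (shifted by IO_SEQSHIFT); scaling it by the
 * block size and dividing by BKVASIZE turns it into a read-ahead budget in
 * buffers, consumed by the VREG read-ahead loop below.
 */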
seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
476
477
error = nfs_bioread_check_cons(vp, td, cred);
478
if (error)
479
return error;
480
481
save2 = curthread_pflags2_set(TDP2_SBPAGES);
482
do {
483
u_quad_t nsize;
484
485
NFSLOCKNODE(np);
486
nsize = np->n_size;
487
NFSUNLOCKNODE(np);
488
489
switch (vp->v_type) {
490
case VREG:
491
NFSINCRGLOBAL(nfsstatsv1.biocache_reads);
492
lbn = uio->uio_offset / biosize;
493
on = uio->uio_offset - (lbn * biosize);
494
495
/*
496
* Start the read ahead(s), as required. Do not do
497
* read-ahead if there are writeable mappings, since
498
* unlocked read by nfsiod could obliterate changes
499
* done by userspace.
500
*/
501
if (nmp->nm_readahead > 0 && ncl_bioread_dora(vp)) {
502
for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
503
(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
504
rabn = lbn + 1 + nra;
505
if (incore(&vp->v_bufobj, rabn) == NULL) {
506
rabp = nfs_getcacheblk(vp, rabn, biosize, td);
507
if (!rabp) {
508
error = newnfs_sigintr(nmp, td);
509
if (error == 0)
510
error = EINTR;
511
goto out;
512
}
513
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
514
rabp->b_flags |= B_ASYNC;
515
rabp->b_iocmd = BIO_READ;
516
vfs_busy_pages(rabp, 0);
517
if (ncl_asyncio(nmp, rabp, cred, td)) {
518
rabp->b_flags |= B_INVAL;
519
rabp->b_ioflags |= BIO_ERROR;
520
vfs_unbusy_pages(rabp);
521
brelse(rabp);
522
break;
523
}
524
} else {
525
brelse(rabp);
526
}
527
}
528
}
529
}
530
531
/* Note that bcount is *not* DEV_BSIZE aligned. */
532
bcount = biosize;
533
if ((off_t)lbn * biosize >= nsize) {
534
bcount = 0;
535
} else if ((off_t)(lbn + 1) * biosize > nsize) {
536
bcount = nsize - (off_t)lbn * biosize;
537
}
538
bp = nfs_getcacheblk(vp, lbn, bcount, td);
539
540
if (!bp) {
541
error = newnfs_sigintr(nmp, td);
542
if (error == 0)
543
error = EINTR;
544
goto out;
545
}
546
547
/*
548
* If B_CACHE is not set, we must issue the read. If this
549
* fails, we return an error.
550
*/
551
552
if ((bp->b_flags & B_CACHE) == 0) {
553
bp->b_iocmd = BIO_READ;
554
vfs_busy_pages(bp, 0);
555
error = ncl_doio(vp, bp, cred, td, 0);
556
if (error) {
557
brelse(bp);
558
goto out;
559
}
560
}
561
562
/*
563
* on is the offset into the current bp. Figure out how many
564
* bytes we can copy out of the bp. Note that bcount is
565
* NOT DEV_BSIZE aligned.
566
*
567
* Then figure out how many bytes we can copy into the uio.
568
*/
569
570
n = 0;
571
if (on < bcount)
572
n = MIN((unsigned)(bcount - on), uio->uio_resid);
573
break;
574
case VLNK:
575
NFSINCRGLOBAL(nfsstatsv1.biocache_readlinks);
576
bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
577
if (!bp) {
578
error = newnfs_sigintr(nmp, td);
579
if (error == 0)
580
error = EINTR;
581
goto out;
582
}
583
if ((bp->b_flags & B_CACHE) == 0) {
584
bp->b_iocmd = BIO_READ;
585
vfs_busy_pages(bp, 0);
586
error = ncl_doio(vp, bp, cred, td, 0);
587
if (error) {
588
bp->b_ioflags |= BIO_ERROR;
589
brelse(bp);
590
goto out;
591
}
592
}
593
n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
594
on = 0;
595
break;
596
case VDIR:
597
NFSINCRGLOBAL(nfsstatsv1.biocache_readdirs);
598
NFSLOCKNODE(np);
599
if (np->n_direofoffset
600
&& uio->uio_offset >= np->n_direofoffset) {
601
NFSUNLOCKNODE(np);
602
error = 0;
603
goto out;
604
}
605
NFSUNLOCKNODE(np);
606
lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
607
on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
608
bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
609
if (!bp) {
610
error = newnfs_sigintr(nmp, td);
611
if (error == 0)
612
error = EINTR;
613
goto out;
614
}
615
if ((bp->b_flags & B_CACHE) == 0) {
616
bp->b_iocmd = BIO_READ;
617
vfs_busy_pages(bp, 0);
618
error = ncl_doio(vp, bp, cred, td, 0);
619
if (error) {
620
brelse(bp);
621
}
622
while (error == NFSERR_BAD_COOKIE) {
623
ncl_invaldir(vp);
624
error = ncl_vinvalbuf(vp, 0, td, 1);
625
626
/*
627
* Yuck! The directory has been modified on the
628
* server. The only way to get the block is by
629
* reading from the beginning to get all the
630
* offset cookies.
631
*
632
* Leave the last bp intact unless there is an error.
633
* Loop back up to the while if the error is another
634
* NFSERR_BAD_COOKIE (double yuck!).
635
*/
636
for (i = 0; i <= lbn && !error; i++) {
637
NFSLOCKNODE(np);
638
if (np->n_direofoffset
639
&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
640
NFSUNLOCKNODE(np);
641
error = 0;
642
goto out;
643
}
644
NFSUNLOCKNODE(np);
645
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
646
if (!bp) {
647
error = newnfs_sigintr(nmp, td);
648
if (error == 0)
649
error = EINTR;
650
goto out;
651
}
652
if ((bp->b_flags & B_CACHE) == 0) {
653
bp->b_iocmd = BIO_READ;
654
vfs_busy_pages(bp, 0);
655
error = ncl_doio(vp, bp, cred, td, 0);
656
/*
657
* no error + B_INVAL == directory EOF,
658
* use the block.
659
*/
660
if (error == 0 && (bp->b_flags & B_INVAL))
661
break;
662
}
663
/*
664
* An error will throw away the block and the
665
* for loop will break out. If no error and this
666
* is not the block we want, we throw away the
667
* block and go for the next one via the for loop.
668
*/
669
if (error || i < lbn)
670
brelse(bp);
671
}
672
}
673
/*
674
* The above while is repeated if we hit another cookie
675
* error. If we hit an error and it wasn't a cookie error,
676
* we give up.
677
*/
678
if (error)
679
goto out;
680
}
681
682
/*
683
* If not eof and read aheads are enabled, start one.
684
* (You need the current block first, so that you have the
685
* directory offset cookie of the next block.)
686
*/
687
NFSLOCKNODE(np);
688
if (nmp->nm_readahead > 0 && ncl_bioread_dora(vp) &&
689
(bp->b_flags & B_INVAL) == 0 &&
690
(np->n_direofoffset == 0 ||
691
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
692
incore(&vp->v_bufobj, lbn + 1) == NULL) {
693
NFSUNLOCKNODE(np);
694
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
695
if (rabp) {
696
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
697
rabp->b_flags |= B_ASYNC;
698
rabp->b_iocmd = BIO_READ;
699
vfs_busy_pages(rabp, 0);
700
if (ncl_asyncio(nmp, rabp, cred, td)) {
701
rabp->b_flags |= B_INVAL;
702
rabp->b_ioflags |= BIO_ERROR;
703
vfs_unbusy_pages(rabp);
704
brelse(rabp);
705
}
706
} else {
707
brelse(rabp);
708
}
709
}
710
NFSLOCKNODE(np);
711
}
712
/*
713
* Unlike VREG files, whose buffer size ( bp->b_bcount ) is
714
* chopped for the EOF condition, we cannot tell how large
715
* NFS directories are going to be until we hit EOF. So
716
* an NFS directory buffer is *not* chopped to its EOF. Now,
717
* it just so happens that b_resid will effectively chop it
718
* to EOF. *BUT* this information is lost if the buffer goes
719
* away and is reconstituted into a B_CACHE state ( due to
720
* being VMIO ) later. So we keep track of the directory eof
721
* in np->n_direofoffset and chop it off as an extra step
722
* right here.
723
*/
724
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
725
if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
726
n = np->n_direofoffset - uio->uio_offset;
727
NFSUNLOCKNODE(np);
728
break;
729
default:
730
printf(" ncl_bioread: type %x unexpected\n", vp->v_type);
731
bp = NULL;
732
break;
733
}
734
735
if (n > 0) {
736
error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio);
737
}
738
if (vp->v_type == VLNK)
739
n = 0;
740
if (bp != NULL)
741
brelse(bp);
742
} while (error == 0 && uio->uio_resid > 0 && n > 0);
743
out:
744
curthread_pflags2_restore(save2);
745
if ((curthread->td_pflags2 & TDP2_SBPAGES) == 0) {
746
NFSLOCKNODE(np);
747
ncl_pager_setsize(vp, NULL);
748
}
749
return (error);
750
}
751
752
/*
753
* The NFS write path cannot handle iovecs with len > 1. So we need to
754
* break up iovecs accordingly (restricting them to wsize).
755
* For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
756
* For the ASYNC case, 2 copies are needed. The first is a copy from the
757
* user buffer to a staging buffer and then a second copy from the staging
758
* buffer to mbufs. This can be optimized by copying from the user buffer
759
* directly into mbufs and passing the chain down, but that requires a
760
* fair amount of re-working of the relevant codepaths (and can be done
761
* later).
762
*/
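/*
 * Editorial worked example (hypothetical numbers, not in the original
 * source): with nm_wsize = 65536 and a single 153600-byte (150 KB) user
 * iovec, the loop below issues three FILESYNC write RPCs of 65536, 65536
 * and 22528 bytes, advancing uio_offset and shrinking the iovec after
 * each one.
 */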
763
static int
764
nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred,
765
int ioflag)
766
{
767
struct uio uio;
768
struct iovec iov;
769
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
770
struct thread *td = uiop->uio_td;
771
int error, iomode, must_commit, size, wsize;
772
773
KASSERT((ioflag & IO_SYNC) != 0, ("nfs_directio_write: not sync"));
774
mtx_lock(&nmp->nm_mtx);
775
wsize = nmp->nm_wsize;
776
mtx_unlock(&nmp->nm_mtx);
777
while (uiop->uio_resid > 0) {
778
size = MIN(uiop->uio_resid, wsize);
779
size = MIN(uiop->uio_iov->iov_len, size);
780
iov.iov_base = uiop->uio_iov->iov_base;
781
iov.iov_len = size;
782
uio.uio_iov = &iov;
783
uio.uio_iovcnt = 1;
784
uio.uio_offset = uiop->uio_offset;
785
uio.uio_resid = size;
786
uio.uio_segflg = uiop->uio_segflg;
787
uio.uio_rw = UIO_WRITE;
788
uio.uio_td = td;
789
iomode = NFSWRITE_FILESYNC;
790
/*
791
* When doing direct I/O we do not care if the
792
* server's write verifier has changed, but we
793
* do not want to update the verifier if it has
794
* changed, since that hides the change from
795
* writes being done through the buffer cache.
796
* By passing in must_commit set to two, the code
797
* in nfsrpc_writerpc() will not update the
798
* verifier on the mount point.
799
*/
800
must_commit = 2;
801
error = ncl_writerpc(vp, &uio, cred, &iomode,
802
&must_commit, 0, ioflag);
803
KASSERT(must_commit == 2,
804
("ncl_directio_write: Updated write verifier"));
805
if (error != 0)
806
return (error);
807
if (iomode != NFSWRITE_FILESYNC)
808
printf("nfs_directio_write: Broken server "
809
"did not reply FILE_SYNC\n");
810
uiop->uio_offset += size;
811
uiop->uio_resid -= size;
812
if (uiop->uio_iov->iov_len <= size) {
813
uiop->uio_iovcnt--;
814
uiop->uio_iov++;
815
} else {
816
uiop->uio_iov->iov_base =
817
(char *)uiop->uio_iov->iov_base + size;
818
uiop->uio_iov->iov_len -= size;
819
}
820
}
821
return (0);
822
}
823
824
/*
825
* Vnode op for write using bio
826
*/
827
int
828
ncl_write(struct vop_write_args *ap)
829
{
830
int biosize;
831
struct uio *uio = ap->a_uio;
832
struct thread *td = uio->uio_td;
833
struct vnode *vp = ap->a_vp;
834
struct nfsnode *np = VTONFS(vp);
835
struct ucred *cred = ap->a_cred;
836
int ioflag = ap->a_ioflag;
837
struct buf *bp;
838
struct vattr vattr;
839
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
840
daddr_t lbn;
841
int bcount, noncontig_write, obcount;
842
int bp_cached, n, on, error = 0, error1, save2, wouldcommit;
843
size_t orig_resid, local_resid;
844
off_t orig_size, tmp_off;
845
struct timespec ts;
846
847
KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
848
KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
849
("ncl_write proc"));
850
if (vp->v_type != VREG)
851
return (EIO);
852
NFSLOCKNODE(np);
853
if (np->n_flag & NWRITEERR) {
854
np->n_flag &= ~NWRITEERR;
855
NFSUNLOCKNODE(np);
856
return (np->n_error);
857
} else
858
NFSUNLOCKNODE(np);
859
mtx_lock(&nmp->nm_mtx);
860
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
861
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
862
mtx_unlock(&nmp->nm_mtx);
863
(void)ncl_fsinfo(nmp, vp, cred, td);
864
mtx_lock(&nmp->nm_mtx);
865
}
866
if (nmp->nm_wsize == 0)
867
(void) newnfs_iosize(nmp);
868
mtx_unlock(&nmp->nm_mtx);
869
870
/*
871
* Synchronously flush pending buffers if we are in synchronous
872
* mode or if we are appending.
873
*/
874
if ((ioflag & IO_APPEND) || ((ioflag & IO_SYNC) && (np->n_flag &
875
NMODIFIED))) {
876
/*
877
* For the case where IO_APPEND is being done using a
878
* direct output (to the NFS server) RPC and
879
* newnfs_directio_enable is 0, all buffer cache buffers,
880
* including ones not modified, must be invalidated.
881
* This ensures that stale data is not read out of the
882
* buffer cache. The call also invalidates all mapped
883
* pages and, since the exclusive lock is held on the vnode,
884
* new pages cannot be faulted in.
885
*
886
* For the case where newnfs_directio_enable is set
887
* (which is not the default), it is not obvious that
888
* stale data should be left in the buffer cache, but
889
* the code has been this way for over a decade without
890
* complaints. Note that, unlike doing IO_APPEND via
891
* a direct write RPC when newnfs_directio_enable is not set,
892
* when newnfs_directio_enable is set, reading is done via
893
* direct to NFS server RPCs as well.
894
*/
895
np->n_attrstamp = 0;
896
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
897
error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag &
898
IO_VMIO) != 0 ? V_VMIO : 0), td, 1);
899
if (error != 0)
900
return (error);
901
}
902
903
orig_resid = uio->uio_resid;
904
NFSLOCKNODE(np);
905
orig_size = np->n_size;
906
NFSUNLOCKNODE(np);
907
908
/*
909
* If IO_APPEND then load uio_offset. We restart here if we cannot
910
* get the append lock.
911
*/
912
if (ioflag & IO_APPEND) {
913
/*
914
* For NFSv4, the AppendWrite will Verify the size against
915
* the file's size on the server. If not the same, the
916
* write will then be retried, using the file size returned
917
* by the AppendWrite. However, for NFSv2 and NFSv3, the
918
* size must be acquired here via a Getattr RPC.
919
* The AppendWrite is not done for a pNFS mount.
920
*/
921
if (!NFSHASNFSV4(nmp) || NFSHASPNFS(nmp)) {
922
np->n_attrstamp = 0;
923
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
924
error = VOP_GETATTR(vp, &vattr, cred);
925
if (error)
926
return (error);
927
}
928
NFSLOCKNODE(np);
929
uio->uio_offset = np->n_size;
930
NFSUNLOCKNODE(np);
931
}
932
933
if (uio->uio_offset < 0)
934
return (EINVAL);
935
tmp_off = uio->uio_offset + uio->uio_resid;
936
if (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)
937
return (EFBIG);
938
if (uio->uio_resid == 0)
939
return (0);
940
941
/*
942
* Do IO_APPEND writing via a synchronous direct write.
943
* This can result in a significant performance improvement.
944
*/
945
if ((newnfs_directio_enable && (ioflag & IO_DIRECT)) ||
946
(ioflag & IO_APPEND)) {
947
/*
948
* Direct writes to the server must be done NFSWRITE_FILESYNC,
949
* because the write data is not cached and, therefore, the
950
* write cannot be redone after a server reboot.
951
* Set IO_SYNC to make this happen.
952
*/
953
ioflag |= IO_SYNC;
954
return (nfs_directio_write(vp, uio, cred, ioflag));
955
}
956
957
/*
958
* Maybe this should be above the vnode op call, but so long as
959
* file servers have no limits, I don't think it matters.
960
*/
961
error = vn_rlimit_fsize(vp, uio, td);
962
if (error != 0)
963
return (error);
964
965
save2 = curthread_pflags2_set(TDP2_SBPAGES);
966
biosize = vp->v_bufobj.bo_bsize;
967
/*
968
* Find all of this file's B_NEEDCOMMIT buffers. If our writes
969
* would exceed the local maximum per-file write commit size when
970
* combined with those, we must decide whether to flush,
971
* go synchronous, or return error. We don't bother checking
972
* IO_UNIT -- we just make all writes atomic anyway, as there's
973
* no point optimizing for something that really won't ever happen.
974
*/
975
wouldcommit = 0;
976
if (!(ioflag & IO_SYNC)) {
977
int nflag;
978
979
NFSLOCKNODE(np);
980
nflag = np->n_flag;
981
NFSUNLOCKNODE(np);
982
if (nflag & NMODIFIED) {
983
BO_LOCK(&vp->v_bufobj);
984
if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
985
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
986
b_bobufs) {
987
if (bp->b_flags & B_NEEDCOMMIT)
988
wouldcommit += bp->b_bcount;
989
}
990
}
991
BO_UNLOCK(&vp->v_bufobj);
992
}
993
}
994
995
do {
996
if (!(ioflag & IO_SYNC)) {
997
wouldcommit += biosize;
998
if (wouldcommit > nmp->nm_wcommitsize) {
999
np->n_attrstamp = 0;
1000
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1001
error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag &
1002
IO_VMIO) != 0 ? V_VMIO : 0), td, 1);
1003
if (error != 0)
1004
goto out;
1005
wouldcommit = biosize;
1006
}
1007
}
1008
1009
NFSINCRGLOBAL(nfsstatsv1.biocache_writes);
1010
lbn = uio->uio_offset / biosize;
1011
on = uio->uio_offset - (lbn * biosize);
1012
n = MIN((unsigned)(biosize - on), uio->uio_resid);
1013
again:
1014
/*
1015
* Handle direct append and file extension cases, calculate
1016
* unaligned buffer size.
1017
*/
1018
NFSLOCKNODE(np);
1019
if ((np->n_flag & NHASBEENLOCKED) == 0 &&
1020
(nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0)
1021
noncontig_write = 1;
1022
else
1023
noncontig_write = 0;
1024
if ((uio->uio_offset == np->n_size ||
1025
(noncontig_write != 0 &&
1026
lbn == (np->n_size / biosize) &&
1027
uio->uio_offset + n > np->n_size)) && n) {
1028
NFSUNLOCKNODE(np);
1029
/*
1030
* Get the buffer (in its pre-append state to maintain
1031
* B_CACHE if it was previously set). Resize the
1032
* nfsnode after we have locked the buffer to prevent
1033
* readers from reading garbage.
1034
*/
1035
obcount = np->n_size - (lbn * biosize);
1036
bp = nfs_getcacheblk(vp, lbn, obcount, td);
1037
1038
if (bp != NULL) {
1039
long save;
1040
1041
NFSLOCKNODE(np);
1042
np->n_size = uio->uio_offset + n;
1043
np->n_flag |= NMODIFIED;
1044
np->n_flag &= ~NVNSETSZSKIP;
1045
vnode_pager_setsize(vp, np->n_size);
1046
NFSUNLOCKNODE(np);
1047
1048
save = bp->b_flags & B_CACHE;
1049
bcount = on + n;
1050
allocbuf(bp, bcount);
1051
bp->b_flags |= save;
1052
if (noncontig_write != 0 && on > obcount)
1053
vfs_bio_bzero_buf(bp, obcount, on -
1054
obcount);
1055
}
1056
} else {
1057
/*
1058
* Obtain the locked cache block first, and then
1059
* adjust the file's size as appropriate.
1060
*/
1061
bcount = on + n;
1062
if ((off_t)lbn * biosize + bcount < np->n_size) {
1063
if ((off_t)(lbn + 1) * biosize < np->n_size)
1064
bcount = biosize;
1065
else
1066
bcount = np->n_size - (off_t)lbn * biosize;
1067
}
1068
NFSUNLOCKNODE(np);
1069
bp = nfs_getcacheblk(vp, lbn, bcount, td);
1070
NFSLOCKNODE(np);
1071
if (uio->uio_offset + n > np->n_size) {
1072
np->n_size = uio->uio_offset + n;
1073
np->n_flag |= NMODIFIED;
1074
np->n_flag &= ~NVNSETSZSKIP;
1075
vnode_pager_setsize(vp, np->n_size);
1076
}
1077
NFSUNLOCKNODE(np);
1078
}
1079
1080
if (!bp) {
1081
error = newnfs_sigintr(nmp, td);
1082
if (!error)
1083
error = EINTR;
1084
break;
1085
}
1086
1087
/*
1088
* Issue a READ if B_CACHE is not set. In special-append
1089
* mode, B_CACHE is based on the buffer prior to the write
1090
* op and is typically set, avoiding the read. If a read
1091
* is required in special append mode, the server will
1092
* probably send us a short-read since we extended the file
1093
* on our end, resulting in b_resid == 0 and, thus,
1094
* B_CACHE getting set.
1095
*
1096
* We can also avoid issuing the read if the write covers
1097
* the entire buffer. We have to make sure the buffer state
1098
* is reasonable in this case since we will not be initiating
1099
* I/O. See the comments in kern/vfs_bio.c's getblk() for
1100
* more information.
1101
*
1102
* B_CACHE may also be set due to the buffer being cached
1103
* normally.
1104
*/
1105
1106
bp_cached = 1;
1107
if (on == 0 && n == bcount) {
1108
if ((bp->b_flags & B_CACHE) == 0)
1109
bp_cached = 0;
1110
bp->b_flags |= B_CACHE;
1111
bp->b_flags &= ~B_INVAL;
1112
bp->b_ioflags &= ~BIO_ERROR;
1113
}
1114
1115
if ((bp->b_flags & B_CACHE) == 0) {
1116
bp->b_iocmd = BIO_READ;
1117
vfs_busy_pages(bp, 0);
1118
error = ncl_doio(vp, bp, cred, td, 0);
1119
if (error) {
1120
brelse(bp);
1121
break;
1122
}
1123
}
1124
if (bp->b_wcred == NOCRED)
1125
bp->b_wcred = crhold(cred);
1126
NFSLOCKNODE(np);
1127
np->n_flag |= NMODIFIED;
1128
NFSUNLOCKNODE(np);
1129
1130
/*
1131
* If dirtyend exceeds file size, chop it down. This should
1132
* not normally occur but there is an append race where it
1133
* might occur XXX, so we log it.
1134
*
1135
* If the chopping creates a reverse-indexed or degenerate
1136
* situation with dirtyoff/end, we 0 both of them.
1137
*/
1138
1139
if (bp->b_dirtyend > bcount) {
1140
printf("NFS append race @%lx:%d\n",
1141
(long)bp->b_blkno * DEV_BSIZE,
1142
bp->b_dirtyend - bcount);
1143
bp->b_dirtyend = bcount;
1144
}
1145
1146
if (bp->b_dirtyoff >= bp->b_dirtyend)
1147
bp->b_dirtyoff = bp->b_dirtyend = 0;
1148
1149
/*
1150
* If the new write will leave a contiguous dirty
1151
* area, just update the b_dirtyoff and b_dirtyend,
1152
* otherwise force a write rpc of the old dirty area.
1153
*
1154
* If there has been a file lock applied to this file
1155
* or vfs.nfs.old_noncontig_writing is set, do the following:
1156
* While it is possible to merge discontiguous writes due to
1157
* our having a B_CACHE buffer ( and thus valid read data
1158
* for the hole), we don't because it could lead to
1159
* significant cache coherency problems with multiple clients,
1160
* especially if locking is implemented later on.
1161
*
1162
* If vfs.nfs.old_noncontig_writing is not set and there has
1163
* not been file locking done on this file:
1164
* Relax coherency a bit for the sake of performance and
1165
* expand the current dirty region to contain the new
1166
* write even if it means we mark some non-dirty data as
1167
* dirty.
1168
*/
1169
1170
if (noncontig_write == 0 && bp->b_dirtyend > 0 &&
1171
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1172
if (bwrite(bp) == EINTR) {
1173
error = EINTR;
1174
break;
1175
}
1176
goto again;
1177
}
1178
1179
local_resid = uio->uio_resid;
1180
error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio);
1181
1182
if (error != 0 && !bp_cached) {
1183
/*
1184
* This block has no other content than what
1185
* possibly was written by the faulty uiomove.
1186
* Release it, forgetting the data pages, to
1187
* prevent the leak of uninitialized data to
1188
* usermode.
1189
*/
1190
bp->b_ioflags |= BIO_ERROR;
1191
brelse(bp);
1192
uio->uio_offset -= local_resid - uio->uio_resid;
1193
uio->uio_resid = local_resid;
1194
break;
1195
}
1196
1197
/*
1198
* Since this block is being modified, it must be written
1199
* again and not just committed. Since write clustering does
1200
* not work for the stage 1 data write, only the stage 2
1201
* commit rpc, we have to clear B_CLUSTEROK as well.
1202
*/
1203
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1204
1205
/*
1206
* Get the partial update on the progress made from
1207
* uiomove, if an error occurred.
1208
*/
1209
if (error != 0)
1210
n = local_resid - uio->uio_resid;
1211
1212
/*
1213
* Only update dirtyoff/dirtyend if not a degenerate
1214
* condition.
1215
*/
1216
if (n > 0) {
1217
if (bp->b_dirtyend > 0) {
1218
bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1219
bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1220
} else {
1221
bp->b_dirtyoff = on;
1222
bp->b_dirtyend = on + n;
1223
}
1224
vfs_bio_set_valid(bp, on, n);
1225
}
1226
1227
/*
1228
* If IO_SYNC do bwrite().
1229
*
1230
* IO_INVAL appears to be unused. The idea appears to be
1231
* to turn off caching in this case. Very odd. XXX
1232
*/
1233
if ((ioflag & IO_SYNC)) {
1234
if (ioflag & IO_INVAL)
1235
bp->b_flags |= B_NOCACHE;
1236
error1 = bwrite(bp);
1237
if (error1 != 0) {
1238
if (error == 0)
1239
error = error1;
1240
break;
1241
}
1242
} else if ((n + on) == biosize || (ioflag & IO_ASYNC) != 0) {
1243
bp->b_flags |= B_ASYNC;
1244
(void) bwrite(bp);
1245
} else {
1246
bdwrite(bp);
1247
}
1248
1249
if (error != 0)
1250
break;
1251
} while (uio->uio_resid > 0 && n > 0);
1252
1253
if (error == 0) {
1254
nanouptime(&ts);
1255
NFSLOCKNODE(np);
1256
np->n_localmodtime = ts;
1257
NFSUNLOCKNODE(np);
1258
} else {
1259
if (ioflag & IO_UNIT) {
1260
VATTR_NULL(&vattr);
1261
vattr.va_size = orig_size;
1262
/* IO_SYNC is handled implicitly */
1263
(void)VOP_SETATTR(vp, &vattr, cred);
1264
uio->uio_offset -= orig_resid - uio->uio_resid;
1265
uio->uio_resid = orig_resid;
1266
}
1267
}
1268
1269
out:
1270
curthread_pflags2_restore(save2);
1271
return (error);
1272
}
1273
1274
/*
1275
* Get an nfs cache block.
1276
*
1277
* Allocate a new one if the block isn't currently in the cache
1278
* and return the block marked busy. If the calling process is
1279
* interrupted by a signal for an interruptible mount point, return
1280
* NULL.
1281
*
1282
* The caller must carefully deal with the possible B_INVAL state of
1283
* the buffer. ncl_doio() clears B_INVAL (and ncl_asyncio() clears it
1284
* indirectly), so synchronous reads can be issued without worrying about
1285
* the B_INVAL state. We have to be a little more careful when dealing
1286
* with writes (see comments in nfs_write()) when extending a file past
1287
* its EOF.
1288
*/
1289
static struct buf *
1290
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
1291
{
1292
struct buf *bp;
1293
struct mount *mp;
1294
struct nfsmount *nmp;
1295
1296
mp = vp->v_mount;
1297
nmp = VFSTONFS(mp);
1298
1299
if (nmp->nm_flag & NFSMNT_INT) {
1300
sigset_t oldset;
1301
1302
newnfs_set_sigmask(td, &oldset);
1303
bp = getblk(vp, bn, size, PCATCH, 0, 0);
1304
newnfs_restore_sigmask(td, &oldset);
1305
while (bp == NULL) {
1306
if (newnfs_sigintr(nmp, td))
1307
return (NULL);
1308
bp = getblk(vp, bn, size, 0, 2 * hz, 0);
1309
}
1310
} else {
1311
bp = getblk(vp, bn, size, 0, 0, 0);
1312
}
1313
1314
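/*
 * Editorial note (not in the original source): for regular files the logical
 * block number is converted to DEV_BSIZE units below; e.g. with bo_bsize =
 * 32768 and DEV_BSIZE = 512, logical block 3 becomes b_blkno 3 * 64 = 192.
 */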
if (vp->v_type == VREG)
1315
bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE);
1316
return (bp);
1317
}
1318
1319
/*
1320
* Flush and invalidate all dirty buffers. If another process is already
1321
* doing the flush, just wait for completion.
1322
*/
1323
int
1324
ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
1325
{
1326
struct nfsnode *np = VTONFS(vp);
1327
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1328
int error = 0, slpflag, slptimeo;
1329
bool old_lock;
1330
struct timespec ts;
1331
1332
ASSERT_VOP_LOCKED(vp, "ncl_vinvalbuf");
1333
1334
if ((nmp->nm_flag & NFSMNT_INT) == 0)
1335
intrflg = 0;
1336
if (NFSCL_FORCEDISM(nmp->nm_mountp))
1337
intrflg = 1;
1338
if (intrflg) {
1339
slpflag = PCATCH;
1340
slptimeo = 2 * hz;
1341
} else {
1342
slpflag = 0;
1343
slptimeo = 0;
1344
}
1345
1346
old_lock = ncl_excl_start(vp);
1347
if (old_lock)
1348
flags |= V_ALLOWCLEAN;
1349
1350
/*
1351
* Now, flush as required.
1352
*/
1353
if ((flags & (V_SAVE | V_VMIO)) == V_SAVE) {
1354
vnode_pager_clean_sync(vp);
1355
1356
/*
1357
* If the page clean was interrupted, fail the invalidation.
1358
* Not doing so, we run the risk of losing dirty pages in the
1359
* vinvalbuf() call below.
1360
*/
1361
if (intrflg && (error = newnfs_sigintr(nmp, td)))
1362
goto out;
1363
}
1364
1365
error = vinvalbuf(vp, flags, slpflag, 0);
1366
while (error) {
1367
if (intrflg && (error = newnfs_sigintr(nmp, td)))
1368
goto out;
1369
error = vinvalbuf(vp, flags, 0, slptimeo);
1370
}
1371
if (NFSHASPNFS(nmp)) {
1372
nfscl_layoutcommit(vp, td);
1373
nanouptime(&ts);
1374
/*
1375
* Invalidate the attribute cache, since writes to a DS
1376
* won't update the size attribute.
1377
*/
1378
NFSLOCKNODE(np);
1379
np->n_attrstamp = 0;
1380
} else {
1381
nanouptime(&ts);
1382
NFSLOCKNODE(np);
1383
}
1384
if ((np->n_flag & NMODIFIED) != 0) {
1385
np->n_localmodtime = ts;
1386
np->n_flag &= ~NMODIFIED;
1387
}
1388
NFSUNLOCKNODE(np);
1389
out:
1390
ncl_excl_finish(vp, old_lock);
1391
return error;
1392
}
1393
1394
/*
1395
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
1396
* This is mainly to avoid queueing async I/O requests when the nfsiods
1397
* are all hung on a dead server.
1398
*
1399
* Note: ncl_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1400
* is eventually dequeued by the async daemon, ncl_doio() *will*.
1401
*/
1402
int
1403
ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
1404
{
1405
int iod;
1406
int gotiod;
1407
int slpflag = 0;
1408
int slptimeo = 0;
1409
int error, error2;
1410
1411
/*
1412
* Commits are usually short and sweet so let's save some CPU and
1413
* leave the async daemons for more important rpc's (such as reads
1414
* and writes).
1415
*
1416
* Readdirplus RPCs do vget()s to acquire the vnodes for entries
1417
* in the directory in order to update attributes. This can deadlock
1418
* with another thread that is waiting for async I/O to be done by
1419
* an nfsiod thread while holding a lock on one of these vnodes.
1420
* To avoid this deadlock, don't allow the async nfsiod threads to
1421
* perform Readdirplus RPCs.
1422
*/
1423
NFSLOCKIOD();
1424
if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1425
(nmp->nm_bufqiods > ncl_numasync / 2)) ||
1426
(bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) {
1427
NFSUNLOCKIOD();
1428
return(EIO);
1429
}
1430
again:
1431
if (nmp->nm_flag & NFSMNT_INT)
1432
slpflag = PCATCH;
1433
gotiod = FALSE;
1434
1435
/*
1436
* Find a free iod to process this request.
1437
*/
1438
for (iod = 0; iod < ncl_numasync; iod++)
1439
if (ncl_iodwant[iod] == NFSIOD_AVAILABLE) {
1440
gotiod = TRUE;
1441
break;
1442
}
1443
1444
/*
1445
* Try to create one if none are free.
1446
*/
1447
if (!gotiod)
1448
ncl_nfsiodnew();
1449
else {
1450
/*
1451
* Found one, so wake it up and tell it which
1452
* mount to process.
1453
*/
1454
NFS_DPF(ASYNCIO, ("ncl_asyncio: waking iod %d for mount %p\n",
1455
iod, nmp));
1456
ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
1457
ncl_iodmount[iod] = nmp;
1458
nmp->nm_bufqiods++;
1459
wakeup(&ncl_iodwant[iod]);
1460
}
1461
1462
/*
1463
* If none are free, we may already have an iod working on this mount
1464
* point. If so, it will process our request.
1465
*/
1466
if (!gotiod) {
1467
if (nmp->nm_bufqiods > 0) {
1468
NFS_DPF(ASYNCIO,
1469
("ncl_asyncio: %d iods are already processing mount %p\n",
1470
nmp->nm_bufqiods, nmp));
1471
gotiod = TRUE;
1472
}
1473
}
1474
1475
/*
1476
* If we have an iod which can process the request, then queue
1477
* the buffer.
1478
*/
1479
if (gotiod) {
1480
/*
1481
* Ensure that the queue never grows too large. We still want
1482
* to asynchronize, so we block rather than return EIO.
1483
*/
1484
while (nmp->nm_bufqlen >= 2*ncl_numasync) {
1485
NFS_DPF(ASYNCIO,
1486
("ncl_asyncio: waiting for mount %p queue to drain\n", nmp));
1487
nmp->nm_bufqwant = TRUE;
1488
error = newnfs_msleep(td, &nmp->nm_bufq,
1489
&ncl_iod_mutex, slpflag | PRIBIO, "nfsaio",
1490
slptimeo);
1491
if (error) {
1492
error2 = newnfs_sigintr(nmp, td);
1493
if (error2) {
1494
NFSUNLOCKIOD();
1495
return (error2);
1496
}
1497
if (slpflag == PCATCH) {
1498
slpflag = 0;
1499
slptimeo = 2 * hz;
1500
}
1501
}
1502
/*
1503
* We might have lost our iod while sleeping,
1504
* so check and loop if necessary.
1505
*/
1506
goto again;
1507
}
1508
1509
/* We might have lost our nfsiod */
1510
if (nmp->nm_bufqiods == 0) {
1511
NFS_DPF(ASYNCIO,
1512
("ncl_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1513
goto again;
1514
}
1515
1516
if (bp->b_iocmd == BIO_READ) {
1517
if (bp->b_rcred == NOCRED && cred != NOCRED)
1518
bp->b_rcred = crhold(cred);
1519
} else {
1520
if (bp->b_wcred == NOCRED && cred != NOCRED)
1521
bp->b_wcred = crhold(cred);
1522
}
1523
1524
if (bp->b_flags & B_REMFREE)
1525
bremfreef(bp);
1526
BUF_KERNPROC(bp);
1527
TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1528
nmp->nm_bufqlen++;
1529
KASSERT((bp->b_flags & B_DIRECT) == 0,
1530
("ncl_asyncio: B_DIRECT set"));
1531
NFSUNLOCKIOD();
1532
return (0);
1533
}
1534
1535
NFSUNLOCKIOD();
1536
1537
/*
1538
* All the iods are busy on other mounts, so return EIO to
1539
* force the caller to process the i/o synchronously.
1540
*/
1541
NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods available, i/o is synchronous\n"));
1542
return (EIO);
1543
}
1544
1545
/*
1546
* Do an I/O operation to/from a cache block. This may be called
1547
* synchronously or from an nfsiod.
1548
*/
1549
int
1550
ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td,
1551
int called_from_strategy)
1552
{
1553
struct uio *uiop;
1554
struct nfsnode *np;
1555
struct nfsmount *nmp;
1556
int error = 0, iomode, must_commit = 0;
1557
struct uio uio;
1558
struct iovec io;
1559
struct proc *p = td ? td->td_proc : NULL;
1560
uint8_t iocmd;
1561
1562
np = VTONFS(vp);
1563
nmp = VFSTONFS(vp->v_mount);
1564
uiop = &uio;
1565
uiop->uio_iov = &io;
1566
uiop->uio_iovcnt = 1;
1567
uiop->uio_segflg = UIO_SYSSPACE;
1568
uiop->uio_td = td;
1569
1570
/*
1571
* clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We
1572
* do this here so we do not have to do it in all the code that
1573
* calls us.
1574
*/
1575
bp->b_flags &= ~B_INVAL;
1576
bp->b_ioflags &= ~BIO_ERROR;
1577
1578
KASSERT(!(bp->b_flags & B_DONE), ("ncl_doio: bp %p already marked done", bp));
1579
iocmd = bp->b_iocmd;
1580
if (iocmd == BIO_READ) {
1581
io.iov_len = uiop->uio_resid = bp->b_bcount;
1582
io.iov_base = bp->b_data;
1583
uiop->uio_rw = UIO_READ;
1584
1585
switch (vp->v_type) {
1586
case VREG:
1587
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1588
NFSINCRGLOBAL(nfsstatsv1.read_bios);
1589
error = ncl_readrpc(vp, uiop, cr);
1590
1591
if (!error) {
1592
if (uiop->uio_resid) {
1593
/*
1594
* If we had a short read with no error, we must have
1595
* hit a file hole. We should zero-fill the remainder.
1596
* This can also occur if the server hits the file EOF.
1597
*
1598
* Holes used to be able to occur due to pending
1599
* writes, but that is not possible any longer.
1600
*/
1601
int nread = bp->b_bcount - uiop->uio_resid;
1602
ssize_t left = uiop->uio_resid;
1603
1604
if (left > 0)
1605
bzero((char *)bp->b_data + nread, left);
1606
uiop->uio_resid = 0;
1607
}
1608
}
1609
/* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */
1610
if (p && vp->v_writecount <= -1) {
1611
NFSLOCKNODE(np);
1612
if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) {
1613
NFSUNLOCKNODE(np);
1614
PROC_LOCK(p);
1615
killproc(p, "text file modification");
1616
PROC_UNLOCK(p);
1617
} else
1618
NFSUNLOCKNODE(np);
1619
}
1620
break;
1621
case VLNK:
1622
uiop->uio_offset = (off_t)0;
1623
NFSINCRGLOBAL(nfsstatsv1.readlink_bios);
1624
error = ncl_readlinkrpc(vp, uiop, cr);
1625
break;
1626
case VDIR:
1627
NFSINCRGLOBAL(nfsstatsv1.readdir_bios);
1628
uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1629
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
1630
error = ncl_readdirplusrpc(vp, uiop, cr, td);
1631
if (error == NFSERR_NOTSUPP)
1632
nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1633
}
1634
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1635
error = ncl_readdirrpc(vp, uiop, cr, td);
1636
/*
1637
* end-of-directory sets B_INVAL but does not generate an
1638
* error.
1639
*/
1640
if (error == 0 && uiop->uio_resid == bp->b_bcount)
1641
bp->b_flags |= B_INVAL;
1642
break;
1643
default:
1644
printf("ncl_doio: type %x unexpected\n", vp->v_type);
1645
break;
1646
}
1647
if (error) {
1648
bp->b_ioflags |= BIO_ERROR;
1649
bp->b_error = error;
1650
}
1651
} else {
1652
/*
1653
* If we only need to commit, try to commit
1654
*/
1655
if (bp->b_flags & B_NEEDCOMMIT) {
1656
int retv;
1657
off_t off;
1658
1659
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1660
retv = ncl_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1661
bp->b_wcred, td);
1662
if (NFSCL_FORCEDISM(vp->v_mount) || retv == 0) {
1663
bp->b_dirtyoff = bp->b_dirtyend = 0;
1664
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1665
bp->b_resid = 0;
1666
bufdone(bp);
1667
return (0);
1668
}
1669
if (retv == NFSERR_STALEWRITEVERF) {
1670
ncl_clearcommit(vp->v_mount);
1671
}
1672
}
1673
1674
/*
1675
* Setup for actual write
1676
*/
1677
NFSLOCKNODE(np);
1678
if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1679
bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1680
NFSUNLOCKNODE(np);
1681
1682
if (bp->b_dirtyend > bp->b_dirtyoff) {
1683
io.iov_len = uiop->uio_resid = bp->b_dirtyend
1684
- bp->b_dirtyoff;
1685
uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1686
+ bp->b_dirtyoff;
1687
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1688
uiop->uio_rw = UIO_WRITE;
1689
NFSINCRGLOBAL(nfsstatsv1.write_bios);
1690
1691
if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1692
iomode = NFSWRITE_UNSTABLE;
1693
else
1694
iomode = NFSWRITE_FILESYNC;
1695
1696
error = ncl_writerpc(vp, uiop, cr, &iomode, &must_commit,
1697
called_from_strategy, 0);
1698
1699
/*
1700
* When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1701
* to cluster the buffers needing commit. This will allow
1702
* the system to submit a single commit rpc for the whole
1703
* cluster. We can do this even if the buffer is not 100%
1704
* dirty (relative to the NFS blocksize), so we optimize the
1705
* append-to-file-case.
1706
*
1707
* (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1708
* cleared because write clustering only works for commit
1709
* rpc's, not for the data portion of the write).
1710
*/
1711
1712
if (!error && iomode == NFSWRITE_UNSTABLE) {
1713
bp->b_flags |= B_NEEDCOMMIT;
1714
if (bp->b_dirtyoff == 0
1715
&& bp->b_dirtyend == bp->b_bcount)
1716
bp->b_flags |= B_CLUSTEROK;
1717
} else {
1718
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1719
}
1720
1721
/*
1722
* For an interrupted write, the buffer is still valid
1723
* and the write hasn't been pushed to the server yet,
1724
* so we can't set BIO_ERROR and report the interruption
1725
* by setting B_EINTR. For the B_ASYNC case, B_EINTR
1726
* is not relevant, so the rpc attempt is essentially
1727
* a noop. For the case of a V3 write rpc not being
1728
* committed to stable storage, the block is still
1729
* dirty and requires either a commit rpc or another
1730
* write rpc with iomode == NFSV3WRITE_FILESYNC before
1731
* the block is reused. This is indicated by setting
1732
* the B_DELWRI and B_NEEDCOMMIT flags.
1733
*
1734
* EIO is returned by ncl_writerpc() to indicate a recoverable
1735
* write error and is handled as above, except that
1736
* B_EINTR isn't set. One cause of this is a stale stateid
1737
* error for the RPC that indicates recovery is required,
1738
* when called with called_from_strategy != 0.
1739
*
1740
* If the buffer is marked B_PAGING, it does not reside on
1741
* the vp's paging queues so we cannot call bdirty(). The
1742
* bp in this case is not an NFS cache block so we should
1743
* be safe. XXX
1744
*
1745
* The logic below breaks up errors into recoverable and
1746
* unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
1747
* and keep the buffer around for potential write retries.
1748
* For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
1749
* and save the error in the nfsnode. This is less than ideal
1750
* but necessary. Keeping such buffers around could potentially
1751
* cause buffer exhaustion eventually (they can never be written
1752
* out, so they will constantly be re-dirtied). It also causes
1753
* all sorts of vfs panics. For non-recoverable write errors,
1754
* also invalidate the attrcache, so we'll be forced to go over
1755
* the wire for this object, returning an error to user on next
1756
* call (most of the time).
1757
*/
1758
if (error == EINTR || error == EIO || error == ETIMEDOUT
1759
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1760
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1761
if ((bp->b_flags & B_PAGING) == 0) {
1762
bdirty(bp);
1763
bp->b_flags &= ~B_DONE;
1764
}
1765
if ((error == EINTR || error == ETIMEDOUT) &&
1766
(bp->b_flags & B_ASYNC) == 0)
1767
bp->b_flags |= B_EINTR;
1768
} else {
1769
if (error) {
1770
bp->b_ioflags |= BIO_ERROR;
1771
bp->b_flags |= B_INVAL;
1772
bp->b_error = np->n_error = error;
1773
NFSLOCKNODE(np);
1774
np->n_flag |= NWRITEERR;
1775
np->n_attrstamp = 0;
1776
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1777
NFSUNLOCKNODE(np);
1778
}
1779
bp->b_dirtyoff = bp->b_dirtyend = 0;
1780
}
1781
} else {
1782
bp->b_resid = 0;
1783
bufdone(bp);
1784
return (0);
1785
}
1786
}
1787
bp->b_resid = uiop->uio_resid;
1788
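/*
 * Editorial note (not in the original source): must_commit is expected to be
 * set to 1 by the write RPC when the server's write verifier changed, in
 * which case every B_NEEDCOMMIT buffer on the mount must be rewritten, hence
 * the mount-wide ncl_clearcommit() below.
 */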
if (must_commit == 1)
1789
ncl_clearcommit(vp->v_mount);
1790
bufdone(bp);
1791
return (error);
1792
}
1793
1794
/*
1795
* Used to aid in handling ftruncate() operations on the NFS client side.
1796
* Truncation creates a number of special problems for NFS. We have to
1797
* throw away VM pages and buffer cache buffers that are beyond EOF, and
1798
* we have to properly handle VM pages or (potentially dirty) buffers
1799
* that straddle the truncation point.
1800
*/
1801
1802
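/*
 * Editorial worked example (hypothetical numbers, not in the original
 * source): truncating a file down to nsize = 102400 bytes with biosize =
 * 32768 gives lbn = 3 and bufsize = 102400 - 3 * 32768 = 4096, so the
 * buffer straddling the new EOF is taken at 4096 bytes and its dirty range
 * is clipped to b_bcount.
 */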
int
1803
ncl_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
1804
{
1805
struct nfsnode *np = VTONFS(vp);
1806
u_quad_t tsize;
1807
int biosize = vp->v_bufobj.bo_bsize;
1808
int error = 0;
1809
1810
NFSLOCKNODE(np);
1811
tsize = np->n_size;
1812
np->n_size = nsize;
1813
NFSUNLOCKNODE(np);
1814
1815
if (nsize < tsize) {
1816
struct buf *bp;
1817
daddr_t lbn;
1818
int bufsize;
1819
1820
/*
1821
* vtruncbuf() doesn't get the buffer overlapping the
1822
* truncation point. We may have a B_DELWRI and/or B_CACHE
1823
* buffer that now needs to be truncated.
1824
*/
1825
error = vtruncbuf(vp, nsize, biosize);
1826
lbn = nsize / biosize;
1827
bufsize = nsize - (lbn * biosize);
1828
bp = nfs_getcacheblk(vp, lbn, bufsize, td);
1829
if (!bp)
1830
return EINTR;
1831
if (bp->b_dirtyoff > bp->b_bcount)
1832
bp->b_dirtyoff = bp->b_bcount;
1833
if (bp->b_dirtyend > bp->b_bcount)
1834
bp->b_dirtyend = bp->b_bcount;
1835
bp->b_flags |= B_RELBUF; /* don't leave garbage around */
1836
brelse(bp);
1837
} else {
1838
vnode_pager_setsize(vp, nsize);
1839
}
1840
return(error);
1841
}
1842
1843