GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/fs/nfsclient/nfs_clbio.c
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1989, 1993
5
* The Regents of the University of California. All rights reserved.
6
*
7
* This code is derived from software contributed to Berkeley by
8
* Rick Macklem at The University of Guelph.
9
*
10
* Redistribution and use in source and binary forms, with or without
11
* modification, are permitted provided that the following conditions
12
* are met:
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
* 3. Neither the name of the University nor the names of its contributors
19
* may be used to endorse or promote products derived from this software
20
* without specific prior written permission.
21
*
22
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
* SUCH DAMAGE.
33
*/
34
35
#include <sys/param.h>
36
#include <sys/systm.h>
37
#include <sys/bio.h>
38
#include <sys/buf.h>
39
#include <sys/kernel.h>
40
#include <sys/mount.h>
41
#include <sys/rwlock.h>
42
#include <sys/vmmeter.h>
43
#include <sys/vnode.h>
44
45
#include <vm/vm.h>
46
#include <vm/vm_param.h>
47
#include <vm/vm_extern.h>
48
#include <vm/vm_page.h>
49
#include <vm/vm_object.h>
50
#include <vm/vm_pager.h>
51
#include <vm/vnode_pager.h>
52
53
#include <fs/nfs/nfsport.h>
54
#include <fs/nfsclient/nfsmount.h>
55
#include <fs/nfsclient/nfs.h>
56
#include <fs/nfsclient/nfsnode.h>
57
#include <fs/nfsclient/nfs_kdtrace.h>
58
59
extern int newnfs_directio_allow_mmap;
60
extern struct nfsstatsv1 nfsstatsv1;
61
extern struct mtx ncl_iod_mutex;
62
extern int ncl_numasync;
63
extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON];
64
extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON];
65
extern int newnfs_directio_enable;
66
extern int nfs_keep_dirty_on_error;
67
68
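/*
 * Editorial note (not in the original source): ncl_pbuf_zone is the UMA zone
 * from which the legacy (non-buf-pager) getpages path below allocates a
 * temporary pager buf; it is presumably created during NFS client
 * initialization elsewhere in the client code.
 */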
uma_zone_t ncl_pbuf_zone;
69
70
static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
71
struct thread *td);
72
static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
73
struct ucred *cred, int ioflag);
74
75
/*
76
* Vnode op for VM getpages.
77
*/
78
SYSCTL_DECL(_vfs_nfs);
79
static int use_buf_pager = 1;
80
SYSCTL_INT(_vfs_nfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
81
&use_buf_pager, 0,
82
"Use buffer pager instead of direct readrpc call");
83
84
static daddr_t
85
ncl_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
86
{
87
88
return (off / vp->v_bufobj.bo_bsize);
89
}
90
91
static int
92
ncl_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
93
{
94
struct nfsnode *np;
95
u_quad_t nsize;
96
int biosize, bcount;
97
98
np = VTONFS(vp);
99
NFSLOCKNODE(np);
100
nsize = np->n_size;
101
NFSUNLOCKNODE(np);
102
103
biosize = vp->v_bufobj.bo_bsize;
104
bcount = biosize;
105
if ((off_t)lbn * biosize >= nsize)
106
bcount = 0;
107
else if ((off_t)(lbn + 1) * biosize > nsize)
108
bcount = nsize - (off_t)lbn * biosize;
109
*sz = bcount;
110
return (0);
111
}
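/*
 * Editorial worked example (hypothetical numbers, not in the original
 * source): with bo_bsize = 32768 and n_size = 102400 bytes (100 KB),
 * ncl_gbp_getblkno() maps offset 70000 to logical block 2, and
 * ncl_gbp_getblksz() returns 32768 for lbn 0..2 but only
 * 102400 - 3 * 32768 = 4096 for lbn 3, the partial block at EOF.
 */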
112
113
int
114
ncl_getpages(struct vop_getpages_args *ap)
115
{
116
int i, error, nextoff, size, toff, count, npages;
117
struct uio uio;
118
struct iovec iov;
119
vm_offset_t kva;
120
struct buf *bp;
121
struct vnode *vp;
122
struct thread *td;
123
struct ucred *cred;
124
struct nfsmount *nmp;
125
vm_object_t object;
126
vm_page_t *pages;
127
struct nfsnode *np;
128
129
vp = ap->a_vp;
130
np = VTONFS(vp);
131
td = curthread;
132
cred = curthread->td_ucred;
133
nmp = VFSTONFS(vp->v_mount);
134
pages = ap->a_m;
135
npages = ap->a_count;
136
137
if ((object = vp->v_object) == NULL) {
138
printf("ncl_getpages: called with non-merged cache vnode\n");
139
return (VM_PAGER_ERROR);
140
}
141
142
if (newnfs_directio_enable && !newnfs_directio_allow_mmap) {
143
NFSLOCKNODE(np);
144
if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
145
NFSUNLOCKNODE(np);
146
printf("ncl_getpages: called on non-cacheable vnode\n");
147
return (VM_PAGER_ERROR);
148
} else
149
NFSUNLOCKNODE(np);
150
}
151
152
mtx_lock(&nmp->nm_mtx);
153
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
154
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
155
mtx_unlock(&nmp->nm_mtx);
156
/* We'll never get here for v4, because we always have fsinfo */
157
(void)ncl_fsinfo(nmp, vp, cred, td);
158
} else
159
mtx_unlock(&nmp->nm_mtx);
160
161
if (use_buf_pager)
162
return (vfs_bio_getpages(vp, pages, npages, ap->a_rbehind,
163
ap->a_rahead, ncl_gbp_getblkno, ncl_gbp_getblksz));
164
165
/*
166
* If the requested page is partially valid, just return it and
167
* allow the pager to zero-out the blanks. Partially valid pages
168
* can only occur at the file EOF.
169
*
170
* XXXGL: is that true for NFS, where a short read can occur???
171
*/
172
VM_OBJECT_WLOCK(object);
173
if (!vm_page_none_valid(pages[npages - 1]) && --npages == 0)
174
goto out;
175
VM_OBJECT_WUNLOCK(object);
176
177
/*
178
* We use only the kva address for the buffer, but this is extremely
179
* convenient and fast.
180
*/
181
bp = uma_zalloc(ncl_pbuf_zone, M_WAITOK);
182
183
kva = (vm_offset_t) bp->b_data;
184
pmap_qenter(kva, pages, npages);
185
VM_CNT_INC(v_vnodein);
186
VM_CNT_ADD(v_vnodepgsin, npages);
187
188
count = npages << PAGE_SHIFT;
189
iov.iov_base = (caddr_t) kva;
190
iov.iov_len = count;
191
uio.uio_iov = &iov;
192
uio.uio_iovcnt = 1;
193
uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
194
uio.uio_resid = count;
195
uio.uio_segflg = UIO_SYSSPACE;
196
uio.uio_rw = UIO_READ;
197
uio.uio_td = td;
198
199
error = ncl_readrpc(vp, &uio, cred);
200
pmap_qremove(kva, npages);
201
202
uma_zfree(ncl_pbuf_zone, bp);
203
204
if (error && (uio.uio_resid == count)) {
205
printf("ncl_getpages: error %d\n", error);
206
return (VM_PAGER_ERROR);
207
}
208
209
/*
210
* Calculate the number of bytes read and validate only that number
211
* of bytes. Note that due to pending writes, size may be 0. This
212
* does not mean that the remaining data is invalid!
213
*/
214
215
size = count - uio.uio_resid;
216
VM_OBJECT_WLOCK(object);
217
for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
218
vm_page_t m;
219
nextoff = toff + PAGE_SIZE;
220
m = pages[i];
221
222
if (nextoff <= size) {
223
/*
224
* Read operation filled an entire page
225
*/
226
vm_page_valid(m);
227
KASSERT(m->dirty == 0,
228
("nfs_getpages: page %p is dirty", m));
229
} else if (size > toff) {
230
/*
231
* Read operation filled a partial page.
232
*/
233
vm_page_invalid(m);
234
vm_page_set_valid_range(m, 0, size - toff);
235
KASSERT(m->dirty == 0,
236
("nfs_getpages: page %p is dirty", m));
237
} else {
238
/*
239
* Read operation was short. If no error
240
* occurred we may have hit a zero-fill
241
* section. We leave valid set to 0, and page
242
* is freed by vm_page_readahead_finish() if
243
* its index is not equal to requested, or
244
* page is zeroed and set valid by
245
* vm_pager_get_pages() for requested page.
246
*/
247
;
248
}
249
}
250
out:
251
VM_OBJECT_WUNLOCK(object);
252
if (ap->a_rbehind)
253
*ap->a_rbehind = 0;
254
if (ap->a_rahead)
255
*ap->a_rahead = 0;
256
return (VM_PAGER_OK);
257
}
258
259
/*
260
* Vnode op for VM putpages.
261
*/
262
int
263
ncl_putpages(struct vop_putpages_args *ap)
264
{
265
struct uio uio;
266
struct iovec iov;
267
int i, error, npages, count;
268
off_t offset;
269
int *rtvals;
270
struct vnode *vp;
271
struct thread *td;
272
struct ucred *cred;
273
struct nfsmount *nmp;
274
struct nfsnode *np;
275
vm_page_t *pages;
276
277
vp = ap->a_vp;
278
np = VTONFS(vp);
279
td = curthread; /* XXX */
280
/* Set the cred to n_writecred for the write rpcs. */
281
if (np->n_writecred != NULL)
282
cred = crhold(np->n_writecred);
283
else
284
cred = crhold(curthread->td_ucred); /* XXX */
285
nmp = VFSTONFS(vp->v_mount);
286
pages = ap->a_m;
287
count = ap->a_count;
288
rtvals = ap->a_rtvals;
289
npages = btoc(count);
290
offset = IDX_TO_OFF(pages[0]->pindex);
291
292
mtx_lock(&nmp->nm_mtx);
293
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
294
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
295
mtx_unlock(&nmp->nm_mtx);
296
(void)ncl_fsinfo(nmp, vp, cred, td);
297
} else
298
mtx_unlock(&nmp->nm_mtx);
299
300
NFSLOCKNODE(np);
301
if (newnfs_directio_enable && !newnfs_directio_allow_mmap &&
302
(np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
303
NFSUNLOCKNODE(np);
304
printf("ncl_putpages: called on noncache-able vnode\n");
305
NFSLOCKNODE(np);
306
}
307
/*
308
* When putting pages, do not extend file past EOF.
309
*/
310
if (offset + count > np->n_size) {
311
count = np->n_size - offset;
312
if (count < 0)
313
count = 0;
314
}
315
NFSUNLOCKNODE(np);
316
317
for (i = 0; i < npages; i++)
318
rtvals[i] = VM_PAGER_ERROR;
319
320
VM_CNT_INC(v_vnodeout);
321
VM_CNT_ADD(v_vnodepgsout, count);
322
323
iov.iov_base = unmapped_buf;
324
iov.iov_len = count;
325
uio.uio_iov = &iov;
326
uio.uio_iovcnt = 1;
327
uio.uio_offset = offset;
328
uio.uio_resid = count;
329
uio.uio_segflg = UIO_NOCOPY;
330
uio.uio_rw = UIO_WRITE;
331
uio.uio_td = td;
332
333
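/*
 * Editorial note (not in the original source): UIO_NOCOPY together with
 * unmapped_buf means no data is actually copied by this write; the dirty
 * data already lives in the vnode's VM pages, and the buffer-cache write
 * path picks it up from there.
 */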
error = VOP_WRITE(vp, &uio, vnode_pager_putpages_ioflags(ap->a_sync),
334
cred);
335
crfree(cred);
336
337
if (error == 0 || !nfs_keep_dirty_on_error) {
338
vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid,
339
np->n_size - offset, npages * PAGE_SIZE);
340
}
341
return (rtvals[0]);
342
}
343
344
/*
345
* For nfs, cache consistency can only be maintained approximately.
346
* Although RFC1094 does not specify the criteria, the following is
347
* believed to be compatible with the reference port.
348
* For nfs:
349
* If the file's modify time on the server has changed since the
350
* last read rpc or you have written to the file,
351
* you may have lost data cache consistency with the
352
* server, so flush all of the file's data out of the cache.
353
* Then force a getattr rpc to ensure that you have up to date
354
* attributes.
355
* NB: This implies that cache data can be read when up to
356
* NFS_ATTRTIMEO seconds out of date. If you find that you need current
357
* attributes, this could be forced by setting n_attrstamp to 0 before
358
* the VOP_GETATTR() call.
359
*/
360
static inline int
361
nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
362
{
363
int error = 0;
364
struct vattr vattr;
365
struct nfsnode *np = VTONFS(vp);
366
bool old_lock;
367
368
/*
369
* Ensure the exclusive access to the node before checking
370
* whether the cache is consistent.
371
*/
372
old_lock = ncl_excl_start(vp);
373
NFSLOCKNODE(np);
374
if (np->n_flag & NMODIFIED) {
375
NFSUNLOCKNODE(np);
376
if (vp->v_type != VREG) {
377
if (vp->v_type != VDIR)
378
panic("nfs: bioread, not dir");
379
ncl_invaldir(vp);
380
error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1);
381
if (error != 0)
382
goto out;
383
}
384
np->n_attrstamp = 0;
385
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
386
error = VOP_GETATTR(vp, &vattr, cred);
387
if (error)
388
goto out;
389
NFSLOCKNODE(np);
390
np->n_mtime = vattr.va_mtime;
391
NFSUNLOCKNODE(np);
392
} else {
393
NFSUNLOCKNODE(np);
394
error = VOP_GETATTR(vp, &vattr, cred);
395
if (error)
396
goto out;
397
NFSLOCKNODE(np);
398
if ((np->n_flag & NSIZECHANGED)
399
|| (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
400
NFSUNLOCKNODE(np);
401
if (vp->v_type == VDIR)
402
ncl_invaldir(vp);
403
error = ncl_vinvalbuf(vp, V_SAVE | V_ALLOWCLEAN, td, 1);
404
if (error != 0)
405
goto out;
406
NFSLOCKNODE(np);
407
np->n_mtime = vattr.va_mtime;
408
np->n_flag &= ~NSIZECHANGED;
409
}
410
NFSUNLOCKNODE(np);
411
}
412
out:
413
ncl_excl_finish(vp, old_lock);
414
return (error);
415
}
416
417
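/*
 * Editorial comment (not in the original source): decide whether read-ahead
 * (presumably "do read-ahead", hence "dora") is safe for this vnode.
 * Read-ahead is skipped when the VM object might hold dirty pages or has
 * active write mappings, since an unlocked nfsiod read could otherwise
 * overwrite changes made through a mapping (see the comment in the VREG
 * case of ncl_bioread() below).
 */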
static bool
418
ncl_bioread_dora(struct vnode *vp)
419
{
420
vm_object_t obj;
421
422
obj = vp->v_object;
423
if (obj == NULL)
424
return (true);
425
return (!vm_object_mightbedirty(vp->v_object) &&
426
vp->v_object->un_pager.vnp.writemappings == 0);
427
}
428
429
/*
430
* Vnode op for read using bio
431
*/
432
int
433
ncl_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
434
{
435
struct nfsnode *np = VTONFS(vp);
436
struct buf *bp, *rabp;
437
struct thread *td;
438
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
439
daddr_t lbn, rabn;
440
int biosize, bcount, error, i, n, nra, on, save2, seqcount;
441
off_t tmp_off;
442
443
KASSERT(uio->uio_rw == UIO_READ, ("ncl_read mode"));
444
if (uio->uio_resid == 0)
445
return (0);
446
if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
447
return (EINVAL);
448
td = uio->uio_td;
449
450
mtx_lock(&nmp->nm_mtx);
451
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
452
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
453
mtx_unlock(&nmp->nm_mtx);
454
(void)ncl_fsinfo(nmp, vp, cred, td);
455
mtx_lock(&nmp->nm_mtx);
456
}
457
if (nmp->nm_rsize == 0 || nmp->nm_readdirsize == 0)
458
(void) newnfs_iosize(nmp);
459
460
tmp_off = uio->uio_offset + uio->uio_resid;
461
if (vp->v_type != VDIR &&
462
(tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)) {
463
mtx_unlock(&nmp->nm_mtx);
464
return (EFBIG);
465
}
466
mtx_unlock(&nmp->nm_mtx);
467
468
if (newnfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
469
/* No caching / no readaheads. Just read data into the user buffer. */
470
return ncl_readrpc(vp, uio, cred);
471
472
n = 0;
473
on = 0;
474
biosize = vp->v_bufobj.bo_bsize;
475
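/*
 * Editorial note (not in the original source): the upper bits of ioflag carry
 * the sequential-access heuristic (shifted by IO_SEQSHIFT); scaling it by the
 * block size and dividing by BKVASIZE turns it into a read-ahead budget in
 * buffers, consumed by the VREG read-ahead loop below.
 */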
seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
476
477
error = nfs_bioread_check_cons(vp, td, cred);
478
if (error)
479
return error;
480
481
save2 = curthread_pflags2_set(TDP2_SBPAGES);
482
do {
483
u_quad_t nsize;
484
485
NFSLOCKNODE(np);
486
nsize = np->n_size;
487
NFSUNLOCKNODE(np);
488
489
switch (vp->v_type) {
490
case VREG:
491
NFSINCRGLOBAL(nfsstatsv1.biocache_reads);
492
lbn = uio->uio_offset / biosize;
493
on = uio->uio_offset - (lbn * biosize);
494
495
/*
496
* Start the read ahead(s), as required. Do not do
497
* read-ahead if there are writeable mappings, since
498
* unlocked read by nfsiod could obliterate changes
499
* done by userspace.
500
*/
501
if (nmp->nm_readahead > 0 && ncl_bioread_dora(vp)) {
502
for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
503
(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
504
rabn = lbn + 1 + nra;
505
if (incore(&vp->v_bufobj, rabn) == NULL) {
506
rabp = nfs_getcacheblk(vp, rabn, biosize, td);
507
if (!rabp) {
508
error = newnfs_sigintr(nmp, td);
509
if (error == 0)
510
error = EINTR;
511
goto out;
512
}
513
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
514
rabp->b_flags |= B_ASYNC;
515
rabp->b_iocmd = BIO_READ;
516
vfs_busy_pages(rabp, 0);
517
if (ncl_asyncio(nmp, rabp, cred, td)) {
518
rabp->b_flags |= B_INVAL;
519
rabp->b_ioflags |= BIO_ERROR;
520
vfs_unbusy_pages(rabp);
521
brelse(rabp);
522
break;
523
}
524
} else {
525
brelse(rabp);
526
}
527
}
528
}
529
}
530
531
/* Note that bcount is *not* DEV_BSIZE aligned. */
532
bcount = biosize;
533
if ((off_t)lbn * biosize >= nsize) {
534
bcount = 0;
535
} else if ((off_t)(lbn + 1) * biosize > nsize) {
536
bcount = nsize - (off_t)lbn * biosize;
537
}
538
bp = nfs_getcacheblk(vp, lbn, bcount, td);
539
540
if (!bp) {
541
error = newnfs_sigintr(nmp, td);
542
if (error == 0)
543
error = EINTR;
544
goto out;
545
}
546
547
/*
548
* If B_CACHE is not set, we must issue the read. If this
549
* fails, we return an error.
550
*/
551
552
if ((bp->b_flags & B_CACHE) == 0) {
553
bp->b_iocmd = BIO_READ;
554
vfs_busy_pages(bp, 0);
555
error = ncl_doio(vp, bp, cred, td, 0);
556
if (error) {
557
brelse(bp);
558
goto out;
559
}
560
}
561
562
/*
563
* on is the offset into the current bp. Figure out how many
564
* bytes we can copy out of the bp. Note that bcount is
565
* NOT DEV_BSIZE aligned.
566
*
567
* Then figure out how many bytes we can copy into the uio.
568
*/
569
570
n = 0;
571
if (on < bcount)
572
n = MIN((unsigned)(bcount - on), uio->uio_resid);
573
break;
574
case VLNK:
575
NFSINCRGLOBAL(nfsstatsv1.biocache_readlinks);
576
bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
577
if (!bp) {
578
error = newnfs_sigintr(nmp, td);
579
if (error == 0)
580
error = EINTR;
581
goto out;
582
}
583
if ((bp->b_flags & B_CACHE) == 0) {
584
bp->b_iocmd = BIO_READ;
585
vfs_busy_pages(bp, 0);
586
error = ncl_doio(vp, bp, cred, td, 0);
587
if (error) {
588
bp->b_ioflags |= BIO_ERROR;
589
brelse(bp);
590
goto out;
591
}
592
}
593
n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
594
on = 0;
595
break;
596
case VDIR:
597
NFSINCRGLOBAL(nfsstatsv1.biocache_readdirs);
598
NFSLOCKNODE(np);
599
if (np->n_direofoffset
600
&& uio->uio_offset >= np->n_direofoffset) {
601
NFSUNLOCKNODE(np);
602
error = 0;
603
goto out;
604
}
605
NFSUNLOCKNODE(np);
606
lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
607
on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
608
bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
609
if (!bp) {
610
error = newnfs_sigintr(nmp, td);
611
if (error == 0)
612
error = EINTR;
613
goto out;
614
}
615
if ((bp->b_flags & B_CACHE) == 0) {
616
bp->b_iocmd = BIO_READ;
617
vfs_busy_pages(bp, 0);
618
error = ncl_doio(vp, bp, cred, td, 0);
619
if (error) {
620
brelse(bp);
621
}
622
while (error == NFSERR_BAD_COOKIE) {
623
ncl_invaldir(vp);
624
error = ncl_vinvalbuf(vp, 0, td, 1);
625
626
/*
627
* Yuck! The directory has been modified on the
628
* server. The only way to get the block is by
629
* reading from the beginning to get all the
630
* offset cookies.
631
*
632
* Leave the last bp intact unless there is an error.
633
* Loop back up to the while if the error is another
634
* NFSERR_BAD_COOKIE (double yuck!).
635
*/
636
for (i = 0; i <= lbn && !error; i++) {
637
NFSLOCKNODE(np);
638
if (np->n_direofoffset
639
&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
640
NFSUNLOCKNODE(np);
641
error = 0;
642
goto out;
643
}
644
NFSUNLOCKNODE(np);
645
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
646
if (!bp) {
647
error = newnfs_sigintr(nmp, td);
648
if (error == 0)
649
error = EINTR;
650
goto out;
651
}
652
if ((bp->b_flags & B_CACHE) == 0) {
653
bp->b_iocmd = BIO_READ;
654
vfs_busy_pages(bp, 0);
655
error = ncl_doio(vp, bp, cred, td, 0);
656
/*
657
* no error + B_INVAL == directory EOF,
658
* use the block.
659
*/
660
if (error == 0 && (bp->b_flags & B_INVAL))
661
break;
662
}
663
/*
664
* An error will throw away the block and the
665
* for loop will break out. If no error and this
666
* is not the block we want, we throw away the
667
* block and go for the next one via the for loop.
668
*/
669
if (error || i < lbn)
670
brelse(bp);
671
}
672
}
673
/*
674
* The above while is repeated if we hit another cookie
675
* error. If we hit an error and it wasn't a cookie error,
676
* we give up.
677
*/
678
if (error)
679
goto out;
680
}
681
682
/*
683
* If not eof and read aheads are enabled, start one.
684
* (You need the current block first, so that you have the
685
* directory offset cookie of the next block.)
686
*/
687
NFSLOCKNODE(np);
688
if (nmp->nm_readahead > 0 && ncl_bioread_dora(vp) &&
689
(bp->b_flags & B_INVAL) == 0 &&
690
(np->n_direofoffset == 0 ||
691
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
692
incore(&vp->v_bufobj, lbn + 1) == NULL) {
693
NFSUNLOCKNODE(np);
694
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
695
if (rabp) {
696
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
697
rabp->b_flags |= B_ASYNC;
698
rabp->b_iocmd = BIO_READ;
699
vfs_busy_pages(rabp, 0);
700
if (ncl_asyncio(nmp, rabp, cred, td)) {
701
rabp->b_flags |= B_INVAL;
702
rabp->b_ioflags |= BIO_ERROR;
703
vfs_unbusy_pages(rabp);
704
brelse(rabp);
705
}
706
} else {
707
brelse(rabp);
708
}
709
}
710
NFSLOCKNODE(np);
711
}
712
/*
713
* Unlike VREG files, whose buffer size ( bp->b_bcount ) is
714
* chopped for the EOF condition, we cannot tell how large
715
* NFS directories are going to be until we hit EOF. So
716
* an NFS directory buffer is *not* chopped to its EOF. Now,
717
* it just so happens that b_resid will effectively chop it
718
* to EOF. *BUT* this information is lost if the buffer goes
719
* away and is reconstituted into a B_CACHE state ( due to
720
* being VMIO ) later. So we keep track of the directory eof
721
* in np->n_direofoffset and chop it off as an extra step
722
* right here.
723
*/
724
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
725
if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
726
n = np->n_direofoffset - uio->uio_offset;
727
NFSUNLOCKNODE(np);
728
break;
729
default:
730
printf(" ncl_bioread: type %x unexpected\n", vp->v_type);
731
bp = NULL;
732
break;
733
}
734
735
if (n > 0) {
736
error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio);
737
}
738
if (vp->v_type == VLNK)
739
n = 0;
740
if (bp != NULL)
741
brelse(bp);
742
} while (error == 0 && uio->uio_resid > 0 && n > 0);
743
out:
744
curthread_pflags2_restore(save2);
745
if ((curthread->td_pflags2 & TDP2_SBPAGES) == 0) {
746
NFSLOCKNODE(np);
747
ncl_pager_setsize(vp, NULL);
748
}
749
return (error);
750
}
751
752
/*
753
* The NFS write path cannot handle iovecs with len > 1. So we need to
754
* break up iovecs accordingly (restricting them to wsize).
755
* For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
756
* For the ASYNC case, 2 copies are needed. The first is a copy from the
757
* user buffer to a staging buffer and then a second copy from the staging
758
* buffer to mbufs. This can be optimized by copying from the user buffer
759
* directly into mbufs and passing the chain down, but that requires a
760
* fair amount of re-working of the relevant codepaths (and can be done
761
* later).
762
*/
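/*
 * Editorial worked example (hypothetical numbers, not in the original
 * source): with nm_wsize = 65536 and a single 153600-byte (150 KB) user
 * iovec, the loop below issues three FILESYNC write RPCs of 65536, 65536
 * and 22528 bytes, advancing uio_offset and shrinking the iovec after
 * each one.
 */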
763
static int
764
nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred,
765
int ioflag)
766
{
767
struct uio uio;
768
struct iovec iov;
769
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
770
struct thread *td = uiop->uio_td;
771
int error, iomode, must_commit, size, wsize;
772
773
KASSERT((ioflag & IO_SYNC) != 0, ("nfs_directio_write: not sync"));
774
mtx_lock(&nmp->nm_mtx);
775
wsize = nmp->nm_wsize;
776
mtx_unlock(&nmp->nm_mtx);
777
while (uiop->uio_resid > 0) {
778
size = MIN(uiop->uio_resid, wsize);
779
size = MIN(uiop->uio_iov->iov_len, size);
780
iov.iov_base = uiop->uio_iov->iov_base;
781
iov.iov_len = size;
782
uio.uio_iov = &iov;
783
uio.uio_iovcnt = 1;
784
uio.uio_offset = uiop->uio_offset;
785
uio.uio_resid = size;
786
uio.uio_segflg = uiop->uio_segflg;
787
uio.uio_rw = UIO_WRITE;
788
uio.uio_td = td;
789
iomode = NFSWRITE_FILESYNC;
790
/*
791
* When doing direct I/O we do not care if the
792
* server's write verifier has changed, but we
793
* do not want to update the verifier if it has
794
* changed, since that hides the change from
795
* writes being done through the buffer cache.
796
* By passing in must_commit set to two, the code
797
* in nfsrpc_writerpc() will not update the
798
* verifier on the mount point.
799
*/
800
must_commit = 2;
801
error = ncl_writerpc(vp, &uio, cred, &iomode,
802
&must_commit, 0, ioflag);
803
KASSERT(must_commit == 2,
804
("ncl_directio_write: Updated write verifier"));
805
if (error != 0)
806
return (error);
807
if (iomode != NFSWRITE_FILESYNC)
808
printf("nfs_directio_write: Broken server "
809
"did not reply FILE_SYNC\n");
810
uiop->uio_offset += size;
811
uiop->uio_resid -= size;
812
if (uiop->uio_iov->iov_len <= size) {
813
uiop->uio_iovcnt--;
814
uiop->uio_iov++;
815
} else {
816
uiop->uio_iov->iov_base =
817
(char *)uiop->uio_iov->iov_base + size;
818
uiop->uio_iov->iov_len -= size;
819
}
820
}
821
return (0);
822
}
823
824
/*
825
* Vnode op for write using bio
826
*/
827
int
828
ncl_write(struct vop_write_args *ap)
829
{
830
int biosize;
831
struct uio *uio = ap->a_uio;
832
struct thread *td = uio->uio_td;
833
struct vnode *vp = ap->a_vp;
834
struct nfsnode *np = VTONFS(vp);
835
struct ucred *cred = ap->a_cred;
836
int ioflag = ap->a_ioflag;
837
struct buf *bp;
838
struct vattr vattr;
839
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
840
daddr_t lbn;
841
int bcount, noncontig_write, obcount;
842
int bp_cached, n, on, error = 0, error1, save2, wouldcommit;
843
size_t orig_resid, local_resid;
844
off_t orig_size, tmp_off;
845
struct timespec ts;
846
847
KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
848
KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
849
("ncl_write proc"));
850
if (vp->v_type != VREG)
851
return (EIO);
852
NFSLOCKNODE(np);
853
if (np->n_flag & NWRITEERR) {
854
np->n_flag &= ~NWRITEERR;
855
NFSUNLOCKNODE(np);
856
return (np->n_error);
857
} else
858
NFSUNLOCKNODE(np);
859
mtx_lock(&nmp->nm_mtx);
860
if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
861
(nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
862
mtx_unlock(&nmp->nm_mtx);
863
(void)ncl_fsinfo(nmp, vp, cred, td);
864
mtx_lock(&nmp->nm_mtx);
865
}
866
if (nmp->nm_wsize == 0)
867
(void) newnfs_iosize(nmp);
868
mtx_unlock(&nmp->nm_mtx);
869
870
/*
871
* Synchronously flush pending buffers if we are in synchronous
872
* mode or if we are appending.
873
*/
874
if ((ioflag & IO_APPEND) || ((ioflag & IO_SYNC) && (np->n_flag &
875
NMODIFIED))) {
876
/*
877
* For the case where IO_APPEND is being done using a
878
* direct output (to the NFS server) RPC and
879
* newnfs_directio_enable is 0, all buffer cache buffers,
880
* including ones not modified, must be invalidated.
881
* This ensures that stale data is not read out of the
882
* buffer cache. The call also invalidates all mapped
883
* pages and, since the exclusive lock is held on the vnode,
884
* new pages cannot be faulted in.
885
*
886
* For the case where newnfs_directio_enable is set
887
* (which is not the default), it is not obvious that
888
* stale data should be left in the buffer cache, but
889
* the code has been this way for over a decade without
890
* complaints. Note that, unlike doing IO_APPEND via
891
* a direct write RPC when newnfs_directio_enable is not set,
892
* when newnfs_directio_enable is set, reading is done via
893
* direct to NFS server RPCs as well.
894
*/
895
np->n_attrstamp = 0;
896
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
897
error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag &
898
IO_VMIO) != 0 ? V_VMIO : 0), td, 1);
899
if (error != 0)
900
return (error);
901
}
902
903
orig_resid = uio->uio_resid;
904
NFSLOCKNODE(np);
905
orig_size = np->n_size;
906
NFSUNLOCKNODE(np);
907
908
/*
909
* If IO_APPEND then load uio_offset. We restart here if we cannot
910
* get the append lock.
911
*/
912
if (ioflag & IO_APPEND) {
913
/*
914
* For NFSv4, the AppendWrite will Verify the size against
915
* the file's size on the server. If not the same, the
916
* write will then be retried, using the file size returned
917
* by the AppendWrite. However, for NFSv2 and NFSv3, the
918
* size must be acquired here via a Getattr RPC.
919
* The AppendWrite is not done for a pNFS mount.
920
*/
921
if (!NFSHASNFSV4(nmp) || NFSHASPNFS(nmp)) {
922
np->n_attrstamp = 0;
923
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
924
error = VOP_GETATTR(vp, &vattr, cred);
925
if (error)
926
return (error);
927
}
928
NFSLOCKNODE(np);
929
uio->uio_offset = np->n_size;
930
NFSUNLOCKNODE(np);
931
}
932
933
if (uio->uio_offset < 0)
934
return (EINVAL);
935
tmp_off = uio->uio_offset + uio->uio_resid;
936
if (tmp_off > nmp->nm_maxfilesize || tmp_off < uio->uio_offset)
937
return (EFBIG);
938
if (uio->uio_resid == 0)
939
return (0);
940
941
/*
942
* Do IO_APPEND writing via a synchronous direct write.
943
* This can result in a significant performance improvement.
944
*/
945
if ((newnfs_directio_enable && (ioflag & IO_DIRECT)) ||
946
(ioflag & IO_APPEND)) {
947
/*
948
* Direct writes to the server must be done NFSWRITE_FILESYNC,
949
* because the write data is not cached and, therefore, the
950
* write cannot be redone after a server reboot.
951
* Set IO_SYNC to make this happen.
952
*/
953
ioflag |= IO_SYNC;
954
return (nfs_directio_write(vp, uio, cred, ioflag));
955
}
956
957
/*
958
* Maybe this should be above the vnode op call, but so long as
959
* file servers have no limits, I don't think it matters.
960
*/
961
error = vn_rlimit_fsize(vp, uio, td);
962
if (error != 0)
963
return (error);
964
965
save2 = curthread_pflags2_set(TDP2_SBPAGES);
966
biosize = vp->v_bufobj.bo_bsize;
967
/*
968
* Find all of this file's B_NEEDCOMMIT buffers. If our writes
969
* would exceed the local maximum per-file write commit size when
970
* combined with those, we must decide whether to flush,
971
* go synchronous, or return error. We don't bother checking
972
* IO_UNIT -- we just make all writes atomic anyway, as there's
973
* no point optimizing for something that really won't ever happen.
974
*/
975
wouldcommit = 0;
976
if (!(ioflag & IO_SYNC)) {
977
int nflag;
978
979
NFSLOCKNODE(np);
980
nflag = np->n_flag;
981
NFSUNLOCKNODE(np);
982
if (nflag & NMODIFIED) {
983
BO_LOCK(&vp->v_bufobj);
984
if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
985
TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
986
b_bobufs) {
987
if (bp->b_flags & B_NEEDCOMMIT)
988
wouldcommit += bp->b_bcount;
989
}
990
}
991
BO_UNLOCK(&vp->v_bufobj);
992
}
993
}
994
995
do {
996
if (!(ioflag & IO_SYNC)) {
997
wouldcommit += biosize;
998
if (wouldcommit > nmp->nm_wcommitsize) {
999
np->n_attrstamp = 0;
1000
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1001
error = ncl_vinvalbuf(vp, V_SAVE | ((ioflag &
1002
IO_VMIO) != 0 ? V_VMIO : 0), td, 1);
1003
if (error != 0)
1004
goto out;
1005
wouldcommit = biosize;
1006
}
1007
}
1008
1009
NFSINCRGLOBAL(nfsstatsv1.biocache_writes);
1010
lbn = uio->uio_offset / biosize;
1011
on = uio->uio_offset - (lbn * biosize);
1012
n = MIN((unsigned)(biosize - on), uio->uio_resid);
1013
again:
1014
/*
1015
* Handle direct append and file extension cases, calculate
1016
* unaligned buffer size.
1017
*/
1018
NFSLOCKNODE(np);
1019
if ((np->n_flag & NHASBEENLOCKED) == 0 &&
1020
(nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0)
1021
noncontig_write = 1;
1022
else
1023
noncontig_write = 0;
1024
if ((uio->uio_offset == np->n_size ||
1025
(noncontig_write != 0 &&
1026
lbn == (np->n_size / biosize) &&
1027
uio->uio_offset + n > np->n_size)) && n) {
1028
NFSUNLOCKNODE(np);
1029
/*
1030
* Get the buffer (in its pre-append state to maintain
1031
* B_CACHE if it was previously set). Resize the
1032
* nfsnode after we have locked the buffer to prevent
1033
* readers from reading garbage.
1034
*/
1035
obcount = np->n_size - (lbn * biosize);
1036
bp = nfs_getcacheblk(vp, lbn, obcount, td);
1037
1038
if (bp != NULL) {
1039
long save;
1040
1041
NFSLOCKNODE(np);
1042
np->n_size = uio->uio_offset + n;
1043
np->n_flag |= NMODIFIED;
1044
np->n_flag &= ~NVNSETSZSKIP;
1045
vnode_pager_setsize(vp, np->n_size);
1046
NFSUNLOCKNODE(np);
1047
1048
save = bp->b_flags & B_CACHE;
1049
bcount = on + n;
1050
allocbuf(bp, bcount);
1051
bp->b_flags |= save;
1052
if (noncontig_write != 0 && on > obcount)
1053
vfs_bio_bzero_buf(bp, obcount, on -
1054
obcount);
1055
}
1056
} else {
1057
/*
1058
* Obtain the locked cache block first, and then
1059
* adjust the file's size as appropriate.
1060
*/
1061
bcount = on + n;
1062
if ((off_t)lbn * biosize + bcount < np->n_size) {
1063
if ((off_t)(lbn + 1) * biosize < np->n_size)
1064
bcount = biosize;
1065
else
1066
bcount = np->n_size - (off_t)lbn * biosize;
1067
}
1068
NFSUNLOCKNODE(np);
1069
bp = nfs_getcacheblk(vp, lbn, bcount, td);
1070
NFSLOCKNODE(np);
1071
if (uio->uio_offset + n > np->n_size) {
1072
np->n_size = uio->uio_offset + n;
1073
np->n_flag |= NMODIFIED;
1074
np->n_flag &= ~NVNSETSZSKIP;
1075
vnode_pager_setsize(vp, np->n_size);
1076
}
1077
NFSUNLOCKNODE(np);
1078
}
1079
1080
if (!bp) {
1081
error = newnfs_sigintr(nmp, td);
1082
if (!error)
1083
error = EINTR;
1084
break;
1085
}
1086
1087
/*
1088
* Issue a READ if B_CACHE is not set. In special-append
1089
* mode, B_CACHE is based on the buffer prior to the write
1090
* op and is typically set, avoiding the read. If a read
1091
* is required in special append mode, the server will
1092
* probably send us a short-read since we extended the file
1093
* on our end, resulting in b_resid == 0 and, thus,
1094
* B_CACHE getting set.
1095
*
1096
* We can also avoid issuing the read if the write covers
1097
* the entire buffer. We have to make sure the buffer state
1098
* is reasonable in this case since we will not be initiating
1099
* I/O. See the comments in kern/vfs_bio.c's getblk() for
1100
* more information.
1101
*
1102
* B_CACHE may also be set due to the buffer being cached
1103
* normally.
1104
*/
1105
1106
bp_cached = 1;
1107
if (on == 0 && n == bcount) {
1108
if ((bp->b_flags & B_CACHE) == 0)
1109
bp_cached = 0;
1110
bp->b_flags |= B_CACHE;
1111
bp->b_flags &= ~B_INVAL;
1112
bp->b_ioflags &= ~BIO_ERROR;
1113
}
1114
1115
if ((bp->b_flags & B_CACHE) == 0) {
1116
bp->b_iocmd = BIO_READ;
1117
vfs_busy_pages(bp, 0);
1118
error = ncl_doio(vp, bp, cred, td, 0);
1119
if (error) {
1120
brelse(bp);
1121
break;
1122
}
1123
}
1124
if (bp->b_wcred == NOCRED)
1125
bp->b_wcred = crhold(cred);
1126
NFSLOCKNODE(np);
1127
np->n_flag |= NMODIFIED;
1128
NFSUNLOCKNODE(np);
1129
1130
/*
1131
* If dirtyend exceeds file size, chop it down. This should
1132
* not normally occur but there is an append race where it
1133
* might occur XXX, so we log it.
1134
*
1135
* If the chopping creates a reverse-indexed or degenerate
1136
* situation with dirtyoff/end, we 0 both of them.
1137
*/
1138
1139
if (bp->b_dirtyend > bcount) {
1140
printf("NFS append race @%lx:%d\n",
1141
(long)bp->b_blkno * DEV_BSIZE,
1142
bp->b_dirtyend - bcount);
1143
bp->b_dirtyend = bcount;
1144
}
1145
1146
if (bp->b_dirtyoff >= bp->b_dirtyend)
1147
bp->b_dirtyoff = bp->b_dirtyend = 0;
1148
1149
/*
1150
* If the new write will leave a contiguous dirty
1151
* area, just update the b_dirtyoff and b_dirtyend,
1152
* otherwise force a write rpc of the old dirty area.
1153
*
1154
* If there has been a file lock applied to this file
1155
* or vfs.nfs.old_noncontig_writing is set, do the following:
1156
* While it is possible to merge discontiguous writes due to
1157
* our having a B_CACHE buffer ( and thus valid read data
1158
* for the hole), we don't because it could lead to
1159
* significant cache coherency problems with multiple clients,
1160
* especially if locking is implemented later on.
1161
*
1162
* If vfs.nfs.old_noncontig_writing is not set and there has
1163
* not been file locking done on this file:
1164
* Relax coherency a bit for the sake of performance and
1165
* expand the current dirty region to contain the new
1166
* write even if it means we mark some non-dirty data as
1167
* dirty.
1168
*/
1169
1170
if (noncontig_write == 0 && bp->b_dirtyend > 0 &&
1171
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1172
if (bwrite(bp) == EINTR) {
1173
error = EINTR;
1174
break;
1175
}
1176
goto again;
1177
}
1178
1179
local_resid = uio->uio_resid;
1180
error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio);
1181
1182
if (error != 0 && !bp_cached) {
1183
/*
1184
* This block has no other content than what
1185
* possibly was written by the faulty uiomove.
1186
* Release it, forgetting the data pages, to
1187
* prevent the leak of uninitialized data to
1188
* usermode.
1189
*/
1190
bp->b_ioflags |= BIO_ERROR;
1191
brelse(bp);
1192
uio->uio_offset -= local_resid - uio->uio_resid;
1193
uio->uio_resid = local_resid;
1194
break;
1195
}
1196
1197
/*
1198
* Since this block is being modified, it must be written
1199
* again and not just committed. Since write clustering does
1200
* not work for the stage 1 data write, only the stage 2
1201
* commit rpc, we have to clear B_CLUSTEROK as well.
1202
*/
1203
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1204
1205
/*
1206
* Get the partial update on the progress made from
1207
* uiomove, if an error occurred.
1208
*/
1209
if (error != 0)
1210
n = local_resid - uio->uio_resid;
1211
1212
/*
1213
* Only update dirtyoff/dirtyend if not a degenerate
1214
* condition.
1215
*/
1216
if (n > 0) {
1217
if (bp->b_dirtyend > 0) {
1218
bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1219
bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1220
} else {
1221
bp->b_dirtyoff = on;
1222
bp->b_dirtyend = on + n;
1223
}
1224
vfs_bio_set_valid(bp, on, n);
1225
}
1226
1227
/*
1228
* If IO_SYNC do bwrite().
1229
*
1230
* IO_INVAL appears to be unused. The idea appears to be
1231
* to turn off caching in this case. Very odd. XXX
1232
*/
1233
if ((ioflag & IO_SYNC)) {
1234
if (ioflag & IO_INVAL)
1235
bp->b_flags |= B_NOCACHE;
1236
error1 = bwrite(bp);
1237
if (error1 != 0) {
1238
if (error == 0)
1239
error = error1;
1240
break;
1241
}
1242
} else if ((n + on) == biosize || (ioflag & IO_ASYNC) != 0) {
1243
bp->b_flags |= B_ASYNC;
1244
(void) bwrite(bp);
1245
} else {
1246
bdwrite(bp);
1247
}
1248
1249
if (error != 0)
1250
break;
1251
} while (uio->uio_resid > 0 && n > 0);
1252
1253
if (error == 0) {
1254
nanouptime(&ts);
1255
NFSLOCKNODE(np);
1256
np->n_localmodtime = ts;
1257
NFSUNLOCKNODE(np);
1258
} else {
1259
if (ioflag & IO_UNIT) {
1260
VATTR_NULL(&vattr);
1261
vattr.va_size = orig_size;
1262
/* IO_SYNC is handled implicitly */
1263
(void)VOP_SETATTR(vp, &vattr, cred);
1264
uio->uio_offset -= orig_resid - uio->uio_resid;
1265
uio->uio_resid = orig_resid;
1266
}
1267
}
1268
1269
out:
1270
curthread_pflags2_restore(save2);
1271
return (error);
1272
}
1273
1274
/*
1275
* Get an nfs cache block.
1276
*
1277
* Allocate a new one if the block isn't currently in the cache
1278
* and return the block marked busy. If the calling process is
1279
* interrupted by a signal for an interruptible mount point, return
1280
* NULL.
1281
*
1282
* The caller must carefully deal with the possible B_INVAL state of
1283
* the buffer. ncl_doio() clears B_INVAL (and ncl_asyncio() clears it
1284
* indirectly), so synchronous reads can be issued without worrying about
1285
* the B_INVAL state. We have to be a little more careful when dealing
1286
* with writes (see comments in nfs_write()) when extending a file past
1287
* its EOF.
1288
*/
1289
static struct buf *
1290
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
1291
{
1292
struct buf *bp;
1293
struct mount *mp;
1294
struct nfsmount *nmp;
1295
1296
mp = vp->v_mount;
1297
nmp = VFSTONFS(mp);
1298
1299
if (nmp->nm_flag & NFSMNT_INT) {
1300
sigset_t oldset;
1301
1302
newnfs_set_sigmask(td, &oldset);
1303
bp = getblk(vp, bn, size, PCATCH, 0, 0);
1304
newnfs_restore_sigmask(td, &oldset);
1305
while (bp == NULL) {
1306
if (newnfs_sigintr(nmp, td))
1307
return (NULL);
1308
bp = getblk(vp, bn, size, 0, 2 * hz, 0);
1309
}
1310
} else {
1311
bp = getblk(vp, bn, size, 0, 0, 0);
1312
}
1313
1314
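/*
 * Editorial note (not in the original source): for regular files the logical
 * block number is converted to DEV_BSIZE units below; e.g. with bo_bsize =
 * 32768 and DEV_BSIZE = 512, logical block 3 becomes b_blkno 3 * 64 = 192.
 */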
if (vp->v_type == VREG)
1315
bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE);
1316
return (bp);
1317
}
1318
1319
/*
1320
* Flush and invalidate all dirty buffers. If another process is already
1321
* doing the flush, just wait for completion.
1322
*/
1323
int
1324
ncl_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
1325
{
1326
struct nfsnode *np = VTONFS(vp);
1327
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1328
int error = 0, slpflag, slptimeo;
1329
bool old_lock;
1330
struct timespec ts;
1331
1332
ASSERT_VOP_LOCKED(vp, "ncl_vinvalbuf");
1333
1334
if ((nmp->nm_flag & NFSMNT_INT) == 0)
1335
intrflg = 0;
1336
if (NFSCL_FORCEDISM(nmp->nm_mountp))
1337
intrflg = 1;
1338
if (intrflg) {
1339
slpflag = PCATCH;
1340
slptimeo = 2 * hz;
1341
} else {
1342
slpflag = 0;
1343
slptimeo = 0;
1344
}
1345
1346
old_lock = ncl_excl_start(vp);
1347
if (old_lock)
1348
flags |= V_ALLOWCLEAN;
1349
1350
/*
1351
* Now, flush as required.
1352
*/
1353
if ((flags & (V_SAVE | V_VMIO)) == V_SAVE) {
1354
vnode_pager_clean_sync(vp);
1355
1356
/*
1357
* If the page clean was interrupted, fail the invalidation.
1358
* Not doing so, we run the risk of losing dirty pages in the
1359
* vinvalbuf() call below.
1360
*/
1361
if (intrflg && (error = newnfs_sigintr(nmp, td)))
1362
goto out;
1363
}
1364
1365
error = vinvalbuf(vp, flags, slpflag, 0);
1366
while (error) {
1367
if (intrflg && (error = newnfs_sigintr(nmp, td)))
1368
goto out;
1369
error = vinvalbuf(vp, flags, 0, slptimeo);
1370
}
1371
if (NFSHASPNFS(nmp)) {
1372
nfscl_layoutcommit(vp, td);
1373
nanouptime(&ts);
1374
/*
1375
* Invalidate the attribute cache, since writes to a DS
1376
* won't update the size attribute.
1377
*/
1378
NFSLOCKNODE(np);
1379
np->n_attrstamp = 0;
1380
} else {
1381
nanouptime(&ts);
1382
NFSLOCKNODE(np);
1383
}
1384
if ((np->n_flag & NMODIFIED) != 0) {
1385
np->n_localmodtime = ts;
1386
np->n_flag &= ~NMODIFIED;
1387
}
1388
NFSUNLOCKNODE(np);
1389
out:
1390
ncl_excl_finish(vp, old_lock);
1391
return error;
1392
}
1393
1394
/*
1395
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
1396
* This is mainly to avoid queueing async I/O requests when the nfsiods
1397
* are all hung on a dead server.
1398
*
1399
* Note: ncl_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1400
* is eventually dequeued by the async daemon, ncl_doio() *will*.
1401
*/
1402
int
1403
ncl_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
1404
{
1405
int iod;
1406
int gotiod;
1407
int slpflag = 0;
1408
int slptimeo = 0;
1409
int error, error2;
1410
1411
/*
1412
* Commits are usually short and sweet so let's save some CPU and
1413
* leave the async daemons for more important rpc's (such as reads
1414
* and writes).
1415
*
1416
* Readdirplus RPCs do vget()s to acquire the vnodes for entries
1417
* in the directory in order to update attributes. This can deadlock
1418
* with another thread that is waiting for async I/O to be done by
1419
* an nfsiod thread while holding a lock on one of these vnodes.
1420
* To avoid this deadlock, don't allow the async nfsiod threads to
1421
* perform Readdirplus RPCs.
1422
*/
1423
NFSLOCKIOD();
1424
if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1425
(nmp->nm_bufqiods > ncl_numasync / 2)) ||
1426
(bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) {
1427
NFSUNLOCKIOD();
1428
return(EIO);
1429
}
1430
again:
1431
if (nmp->nm_flag & NFSMNT_INT)
1432
slpflag = PCATCH;
1433
gotiod = FALSE;
1434
1435
/*
1436
* Find a free iod to process this request.
1437
*/
1438
for (iod = 0; iod < ncl_numasync; iod++)
1439
if (ncl_iodwant[iod] == NFSIOD_AVAILABLE) {
1440
gotiod = TRUE;
1441
break;
1442
}
1443
1444
/*
1445
* Try to create one if none are free.
1446
*/
1447
if (!gotiod)
1448
ncl_nfsiodnew();
1449
else {
1450
/*
1451
* Found one, so wake it up and tell it which
1452
* mount to process.
1453
*/
1454
NFS_DPF(ASYNCIO, ("ncl_asyncio: waking iod %d for mount %p\n",
1455
iod, nmp));
1456
ncl_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
1457
ncl_iodmount[iod] = nmp;
1458
nmp->nm_bufqiods++;
1459
wakeup(&ncl_iodwant[iod]);
1460
}
1461
1462
/*
1463
* If none are free, we may already have an iod working on this mount
1464
* point. If so, it will process our request.
1465
*/
1466
if (!gotiod) {
1467
if (nmp->nm_bufqiods > 0) {
1468
NFS_DPF(ASYNCIO,
1469
("ncl_asyncio: %d iods are already processing mount %p\n",
1470
nmp->nm_bufqiods, nmp));
1471
gotiod = TRUE;
1472
}
1473
}
1474
1475
/*
1476
* If we have an iod which can process the request, then queue
1477
* the buffer.
1478
*/
1479
if (gotiod) {
1480
/*
1481
* Ensure that the queue never grows too large. We still want
1482
* to asynchronize, so we block rather than return EIO.
1483
*/
1484
while (nmp->nm_bufqlen >= 2*ncl_numasync) {
1485
NFS_DPF(ASYNCIO,
1486
("ncl_asyncio: waiting for mount %p queue to drain\n", nmp));
1487
nmp->nm_bufqwant = TRUE;
1488
error = newnfs_msleep(td, &nmp->nm_bufq,
1489
&ncl_iod_mutex, slpflag | PRIBIO, "nfsaio",
1490
slptimeo);
1491
if (error) {
1492
error2 = newnfs_sigintr(nmp, td);
1493
if (error2) {
1494
NFSUNLOCKIOD();
1495
return (error2);
1496
}
1497
if (slpflag == PCATCH) {
1498
slpflag = 0;
1499
slptimeo = 2 * hz;
1500
}
1501
}
1502
/*
1503
* We might have lost our iod while sleeping,
1504
* so check and loop if necessary.
1505
*/
1506
goto again;
1507
}
1508
1509
/* We might have lost our nfsiod */
1510
if (nmp->nm_bufqiods == 0) {
1511
NFS_DPF(ASYNCIO,
1512
("ncl_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1513
goto again;
1514
}
1515
1516
if (bp->b_iocmd == BIO_READ) {
1517
if (bp->b_rcred == NOCRED && cred != NOCRED)
1518
bp->b_rcred = crhold(cred);
1519
} else {
1520
if (bp->b_wcred == NOCRED && cred != NOCRED)
1521
bp->b_wcred = crhold(cred);
1522
}
1523
1524
if (bp->b_flags & B_REMFREE)
1525
bremfreef(bp);
1526
BUF_KERNPROC(bp);
1527
TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1528
nmp->nm_bufqlen++;
1529
KASSERT((bp->b_flags & B_DIRECT) == 0,
1530
("ncl_asyncio: B_DIRECT set"));
1531
NFSUNLOCKIOD();
1532
return (0);
1533
}
1534
1535
NFSUNLOCKIOD();
1536
1537
/*
1538
* All the iods are busy on other mounts, so return EIO to
1539
* force the caller to process the i/o synchronously.
1540
*/
1541
NFS_DPF(ASYNCIO, ("ncl_asyncio: no iods available, i/o is synchronous\n"));
1542
return (EIO);
1543
}
1544
1545
/*
1546
* Do an I/O operation to/from a cache block. This may be called
1547
* synchronously or from an nfsiod.
1548
*/
1549
int
1550
ncl_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td,
1551
int called_from_strategy)
1552
{
1553
struct uio *uiop;
1554
struct nfsnode *np;
1555
struct nfsmount *nmp;
1556
int error = 0, iomode, must_commit = 0;
1557
struct uio uio;
1558
struct iovec io;
1559
struct proc *p = td ? td->td_proc : NULL;
1560
uint8_t iocmd;
1561
1562
np = VTONFS(vp);
1563
nmp = VFSTONFS(vp->v_mount);
1564
uiop = &uio;
1565
uiop->uio_iov = &io;
1566
uiop->uio_iovcnt = 1;
1567
uiop->uio_segflg = UIO_SYSSPACE;
1568
uiop->uio_td = td;
1569
1570
/*
1571
* clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We
1572
* do this here so we do not have to do it in all the code that
1573
* calls us.
1574
*/
1575
bp->b_flags &= ~B_INVAL;
1576
bp->b_ioflags &= ~BIO_ERROR;
1577
1578
KASSERT(!(bp->b_flags & B_DONE), ("ncl_doio: bp %p already marked done", bp));
1579
iocmd = bp->b_iocmd;
1580
if (iocmd == BIO_READ) {
1581
io.iov_len = uiop->uio_resid = bp->b_bcount;
1582
io.iov_base = bp->b_data;
1583
uiop->uio_rw = UIO_READ;
1584
1585
switch (vp->v_type) {
1586
case VREG:
1587
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1588
NFSINCRGLOBAL(nfsstatsv1.read_bios);
1589
error = ncl_readrpc(vp, uiop, cr);
1590
1591
if (!error) {
1592
if (uiop->uio_resid) {
1593
/*
1594
* If we had a short read with no error, we must have
1595
* hit a file hole. We should zero-fill the remainder.
1596
* This can also occur if the server hits the file EOF.
1597
*
1598
* Holes used to be able to occur due to pending
1599
* writes, but that is not possible any longer.
1600
*/
1601
int nread = bp->b_bcount - uiop->uio_resid;
1602
ssize_t left = uiop->uio_resid;
1603
1604
if (left > 0)
1605
bzero((char *)bp->b_data + nread, left);
1606
uiop->uio_resid = 0;
1607
}
1608
}
1609
/* ASSERT_VOP_LOCKED(vp, "ncl_doio"); */
1610
if (p && vp->v_writecount <= -1) {
1611
NFSLOCKNODE(np);
1612
if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.na_mtime)) {
1613
NFSUNLOCKNODE(np);
1614
PROC_LOCK(p);
1615
killproc(p, "text file modification");
1616
PROC_UNLOCK(p);
1617
} else
1618
NFSUNLOCKNODE(np);
1619
}
1620
break;
1621
case VLNK:
1622
uiop->uio_offset = (off_t)0;
1623
NFSINCRGLOBAL(nfsstatsv1.readlink_bios);
1624
error = ncl_readlinkrpc(vp, uiop, cr);
1625
break;
1626
case VDIR:
1627
NFSINCRGLOBAL(nfsstatsv1.readdir_bios);
1628
uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1629
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
1630
error = ncl_readdirplusrpc(vp, uiop, cr, td);
1631
if (error == NFSERR_NOTSUPP)
1632
nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1633
}
1634
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1635
error = ncl_readdirrpc(vp, uiop, cr, td);
1636
/*
1637
* end-of-directory sets B_INVAL but does not generate an
1638
* error.
1639
*/
1640
if (error == 0 && uiop->uio_resid == bp->b_bcount)
1641
bp->b_flags |= B_INVAL;
1642
break;
1643
default:
1644
printf("ncl_doio: type %x unexpected\n", vp->v_type);
1645
break;
1646
}
1647
if (error) {
1648
bp->b_ioflags |= BIO_ERROR;
1649
bp->b_error = error;
1650
}
1651
} else {
1652
/*
1653
* If we only need to commit, try to commit
1654
*/
1655
if (bp->b_flags & B_NEEDCOMMIT) {
1656
int retv;
1657
off_t off;
1658
1659
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1660
retv = ncl_commit(vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1661
bp->b_wcred, td);
1662
if (NFSCL_FORCEDISM(vp->v_mount) || retv == 0) {
1663
bp->b_dirtyoff = bp->b_dirtyend = 0;
1664
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1665
bp->b_resid = 0;
1666
bufdone(bp);
1667
return (0);
1668
}
1669
if (retv == NFSERR_STALEWRITEVERF) {
1670
ncl_clearcommit(vp->v_mount);
1671
}
1672
}
1673
1674
/*
1675
* Setup for actual write
1676
*/
1677
NFSLOCKNODE(np);
1678
if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1679
bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1680
NFSUNLOCKNODE(np);
1681
1682
if (bp->b_dirtyend > bp->b_dirtyoff) {
1683
io.iov_len = uiop->uio_resid = bp->b_dirtyend
1684
- bp->b_dirtyoff;
1685
uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1686
+ bp->b_dirtyoff;
1687
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1688
uiop->uio_rw = UIO_WRITE;
1689
NFSINCRGLOBAL(nfsstatsv1.write_bios);
1690
1691
if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1692
iomode = NFSWRITE_UNSTABLE;
1693
else
1694
iomode = NFSWRITE_FILESYNC;
1695
1696
error = ncl_writerpc(vp, uiop, cr, &iomode, &must_commit,
1697
called_from_strategy, 0);
1698
1699
/*
1700
* When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1701
* to cluster the buffers needing commit. This will allow
1702
* the system to submit a single commit rpc for the whole
1703
* cluster. We can do this even if the buffer is not 100%
1704
* dirty (relative to the NFS blocksize), so we optimize the
1705
* append-to-file-case.
1706
*
1707
* (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1708
* cleared because write clustering only works for commit
1709
* rpc's, not for the data portion of the write).
1710
*/
1711
1712
if (!error && iomode == NFSWRITE_UNSTABLE) {
1713
bp->b_flags |= B_NEEDCOMMIT;
1714
if (bp->b_dirtyoff == 0
1715
&& bp->b_dirtyend == bp->b_bcount)
1716
bp->b_flags |= B_CLUSTEROK;
1717
} else {
1718
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1719
}
1720
1721
/*
1722
* For an interrupted write, the buffer is still valid
1723
* and the write hasn't been pushed to the server yet,
1724
* so we can't set BIO_ERROR and report the interruption
1725
* by setting B_EINTR. For the B_ASYNC case, B_EINTR
1726
* is not relevant, so the rpc attempt is essentially
1727
* a noop. For the case of a V3 write rpc not being
1728
* committed to stable storage, the block is still
1729
* dirty and requires either a commit rpc or another
1730
* write rpc with iomode == NFSV3WRITE_FILESYNC before
1731
* the block is reused. This is indicated by setting
1732
* the B_DELWRI and B_NEEDCOMMIT flags.
1733
*
1734
* EIO is returned by ncl_writerpc() to indicate a recoverable
1735
* write error and is handled as above, except that
1736
* B_EINTR isn't set. One cause of this is a stale stateid
1737
* error for the RPC that indicates recovery is required,
1738
* when called with called_from_strategy != 0.
1739
*
1740
* If the buffer is marked B_PAGING, it does not reside on
1741
* the vp's paging queues so we cannot call bdirty(). The
1742
* bp in this case is not an NFS cache block so we should
1743
* be safe. XXX
1744
*
1745
* The logic below breaks up errors into recoverable and
1746
* unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
1747
* and keep the buffer around for potential write retries.
1748
* For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
1749
* and save the error in the nfsnode. This is less than ideal
1750
* but necessary. Keeping such buffers around could potentially
1751
* cause buffer exhaustion eventually (they can never be written
1752
* out, so they will constantly be re-dirtied). It also causes
1753
* all sorts of vfs panics. For non-recoverable write errors,
1754
* also invalidate the attrcache, so we'll be forced to go over
1755
* the wire for this object, returning an error to user on next
1756
* call (most of the time).
1757
*/
1758
if (error == EINTR || error == EIO || error == ETIMEDOUT
1759
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1760
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1761
if ((bp->b_flags & B_PAGING) == 0) {
1762
bdirty(bp);
1763
bp->b_flags &= ~B_DONE;
1764
}
1765
if ((error == EINTR || error == ETIMEDOUT) &&
1766
(bp->b_flags & B_ASYNC) == 0)
1767
bp->b_flags |= B_EINTR;
1768
} else {
1769
if (error) {
1770
bp->b_ioflags |= BIO_ERROR;
1771
bp->b_flags |= B_INVAL;
1772
bp->b_error = np->n_error = error;
1773
NFSLOCKNODE(np);
1774
np->n_flag |= NWRITEERR;
1775
np->n_attrstamp = 0;
1776
KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1777
NFSUNLOCKNODE(np);
1778
}
1779
bp->b_dirtyoff = bp->b_dirtyend = 0;
1780
}
1781
} else {
1782
bp->b_resid = 0;
1783
bufdone(bp);
1784
return (0);
1785
}
1786
}
1787
bp->b_resid = uiop->uio_resid;
1788
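/*
 * Editorial note (not in the original source): must_commit is expected to be
 * set to 1 by the write RPC when the server's write verifier changed, in
 * which case every B_NEEDCOMMIT buffer on the mount must be rewritten, hence
 * the mount-wide ncl_clearcommit() below.
 */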
if (must_commit == 1)
1789
ncl_clearcommit(vp->v_mount);
1790
bufdone(bp);
1791
return (error);
1792
}
1793
1794
/*
1795
* Used to aid in handling ftruncate() operations on the NFS client side.
1796
* Truncation creates a number of special problems for NFS. We have to
1797
* throw away VM pages and buffer cache buffers that are beyond EOF, and
1798
* we have to properly handle VM pages or (potentially dirty) buffers
1799
* that straddle the truncation point.
1800
*/
1801
1802
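/*
 * Editorial worked example (hypothetical numbers, not in the original
 * source): truncating a file down to nsize = 102400 bytes with biosize =
 * 32768 gives lbn = 3 and bufsize = 102400 - 3 * 32768 = 4096, so the
 * buffer straddling the new EOF is taken at 4096 bytes and its dirty range
 * is clipped to b_bcount.
 */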
int
1803
ncl_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
1804
{
1805
struct nfsnode *np = VTONFS(vp);
1806
u_quad_t tsize;
1807
int biosize = vp->v_bufobj.bo_bsize;
1808
int error = 0;
1809
1810
NFSLOCKNODE(np);
1811
tsize = np->n_size;
1812
np->n_size = nsize;
1813
NFSUNLOCKNODE(np);
1814
1815
if (nsize < tsize) {
1816
struct buf *bp;
1817
daddr_t lbn;
1818
int bufsize;
1819
1820
/*
1821
* vtruncbuf() doesn't get the buffer overlapping the
1822
* truncation point. We may have a B_DELWRI and/or B_CACHE
1823
* buffer that now needs to be truncated.
1824
*/
1825
error = vtruncbuf(vp, nsize, biosize);
1826
lbn = nsize / biosize;
1827
bufsize = nsize - (lbn * biosize);
1828
bp = nfs_getcacheblk(vp, lbn, bufsize, td);
1829
if (!bp)
1830
return EINTR;
1831
if (bp->b_dirtyoff > bp->b_bcount)
1832
bp->b_dirtyoff = bp->b_bcount;
1833
if (bp->b_dirtyend > bp->b_bcount)
1834
bp->b_dirtyend = bp->b_bcount;
1835
bp->b_flags |= B_RELBUF; /* don't leave garbage around */
1836
brelse(bp);
1837
} else {
1838
vnode_pager_setsize(vp, nsize);
1839
}
1840
return(error);
1841
}
1842
1843