GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/fs/nfsserver/nfs_nfsdcache.c

/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *    The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *     received on a TCP connection created before the
 *     entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *   seqid_refcnt > 0
 *   - not a match (go do RPC, using new cache entry)
 *   If one match left
 *   - a hit (reply from cache)
 *   else
 *   - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 * else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *     when Openowner/Lockowner is discarded)
 * else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *       this. Maybe look at the TCP connection
 *       to see if the send_tcp_sequence# is well
 *       past sent reply OR K additional RPCs
 *       replied on same socket OR?)
 *     OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *   of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *   pages 53-63. San Diego, February 1989.
 * for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP. For V3, a reply won't be saved when the flood level is
 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case. This level should be set high enough that this almost
 * never happens.
 */
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
        int error, newhighwater;

        newhighwater = nfsrc_tcphighwater;
        error = sysctl_handle_int(oidp, &newhighwater, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (newhighwater < 0)
                return (EINVAL);
        if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
                NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
        nfsrc_tcphighwater = newhighwater;
        return (0);
}
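
/*
 * Note on the handler above: raising vfs.nfsd.tcphighwater to or above the
 * current flood level also raises nfsrc_floodlevel to 20% above the new
 * high water mark (newhighwater + newhighwater / 5), keeping the hard cap
 * on saved TCP replies safely above the trim threshold.
 */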
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
        NFSV2PROC_NULL,
        NFSV2PROC_GETATTR,
        NFSV2PROC_SETATTR,
        NFSV2PROC_LOOKUP,
        NFSV2PROC_NOOP,
        NFSV2PROC_READLINK,
        NFSV2PROC_READ,
        NFSV2PROC_WRITE,
        NFSV2PROC_CREATE,
        NFSV2PROC_MKDIR,
        NFSV2PROC_SYMLINK,
        NFSV2PROC_CREATE,
        NFSV2PROC_REMOVE,
        NFSV2PROC_RMDIR,
        NFSV2PROC_RENAME,
        NFSV2PROC_LINK,
        NFSV2PROC_READDIR,
        NFSV2PROC_NOOP,
        NFSV2PROC_STATFS,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
};
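
/*
 * The table above is indexed by the generic RPC procedure number, which
 * matches the NFSv3 numbering for these first NFS_V3NPROCS procedures.
 * Procedures with no NFSv2 equivalent (Access, ReaddirPlus, Fsinfo,
 * Pathconf, Commit) map to NFSV2PROC_NOOP, and Mknod maps to the v2
 * Create procedure.
 */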

#define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define NFSRCUDPHASH(xid) \
        (&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define NFSRCHASH(xid) \
        (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
#define TRUE 1
#define FALSE 0
#define NFSRVCACHE_CHECKLEN 100
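
/*
 * nfsrc_hash() folds the high-order byte of the xid into the low bits
 * before taking the modulus, presumably to keep buckets balanced for
 * clients whose xids differ mostly in their upper bits.  The request
 * checksum is bounded to the first NFSRVCACHE_CHECKLEN (100) bytes of
 * XDR, so it is a cheap discriminator rather than a guarantee; the
 * match logic below is biased against false hits for that reason.
 */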

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        TRUE,
        TRUE,
        TRUE,
        TRUE,
        FALSE,
        TRUE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
};
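
/*
 * nfsv2_repstat[] is indexed by the NFSv2 procedure number (note the
 * newnfsv2_procid[] lookup in nfsrvd_updatecache()).  The TRUE entries
 * (Remove, Rename, Link, Symlink, Rmdir) are the v2 procedures whose
 * reply is nothing but the status word, so only rc_status needs to be
 * cached (RC_REPSTATUS) rather than a copy of the reply mbuf chain.
 */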

/*
 * Will NFS want to work over IPv6 someday?
 */
#define NETFAMILY(rp) \
        (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

        if ((rp->rc_flag & RC_UDP) != 0)
                return (&nfsrc_udpmtx);
        return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}
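
/*
 * All UDP entries share the single global nfsrc_udpmtx, while TCP entries
 * use the mutex of the hash bucket selected by their xid, so contention
 * on the TCP side is spread across NFSRVCACHE_HASHSIZE locks.
 */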

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
        int i;

        NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
                    MTX_DEF);
                mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
                    MTX_DEF);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
                LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
                LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
        }
        TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
        NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *newrp;
        int ret;

        if (nd->nd_procnum == NFSPROC_NULL)
                panic("nfsd cache null");
        newrp = malloc(sizeof (struct nfsrvcache),
            M_NFSRVCACHE, M_WAITOK);
        NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
        if (nd->nd_flag & ND_NFSV4)
                newrp->rc_flag = RC_NFSV4;
        else if (nd->nd_flag & ND_NFSV3)
                newrp->rc_flag = RC_NFSV3;
        else
                newrp->rc_flag = RC_NFSV2;
        newrp->rc_xid = nd->nd_retxid;
        newrp->rc_proc = nd->nd_procnum;
        newrp->rc_sockref = nd->nd_sockref;
        newrp->rc_cachetime = nd->nd_tcpconntime;
        if (nd->nd_flag & ND_SAMETCPCONN)
                newrp->rc_flag |= RC_SAMETCPCONN;
        if (nd->nd_nam2 != NULL) {
                newrp->rc_flag |= RC_UDP;
                ret = nfsrc_getudp(nd, newrp);
        } else {
                ret = nfsrc_gettcp(nd, newrp);
        }
        NFSEXITCODE2(0, nd);
        return (ret);
}
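
/*
 * The return value from nfsrvd_getcache()/nfsrc_getudp()/nfsrc_gettcp():
 *   RC_DOIT   - a miss; execute the RPC using the new entry (nd_rp)
 *   RC_REPLY  - a hit; nd_mreq (or nd_errp for v2 status-only replies)
 *               has been filled in from the cached reply
 *   RC_DROPIT - the original request is still in progress, so this
 *               retry is dropped without a reply
 * A non-NULL nd_nam2 is what marks the request as UDP.
 */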

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp;
        struct sockaddr_in *saddr;
        struct sockaddr_in6 *saddr6;
        struct nfsrvhashhead *hp;
        int ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
        mtx_lock(mutex);
        LIST_FOREACH(rp, hp, rc_hash) {
                if (newrp->rc_xid == rp->rc_xid &&
                    newrp->rc_proc == rp->rc_proc &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
                                (void)mtx_sleep(rp, mutex, PVFS | PDROP,
                                    "nfsrc", 10 * hz);
                                goto loop;
                        }
                        if (rp->rc_flag == 0)
                                panic("nfs udp cache0");
                        rp->rc_flag |= RC_LOCKED;
                        TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        if (rp->rc_flag & RC_INPROG) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                                mtx_unlock(mutex);
                                ret = RC_DROPIT;
                        } else if (rp->rc_flag & RC_REPSTATUS) {
                                /*
                                 * V2 only.
                                 */
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nfsrvd_rephead(nd);
                                *(nd->nd_errp) = rp->rc_status;
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                    NFSRVCACHE_UDPTIMEOUT;
                        } else if (rp->rc_flag & RC_REPMBUF) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                    M_COPYALL, M_WAITOK);
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                    NFSRVCACHE_UDPTIMEOUT;
                        } else {
                                panic("nfs udp cache1");
                        }
                        nfsrc_unlock(rp);
                        free(newrp, M_NFSRVCACHE);
                        goto out;
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
        NFSD_VNET(nfsrc_udpcachesize)++;

        newrp->rc_flag |= RC_INPROG;
        saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
        if (saddr->sin_family == AF_INET)
                newrp->rc_inet = saddr->sin_addr.s_addr;
        else if (saddr->sin_family == AF_INET6) {
                saddr6 = (struct sockaddr_in6 *)saddr;
                NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
                    sizeof (struct in6_addr));
                newrp->rc_flag |= RC_INETIPV6;
        }
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}
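
/*
 * Two details worth noting above: mtx_sleep() is called with PDROP, so
 * the mutex is released while waiting on a locked entry and is retaken
 * by the "goto loop" retry; and every hit moves the entry to the tail
 * of nfsrvudplru, so nfsrc_trimcache() sees the coldest entries first.
 */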

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *rp;
        struct nfsrvcache *retrp = NULL;
        struct mbuf *m;
        struct mtx *mutex;

        rp = nd->nd_rp;
        if (!rp)
                panic("nfsrvd_updatecache null rp");
        nd->nd_rp = NULL;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        nfsrc_lock(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_updatecache not inprog");
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
        }

        /*
         * Reply from cache is a special case returned by nfsrv_checkseqid().
         */
        if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_repstat = 0;
                if (nd->nd_mreq)
                        m_freem(nd->nd_mreq);
                if (!(rp->rc_flag & RC_REPMBUF))
                        panic("reply from cache");
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                nfsrc_unlock(rp);
                goto out;
        }

        /*
         * If rc_refcnt > 0, save it
         * For UDP, save it if ND_SAVEREPLY is set
         * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
         */
        if (nd->nd_repstat != NFSERR_DONTREPLY &&
            (rp->rc_refcnt > 0 ||
             ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
             ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
              NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
              nfsrc_tcpnonidempotent))) {
                if (rp->rc_refcnt > 0) {
                        if (!(rp->rc_flag & RC_NFSV4))
                                panic("update_cache refcnt");
                        rp->rc_flag |= RC_REFCNT;
                }
                if ((nd->nd_flag & ND_NFSV2) &&
                    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
                        rp->rc_status = nd->nd_repstat;
                        rp->rc_flag |= RC_REPSTATUS;
                        mtx_unlock(mutex);
                } else {
                        if (!(rp->rc_flag & RC_UDP)) {
                                atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
                                    1);
                                if (NFSD_VNET(nfsrc_tcpsavedreplies) >
                                    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
                                        NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
                                            NFSD_VNET(nfsrc_tcpsavedreplies);
                        }
                        mtx_unlock(mutex);
                        m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
                        mtx_lock(mutex);
                        rp->rc_reply = m;
                        rp->rc_flag |= RC_REPMBUF;
                        mtx_unlock(mutex);
                }
                if (rp->rc_flag & RC_UDP) {
                        rp->rc_timestamp = NFSD_MONOSEC +
                            NFSRVCACHE_UDPTIMEOUT;
                        nfsrc_unlock(rp);
                } else {
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                        if (rp->rc_refcnt > 0)
                                nfsrc_unlock(rp);
                        else
                                retrp = rp;
                }
        } else {
                nfsrc_freecache(rp);
                mtx_unlock(mutex);
        }

out:
        NFSEXITCODE2(0, nd);
        return (retrp);
}
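
/*
 * A non-NULL return from nfsrvd_updatecache() is a TCP entry that is
 * left locked; the caller is expected to hand it to nfsrvd_sentcache()
 * after the reply has been sent, so that the TCP sequence number to be
 * acknowledged can be recorded before the entry is unlocked.
 */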

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_delcache not in prog");
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
        struct nfsrchash_bucket *hbp;

        KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
        if (have_seq) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                rp->rc_tcpseq = seq;
                if (rp->rc_acked != RC_NO_ACK)
                        LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
                rp->rc_acked = RC_NO_ACK;
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp, *nextrp;
        int i;
        struct nfsrvcache *hitrp;
        struct nfsrvhashhead *hp, nfsrc_templist;
        int hit, ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCHASH(newrp->rc_xid);
        newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
        mtx_lock(mutex);
        hit = 1;
        LIST_INIT(&nfsrc_templist);
        /*
         * Get all the matches and put them on the temp list.
         */
        rp = LIST_FIRST(hp);
        while (rp != LIST_END(hp)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                if (newrp->rc_xid == rp->rc_xid &&
                    (!(rp->rc_flag & RC_INPROG) ||
                     ((newrp->rc_flag & RC_SAMETCPCONN) &&
                      newrp->rc_sockref == rp->rc_sockref)) &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    newrp->rc_proc == rp->rc_proc &&
                    ((newrp->rc_flag & RC_NFSV4) &&
                     newrp->rc_sockref != rp->rc_sockref &&
                     newrp->rc_cachetime >= rp->rc_cachetime)
                    && newrp->rc_reqlen == rp->rc_reqlen &&
                    newrp->rc_cksum == rp->rc_cksum) {
                        LIST_REMOVE(rp, rc_hash);
                        LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
                }
                rp = nextrp;
        }

        /*
         * Now, use nfsrc_templist to decide if there is a match.
         */
        i = 0;
        LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
                i++;
                if (rp->rc_refcnt > 0) {
                        hit = 0;
                        break;
                }
        }
        /*
         * Can be a hit only if one entry left.
         * Note possible hit entry and put nfsrc_templist back on hash
         * list.
         */
        if (i != 1)
                hit = 0;
        hitrp = rp = LIST_FIRST(&nfsrc_templist);
        while (rp != LIST_END(&nfsrc_templist)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                LIST_REMOVE(rp, rc_hash);
                LIST_INSERT_HEAD(hp, rp, rc_hash);
                rp = nextrp;
        }
        if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
                panic("nfs gettcp cache templist");

        if (hit) {
                rp = hitrp;
                if ((rp->rc_flag & RC_LOCKED) != 0) {
                        rp->rc_flag |= RC_WANTED;
                        (void)mtx_sleep(rp, mutex, PVFS | PDROP,
                            "nfsrc", 10 * hz);
                        goto tryagain;
                }
                if (rp->rc_flag == 0)
                        panic("nfs tcp cache0");
                rp->rc_flag |= RC_LOCKED;
                if (rp->rc_flag & RC_INPROG) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_DROPIT;
                } else if (rp->rc_flag & RC_REPSTATUS) {
                        /*
                         * V2 only.
                         */
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nfsrvd_rephead(nd);
                        *(nd->nd_errp) = rp->rc_status;
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else if (rp->rc_flag & RC_REPMBUF) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nd->nd_mreq = m_copym(rp->rc_reply, 0,
                            M_COPYALL, M_WAITOK);
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else {
                        panic("nfs tcp cache1");
                }
                nfsrc_unlock(rp);
                free(newrp, M_NFSRVCACHE);
                goto out;
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

        /*
         * For TCP, multiple entries for a key are allowed, so don't
         * chain it into the hash table until done.
         */
        newrp->rc_cachetime = NFSD_MONOSEC;
        newrp->rc_flag |= RC_INPROG;
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}
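
/*
 * The temp-list pass above implements the "multiple entries per key"
 * rule from the big comment at the top of this file: every candidate is
 * pulled onto nfsrc_templist, and the lookup counts as a hit only when
 * exactly one candidate remains and no candidate is seqid#-referenced
 * (rc_refcnt > 0).  Anything more ambiguous is treated as a miss,
 * trading false misses for the absence of false hits.
 */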

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_assert(mutex, MA_OWNED);
        while ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, PVFS, "nfsrc", 0);
        }
        rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_LOCKED;
        nfsrc_wanted(rp);
        mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
        if (rp->rc_flag & RC_WANTED) {
                rp->rc_flag &= ~RC_WANTED;
                wakeup((caddr_t)rp);
        }
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
        struct nfsrchash_bucket *hbp;

        LIST_REMOVE(rp, rc_hash);
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                NFSD_VNET(nfsrc_udpcachesize)--;
        } else if (rp->rc_acked != RC_NO_SEQ) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                if (rp->rc_acked == RC_NO_ACK)
                        LIST_REMOVE(rp, rc_ahash);
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_wanted(rp);
        if (rp->rc_flag & RC_REPMBUF) {
                m_freem(rp->rc_reply);
                if (!(rp->rc_flag & RC_UDP))
                        atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
        }
        free(rp, M_NFSRVCACHE);
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
        struct nfsrvcache *rp, *nextrp;
        int i;

        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
                    rc_hash, nextrp)
                        nfsrc_freecache(rp);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
                    nextrp) {
                        nfsrc_freecache(rp);
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE 16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
        struct nfsrchash_bucket *hbp;
        struct nfsrvcache *rp, *nextrp;
        int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
        time_t thisstamp;
        static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
        static int onethread = 0, oneslot = 0;

        if (sockref != 0) {
                hbp = NFSRCAHASH(sockref);
                mtx_lock(&hbp->mtx);
                LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
                        if (sockref == rp->rc_sockref) {
                                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                                        rp->rc_acked = RC_ACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                } else if (final) {
                                        rp->rc_acked = RC_NACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                }
                        }
                }
                mtx_unlock(&hbp->mtx);
        }

        if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
                return;
        if (NFSD_MONOSEC != udp_lasttrim ||
            NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
            nfsrc_udphighwater / 2)) {
                mtx_lock(&nfsrc_udpmtx);
                udp_lasttrim = NFSD_MONOSEC;
                TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
                    nextrp) {
                        if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                            && rp->rc_refcnt == 0
                            && ((rp->rc_flag & RC_REFCNT) ||
                                udp_lasttrim > rp->rc_timestamp ||
                                NFSD_VNET(nfsrc_udpcachesize) >
                                nfsrc_udphighwater))
                                nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrc_udpmtx);
        }
        if (NFSD_MONOSEC != tcp_lasttrim ||
            NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
                force = nfsrc_tcphighwater / 4;
                if (force > 0 &&
                    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
                    nfsrc_tcphighwater) {
                        for (i = 0; i < HISTSIZE; i++)
                                time_histo[i] = 0;
                        i = 0;
                        lastslot = NFSRVCACHE_HASHSIZE - 1;
                } else {
                        force = 0;
                        if (NFSD_MONOSEC != tcp_lasttrim) {
                                i = 0;
                                lastslot = NFSRVCACHE_HASHSIZE - 1;
                        } else {
                                lastslot = i = oneslot;
                                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                                        oneslot = 0;
                        }
                }
                tto = nfsrc_tcptimeout;
                tcp_lasttrim = NFSD_MONOSEC;
                for (; i <= lastslot; i++) {
                        mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        LIST_FOREACH_SAFE(rp,
                            &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
                            nextrp) {
                                if (!(rp->rc_flag &
                                    (RC_INPROG|RC_LOCKED|RC_WANTED))
                                    && rp->rc_refcnt == 0) {
                                        if ((rp->rc_flag & RC_REFCNT) ||
                                            tcp_lasttrim > rp->rc_timestamp ||
                                            rp->rc_acked == RC_ACK) {
                                                nfsrc_freecache(rp);
                                                continue;
                                        }

                                        if (force == 0)
                                                continue;
                                        /*
                                         * The timestamps range from roughly the
                                         * present (tcp_lasttrim) to the present
                                         * + nfsrc_tcptimeout. Generate a simple
                                         * histogram of where the timeouts fall.
                                         */
                                        j = rp->rc_timestamp - tcp_lasttrim;
                                        if (j >= tto)
                                                j = HISTSIZE - 1;
                                        else if (j < 0)
                                                j = 0;
                                        else
                                                j = j * HISTSIZE / tto;
                                        time_histo[j]++;
                                }
                        }
                        mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                }
                if (force) {
                        /*
                         * Trim some more with a smaller timeout of as little
                         * as 20% of nfsrc_tcptimeout to try and get below
                         * 80% of the nfsrc_tcphighwater.
                         */
                        k = 0;
                        for (i = 0; i < (HISTSIZE - 2); i++) {
                                k += time_histo[i];
                                if (k > force)
                                        break;
                        }
                        k = tto * (i + 1) / HISTSIZE;
                        if (k < 1)
                                k = 1;
                        thisstamp = tcp_lasttrim + k;
                        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                                mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                                LIST_FOREACH_SAFE(rp,
                                    &NFSD_VNET(nfsrchash_table)[i].tbl,
                                    rc_hash, nextrp) {
                                        if (!(rp->rc_flag &
                                            (RC_INPROG|RC_LOCKED|RC_WANTED))
                                            && rp->rc_refcnt == 0
                                            && ((rp->rc_flag & RC_REFCNT) ||
                                                thisstamp > rp->rc_timestamp ||
                                                rp->rc_acked == RC_ACK))
                                                nfsrc_freecache(rp);
                                }
                                mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        }
                }
        }
        atomic_store_rel_int(&onethread, 0);
}
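
/*
 * Concurrency notes for nfsrc_trimcache(): the sockref pass at the top
 * always runs, marking entries RC_ACK once snd_una shows the client has
 * received the reply (or RC_NACK when the connection is going away), but
 * the trimming itself is single-threaded via the atomic_cmpset on
 * "onethread".  When called repeatedly within the same second and the
 * saved-reply count is not near the high water mark, only one hash slot
 * ("oneslot") is scanned per call; near the mark, a full scan builds the
 * timeout histogram so the second pass can free entries using a
 * shortened effective timeout.
 */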

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        if (rp == NULL)
                /* For NFSv4.1, there is no cache entry. */
                return;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt < 0)
                panic("nfs cache refcnt");
        rp->rc_refcnt++;
        mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt <= 0)
                panic("nfs cache derefcnt");
        rp->rc_refcnt--;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
        int len = 0, cklen;
        struct mbuf *m;

        m = m1;
        while (m) {
                len += m->m_len;
                m = m->m_next;
        }
        cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
        *cksum = in_cksum(m1, cklen);
        return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
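
/*
 * nfsrc_marksametcpconn() is currently a no-op: it is the hook called
 * from nfsrc_gettcp() when a retry arrives on the same TCP connection
 * (legal for NFSv2/3 clients), and nothing presently needs to act on
 * that event.
 */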