GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/fs/nfsserver/nfs_nfsdcache.c

/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *    The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * - key on <xid, NFS version> (as noted above, there can be several
 *   entries with the same key)
 *   When a request arrives:
 *   For all that match key
 *   - if RPC# != OR request_size !=
 *     - not a match with this one
 *   - if NFSv4 and received on same TCP socket OR
 *     received on a TCP connection created before the
 *     entry was cached
 *     - not a match with this one
 *     (V2,3 clients might retry on same TCP socket)
 *   - calculate checksum on first N bytes of NFS XDR
 *   - if checksum !=
 *     - not a match for this one
 *   If any of the remaining ones that match has a
 *   seqid_refcnt > 0
 *   - not a match (go do RPC, using new cache entry)
 *   If one match left
 *   - a hit (reply from cache)
 *   else
 *   - miss (go do RPC, using new cache entry)
 *
 * During processing of NFSv4 request:
 * - set a flag when a non-idempotent Op is processed
 * - when an Op that uses a seqid# (Open,...) is processed
 *   - if same seqid# as referenced entry in cache
 *     - free new cache entry
 *     - reply from referenced cache entry
 *   else if next seqid# in order
 *     - free referenced cache entry
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *   else if first seqid# in sequence
 *     - increment seqid_refcnt on new cache entry
 *     - set pointer from Openowner/Lockowner to
 *       new cache entry (aka reference it)
 *
 * At end of RPC processing:
 * - if seqid_refcnt > 0 OR flagged non-idempotent on new
 *   cache entry
 *   - save reply in cache entry
 *   - calculate checksum on first N bytes of NFS XDR
 *     request
 *   - note op and length of XDR request (in bytes)
 *   - timestamp it
 * else
 *   - free new cache entry
 * - Send reply (noting info for socket activity check, below)
 *
 * For cache entries saved above:
 * - if saved since seqid_refcnt was > 0
 *   - free when seqid_refcnt decrements to 0
 *     (when next one in sequence is processed above, or
 *     when Openowner/Lockowner is discarded)
 * else { non-idempotent Op(s) }
 *   - free when
 *     - some further activity observed on same
 *       socket
 *       (I'm not yet sure how I'm going to do
 *       this. Maybe look at the TCP connection
 *       to see if the send_tcp_sequence# is well
 *       past sent reply OR K additional RPCs
 *       replied on same socket OR?)
 *     OR
 *     - when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 *   - if RPC marked In_progress
 *     - discard request (don't send reply)
 *   else
 *     - reply from cache
 *     - timestamp cache entry
 * else
 *   - add entry to cache, marked In_progress
 *   - do RPC
 *   - when RPC done
 *     - if RPC# non-idempotent
 *       - mark entry Done (not In_progress)
 *       - save reply
 *       - timestamp cache entry
 *     else
 *       - free cache entry
 *     - send reply
 *
 * Later, entries with saved replies are free'd a short time (few minutes)
 * after reply sent (timestamp).
 * Reference: Chet Juszczak, "Improving the Performance and Correctness
 *   of an NFS Server", in Proc. Winter 1989 USENIX Conference,
 *   pages 53-63. San Diego, February 1989.
 * for the UDP case.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 * for TCP. For V3, a reply won't be saved when the flood level is
 * hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 * that case. This level should be set high enough that this almost
 * never happens.
 */
#include <fs/nfs/nfsport.h>

extern struct mtx nfsrc_udpmtx;

NFSD_VNET_DECLARE(struct nfsrvhashhead *, nfsrvudphashtbl);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrchash_table);
NFSD_VNET_DECLARE(struct nfsrchash_bucket *, nfsrcahash_table);
NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);

NFSD_VNET_DEFINE(int, nfsrc_floodlevel) = NFSRVCACHE_FLOODLEVEL;
NFSD_VNET_DEFINE(int, nfsrc_tcpsavedreplies) = 0;

SYSCTL_DECL(_vfs_nfsd);

static u_int nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
        int error, newhighwater;

        newhighwater = nfsrc_tcphighwater;
        error = sysctl_handle_int(oidp, &newhighwater, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (newhighwater < 0)
                return (EINVAL);
        if (newhighwater >= NFSD_VNET(nfsrc_floodlevel))
                NFSD_VNET(nfsrc_floodlevel) = newhighwater + newhighwater / 5;
        nfsrc_tcphighwater = newhighwater;
        return (0);
}
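
/*
 * Note on the handler above: raising vfs.nfsd.tcphighwater to or above the
 * current flood level also raises nfsrc_floodlevel to 20% above the new
 * high water mark (newhighwater + newhighwater / 5), keeping the hard cap
 * on saved TCP replies safely above the trim threshold.
 */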
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrc_tcphighwater),
    sysctl_tcphighwater, "IU", "High water mark for TCP cache entries");

static u_int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

NFSD_VNET_DEFINE_STATIC(int, nfsrc_udpcachesize) = 0;
NFSD_VNET_DEFINE_STATIC(TAILQ_HEAD(, nfsrvcache), nfsrvudplru);

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
        NFSV2PROC_NULL,
        NFSV2PROC_GETATTR,
        NFSV2PROC_SETATTR,
        NFSV2PROC_LOOKUP,
        NFSV2PROC_NOOP,
        NFSV2PROC_READLINK,
        NFSV2PROC_READ,
        NFSV2PROC_WRITE,
        NFSV2PROC_CREATE,
        NFSV2PROC_MKDIR,
        NFSV2PROC_SYMLINK,
        NFSV2PROC_CREATE,
        NFSV2PROC_REMOVE,
        NFSV2PROC_RMDIR,
        NFSV2PROC_RENAME,
        NFSV2PROC_LINK,
        NFSV2PROC_READDIR,
        NFSV2PROC_NOOP,
        NFSV2PROC_STATFS,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
        NFSV2PROC_NOOP,
};
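
/*
 * The table above is indexed by the generic RPC procedure number, which
 * matches the NFSv3 numbering for these first NFS_V3NPROCS procedures.
 * Procedures with no NFSv2 equivalent (Access, ReaddirPlus, Fsinfo,
 * Pathconf, Commit) map to NFSV2PROC_NOOP, and Mknod maps to the v2
 * Create procedure.
 */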

#define nfsrc_hash(xid) (((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define NFSRCUDPHASH(xid) \
        (&NFSD_VNET(nfsrvudphashtbl)[nfsrc_hash(xid)])
#define NFSRCHASH(xid) \
        (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(xid)].tbl)
#define NFSRCAHASH(xid) (&NFSD_VNET(nfsrcahash_table)[nfsrc_hash(xid)])
#define TRUE 1
#define FALSE 0
#define NFSRVCACHE_CHECKLEN 100
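
/*
 * nfsrc_hash() folds the high-order byte of the xid into the low bits
 * before taking the modulus, presumably to keep buckets balanced for
 * clients whose xids differ mostly in their upper bits.  The request
 * checksum is bounded to the first NFSRVCACHE_CHECKLEN (100) bytes of
 * XDR, so it is a cheap discriminator rather than a guarantee; the
 * match logic below is biased against false hits for that reason.
 */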

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        TRUE,
        TRUE,
        TRUE,
        TRUE,
        FALSE,
        TRUE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
        FALSE,
};
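
/*
 * nfsv2_repstat[] is indexed by the NFSv2 procedure number (note the
 * newnfsv2_procid[] lookup in nfsrvd_updatecache()).  The TRUE entries
 * (Remove, Rename, Link, Symlink, Rmdir) are the v2 procedures whose
 * reply is nothing but the status word, so only rc_status needs to be
 * cached (RC_REPSTATUS) rather than a copy of the reply mbuf chain.
 */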

/*
 * Will NFS want to work over IPv6 someday?
 */
#define NETFAMILY(rp) \
        (((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

        if ((rp->rc_flag & RC_UDP) != 0)
                return (&nfsrc_udpmtx);
        return (&NFSD_VNET(nfsrchash_table)[nfsrc_hash(rp->rc_xid)].mtx);
}
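
/*
 * All UDP entries share the single global nfsrc_udpmtx, while TCP entries
 * use the mutex of the hash bucket selected by their xid, so contention
 * on the TCP side is spread across NFSRVCACHE_HASHSIZE locks.
 */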

/*
 * Initialize the server request cache list
 */
void
nfsrvd_initcache(void)
{
        int i;

        NFSD_VNET(nfsrvudphashtbl) = malloc(sizeof(struct nfsrvhashhead) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrchash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        NFSD_VNET(nfsrcahash_table) = malloc(sizeof(struct nfsrchash_bucket) *
            NFSRVCACHE_HASHSIZE, M_NFSRVCACHE, M_WAITOK | M_ZERO);
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                mtx_init(&NFSD_VNET(nfsrchash_table)[i].mtx, "nfsrtc", NULL,
                    MTX_DEF);
                mtx_init(&NFSD_VNET(nfsrcahash_table)[i].mtx, "nfsrtca", NULL,
                    MTX_DEF);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_INIT(&NFSD_VNET(nfsrvudphashtbl)[i]);
                LIST_INIT(&NFSD_VNET(nfsrchash_table)[i].tbl);
                LIST_INIT(&NFSD_VNET(nfsrcahash_table)[i].tbl);
        }
        TAILQ_INIT(&NFSD_VNET(nfsrvudplru));
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
        NFSD_VNET(nfsrc_udpcachesize) = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *newrp;
        int ret;

        if (nd->nd_procnum == NFSPROC_NULL)
                panic("nfsd cache null");
        newrp = malloc(sizeof (struct nfsrvcache),
            M_NFSRVCACHE, M_WAITOK);
        NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
        if (nd->nd_flag & ND_NFSV4)
                newrp->rc_flag = RC_NFSV4;
        else if (nd->nd_flag & ND_NFSV3)
                newrp->rc_flag = RC_NFSV3;
        else
                newrp->rc_flag = RC_NFSV2;
        newrp->rc_xid = nd->nd_retxid;
        newrp->rc_proc = nd->nd_procnum;
        newrp->rc_sockref = nd->nd_sockref;
        newrp->rc_cachetime = nd->nd_tcpconntime;
        if (nd->nd_flag & ND_SAMETCPCONN)
                newrp->rc_flag |= RC_SAMETCPCONN;
        if (nd->nd_nam2 != NULL) {
                newrp->rc_flag |= RC_UDP;
                ret = nfsrc_getudp(nd, newrp);
        } else {
                ret = nfsrc_gettcp(nd, newrp);
        }
        NFSEXITCODE2(0, nd);
        return (ret);
}
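
/*
 * The return value from nfsrvd_getcache()/nfsrc_getudp()/nfsrc_gettcp():
 *   RC_DOIT   - a miss; execute the RPC using the new entry (nd_rp)
 *   RC_REPLY  - a hit; nd_mreq (or nd_errp for v2 status-only replies)
 *               has been filled in from the cached reply
 *   RC_DROPIT - the original request is still in progress, so this
 *               retry is dropped without a reply
 * A non-NULL nd_nam2 is what marks the request as UDP.
 */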

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp;
        struct sockaddr_in *saddr;
        struct sockaddr_in6 *saddr6;
        struct nfsrvhashhead *hp;
        int ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
        mtx_lock(mutex);
        LIST_FOREACH(rp, hp, rc_hash) {
                if (newrp->rc_xid == rp->rc_xid &&
                    newrp->rc_proc == rp->rc_proc &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
                        if ((rp->rc_flag & RC_LOCKED) != 0) {
                                rp->rc_flag |= RC_WANTED;
                                (void)mtx_sleep(rp, mutex, PVFS | PDROP,
                                    "nfsrc", 10 * hz);
                                goto loop;
                        }
                        if (rp->rc_flag == 0)
                                panic("nfs udp cache0");
                        rp->rc_flag |= RC_LOCKED;
                        TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                        if (rp->rc_flag & RC_INPROG) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                                mtx_unlock(mutex);
                                ret = RC_DROPIT;
                        } else if (rp->rc_flag & RC_REPSTATUS) {
                                /*
                                 * V2 only.
                                 */
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nfsrvd_rephead(nd);
                                *(nd->nd_errp) = rp->rc_status;
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                    NFSRVCACHE_UDPTIMEOUT;
                        } else if (rp->rc_flag & RC_REPMBUF) {
                                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                                mtx_unlock(mutex);
                                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                                    M_COPYALL, M_WAITOK);
                                ret = RC_REPLY;
                                rp->rc_timestamp = NFSD_MONOSEC +
                                    NFSRVCACHE_UDPTIMEOUT;
                        } else {
                                panic("nfs udp cache1");
                        }
                        nfsrc_unlock(rp);
                        free(newrp, M_NFSRVCACHE);
                        goto out;
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);
        NFSD_VNET(nfsrc_udpcachesize)++;

        newrp->rc_flag |= RC_INPROG;
        saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
        if (saddr->sin_family == AF_INET)
                newrp->rc_inet = saddr->sin_addr.s_addr;
        else if (saddr->sin_family == AF_INET6) {
                saddr6 = (struct sockaddr_in6 *)saddr;
                NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
                    sizeof (struct in6_addr));
                newrp->rc_flag |= RC_INETIPV6;
        }
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), newrp, rc_lru);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}
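
/*
 * Two details worth noting above: mtx_sleep() is called with PDROP, so
 * the mutex is released while waiting on a locked entry and is retaken
 * by the "goto loop" retry; and every hit moves the entry to the tail
 * of nfsrvudplru, so nfsrc_trimcache() sees the coldest entries first.
 */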

/*
 * Update a request cache entry after the rpc has been done
 */
struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
        struct nfsrvcache *rp;
        struct nfsrvcache *retrp = NULL;
        struct mbuf *m;
        struct mtx *mutex;

        rp = nd->nd_rp;
        if (!rp)
                panic("nfsrvd_updatecache null rp");
        nd->nd_rp = NULL;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        nfsrc_lock(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_updatecache not inprog");
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                TAILQ_INSERT_TAIL(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
        }

        /*
         * Reply from cache is a special case returned by nfsrv_checkseqid().
         */
        if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
                NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                mtx_unlock(mutex);
                nd->nd_repstat = 0;
                if (nd->nd_mreq)
                        m_freem(nd->nd_mreq);
                if (!(rp->rc_flag & RC_REPMBUF))
                        panic("reply from cache");
                nd->nd_mreq = m_copym(rp->rc_reply, 0,
                    M_COPYALL, M_WAITOK);
                rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                nfsrc_unlock(rp);
                goto out;
        }

        /*
         * If rc_refcnt > 0, save it
         * For UDP, save it if ND_SAVEREPLY is set
         * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
         */
        if (nd->nd_repstat != NFSERR_DONTREPLY &&
            (rp->rc_refcnt > 0 ||
             ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
             ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
              NFSD_VNET(nfsrc_tcpsavedreplies) <= NFSD_VNET(nfsrc_floodlevel) &&
              nfsrc_tcpnonidempotent))) {
                if (rp->rc_refcnt > 0) {
                        if (!(rp->rc_flag & RC_NFSV4))
                                panic("update_cache refcnt");
                        rp->rc_flag |= RC_REFCNT;
                }
                if ((nd->nd_flag & ND_NFSV2) &&
                    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
                        rp->rc_status = nd->nd_repstat;
                        rp->rc_flag |= RC_REPSTATUS;
                        mtx_unlock(mutex);
                } else {
                        if (!(rp->rc_flag & RC_UDP)) {
                                atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies),
                                    1);
                                if (NFSD_VNET(nfsrc_tcpsavedreplies) >
                                    NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak)
                                        NFSD_VNET(nfsstatsv1_p)->srvcache_tcppeak =
                                            NFSD_VNET(nfsrc_tcpsavedreplies);
                        }
                        mtx_unlock(mutex);
                        m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
                        mtx_lock(mutex);
                        rp->rc_reply = m;
                        rp->rc_flag |= RC_REPMBUF;
                        mtx_unlock(mutex);
                }
                if (rp->rc_flag & RC_UDP) {
                        rp->rc_timestamp = NFSD_MONOSEC +
                            NFSRVCACHE_UDPTIMEOUT;
                        nfsrc_unlock(rp);
                } else {
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                        if (rp->rc_refcnt > 0)
                                nfsrc_unlock(rp);
                        else
                                retrp = rp;
                }
        } else {
                nfsrc_freecache(rp);
                mtx_unlock(mutex);
        }

out:
        NFSEXITCODE2(0, nd);
        return (retrp);
}
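
/*
 * A non-NULL return from nfsrvd_updatecache() is a TCP entry that is
 * left locked; the caller is expected to hand it to nfsrvd_sentcache()
 * after the reply has been sent, so that the TCP sequence number to be
 * acknowledged can be recorded before the entry is unlocked.
 */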

/*
 * Invalidate and, if possible, free an in prog cache entry.
 * Must not sleep.
 */
void
nfsrvd_delcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        if (!(rp->rc_flag & RC_INPROG))
                panic("nfsrvd_delcache not in prog");
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_INPROG;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
        struct nfsrchash_bucket *hbp;

        KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
        if (have_seq) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                rp->rc_tcpseq = seq;
                if (rp->rc_acked != RC_NO_ACK)
                        LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
                rp->rc_acked = RC_NO_ACK;
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
        struct nfsrvcache *rp, *nextrp;
        int i;
        struct nfsrvcache *hitrp;
        struct nfsrvhashhead *hp, nfsrc_templist;
        int hit, ret = 0;
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(newrp);
        hp = NFSRCHASH(newrp->rc_xid);
        newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
        mtx_lock(mutex);
        hit = 1;
        LIST_INIT(&nfsrc_templist);
        /*
         * Get all the matches and put them on the temp list.
         */
        rp = LIST_FIRST(hp);
        while (rp != LIST_END(hp)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                if (newrp->rc_xid == rp->rc_xid &&
                    (!(rp->rc_flag & RC_INPROG) ||
                     ((newrp->rc_flag & RC_SAMETCPCONN) &&
                      newrp->rc_sockref == rp->rc_sockref)) &&
                    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
                    newrp->rc_proc == rp->rc_proc &&
                    ((newrp->rc_flag & RC_NFSV4) &&
                     newrp->rc_sockref != rp->rc_sockref &&
                     newrp->rc_cachetime >= rp->rc_cachetime)
                    && newrp->rc_reqlen == rp->rc_reqlen &&
                    newrp->rc_cksum == rp->rc_cksum) {
                        LIST_REMOVE(rp, rc_hash);
                        LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
                }
                rp = nextrp;
        }

        /*
         * Now, use nfsrc_templist to decide if there is a match.
         */
        i = 0;
        LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
                i++;
                if (rp->rc_refcnt > 0) {
                        hit = 0;
                        break;
                }
        }
        /*
         * Can be a hit only if one entry left.
         * Note possible hit entry and put nfsrc_templist back on hash
         * list.
         */
        if (i != 1)
                hit = 0;
        hitrp = rp = LIST_FIRST(&nfsrc_templist);
        while (rp != LIST_END(&nfsrc_templist)) {
                nextrp = LIST_NEXT(rp, rc_hash);
                LIST_REMOVE(rp, rc_hash);
                LIST_INSERT_HEAD(hp, rp, rc_hash);
                rp = nextrp;
        }
        if (LIST_FIRST(&nfsrc_templist) != LIST_END(&nfsrc_templist))
                panic("nfs gettcp cache templist");

        if (hit) {
                rp = hitrp;
                if ((rp->rc_flag & RC_LOCKED) != 0) {
                        rp->rc_flag |= RC_WANTED;
                        (void)mtx_sleep(rp, mutex, PVFS | PDROP,
                            "nfsrc", 10 * hz);
                        goto tryagain;
                }
                if (rp->rc_flag == 0)
                        panic("nfs tcp cache0");
                rp->rc_flag |= RC_LOCKED;
                if (rp->rc_flag & RC_INPROG) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_inproghits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_DROPIT;
                } else if (rp->rc_flag & RC_REPSTATUS) {
                        /*
                         * V2 only.
                         */
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nfsrvd_rephead(nd);
                        *(nd->nd_errp) = rp->rc_status;
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else if (rp->rc_flag & RC_REPMBUF) {
                        NFSD_VNET(nfsstatsv1_p)->srvcache_nonidemdonehits++;
                        mtx_unlock(mutex);
                        if (newrp->rc_sockref == rp->rc_sockref)
                                nfsrc_marksametcpconn(rp->rc_sockref);
                        ret = RC_REPLY;
                        nd->nd_mreq = m_copym(rp->rc_reply, 0,
                            M_COPYALL, M_WAITOK);
                        rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
                } else {
                        panic("nfs tcp cache1");
                }
                nfsrc_unlock(rp);
                free(newrp, M_NFSRVCACHE);
                goto out;
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_misses++;
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, 1);

        /*
         * For TCP, multiple entries for a key are allowed, so don't
         * chain it into the hash table until done.
         */
        newrp->rc_cachetime = NFSD_MONOSEC;
        newrp->rc_flag |= RC_INPROG;
        LIST_INSERT_HEAD(hp, newrp, rc_hash);
        mtx_unlock(mutex);
        nd->nd_rp = newrp;
        ret = RC_DOIT;

out:
        NFSEXITCODE2(0, nd);
        return (ret);
}
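
/*
 * The temp-list pass above implements the "multiple entries per key"
 * rule from the big comment at the top of this file: every candidate is
 * pulled onto nfsrc_templist, and the lookup counts as a hit only when
 * exactly one candidate remains and no candidate is seqid#-referenced
 * (rc_refcnt > 0).  Anything more ambiguous is treated as a miss,
 * trading false misses for the absence of false hits.
 */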

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_assert(mutex, MA_OWNED);
        while ((rp->rc_flag & RC_LOCKED) != 0) {
                rp->rc_flag |= RC_WANTED;
                (void)mtx_sleep(rp, mutex, PVFS, "nfsrc", 0);
        }
        rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        rp->rc_flag &= ~RC_LOCKED;
        nfsrc_wanted(rp);
        mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
        if (rp->rc_flag & RC_WANTED) {
                rp->rc_flag &= ~RC_WANTED;
                wakeup((caddr_t)rp);
        }
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
        struct nfsrchash_bucket *hbp;

        LIST_REMOVE(rp, rc_hash);
        if (rp->rc_flag & RC_UDP) {
                TAILQ_REMOVE(&NFSD_VNET(nfsrvudplru), rp, rc_lru);
                NFSD_VNET(nfsrc_udpcachesize)--;
        } else if (rp->rc_acked != RC_NO_SEQ) {
                hbp = NFSRCAHASH(rp->rc_sockref);
                mtx_lock(&hbp->mtx);
                if (rp->rc_acked == RC_NO_ACK)
                        LIST_REMOVE(rp, rc_ahash);
                mtx_unlock(&hbp->mtx);
        }
        nfsrc_wanted(rp);
        if (rp->rc_flag & RC_REPMBUF) {
                m_freem(rp->rc_reply);
                if (!(rp->rc_flag & RC_UDP))
                        atomic_add_int(&NFSD_VNET(nfsrc_tcpsavedreplies), -1);
        }
        free(rp, M_NFSRVCACHE);
        atomic_add_int(&NFSD_VNET(nfsstatsv1_p)->srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
void
nfsrvd_cleancache(void)
{
        struct nfsrvcache *rp, *nextrp;
        int i;

        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrchash_table)[i].tbl,
                    rc_hash, nextrp)
                        nfsrc_freecache(rp);
        }
        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                LIST_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudphashtbl)[i], rc_hash,
                    nextrp) {
                        nfsrc_freecache(rp);
                }
        }
        NFSD_VNET(nfsstatsv1_p)->srvcache_size = 0;
        NFSD_VNET(nfsrc_tcpsavedreplies) = 0;
}

#define HISTSIZE 16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
        struct nfsrchash_bucket *hbp;
        struct nfsrvcache *rp, *nextrp;
        int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
        time_t thisstamp;
        static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
        static int onethread = 0, oneslot = 0;

        if (sockref != 0) {
                hbp = NFSRCAHASH(sockref);
                mtx_lock(&hbp->mtx);
                LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
                        if (sockref == rp->rc_sockref) {
                                if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
                                        rp->rc_acked = RC_ACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                } else if (final) {
                                        rp->rc_acked = RC_NACK;
                                        LIST_REMOVE(rp, rc_ahash);
                                }
                        }
                }
                mtx_unlock(&hbp->mtx);
        }

        if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
                return;
        if (NFSD_MONOSEC != udp_lasttrim ||
            NFSD_VNET(nfsrc_udpcachesize) >= (nfsrc_udphighwater +
            nfsrc_udphighwater / 2)) {
                mtx_lock(&nfsrc_udpmtx);
                udp_lasttrim = NFSD_MONOSEC;
                TAILQ_FOREACH_SAFE(rp, &NFSD_VNET(nfsrvudplru), rc_lru,
                    nextrp) {
                        if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
                            && rp->rc_refcnt == 0
                            && ((rp->rc_flag & RC_REFCNT) ||
                                udp_lasttrim > rp->rc_timestamp ||
                                NFSD_VNET(nfsrc_udpcachesize) >
                                nfsrc_udphighwater))
                                nfsrc_freecache(rp);
                }
                mtx_unlock(&nfsrc_udpmtx);
        }
        if (NFSD_MONOSEC != tcp_lasttrim ||
            NFSD_VNET(nfsrc_tcpsavedreplies) >= nfsrc_tcphighwater) {
                force = nfsrc_tcphighwater / 4;
                if (force > 0 &&
                    NFSD_VNET(nfsrc_tcpsavedreplies) + force >=
                    nfsrc_tcphighwater) {
                        for (i = 0; i < HISTSIZE; i++)
                                time_histo[i] = 0;
                        i = 0;
                        lastslot = NFSRVCACHE_HASHSIZE - 1;
                } else {
                        force = 0;
                        if (NFSD_MONOSEC != tcp_lasttrim) {
                                i = 0;
                                lastslot = NFSRVCACHE_HASHSIZE - 1;
                        } else {
                                lastslot = i = oneslot;
                                if (++oneslot >= NFSRVCACHE_HASHSIZE)
                                        oneslot = 0;
                        }
                }
                tto = nfsrc_tcptimeout;
                tcp_lasttrim = NFSD_MONOSEC;
                for (; i <= lastslot; i++) {
                        mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        LIST_FOREACH_SAFE(rp,
                            &NFSD_VNET(nfsrchash_table)[i].tbl, rc_hash,
                            nextrp) {
                                if (!(rp->rc_flag &
                                    (RC_INPROG|RC_LOCKED|RC_WANTED))
                                    && rp->rc_refcnt == 0) {
                                        if ((rp->rc_flag & RC_REFCNT) ||
                                            tcp_lasttrim > rp->rc_timestamp ||
                                            rp->rc_acked == RC_ACK) {
                                                nfsrc_freecache(rp);
                                                continue;
                                        }

                                        if (force == 0)
                                                continue;
                                        /*
                                         * The timestamps range from roughly the
                                         * present (tcp_lasttrim) to the present
                                         * + nfsrc_tcptimeout. Generate a simple
                                         * histogram of where the timeouts fall.
                                         */
                                        j = rp->rc_timestamp - tcp_lasttrim;
                                        if (j >= tto)
                                                j = HISTSIZE - 1;
                                        else if (j < 0)
                                                j = 0;
                                        else
                                                j = j * HISTSIZE / tto;
                                        time_histo[j]++;
                                }
                        }
                        mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                }
                if (force) {
                        /*
                         * Trim some more with a smaller timeout of as little
                         * as 20% of nfsrc_tcptimeout to try and get below
                         * 80% of the nfsrc_tcphighwater.
                         */
                        k = 0;
                        for (i = 0; i < (HISTSIZE - 2); i++) {
                                k += time_histo[i];
                                if (k > force)
                                        break;
                        }
                        k = tto * (i + 1) / HISTSIZE;
                        if (k < 1)
                                k = 1;
                        thisstamp = tcp_lasttrim + k;
                        for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
                                mtx_lock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                                LIST_FOREACH_SAFE(rp,
                                    &NFSD_VNET(nfsrchash_table)[i].tbl,
                                    rc_hash, nextrp) {
                                        if (!(rp->rc_flag &
                                            (RC_INPROG|RC_LOCKED|RC_WANTED))
                                            && rp->rc_refcnt == 0
                                            && ((rp->rc_flag & RC_REFCNT) ||
                                                thisstamp > rp->rc_timestamp ||
                                                rp->rc_acked == RC_ACK))
                                                nfsrc_freecache(rp);
                                }
                                mtx_unlock(&NFSD_VNET(nfsrchash_table)[i].mtx);
                        }
                }
        }
        atomic_store_rel_int(&onethread, 0);
}
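
/*
 * Concurrency notes for nfsrc_trimcache(): the sockref pass at the top
 * always runs, marking entries RC_ACK once snd_una shows the client has
 * received the reply (or RC_NACK when the connection is going away), but
 * the trimming itself is single-threaded via the atomic_cmpset on
 * "onethread".  When called repeatedly within the same second and the
 * saved-reply count is not near the high water mark, only one hash slot
 * ("oneslot") is scanned per call; near the mark, a full scan builds the
 * timeout histogram so the second pass can free entries using a
 * shortened effective timeout.
 */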

/*
 * Add a seqid# reference to the cache entry.
 */
void
nfsrvd_refcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        if (rp == NULL)
                /* For NFSv4.1, there is no cache entry. */
                return;
        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt < 0)
                panic("nfs cache refcnt");
        rp->rc_refcnt++;
        mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
        struct mtx *mutex;

        mutex = nfsrc_cachemutex(rp);
        mtx_lock(mutex);
        if (rp->rc_refcnt <= 0)
                panic("nfs cache derefcnt");
        rp->rc_refcnt--;
        if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
                nfsrc_freecache(rp);
        mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(struct mbuf *m1, u_int16_t *cksum)
{
        int len = 0, cklen;
        struct mbuf *m;

        m = m1;
        while (m) {
                len += m->m_len;
                m = m->m_next;
        }
        cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
        *cksum = in_cksum(m1, cklen);
        return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
}
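
/*
 * nfsrc_marksametcpconn() is currently a no-op: it is the hook called
 * from nfsrc_gettcp() when a retry arrives on the same TCP connection
 * (legal for NFSv2/3 clients), and nothing presently needs to act on
 * that event.
 */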