GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/ipv4/tcp_cong.c
/*
 * Pluggable TCP congestion control support and NewReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <[email protected]>
 */

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <net/tcp.h>

int sysctl_tcp_max_ssthresh = 0;

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

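/*
 * Locking overview (summarizing the code below): writers that register,
 * unregister or reorder entries serialize on tcp_cong_list_lock, while
 * readers walk tcp_cong_list under rcu_read_lock(); hence the
 * list_*_rcu() helpers used throughout this file.
 */
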
/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (strcmp(e->name, name) == 0)
                        return e;
        }

        return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
        int ret = 0;

        /* all algorithms must implement ssthresh and cong_avoid ops */
        if (!ca->ssthresh || !ca->cong_avoid) {
                printk(KERN_ERR "TCP %s does not implement required ops\n",
                       ca->name);
                return -EINVAL;
        }

        spin_lock(&tcp_cong_list_lock);
        if (tcp_ca_find(ca->name)) {
                printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
                ret = -EEXIST;
        } else {
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                printk(KERN_INFO "TCP %s registered\n", ca->name);
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
        spin_lock(&tcp_cong_list_lock);
        list_del_rcu(&ca->list);
        spin_unlock(&tcp_cong_list_lock);
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

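/*
 * A minimal sketch of how a congestion control module plugs into the
 * two functions above (the "demo" names are hypothetical, not a real
 * in-tree module; real users such as tcp_cubic follow this pattern):
 *
 *      static struct tcp_congestion_ops tcp_demo = {
 *              .name           = "demo",
 *              .owner          = THIS_MODULE,
 *              .ssthresh       = tcp_reno_ssthresh,
 *              .cong_avoid     = tcp_reno_cong_avoid,
 *      };
 *
 *      static int __init tcp_demo_init(void)
 *      {
 *              return tcp_register_congestion_control(&tcp_demo);
 *      }
 *
 *      static void __exit tcp_demo_exit(void)
 *      {
 *              tcp_unregister_congestion_control(&tcp_demo);
 *      }
 *      module_init(tcp_demo_init);
 *      module_exit(tcp_demo_exit);
 */
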
/* Assign choice of congestion control. */
void tcp_init_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_congestion_ops *ca;

        /* if no choice made yet assign the current value set as default */
        if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
                rcu_read_lock();
                list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                        if (try_module_get(ca->owner)) {
                                icsk->icsk_ca_ops = ca;
                                break;
                        }

                        /* fallback to next available */
                }
                rcu_read_unlock();
        }

        if (icsk->icsk_ca_ops->init)
                icsk->icsk_ca_ops->init(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        module_put(icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(const char *name)
{
        struct tcp_congestion_ops *ca;
        int ret = -ENOENT;

        spin_lock(&tcp_cong_list_lock);
        ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
        if (!ca && capable(CAP_NET_ADMIN)) {
                spin_unlock(&tcp_cong_list_lock);

                request_module("tcp_%s", name);
                spin_lock(&tcp_cong_list_lock);
                ca = tcp_ca_find(name);
        }
#endif

        if (ca) {
                ca->flags |= TCP_CONG_NON_RESTRICTED;   /* default is always allowed */
                list_move(&ca->list, &tcp_cong_list);
                ret = 0;
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}

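/*
 * This is what backs the net.ipv4.tcp_congestion_control sysctl; a
 * typical invocation from user space would be:
 *
 *      # sysctl -w net.ipv4.tcp_congestion_control=reno
 *
 * The list_move() above puts the chosen ops at the head of
 * tcp_cong_list, which is what makes it the default:
 * tcp_init_congestion_control() and tcp_get_default_congestion_control()
 * both start from the front of the list.
 */
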
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
        return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);


/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
}

/* Get current default congestion control */
void tcp_get_default_congestion_control(char *name)
{
        struct tcp_congestion_ops *ca;
        /* We will always have reno... */
        BUG_ON(list_empty(&tcp_cong_list));

        rcu_read_lock();
        ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
        strncpy(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        *buf = '\0';
        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                        continue;
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
}

/* Change list of non-restricted congestion control values */
int tcp_set_allowed_congestion_control(char *val)
{
        struct tcp_congestion_ops *ca;
        char *saved_clone, *clone, *name;
        int ret = 0;

        saved_clone = clone = kstrdup(val, GFP_USER);
        if (!clone)
                return -ENOMEM;

        spin_lock(&tcp_cong_list_lock);
        /* pass 1: check for bad entries */
        while ((name = strsep(&clone, " ")) && *name) {
                ca = tcp_ca_find(name);
                if (!ca) {
                        ret = -ENOENT;
                        goto out;
                }
        }

        /* pass 2: clear old values */
        list_for_each_entry_rcu(ca, &tcp_cong_list, list)
                ca->flags &= ~TCP_CONG_NON_RESTRICTED;

        /* pass 3: mark as allowed */
        while ((name = strsep(&val, " ")) && *name) {
                ca = tcp_ca_find(name);
                WARN_ON(!ca);
                if (ca)
                        ca->flags |= TCP_CONG_NON_RESTRICTED;
        }
out:
        spin_unlock(&tcp_cong_list_lock);
        kfree(saved_clone);

        return ret;
}


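/*
 * The get/set pair above backs the net.ipv4.tcp_allowed_congestion_control
 * sysctl; an illustrative invocation:
 *
 *      # sysctl -w net.ipv4.tcp_allowed_congestion_control="reno cubic"
 *
 * Algorithms left off the list keep running on existing sockets but can
 * no longer be selected by unprivileged callers (see the
 * TCP_CONG_NON_RESTRICTED check in tcp_set_congestion_control() below).
 */
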
/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_congestion_ops *ca;
        int err = 0;

        rcu_read_lock();
        ca = tcp_ca_find(name);

        /* no change if asking for the existing value */
        if (ca == icsk->icsk_ca_ops)
                goto out;

#ifdef CONFIG_MODULES
        /* not found: attempt to autoload the module */
        if (!ca && capable(CAP_NET_ADMIN)) {
                rcu_read_unlock();
                request_module("tcp_%s", name);
                rcu_read_lock();
                ca = tcp_ca_find(name);
        }
#endif
        if (!ca)
                err = -ENOENT;

        else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
                err = -EPERM;

        else if (!try_module_get(ca->owner))
                err = -EBUSY;

        else {
                tcp_cleanup_congestion_control(sk);
                icsk->icsk_ca_ops = ca;

                if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
                        icsk->icsk_ca_ops->init(sk);
        }
out:
        rcu_read_unlock();
        return err;
}

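/*
 * This function implements the TCP_CONGESTION socket option; a minimal
 * user-space sketch (error handling omitted):
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, 0);
 *      setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 *                 "reno", strlen("reno"));
 */
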
/* RFC2861: check whether we are limited by the application or the
 * congestion window.  This is the inverse of the cwnd check in
 * tcp_tso_should_defer().
 */
int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 left;

        if (in_flight >= tp->snd_cwnd)
                return 1;

        left = tp->snd_cwnd - in_flight;
        if (sk_can_gso(sk) &&
            left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
            left * tp->mss_cache < sk->sk_gso_max_size)
                return 1;
        return left <= tcp_max_burst(tp);
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);

/*
 * Slow start is used when the congestion window is less than the slow
 * start threshold.  This version implements the basic RFC2581 version
 * and optionally supports:
 *      RFC3742 Limited Slow Start        - growth limited to max_ssthresh
 *      RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
 */
void tcp_slow_start(struct tcp_sock *tp)
{
        int cnt;        /* increase in packets */

        /* RFC3465: ABC Slow start
         * Increase only after a full MSS of bytes is acked
         *
         * TCP sender SHOULD increase cwnd by the number of
         * previously unacknowledged bytes ACKed by each incoming
         * acknowledgment, provided the increase is not more than L
         */
        if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
                return;

        if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
                cnt = sysctl_tcp_max_ssthresh >> 1;     /* limited slow start */
        else
                cnt = tp->snd_cwnd;                     /* exponential increase */

        /* RFC3465: ABC
         * We MAY increase by 2 if discovered delayed ack
         */
        if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
                cnt <<= 1;
        tp->bytes_acked = 0;

        tp->snd_cwnd_cnt += cnt;
        while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
                tp->snd_cwnd_cnt -= tp->snd_cwnd;
                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                        tp->snd_cwnd++;
        }
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

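/*
 * Worked example: with ABC disabled and snd_cwnd = 4, each ACK adds
 * cnt = 4 to snd_cwnd_cnt, so the while loop above runs once per ACK
 * and snd_cwnd grows by one segment per ACK; the window thus roughly
 * doubles every round trip, the classic exponential slow start.
 */
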
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
        if (tp->snd_cwnd_cnt >= w) {
                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                        tp->snd_cwnd++;
                tp->snd_cwnd_cnt = 0;
        } else {
                tp->snd_cwnd_cnt++;
        }
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

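/*
 * Worked example: called with w = snd_cwnd = 10, the first ten ACKs
 * merely advance snd_cwnd_cnt; the eleventh sees snd_cwnd_cnt >= w,
 * bumps snd_cwnd to 11 and resets the counter, giving about one extra
 * segment per round trip, i.e. additive increase.
 */
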
/*
 * TCP Reno congestion control
 * This is a special case used for fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_is_cwnd_limited(sk, in_flight))
                return;

        /* In "safe" area, increase. */
        if (tp->snd_cwnd <= tp->snd_ssthresh)
                tcp_slow_start(tp);

        /* In dangerous area, increase slowly. */
        else if (sysctl_tcp_abc) {
                /* RFC3465: Appropriate Byte Counting
                 * increase once for each full cwnd acked
                 */
                if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
                        tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                                tp->snd_cwnd++;
                }
        } else {
                tcp_cong_avoid_ai(tp, tp->snd_cwnd);
        }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

/* Lower bound on congestion window with halving. */
u32 tcp_reno_min_cwnd(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        return tp->snd_ssthresh/2;
}
EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);

struct tcp_congestion_ops tcp_reno = {
        .flags          = TCP_CONG_NON_RESTRICTED,
        .name           = "reno",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};

/* Initial congestion control used (until SYN).
 * Really Reno under another name so we can tell the difference
 * during tcp_set_default_congestion_control().
 */
struct tcp_congestion_ops tcp_init_congestion_ops = {
        .name           = "",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);