GitHub Repository: awilliam/linux-vfio
Path: blob/master/net/ipv4/tcp_cong.c
/*
 * Pluggable TCP congestion control support and NewReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <[email protected]>
 */

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <net/tcp.h>

int sysctl_tcp_max_ssthresh = 0;

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

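/*
 * Locking overview (summarizing the code below): writers that register,
 * unregister or reorder entries serialize on tcp_cong_list_lock, while
 * readers walk tcp_cong_list under rcu_read_lock(); hence the
 * list_*_rcu() helpers used throughout this file.
 */
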
/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
        struct tcp_congestion_ops *e;

        list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (strcmp(e->name, name) == 0)
                        return e;
        }

        return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
        int ret = 0;

        /* all algorithms must implement ssthresh and cong_avoid ops */
        if (!ca->ssthresh || !ca->cong_avoid) {
                printk(KERN_ERR "TCP %s does not implement required ops\n",
                       ca->name);
                return -EINVAL;
        }

        spin_lock(&tcp_cong_list_lock);
        if (tcp_ca_find(ca->name)) {
                printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
                ret = -EEXIST;
        } else {
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                printk(KERN_INFO "TCP %s registered\n", ca->name);
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
        spin_lock(&tcp_cong_list_lock);
        list_del_rcu(&ca->list);
        spin_unlock(&tcp_cong_list_lock);
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

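/*
 * A minimal sketch of how a congestion control module plugs into the
 * two functions above (the "demo" names are hypothetical, not a real
 * in-tree module; real users such as tcp_cubic follow this pattern):
 *
 *      static struct tcp_congestion_ops tcp_demo = {
 *              .name           = "demo",
 *              .owner          = THIS_MODULE,
 *              .ssthresh       = tcp_reno_ssthresh,
 *              .cong_avoid     = tcp_reno_cong_avoid,
 *      };
 *
 *      static int __init tcp_demo_init(void)
 *      {
 *              return tcp_register_congestion_control(&tcp_demo);
 *      }
 *
 *      static void __exit tcp_demo_exit(void)
 *      {
 *              tcp_unregister_congestion_control(&tcp_demo);
 *      }
 *      module_init(tcp_demo_init);
 *      module_exit(tcp_demo_exit);
 */
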
/* Assign choice of congestion control. */
void tcp_init_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_congestion_ops *ca;

        /* if no choice made yet assign the current value set as default */
        if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
                rcu_read_lock();
                list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                        if (try_module_get(ca->owner)) {
                                icsk->icsk_ca_ops = ca;
                                break;
                        }

                        /* fallback to next available */
                }
                rcu_read_unlock();
        }

        if (icsk->icsk_ca_ops->init)
                icsk->icsk_ca_ops->init(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        module_put(icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(const char *name)
{
        struct tcp_congestion_ops *ca;
        int ret = -ENOENT;

        spin_lock(&tcp_cong_list_lock);
        ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
        if (!ca && capable(CAP_NET_ADMIN)) {
                spin_unlock(&tcp_cong_list_lock);

                request_module("tcp_%s", name);
                spin_lock(&tcp_cong_list_lock);
                ca = tcp_ca_find(name);
        }
#endif

        if (ca) {
                ca->flags |= TCP_CONG_NON_RESTRICTED;   /* default is always allowed */
                list_move(&ca->list, &tcp_cong_list);
                ret = 0;
        }
        spin_unlock(&tcp_cong_list_lock);

        return ret;
}

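/*
 * This is what backs the net.ipv4.tcp_congestion_control sysctl; a
 * typical invocation from user space would be:
 *
 *      # sysctl -w net.ipv4.tcp_congestion_control=reno
 *
 * The list_move() above puts the chosen ops at the head of
 * tcp_cong_list, which is what makes it the default:
 * tcp_init_congestion_control() and tcp_get_default_congestion_control()
 * both start from the front of the list.
 */
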
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
        return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);


/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
}

/* Get current default congestion control */
void tcp_get_default_congestion_control(char *name)
{
        struct tcp_congestion_ops *ca;
        /* We will always have reno... */
        BUG_ON(list_empty(&tcp_cong_list));

        rcu_read_lock();
        ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
        strncpy(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        *buf = '\0';
        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                        continue;
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);
        }
        rcu_read_unlock();
}

/* Change list of non-restricted congestion control values */
int tcp_set_allowed_congestion_control(char *val)
{
        struct tcp_congestion_ops *ca;
        char *saved_clone, *clone, *name;
        int ret = 0;

        saved_clone = clone = kstrdup(val, GFP_USER);
        if (!clone)
                return -ENOMEM;

        spin_lock(&tcp_cong_list_lock);
        /* pass 1: check for bad entries */
        while ((name = strsep(&clone, " ")) && *name) {
                ca = tcp_ca_find(name);
                if (!ca) {
                        ret = -ENOENT;
                        goto out;
                }
        }

        /* pass 2: clear old values */
        list_for_each_entry_rcu(ca, &tcp_cong_list, list)
                ca->flags &= ~TCP_CONG_NON_RESTRICTED;

        /* pass 3: mark as allowed */
        while ((name = strsep(&val, " ")) && *name) {
                ca = tcp_ca_find(name);
                WARN_ON(!ca);
                if (ca)
                        ca->flags |= TCP_CONG_NON_RESTRICTED;
        }
out:
        spin_unlock(&tcp_cong_list_lock);
        kfree(saved_clone);

        return ret;
}


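/*
 * The get/set pair above backs the net.ipv4.tcp_allowed_congestion_control
 * sysctl; an illustrative invocation:
 *
 *      # sysctl -w net.ipv4.tcp_allowed_congestion_control="reno cubic"
 *
 * Algorithms left off the list keep running on existing sockets but can
 * no longer be selected by unprivileged callers (see the
 * TCP_CONG_NON_RESTRICTED check in tcp_set_congestion_control() below).
 */
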
/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_congestion_ops *ca;
        int err = 0;

        rcu_read_lock();
        ca = tcp_ca_find(name);

        /* no change if asking for the existing value */
        if (ca == icsk->icsk_ca_ops)
                goto out;

#ifdef CONFIG_MODULES
        /* not found: attempt to autoload the module */
        if (!ca && capable(CAP_NET_ADMIN)) {
                rcu_read_unlock();
                request_module("tcp_%s", name);
                rcu_read_lock();
                ca = tcp_ca_find(name);
        }
#endif
        if (!ca)
                err = -ENOENT;

        else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
                err = -EPERM;

        else if (!try_module_get(ca->owner))
                err = -EBUSY;

        else {
                tcp_cleanup_congestion_control(sk);
                icsk->icsk_ca_ops = ca;

                if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
                        icsk->icsk_ca_ops->init(sk);
        }
out:
        rcu_read_unlock();
        return err;
}

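/*
 * This function implements the TCP_CONGESTION socket option; a minimal
 * user-space sketch (error handling omitted):
 *
 *      int fd = socket(AF_INET, SOCK_STREAM, 0);
 *      setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 *                 "reno", strlen("reno"));
 */
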
/* RFC2861: check whether we are limited by the application or the
 * congestion window.  This is the inverse of the cwnd check in
 * tcp_tso_should_defer().
 */
int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 left;

        if (in_flight >= tp->snd_cwnd)
                return 1;

        left = tp->snd_cwnd - in_flight;
        if (sk_can_gso(sk) &&
            left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
            left * tp->mss_cache < sk->sk_gso_max_size)
                return 1;
        return left <= tcp_max_burst(tp);
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);

/*
 * Slow start is used when the congestion window is less than the slow
 * start threshold.  This version implements the basic RFC2581 version
 * and optionally supports:
 *      RFC3742 Limited Slow Start        - growth limited to max_ssthresh
 *      RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
 */
void tcp_slow_start(struct tcp_sock *tp)
{
        int cnt;        /* increase in packets */

        /* RFC3465: ABC Slow start
         * Increase only after a full MSS of bytes is acked
         *
         * TCP sender SHOULD increase cwnd by the number of
         * previously unacknowledged bytes ACKed by each incoming
         * acknowledgment, provided the increase is not more than L
         */
        if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
                return;

        if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
                cnt = sysctl_tcp_max_ssthresh >> 1;     /* limited slow start */
        else
                cnt = tp->snd_cwnd;                     /* exponential increase */

        /* RFC3465: ABC
         * We MAY increase by 2 if discovered delayed ack
         */
        if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
                cnt <<= 1;
        tp->bytes_acked = 0;

        tp->snd_cwnd_cnt += cnt;
        while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
                tp->snd_cwnd_cnt -= tp->snd_cwnd;
                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                        tp->snd_cwnd++;
        }
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

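/*
 * Worked example: with ABC disabled and snd_cwnd = 4, each ACK adds
 * cnt = 4 to snd_cwnd_cnt, so the while loop above runs once per ACK
 * and snd_cwnd grows by one segment per ACK; the window thus roughly
 * doubles every round trip, the classic exponential slow start.
 */
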
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
        if (tp->snd_cwnd_cnt >= w) {
                if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                        tp->snd_cwnd++;
                tp->snd_cwnd_cnt = 0;
        } else {
                tp->snd_cwnd_cnt++;
        }
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

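/*
 * Worked example: called with w = snd_cwnd = 10, the first ten ACKs
 * merely advance snd_cwnd_cnt; the eleventh sees snd_cwnd_cnt >= w,
 * bumps snd_cwnd to 11 and resets the counter, giving about one extra
 * segment per round trip, i.e. additive increase.
 */
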
/*
 * TCP Reno congestion control
 * This is a special case used for fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_is_cwnd_limited(sk, in_flight))
                return;

        /* In "safe" area, increase. */
        if (tp->snd_cwnd <= tp->snd_ssthresh)
                tcp_slow_start(tp);

        /* In dangerous area, increase slowly. */
        else if (sysctl_tcp_abc) {
                /* RFC3465: Appropriate Byte Counting
                 * increase once for each full cwnd acked
                 */
                if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
                        tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
                        if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                                tp->snd_cwnd++;
                }
        } else {
                tcp_cong_avoid_ai(tp, tp->snd_cwnd);
        }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

/* Lower bound on congestion window with halving. */
u32 tcp_reno_min_cwnd(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        return tp->snd_ssthresh/2;
}
EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);

struct tcp_congestion_ops tcp_reno = {
        .flags          = TCP_CONG_NON_RESTRICTED,
        .name           = "reno",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};

/* Initial congestion control used (until SYN).
 * Really Reno under another name so we can tell the difference
 * during tcp_set_default_congestion_control().
 */
struct tcp_congestion_ops tcp_init_congestion_ops = {
        .name           = "",
        .owner          = THIS_MODULE,
        .ssthresh       = tcp_reno_ssthresh,
        .cong_avoid     = tcp_reno_cong_avoid,
        .min_cwnd       = tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);