Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netinet/cc/cc_dctcp.c
39476 views
1
/*-
2
* Copyright (c) 2007-2008
3
* Swinburne University of Technology, Melbourne, Australia
4
* Copyright (c) 2009-2010 Lawrence Stewart <[email protected]>
5
* Copyright (c) 2014 Midori Kato <[email protected]>
6
* Copyright (c) 2014 The FreeBSD Foundation
7
* All rights reserved.
8
*
9
* Redistribution and use in source and binary forms, with or without
10
* modification, are permitted provided that the following conditions
11
* are met:
12
* 1. Redistributions of source code must retain the above copyright
13
* notice, this list of conditions and the following disclaimer.
14
* 2. Redistributions in binary form must reproduce the above copyright
15
* notice, this list of conditions and the following disclaimer in the
16
* documentation and/or other materials provided with the distribution.
17
*
18
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28
* SUCH DAMAGE.
29
*/
30
31
/*
 * An implementation of the DCTCP algorithm for FreeBSD, based on
 * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz,
 * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan,
 * in ACM Conference on SIGCOMM 2010, New York, USA.
 * Originally released as a contribution of a Microsoft Research project.
 */
38
39
#include <sys/param.h>
40
#include <sys/kernel.h>
41
#include <sys/malloc.h>
42
#include <sys/module.h>
43
#include <sys/socket.h>
44
#include <sys/socketvar.h>
45
#include <sys/sysctl.h>
46
#include <sys/systm.h>
47
48
#include <net/vnet.h>
49
50
#include <net/route.h>
51
#include <net/route/nhop.h>
52
53
#include <netinet/in_pcb.h>
54
#include <netinet/tcp.h>
55
#include <netinet/tcp_seq.h>
56
#include <netinet/tcp_var.h>
57
#include <netinet/cc/cc.h>
58
#include <netinet/cc/cc_module.h>
59
60
#define DCTCP_SHIFT 10
61
#define MAX_ALPHA_VALUE (1<<DCTCP_SHIFT)
62
VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = MAX_ALPHA_VALUE;
63
#define V_dctcp_alpha VNET(dctcp_alpha)
64
VNET_DEFINE_STATIC(uint32_t, dctcp_shift_g) = 4;
65
#define V_dctcp_shift_g VNET(dctcp_shift_g)
66
VNET_DEFINE_STATIC(uint32_t, dctcp_slowstart) = 0;
67
#define V_dctcp_slowstart VNET(dctcp_slowstart)
68
VNET_DEFINE_STATIC(uint32_t, dctcp_ect1) = 0;
69
#define V_dctcp_ect1 VNET(dctcp_ect1)
70
71
struct dctcp {
72
uint32_t bytes_ecn; /* # of marked bytes during a RTT */
73
uint32_t bytes_total; /* # of acked bytes during a RTT */
74
int alpha; /* the fraction of marked bytes */
75
int ce_prev; /* CE state of the last segment */
76
tcp_seq save_sndnxt; /* end sequence number of the current window */
77
int ece_curr; /* ECE flag in this segment */
78
int ece_prev; /* ECE flag in the last segment */
79
uint32_t num_cong_events; /* # of congestion events */
80
};
81
82
static void dctcp_ack_received(struct cc_var *ccv, ccsignal_t type);
83
static void dctcp_after_idle(struct cc_var *ccv);
84
static void dctcp_cb_destroy(struct cc_var *ccv);
85
static int dctcp_cb_init(struct cc_var *ccv, void *ptr);
86
static void dctcp_cong_signal(struct cc_var *ccv, ccsignal_t type);
87
static void dctcp_conn_init(struct cc_var *ccv);
88
static void dctcp_post_recovery(struct cc_var *ccv);
89
static void dctcp_ecnpkt_handler(struct cc_var *ccv);
90
static void dctcp_update_alpha(struct cc_var *ccv);
91
static size_t dctcp_data_sz(void);
92
93
struct cc_algo dctcp_cc_algo = {
94
.name = "dctcp",
95
.ack_received = dctcp_ack_received,
96
.cb_destroy = dctcp_cb_destroy,
97
.cb_init = dctcp_cb_init,
98
.cong_signal = dctcp_cong_signal,
99
.conn_init = dctcp_conn_init,
100
.post_recovery = dctcp_post_recovery,
101
.ecnpkt_handler = dctcp_ecnpkt_handler,
102
.after_idle = dctcp_after_idle,
103
.cc_data_sz = dctcp_data_sz,
104
};
105
106
static void
107
dctcp_ack_received(struct cc_var *ccv, ccsignal_t type)
108
{
109
struct dctcp *dctcp_data;
110
int bytes_acked = 0;
111
uint32_t mss = tcp_fixed_maxseg(ccv->tp);
112
113
dctcp_data = ccv->cc_data;
114
115
if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) {
116
/*
117
* DCTCP doesn't treat receipt of ECN marked packet as a
118
* congestion event. Thus, DCTCP always executes the ACK
119
* processing out of congestion recovery.
120
*/
121
if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
122
EXIT_CONGRECOVERY(CCV(ccv, t_flags));
123
newreno_cc_ack_received(ccv, type);
124
ENTER_CONGRECOVERY(CCV(ccv, t_flags));
125
} else
126
newreno_cc_ack_received(ccv, type);
127
128
if (type == CC_DUPACK)
129
bytes_acked = min(ccv->bytes_this_ack, mss);
130
131
if (type == CC_ACK)
132
bytes_acked = ccv->bytes_this_ack;
133
134
/* Update total bytes. */
135
dctcp_data->bytes_total += bytes_acked;
136
137
/* Update total marked bytes. */
138
if (dctcp_data->ece_curr) {
139
//XXRMS: For fluid-model DCTCP, update
140
//cwnd here during for RTT fairness
141
if (!dctcp_data->ece_prev
142
&& bytes_acked > mss) {
143
dctcp_data->bytes_ecn +=
144
(bytes_acked - mss);
145
} else
146
dctcp_data->bytes_ecn += bytes_acked;
147
dctcp_data->ece_prev = 1;
148
} else {
149
if (dctcp_data->ece_prev
150
&& bytes_acked > mss)
151
dctcp_data->bytes_ecn += mss;
152
dctcp_data->ece_prev = 0;
153
}
154
dctcp_data->ece_curr = 0;
155
156
/*
157
* Update the fraction of marked bytes at the end of
158
* current window size.
159
*/
160
if (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
161
SEQ_GT(ccv->curack, dctcp_data->save_sndnxt))
162
dctcp_update_alpha(ccv);
163
} else
164
newreno_cc_ack_received(ccv, type);
165
}
166
167
static size_t
168
dctcp_data_sz(void)
169
{
170
return (sizeof(struct dctcp));
171
}
172
173
static void
174
dctcp_after_idle(struct cc_var *ccv)
175
{
176
struct dctcp *dctcp_data;
177
178
if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) {
179
dctcp_data = ccv->cc_data;
180
181
/* Initialize internal parameters after idle time */
182
dctcp_data->bytes_ecn = 0;
183
dctcp_data->bytes_total = 0;
184
dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
185
dctcp_data->alpha = V_dctcp_alpha;
186
dctcp_data->ece_curr = 0;
187
dctcp_data->ece_prev = 0;
188
dctcp_data->num_cong_events = 0;
189
}
190
191
newreno_cc_after_idle(ccv);
192
}
193
194
static void
195
dctcp_cb_destroy(struct cc_var *ccv)
196
{
197
free(ccv->cc_data, M_CC_MEM);
198
}
199
200
static int
201
dctcp_cb_init(struct cc_var *ccv, void *ptr)
202
{
203
struct dctcp *dctcp_data;
204
205
INP_WLOCK_ASSERT(tptoinpcb(ccv->tp));
206
if (ptr == NULL) {
207
dctcp_data = malloc(sizeof(struct dctcp), M_CC_MEM, M_NOWAIT|M_ZERO);
208
if (dctcp_data == NULL)
209
return (ENOMEM);
210
} else
211
dctcp_data = ptr;
212
/* Initialize some key variables with sensible defaults. */
213
dctcp_data->bytes_ecn = 0;
214
dctcp_data->bytes_total = 0;
215
/*
216
* When alpha is set to 0 in the beginning, DCTCP sender transfers as
217
* much data as possible until the value converges which may expand the
218
* queueing delay at the switch. When alpha is set to 1, queueing delay
219
* is kept small.
220
* Throughput-sensitive applications should have alpha = 0
221
* Latency-sensitive applications should have alpha = 1
222
*
223
* Note: DCTCP draft suggests initial alpha to be 1 but we've decided to
224
* keep it 0 as default.
225
*/
226
dctcp_data->alpha = V_dctcp_alpha;
227
dctcp_data->save_sndnxt = 0;
228
dctcp_data->ce_prev = 0;
229
dctcp_data->ece_curr = 0;
230
dctcp_data->ece_prev = 0;
231
dctcp_data->num_cong_events = 0;
232
233
ccv->cc_data = dctcp_data;
234
return (0);
235
}
236
237
/*
238
* Perform any necessary tasks before we enter congestion recovery.
239
*/
240
static void
241
dctcp_cong_signal(struct cc_var *ccv, ccsignal_t type)
242
{
243
struct dctcp *dctcp_data;
244
uint32_t cwin, mss, pipe;
245
246
if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) {
247
dctcp_data = ccv->cc_data;
248
cwin = CCV(ccv, snd_cwnd);
249
mss = tcp_fixed_maxseg(ccv->tp);
250
251
switch (type) {
252
case CC_NDUPACK:
253
if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
254
if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
255
CCV(ccv, snd_ssthresh) =
256
max(cwin / 2, 2 * mss);
257
dctcp_data->num_cong_events++;
258
} else {
259
/* cwnd has already updated as congestion
260
* recovery. Reverse cwnd value using
261
* snd_cwnd_prev and recalculate snd_ssthresh
262
*/
263
cwin = CCV(ccv, snd_cwnd_prev);
264
CCV(ccv, snd_ssthresh) =
265
max(cwin / 2, 2 * mss);
266
}
267
ENTER_RECOVERY(CCV(ccv, t_flags));
268
}
269
break;
270
case CC_ECN:
271
/*
272
* Save current snd_cwnd when the host encounters both
273
* congestion recovery and fast recovery.
274
*/
275
CCV(ccv, snd_cwnd_prev) = cwin;
276
if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
277
if (V_dctcp_slowstart &&
278
dctcp_data->num_cong_events++ == 0) {
279
CCV(ccv, snd_ssthresh) =
280
max(cwin / 2, 2 * mss);
281
dctcp_data->alpha = MAX_ALPHA_VALUE;
282
dctcp_data->bytes_ecn = 0;
283
dctcp_data->bytes_total = 0;
284
dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
285
} else
286
CCV(ccv, snd_ssthresh) =
287
max((cwin - (((uint64_t)cwin *
288
dctcp_data->alpha) >> (DCTCP_SHIFT+1))),
289
2 * mss);
290
CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
291
ENTER_CONGRECOVERY(CCV(ccv, t_flags));
292
}
293
dctcp_data->ece_curr = 1;
294
break;
295
case CC_RTO:
296
if (CCV(ccv, t_rxtshift) == 1) {
297
pipe = tcp_compute_pipe(ccv->tp);
298
CCV(ccv, snd_ssthresh) = max(2,
299
min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss;
300
}
301
CCV(ccv, snd_cwnd) = mss;
302
dctcp_update_alpha(ccv);
303
dctcp_data->save_sndnxt += mss;
304
dctcp_data->num_cong_events++;
305
break;
306
default:
307
break;
308
}
309
} else
310
newreno_cc_cong_signal(ccv, type);
311
}
312
313
static void
314
dctcp_conn_init(struct cc_var *ccv)
315
{
316
struct dctcp *dctcp_data;
317
318
dctcp_data = ccv->cc_data;
319
320
if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) {
321
dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
322
if (V_dctcp_ect1)
323
CCV(ccv, t_flags2) |= TF2_ECN_USE_ECT1;
324
}
325
}
326
327
/*
328
* Perform any necessary tasks before we exit congestion recovery.
329
*/
330
static void
331
dctcp_post_recovery(struct cc_var *ccv)
332
{
333
newreno_cc_post_recovery(ccv);
334
335
if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT)
336
dctcp_update_alpha(ccv);
337
}
338
339
/*
340
* Execute an additional ECN processing using ECN field in IP header
341
* and the CWR bit in TCP header.
342
*/
343
static void
344
dctcp_ecnpkt_handler(struct cc_var *ccv)
345
{
346
struct dctcp *dctcp_data;
347
uint32_t ccflag;
348
int acknow;
349
350
dctcp_data = ccv->cc_data;
351
ccflag = ccv->flags;
352
acknow = 0;
353
354
/*
355
* DCTCP responds with an ACK immediately when the CE state
356
* in between this segment and the last segment has changed.
357
*/
358
if (ccflag & CCF_IPHDR_CE) {
359
if (!dctcp_data->ce_prev) {
360
acknow = 1;
361
dctcp_data->ce_prev = 1;
362
CCV(ccv, t_flags2) |= TF2_ECN_SND_ECE;
363
}
364
} else {
365
if (dctcp_data->ce_prev) {
366
acknow = 1;
367
dctcp_data->ce_prev = 0;
368
CCV(ccv, t_flags2) &= ~TF2_ECN_SND_ECE;
369
}
370
}
371
372
if ((acknow) || (ccflag & CCF_TCPHDR_CWR)) {
373
ccv->flags |= CCF_ACKNOW;
374
} else {
375
ccv->flags &= ~CCF_ACKNOW;
376
}
377
}
378
379
/*
380
* Update the fraction of marked bytes represented as 'alpha'.
381
* Also initialize several internal parameters at the end of this function.
382
*/
383
static void
384
dctcp_update_alpha(struct cc_var *ccv)
385
{
386
struct dctcp *dctcp_data;
387
int alpha_prev;
388
389
dctcp_data = ccv->cc_data;
390
alpha_prev = dctcp_data->alpha;
391
dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1);
392
393
/*
394
* Update alpha: alpha = (1 - g) * alpha + g * M.
395
* Here:
396
* g is weight factor
397
* recommaded to be set to 1/16
398
* small g = slow convergence between competitive DCTCP flows
399
* large g = impacts low utilization of bandwidth at switches
400
* M is fraction of marked segments in last RTT
401
* updated every RTT
402
* Alpha must be round to 0 - MAX_ALPHA_VALUE.
403
*/
404
dctcp_data->alpha = ulmin(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
405
((uint64_t)dctcp_data->bytes_ecn << (DCTCP_SHIFT - V_dctcp_shift_g)) /
406
dctcp_data->bytes_total, MAX_ALPHA_VALUE);
407
408
/* Initialize internal parameters for next alpha calculation */
409
dctcp_data->bytes_ecn = 0;
410
dctcp_data->bytes_total = 0;
411
dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
412
}
413
414
static int
415
dctcp_alpha_handler(SYSCTL_HANDLER_ARGS)
416
{
417
uint32_t new;
418
int error;
419
420
new = V_dctcp_alpha;
421
error = sysctl_handle_int(oidp, &new, 0, req);
422
if (error == 0 && req->newptr != NULL) {
423
if (new > MAX_ALPHA_VALUE)
424
error = EINVAL;
425
else
426
V_dctcp_alpha = new;
427
}
428
429
return (error);
430
}
431
432
static int
433
dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS)
434
{
435
uint32_t new;
436
int error;
437
438
new = V_dctcp_shift_g;
439
error = sysctl_handle_int(oidp, &new, 0, req);
440
if (error == 0 && req->newptr != NULL) {
441
if (new > DCTCP_SHIFT)
442
error = EINVAL;
443
else
444
V_dctcp_shift_g = new;
445
}
446
447
return (error);
448
}
449
450
static int
451
dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS)
452
{
453
uint32_t new;
454
int error;
455
456
new = V_dctcp_slowstart;
457
error = sysctl_handle_int(oidp, &new, 0, req);
458
if (error == 0 && req->newptr != NULL) {
459
if (new > 1)
460
error = EINVAL;
461
else
462
V_dctcp_slowstart = new;
463
}
464
465
return (error);
466
}
467
468
SYSCTL_DECL(_net_inet_tcp_cc_dctcp);
469
SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp,
470
CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
471
"dctcp congestion control related settings");
472
473
SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha,
474
CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
475
&VNET_NAME(dctcp_alpha), 0, &dctcp_alpha_handler, "IU",
476
"dctcp alpha parameter at start of session");
477
478
SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g,
479
CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
480
&VNET_NAME(dctcp_shift_g), 4, &dctcp_shift_g_handler, "IU",
481
"dctcp shift parameter");
482
483
SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart,
484
CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
485
&VNET_NAME(dctcp_slowstart), 0, &dctcp_slowstart_handler, "IU",
486
"half CWND reduction after the first slow start");
487
488
SYSCTL_UINT(_net_inet_tcp_cc_dctcp, OID_AUTO, ect1,
489
CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
490
&VNET_NAME(dctcp_ect1), 0,
491
"Send DCTCP segments with ÍP ECT(0) or ECT(1)");
492
493
DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo);
494
MODULE_VERSION(dctcp, 2);
495
496