#include <sys/cdefs.h>
#include <opt_cc.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>
/*
 * Name of the preferred default algorithm.  cc_register_algo() makes an
 * algorithm with this name the default as soon as it registers.
 */
#ifndef CC_DEFAULT
#define CC_DEFAULT "cubic"
#endif
/*
 * HyStart++ tunables.  Each one is exported read/write under the
 * net.inet.tcp.cc.hystartplusplus sysctl node declared in this file; the
 * RTT thresholds are documented there as being in microseconds.
 */
uint32_t hystart_minrtt_thresh = 4000;
uint32_t hystart_maxrtt_thresh = 16000;
uint32_t hystart_n_rttsamples = 8;
uint32_t hystart_css_growth_div = 4;
uint32_t hystart_css_rounds = 5;
uint32_t hystart_bblogs = 0;
/* malloc(9) type for congestion control state allocations. */
MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");
/* List of registered congestion control algorithms. */
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
/* NOTE(review): presumably the lock behind the CC_LIST_* macros -- confirm in the header. */
struct rwlock cc_list_lock;
/* Per-VNET pointer to the default algorithm; NULL until one is registered. */
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
/*
 * Percentage of cwnd kept by newreno_cc_cwnd_on_multiplicative_decrease()
 * (the value is divided by 100 there).
 */
VNET_DEFINE(uint32_t, newreno_beta) = 50;
#define V_newreno_beta VNET(newreno_beta)
/* NOTE(review): not referenced in this file; presumably the ECN counterpart of newreno_beta -- confirm. */
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
/*
 * Take one reference on a congestion control algorithm.
 *
 * The CC list lock state is checked through CC_LIST_LOCK_ASSERT() before
 * the reference count is bumped.
 */
void
cc_refer(struct cc_algo *algo)
{

	CC_LIST_LOCK_ASSERT();

	refcount_acquire(&algo->cc_refcount);
}
/*
 * Drop one reference on a congestion control algorithm.
 *
 * The CC list lock state is checked through CC_LIST_LOCK_ASSERT() first.
 * The result of refcount_release() is deliberately not inspected here.
 */
void
cc_release(struct cc_algo *algo)
{

	CC_LIST_LOCK_ASSERT();

	refcount_release(&algo->cc_refcount);
}
/*
 * Bind a tcpcb to a congestion control algorithm and account for the new
 * user in the algorithm's reference count.  Both steps happen while the CC
 * list read lock is held.
 */
void
cc_attach(struct tcpcb *tp, struct cc_algo *algo)
{

	CC_LIST_RLOCK();
	{
		CC_ALGO(tp) = algo;
		cc_refer(algo);
	}
	CC_LIST_RUNLOCK();
}
/*
 * Unbind a tcpcb from its congestion control algorithm: the tcpcb's
 * algorithm pointer is cleared and the reference it held is released, all
 * under the CC list read lock.
 */
void
cc_detach(struct tcpcb *tp)
{
	struct cc_algo *previous;

	CC_LIST_RLOCK();
	/* Remember the algorithm before clearing the tcpcb's pointer. */
	previous = CC_ALGO(tp);
	CC_ALGO(tp) = NULL;
	cc_release(previous);
	CC_LIST_RUNLOCK();
}
static int
cc_default_algo(SYSCTL_HANDLER_ARGS)
{
char default_cc[TCP_CA_NAME_MAX];
struct cc_algo *funcs;
int error;
CC_LIST_RLOCK();
if (CC_DEFAULT_ALGO() != NULL)
strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc));
else
memset(default_cc, 0, TCP_CA_NAME_MAX);
CC_LIST_RUNLOCK();
error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
if (error != 0 || req->newptr == NULL)
goto done;
error = ESRCH;
CC_LIST_RLOCK();
STAILQ_FOREACH(funcs, &cc_list, entries) {
if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
continue;
if (funcs->flags & CC_MODULE_BEING_REMOVED) {
continue;
}
V_default_cc_ptr = funcs;
error = 0;
break;
}
CC_LIST_RUNLOCK();
done:
return (error);
}
static int
cc_list_available(SYSCTL_HANDLER_ARGS)
{
struct cc_algo *algo;
int error, nalgos;
int linesz;
char *buffer, *cp;
size_t bufsz, outsz;
error = nalgos = 0;
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries) {
nalgos++;
}
CC_LIST_RUNLOCK();
if (nalgos == 0) {
return (ENOENT);
}
bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
buffer = malloc(bufsz, M_TEMP, M_WAITOK);
cp = buffer;
linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
"PCB count");
cp += linesz;
bufsz -= linesz;
outsz = linesz;
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries) {
linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
algo->name,
(algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
algo->cc_refcount);
if (linesz >= bufsz) {
error = EOVERFLOW;
break;
}
cp += linesz;
bufsz -= linesz;
outsz += linesz;
}
CC_LIST_RUNLOCK();
if (error == 0)
error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
free(buffer, M_TEMP);
return (error);
}
static int
cc_check_default(struct cc_algo *remove_cc)
{
int cnt = 0;
VNET_ITERATOR_DECL(vnet_iter);
CC_LIST_LOCK_ASSERT();
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
if ((CC_DEFAULT_ALGO() != NULL) &&
strncmp(CC_DEFAULT_ALGO()->name,
remove_cc->name,
TCP_CA_NAME_MAX) == 0) {
cnt++;
}
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
return (cnt);
}
/*
 * One-time setup of the congestion control framework: initialise the list
 * lock and the (empty) list of registered algorithms.
 */
static void
cc_init(void)
{

	CC_LIST_LOCK_INIT();

	STAILQ_INIT(&cc_list);
}
static int
cc_deregister_algo_locked(struct cc_algo *remove_cc)
{
struct cc_algo *funcs;
int found = 0;
STAILQ_FOREACH(funcs, &cc_list, entries) {
if (funcs == remove_cc)
found = 1;
}
if (found == 0) {
return (ENOENT);
}
KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
if (cc_check_default(remove_cc)) {
return(EBUSY);
}
if (remove_cc->cc_refcount != 0) {
return (EBUSY);
}
STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
return (0);
}
/*
 * Locked wrapper around cc_deregister_algo_locked(): takes the CC list
 * write lock, attempts the removal and hands back its result unchanged.
 */
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
	int error;

	CC_LIST_WLOCK();
	error = cc_deregister_algo_locked(remove_cc);
	CC_LIST_WUNLOCK();

	return (error);
}
/*
 * Add an algorithm to cc_list.
 *
 * Registration fails with EEXIST if the same structure, or another
 * algorithm with the same name, is already on the list.  On success the
 * reference count is reset to 0, the algorithm is appended to the list, and
 * it becomes the default if it is named CC_DEFAULT or if no default exists
 * yet.
 *
 * Returns 0 on success or EEXIST on a duplicate.
 */
int
cc_register_algo(struct cc_algo *add_cc)
{
	struct cc_algo *cur;

	CC_LIST_WLOCK();

	/* Reject duplicates, by identity or by name. */
	STAILQ_FOREACH(cur, &cc_list, entries) {
		if (cur == add_cc ||
		    strncmp(cur->name, add_cc->name, TCP_CA_NAME_MAX) == 0) {
			CC_LIST_WUNLOCK();
			return (EEXIST);
		}
	}

	refcount_init(&add_cc->cc_refcount, 0);
	STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);

	/* The preferred algorithm always wins; otherwise first come, first served. */
	if (V_default_cc_ptr == NULL || strcmp(add_cc->name, CC_DEFAULT) == 0)
		V_default_cc_ptr = add_cc;

	CC_LIST_WUNLOCK();
	return (0);
}
/*
 * Per-VNET initialisation: a non-default VNET starts out with whatever
 * default algorithm vnet0 has at that moment.  Nothing is done when running
 * in the default VNET itself.
 */
static void
vnet_cc_sysinit(void *arg)
{
	struct cc_algo *inherited;

	if (!IS_DEFAULT_VNET(curvnet)) {
		/* Briefly switch to vnet0 to read its default. */
		CURVNET_SET(vnet0);
		inherited = V_default_cc_ptr;
		CURVNET_RESTORE();

		/* Back in the new VNET: adopt it. */
		V_default_cc_ptr = inherited;
	}
}
VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    vnet_cc_sysinit, NULL);
/*
 * Set snd_cwnd when the connection is flagged as being in fast recovery.
 *
 * If the computed pipe is below snd_ssthresh, cwnd becomes
 * max(pipe, mss) + mss, so it never drops below two segments; otherwise
 * cwnd is set to snd_ssthresh.  Nothing changes outside fast recovery.
 */
void
newreno_cc_post_recovery(struct cc_var *ccv)
{
	uint32_t mss = tcp_fixed_maxseg(ccv->tp);
	int pipe;

	if (!IN_FASTRECOVERY(CCV(ccv, t_flags)))
		return;

	pipe = tcp_compute_pipe(ccv->tp);
	if (pipe >= CCV(ccv, snd_ssthresh))
		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
	else
		CCV(ccv, snd_cwnd) = max(pipe, mss) + mss;
}
/*
 * Adjust the congestion state for a connection via the after-idle hook.
 *
 * snd_ssthresh is raised to at least three quarters of the current cwnd
 * (cwnd - cwnd/4), and cwnd is then capped at the window returned by
 * tcp_compute_initwnd() for the connection's segment size.
 */
void
newreno_cc_after_idle(struct cc_var *ccv)
{
	uint32_t cwnd, restart_wnd, three_quarters;

	restart_wnd = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp));

	cwnd = CCV(ccv, snd_cwnd);
	three_quarters = cwnd - (cwnd >> 2);

	CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), three_quarters);
	CCV(ccv, snd_cwnd) = min(restart_wnd, cwnd);
}
/*
 * Compute the window that results from a multiplicative decrease.
 *
 * The current cwnd is scaled by V_newreno_beta percent, rounded down to a
 * whole number of segments of size mss, and never allowed below two
 * segments.  The intermediate product is done in 64 bits.
 *
 * Returns the reduced window in bytes; nothing in ccv is modified.
 */
u_int
newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss)
{
	uint32_t beta, cur_cwnd;
	uint64_t scaled;

	cur_cwnd = CCV(ccv, snd_cwnd);
	beta = V_newreno_beta;

	/* beta is a percentage, hence the factor of 100 in the divisor. */
	scaled = (uint64_t)cur_cwnd * (uint64_t)beta;

	return max(scaled / (100ULL * (uint64_t)mss), 2) * mss;
}
/*
 * React to a congestion signal.
 *
 *   CC_NDUPACK  Unless already in fast recovery: set snd_ssthresh to the
 *               reduced window (skipped if already in congestion recovery)
 *               and enter recovery.
 *   CC_ECN      Unless already in congestion recovery: set both
 *               snd_ssthresh and snd_cwnd to the reduced window and enter
 *               congestion recovery.
 *   CC_RTO      When t_rxtshift is 1, set snd_ssthresh to half of
 *               min(snd_wnd, pipe), rounded down to whole segments with a
 *               floor of two segments; cwnd always becomes one segment.
 *
 * Any other signal type is ignored.  Private signal types (CC_SIGPRIVMASK)
 * trip a KASSERT.
 */
void
newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type)
{
	uint32_t mss, pipe, reduced;

	mss = tcp_fixed_maxseg(ccv->tp);

	KASSERT((type & CC_SIGPRIVMASK) == 0,
	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));

	/* Computed up front; only the NDUPACK and ECN cases consume it. */
	reduced = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss);

	switch (type) {
	case CC_NDUPACK:
		if (IN_FASTRECOVERY(CCV(ccv, t_flags)))
			break;
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
			CCV(ccv, snd_ssthresh) = reduced;
		ENTER_RECOVERY(CCV(ccv, t_flags));
		break;

	case CC_ECN:
		if (IN_CONGRECOVERY(CCV(ccv, t_flags)))
			break;
		CCV(ccv, snd_ssthresh) = reduced;
		CCV(ccv, snd_cwnd) = reduced;
		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		break;

	case CC_RTO:
		if (CCV(ccv, t_rxtshift) == 1) {
			pipe = tcp_compute_pipe(ccv->tp);
			CCV(ccv, snd_ssthresh) = max(2,
			    min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss;
		}
		CCV(ccv, snd_cwnd) = mss;
		break;

	default:
		break;
	}
}
/*
 * Compute the next cwnd while in congestion avoidance (cwnd > ssthresh,
 * enforced by KASSERT).
 *
 * With V_tcp_do_rfc3465 set, cwnd grows by one segment only when
 * CCF_ABC_SENTAWND is set in ccv->flags; that flag is consumed (cleared)
 * here.  Otherwise the increment is mss * mss / cwnd, with a minimum of one
 * byte.  The result is capped at TCP_MAXWIN << snd_scale.
 *
 * Returns the new cwnd; the caller is responsible for storing it.
 */
u_int
newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv)
{
	u_int cwnd = CCV(ccv, snd_cwnd);
	u_int incr = tcp_fixed_maxseg(ccv->tp);

	KASSERT(cwnd > CCV(ccv, snd_ssthresh),
	    ("congestion control state not in congestion avoidance\n"));

	if (!V_tcp_do_rfc3465) {
		incr = max((incr * incr / cwnd), 1);
	} else if (ccv->flags & CCF_ABC_SENTAWND) {
		/* Keep the full-segment increment and consume the flag. */
		ccv->flags &= ~CCF_ABC_SENTAWND;
	} else {
		incr = 0;
	}

	if (incr == 0)
		return cwnd;
	return min(cwnd + incr, TCP_MAXWIN << CCV(ccv, snd_scale));
}
/*
 * Compute the next cwnd while in slow start (cwnd <= ssthresh, enforced by
 * KASSERT).
 *
 * Without V_tcp_do_rfc3465 the increment is one segment.  With it, the
 * increment is the number of bytes covered by this ACK, limited to
 * nsegs * abc_val * mss when snd_nxt == snd_max and to a single segment
 * otherwise.  abc_val comes from ccv->labc if CCF_USE_LOCAL_ABC is set,
 * else from V_tcp_abc_l_var.  The result is capped at
 * TCP_MAXWIN << snd_scale.
 *
 * Returns the new cwnd; the caller is responsible for storing it.
 */
u_int
newreno_cc_cwnd_in_slow_start(struct cc_var *ccv)
{
	u_int cwnd = CCV(ccv, snd_cwnd);
	u_int mss = tcp_fixed_maxseg(ccv->tp);
	u_int incr = mss;

	KASSERT(cwnd <= CCV(ccv, snd_ssthresh),
	    ("congestion control state not in slow start\n"));

	if (V_tcp_do_rfc3465) {
		uint16_t abc_val;
		u_int limit;

		abc_val = (ccv->flags & CCF_USE_LOCAL_ABC) ?
		    ccv->labc : V_tcp_abc_l_var;

		if (CCV(ccv, snd_nxt) != CCV(ccv, snd_max))
			limit = mss;
		else
			limit = ccv->nsegs * abc_val * mss;
		incr = min(ccv->bytes_this_ack, limit);
	}

	if (incr == 0)
		return cwnd;
	return min(cwnd + incr, TCP_MAXWIN << CCV(ccv, snd_scale));
}
/*
 * ACK processing hook: grow snd_cwnd on a regular ACK.
 *
 * The window is only touched when the signal is CC_ACK, the connection is
 * not in recovery, and CCF_CWND_LIMITED is set in ccv->flags.  The
 * congestion avoidance helper is used when cwnd > ssthresh, the slow start
 * helper otherwise.
 */
void
newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type)
{

	if (type != CC_ACK || IN_RECOVERY(CCV(ccv, t_flags)) ||
	    !(ccv->flags & CCF_CWND_LIMITED))
		return;

	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh))
		CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv);
	else
		CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv);
}
static int
cc_stop_new_assignments(struct cc_algo *algo)
{
CC_LIST_WLOCK();
if (cc_check_default(algo)) {
CC_LIST_WUNLOCK();
return (EBUSY);
}
algo->flags |= CC_MODULE_BEING_REMOVED;
CC_LIST_WUNLOCK();
return (0);
}
/*
 * Module event handler for congestion control modules.  'data' is the
 * module's struct cc_algo.
 *
 *   MOD_LOAD      Reject an algorithm that has cb_init but no cc_data_sz
 *                 (EINVAL); otherwise run its optional mod_init and, if
 *                 that succeeds, register it.
 *   MOD_SHUTDOWN  Nothing to do.
 *   MOD_QUIESCE   Flag the algorithm as being removed.
 *   MOD_UNLOAD    Refuse with EBUSY if the algorithm is a default;
 *                 otherwise deregister it and, on success, run its
 *                 optional mod_destroy.
 *
 * Any other event yields EINVAL.  Returns 0 or an errno value.
 */
int
cc_modevent(module_t mod, int event_type, void *data)
{
	struct cc_algo *algo = data;
	int err = 0;

	switch (event_type) {
	case MOD_LOAD:
		if ((algo->cb_init != NULL) && (algo->cc_data_sz == NULL)) {
			printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
			err = EINVAL;
			break;
		}
		if (algo->mod_init != NULL)
			err = algo->mod_init();
		if (err == 0)
			err = cc_register_algo(algo);
		break;

	case MOD_SHUTDOWN:
		break;

	case MOD_QUIESCE:
		err = cc_stop_new_assignments(algo);
		break;

	case MOD_UNLOAD:
		CC_LIST_WLOCK();
		/* Re-checked here, under the write lock, before removal. */
		if (cc_check_default(algo))
			err = EBUSY;
		else
			err = cc_deregister_algo_locked(algo);
		CC_LIST_WUNLOCK();

		/* Only tear the module down once it is off the list. */
		if (err == 0 && algo->mod_destroy != NULL)
			algo->mod_destroy();
		break;

	default:
		err = EINVAL;
		break;
	}

	return (err);
}
/* Run cc_init() at SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST. */
SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
/* Root of the net.inet.tcp.cc sysctl tree. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
"Congestion control related settings");
/* net.inet.tcp.cc.algorithm: per-VNET, read/write; handled by cc_default_algo(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm,
CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, 0, cc_default_algo, "A",
"Default congestion control algorithm");
/* net.inet.tcp.cc.available: read-only; handled by cc_list_available(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available,
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
NULL, 0, cc_list_available, "A",
"List available congestion control algorithms");
/*
 * net.inet.tcp.cc.hystartplusplus.*: read/write knobs backed by the global
 * hystart_* variables defined in this file.
 */
SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus,
CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
"New Reno related HyStart++ settings");
SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh,
CTLFLAG_RW,
&hystart_minrtt_thresh, 4000,
"HyStarts++ minimum RTT thresh used in clamp (in microseconds)");
SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh,
CTLFLAG_RW,
&hystart_maxrtt_thresh, 16000,
"HyStarts++ maximum RTT thresh used in clamp (in microseconds)");
SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples,
CTLFLAG_RW,
&hystart_n_rttsamples, 8,
"The number of RTT samples that must be seen to consider HyStart++");
SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div,
CTLFLAG_RW,
&hystart_css_growth_div, 4,
"The divisor to the growth when in Hystart++ CSS");
SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds,
CTLFLAG_RW,
&hystart_css_rounds, 5,
"The number of rounds HyStart++ lasts in CSS before falling to CA");
SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs,
CTLFLAG_RW,
&hystart_bblogs, 0,
"Do we enable HyStart++ Black Box logs to be generated if BB logging is on");
/* net.inet.tcp.cc.abe: per-VNET switch, off (0) by default. */
VNET_DEFINE(int, cc_do_abe) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(cc_do_abe), 0,
"Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)");
/* net.inet.tcp.cc.abe_frlossreduce: per-VNET switch, off (0) by default. */
VNET_DEFINE(int, cc_abe_frlossreduce) = 0;
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(cc_abe_frlossreduce), 0,
"Apply standard beta instead of ABE-beta during ECN-signalled congestion "
"recovery episodes if loss also needs to be repaired");