Path: blob/main/sys/netpfil/ipfw/dn_sched_fq_codel.c
39482 views
/*1* FQ_Codel - The FlowQueue-Codel scheduler/AQM2*3* Copyright (C) 2016 Centre for Advanced Internet Architectures,4* Swinburne University of Technology, Melbourne, Australia.5* Portions of this code were made possible in part by a gift from6* The Comcast Innovation Fund.7* Implemented by Rasool Al-Saadi <[email protected]>8*9* Redistribution and use in source and binary forms, with or without10* modification, are permitted provided that the following conditions11* are met:12* 1. Redistributions of source code must retain the above copyright13* notice, this list of conditions and the following disclaimer.14* 2. Redistributions in binary form must reproduce the above copyright15* notice, this list of conditions and the following disclaimer in the16* documentation and/or other materials provided with the distribution.17*18* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND19* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE20* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE21* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE22* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL23* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS24* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)25* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT26* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY27* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF28* SUCH DAMAGE.29*/3031#ifdef _KERNEL32#include <sys/malloc.h>33#include <sys/socket.h>34//#include <sys/socketvar.h>35#include <sys/kernel.h>36#include <sys/mbuf.h>37#include <sys/module.h>38#include <net/if.h> /* IFNAMSIZ */39#include <netinet/in.h>40#include <netinet/ip_var.h> /* ipfw_rule_ref */41#include <netinet/ip_fw.h> /* flow_id */42#include <netinet/ip_dummynet.h>4344#include <sys/lock.h>45#include <sys/proc.h>46#include <sys/rwlock.h>4748#include <netpfil/ipfw/ip_fw_private.h>49#include <sys/sysctl.h>50#include <netinet/ip.h>51#include <netinet/ip6.h>52#include <netinet/ip_icmp.h>53#include <netinet/tcp.h>54#include <netinet/udp.h>55#include <sys/queue.h>56#include <sys/hash.h>5758#include <netpfil/ipfw/dn_heap.h>59#include <netpfil/ipfw/ip_dn_private.h>6061#include <netpfil/ipfw/dn_aqm.h>62#include <netpfil/ipfw/dn_aqm_codel.h>63#include <netpfil/ipfw/dn_sched.h>64#include <netpfil/ipfw/dn_sched_fq_codel.h>65#include <netpfil/ipfw/dn_sched_fq_codel_helper.h>6667#else68#include <dn_test.h>69#endif7071/* NOTE: In fq_codel module, we reimplements CoDel AQM functions72* because fq_codel use different flows (sub-queues) structure and73* dn_queue includes many variables not needed by a flow (sub-queue74* )i.e. avoid extra overhead (88 bytes vs 208 bytes).75* Also, CoDel functions manages stats of sub-queues as well as the main queue.76*/7778#define DN_SCHED_FQ_CODEL 67980static struct dn_alg fq_codel_desc;8182/* fq_codel default parameters including codel */83struct dn_sch_fq_codel_parms84fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US,85CODEL_ECN_ENABLED}, 1024, 10240, 1514};8687static int88fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)89{90int error;91long value;9293value = fq_codel_sysctl.ccfg.interval;94value /= AQM_TIME_1US;95error = sysctl_handle_long(oidp, &value, 0, req);96if (error != 0 || req->newptr == NULL)97return (error);98if (value < 1 || value > 100 * AQM_TIME_1S)99return (EINVAL);100fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ;101102return (0);103}104105static int106fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)107{108int error;109long value;110111value = fq_codel_sysctl.ccfg.target;112value /= AQM_TIME_1US;113error = sysctl_handle_long(oidp, &value, 0, req);114if (error != 0 || req->newptr == NULL)115return (error);116if (value < 1 || value > 5 * AQM_TIME_1S)117return (EINVAL);118fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ;119120return (0);121}122123SYSBEGIN(f4)124125SYSCTL_DECL(_net_inet);126SYSCTL_DECL(_net_inet_ip);127SYSCTL_DECL(_net_inet_ip_dummynet);128static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel,129CTLFLAG_RW | CTLFLAG_MPSAFE, 0,130"FQ_CODEL");131132#ifdef SYSCTL_NODE133134SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target,135CTLTYPE_LONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT,136NULL, 0, fqcodel_sysctl_target_handler, "L",137"FQ_CoDel target in microsecond");138SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval,139CTLTYPE_LONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT,140NULL, 0, fqcodel_sysctl_interval_handler, "L",141"FQ_CoDel interval in microsecond");142143SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum,144CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum");145SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows,146CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024,147"Number of queues for FQ_CoDel");148SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit,149CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit");150#endif151152/* Drop a packet form the head of codel queue */153static void154codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si)155{156struct mbuf *m = q->mq.head;157158if (m == NULL)159return;160q->mq.head = m->m_nextpkt;161162fq_update_stats(q, si, -m->m_pkthdr.len, 1);163164if (si->main_q.ni.length == 0) /* queue is now idle */165si->main_q.q_time = V_dn_cfg.curr_time;166167FREE_PKT(m);168}169170/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet.171* Return 1 when unable to add timestamp, otherwise return 0172*/173static int174codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si)175{176uint64_t len;177178len = m->m_pkthdr.len;179/* finding maximum packet size */180if (len > q->cst.maxpkt_size)181q->cst.maxpkt_size = len;182183/* Add timestamp to mbuf as MTAG */184struct m_tag *mtag;185mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);186if (mtag == NULL)187mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t),188M_NOWAIT);189if (mtag == NULL)190goto drop;191*(aqm_time_t *)(mtag + 1) = AQM_UNOW;192m_tag_prepend(m, mtag);193194if (m->m_pkthdr.rcvif != NULL)195m_rcvif_serialize(m);196197mq_append(&q->mq, m);198fq_update_stats(q, si, len, 0);199return 0;200201drop:202fq_update_stats(q, si, len, 1);203m_freem(m);204return 1;205}206207/*208* Classify a packet to queue number using Jenkins hash function.209* Return: queue number210* the input of the hash are protocol no, perturbation, src IP, dst IP,211* src port, dst port,212*/213static inline int214fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si)215{216struct ip *ip;217struct tcphdr *th;218struct udphdr *uh;219uint8_t tuple[41];220uint16_t hash=0;221222ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off);223//#ifdef INET6224struct ip6_hdr *ip6;225int isip6;226isip6 = (ip->ip_v == 6);227228if(isip6) {229ip6 = (struct ip6_hdr *)ip;230*((uint8_t *) &tuple[0]) = ip6->ip6_nxt;231*((uint32_t *) &tuple[1]) = si->perturbation;232memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);233memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);234235switch (ip6->ip6_nxt) {236case IPPROTO_TCP:237th = (struct tcphdr *)(ip6 + 1);238*((uint16_t *) &tuple[37]) = th->th_dport;239*((uint16_t *) &tuple[39]) = th->th_sport;240break;241242case IPPROTO_UDP:243uh = (struct udphdr *)(ip6 + 1);244*((uint16_t *) &tuple[37]) = uh->uh_dport;245*((uint16_t *) &tuple[39]) = uh->uh_sport;246break;247default:248memset(&tuple[37], 0, 4);249}250251hash = jenkins_hash(tuple, 41, HASHINIT) % fcount;252return hash;253}254//#endif255256/* IPv4 */257*((uint8_t *) &tuple[0]) = ip->ip_p;258*((uint32_t *) &tuple[1]) = si->perturbation;259*((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;260*((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;261262switch (ip->ip_p) {263case IPPROTO_TCP:264th = (struct tcphdr *)(ip + 1);265*((uint16_t *) &tuple[13]) = th->th_dport;266*((uint16_t *) &tuple[15]) = th->th_sport;267break;268269case IPPROTO_UDP:270uh = (struct udphdr *)(ip + 1);271*((uint16_t *) &tuple[13]) = uh->uh_dport;272*((uint16_t *) &tuple[15]) = uh->uh_sport;273break;274default:275memset(&tuple[13], 0, 4);276}277hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;278279return hash;280}281282/*283* Enqueue a packet into an appropriate queue according to284* FQ_CODEL algorithm.285*/286static int287fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,288struct mbuf *m)289{290struct fq_codel_si *si;291struct fq_codel_schk *schk;292struct dn_sch_fq_codel_parms *param;293struct dn_queue *mainq;294int idx, drop, i, maxidx;295296mainq = (struct dn_queue *)(_si + 1);297si = (struct fq_codel_si *)_si;298schk = (struct fq_codel_schk *)(si->_si.sched+1);299param = &schk->cfg;300301/* classify a packet to queue number*/302idx = fq_codel_classify_flow(m, param->flows_cnt, si);303/* enqueue packet into appropriate queue using CoDel AQM.304* Note: 'codel_enqueue' function returns 1 only when it unable to305* add timestamp to packet (no limit check)*/306drop = codel_enqueue(&si->flows[idx], m, si);307308/* codel unable to timestamp a packet */309if (drop)310return 1;311312/* If the flow (sub-queue) is not active ,then add it to the tail of313* new flows list, initialize and activate it.314*/315if (!si->flows[idx].active ) {316STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);317si->flows[idx].deficit = param->quantum;318si->flows[idx].cst.dropping = false;319si->flows[idx].cst.first_above_time = 0;320si->flows[idx].active = 1;321//D("activate %d",idx);322}323324/* check the limit for all queues and remove a packet from the325* largest one326*/327if (mainq->ni.length > schk->cfg.limit) { D("over limit");328/* find first active flow */329for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)330if (si->flows[maxidx].active)331break;332if (maxidx < schk->cfg.flows_cnt) {333/* find the largest sub- queue */334for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)335if (si->flows[i].active && si->flows[i].stats.length >336si->flows[maxidx].stats.length)337maxidx = i;338codel_drop_head(&si->flows[maxidx], si);339D("maxidx = %d",maxidx);340drop = 1;341}342}343344return drop;345}346347/*348* Dequeue a packet from an appropriate queue according to349* FQ_CODEL algorithm.350*/351static struct mbuf *352fq_codel_dequeue(struct dn_sch_inst *_si)353{354struct fq_codel_si *si;355struct fq_codel_schk *schk;356struct dn_sch_fq_codel_parms *param;357struct fq_codel_flow *f;358struct mbuf *mbuf;359struct fq_codel_list *fq_codel_flowlist;360361si = (struct fq_codel_si *)_si;362schk = (struct fq_codel_schk *)(si->_si.sched+1);363param = &schk->cfg;364365do {366/* select a list to start with */367if (STAILQ_EMPTY(&si->newflows))368fq_codel_flowlist = &si->oldflows;369else370fq_codel_flowlist = &si->newflows;371372/* Both new and old queue lists are empty, return NULL */373if (STAILQ_EMPTY(fq_codel_flowlist))374return NULL;375376f = STAILQ_FIRST(fq_codel_flowlist);377while (f != NULL) {378/* if there is no flow(sub-queue) deficit, increase deficit379* by quantum, move the flow to the tail of old flows list380* and try another flow.381* Otherwise, the flow will be used for dequeue.382*/383if (f->deficit < 0) {384f->deficit += param->quantum;385STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);386STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);387} else388break;389390f = STAILQ_FIRST(fq_codel_flowlist);391}392393/* the new flows list is empty, try old flows list */394if (STAILQ_EMPTY(fq_codel_flowlist))395continue;396397/* Dequeue a packet from the selected flow */398mbuf = fqc_codel_dequeue(f, si);399400/* Codel did not return a packet */401if (!mbuf) {402/* If the selected flow belongs to new flows list, then move403* it to the tail of old flows list. Otherwise, deactivate it and404* remove it from the old list and405*/406if (fq_codel_flowlist == &si->newflows) {407STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);408STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);409} else {410f->active = 0;411STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);412}413/* start again */414continue;415}416417/* we have a packet to return,418* update flow deficit and return the packet*/419f->deficit -= mbuf->m_pkthdr.len;420return mbuf;421422} while (1);423424/* unreachable point */425return NULL;426}427428/*429* Initialize fq_codel scheduler instance.430* also, allocate memory for flows array.431*/432static int433fq_codel_new_sched(struct dn_sch_inst *_si)434{435struct fq_codel_si *si;436struct dn_queue *q;437struct fq_codel_schk *schk;438int i;439440si = (struct fq_codel_si *)_si;441schk = (struct fq_codel_schk *)(_si->sched+1);442443if(si->flows) {444D("si already configured!");445return 0;446}447448/* init the main queue */449q = &si->main_q;450set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));451q->_si = _si;452q->fs = _si->sched->fs;453454/* allocate memory for flows array */455si->flows = mallocarray(schk->cfg.flows_cnt,456sizeof(struct fq_codel_flow), M_DUMMYNET, M_NOWAIT | M_ZERO);457if (si->flows == NULL) {458D("cannot allocate memory for fq_codel configuration parameters");459return ENOMEM ;460}461462/* init perturbation for this si */463si->perturbation = random();464465/* init the old and new flows lists */466STAILQ_INIT(&si->newflows);467STAILQ_INIT(&si->oldflows);468469/* init the flows (sub-queues) */470for (i = 0; i < schk->cfg.flows_cnt; i++) {471/* init codel */472si->flows[i].cst.maxpkt_size = 500;473}474475fq_codel_desc.ref_count++;476return 0;477}478479/*480* Free fq_codel scheduler instance.481*/482static int483fq_codel_free_sched(struct dn_sch_inst *_si)484{485struct fq_codel_si *si = (struct fq_codel_si *)_si ;486487/* free the flows array */488free(si->flows , M_DUMMYNET);489si->flows = NULL;490fq_codel_desc.ref_count--;491492return 0;493}494495/*496* Configure fq_codel scheduler.497* the configurations for the scheduler is passed from userland.498*/499static int500fq_codel_config(struct dn_schk *_schk)501{502struct fq_codel_schk *schk;503struct dn_extra_parms *ep;504struct dn_sch_fq_codel_parms *fqc_cfg;505506schk = (struct fq_codel_schk *)(_schk+1);507ep = (struct dn_extra_parms *) _schk->cfg;508509/* par array contains fq_codel configuration as follow510* Codel: 0- target,1- interval, 2- flags511* FQ_CODEL: 3- quantum, 4- limit, 5- flows512*/513if (ep && ep->oid.len ==sizeof(*ep) &&514ep->oid.subtype == DN_SCH_PARAMS) {515fqc_cfg = &schk->cfg;516if (ep->par[0] < 0)517fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target;518else519fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US;520521if (ep->par[1] < 0)522fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval;523else524fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US;525526if (ep->par[2] < 0)527fqc_cfg->ccfg.flags = 0;528else529fqc_cfg->ccfg.flags = ep->par[2];530531/* FQ configurations */532if (ep->par[3] < 0)533fqc_cfg->quantum = fq_codel_sysctl.quantum;534else535fqc_cfg->quantum = ep->par[3];536537if (ep->par[4] < 0)538fqc_cfg->limit = fq_codel_sysctl.limit;539else540fqc_cfg->limit = ep->par[4];541542if (ep->par[5] < 0)543fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt;544else545fqc_cfg->flows_cnt = ep->par[5];546547/* Bound the configurations */548fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1 ,5495 * AQM_TIME_1S); ;550fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1,551100 * AQM_TIME_1S);552553fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000);554fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480);555fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536);556}557else558return 1;559560return 0;561}562563/*564* Return fq_codel scheduler configurations565* the configurations for the scheduler is passed to userland.566*/567static int568fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) {569struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1);570struct dn_sch_fq_codel_parms *fqc_cfg;571572fqc_cfg = &schk->cfg;573574strcpy(ep->name, fq_codel_desc.name);575ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US;576ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US;577ep->par[2] = fqc_cfg->ccfg.flags;578579ep->par[3] = fqc_cfg->quantum;580ep->par[4] = fqc_cfg->limit;581ep->par[5] = fqc_cfg->flows_cnt;582583return 0;584}585586/*587* fq_codel scheduler descriptor588* contains the type of the scheduler, the name, the size of extra589* data structures, and function pointers.590*/591static struct dn_alg fq_codel_desc = {592_SI( .type = ) DN_SCHED_FQ_CODEL,593_SI( .name = ) "FQ_CODEL",594_SI( .flags = ) 0,595596_SI( .schk_datalen = ) sizeof(struct fq_codel_schk),597_SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst),598_SI( .q_datalen = ) 0,599600_SI( .enqueue = ) fq_codel_enqueue,601_SI( .dequeue = ) fq_codel_dequeue,602_SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/603_SI( .destroy = ) NULL, /*sched x delete */604_SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */605_SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */606_SI( .new_fsk = ) NULL,607_SI( .free_fsk = ) NULL,608_SI( .new_queue = ) NULL,609_SI( .free_queue = ) NULL,610_SI( .getconfig = ) fq_codel_getconfig,611_SI( .ref_count = ) 0612};613614DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc);615616617