/*-1* SPDX-License-Identifier: BSD-2-Clause2*3* Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa4* Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa5* All rights reserved6*7* Redistribution and use in source and binary forms, with or without8* modification, are permitted provided that the following conditions9* are met:10* 1. Redistributions of source code must retain the above copyright11* notice, this list of conditions and the following disclaimer.12* 2. Redistributions in binary form must reproduce the above copyright13* notice, this list of conditions and the following disclaimer in the14* documentation and/or other materials provided with the distribution.15*16* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND17* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE18* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE19* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE20* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL21* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS22* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)23* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT24* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY25* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF26* SUCH DAMAGE.27*/2829/*30*/3132#ifdef _KERNEL33#include <sys/malloc.h>34#include <sys/socket.h>35#include <sys/socketvar.h>36#include <sys/kernel.h>37#include <sys/lock.h>38#include <sys/mbuf.h>39#include <sys/module.h>40#include <sys/rwlock.h>41#include <net/if.h> /* IFNAMSIZ */42#include <netinet/in.h>43#include <netinet/ip_var.h> /* ipfw_rule_ref */44#include <netinet/ip_fw.h> /* flow_id */45#include <netinet/ip_dummynet.h>46#include <netpfil/ipfw/ip_fw_private.h>47#include <netpfil/ipfw/dn_heap.h>48#include <netpfil/ipfw/ip_dn_private.h>49#ifdef NEW_AQM50#include <netpfil/ipfw/dn_aqm.h>51#endif52#include <netpfil/ipfw/dn_sched.h>53#else54#include <dn_test.h>55#endif5657#ifndef MAX6458#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)59#endif6061/*62* timestamps are computed on 64 bit using fixed point arithmetic.63* LMAX_BITS, WMAX_BITS are the max number of bits for the packet len64* and sum of weights, respectively. FRAC_BITS is the number of65* fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large66* errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w67* using an unsigned 32-bit division, and to avoid wraparounds we need68* LMAX_BITS + WMAX_BITS + FRAC_BITS << 6469* As an example70* FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 1971*/72#ifndef FRAC_BITS73#define FRAC_BITS 28 /* shift for fixed point arithmetic */74#define ONE_FP (1UL << FRAC_BITS)75#endif7677/*78* Private information for the scheduler instance:79* sch_heap (key is Finish time) returns the next queue to serve80* ne_heap (key is Start time) stores not-eligible queues81* idle_heap (key=start/finish time) stores idle flows. It must82* support extract-from-middle.83* A flow is only in 1 of the three heaps.84* XXX todo: use a more efficient data structure, e.g. a tree sorted85* by F with min_subtree(S) in each node86*/87struct wf2qp_si {88struct dn_heap sch_heap; /* top extract - key Finish time */89struct dn_heap ne_heap; /* top extract - key Start time */90struct dn_heap idle_heap; /* random extract - key Start=Finish time */91uint64_t V; /* virtual time */92uint32_t inv_wsum; /* inverse of sum of weights */93uint32_t wsum; /* sum of weights */94};9596struct wf2qp_queue {97struct dn_queue _q;98uint64_t S, F; /* start time, finish time */99uint32_t inv_w; /* ONE_FP / weight */100int32_t heap_pos; /* position (index) of struct in heap */101};102103/*104* This file implements a WF2Q+ scheduler as it has been in dummynet105* since 2000.106* The scheduler supports per-flow queues and has O(log N) complexity.107*108* WF2Q+ needs to drain entries from the idle heap so that we109* can keep the sum of weights up to date. We can do it whenever110* we get a chance, or periodically, or following some other111* strategy. The function idle_check() drains at most N elements112* from the idle heap.113*/114static void115idle_check(struct wf2qp_si *si, int n, int force)116{117struct dn_heap *h = &si->idle_heap;118while (n-- > 0 && h->elements > 0 &&119(force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {120struct dn_queue *q = HEAP_TOP(h)->object;121struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;122123heap_extract(h, NULL);124/* XXX to let the flowset delete the queue we should125* mark it as 'unused' by the scheduler.126*/127alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */128si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */129if (si->wsum > 0)130si->inv_wsum = ONE_FP/si->wsum;131}132}133134static int135wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)136{137struct dn_fsk *fs = q->fs;138struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);139struct wf2qp_queue *alg_fq;140uint64_t len = m->m_pkthdr.len;141142if (m != q->mq.head) {143if (dn_enqueue(q, m, 0)) /* packet was dropped */144return 1;145if (m != q->mq.head) /* queue was already busy */146return 0;147}148149/* If reach this point, queue q was idle */150alg_fq = (struct wf2qp_queue *)q;151152if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {153/* F<S means timestamps are invalid ->brand new queue. */154alg_fq->S = si->V; /* init start time */155si->wsum += fs->fs.par[0]; /* add weight of new queue. */156si->inv_wsum = ONE_FP/si->wsum;157} else { /* if it was idle then it was in the idle heap */158if (! heap_extract(&si->idle_heap, q))159return 1;160alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */161}162alg_fq->F = alg_fq->S + len * alg_fq->inv_w;163164/* if nothing is backlogged, make sure this flow is eligible */165if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)166si->V = MAX64(alg_fq->S, si->V);167168/*169* Look at eligibility. A flow is not eligibile if S>V (when170* this happens, it means that there is some other flow already171* scheduled for the same pipe, so the sch_heap cannot be172* empty). If the flow is not eligible we just store it in the173* ne_heap. Otherwise, we store in the sch_heap.174* Note that for all flows in sch_heap (SCH), S_i <= V,175* and for all flows in ne_heap (NEH), S_i > V.176* So when we need to compute max(V, min(S_i)) forall i in177* SCH+NEH, we only need to look into NEH.178*/179if (DN_KEY_LT(si->V, alg_fq->S)) {180/* S>V means flow Not eligible. */181if (si->sch_heap.elements == 0)182D("++ ouch! not eligible but empty scheduler!");183heap_insert(&si->ne_heap, alg_fq->S, q);184} else {185heap_insert(&si->sch_heap, alg_fq->F, q);186}187return 0;188}189190/* XXX invariant: sch > 0 || V >= min(S in neh) */191static struct mbuf *192wf2qp_dequeue(struct dn_sch_inst *_si)193{194/* Access scheduler instance private data */195struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);196struct mbuf *m;197struct dn_queue *q;198struct dn_heap *sch = &si->sch_heap;199struct dn_heap *neh = &si->ne_heap;200struct wf2qp_queue *alg_fq;201202if (sch->elements == 0 && neh->elements == 0) {203/* we have nothing to do. We could kill the idle heap204* altogether and reset V205*/206idle_check(si, 0x7fffffff, 1);207si->V = 0;208si->wsum = 0; /* should be set already */209return NULL; /* quick return if nothing to do */210}211idle_check(si, 1, 0); /* drain something from the idle heap */212213/* make sure at least one element is eligible, bumping V214* and moving entries that have become eligible.215* We need to repeat the first part twice, before and216* after extracting the candidate, or enqueue() will217* find the data structure in a wrong state.218*/219m = NULL;220for(;;) {221/*222* Compute V = max(V, min(S_i)). Remember that all elements223* in sch have by definition S_i <= V so if sch is not empty,224* V is surely the max and we must not update it. Conversely,225* if sch is empty we only need to look at neh.226* We don't need to move the queues, as it will be done at the227* next enqueue228*/229if (sch->elements == 0 && neh->elements > 0) {230si->V = MAX64(si->V, HEAP_TOP(neh)->key);231}232while (neh->elements > 0 &&233DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {234q = HEAP_TOP(neh)->object;235alg_fq = (struct wf2qp_queue *)q;236heap_extract(neh, NULL);237heap_insert(sch, alg_fq->F, q);238}239if (m) /* pkt found in previous iteration */240break;241/* ok we have at least one eligible pkt */242q = HEAP_TOP(sch)->object;243alg_fq = (struct wf2qp_queue *)q;244m = dn_dequeue(q);245if (m == NULL)246return NULL;247heap_extract(sch, NULL); /* Remove queue from heap. */248si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;249alg_fq->S = alg_fq->F; /* Update start time. */250if (q->mq.head == 0) { /* not backlogged any more. */251heap_insert(&si->idle_heap, alg_fq->F, q);252} else { /* Still backlogged. */253/* Update F, store in neh or sch */254uint64_t len = q->mq.head->m_pkthdr.len;255alg_fq->F += len * alg_fq->inv_w;256if (DN_KEY_LEQ(alg_fq->S, si->V)) {257heap_insert(sch, alg_fq->F, q);258} else {259heap_insert(neh, alg_fq->S, q);260}261}262}263return m;264}265266static int267wf2qp_new_sched(struct dn_sch_inst *_si)268{269struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);270int ofs = offsetof(struct wf2qp_queue, heap_pos);271272/* all heaps support extract from middle */273if (heap_init(&si->idle_heap, 16, ofs) ||274heap_init(&si->sch_heap, 16, ofs) ||275heap_init(&si->ne_heap, 16, ofs)) {276heap_free(&si->ne_heap);277heap_free(&si->sch_heap);278heap_free(&si->idle_heap);279return ENOMEM;280}281return 0;282}283284static int285wf2qp_free_sched(struct dn_sch_inst *_si)286{287struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);288289heap_free(&si->sch_heap);290heap_free(&si->ne_heap);291heap_free(&si->idle_heap);292293return 0;294}295296static int297wf2qp_new_fsk(struct dn_fsk *fs)298{299ipdn_bound_var(&fs->fs.par[0], 1,3001, 100, "WF2Q+ weight");301return 0;302}303304static int305wf2qp_new_queue(struct dn_queue *_q)306{307struct wf2qp_queue *q = (struct wf2qp_queue *)_q;308309_q->ni.oid.subtype = DN_SCHED_WF2QP;310q->F = 0; /* not strictly necessary */311q->S = q->F + 1; /* mark timestamp as invalid. */312q->inv_w = ONE_FP / _q->fs->fs.par[0];313if (_q->mq.head != NULL) {314wf2qp_enqueue(_q->_si, _q, _q->mq.head);315}316return 0;317}318319/*320* Called when the infrastructure removes a queue (e.g. flowset321* is reconfigured). Nothing to do if we did not 'own' the queue,322* otherwise remove it from the right heap and adjust the sum323* of weights.324*/325static int326wf2qp_free_queue(struct dn_queue *q)327{328struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;329struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);330331if (alg_fq->S >= alg_fq->F + 1)332return 0; /* nothing to do, not in any heap */333si->wsum -= q->fs->fs.par[0];334if (si->wsum > 0)335si->inv_wsum = ONE_FP/si->wsum;336337/* extract from the heap. XXX TODO we may need to adjust V338* to make sure the invariants hold.339*/340heap_extract(&si->idle_heap, q);341heap_extract(&si->ne_heap, q);342heap_extract(&si->sch_heap, q);343344return 0;345}346347/*348* WF2Q+ scheduler descriptor349* contains the type of the scheduler, the name, the size of the350* structures and function pointers.351*/352static struct dn_alg wf2qp_desc = {353_SI( .type = ) DN_SCHED_WF2QP,354_SI( .name = ) "WF2Q+",355_SI( .flags = ) DN_MULTIQUEUE,356357/* we need extra space in the si and the queue */358_SI( .schk_datalen = ) 0,359_SI( .si_datalen = ) sizeof(struct wf2qp_si),360_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -361sizeof(struct dn_queue),362363_SI( .enqueue = ) wf2qp_enqueue,364_SI( .dequeue = ) wf2qp_dequeue,365366_SI( .config = ) NULL,367_SI( .destroy = ) NULL,368_SI( .new_sched = ) wf2qp_new_sched,369_SI( .free_sched = ) wf2qp_free_sched,370371_SI( .new_fsk = ) wf2qp_new_fsk,372_SI( .free_fsk = ) NULL,373374_SI( .new_queue = ) wf2qp_new_queue,375_SI( .free_queue = ) wf2qp_free_queue,376#ifdef NEW_AQM377_SI( .getconfig = ) NULL,378#endif379380};381382DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);383384385