/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * internal dummynet APIs.
 */

#ifndef _IP_DN_PRIVATE_H
#define _IP_DN_PRIVATE_H

/* debugging support
 * use ND() to remove debugging, D() to print a line,
 * DX(level, ...) to print above a certain level
 * If you redefine D() you are expected to redefine all.
 */
#ifndef D
#define ND(fmt, ...) do {} while (0)
#define D1(fmt, ...) do {} while (0)
#define D(fmt, ...) printf("%-10s " fmt "\n",	\
	__FUNCTION__, ## __VA_ARGS__)
#define DX(lev, fmt, ...) do {			\
	if (V_dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
#endif

MALLOC_DECLARE(M_DUMMYNET);

#ifndef __linux__
/* 64-bit signed division helper (Linux has its own div64). */
#define div64(a, b)	((int64_t)(a) / (int64_t)(b))
#endif

/* Initialize/destroy the two dummynet mutexes (see the comment in
 * struct dn_parms below for what each mutex protects). */
#define DN_LOCK_INIT() do {				\
	mtx_init(&V_dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF);	\
	mtx_init(&V_dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF);	\
	} while (0)
#define DN_LOCK_DESTROY() do {				\
	mtx_destroy(&V_dn_cfg.uh_mtx);			\
	mtx_destroy(&V_dn_cfg.bh_mtx);			\
	} while (0)
#if 0 /* not used yet */
#define DN_UH_RLOCK()		mtx_lock(&V_dn_cfg.uh_mtx)
#define DN_UH_RUNLOCK()		mtx_unlock(&V_dn_cfg.uh_mtx)
#define DN_UH_WLOCK()		mtx_lock(&V_dn_cfg.uh_mtx)
#define DN_UH_WUNLOCK()		mtx_unlock(&V_dn_cfg.uh_mtx)
#define DN_UH_LOCK_ASSERT()	mtx_assert(&V_dn_cfg.uh_mtx, MA_OWNED)
#endif

/* NOTE(review): the BH macros lock uh_mtx, not bh_mtx — presumably a
 * deliberate single-lock scheme while bh_mtx is unused; confirm before
 * "fixing". */
#define DN_BH_RLOCK()		mtx_lock(&V_dn_cfg.uh_mtx)
#define DN_BH_RUNLOCK()		mtx_unlock(&V_dn_cfg.uh_mtx)
#define DN_BH_WLOCK()		mtx_lock(&V_dn_cfg.uh_mtx)
#define DN_BH_WUNLOCK()		mtx_unlock(&V_dn_cfg.uh_mtx)
#define DN_BH_LOCK_ASSERT()	mtx_assert(&V_dn_cfg.uh_mtx, MA_OWNED)

SLIST_HEAD(dn_fsk_head, dn_fsk);

struct mq {	/* a basic queue of packets*/
	struct mbuf *head, *tail;
	int count;
};

/* Initialize a dn_id header: set type and length, clear subtype. */
static inline void
set_oid(struct dn_id *o, int type, int len)
{
	o->type = type;
	o->len = len;
	o->subtype = 0;
}

/*
 * configuration and data for a dummynet instance
 *
 * When a configuration is modified from userland, 'id' is incremented
 * so we can use the value to check for stale pointers.
 */
struct dn_parms {
	uint32_t	id;		/* configuration version */

	/* defaults (sysctl-accessible) */
	int	red_lookup_depth;
	int	red_avg_pkt_size;
	int	red_max_pkt_size;
	int	hash_size;
	int	max_hash_size;
	long	byte_limit;		/* max queue sizes */
	long	slot_limit;

	int	io_fast;
	int	debug;

	/* timekeeping */
	struct timeval prev_t;		/* last time dummynet_tick ran */
	struct dn_heap	evheap;		/* scheduled events */

	long	tick_last;		/* Last tick duration (usec). */
	long	tick_delta;		/* Last vs standard tick diff (usec). */
	long	tick_delta_sum;		/* Accumulated tick difference (usec).*/
	long	tick_adjustment;	/* Tick adjustments done. */
	long	tick_lost;		/* Lost(coalesced) ticks number. */
	/* Adjusted vs non-adjusted curr_time difference (ticks). */
	long	tick_diff;

	/* counters of objects -- used for reporting space */
	int	schk_count;
	int	si_count;
	int	fsk_count;
	int	queue_count;

	/* packet counters */
	unsigned long	io_pkt;
	unsigned long	io_pkt_fast;
	unsigned long	io_pkt_drop;

	/* ticks and other stuff */
	uint64_t	curr_time;
	/* flowsets and schedulers are in hash tables, with 'hash_size'
	 * buckets. fshash is looked up at every packet arrival
	 * so better be generous if we expect many entries.
	 */
	struct dn_ht	*fshash;
	struct dn_ht	*schedhash;
	/* list of flowsets without a scheduler -- use sch_chain */
	struct dn_fsk_head	fsu;	/* list of unlinked flowsets */

	/* Store the fs/sch to scan when draining. The value is the
	 * bucket number of the hash table. Expire can be disabled
	 * with net.inet.ip.dummynet.expire=0, or it happens every
	 * expire ticks.
	 */
	int drain_fs;
	int drain_sch;
	uint32_t expire;
	uint32_t expire_cycle;	/* tick count */

	int init_done;

#ifdef _KERNEL
	/*
	 * This file is normally used in the kernel, unless we do
	 * some userland tests, in which case we do not need a mtx.
	 * uh_mtx arbitrates between system calls and also
	 * protects fshash, schedhash and fsunlinked.
	 * These structures are readonly for the lower half.
	 * bh_mtx protects all other structures which may be
	 * modified upon packet arrivals
	 */
#if defined( __linux__ ) || defined( _WIN32 )
	spinlock_t uh_mtx;
	spinlock_t bh_mtx;
#else
	struct mtx uh_mtx;
	struct mtx bh_mtx;
#endif

#endif /* _KERNEL */
};

/*
 * Delay line, contains all packets on output from a link.
 * Every scheduler instance has one.
 */
struct delay_line {
	struct dn_id oid;
	struct dn_sch_inst *si;
	struct mq mq;
};

/*
 * The kernel side of a flowset. It is linked in a hash table
 * of flowsets, and in a list of children of their parent scheduler.
 * qht is either the queue or (if HAVE_MASK) a hash table of queues.
 * Note that the mask to use is the (flow_mask|sched_mask), which
 * changes as we attach/detach schedulers. So we store it here.
 *
 * XXX If we want to add scheduler-specific parameters, we need to
 * put them in external storage because the scheduler may not be
 * available when the fsk is created.
 */
struct dn_fsk { /* kernel side of a flowset */
	struct dn_fs fs;
	SLIST_ENTRY(dn_fsk) fsk_next;	/* hash chain for fshash */

	struct ipfw_flow_id fsk_mask;

	/* qht is a hash table of queues, or just a single queue
	 * a bit in fs.flags tells us which one
	 */
	struct dn_ht	*qht;
	struct dn_schk *sched;		/* Sched we are linked to */
	SLIST_ENTRY(dn_fsk) sch_chain;	/* list of fsk attached to sched */

	/* bucket index used by drain routine to drain queues for this
	 * flowset
	 */
	int drain_bucket;
	/* Parameters related to RED / GRED */
	/* original values are in dn_fs*/
	int w_q ;		/* queue weight (scaled) */
	int max_th ;		/* maximum threshold for queue (scaled) */
	int min_th ;		/* minimum threshold for queue (scaled) */
	int max_p ;		/* maximum value for p_b (scaled) */

	u_int c_1 ;		/* max_p/(max_th-min_th) (scaled) */
	u_int c_2 ;		/* max_p*min_th/(max_th-min_th) (scaled) */
	u_int c_3 ;		/* for GRED, (1-max_p)/max_th (scaled) */
	u_int c_4 ;		/* for GRED, 1 - 2*max_p (scaled) */
	u_int * w_q_lookup ;	/* lookup table for computing (1-w_q)^t */
	u_int lookup_depth ;	/* depth of lookup table */
	int lookup_step ;	/* granularity inside the lookup table */
	int lookup_weight ;	/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
	int avg_pkt_size ;	/* medium packet size */
	int max_pkt_size ;	/* max packet size */
#ifdef NEW_AQM
	struct dn_aqm *aqmfp;	/* Pointer to AQM functions */
	void *aqmcfg;	/* configuration parameters for AQM */
#endif
};

/*
 * A queue is created as a child of a flowset unless it belongs to
 * a !MULTIQUEUE scheduler. It is normally in a hash table in the
 * flowset. fs always points to the parent flowset.
 * si normally points to the sch_inst, unless the flowset has been
 * detached from the scheduler -- in this case si == NULL and we
 * should not enqueue.
 */
struct dn_queue {
	struct dn_flow ni;	/* oid, flow_id, stats */
	struct mq mq;	/* packets queue */
	struct dn_sch_inst *_si;	/* owner scheduler instance */
	SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
	struct dn_fsk *fs;		/* parent flowset. */

	/* RED parameters */
	int avg;		/* average queue length est. (scaled) */
	int count;		/* arrivals since last RED drop */
	int random;		/* random value (scaled) */
	uint64_t q_time;	/* start of queue idle time */
#ifdef NEW_AQM
	void *aqm_status;	/* per-queue status variables*/
#endif

};

/*
 * The kernel side of a scheduler. Contains the userland config,
 * a link, pointer to extra config arguments from command line,
 * kernel flags, and a pointer to the scheduler methods.
 * It is stored in a hash table, and holds a list of all
 * flowsets and scheduler instances.
 * XXX sch must be at the beginning, see schk_hash().
 */
struct dn_schk {
	struct dn_sch sch;
	struct dn_alg *fp;	/* Pointer to scheduler functions */
	struct dn_link link;	/* The link, embedded */
	struct dn_profile *profile; /* delay profile, if any */
	struct dn_id *cfg;	/* extra config arguments */

	SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */

	struct dn_fsk_head fsk_list;  /* all fsk linked to me */
	struct dn_fsk *fs;	/* Flowset for !MULTIQUEUE */

	/* bucket index used by the drain routine to drain the scheduler
	 * instance for this flowset.
	 */
	int drain_bucket;

	/* Hash table of all instances (through sch.sched_mask)
	 * or single instance if no mask. Always valid.
	 */
	struct dn_ht	*siht;
};

/*
 * Scheduler instance.
 * Contains variables and all queues relative to this instance.
 * This struct is created at runtime.
 */
struct dn_sch_inst {
	struct dn_flow	ni;	/* oid, flowid and stats */
	SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
	struct delay_line dline;
	struct dn_schk *sched;	/* the template */
	int		kflags;	/* DN_ACTIVE */

	int64_t	credit;		/* bits I can transmit (more or less). */
	uint64_t sched_time;	/* time link was scheduled in ready_heap */
	uint64_t idle_time;	/* start of scheduler instance idle time */

	/* q_count is the number of queues that this instance is using.
	 * The counter is incremented or decremented when
	 * a reference from the queue is created or deleted.
	 * It is used to make sure that a scheduler instance can be safely
	 * deleted by the drain routine. See notes below.
	 */
	int q_count;

};

/*
 * NOTE about object drain.
 * The system will automatically (XXX check when) drain queues and
 * scheduler instances when they are idle.
 * A queue is idle when it has no packets; an instance is idle when
 * it is not in the evheap heap, and the corresponding delay line is empty.
 * A queue can be safely deleted when it is idle because the scheduler
 * function xxx_free_queue() will remove any references to it.
 * An instance can be only deleted when no queues reference it. To be sure
 * of that, a counter (q_count) stores the number of queues that are pointing
 * to the instance.
 *
 * XXX
 * Order of scan:
 * - take all flowsets in a bucket for the flowset hash table
 * - take all queues in a bucket for the flowset
 * - increment the queue bucket
 * - scan next flowset bucket
 * Nothing is done if a bucket contains no entries.
 *
 * The same schema is used for scheduler instances
 */

/* kernel-side flags. Linux has DN_DELETE in fcntl.h
 */
enum {
	/* 1 and 2 are reserved for the SCAN flags */
	DN_DESTROY	= 0x0004, /* destroy */
	DN_DELETE_FS	= 0x0008, /* destroy flowset */
	DN_DETACH	= 0x0010,
	DN_ACTIVE	= 0x0020, /* object is in evheap */
	DN_F_DLINE	= 0x0040, /* object is a delay line */
	DN_DEL_SAFE	= 0x0080, /* delete a queue only if no longer needed
				   * by scheduler */
	DN_QHT_IS_Q	= 0x0100, /* in flowset, qht is a single queue */
};

/*
 * Packets processed by dummynet have an mbuf tag associated with
 * them that carries their dummynet state.
 * Outside dummynet, only the 'rule' field is relevant, and it must
 * be at the beginning of the structure.
 */
struct dn_pkt_tag {
	struct ipfw_rule_ref rule;	/* matching rule	*/

	/* second part, dummynet specific */
	int dn_dir;		/* action when packet comes out.*/
				/* see ip_fw_private.h		*/
	uint64_t output_time;	/* when the pkt is due for delivery*/
	uint16_t if_index;
	uint16_t if_idxgen;
	uint16_t iphdr_off;	/* IP header offset for mtodo()	*/
};

/*
 * Possible values for dn_dir. XXXGL: this needs to be reviewed
 * and converted to same values ip_fw_args.flags use.
 */
enum {
	DIR_OUT =	0,
	DIR_IN =	1,
	DIR_FWD =	2,
	DIR_DROP =	3,
	PROTO_LAYER2 =	0x4, /* set for layer 2 */
	PROTO_IPV4 =	0x08,
	PROTO_IPV6 =	0x10,
	PROTO_IFB =	0x0c, /* layer2 + ifbridge */
};

/*
 * States for the Packet Loss Rate Gilbert-Elliott
 * channel model
 */
enum {
	PLR_STATE_G = 0,
	PLR_STATE_B,
};

//extern struct dn_parms V_dn_cfg;
VNET_DECLARE(struct dn_parms, dn_cfg);
#define V_dn_cfg VNET(dn_cfg)

int dummynet_io(struct mbuf **, struct ip_fw_args *);
void dummynet_sched_lock(void);
void dummynet_sched_unlock(void);
void dummynet_task(void *context, int pending);
void dn_reschedule(void);
struct dn_pkt_tag * dn_tag_get(struct mbuf *m);

struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,
        struct ipfw_flow_id *);
struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);

/*
 * copy_range is a template for requests for ranges of pipes/queues/scheds.
 * The number of ranges is variable and can be derived by o.len.
 * As a default, we use a small number of entries so that the struct
 * fits easily on the stack and is sufficient for most common requests.
 */
#define DEFAULT_RANGES	5
struct copy_range {
        struct dn_id o;
        uint32_t	r[ 2 * DEFAULT_RANGES ];
};

struct copy_args {
	char **start;
	char *end;
	int flags;
	int type;
	struct copy_range *extra;	/* extra filtering */
};

struct sockopt;
int ip_dummynet_compat(struct sockopt *sopt);
int dummynet_get(struct sockopt *sopt, void **compat);
int dn_c_copy_q (void *_ni, void *arg);
int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
int dn_compat_copy_queue(struct copy_args *a, void *_o);
int dn_compat_copy_pipe(struct copy_args *a, void *_o);
int copy_data_helper_compat(void *_o, void *_arg);
int dn_compat_calc_size(void);
int do_config(void *p, size_t l);

/* function to drain idle object */
void dn_drain_scheduler(void);
void dn_drain_queue(void);

#ifdef NEW_AQM
int ecn_mark(struct mbuf* m);

/* moved from ip_dn_io.c to here to be available for AQMs modules*/
static inline void
mq_append(struct mq *q, struct mbuf *m)
{
#ifdef USERSPACE
	// buffers from netmap need to be copied
	// XXX note that the routine is not expected to fail
	ND("append %p to %p", m, q);
	if (m->m_flags & M_STACK) {
		struct mbuf *m_new;
		void *p;
		int l, ofs;

		ofs = m->m_data - m->__m_extbuf;
		// XXX allocate
		MGETHDR(m_new, M_NOWAIT, MT_DATA);
		ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p",
			m, m->__m_extbuf, m->__m_extlen, ofs, m_new);
		p = m_new->__m_extbuf;	/* new pointer */
		l = m_new->__m_extlen;	/* new len */
		/* NOTE(review): panics when the new buffer is <= the old
		 * data length; the equality case looks overly strict
		 * (l == __m_extlen would still fit) — confirm intent. */
		if (l <= m->__m_extlen) {
			panic("extlen too large");
		}

		*m_new = *m;	// copy
		m_new->m_flags &= ~M_STACK;
		m_new->__m_extbuf = p; // point to new buffer
		_pkt_copy(m->__m_extbuf, p, m->__m_extlen);
		m_new->m_data = p + ofs;
		m = m_new;
	}
#endif /* USERSPACE */
	if (q->head == NULL)
		q->head = m;
	else
		q->tail->m_nextpkt = m;
	q->count++;
	q->tail = m;
	m->m_nextpkt = NULL;
}
#endif /* NEW_AQM */

#endif /* _IP_DN_PRIVATE_H */