Path: blob/main/sys/netpfil/ipfw/ip_fw_table_value.c
39482 views
/*-1* SPDX-License-Identifier: BSD-2-Clause2*3* Copyright (c) 2014-2025 Yandex LLC4* Copyright (c) 2014 Alexander V. Chernikov5*6* Redistribution and use in source and binary forms, with or without7* modification, are permitted provided that the following conditions8* are met:9* 1. Redistributions of source code must retain the above copyright10* notice, this list of conditions and the following disclaimer.11* 2. Redistributions in binary form must reproduce the above copyright12* notice, this list of conditions and the following disclaimer in the13* documentation and/or other materials provided with the distribution.14*15* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND16* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE17* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE18* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE19* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL20* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS21* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)22* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT23* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY24* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF25* SUCH DAMAGE.26*/2728#include <sys/cdefs.h>29/*30* Multi-field value support for ipfw tables.31*32* This file contains necessary functions to convert33* large multi-field values into u32 indices suitable to be fed34* to various table algorithms. Other machinery like proper refcounting,35* internal structures resizing are also kept here.36*/3738#include "opt_ipfw.h"3940#include <sys/param.h>41#include <sys/systm.h>42#include <sys/malloc.h>43#include <sys/kernel.h>44#include <sys/hash.h>45#include <sys/lock.h>46#include <sys/rwlock.h>47#include <sys/rmlock.h>48#include <sys/socket.h>49#include <sys/socketvar.h>50#include <sys/queue.h>51#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */5253#include <netinet/in.h>54#include <netinet/ip_var.h> /* struct ipfw_rule_ref */55#include <netinet/ip_fw.h>5657#include <netpfil/ipfw/ip_fw_private.h>58#include <netpfil/ipfw/ip_fw_table.h>5960static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key,61uint32_t kopt);62static int cmp_table_value(struct named_object *no, const void *key,63uint32_t kopt);6465static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,66struct sockopt_data *sd);6768static struct ipfw_sopt_handler scodes[] = {69{ IP_FW_TABLE_VLIST, IP_FW3_OPVER, HDIR_GET, list_table_values },70};7172#define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash)7374struct table_val_link75{76struct named_object no;77struct table_value *pval; /* Pointer to real table value */78};79#define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */80#define VALDATA_HASH_SIZE 655368182struct vdump_args {83struct ip_fw_chain *ch;84struct sockopt_data *sd;85struct table_value *pval;86int error;87};8889static uint32_t90hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt)91{9293return (hash32_buf(key, 56, 0));94}9596static int97cmp_table_value(struct named_object *no, const void *key, uint32_t kopt)98{99100return (memcmp(((struct table_val_link *)no)->pval, key, 56));101}102103static void104mask_table_value(struct table_value *src, struct table_value *dst,105uint32_t mask)106{107#define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; }108109memset(dst, 0, sizeof(*dst));110_MCPY(tag, IPFW_VTYPE_TAG);111_MCPY(pipe, IPFW_VTYPE_PIPE);112_MCPY(divert, IPFW_VTYPE_DIVERT);113_MCPY(skipto, IPFW_VTYPE_SKIPTO);114_MCPY(netgraph, IPFW_VTYPE_NETGRAPH);115_MCPY(fib, IPFW_VTYPE_FIB);116_MCPY(nat, IPFW_VTYPE_NAT);117_MCPY(limit, IPFW_VTYPE_LIMIT);118_MCPY(mark, IPFW_VTYPE_MARK);119_MCPY(dscp, IPFW_VTYPE_DSCP);120_MCPY(nh4, IPFW_VTYPE_NH4);121_MCPY(nh6, IPFW_VTYPE_NH6);122_MCPY(zoneid, IPFW_VTYPE_NH6);123#undef _MCPY124}125126static void127get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared,128struct table_value **ptv, struct namedobj_instance **pvi)129{130struct table_value *pval;131struct namedobj_instance *vi;132133if (vshared != 0) {134pval = (struct table_value *)ch->valuestate;135vi = CHAIN_TO_VI(ch);136} else {137pval = NULL;138vi = NULL;139//pval = (struct table_value *)&tc->ti.data;140}141142if (ptv != NULL)143*ptv = pval;144if (pvi != NULL)145*pvi = vi;146}147148/*149* Update pointers to real vaues after @pval change.150*/151static int152update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)153{154struct vdump_args *da;155struct table_val_link *ptv;156struct table_value *pval;157158da = (struct vdump_args *)arg;159ptv = (struct table_val_link *)no;160161pval = da->pval;162ptv->pval = &pval[ptv->no.kidx];163ptv->no.name = (char *)&pval[ptv->no.kidx];164return (0);165}166167/*168* Grows value storage shared among all tables.169* Drops/reacquires UH locks.170* Notifies other running adds on @ch shared storage resize.171* Note function does not guarantee that free space172* will be available after invocation, so one caller needs173* to roll cycle himself.174*175* Returns 0 if case of no errors.176*/177static int178resize_shared_value_storage(struct ip_fw_chain *ch)179{180struct tables_config *tcfg;181struct namedobj_instance *vi;182struct table_value *pval, *valuestate, *old_valuestate;183void *new_idx;184struct vdump_args da;185int new_blocks;186int val_size, val_size_old;187188IPFW_UH_WLOCK_ASSERT(ch);189190valuestate = NULL;191new_idx = NULL;192193pval = (struct table_value *)ch->valuestate;194vi = CHAIN_TO_VI(ch);195tcfg = CHAIN_TO_TCFG(ch);196197val_size = tcfg->val_size * 2;198199if (val_size == (1 << 30))200return (ENOSPC);201202IPFW_UH_WUNLOCK(ch);203204valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW,205M_WAITOK | M_ZERO);206ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx,207&new_blocks);208209IPFW_UH_WLOCK(ch);210211/*212* Check if we still need to resize213*/214if (tcfg->val_size >= val_size)215goto done;216217/* Update pointers and notify everyone we're changing @ch */218pval = (struct table_value *)ch->valuestate;219rollback_toperation_state(ch, ch);220221/* Good. Let's merge */222memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size);223ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks);224225IPFW_WLOCK(ch);226/* Change pointers */227old_valuestate = ch->valuestate;228ch->valuestate = valuestate;229valuestate = old_valuestate;230ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks);231232val_size_old = tcfg->val_size;233tcfg->val_size = val_size;234val_size = val_size_old;235IPFW_WUNLOCK(ch);236/* Update pointers to reflect resize */237memset(&da, 0, sizeof(da));238da.pval = (struct table_value *)ch->valuestate;239ipfw_objhash_foreach(vi, update_tvalue, &da);240241done:242free(valuestate, M_IPFW);243ipfw_objhash_bitmap_free(new_idx, new_blocks);244245return (0);246}247248/*249* Drops reference for table value with index @kidx, stored in @pval and250* @vi. Frees value if it has no references.251*/252static void253unref_table_value(struct namedobj_instance *vi, struct table_value *pval,254uint32_t kidx)255{256struct table_val_link *ptvl;257258KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx));259if (--pval[kidx].refcnt > 0)260return;261262/* Last reference, delete item */263ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx);264KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx));265ipfw_objhash_del(vi, &ptvl->no);266ipfw_objhash_free_idx(vi, kidx);267free(ptvl, M_IPFW);268}269270struct flush_args {271struct ip_fw_chain *ch;272struct table_algo *ta;273struct table_info *ti;274void *astate;275ipfw_obj_tentry tent;276};277278static int279unref_table_value_cb(void *e, void *arg)280{281struct flush_args *fa;282struct ip_fw_chain *ch;283struct table_algo *ta;284ipfw_obj_tentry *tent;285int error;286287fa = (struct flush_args *)arg;288289ta = fa->ta;290memset(&fa->tent, 0, sizeof(fa->tent));291tent = &fa->tent;292error = ta->dump_tentry(fa->astate, fa->ti, e, tent);293if (error != 0)294return (error);295296ch = fa->ch;297298unref_table_value(CHAIN_TO_VI(ch),299(struct table_value *)ch->valuestate, tent->v.kidx);300301return (0);302}303304/*305* Drop references for each value used in @tc.306*/307void308ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc,309struct table_algo *ta, void *astate, struct table_info *ti)310{311struct flush_args fa;312313IPFW_UH_WLOCK_ASSERT(ch);314315memset(&fa, 0, sizeof(fa));316fa.ch = ch;317fa.ta = ta;318fa.astate = astate;319fa.ti = ti;320321ta->foreach(astate, ti, unref_table_value_cb, &fa);322}323324/*325* Table operation state handler.326* Called when we are going to change something in @tc which327* may lead to inconsistencies in on-going table data addition.328*329* Here we rollback all already committed state (table values, currently)330* and set "modified" field to non-zero value to indicate331* that we need to restart original operation.332*/333void334rollback_table_values(struct tableop_state *ts)335{336struct ip_fw_chain *ch;337struct table_value *pval;338struct tentry_info *ptei;339struct namedobj_instance *vi;340int i;341342ch = ts->ch;343344IPFW_UH_WLOCK_ASSERT(ch);345346/* Get current table value pointer */347get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);348349for (i = 0; i < ts->count; i++) {350ptei = &ts->tei[i];351352if (ptei->value == 0)353continue;354355unref_table_value(vi, pval, ptei->value);356}357}358359/*360* Allocate new value index in either shared or per-table array.361* Function may drop/reacquire UH lock.362*363* Returns 0 on success.364*/365static int366alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts,367struct namedobj_instance *vi, uint32_t *pvidx, uint8_t flags)368{369int error, vlimit;370uint32_t vidx;371372IPFW_UH_WLOCK_ASSERT(ch);373374error = ipfw_objhash_alloc_idx(vi, &vidx);375if (error != 0) {376/*377* We need to resize array. This involves378* lock/unlock, so we need to check "modified"379* state.380*/381ts->opstate.func(ts->tc, &ts->opstate);382error = resize_shared_value_storage(ch);383return (error); /* ts->modified should be set, we will restart */384}385386vlimit = ts->ta->vlimit;387if (vlimit != 0 && vidx >= vlimit && !(flags & IPFW_CTF_ATOMIC)) {388/*389* Algorithm is not able to store given index.390* We have to rollback state, start using391* per-table value array or return error392* if we're already using it.393*/394if (ts->vshared != 0) {395/* shared -> per-table */396return (ENOSPC); /* TODO: proper error */397}398399/* per-table. Fail for now. */400return (ENOSPC); /* TODO: proper error */401}402403*pvidx = vidx;404return (0);405}406407/*408* Drops value reference for unused values (updates, deletes, partially409* successful adds or rollbacks).410*/411void412ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc,413struct tentry_info *tei, uint32_t count, int rollback)414{415int i;416struct tentry_info *ptei;417struct table_value *pval;418struct namedobj_instance *vi;419420/*421* We have two slightly different ADD cases here:422* either (1) we are successful / partially successful,423* in that case we need424* * to ignore ADDED entries values425* * rollback every other values if atomicity is not426* * required (either UPDATED since old value has been427* stored there, or some failure like EXISTS or LIMIT428* or simply "ignored" case.429*430* (2): atomic rollback of partially successful operation431* in that case we simply need to unref all entries.432*433* DELETE case is simpler: no atomic support there, so434* we simply unref all non-zero values.435*/436437/*438* Get current table value pointers.439* XXX: Properly read vshared440*/441get_value_ptrs(ch, tc, 1, &pval, &vi);442443for (i = 0; i < count; i++) {444ptei = &tei[i];445446if (ptei->value == 0) {447/*448* We may be deleting non-existing record.449* Skip.450*/451continue;452}453454if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) {455ptei->value = 0;456continue;457}458459unref_table_value(vi, pval, ptei->value);460ptei->value = 0;461}462}463464/*465* Main function used to link values of entries going to be added,466* to the index. Since we may perform many UH locks drops/acquires,467* handle changes by checking tablestate "modified" field.468*469* Success: return 0.470*/471int472ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts,473uint8_t flags)474{475int error, i, found;476struct namedobj_instance *vi;477struct table_config *tc;478struct tentry_info *tei, *ptei;479uint32_t count, vidx, vlimit;480struct table_val_link *ptv;481struct table_value tval, *pval;482483/*484* Stage 1: reference all existing values and485* save their indices.486*/487IPFW_UH_WLOCK_ASSERT(ch);488get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi);489490error = 0;491found = 0;492vlimit = ts->ta->vlimit;493vidx = 0;494tc = ts->tc;495tei = ts->tei;496count = ts->count;497for (i = 0; i < count; i++) {498ptei = &tei[i];499ptei->value = 0; /* Ensure value is always 0 in the beginning */500mask_table_value(ptei->pvalue, &tval, ts->vmask);501ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,502(char *)&tval);503if (ptv == NULL)504continue;505/* Deal with vlimit later */506if (vlimit > 0 && vlimit <= ptv->no.kidx)507continue;508509/* Value found. Bump refcount */510ptv->pval->refcnt++;511ptei->value = ptv->no.kidx;512found++;513}514515if (ts->count == found) {516/* We've found all values , no need ts create new ones */517return (0);518}519520/*521* we have added some state here, let's attach operation522* state ts the list ts be able ts rollback if necessary.523*/524add_toperation_state(ch, ts);525/* Ensure table won't disappear */526tc_ref(tc);527IPFW_UH_WUNLOCK(ch);528529/*530* Stage 2: allocate objects for non-existing values.531*/532for (i = 0; i < count; i++) {533ptei = &tei[i];534if (ptei->value != 0)535continue;536if (ptei->ptv != NULL)537continue;538ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW,539M_WAITOK | M_ZERO);540}541542/*543* Stage 3: allocate index numbers for new values544* and link them to index.545*/546IPFW_UH_WLOCK(ch);547tc_unref(tc);548del_toperation_state(ch, ts);549if (ts->modified != 0) {550/*551* In general, we should free all state/indexes here552* and return. However, we keep allocated state instead553* to ensure we achieve some progress on each restart.554*/555return (0);556}557558KASSERT(pval == ch->valuestate, ("resize_storage() notify failure"));559560/* Let's try to link values */561for (i = 0; i < count; i++) {562ptei = &tei[i];563564/* Check if record has appeared */565mask_table_value(ptei->pvalue, &tval, ts->vmask);566ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0,567(char *)&tval);568if (ptv != NULL) {569ptv->pval->refcnt++;570ptei->value = ptv->no.kidx;571continue;572}573574/* May perform UH unlock/lock */575error = alloc_table_vidx(ch, ts, vi, &vidx, flags);576if (error != 0) {577ts->opstate.func(ts->tc, &ts->opstate);578return (error);579}580/* value storage resize has happened, return */581if (ts->modified != 0)582return (0);583584/* Finally, we have allocated valid index, let's add entry */585ptei->value = vidx;586ptv = (struct table_val_link *)ptei->ptv;587ptei->ptv = NULL;588589ptv->no.kidx = vidx;590ptv->no.name = (char *)&pval[vidx];591ptv->pval = &pval[vidx];592memcpy(ptv->pval, &tval, sizeof(struct table_value));593pval[vidx].refcnt = 1;594ipfw_objhash_add(vi, &ptv->no);595}596597return (0);598}599600/*601* Imports table value from current userland format.602* Saves value in kernel format to the same place.603*/604void605ipfw_import_table_value_v1(ipfw_table_value *iv)606{607struct table_value v;608609memset(&v, 0, sizeof(v));610v.tag = iv->tag;611v.pipe = iv->pipe;612v.divert = iv->divert;613v.skipto = iv->skipto;614v.netgraph = iv->netgraph;615v.fib = iv->fib;616v.nat = iv->nat;617v.dscp = iv->dscp;618v.nh4 = iv->nh4;619v.nh6 = iv->nh6;620v.limit = iv->limit;621v.zoneid = iv->zoneid;622v.mark = iv->mark;623624memcpy(iv, &v, sizeof(ipfw_table_value));625}626627/*628* Export real table value @v to current userland format.629* Note that @v and @piv may point to the same memory.630*/631void632ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv)633{634ipfw_table_value iv;635636memset(&iv, 0, sizeof(iv));637iv.tag = v->tag;638iv.pipe = v->pipe;639iv.divert = v->divert;640iv.skipto = v->skipto;641iv.netgraph = v->netgraph;642iv.fib = v->fib;643iv.nat = v->nat;644iv.dscp = v->dscp;645iv.limit = v->limit;646iv.nh4 = v->nh4;647iv.nh6 = v->nh6;648iv.zoneid = v->zoneid;649iv.mark = v->mark;650651memcpy(piv, &iv, sizeof(iv));652}653654/*655* Exports real value data into ipfw_table_value structure including refcnt.656*/657static int658dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg)659{660struct vdump_args *da;661struct table_val_link *ptv;662ipfw_table_value *v;663664da = (struct vdump_args *)arg;665ptv = (struct table_val_link *)no;666667v = (ipfw_table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v));668/* Out of memory, returning */669if (v == NULL) {670da->error = ENOMEM;671return (ENOMEM);672}673674ipfw_export_table_value_v1(ptv->pval, v);675v->refcnt = ptv->pval->refcnt;676v->kidx = ptv->no.kidx;677return (0);678}679680/*681* Dumps all shared/table value data682* Data layout (v1)(current):683* Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size684* Reply: [ ipfw_obj_lheader ipfw_table_value x N ]685*686* Returns 0 on success687*/688static int689list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3,690struct sockopt_data *sd)691{692struct _ipfw_obj_lheader *olh;693struct namedobj_instance *vi;694struct vdump_args da;695uint32_t count, size;696697olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));698if (olh == NULL)699return (EINVAL);700if (sd->valsize < olh->size)701return (EINVAL);702703IPFW_UH_RLOCK(ch);704vi = CHAIN_TO_VI(ch);705706count = ipfw_objhash_count(vi);707size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader);708709/* Fill in header regadless of buffer size */710olh->count = count;711olh->objsize = sizeof(ipfw_table_value);712713if (size > olh->size) {714olh->size = size;715IPFW_UH_RUNLOCK(ch);716return (ENOMEM);717}718olh->size = size;719720/*721* Do the actual value dump722*/723memset(&da, 0, sizeof(da));724da.ch = ch;725da.sd = sd;726ipfw_objhash_foreach(vi, dump_tvalue, &da);727728IPFW_UH_RUNLOCK(ch);729730return (0);731}732733void734ipfw_table_value_init(struct ip_fw_chain *ch, int first)735{736struct tables_config *tcfg;737738ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value),739M_IPFW, M_WAITOK | M_ZERO);740741tcfg = ch->tblcfg;742743tcfg->val_size = VALDATA_START_SIZE;744tcfg->valhash = ipfw_objhash_create(tcfg->val_size, VALDATA_HASH_SIZE);745ipfw_objhash_set_funcs(tcfg->valhash, hash_table_value,746cmp_table_value);747748IPFW_ADD_SOPT_HANDLER(first, scodes);749}750751static int752destroy_value(struct namedobj_instance *ni, struct named_object *no,753void *arg)754{755756free(no, M_IPFW);757return (0);758}759760void761ipfw_table_value_destroy(struct ip_fw_chain *ch, int last)762{763764IPFW_DEL_SOPT_HANDLER(last, scodes);765766free(ch->valuestate, M_IPFW);767ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch);768ipfw_objhash_destroy(CHAIN_TO_VI(ch));769}770771772