/* Path: sys/netpfil/ipfw/nat64/nat64lsn.c */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2015-2020 Yandex LLC
 * Copyright (c) 2015 Alexander V. Chernikov <[email protected]>
 * Copyright (c) 2016-2020 Andrey V. Elsukov <[email protected]>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/ck.h>
#include <sys/epoch.h>
#include <sys/errno.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_pflog.h>
#include <net/pfil.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip_fw_nat64.h>

#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/pf/pf.h>

#include "nat64lsn.h"

MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN");

/* All lockless lookups run under the network epoch. */
#define	NAT64LSN_EPOCH_ENTER(et)  NET_EPOCH_ENTER(et)
#define	NAT64LSN_EPOCH_EXIT(et)   NET_EPOCH_EXIT(et)
#define	NAT64LSN_EPOCH_ASSERT()   NET_EPOCH_ASSERT()
#define	NAT64LSN_EPOCH_CALL(c, f) NET_EPOCH_CALL((f), (c))

static uma_zone_t nat64lsn_host_zone;
static uma_zone_t nat64lsn_pgchunk_zone;
static uma_zone_t nat64lsn_pg_zone;
static uma_zone_t nat64lsn_aliaslink_zone;
static uma_zone_t nat64lsn_state_zone;
static uma_zone_t nat64lsn_job_zone;

static void nat64lsn_periodic(void *data);
/* Period of the maintenance callout, in seconds. */
#define	PERIODIC_DELAY		4
#define	NAT64_LOOKUP(chain, cmd)	\
	(struct nat64lsn_instance *)SRV_OBJECT((chain), insntod(cmd, kidx)->kidx)
/*
 * Delayed job queue, used to create new hosts
 * and new portgroups
 */
enum nat64lsn_jtype {
	JTYPE_NEWHOST = 1,
	JTYPE_NEWPORTGROUP,
	JTYPE_DESTROY,
};

struct nat64lsn_job_item {
	STAILQ_ENTRY(nat64lsn_job_item) entries;
	enum nat64lsn_jtype jtype;

	union {
		struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */
			struct mbuf *m;
			struct nat64lsn_host *host;
			struct nat64lsn_state *state;
			uint32_t src6_hval;
			uint32_t state_hval;
			struct ipfw_flow_id f_id;
			in_addr_t faddr;
			uint16_t port;
			uint8_t proto;
			uint8_t done;
		};
		struct { /* used by JTYPE_DESTROY */
			struct nat64lsn_hosts_slist hosts;
			struct nat64lsn_pg_slist portgroups;
			struct nat64lsn_pgchunk *pgchunk;
			struct epoch_context epoch_ctx;
		};
	};
};

/* Protects the per-instance job queue (cfg->jhead/jlen). */
static struct mtx jmtx;
#define	JQUEUE_LOCK_INIT()	mtx_init(&jmtx, "qlock", NULL, MTX_DEF)
#define	JQUEUE_LOCK_DESTROY()	mtx_destroy(&jmtx)
#define	JQUEUE_LOCK()		mtx_lock(&jmtx)
#define	JQUEUE_UNLOCK()		mtx_unlock(&jmtx)

static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg,
    struct nat64lsn_job_item *ji);
static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg,
    struct nat64lsn_job_item *ji);
static struct nat64lsn_job_item *nat64lsn_create_job(
    struct nat64lsn_cfg *cfg, int jtype);
static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
    struct nat64lsn_job_item *ji);
static void nat64lsn_job_destroy(epoch_context_t ctx);
static void nat64lsn_destroy_host(struct nat64lsn_host *host);
static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg);

static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
    const struct ipfw_flow_id *f_id, struct mbuf **mp);
static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
    struct ipfw_flow_id *f_id, struct mbuf **mp);
static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg,
    struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags);

/* Bit positions inside nat64lsn_state::flags. */
#define	NAT64_BIT_TCP_FIN	0	/* FIN was seen */
#define	NAT64_BIT_TCP_SYN	1	/* First syn in->out */
#define	NAT64_BIT_TCP_ESTAB	2	/* Packet with Ack */
#define	NAT64_BIT_READY_IPV4	6	/* state is ready for translate4 */
#define	NAT64_BIT_STALE		7	/* state is going to be expired */

#define	NAT64_FLAG_FIN		(1 << NAT64_BIT_TCP_FIN)
#define	NAT64_FLAG_SYN		(1 << NAT64_BIT_TCP_SYN)
#define	NAT64_FLAG_ESTAB	(1 << NAT64_BIT_TCP_ESTAB)
#define	NAT64_FLAGS_TCP	(NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN)

#define	NAT64_FLAG_READY	(1 << NAT64_BIT_READY_IPV4)
#define	NAT64_FLAG_STALE	(1 << NAT64_BIT_STALE)

/*
 * Map TCP header flags into the internal state bits:
 * FIN and SYN are kept as is, RST is folded into FIN and
 * ACK is folded into ESTAB.
 */
static inline uint8_t
convert_tcp_flags(uint8_t flags)
{
	uint8_t result;

	result = flags & (TH_FIN|TH_SYN);
	result |= (flags & TH_RST) >> 2;	/* Treat RST as FIN */
	result |= (flags & TH_ACK) >> 2;	/* Treat ACK as estab */

	return (result);
}

/*
 * Fill a pflog header describing the translation @state and pass the
 * packet to the pflog tap.  State identification data is packed into
 * the rulenr/subrulenr fields for external post-processing.
 */
static void
nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
    struct nat64lsn_state *state)
{

	memset(plog, 0, sizeof(*plog));
	plog->length = PFLOG_REAL_HDRLEN;
	plog->af = family;
	plog->action = PF_NAT;
	plog->dir = PF_IN;
	plog->rulenr = htonl(state->ip_src);
	plog->subrulenr = htonl((uint32_t)(state->aport << 16) |
	    (state->proto << 8) | (state->ip_dst & 0xff));
	plog->ruleset[0] = '\0';
	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
	ipfw_pflog_tap(plog, m);
}

/* Jenkins-hash helpers for the hosts and states hash tables. */
#define	HVAL(p, n, s)	jenkins_hash32((const uint32_t *)(p), (n), (s))
#define	HOST_HVAL(c, a)	HVAL((a),\
    sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed)
#define	HOSTS(c, v)	((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)])

#define	ALIASLINK_HVAL(c, f)	HVAL(&(f)->dst_ip6,\
    sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed)
#define	ALIAS_BYHASH(c, v)	\
    ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)])
static struct nat64lsn_aliaslink*
nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused,
    struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused)
{

	/*
	 * We can implement some different algorithms how
	 * select an alias address.
	 * XXX: for now we use first available.
	 */
	return (CK_SLIST_FIRST(&host->aliases));
}

static struct nat64lsn_alias*
nat64lsn_get_alias(struct nat64lsn_cfg *cfg,
    const struct ipfw_flow_id *f_id __unused)
{
	static uint32_t idx = 0;

	/*
	 * We can choose alias by number of allocated PGs,
	 * not used yet by other hosts, or some static configured
	 * by user.
	 * XXX: for now we choose it using round robin.
	 */
	return (&ALIAS_BYHASH(cfg, idx++));
}

#define	STATE_HVAL(c, d)	HVAL((d), 2, (c)->hash_seed)
#define	STATE_HASH(h, v)	\
    ((h)->states_hash[(v) & ((h)->states_hashsize - 1)])
#define	STATES_CHUNK(p, v)	\
    ((p)->chunks_count == 1 ? (p)->states : \
	((p)->states_chunk[CHUNK_BY_FADDR(p, v)]))

#ifdef __LP64__
#define	FREEMASK_FFSLL(pg, faddr)		\
    ffsll(*FREEMASK_CHUNK((pg), (faddr)))
#define	FREEMASK_BTR(pg, faddr, bit)	\
    ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
#define	FREEMASK_BTS(pg, faddr, bit)	\
    ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
#define	FREEMASK_ISSET(pg, faddr, bit)	\
    ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit))
#define	FREEMASK_COPY(pg, n, out)	\
    (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n)))
#else
/*
 * On 32-bit platforms the 64-bit free-states mask is handled as a
 * pair of 32-bit words, so provide an ffsll() equivalent for it.
 */
static inline int
freemask_ffsll(uint32_t *freemask)
{
	int i;

	if ((i = ffsl(freemask[0])) != 0)
		return (i);
	if ((i = ffsl(freemask[1])) != 0)
		return (i + 32);
	return (0);
}
#define	FREEMASK_FFSLL(pg, faddr)		\
    freemask_ffsll(FREEMASK_CHUNK((pg), (faddr)))
#define	FREEMASK_BTR(pg, faddr, bit)	\
    ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
#define	FREEMASK_BTS(pg, faddr, bit)	\
    ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
#define	FREEMASK_ISSET(pg, faddr, bit)	\
    ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32)
#define	FREEMASK_COPY(pg, n, out)	\
    (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \
	((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32)
#endif /* !__LP64__ */

#define	NAT64LSN_TRY_PGCNT	36
/*
 * Find a portgroup that still has free states for @faddr, starting
 * from the last used PG index (*pgidx) and scanning at most
 * NAT64LSN_TRY_PGCNT entries.  On success the last used index is
 * opportunistically updated with CAS; returns NULL when no PG with
 * free states was found.
 */
static struct nat64lsn_pg*
nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask,
    struct nat64lsn_pgchunk **chunks, uint32_t *pgidx, in_addr_t faddr)
{
	struct nat64lsn_pg *pg;
	uint32_t idx, oldidx;
	int cnt;

	/* First try last used PG. */
	idx = oldidx = ck_pr_load_32(pgidx);
	MPASS(idx < 1024);
	cnt = 0;
	do {
		ck_pr_fence_load();
		if (idx > 1023 || !ISSET32(*chunkmask, idx / 32)) {
			/* If it is first try, reset idx to first PG */
			idx = 0;
			/* Stop if idx is out of range */
			if (cnt > 0)
				break;
		}
		if (ISSET32(pgmask[idx / 32], idx % 32)) {
			pg = ck_pr_load_ptr(
			    &chunks[idx / 32]->pgptr[idx % 32]);
			ck_pr_fence_load();
			/*
			 * Make sure that pg did not become DEAD.
			 */
			if ((pg->flags & NAT64LSN_DEADPG) == 0 &&
			    FREEMASK_BITCOUNT(pg, faddr) > 0) {
				if (cnt > 0)
					ck_pr_cas_32(pgidx, oldidx, idx);
				return (pg);
			}
		}
		idx++;
	} while (++cnt < NAT64LSN_TRY_PGCNT);
	if (oldidx != idx)
		ck_pr_cas_32(pgidx, oldidx, idx);
	return (NULL);
}

/*
 * Look up (or allocate from a PG's free mask) the translation state
 * for an IPv6->IPv4 flow of @host.  Returns NULL when no state exists
 * and no free slot could be grabbed (the caller then requests a new PG).
 */
static struct nat64lsn_state*
nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host,
    const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr,
    uint16_t port, uint8_t proto)
{
	struct nat64lsn_aliaslink *link;
	struct nat64lsn_state *state;
	struct nat64lsn_pg *pg;
	int i, offset;

	NAT64LSN_EPOCH_ASSERT();

	/* Check that we already have state for given arguments */
	CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) {
		if (state->proto == proto && state->ip_dst == faddr &&
		    state->sport == port && state->dport == f_id->dst_port)
			return (state);
	}

	link = nat64lsn_get_aliaslink(cfg, host, f_id);
	if (link == NULL)
		return (NULL);

	switch (proto) {
	case IPPROTO_TCP:
		pg = nat64lsn_get_pg(&link->alias->tcp_chunkmask,
		    link->alias->tcp_pgmask, link->alias->tcp,
		    &link->alias->tcp_pgidx, faddr);
		break;
	case IPPROTO_UDP:
		pg = nat64lsn_get_pg(&link->alias->udp_chunkmask,
		    link->alias->udp_pgmask, link->alias->udp,
		    &link->alias->udp_pgidx, faddr);
		break;
	case IPPROTO_ICMP:
		pg = nat64lsn_get_pg(&link->alias->icmp_chunkmask,
		    link->alias->icmp_pgmask, link->alias->icmp,
		    &link->alias->icmp_pgidx, faddr);
		break;
	default:
		panic("%s: wrong proto %d", __func__, proto);
	}
	if (pg == NULL || (pg->flags & NAT64LSN_DEADPG) != 0)
		return (NULL);

	/* Check that PG has some free states */
	state = NULL;
	i = FREEMASK_BITCOUNT(pg, faddr);
	while (i-- > 0) {
		offset = FREEMASK_FFSLL(pg, faddr);
		if (offset == 0) {
			/*
			 * We lost the race.
			 * No more free states in this PG.
			 */
			break;
		}

		/* Lets try to atomically grab the state */
		if (FREEMASK_BTR(pg, faddr, offset - 1)) {
			state = &STATES_CHUNK(pg, faddr)->state[offset - 1];
			/* Initialize */
			state->flags = proto != IPPROTO_TCP ? 0 :
			    convert_tcp_flags(f_id->_flags);
			state->proto = proto;
			state->aport = pg->base_port + offset - 1;
			state->dport = f_id->dst_port;
			state->sport = port;
			state->ip6_dst = f_id->dst_ip6;
			state->ip_dst = faddr;
			state->ip_src = link->alias->addr;
			state->hval = hval;
			state->host = host;
			SET_AGE(state->timestamp);

			/* Insert new state into host's hash table */
			HOST_LOCK(host);
			SET_AGE(host->timestamp);
			CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval),
			    state, entries);
			host->states_count++;
			HOST_UNLOCK(host);
			NAT64STAT_INC(&cfg->base.stats, screated);
			/* Mark the state as ready for translate4 */
			ck_pr_fence_store();
			ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4);
			break;
		}
	}
	return (state);
}

/*
 * Inspects icmp packets to see if the message contains different
 * packet header so we need to alter @addr and @port.
 */
static int
inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr,
    uint16_t *port)
{
	struct icmp *icmp;
	struct ip *ip;
	int off;
	uint8_t inner_proto;

	ip = mtod(*mp, struct ip *); /* Outer IP header */
	off = (ip->ip_hl << 2) + ICMP_MINLEN;
	if ((*mp)->m_len < off)
		*mp = m_pullup(*mp, off);
	if (*mp == NULL)
		return (ENOMEM);

	ip = mtod(*mp, struct ip *); /* Outer IP header */
	icmp = L3HDR(ip, struct icmp *);
	switch (icmp->icmp_type) {
	case ICMP_ECHO:
	case ICMP_ECHOREPLY:
		/* Use icmp ID as distinguisher */
		*port = ntohs(icmp->icmp_id);
		return (0);
	case ICMP_UNREACH:
	case ICMP_TIMXCEED:
		break;
	default:
		return (EOPNOTSUPP);
	}
	/*
	 * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits
	 * of ULP header.
	 */
	if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
		return (EINVAL);
	if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
		*mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN);
	if (*mp == NULL)
		return (ENOMEM);
	ip = mtodo(*mp, off); /* Inner IP header */
	inner_proto = ip->ip_p;
	off += ip->ip_hl << 2; /* Skip inner IP header */
	*addr = ntohl(ip->ip_src.s_addr);
	if ((*mp)->m_len < off + ICMP_MINLEN)
		*mp = m_pullup(*mp, off + ICMP_MINLEN);
	if (*mp == NULL)
		return (ENOMEM);
	switch (inner_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		/* Copy source port from the header */
		*port = ntohs(*((uint16_t *)mtodo(*mp, off)));
		*proto = inner_proto;
		return (0);
	case IPPROTO_ICMP:
		/*
		 * We will translate only ICMP errors for our ICMP
		 * echo requests.
		 */
		icmp = mtodo(*mp, off);
		if (icmp->icmp_type != ICMP_ECHO)
			return (EOPNOTSUPP);
		*port = ntohs(icmp->icmp_id);
		return (0);
	};
	return (EOPNOTSUPP);
}

/*
 * Look up an existing translation state for an inbound IPv4 packet
 * addressed to @alias:@port.  Only returns states that are READY;
 * never allocates.
 */
static struct nat64lsn_state*
nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias,
    in_addr_t faddr, uint16_t port, uint8_t proto)
{
	struct nat64lsn_state *state;
	struct nat64lsn_pg *pg;
	int chunk_idx, pg_idx, state_idx;

	NAT64LSN_EPOCH_ASSERT();

	if (port < NAT64_MIN_PORT)
		return (NULL);
	/*
	 * Alias keeps 32 pgchunks for each protocol.
	 * Each pgchunk has 32 pointers to portgroup.
	 * Each portgroup has 64 states for ports.
	 */
	port -= NAT64_MIN_PORT;
	chunk_idx = port / 2048;

	port -= chunk_idx * 2048;
	pg_idx = port / 64;
	state_idx = port % 64;

	/*
	 * First check in proto_chunkmask that we have allocated PG chunk.
	 * Then check in proto_pgmask that we have valid PG pointer.
	 */
	pg = NULL;
	switch (proto) {
	case IPPROTO_TCP:
		if (ISSET32(alias->tcp_chunkmask, chunk_idx) &&
		    ISSET32(alias->tcp_pgmask[chunk_idx], pg_idx)) {
			pg = alias->tcp[chunk_idx]->pgptr[pg_idx];
			break;
		}
		return (NULL);
	case IPPROTO_UDP:
		if (ISSET32(alias->udp_chunkmask, chunk_idx) &&
		    ISSET32(alias->udp_pgmask[chunk_idx], pg_idx)) {
			pg = alias->udp[chunk_idx]->pgptr[pg_idx];
			break;
		}
		return (NULL);
	case IPPROTO_ICMP:
		if (ISSET32(alias->icmp_chunkmask, chunk_idx) &&
		    ISSET32(alias->icmp_pgmask[chunk_idx], pg_idx)) {
			pg = alias->icmp[chunk_idx]->pgptr[pg_idx];
			break;
		}
		return (NULL);
	default:
		panic("%s: wrong proto %d", __func__, proto);
	}
	if (pg == NULL)
		return (NULL);

	/* A set bit in the free mask means the slot is not in use. */
	if (FREEMASK_ISSET(pg, faddr, state_idx))
		return (NULL);

	state = &STATES_CHUNK(pg, faddr)->state[state_idx];
	ck_pr_fence_load();
	if (ck_pr_load_32(&state->flags) & NAT64_FLAG_READY)
		return (state);
	return (NULL);
}

/*
 * Reassemble IPv4 fragments, make PULLUP if needed, get some ULP fields
 * that might be unknown until reassembling is completed.
 */
static struct mbuf*
nat64lsn_reassemble4(struct nat64lsn_cfg *cfg, struct mbuf *m,
    uint16_t *port)
{
	struct ip *ip;
	int len;

	m = ip_reass(m);
	if (m == NULL)
		return (NULL);
	/* IP header must be contiguous after ip_reass() */
	ip = mtod(m, struct ip *);
	len = ip->ip_hl << 2;
	switch (ip->ip_p) {
	case IPPROTO_ICMP:
		len += ICMP_MINLEN;
		break;
	case IPPROTO_TCP:
		len += sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		len += sizeof(struct udphdr);
		break;
	default:
		m_freem(m);
		NAT64STAT_INC(&cfg->base.stats, noproto);
		return (NULL);
	}
	if (m->m_len < len) {
		m = m_pullup(m, len);
		if (m == NULL) {
			NAT64STAT_INC(&cfg->base.stats, nomem);
			return (NULL);
		}
		ip = mtod(m, struct ip *);
	}
	switch (ip->ip_p) {
	case IPPROTO_TCP:
		*port = ntohs(L3HDR(ip, struct tcphdr *)->th_dport);
		break;
	case IPPROTO_UDP:
		*port = ntohs(L3HDR(ip, struct udphdr *)->uh_dport);
		break;
	}
	return (m);
}

/*
 * Translate an inbound IPv4 packet to IPv6 using an existing state.
 * Returns an ipfw verdict; on successful translation the mbuf is
 * consumed and *mp is set to NULL.
 */
static int
nat64lsn_translate4(struct nat64lsn_cfg *cfg,
    const struct ipfw_flow_id *f_id, struct mbuf **mp)
{
	struct pfloghdr loghdr, *logdata;
	struct in6_addr src6;
	struct nat64lsn_state *state;
	struct nat64lsn_alias *alias;
	uint32_t addr, flags;
	uint16_t port, ts;
	int ret;
	uint8_t proto;

	addr = f_id->dst_ip;
	port = f_id->dst_port;
	proto = f_id->proto;
	if (addr < cfg->prefix4 || addr > cfg->pmask4) {
		NAT64STAT_INC(&cfg->base.stats, nomatch4);
		return (cfg->nomatch_verdict);
	}

	/* Reassemble fragments if needed */
	ret = ntohs(mtod(*mp, struct ip *)->ip_off);
	if ((ret & (IP_MF | IP_OFFMASK)) != 0) {
		*mp = nat64lsn_reassemble4(cfg, *mp, &port);
		if (*mp == NULL)
			return (IP_FW_DENY);
	}

	/* Check if protocol is supported */
	switch (proto) {
	case IPPROTO_ICMP:
		ret = inspect_icmp_mbuf(mp, &proto, &addr, &port);
		if (ret != 0) {
			if (ret == ENOMEM) {
				NAT64STAT_INC(&cfg->base.stats, nomem);
				return (IP_FW_DENY);
			}
			NAT64STAT_INC(&cfg->base.stats, noproto);
			return (cfg->nomatch_verdict);
		}
		if (addr < cfg->prefix4 || addr > cfg->pmask4) {
			NAT64STAT_INC(&cfg->base.stats, nomatch4);
			return (cfg->nomatch_verdict);
		}
		/* FALLTHROUGH */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		break;
	default:
		NAT64STAT_INC(&cfg->base.stats, noproto);
		return (cfg->nomatch_verdict);
	}

	alias = &ALIAS_BYHASH(cfg, addr);
	MPASS(addr == alias->addr);

	/* Check that we have state for this port */
	state = nat64lsn_get_state4to6(cfg, alias, f_id->src_ip,
	    port, proto);
	if (state == NULL) {
		NAT64STAT_INC(&cfg->base.stats, nomatch4);
		return (cfg->nomatch_verdict);
	}

	/* TODO: Check flags to see if we need to do some static mapping */

	/* Update some state fields if need */
	SET_AGE(ts);
	if (f_id->proto == IPPROTO_TCP)
		flags = convert_tcp_flags(f_id->_flags);
	else
		flags = 0;
	if (state->timestamp != ts)
		state->timestamp = ts;
	if ((state->flags & flags) != flags)
		state->flags |= flags;

	port = htons(state->sport);
	src6 = state->ip6_dst;

	if (cfg->base.flags & NAT64_LOG) {
		logdata = &loghdr;
		nat64lsn_log(logdata, *mp, AF_INET, state);
	} else
		logdata = NULL;

	/*
	 * We already have src6 with embedded address, but it is possible,
	 * that src_ip is different than state->ip_dst, this is why we
	 * do embedding again.
	 */
	nat64_embed_ip4(&src6, cfg->base.plat_plen, htonl(f_id->src_ip));
	ret = nat64_do_handle_ip4(*mp, &src6, &state->host->addr, port,
	    &cfg->base, logdata);
	if (ret == NAT64SKIP)
		return (cfg->nomatch_verdict);
	if (ret == NAT64RETURN)
		*mp = NULL;
	return (IP_FW_DENY);
}

/*
 * Check if particular state is stale and should be deleted.
 * Return 1 if true, 0 otherwise.
 */
static int
nat64lsn_check_state(struct nat64lsn_cfg *cfg, struct nat64lsn_state *state)
{
	int age, ttl;

	/* State was marked as stale in previous pass. */
	if (ISSET32(state->flags, NAT64_BIT_STALE))
		return (1);

	/* State is not yet initialized, it is going to be READY */
	if (!ISSET32(state->flags, NAT64_BIT_READY_IPV4))
		return (0);

	age = GET_AGE(state->timestamp);
	switch (state->proto) {
	case IPPROTO_TCP:
		if (ISSET32(state->flags, NAT64_BIT_TCP_FIN))
			ttl = cfg->st_close_ttl;
		else if (ISSET32(state->flags, NAT64_BIT_TCP_ESTAB))
			ttl = cfg->st_estab_ttl;
		else if (ISSET32(state->flags, NAT64_BIT_TCP_SYN))
			ttl = cfg->st_syn_ttl;
		else
			ttl = cfg->st_syn_ttl;
		if (age > ttl)
			return (1);
		break;
	case IPPROTO_UDP:
		if (age > cfg->st_udp_ttl)
			return (1);
		break;
	case IPPROTO_ICMP:
		if (age > cfg->st_icmp_ttl)
			return (1);
		break;
	}
	return (0);
}

/* Adjust the per-protocol PG counter of @alias. */
#define	PGCOUNT_ADD(alias, proto, value)			\
switch (proto) {						\
case IPPROTO_TCP: (alias)->tcp_pgcount += (value); break;	\
case IPPROTO_UDP: (alias)->udp_pgcount += (value); break;	\
case IPPROTO_ICMP: (alias)->icmp_pgcount += (value); break;	\
}
#define	PGCOUNT_INC(alias, proto)	PGCOUNT_ADD(alias, proto, 1)
#define	PGCOUNT_DEC(alias, proto)	PGCOUNT_ADD(alias, proto, -1)

/*
 * First stage of state expiration: drop READY so translate4 stops
 * using the state, then mark STALE for deferred release in the next
 * nat64lsn_maintain_pg() pass.
 */
static inline void
nat64lsn_state_cleanup(struct nat64lsn_state *state)
{

	/*
	 * Reset READY flag and wait until it become
	 * safe for translate4.
	 */
	ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4);
	/*
	 * And set STALE flag for deferred deletion in the
	 * next pass of nat64lsn_maintain_pg().
	 */
	ck_pr_bts_32(&state->flags, NAT64_BIT_STALE);
	ck_pr_fence_store();
}

/*
 * Walk all allocated states of @pg and expire stale ones (two-pass
 * STALE protocol).  Returns 1 when the PG has been idle longer than
 * pg_delete_delay and may itself be expired, 0 otherwise.
 */
static int
nat64lsn_maintain_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg)
{
	struct nat64lsn_state *state;
	struct nat64lsn_host *host;
	uint64_t freemask;
	int c, i, update_age;

	update_age = 0;
	for (c = 0; c < pg->chunks_count; c++) {
		FREEMASK_COPY(pg, c, freemask);
		for (i = 0; i < 64; i++) {
			if (ISSET64(freemask, i))
				continue;
			state = &STATES_CHUNK(pg, c)->state[i];
			if (nat64lsn_check_state(cfg, state) == 0) {
				update_age = 1;
				continue;
			}
			/*
			 * Expire state:
			 * 1. Mark as STALE and unlink from host's hash.
			 * 2. Set bit in freemask.
			 */
			if (ISSET32(state->flags, NAT64_BIT_STALE)) {
				/*
				 * State was marked as STALE in previous
				 * pass. Now it is safe to release it.
				 */
				state->flags = 0;
				ck_pr_fence_store();
				FREEMASK_BTS(pg, c, i);
				NAT64STAT_INC(&cfg->base.stats, sdeleted);
				continue;
			}
			MPASS(state->flags & NAT64_FLAG_READY);

			host = state->host;
			HOST_LOCK(host);
			CK_SLIST_REMOVE(&STATE_HASH(host, state->hval),
			    state, nat64lsn_state, entries);
			/*
			 * Now translate6 will not use this state.
			 */
			host->states_count--;
			HOST_UNLOCK(host);
			nat64lsn_state_cleanup(state);
		}
	}

	/*
	 * We have some alive states, update timestamp.
	 */
	if (update_age)
		SET_AGE(pg->timestamp);

	if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay)
		return (0);

	return (1);
}

/*
 * Expire idle portgroups of every alias (two-pass DEADPG protocol):
 * a PG is first unpublished from pgmask and marked DEAD, then on the
 * next pass unlinked and queued on @portgroups for deferred destroy.
 */
static void
nat64lsn_expire_portgroups(struct nat64lsn_cfg *cfg,
    struct nat64lsn_pg_slist *portgroups)
{
	struct nat64lsn_alias *alias;
	struct nat64lsn_pg *pg, *tpg;
	uint32_t *pgmask, *pgidx;
	int i, idx;

	for (i = 0; i < 1 << (32 - cfg->plen4); i++) {
		alias = &cfg->aliases[i];
		CK_SLIST_FOREACH_SAFE(pg, &alias->portgroups, entries, tpg) {
			if (nat64lsn_maintain_pg(cfg, pg) == 0)
				continue;
			/* Always keep first PG */
			if (pg->base_port == NAT64_MIN_PORT)
				continue;
			/*
			 * PG expires in two passes:
			 * 1. Reset bit in pgmask, mark it as DEAD.
			 * 2. Unlink it and schedule for deferred destroying.
			 */
			idx = (pg->base_port - NAT64_MIN_PORT) / 64;
			switch (pg->proto) {
			case IPPROTO_TCP:
				pgmask = alias->tcp_pgmask;
				pgidx = &alias->tcp_pgidx;
				break;
			case IPPROTO_UDP:
				pgmask = alias->udp_pgmask;
				pgidx = &alias->udp_pgidx;
				break;
			case IPPROTO_ICMP:
				pgmask = alias->icmp_pgmask;
				pgidx = &alias->icmp_pgidx;
				break;
			}
			if (pg->flags & NAT64LSN_DEADPG) {
				/* Unlink PG from alias's chain */
				ALIAS_LOCK(alias);
				CK_SLIST_REMOVE(&alias->portgroups, pg,
				    nat64lsn_pg, entries);
				PGCOUNT_DEC(alias, pg->proto);
				ALIAS_UNLOCK(alias);
				/*
				 * Link it to job's chain for deferred
				 * destroying.
				 */
				NAT64STAT_INC(&cfg->base.stats, spgdeleted);
				CK_SLIST_INSERT_HEAD(portgroups, pg, entries);
				continue;
			}

			/* Reset the corresponding bit in pgmask array. */
			ck_pr_btr_32(&pgmask[idx / 32], idx % 32);
			pg->flags |= NAT64LSN_DEADPG;
			ck_pr_fence_store();
			/* If last used PG points to this PG, reset it. */
			ck_pr_cas_32(pgidx, idx, 0);
		}
	}
}

/*
 * Expire idle hosts (two-pass DEADHOST protocol): a host is first
 * marked DEAD, then on the next pass — if still idle and stateless —
 * unlinked from the hash and queued on @hosts for deferred destroy.
 */
static void
nat64lsn_expire_hosts(struct nat64lsn_cfg *cfg,
    struct nat64lsn_hosts_slist *hosts)
{
	struct nat64lsn_host *host, *tmp;
	int i;

	for (i = 0; i < cfg->hosts_hashsize; i++) {
		CK_SLIST_FOREACH_SAFE(host, &cfg->hosts_hash[i],
		    entries, tmp) {
			/* Was the host marked in a previous call? */
			if (host->flags & NAT64LSN_DEADHOST) {
				if (host->states_count > 0 ||
				    GET_AGE(host->timestamp) <
				    cfg->host_delete_delay) {
					host->flags &= ~NAT64LSN_DEADHOST;
					continue;
				}
				/*
				 * Unlink host from hash table and schedule
				 * it for deferred destroying.
				 */
				CFG_LOCK(cfg);
				CK_SLIST_REMOVE(&cfg->hosts_hash[i], host,
				    nat64lsn_host, entries);
				cfg->hosts_count--;
				CFG_UNLOCK(cfg);
				CK_SLIST_INSERT_HEAD(hosts, host, entries);
				continue;
			}
			if (host->states_count > 0 ||
			    GET_AGE(host->timestamp) < cfg->host_delete_delay)
				continue;
			/* Mark host as going to be expired in next pass */
			host->flags |= NAT64LSN_DEADHOST;
			ck_pr_fence_store();
		}
	}
}

/* Placeholder for pgchunk expiration; the sketch below is disabled. */
static struct nat64lsn_pgchunk*
nat64lsn_expire_pgchunk(struct nat64lsn_cfg *cfg)
{
#if 0
	struct nat64lsn_alias *alias;
	struct nat64lsn_pgchunk *chunk;
	uint32_t pgmask;
	int i, c;

	for (i = 0; i < 1 << (32 - cfg->plen4); i++) {
		alias = &cfg->aliases[i];
		if (GET_AGE(alias->timestamp) < cfg->pgchunk_delete_delay)
			continue;
		/* Always keep single chunk allocated */
		for (c = 1; c < 32; c++) {
			if ((alias->tcp_chunkmask & (1 << c)) == 0)
				break;
			chunk = ck_pr_load_ptr(&alias->tcp[c]);
			if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0)
				continue;
			ck_pr_btr_32(&alias->tcp_chunkmask, c);
			ck_pr_fence_load();
			if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0)
				continue;
		}
	}
#endif
	return (NULL);
}

#if 0
static void
nat64lsn_maintain_hosts(struct nat64lsn_cfg *cfg)
{
	struct nat64lsn_host *h;
	struct nat64lsn_states_slist *hash;
	int i, j, hsize;

	for (i = 0; i < cfg->hosts_hashsize; i++) {
		CK_SLIST_FOREACH(h, &cfg->hosts_hash[i], entries) {
			 if (h->states_count / 2 < h->states_hashsize ||
			     h->states_hashsize >= NAT64LSN_MAX_HSIZE)
				 continue;
			 hsize = h->states_hashsize * 2;
			 hash = malloc(sizeof(*hash)* hsize, M_NOWAIT);
			 if (hash == NULL)
				 continue;
			 for (j = 0; j < hsize; j++)
				CK_SLIST_INIT(&hash[i]);

			 ck_pr_bts_32(&h->flags, NAT64LSN_GROWHASH);
		}
	}
}
#endif

/*
 * This procedure is used to perform various maintance
 * on dynamic hash list. Currently it is called every 4 seconds.
 */
static void
nat64lsn_periodic(void *data)
{
	struct nat64lsn_job_item *ji;
	struct nat64lsn_cfg *cfg;

	cfg = (struct nat64lsn_cfg *) data;
	CURVNET_SET(cfg->vp);
	if (cfg->hosts_count > 0) {
		ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT);
		if (ji != NULL) {
			ji->jtype = JTYPE_DESTROY;
			CK_SLIST_INIT(&ji->hosts);
			CK_SLIST_INIT(&ji->portgroups);
			nat64lsn_expire_hosts(cfg, &ji->hosts);
			nat64lsn_expire_portgroups(cfg, &ji->portgroups);
			ji->pgchunk = nat64lsn_expire_pgchunk(cfg);
			NAT64LSN_EPOCH_CALL(&ji->epoch_ctx,
			    nat64lsn_job_destroy);
		} else
			NAT64STAT_INC(&cfg->base.stats, jnomem);
	}
	callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY);
	CURVNET_RESTORE();
}

/*
 * Allocation error encoding: 10 * type + stage, so HOST_ERROR(0) and
 * PG_ERROR(0) both mean success.
 */
#define	ALLOC_ERROR(stage, type)	((stage) ? 10 * (type) + (stage): 0)
#define	HOST_ERROR(stage)	ALLOC_ERROR(stage, 1)
#define	PG_ERROR(stage)		ALLOC_ERROR(stage, 2)

/*
 * Job handler: allocate and publish a new host object for ji->f_id's
 * source address (or reuse one created by a previous job), then obtain
 * a translation state for the flow, allocating a new PG if needed.
 * Returns HOST_ERROR(0) on success.
 */
static int
nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
{
	char a[INET6_ADDRSTRLEN];
	struct nat64lsn_aliaslink *link;
	struct nat64lsn_host *host;
	struct nat64lsn_state *state;
	uint32_t hval, data[2];
	int i;

	/* Check that host was not yet added. */
	NAT64LSN_EPOCH_ASSERT();
	CK_SLIST_FOREACH(host, &HOSTS(cfg, ji->src6_hval), entries) {
		if (IN6_ARE_ADDR_EQUAL(&ji->f_id.src_ip6, &host->addr)) {
			/* The host was allocated in previous call. */
			ji->host = host;
			goto get_state;
		}
	}

	host = ji->host = uma_zalloc(nat64lsn_host_zone, M_NOWAIT);
	if (ji->host == NULL)
		return (HOST_ERROR(1));

	host->states_hashsize = NAT64LSN_HSIZE;
	host->states_hash = malloc(sizeof(struct nat64lsn_states_slist) *
	    host->states_hashsize, M_NAT64LSN, M_NOWAIT);
	if (host->states_hash == NULL) {
		uma_zfree(nat64lsn_host_zone, host);
		return (HOST_ERROR(2));
	}

	link = uma_zalloc(nat64lsn_aliaslink_zone, M_NOWAIT);
	if (link == NULL) {
		free(host->states_hash, M_NAT64LSN);
		uma_zfree(nat64lsn_host_zone, host);
		return (HOST_ERROR(3));
	}

	/* Initialize */
	HOST_LOCK_INIT(host);
	SET_AGE(host->timestamp);
	host->addr = ji->f_id.src_ip6;
	host->hval = ji->src6_hval;
	host->flags = 0;
	host->states_count = 0;
	CK_SLIST_INIT(&host->aliases);
	for (i = 0; i < host->states_hashsize; i++)
		CK_SLIST_INIT(&host->states_hash[i]);

	link->alias = nat64lsn_get_alias(cfg, &ji->f_id);
	CK_SLIST_INSERT_HEAD(&host->aliases, link, host_entries);

	ALIAS_LOCK(link->alias);
	CK_SLIST_INSERT_HEAD(&link->alias->hosts, link, alias_entries);
	link->alias->hosts_count++;
	ALIAS_UNLOCK(link->alias);

	CFG_LOCK(cfg);
	CK_SLIST_INSERT_HEAD(&HOSTS(cfg, ji->src6_hval), host, entries);
	cfg->hosts_count++;
	CFG_UNLOCK(cfg);

get_state:
	data[0] = ji->faddr;
	data[1] = (ji->f_id.dst_port << 16) | ji->port;
	ji->state_hval = hval = STATE_HVAL(cfg, data);
	state = nat64lsn_get_state6to4(cfg, host, &ji->f_id, hval,
	    ji->faddr, ji->port, ji->proto);
	/*
	 * We failed to obtain new state, used alias needs new PG.
	 * XXX: or another alias should be used.
	 */
	if (state == NULL) {
		/* Try to allocate new PG */
		if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0))
			return (HOST_ERROR(4));
		/* We assume that nat64lsn_alloc_pg() got state */
	} else
		ji->state = state;

	ji->done = 1;
	DPRINTF(DP_OBJ, "ALLOC HOST %s %p",
	    inet_ntop(AF_INET6, &host->addr, a, sizeof(a)), host);
	return (HOST_ERROR(0));
}

/*
 * Return the index of the first clear bit in the 32-word @data bitmap,
 * i.e. the first free PG slot, or -1 when all 1024 slots are taken.
 */
static int
nat64lsn_find_pg_place(uint32_t *data)
{
	int i;

	for (i = 0; i < 32; i++) {
		if (~data[i] == 0)
			continue;
		return (i * 32 + ffs(~data[i]) - 1);
	}
	return (-1);
}

/*
 * Allocate a new portgroup (and its pgchunk/state chunks if needed)
 * for one protocol of @alias and publish it for lockless lookups.
 * Returns PG_ERROR(0) on success, PG_ERROR(stage) on failure.
 */
static int
nat64lsn_alloc_proto_pg(struct nat64lsn_cfg *cfg,
    struct nat64lsn_alias *alias, uint32_t *chunkmask, uint32_t *pgmask,
    struct nat64lsn_pgchunk **chunks, uint32_t *pgidx, uint8_t proto)
{
	struct nat64lsn_pg *pg;
	int i, pg_idx, chunk_idx;

	/* Find place in pgchunk where PG can be added */
	pg_idx = nat64lsn_find_pg_place(pgmask);
	if (pg_idx < 0)	/* no more PGs */
		return (PG_ERROR(1));
	/* Check that we have allocated pgchunk for given PG index */
	chunk_idx = pg_idx / 32;
	if (!ISSET32(*chunkmask, chunk_idx)) {
		chunks[chunk_idx] = uma_zalloc(nat64lsn_pgchunk_zone,
		    M_NOWAIT);
		if (chunks[chunk_idx] == NULL)
			return (PG_ERROR(2));
		ck_pr_bts_32(chunkmask, chunk_idx);
		ck_pr_fence_store();
	}
	/* Allocate PG and states chunks */
	pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT);
	if (pg == NULL)
		return (PG_ERROR(3));
	pg->chunks_count = cfg->states_chunks;
	if (pg->chunks_count > 1) {
		pg->freemask_chunk = malloc(pg->chunks_count *
		    sizeof(uint64_t), M_NAT64LSN, M_NOWAIT);
		if (pg->freemask_chunk == NULL) {
			uma_zfree(nat64lsn_pg_zone, pg);
			return (PG_ERROR(4));
		}
		pg->states_chunk = malloc(pg->chunks_count *
		    sizeof(struct nat64lsn_states_chunk *), M_NAT64LSN,
		    M_NOWAIT | M_ZERO);
		if (pg->states_chunk == NULL) {
			free(pg->freemask_chunk, M_NAT64LSN);
			uma_zfree(nat64lsn_pg_zone, pg);
			return (PG_ERROR(5));
		}
		for (i = 0; i < pg->chunks_count; i++) {
			pg->states_chunk[i] = uma_zalloc(
			    nat64lsn_state_zone, M_NOWAIT);
			if (pg->states_chunk[i] == NULL)
				goto states_failed;
		}
		memset(pg->freemask_chunk, 0xff,
		    sizeof(uint64_t) * pg->chunks_count);
	} else {
		pg->states = uma_zalloc(nat64lsn_state_zone, M_NOWAIT);
		if (pg->states == NULL) {
			uma_zfree(nat64lsn_pg_zone, pg);
			return (PG_ERROR(6));
		}
		memset(&pg->freemask64, 0xff, sizeof(uint64_t));
	}

	/* Initialize PG and hook it to pgchunk */
	SET_AGE(pg->timestamp);
	pg->flags = 0;
	pg->proto = proto;
	pg->base_port = NAT64_MIN_PORT + 64 * pg_idx;
	ck_pr_store_ptr(&chunks[chunk_idx]->pgptr[pg_idx % 32], pg);
	ck_pr_fence_store();

	/* Set bit in pgmask and set index of last used PG */
	ck_pr_bts_32(&pgmask[chunk_idx], pg_idx % 32);
	ck_pr_store_32(pgidx, pg_idx);

	ALIAS_LOCK(alias);
	CK_SLIST_INSERT_HEAD(&alias->portgroups, pg, entries);
	SET_AGE(alias->timestamp);
	PGCOUNT_INC(alias, proto);
	ALIAS_UNLOCK(alias);
	NAT64STAT_INC(&cfg->base.stats, spgcreated);
	return (PG_ERROR(0));

states_failed:
	for (i = 0; i < pg->chunks_count; i++)
		uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]);
	free(pg->freemask_chunk, M_NAT64LSN);
	free(pg->states_chunk, M_NAT64LSN);
	uma_zfree(nat64lsn_pg_zone, pg);
	return (PG_ERROR(7));
}

/*
 * Job handler: allocate a new PG for ji's protocol on the host's alias
 * and then grab a translation state from it.  Returns PG_ERROR(0) on
 * success with ji->state set and ji->done = 1.
 */
static int
nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)
{
	struct nat64lsn_aliaslink *link;
	struct nat64lsn_alias *alias;
	int ret;

	link = nat64lsn_get_aliaslink(cfg, ji->host, &ji->f_id);
	if (link == NULL)
		return (PG_ERROR(1));

	/*
	 * TODO: check that we did not already allocated PG in
	 *	 previous call.
	 */

	ret = 0;
	alias = link->alias;
	/* Find place in pgchunk where PG can be added */
	switch (ji->proto) {
	case IPPROTO_TCP:
		ret = nat64lsn_alloc_proto_pg(cfg, alias,
		    &alias->tcp_chunkmask, alias->tcp_pgmask,
		    alias->tcp, &alias->tcp_pgidx, ji->proto);
		break;
	case IPPROTO_UDP:
		ret = nat64lsn_alloc_proto_pg(cfg, alias,
		    &alias->udp_chunkmask, alias->udp_pgmask,
		    alias->udp, &alias->udp_pgidx, ji->proto);
		break;
	case IPPROTO_ICMP:
		ret = nat64lsn_alloc_proto_pg(cfg, alias,
		    &alias->icmp_chunkmask, alias->icmp_pgmask,
		    alias->icmp, &alias->icmp_pgidx, ji->proto);
		break;
	default:
		panic("%s: wrong proto %d", __func__, ji->proto);
	}
	if (ret == PG_ERROR(1)) {
		/*
		 * PG_ERROR(1) means that alias lacks free PGs
		 * XXX: try next alias.
		 */
		printf("NAT64LSN: %s: failed to obtain PG\n",
		    __func__);
		return (ret);
	}
	if (ret == PG_ERROR(0)) {
		ji->state = nat64lsn_get_state6to4(cfg, ji->host, &ji->f_id,
		    ji->state_hval, ji->faddr, ji->port, ji->proto);
		if (ji->state == NULL)
			ret = PG_ERROR(8);
		else
			ji->done = 1;
	}
	return (ret);
}

/*
 * Taskqueue handler: drain the delayed job queue and execute the
 * NEWHOST/NEWPORTGROUP allocation jobs under the network epoch.
 */
static void
nat64lsn_do_request(void *data)
{
	struct epoch_tracker et;
	struct nat64lsn_job_head jhead;
	struct nat64lsn_job_item *ji, *ji2;
	struct nat64lsn_cfg *cfg;
	int jcount;
	uint8_t flags;

	cfg = (struct nat64lsn_cfg *)data;
	if (cfg->jlen == 0)
		return;

	CURVNET_SET(cfg->vp);
	STAILQ_INIT(&jhead);

	/* Grab queue */
	JQUEUE_LOCK();
	STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item);
	jcount = cfg->jlen;
	cfg->jlen = 0;
	JQUEUE_UNLOCK();

	/* TODO: check if we need to resize hash */

	NAT64STAT_INC(&cfg->base.stats, jcalls);
	DPRINTF(DP_JQUEUE, "count=%d", jcount);

	/*
	 * TODO:
	 * What we should do here is to build a hash
	 * to ensure we don't have lots of duplicate requests.
	 * Skip this for now.
	 *
	 * TODO: Limit per-call number of items
	 */

	NAT64LSN_EPOCH_ENTER(et);
	STAILQ_FOREACH(ji, &jhead, entries) {
		switch (ji->jtype) {
		case JTYPE_NEWHOST:
			if (nat64lsn_alloc_host(cfg, ji) != HOST_ERROR(0))
				NAT64STAT_INC(&cfg->base.stats, jhostfails);
			break;
		case JTYPE_NEWPORTGROUP:
			if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0))
				NAT64STAT_INC(&cfg->base.stats, jportfails);
			break;
		default:
			continue;
		}
		if (ji->done != 0) {
			flags = ji->proto != IPPROTO_TCP ?
0 :1324convert_tcp_flags(ji->f_id._flags);1325nat64lsn_translate6_internal(cfg, &ji->m,1326ji->state, flags);1327NAT64STAT_INC(&cfg->base.stats, jreinjected);1328}1329}1330NAT64LSN_EPOCH_EXIT(et);13311332ji = STAILQ_FIRST(&jhead);1333while (ji != NULL) {1334ji2 = STAILQ_NEXT(ji, entries);1335/*1336* In any case we must free mbuf if1337* translator did not consumed it.1338*/1339m_freem(ji->m);1340uma_zfree(nat64lsn_job_zone, ji);1341ji = ji2;1342}1343CURVNET_RESTORE();1344}13451346static struct nat64lsn_job_item *1347nat64lsn_create_job(struct nat64lsn_cfg *cfg, int jtype)1348{1349struct nat64lsn_job_item *ji;13501351/*1352* Do not try to lock possibly contested mutex if we're near the1353* limit. Drop packet instead.1354*/1355ji = NULL;1356if (cfg->jlen >= cfg->jmaxlen)1357NAT64STAT_INC(&cfg->base.stats, jmaxlen);1358else {1359ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT);1360if (ji == NULL)1361NAT64STAT_INC(&cfg->base.stats, jnomem);1362}1363if (ji == NULL) {1364NAT64STAT_INC(&cfg->base.stats, dropped);1365DPRINTF(DP_DROPS, "failed to create job");1366} else {1367ji->jtype = jtype;1368ji->done = 0;1369}1370return (ji);1371}13721373static void1374nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji)1375{13761377JQUEUE_LOCK();1378STAILQ_INSERT_TAIL(&cfg->jhead, ji, entries);1379NAT64STAT_INC(&cfg->base.stats, jrequests);1380cfg->jlen++;13811382if (callout_pending(&cfg->jcallout) == 0)1383callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg);1384JQUEUE_UNLOCK();1385}13861387/*1388* This function is used to clean up the result of less likely possible1389* race condition, when host object was deleted, but some translation1390* state was created before it is destroyed.1391*1392* Since the state expiration removes state from host's hash table,1393* we need to be sure, that there will not any states, that are linked1394* with this host entry.1395*/1396static void1397nat64lsn_host_cleanup(struct nat64lsn_host *host)1398{1399struct nat64lsn_state 
*state, *ts;1400int i;14011402printf("NAT64LSN: %s: race condition has been detected for host %p\n",1403__func__, host);1404for (i = 0; i < host->states_hashsize; i++) {1405CK_SLIST_FOREACH_SAFE(state, &host->states_hash[i],1406entries, ts) {1407/*1408* We can remove the state without lock,1409* because this host entry is unlinked and will1410* be destroyed.1411*/1412CK_SLIST_REMOVE(&host->states_hash[i], state,1413nat64lsn_state, entries);1414host->states_count--;1415nat64lsn_state_cleanup(state);1416}1417}1418MPASS(host->states_count == 0);1419}14201421/*1422* This function is used to clean up the result of less likely possible1423* race condition, when portgroup was deleted, but some translation state1424* was created before it is destroyed.1425*1426* Since states entries are accessible via host's hash table, we need1427* to be sure, that there will not any states from this PG, that are1428* linked with any host entries.1429*/1430static void1431nat64lsn_pg_cleanup(struct nat64lsn_pg *pg)1432{1433struct nat64lsn_state *state;1434uint64_t usedmask;1435int c, i;14361437printf("NAT64LSN: %s: race condition has been detected for pg %p\n",1438__func__, pg);1439for (c = 0; c < pg->chunks_count; c++) {1440/*1441* Use inverted freemask to find what state was created.1442*/1443usedmask = ~(*FREEMASK_CHUNK(pg, c));1444if (usedmask == 0)1445continue;1446for (i = 0; i < 64; i++) {1447if (!ISSET64(usedmask, i))1448continue;1449state = &STATES_CHUNK(pg, c)->state[i];1450/*1451* If we have STALE bit, this means that state1452* is already unlinked from host's hash table.1453* Thus we can just reset the bit in mask and1454* schedule destroying in the next epoch call.1455*/1456if (ISSET32(state->flags, NAT64_BIT_STALE)) {1457FREEMASK_BTS(pg, c, i);1458continue;1459}1460/*1461* There is small window, when we have bit1462* grabbed from freemask, but state is not yet1463* linked into host's hash table.1464* Check for READY flag, it is set just after1465* linking. 
If it is not set, defer cleanup1466* for next call.1467*/1468if (ISSET32(state->flags, NAT64_BIT_READY_IPV4)) {1469struct nat64lsn_host *host;14701471host = state->host;1472HOST_LOCK(host);1473CK_SLIST_REMOVE(&STATE_HASH(host,1474state->hval), state, nat64lsn_state,1475entries);1476host->states_count--;1477HOST_UNLOCK(host);1478nat64lsn_state_cleanup(state);1479}1480}1481}1482}14831484static void1485nat64lsn_job_destroy(epoch_context_t ctx)1486{1487struct nat64lsn_hosts_slist hosts;1488struct nat64lsn_pg_slist portgroups;1489struct nat64lsn_job_item *ji;1490struct nat64lsn_host *host;1491struct nat64lsn_pg *pg;1492int i;14931494CK_SLIST_INIT(&hosts);1495CK_SLIST_INIT(&portgroups);1496ji = __containerof(ctx, struct nat64lsn_job_item, epoch_ctx);1497MPASS(ji->jtype == JTYPE_DESTROY);1498while (!CK_SLIST_EMPTY(&ji->hosts)) {1499host = CK_SLIST_FIRST(&ji->hosts);1500CK_SLIST_REMOVE_HEAD(&ji->hosts, entries);1501if (host->states_count > 0) {1502/*1503* The state has been created during host deletion.1504*/1505printf("NAT64LSN: %s: destroying host with %d "1506"states\n", __func__, host->states_count);1507/*1508* We need to cleanup these states to avoid1509* possible access to already deleted host in1510* the state expiration code.1511*/1512nat64lsn_host_cleanup(host);1513CK_SLIST_INSERT_HEAD(&hosts, host, entries);1514/*1515* Keep host entry for next deferred destroying.1516* In the next epoch its states will be not1517* accessible.1518*/1519continue;1520}1521nat64lsn_destroy_host(host);1522}1523while (!CK_SLIST_EMPTY(&ji->portgroups)) {1524pg = CK_SLIST_FIRST(&ji->portgroups);1525CK_SLIST_REMOVE_HEAD(&ji->portgroups, entries);1526for (i = 0; i < pg->chunks_count; i++) {1527if (FREEMASK_BITCOUNT(pg, i) != 64) {1528/*1529* A state has been created during1530* PG deletion.1531*/1532printf("NAT64LSN: %s: destroying PG %p "1533"with non-empty chunk %d\n", __func__,1534pg, i);1535nat64lsn_pg_cleanup(pg);1536CK_SLIST_INSERT_HEAD(&portgroups,1537pg, entries);1538i = 
-1;1539break;1540}1541}1542if (i != -1)1543nat64lsn_destroy_pg(pg);1544}1545if (CK_SLIST_EMPTY(&hosts) &&1546CK_SLIST_EMPTY(&portgroups)) {1547uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk);1548uma_zfree(nat64lsn_job_zone, ji);1549return;1550}15511552/* Schedule job item again */1553CK_SLIST_MOVE(&ji->hosts, &hosts, entries);1554CK_SLIST_MOVE(&ji->portgroups, &portgroups, entries);1555NAT64LSN_EPOCH_CALL(&ji->epoch_ctx, nat64lsn_job_destroy);1556}15571558static int1559nat64lsn_request_host(struct nat64lsn_cfg *cfg,1560const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval,1561in_addr_t faddr, uint16_t port, uint8_t proto)1562{1563struct nat64lsn_job_item *ji;15641565ji = nat64lsn_create_job(cfg, JTYPE_NEWHOST);1566if (ji != NULL) {1567ji->m = *mp;1568ji->f_id = *f_id;1569ji->faddr = faddr;1570ji->port = port;1571ji->proto = proto;1572ji->src6_hval = hval;15731574nat64lsn_enqueue_job(cfg, ji);1575NAT64STAT_INC(&cfg->base.stats, jhostsreq);1576*mp = NULL;1577}1578return (IP_FW_DENY);1579}15801581static int1582nat64lsn_request_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host,1583const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval,1584in_addr_t faddr, uint16_t port, uint8_t proto)1585{1586struct nat64lsn_job_item *ji;15871588ji = nat64lsn_create_job(cfg, JTYPE_NEWPORTGROUP);1589if (ji != NULL) {1590ji->m = *mp;1591ji->f_id = *f_id;1592ji->faddr = faddr;1593ji->port = port;1594ji->proto = proto;1595ji->state_hval = hval;1596ji->host = host;15971598nat64lsn_enqueue_job(cfg, ji);1599NAT64STAT_INC(&cfg->base.stats, jportreq);1600*mp = NULL;1601}1602return (IP_FW_DENY);1603}16041605static int1606nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp,1607struct nat64lsn_state *state, uint8_t flags)1608{1609struct pfloghdr loghdr, *logdata;1610int ret;1611uint16_t ts;16121613/* Update timestamp and flags if needed */1614SET_AGE(ts);1615if (state->timestamp != ts)1616state->timestamp = ts;1617if ((state->flags & flags) != 
0)1618state->flags |= flags;16191620if (cfg->base.flags & NAT64_LOG) {1621logdata = &loghdr;1622nat64lsn_log(logdata, *mp, AF_INET6, state);1623} else1624logdata = NULL;16251626ret = nat64_do_handle_ip6(*mp, htonl(state->ip_src),1627htons(state->aport), &cfg->base, logdata);1628if (ret == NAT64SKIP)1629return (cfg->nomatch_verdict);1630if (ret == NAT64RETURN)1631*mp = NULL;1632return (IP_FW_DENY);1633}16341635static int1636nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id,1637struct mbuf **mp)1638{1639struct nat64lsn_state *state;1640struct nat64lsn_host *host;1641struct icmp6_hdr *icmp6;1642uint32_t addr, hval, data[2];1643int offset, proto;1644uint16_t port;1645uint8_t flags;16461647/* Check if protocol is supported */1648port = f_id->src_port;1649proto = f_id->proto;1650switch (f_id->proto) {1651case IPPROTO_ICMPV6:1652/*1653* For ICMPv6 echo reply/request we use icmp6_id as1654* local port.1655*/1656offset = 0;1657proto = nat64_getlasthdr(*mp, &offset);1658if (proto < 0) {1659NAT64STAT_INC(&cfg->base.stats, dropped);1660DPRINTF(DP_DROPS, "mbuf isn't contigious");1661return (IP_FW_DENY);1662}1663if (proto == IPPROTO_ICMPV6) {1664icmp6 = mtodo(*mp, offset);1665if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST ||1666icmp6->icmp6_type == ICMP6_ECHO_REPLY)1667port = ntohs(icmp6->icmp6_id);1668}1669proto = IPPROTO_ICMP;1670/* FALLTHROUGH */1671case IPPROTO_TCP:1672case IPPROTO_UDP:1673break;1674default:1675NAT64STAT_INC(&cfg->base.stats, noproto);1676return (cfg->nomatch_verdict);1677}16781679/* Extract IPv4 from destination IPv6 address */1680addr = nat64_extract_ip4(&f_id->dst_ip6, cfg->base.plat_plen);1681if (addr == 0 || nat64_check_private_ip4(&cfg->base, addr) != 0) {1682char a[INET_ADDRSTRLEN];16831684NAT64STAT_INC(&cfg->base.stats, dropped);1685DPRINTF(DP_DROPS, "dropped due to embedded IPv4 address %s",1686inet_ntop(AF_INET, &addr, a, sizeof(a)));1687return (IP_FW_DENY); /* XXX: add extra stats? 
*/1688}16891690/* Try to find host */1691hval = HOST_HVAL(cfg, &f_id->src_ip6);1692CK_SLIST_FOREACH(host, &HOSTS(cfg, hval), entries) {1693if (IN6_ARE_ADDR_EQUAL(&f_id->src_ip6, &host->addr))1694break;1695}1696/* We use IPv4 address in host byte order */1697addr = ntohl(addr);1698if (host == NULL)1699return (nat64lsn_request_host(cfg, f_id, mp,1700hval, addr, port, proto));17011702flags = proto != IPPROTO_TCP ? 0 : convert_tcp_flags(f_id->_flags);17031704data[0] = addr;1705data[1] = (f_id->dst_port << 16) | port;1706hval = STATE_HVAL(cfg, data);1707state = nat64lsn_get_state6to4(cfg, host, f_id, hval, addr,1708port, proto);1709if (state == NULL)1710return (nat64lsn_request_pg(cfg, host, f_id, mp, hval, addr,1711port, proto));1712return (nat64lsn_translate6_internal(cfg, mp, state, flags));1713}17141715/*1716* Main dataplane entry point.1717*/1718int1719ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args,1720ipfw_insn *cmd, int *done)1721{1722struct nat64lsn_instance *i;1723ipfw_insn *icmd;1724int ret;17251726IPFW_RLOCK_ASSERT(ch);17271728*done = 0; /* continue the search in case of failure */1729icmd = cmd + F_LEN(cmd);1730if (cmd->opcode != O_EXTERNAL_ACTION ||1731insntod(cmd, kidx)->kidx != V_nat64lsn_eid ||1732icmd->opcode != O_EXTERNAL_INSTANCE ||1733(i = NAT64_LOOKUP(ch, icmd)) == NULL)1734return (IP_FW_DENY);17351736*done = 1; /* terminate the search */17371738switch (args->f_id.addr_type) {1739case 4:1740ret = nat64lsn_translate4(i->cfg, &args->f_id, &args->m);1741break;1742case 6:1743/*1744* Check that destination IPv6 address matches our prefix6.1745*/1746if ((i->cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 &&1747memcmp(&args->f_id.dst_ip6, &i->cfg->base.plat_prefix,1748i->cfg->base.plat_plen / 8) != 0) {1749ret = i->cfg->nomatch_verdict;1750break;1751}1752ret = nat64lsn_translate6(i->cfg, &args->f_id, &args->m);1753break;1754default:1755ret = i->cfg->nomatch_verdict;1756}17571758if (ret != IP_FW_PASS && args->m != NULL) 
{1759m_freem(args->m);1760args->m = NULL;1761}1762return (ret);1763}17641765static int1766nat64lsn_state_ctor(void *mem, int size, void *arg, int flags)1767{1768struct nat64lsn_states_chunk *chunk;1769int i;17701771chunk = (struct nat64lsn_states_chunk *)mem;1772for (i = 0; i < 64; i++)1773chunk->state[i].flags = 0;1774return (0);1775}17761777void1778nat64lsn_init_internal(void)1779{17801781nat64lsn_host_zone = uma_zcreate("NAT64LSN hosts",1782sizeof(struct nat64lsn_host), NULL, NULL, NULL, NULL,1783UMA_ALIGN_PTR, 0);1784nat64lsn_pgchunk_zone = uma_zcreate("NAT64LSN portgroup chunks",1785sizeof(struct nat64lsn_pgchunk), NULL, NULL, NULL, NULL,1786UMA_ALIGN_PTR, 0);1787nat64lsn_pg_zone = uma_zcreate("NAT64LSN portgroups",1788sizeof(struct nat64lsn_pg), NULL, NULL, NULL, NULL,1789UMA_ALIGN_PTR, 0);1790nat64lsn_aliaslink_zone = uma_zcreate("NAT64LSN links",1791sizeof(struct nat64lsn_aliaslink), NULL, NULL, NULL, NULL,1792UMA_ALIGN_PTR, 0);1793nat64lsn_state_zone = uma_zcreate("NAT64LSN states",1794sizeof(struct nat64lsn_states_chunk), nat64lsn_state_ctor,1795NULL, NULL, NULL, UMA_ALIGN_PTR, 0);1796nat64lsn_job_zone = uma_zcreate("NAT64LSN jobs",1797sizeof(struct nat64lsn_job_item), NULL, NULL, NULL, NULL,1798UMA_ALIGN_PTR, 0);1799JQUEUE_LOCK_INIT();1800}18011802void1803nat64lsn_uninit_internal(void)1804{18051806/* XXX: epoch_task drain */1807JQUEUE_LOCK_DESTROY();1808uma_zdestroy(nat64lsn_host_zone);1809uma_zdestroy(nat64lsn_pgchunk_zone);1810uma_zdestroy(nat64lsn_pg_zone);1811uma_zdestroy(nat64lsn_aliaslink_zone);1812uma_zdestroy(nat64lsn_state_zone);1813uma_zdestroy(nat64lsn_job_zone);1814}18151816void1817nat64lsn_start_instance(struct nat64lsn_cfg *cfg)1818{18191820CALLOUT_LOCK(cfg);1821callout_reset(&cfg->periodic, hz * PERIODIC_DELAY,1822nat64lsn_periodic, cfg);1823CALLOUT_UNLOCK(cfg);1824}18251826struct nat64lsn_cfg *1827nat64lsn_init_config(struct ip_fw_chain *ch, in_addr_t prefix, int plen)1828{1829struct nat64lsn_cfg *cfg;1830struct nat64lsn_alias 
*alias;1831int i, naddr;18321833cfg = malloc(sizeof(struct nat64lsn_cfg), M_NAT64LSN,1834M_WAITOK | M_ZERO);18351836CFG_LOCK_INIT(cfg);1837CALLOUT_LOCK_INIT(cfg);1838STAILQ_INIT(&cfg->jhead);1839cfg->vp = curvnet;1840COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK);18411842cfg->hash_seed = arc4random();1843cfg->hosts_hashsize = NAT64LSN_HOSTS_HSIZE;1844cfg->hosts_hash = malloc(sizeof(struct nat64lsn_hosts_slist) *1845cfg->hosts_hashsize, M_NAT64LSN, M_WAITOK | M_ZERO);1846for (i = 0; i < cfg->hosts_hashsize; i++)1847CK_SLIST_INIT(&cfg->hosts_hash[i]);18481849naddr = 1 << (32 - plen);1850cfg->prefix4 = prefix;1851cfg->pmask4 = prefix | (naddr - 1);1852cfg->plen4 = plen;1853cfg->aliases = malloc(sizeof(struct nat64lsn_alias) * naddr,1854M_NAT64LSN, M_WAITOK | M_ZERO);1855for (i = 0; i < naddr; i++) {1856alias = &cfg->aliases[i];1857alias->addr = prefix + i; /* host byte order */1858CK_SLIST_INIT(&alias->hosts);1859ALIAS_LOCK_INIT(alias);1860}18611862callout_init_mtx(&cfg->periodic, &cfg->periodic_lock, 0);1863callout_init(&cfg->jcallout, CALLOUT_MPSAFE);18641865return (cfg);1866}18671868static void1869nat64lsn_destroy_pg(struct nat64lsn_pg *pg)1870{1871int i;18721873if (pg->chunks_count == 1) {1874uma_zfree(nat64lsn_state_zone, pg->states);1875} else {1876for (i = 0; i < pg->chunks_count; i++)1877uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]);1878free(pg->states_chunk, M_NAT64LSN);1879free(pg->freemask_chunk, M_NAT64LSN);1880}1881uma_zfree(nat64lsn_pg_zone, pg);1882}18831884static void1885nat64lsn_destroy_alias(struct nat64lsn_cfg *cfg,1886struct nat64lsn_alias *alias)1887{1888struct nat64lsn_pg *pg;1889int i;18901891while (!CK_SLIST_EMPTY(&alias->portgroups)) {1892pg = CK_SLIST_FIRST(&alias->portgroups);1893CK_SLIST_REMOVE_HEAD(&alias->portgroups, entries);1894nat64lsn_destroy_pg(pg);1895}1896for (i = 0; i < 32; i++) {1897if (ISSET32(alias->tcp_chunkmask, i))1898uma_zfree(nat64lsn_pgchunk_zone, alias->tcp[i]);1899if (ISSET32(alias->udp_chunkmask, 
i))1900uma_zfree(nat64lsn_pgchunk_zone, alias->udp[i]);1901if (ISSET32(alias->icmp_chunkmask, i))1902uma_zfree(nat64lsn_pgchunk_zone, alias->icmp[i]);1903}1904ALIAS_LOCK_DESTROY(alias);1905}19061907static void1908nat64lsn_destroy_host(struct nat64lsn_host *host)1909{1910struct nat64lsn_aliaslink *link;19111912while (!CK_SLIST_EMPTY(&host->aliases)) {1913link = CK_SLIST_FIRST(&host->aliases);1914CK_SLIST_REMOVE_HEAD(&host->aliases, host_entries);19151916ALIAS_LOCK(link->alias);1917CK_SLIST_REMOVE(&link->alias->hosts, link,1918nat64lsn_aliaslink, alias_entries);1919link->alias->hosts_count--;1920ALIAS_UNLOCK(link->alias);19211922uma_zfree(nat64lsn_aliaslink_zone, link);1923}1924HOST_LOCK_DESTROY(host);1925free(host->states_hash, M_NAT64LSN);1926uma_zfree(nat64lsn_host_zone, host);1927}19281929void1930nat64lsn_destroy_config(struct nat64lsn_cfg *cfg)1931{1932struct nat64lsn_host *host;1933int i;19341935CALLOUT_LOCK(cfg);1936callout_drain(&cfg->periodic);1937CALLOUT_UNLOCK(cfg);1938callout_drain(&cfg->jcallout);19391940for (i = 0; i < cfg->hosts_hashsize; i++) {1941while (!CK_SLIST_EMPTY(&cfg->hosts_hash[i])) {1942host = CK_SLIST_FIRST(&cfg->hosts_hash[i]);1943CK_SLIST_REMOVE_HEAD(&cfg->hosts_hash[i], entries);1944nat64lsn_destroy_host(host);1945}1946}19471948for (i = 0; i < (1 << (32 - cfg->plen4)); i++)1949nat64lsn_destroy_alias(cfg, &cfg->aliases[i]);19501951CALLOUT_LOCK_DESTROY(cfg);1952CFG_LOCK_DESTROY(cfg);1953COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS);1954free(cfg->hosts_hash, M_NAT64LSN);1955free(cfg->aliases, M_NAT64LSN);1956free(cfg, M_NAT64LSN);1957}1958195919601961