GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netpfil/pf/pf.c
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2001 Daniel Hartmeier
5
* Copyright (c) 2002 - 2008 Henning Brauer
6
* Copyright (c) 2012 Gleb Smirnoff <[email protected]>
7
* All rights reserved.
8
*
9
* Redistribution and use in source and binary forms, with or without
10
* modification, are permitted provided that the following conditions
11
* are met:
12
*
13
* - Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* - Redistributions in binary form must reproduce the above
16
* copyright notice, this list of conditions and the following
17
* disclaimer in the documentation and/or other materials provided
18
* with the distribution.
19
*
20
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31
* POSSIBILITY OF SUCH DAMAGE.
32
*
33
* Effort sponsored in part by the Defense Advanced Research Projects
34
* Agency (DARPA) and Air Force Research Laboratory, Air Force
35
* Materiel Command, USAF, under agreement number F30602-01-2-0537.
36
*
37
* $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
38
*/
39
40
#include <sys/cdefs.h>
41
#include "opt_bpf.h"
42
#include "opt_inet.h"
43
#include "opt_inet6.h"
44
#include "opt_pf.h"
45
#include "opt_sctp.h"
46
47
#include <sys/param.h>
48
#include <sys/bus.h>
49
#include <sys/endian.h>
50
#include <sys/gsb_crc32.h>
51
#include <sys/hash.h>
52
#include <sys/interrupt.h>
53
#include <sys/kernel.h>
54
#include <sys/kthread.h>
55
#include <sys/limits.h>
56
#include <sys/mbuf.h>
57
#include <sys/random.h>
58
#include <sys/refcount.h>
59
#include <sys/sdt.h>
60
#include <sys/socket.h>
61
#include <sys/sysctl.h>
62
#include <sys/taskqueue.h>
63
#include <sys/ucred.h>
64
65
#include <crypto/sha2/sha512.h>
66
67
#include <net/if.h>
68
#include <net/if_var.h>
69
#include <net/if_private.h>
70
#include <net/if_types.h>
71
#include <net/if_vlan_var.h>
72
#include <net/route.h>
73
#include <net/route/nhop.h>
74
#include <net/vnet.h>
75
76
#include <net/pfil.h>
77
#include <net/pfvar.h>
78
#include <net/if_pflog.h>
79
#include <net/if_pfsync.h>
80
81
#include <netinet/in_pcb.h>
82
#include <netinet/in_var.h>
83
#include <netinet/in_fib.h>
84
#include <netinet/ip.h>
85
#include <netinet/ip_fw.h>
86
#include <netinet/ip_icmp.h>
87
#include <netinet/icmp_var.h>
88
#include <netinet/ip_var.h>
89
#include <netinet/tcp.h>
90
#include <netinet/tcp_fsm.h>
91
#include <netinet/tcp_seq.h>
92
#include <netinet/tcp_timer.h>
93
#include <netinet/tcp_var.h>
94
#include <netinet/udp.h>
95
#include <netinet/udp_var.h>
96
97
/* dummynet */
98
#include <netinet/ip_dummynet.h>
99
#include <netinet/ip_fw.h>
100
#include <netpfil/ipfw/dn_heap.h>
101
#include <netpfil/ipfw/ip_fw_private.h>
102
#include <netpfil/ipfw/ip_dn_private.h>
103
104
#ifdef INET6
105
#include <netinet/ip6.h>
106
#include <netinet/icmp6.h>
107
#include <netinet6/nd6.h>
108
#include <netinet6/ip6_var.h>
109
#include <netinet6/in6_pcb.h>
110
#include <netinet6/in6_fib.h>
111
#include <netinet6/scope6_var.h>
112
#endif /* INET6 */
113
114
#include <netinet/sctp_header.h>
115
#include <netinet/sctp_crc32.h>
116
117
#include <netipsec/ah.h>
118
119
#include <machine/in_cksum.h>
120
#include <security/mac/mac_framework.h>
121
122
SDT_PROVIDER_DEFINE(pf);
123
SDT_PROBE_DEFINE2(pf, , test, reason_set, "int", "int");
124
SDT_PROBE_DEFINE4(pf, ip, test, done, "int", "int", "struct pf_krule *",
125
"struct pf_kstate *");
126
SDT_PROBE_DEFINE5(pf, ip, state, lookup, "struct pfi_kkif *",
127
"struct pf_state_key_cmp *", "int", "struct pf_pdesc *",
128
"struct pf_kstate *");
129
SDT_PROBE_DEFINE2(pf, ip, , bound_iface, "struct pf_kstate *",
130
"struct pfi_kkif *");
131
SDT_PROBE_DEFINE4(pf, ip, route_to, entry, "struct mbuf *",
132
"struct pf_pdesc *", "struct pf_kstate *", "struct ifnet *");
133
SDT_PROBE_DEFINE1(pf, ip, route_to, drop, "int");
134
SDT_PROBE_DEFINE2(pf, ip, route_to, output, "struct ifnet *", "int");
135
SDT_PROBE_DEFINE4(pf, ip6, route_to, entry, "struct mbuf *",
136
"struct pf_pdesc *", "struct pf_kstate *", "struct ifnet *");
137
SDT_PROBE_DEFINE1(pf, ip6, route_to, drop, "int");
138
SDT_PROBE_DEFINE2(pf, ip6, route_to, output, "struct ifnet *", "int");
139
SDT_PROBE_DEFINE4(pf, sctp, multihome, test, "struct pfi_kkif *",
140
"struct pf_krule *", "struct mbuf *", "int");
141
SDT_PROBE_DEFINE2(pf, sctp, multihome, add, "uint32_t",
142
"struct pf_sctp_source *");
143
SDT_PROBE_DEFINE3(pf, sctp, multihome, remove, "uint32_t",
144
"struct pf_kstate *", "struct pf_sctp_source *");
145
SDT_PROBE_DEFINE4(pf, sctp, multihome_scan, entry, "int",
146
"int", "struct pf_pdesc *", "int");
147
SDT_PROBE_DEFINE2(pf, sctp, multihome_scan, param, "uint16_t", "uint16_t");
148
SDT_PROBE_DEFINE2(pf, sctp, multihome_scan, ipv4, "struct in_addr *",
149
"int");
150
SDT_PROBE_DEFINE2(pf, sctp, multihome_scan, ipv6, "struct in_addr6 *",
151
"int");
152
153
SDT_PROBE_DEFINE3(pf, eth, test_rule, entry, "int", "struct ifnet *",
154
"struct mbuf *");
155
SDT_PROBE_DEFINE2(pf, eth, test_rule, test, "int", "struct pf_keth_rule *");
156
SDT_PROBE_DEFINE3(pf, eth, test_rule, mismatch,
157
"int", "struct pf_keth_rule *", "char *");
158
SDT_PROBE_DEFINE2(pf, eth, test_rule, match, "int", "struct pf_keth_rule *");
159
SDT_PROBE_DEFINE2(pf, eth, test_rule, final_match,
160
"int", "struct pf_keth_rule *");
161
SDT_PROBE_DEFINE2(pf, purge, state, rowcount, "int", "size_t");
162
SDT_PROBE_DEFINE2(pf, , log, log, "int", "const char *");
163
164
/*
165
* Global variables
166
*/
167
168
/* state tables */
169
VNET_DEFINE(struct pf_altqqueue, pf_altqs[4]);
170
VNET_DEFINE(struct pf_kpalist, pf_pabuf[3]);
171
VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active);
172
VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_active);
173
VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive);
174
VNET_DEFINE(struct pf_altqqueue *, pf_altq_ifs_inactive);
175
VNET_DEFINE(struct pf_kstatus, pf_status);
176
177
VNET_DEFINE(u_int32_t, ticket_altqs_active);
178
VNET_DEFINE(u_int32_t, ticket_altqs_inactive);
179
VNET_DEFINE(int, altqs_inactive_open);
180
VNET_DEFINE(u_int32_t, ticket_pabuf);
181
182
static const int PF_HDR_LIMIT = 20; /* arbitrary limit */
183
184
VNET_DEFINE(SHA512_CTX, pf_tcp_secret_ctx);
185
#define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx)
186
VNET_DEFINE(u_char, pf_tcp_secret[16]);
187
#define V_pf_tcp_secret VNET(pf_tcp_secret)
188
VNET_DEFINE(int, pf_tcp_secret_init);
189
#define V_pf_tcp_secret_init VNET(pf_tcp_secret_init)
190
VNET_DEFINE(int, pf_tcp_iss_off);
191
#define V_pf_tcp_iss_off VNET(pf_tcp_iss_off)
192
VNET_DECLARE(int, pf_vnet_active);
193
#define V_pf_vnet_active VNET(pf_vnet_active)
194
195
VNET_DEFINE_STATIC(uint32_t, pf_purge_idx);
196
#define V_pf_purge_idx VNET(pf_purge_idx)
197
198
#ifdef PF_WANT_32_TO_64_COUNTER
199
VNET_DEFINE_STATIC(uint32_t, pf_counter_periodic_iter);
200
#define V_pf_counter_periodic_iter VNET(pf_counter_periodic_iter)
201
202
VNET_DEFINE(struct allrulelist_head, pf_allrulelist);
203
VNET_DEFINE(size_t, pf_allrulecount);
204
VNET_DEFINE(struct pf_krule *, pf_rulemarker);
205
#endif
206
207
#define PF_SCTP_MAX_ENDPOINTS 8
208
209
struct pf_sctp_endpoint;
210
RB_HEAD(pf_sctp_endpoints, pf_sctp_endpoint);
211
struct pf_sctp_source {
212
sa_family_t af;
213
struct pf_addr addr;
214
TAILQ_ENTRY(pf_sctp_source) entry;
215
};
216
TAILQ_HEAD(pf_sctp_sources, pf_sctp_source);
217
struct pf_sctp_endpoint
218
{
219
uint32_t v_tag;
220
struct pf_sctp_sources sources;
221
RB_ENTRY(pf_sctp_endpoint) entry;
222
};
223
static int
224
pf_sctp_endpoint_compare(struct pf_sctp_endpoint *a, struct pf_sctp_endpoint *b)
225
{
226
return (a->v_tag - b->v_tag);
227
}
228
RB_PROTOTYPE(pf_sctp_endpoints, pf_sctp_endpoint, entry, pf_sctp_endpoint_compare);
229
RB_GENERATE(pf_sctp_endpoints, pf_sctp_endpoint, entry, pf_sctp_endpoint_compare);
230
VNET_DEFINE_STATIC(struct pf_sctp_endpoints, pf_sctp_endpoints);
231
#define V_pf_sctp_endpoints VNET(pf_sctp_endpoints)
232
static struct mtx_padalign pf_sctp_endpoints_mtx;
233
MTX_SYSINIT(pf_sctp_endpoints_mtx, &pf_sctp_endpoints_mtx, "SCTP endpoints", MTX_DEF);
234
#define PF_SCTP_ENDPOINTS_LOCK() mtx_lock(&pf_sctp_endpoints_mtx)
235
#define PF_SCTP_ENDPOINTS_UNLOCK() mtx_unlock(&pf_sctp_endpoints_mtx)
236
237
/*
238
* Queue for pf_intr() sends.
239
*/
240
static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
241
struct pf_send_entry {
242
STAILQ_ENTRY(pf_send_entry) pfse_next;
243
struct mbuf *pfse_m;
244
enum {
245
PFSE_IP,
246
PFSE_IP6,
247
PFSE_ICMP,
248
PFSE_ICMP6,
249
} pfse_type;
250
struct {
251
int type;
252
int code;
253
int mtu;
254
} icmpopts;
255
};
256
257
STAILQ_HEAD(pf_send_head, pf_send_entry);
258
VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue);
259
#define V_pf_sendqueue VNET(pf_sendqueue)
260
261
static struct mtx_padalign pf_sendqueue_mtx;
262
MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
263
#define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx)
264
#define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx)
265
266
/*
267
* Queue for pf_overload_task() tasks.
268
*/
269
struct pf_overload_entry {
270
SLIST_ENTRY(pf_overload_entry) next;
271
struct pf_addr addr;
272
sa_family_t af;
273
uint8_t dir;
274
struct pf_krule *rule;
275
};
276
277
SLIST_HEAD(pf_overload_head, pf_overload_entry);
278
VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue);
279
#define V_pf_overloadqueue VNET(pf_overloadqueue)
280
VNET_DEFINE_STATIC(struct task, pf_overloadtask);
281
#define V_pf_overloadtask VNET(pf_overloadtask)
282
283
static struct mtx_padalign pf_overloadqueue_mtx;
284
MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
285
"pf overload/flush queue", MTX_DEF);
286
#define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx)
287
#define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx)
288
289
VNET_DEFINE(struct pf_krulequeue, pf_unlinked_rules);
290
struct mtx_padalign pf_unlnkdrules_mtx;
291
MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
292
MTX_DEF);
293
294
struct sx pf_config_lock;
295
SX_SYSINIT(pf_config_lock, &pf_config_lock, "pf config");
296
297
struct mtx_padalign pf_table_stats_lock;
298
MTX_SYSINIT(pf_table_stats_lock, &pf_table_stats_lock, "pf table stats",
299
MTX_DEF);
300
301
VNET_DEFINE_STATIC(uma_zone_t, pf_sources_z);
302
#define V_pf_sources_z VNET(pf_sources_z)
303
uma_zone_t pf_mtag_z;
304
VNET_DEFINE(uma_zone_t, pf_state_z);
305
VNET_DEFINE(uma_zone_t, pf_state_key_z);
306
VNET_DEFINE(uma_zone_t, pf_udp_mapping_z);
307
308
VNET_DEFINE(struct unrhdr64, pf_stateid);
309
310
static void pf_src_tree_remove_state(struct pf_kstate *);
311
static int pf_check_threshold(struct pf_kthreshold *);
312
313
static void pf_change_ap(struct pf_pdesc *, struct pf_addr *, u_int16_t *,
314
struct pf_addr *, u_int16_t);
315
static int pf_modulate_sack(struct pf_pdesc *,
316
struct tcphdr *, struct pf_state_peer *);
317
int pf_icmp_mapping(struct pf_pdesc *, u_int8_t, int *,
318
u_int16_t *, u_int16_t *);
319
static void pf_change_icmp(struct pf_addr *, u_int16_t *,
320
struct pf_addr *, struct pf_addr *, u_int16_t,
321
u_int16_t *, u_int16_t *, u_int16_t *,
322
u_int16_t *, u_int8_t, sa_family_t);
323
int pf_change_icmp_af(struct mbuf *, int,
324
struct pf_pdesc *, struct pf_pdesc *,
325
struct pf_addr *, struct pf_addr *, sa_family_t,
326
sa_family_t);
327
int pf_translate_icmp_af(int, void *);
328
static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
329
int, sa_family_t, struct pf_krule *, int);
330
static void pf_detach_state(struct pf_kstate *);
331
static int pf_state_key_attach(struct pf_state_key *,
332
struct pf_state_key *, struct pf_kstate *);
333
static void pf_state_key_detach(struct pf_kstate *, int);
334
static int pf_state_key_ctor(void *, int, void *, int);
335
static u_int32_t pf_tcp_iss(struct pf_pdesc *);
336
static __inline void pf_dummynet_flag_remove(struct mbuf *m,
337
struct pf_mtag *pf_mtag);
338
static int pf_dummynet(struct pf_pdesc *, struct pf_kstate *,
339
struct pf_krule *, struct mbuf **);
340
static int pf_dummynet_route(struct pf_pdesc *,
341
struct pf_kstate *, struct pf_krule *,
342
struct ifnet *, const struct sockaddr *, struct mbuf **);
343
static int pf_test_eth_rule(int, struct pfi_kkif *,
344
struct mbuf **);
345
static enum pf_test_status pf_match_rule(struct pf_test_ctx *, struct pf_kruleset *);
346
static int pf_test_rule(struct pf_krule **, struct pf_kstate **,
347
struct pf_pdesc *, struct pf_krule **,
348
struct pf_kruleset **, u_short *, struct inpcb *,
349
struct pf_krule_slist *);
350
static int pf_create_state(struct pf_krule *,
351
struct pf_test_ctx *,
352
struct pf_kstate **, u_int16_t, u_int16_t);
353
static int pf_state_key_addr_setup(struct pf_pdesc *,
354
struct pf_state_key_cmp *, int);
355
static int pf_tcp_track_full(struct pf_kstate *,
356
struct pf_pdesc *, u_short *, int *,
357
struct pf_state_peer *, struct pf_state_peer *,
358
u_int8_t, u_int8_t);
359
static int pf_tcp_track_sloppy(struct pf_kstate *,
360
struct pf_pdesc *, u_short *,
361
struct pf_state_peer *, struct pf_state_peer *,
362
u_int8_t, u_int8_t);
363
static __inline int pf_synproxy_ack(struct pf_krule *, struct pf_pdesc *,
364
struct pf_kstate **, struct pf_rule_actions *);
365
static int pf_test_state(struct pf_kstate **, struct pf_pdesc *,
366
u_short *);
367
int pf_icmp_state_lookup(struct pf_state_key_cmp *,
368
struct pf_pdesc *, struct pf_kstate **,
369
u_int16_t, u_int16_t, int, int *, int, int);
370
static int pf_test_state_icmp(struct pf_kstate **,
371
struct pf_pdesc *, u_short *);
372
static int pf_sctp_track(struct pf_kstate *, struct pf_pdesc *,
373
u_short *);
374
static void pf_sctp_multihome_detach_addr(const struct pf_kstate *);
375
static void pf_sctp_multihome_delayed(struct pf_pdesc *,
376
struct pfi_kkif *, struct pf_kstate *, int);
377
static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
378
int, u_int16_t);
379
static int pf_check_proto_cksum(struct mbuf *, int, int,
380
u_int8_t, sa_family_t);
381
static int pf_walk_option(struct pf_pdesc *, struct ip *,
382
int, int, u_short *);
383
static int pf_walk_header(struct pf_pdesc *, struct ip *, u_short *);
384
#ifdef INET6
385
static int pf_walk_option6(struct pf_pdesc *, struct ip6_hdr *,
386
int, int, u_short *);
387
static int pf_walk_header6(struct pf_pdesc *, struct ip6_hdr *,
388
u_short *);
389
#endif
390
static void pf_print_state_parts(struct pf_kstate *,
391
struct pf_state_key *, struct pf_state_key *);
392
static int pf_patch_8(struct pf_pdesc *, u_int8_t *, u_int8_t,
393
bool);
394
static int pf_find_state(struct pf_pdesc *,
395
const struct pf_state_key_cmp *, struct pf_kstate **);
396
static bool pf_src_connlimit(struct pf_kstate *);
397
static int pf_match_rcvif(struct mbuf *, struct pf_krule *);
398
static void pf_counters_inc(int, struct pf_pdesc *,
399
struct pf_kstate *, struct pf_krule *,
400
struct pf_krule *, struct pf_krule_slist *);
401
static void pf_log_matches(struct pf_pdesc *, struct pf_krule *,
402
struct pf_krule *, struct pf_kruleset *,
403
struct pf_krule_slist *);
404
static void pf_overload_task(void *v, int pending);
405
static u_short pf_insert_src_node(struct pf_ksrc_node *[PF_SN_MAX],
406
struct pf_srchash *[PF_SN_MAX], struct pf_krule *,
407
struct pf_addr *, sa_family_t, struct pf_addr *,
408
struct pfi_kkif *, sa_family_t, pf_sn_types_t);
409
static u_int pf_purge_expired_states(u_int, int);
410
static void pf_purge_unlinked_rules(void);
411
static int pf_mtag_uminit(void *, int, int);
412
static void pf_mtag_free(struct m_tag *);
413
static void pf_packet_rework_nat(struct pf_pdesc *, int,
414
struct pf_state_key *);
415
#ifdef INET
416
static int pf_route(struct pf_krule *,
417
struct ifnet *, struct pf_kstate *,
418
struct pf_pdesc *, struct inpcb *);
419
#endif /* INET */
420
#ifdef INET6
421
static void pf_change_a6(struct pf_addr *, u_int16_t *,
422
struct pf_addr *, u_int8_t);
423
static int pf_route6(struct pf_krule *,
424
struct ifnet *, struct pf_kstate *,
425
struct pf_pdesc *, struct inpcb *);
426
#endif /* INET6 */
427
static __inline void pf_set_protostate(struct pf_kstate *, int, u_int8_t);
428
429
int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
430
431
static inline int
432
pf_statelim_id_cmp(const struct pf_statelim *a, const struct pf_statelim *b)
433
{
434
if (a->pfstlim_id > b->pfstlim_id)
435
return (1);
436
if (a->pfstlim_id < b->pfstlim_id)
437
return (-1);
438
439
return (0);
440
}
441
442
RB_GENERATE(pf_statelim_id_tree, pf_statelim, pfstlim_id_tree,
443
pf_statelim_id_cmp);
444
445
static inline int
446
pf_statelim_nm_cmp(const struct pf_statelim *a, const struct pf_statelim *b)
447
{
448
return (strncmp(a->pfstlim_nm, b->pfstlim_nm, sizeof(a->pfstlim_nm)));
449
}
450
451
RB_GENERATE(pf_statelim_nm_tree, pf_statelim, pfstlim_nm_tree,
452
pf_statelim_nm_cmp);
453
454
VNET_DEFINE(struct pf_statelim_id_tree, pf_statelim_id_tree_active);
455
VNET_DEFINE(struct pf_statelim_list, pf_statelim_list_active);
456
VNET_DEFINE(struct pf_statelim_id_tree, pf_statelim_id_tree_inactive);
457
VNET_DEFINE(struct pf_statelim_nm_tree, pf_statelim_nm_tree_inactive);
458
VNET_DEFINE(struct pf_statelim_list, pf_statelim_list_inactive);
459
460
static inline int
461
pf_sourcelim_id_cmp(const struct pf_sourcelim *a, const struct pf_sourcelim *b)
462
{
463
if (a->pfsrlim_id > b->pfsrlim_id)
464
return (1);
465
if (a->pfsrlim_id < b->pfsrlim_id)
466
return (-1);
467
468
return (0);
469
}
470
471
RB_GENERATE(pf_sourcelim_id_tree, pf_sourcelim, pfsrlim_id_tree,
472
pf_sourcelim_id_cmp);
473
474
static inline int
475
pf_sourcelim_nm_cmp(const struct pf_sourcelim *a, const struct pf_sourcelim *b)
476
{
477
return (strncmp(a->pfsrlim_nm, b->pfsrlim_nm, sizeof(a->pfsrlim_nm)));
478
}
479
480
RB_GENERATE(pf_sourcelim_nm_tree, pf_sourcelim, pfsrlim_nm_tree,
481
pf_sourcelim_nm_cmp);
482
483
static inline int
484
pf_source_cmp(const struct pf_source *a, const struct pf_source *b)
485
{
486
if (a->pfsr_af > b->pfsr_af)
487
return (1);
488
if (a->pfsr_af < b->pfsr_af)
489
return (-1);
490
if (a->pfsr_rdomain > b->pfsr_rdomain)
491
return (1);
492
if (a->pfsr_rdomain < b->pfsr_rdomain)
493
return (-1);
494
495
return (pf_addr_cmp(&a->pfsr_addr, &b->pfsr_addr, a->pfsr_af));
496
}
497
498
RB_GENERATE(pf_source_tree, pf_source, pfsr_tree, pf_source_cmp);
499
500
static inline int
501
pf_source_ioc_cmp(const struct pf_source *a, const struct pf_source *b)
502
{
503
size_t i;
504
505
if (a->pfsr_af > b->pfsr_af)
506
return (1);
507
if (a->pfsr_af < b->pfsr_af)
508
return (-1);
509
if (a->pfsr_rdomain > b->pfsr_rdomain)
510
return (1);
511
if (a->pfsr_rdomain < b->pfsr_rdomain)
512
return (-1);
513
514
for (i = 0; i < nitems(a->pfsr_addr.addr32); i++) {
515
uint32_t wa = ntohl(a->pfsr_addr.addr32[i]);
516
uint32_t wb = ntohl(b->pfsr_addr.addr32[i]);
517
518
if (wa > wb)
519
return (1);
520
if (wa < wb)
521
return (-1);
522
}
523
524
return (0);
525
}
526
527
RB_GENERATE(pf_source_ioc_tree, pf_source, pfsr_ioc_tree, pf_source_ioc_cmp);
528
529
VNET_DEFINE(struct pf_sourcelim_id_tree, pf_sourcelim_id_tree_active);
530
VNET_DEFINE(struct pf_sourcelim_list, pf_sourcelim_list_active);
531
532
VNET_DEFINE(struct pf_sourcelim_id_tree, pf_sourcelim_id_tree_inactive);
533
VNET_DEFINE(struct pf_sourcelim_nm_tree, pf_sourcelim_nm_tree_inactive);
534
VNET_DEFINE(struct pf_sourcelim_list, pf_sourcelim_list_inactive);
535
536
static inline struct pf_statelim *
537
pf_statelim_find(uint32_t id)
538
{
539
struct pf_statelim key;
540
541
/* only the id is used in cmp, so the rest need not be zeroed */
542
key.pfstlim_id = id;
543
544
return (RB_FIND(pf_statelim_id_tree,
545
&V_pf_statelim_id_tree_active, &key));
546
}
547
548
static inline struct pf_sourcelim *
549
pf_sourcelim_find(uint32_t id)
550
{
551
struct pf_sourcelim key;
552
553
/* only the id is used in cmp, so the rest need not be zeroed */
554
key.pfsrlim_id = id;
555
556
return (RB_FIND(pf_sourcelim_id_tree,
557
&V_pf_sourcelim_id_tree_active, &key));
558
}
559
560
struct pf_source_list pf_source_gc = TAILQ_HEAD_INITIALIZER(pf_source_gc);
561
562
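/*
* Sources whose pfsr_inuse count drops to zero are parked on the
* pf_source_gc list by pf_source_rele(). pf_source_purge() frees an
* entry once it has stayed idle for longer than the limiter's rate
* window.
*/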
static void
563
pf_source_purge(void)
564
{
565
struct pf_source *sr, *nsr;
566
567
TAILQ_FOREACH_SAFE(sr, &pf_source_gc, pfsr_empty_gc, nsr) {
568
struct pf_sourcelim *srlim = sr->pfsr_parent;
569
570
if (time_uptime <= sr->pfsr_empty_ts +
571
srlim->pfsrlim_rate.seconds + 1)
572
continue;
573
574
TAILQ_REMOVE(&pf_source_gc, sr, pfsr_empty_gc);
575
576
RB_REMOVE(pf_source_tree, &srlim->pfsrlim_sources, sr);
577
RB_REMOVE(pf_source_ioc_tree, &srlim->pfsrlim_ioc_sources, sr);
578
srlim->pfsrlim_nsources--;
579
580
free(sr, M_PF_SOURCE_LIM);
581
}
582
}
583
584
static void
585
pf_source_pfr_addr(struct pfr_addr *p, const struct pf_source *sr)
586
{
587
struct pf_sourcelim *srlim = sr->pfsr_parent;
588
589
memset(p, 0, sizeof(*p));
590
591
p->pfra_af = sr->pfsr_af;
592
switch (sr->pfsr_af) {
593
case AF_INET:
594
p->pfra_net = srlim->pfsrlim_ipv4_prefix;
595
p->pfra_ip4addr = sr->pfsr_addr.v4;
596
break;
597
#ifdef INET6
598
case AF_INET6:
599
p->pfra_net = srlim->pfsrlim_ipv6_prefix;
600
p->pfra_ip6addr = sr->pfsr_addr.v6;
601
break;
602
#endif /* INET6 */
603
}
604
}
605
606
static void
607
pf_source_used(struct pf_source *sr)
608
{
609
struct pf_sourcelim *srlim = sr->pfsr_parent;
610
struct pfr_ktable *t;
611
unsigned int used;
612
613
used = sr->pfsr_inuse++;
614
sr->pfsr_rate_ts += srlim->pfsrlim_rate_token;
615
616
if (used == 0)
617
TAILQ_REMOVE(&pf_source_gc, sr, pfsr_empty_gc);
618
else if ((t = srlim->pfsrlim_overload.table) != NULL &&
619
used >= srlim->pfsrlim_overload.hwm && !sr->pfsr_intable) {
620
struct pfr_addr p;
621
622
pf_source_pfr_addr(&p, sr);
623
624
pfr_insert_kentry(t, &p, time_second);
625
sr->pfsr_intable = 1;
626
}
627
}
628
629
static void
630
pf_source_rele(struct pf_source *sr)
631
{
632
struct pf_sourcelim *srlim = sr->pfsr_parent;
633
struct pfr_ktable *t;
634
unsigned int used;
635
636
used = --sr->pfsr_inuse;
637
638
t = srlim->pfsrlim_overload.table;
639
if (t != NULL && sr->pfsr_intable &&
640
used < srlim->pfsrlim_overload.lwm) {
641
struct pfr_addr p;
642
643
pf_source_pfr_addr(&p, sr);
644
645
pfr_remove_kentry(t, &p);
646
sr->pfsr_intable = 0;
647
}
648
649
if (used == 0) {
650
TAILQ_INSERT_TAIL(&pf_source_gc, sr, pfsr_empty_gc);
651
sr->pfsr_empty_ts = time_uptime + srlim->pfsrlim_rate.seconds;
652
}
653
}
654
655
static inline void
656
pf_source_key(struct pf_sourcelim *srlim, struct pf_source *key,
657
sa_family_t af, const struct pf_addr *addr)
658
{
659
size_t i;
660
661
/* only af+addr is used for lookup. */
662
key->pfsr_af = af;
663
key->pfsr_rdomain = 0;
664
switch (af) {
665
case AF_INET:
666
key->pfsr_addr.addr32[0] =
667
srlim->pfsrlim_ipv4_mask.v4.s_addr &
668
addr->v4.s_addr;
669
670
for (i = 1; i < nitems(key->pfsr_addr.addr32); i++)
671
key->pfsr_addr.addr32[i] = htonl(0);
672
break;
673
#ifdef INET6
674
case AF_INET6:
675
for (i = 0; i < nitems(key->pfsr_addr.addr32); i++) {
676
key->pfsr_addr.addr32[i] =
677
srlim->pfsrlim_ipv6_mask.addr32[i] &
678
addr->addr32[i];
679
}
680
break;
681
#endif
682
default:
683
unhandled_af(af);
684
/* NOTREACHED */
685
}
686
}
687
688
static inline struct pf_source *
689
pf_source_find(struct pf_sourcelim *srlim, struct pf_source *key)
690
{
691
return (RB_FIND(pf_source_tree, &srlim->pfsrlim_sources, key));
692
}
693
694
extern int pf_end_threads;
695
extern struct proc *pf_purge_proc;
696
697
VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
698
699
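/*
* PACKET_UNDO_NAT() rewrites a packet back to its pre-translation
* addresses and ports: for an outbound packet the original tuple is
* recorded in the stack-side state key, for an inbound packet in the
* wire-side key.
*/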
#define PACKET_UNDO_NAT(_pd, _off, _s) \
700
do { \
701
struct pf_state_key *nk; \
702
if ((pd->dir) == PF_OUT) \
703
nk = (_s)->key[PF_SK_STACK]; \
704
else \
705
nk = (_s)->key[PF_SK_WIRE]; \
706
pf_packet_rework_nat(_pd, _off, nk); \
707
} while (0)
708
709
#define PACKET_LOOPED(pd) ((pd)->pf_mtag && \
710
(pd)->pf_mtag->flags & PF_MTAG_FLAG_PACKET_LOOPED)
711
712
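/*
* BOUND_IFACE() picks the interface a new state is bound to. States
* are floating (V_pfi_all) unless the rule is interface-bound;
* reply-to, af-translated inbound and related states also start out
* floating. Otherwise the state stays on pd->kif, except that
* outbound route-to states bind to the route-to interface.
*/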
static struct pfi_kkif *
713
BOUND_IFACE(struct pf_kstate *st, struct pf_pdesc *pd)
714
{
715
struct pfi_kkif *k = pd->kif;
716
717
SDT_PROBE2(pf, ip, , bound_iface, st, k);
718
719
/* Floating unless otherwise specified. */
720
if (! (st->rule->rule_flag & PFRULE_IFBOUND))
721
return (V_pfi_all);
722
723
/*
724
* Initially set to all, because we don't know what interface we'll be
725
* sending this out on when we create the state.
726
*/
727
if (st->rule->rt == PF_REPLYTO || (pd->af != pd->naf && st->direction == PF_IN))
728
return (V_pfi_all);
729
730
/*
731
* If this state is created based on another state (e.g. SCTP
732
* multihome) always set it floating initially. We can't know for sure
733
* what interface the actual traffic for this state will come in on.
734
*/
735
if (pd->related_rule)
736
return (V_pfi_all);
737
738
/* Don't overrule the interface for states created on incoming packets. */
739
if (st->direction == PF_IN)
740
return (k);
741
742
/* No route-to, so don't overrule. */
743
if (st->act.rt != PF_ROUTETO)
744
return (k);
745
746
/* Bind to the route-to interface. */
747
return (st->act.rt_kif);
748
}
749
750
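/*
* Bump or drop the per-rule state counters for the rule that created
* the state, its anchor, the NAT rule and any recorded match rules.
* The inequality checks keep a rule that appears in more than one of
* those roles from being counted twice.
*/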
#define STATE_INC_COUNTERS(s) \
751
do { \
752
struct pf_krule_item *mrm; \
753
counter_u64_add(s->rule->states_cur, 1); \
754
counter_u64_add(s->rule->states_tot, 1); \
755
if (s->anchor != NULL) { \
756
counter_u64_add(s->anchor->states_cur, 1); \
757
counter_u64_add(s->anchor->states_tot, 1); \
758
} \
759
if (s->nat_rule != NULL && s->nat_rule != s->rule) { \
760
counter_u64_add(s->nat_rule->states_cur, 1); \
761
counter_u64_add(s->nat_rule->states_tot, 1); \
762
} \
763
SLIST_FOREACH(mrm, &s->match_rules, entry) { \
764
if (s->nat_rule != mrm->r) { \
765
counter_u64_add(mrm->r->states_cur, 1); \
766
counter_u64_add(mrm->r->states_tot, 1); \
767
} \
768
} \
769
} while (0)
770
771
#define STATE_DEC_COUNTERS(s) \
772
do { \
773
struct pf_krule_item *mrm; \
774
counter_u64_add(s->rule->states_cur, -1); \
775
if (s->anchor != NULL) \
776
counter_u64_add(s->anchor->states_cur, -1); \
777
if (s->nat_rule != NULL && s->nat_rule != s->rule) \
778
counter_u64_add(s->nat_rule->states_cur, -1); \
779
SLIST_FOREACH(mrm, &s->match_rules, entry) \
780
if (s->nat_rule != mrm->r) { \
781
counter_u64_add(mrm->r->states_cur, -1);\
782
} \
783
} while (0)
784
785
MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
786
MALLOC_DEFINE(M_PF_RULE_ITEM, "pf_krule_item", "pf(4) rule items");
787
MALLOC_DEFINE(M_PF_STATE_LINK, "pf_state_link", "pf(4) state links");
788
MALLOC_DEFINE(M_PF_SOURCE_LIM, "pf_source_lim", "pf(4) source limiter");
789
VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
790
VNET_DEFINE(struct pf_idhash *, pf_idhash);
791
VNET_DEFINE(struct pf_srchash *, pf_srchash);
792
VNET_DEFINE(struct pf_udpendpointhash *, pf_udpendpointhash);
793
VNET_DEFINE(struct pf_udpendpointmapping *, pf_udpendpointmapping);
794
795
SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
796
"pf(4)");
797
798
VNET_DEFINE(u_long, pf_hashmask);
799
VNET_DEFINE(u_long, pf_srchashmask);
800
VNET_DEFINE(u_long, pf_udpendpointhashmask);
801
VNET_DEFINE_STATIC(u_long, pf_hashsize);
802
#define V_pf_hashsize VNET(pf_hashsize)
803
VNET_DEFINE_STATIC(u_long, pf_srchashsize);
804
#define V_pf_srchashsize VNET(pf_srchashsize)
805
VNET_DEFINE_STATIC(u_long, pf_udpendpointhashsize);
806
#define V_pf_udpendpointhashsize VNET(pf_udpendpointhashsize)
807
u_long pf_ioctl_maxcount = 65535;
808
809
SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
810
&VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");
811
SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
812
&VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");
813
SYSCTL_ULONG(_net_pf, OID_AUTO, udpendpoint_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
814
&VNET_NAME(pf_udpendpointhashsize), 0, "Size of pf(4) endpoint hashtable");
815
SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN,
816
&pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call");
817
818
VNET_DEFINE(void *, pf_swi_cookie);
819
VNET_DEFINE(struct intr_event *, pf_swi_ie);
820
821
VNET_DEFINE(uint32_t, pf_hashseed);
822
#define V_pf_hashseed VNET(pf_hashseed)
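/*
* pf_sctp_checksum() recomputes the SCTP CRC32c in place: the checksum
* field is zeroed in the mbuf, sctp_calculate_cksum() is run from the
* start of the SCTP header, and the result is copied back.
*/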
823
824
static void
825
pf_sctp_checksum(struct mbuf *m, int off)
826
{
827
uint32_t sum = 0;
828
829
/* Zero out the checksum, to enable recalculation. */
830
m_copyback(m, off + offsetof(struct sctphdr, checksum),
831
sizeof(sum), (caddr_t)&sum);
832
833
sum = sctp_calculate_cksum(m, off);
834
835
m_copyback(m, off + offsetof(struct sctphdr, checksum),
836
sizeof(sum), (caddr_t)&sum);
837
}
838
839
int
840
pf_addr_cmp(const struct pf_addr *a, const struct pf_addr *b, sa_family_t af)
841
{
842
843
switch (af) {
844
#ifdef INET
845
case AF_INET:
846
if (a->addr32[0] > b->addr32[0])
847
return (1);
848
if (a->addr32[0] < b->addr32[0])
849
return (-1);
850
break;
851
#endif /* INET */
852
#ifdef INET6
853
case AF_INET6:
854
if (a->addr32[3] > b->addr32[3])
855
return (1);
856
if (a->addr32[3] < b->addr32[3])
857
return (-1);
858
if (a->addr32[2] > b->addr32[2])
859
return (1);
860
if (a->addr32[2] < b->addr32[2])
861
return (-1);
862
if (a->addr32[1] > b->addr32[1])
863
return (1);
864
if (a->addr32[1] < b->addr32[1])
865
return (-1);
866
if (a->addr32[0] > b->addr32[0])
867
return (1);
868
if (a->addr32[0] < b->addr32[0])
869
return (-1);
870
break;
871
#endif /* INET6 */
872
default:
873
unhandled_af(af);
874
}
875
return (0);
876
}
877
878
static bool
879
pf_is_loopback(sa_family_t af, struct pf_addr *addr)
880
{
881
switch (af) {
882
#ifdef INET
883
case AF_INET:
884
return IN_LOOPBACK(ntohl(addr->v4.s_addr));
885
#endif /* INET */
886
case AF_INET6:
887
return IN6_IS_ADDR_LOOPBACK(&addr->v6);
888
default:
889
unhandled_af(af);
890
}
891
}
892
893
static void
894
pf_packet_rework_nat(struct pf_pdesc *pd, int off, struct pf_state_key *nk)
895
{
896
897
switch (pd->virtual_proto) {
898
case IPPROTO_TCP: {
899
struct tcphdr *th = &pd->hdr.tcp;
900
901
if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af))
902
pf_change_ap(pd, pd->src, &th->th_sport,
903
&nk->addr[pd->sidx], nk->port[pd->sidx]);
904
if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af))
905
pf_change_ap(pd, pd->dst, &th->th_dport,
906
&nk->addr[pd->didx], nk->port[pd->didx]);
907
m_copyback(pd->m, off, sizeof(*th), (caddr_t)th);
908
break;
909
}
910
case IPPROTO_UDP: {
911
struct udphdr *uh = &pd->hdr.udp;
912
913
if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af))
914
pf_change_ap(pd, pd->src, &uh->uh_sport,
915
&nk->addr[pd->sidx], nk->port[pd->sidx]);
916
if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af))
917
pf_change_ap(pd, pd->dst, &uh->uh_dport,
918
&nk->addr[pd->didx], nk->port[pd->didx]);
919
m_copyback(pd->m, off, sizeof(*uh), (caddr_t)uh);
920
break;
921
}
922
case IPPROTO_SCTP: {
923
struct sctphdr *sh = &pd->hdr.sctp;
924
925
if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) {
926
pf_change_ap(pd, pd->src, &sh->src_port,
927
&nk->addr[pd->sidx], nk->port[pd->sidx]);
928
}
929
if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) {
930
pf_change_ap(pd, pd->dst, &sh->dest_port,
931
&nk->addr[pd->didx], nk->port[pd->didx]);
932
}
933
934
break;
935
}
936
case IPPROTO_ICMP: {
937
struct icmp *ih = &pd->hdr.icmp;
938
939
if (nk->port[pd->sidx] != ih->icmp_id) {
940
pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
941
ih->icmp_cksum, ih->icmp_id,
942
nk->port[pd->sidx], 0);
943
ih->icmp_id = nk->port[pd->sidx];
944
pd->sport = &ih->icmp_id;
945
946
m_copyback(pd->m, off, ICMP_MINLEN, (caddr_t)ih);
947
}
948
/* FALLTHROUGH */
949
}
950
default:
951
if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) {
952
switch (pd->af) {
953
case AF_INET:
954
pf_change_a(&pd->src->v4.s_addr,
955
pd->ip_sum, nk->addr[pd->sidx].v4.s_addr,
956
0);
957
break;
958
case AF_INET6:
959
pf_addrcpy(pd->src, &nk->addr[pd->sidx],
960
pd->af);
961
break;
962
default:
963
unhandled_af(pd->af);
964
}
965
}
966
if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) {
967
switch (pd->af) {
968
case AF_INET:
969
pf_change_a(&pd->dst->v4.s_addr,
970
pd->ip_sum, nk->addr[pd->didx].v4.s_addr,
971
0);
972
break;
973
case AF_INET6:
974
pf_addrcpy(pd->dst, &nk->addr[pd->didx],
975
pd->af);
976
break;
977
default:
978
unhandled_af(pd->af);
979
}
980
}
981
break;
982
}
983
}
984
985
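/*
* State key and source hashing use murmur3 seeded with the per-vnet
* V_pf_hashseed. Only the pf_state_key_cmp prefix of a state key is
* hashed, so a lookup key built as a pf_state_key_cmp lands in the
* same bucket as the full pf_state_key it is compared against.
*/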
static __inline uint32_t
986
pf_hashkey(const struct pf_state_key *sk)
987
{
988
uint32_t h;
989
990
h = murmur3_32_hash32((const uint32_t *)sk,
991
sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
992
V_pf_hashseed);
993
994
return (h & V_pf_hashmask);
995
}
996
997
__inline uint32_t
998
pf_hashsrc(struct pf_addr *addr, sa_family_t af)
999
{
1000
uint32_t h;
1001
1002
switch (af) {
1003
case AF_INET:
1004
h = murmur3_32_hash32((uint32_t *)&addr->v4,
1005
sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
1006
break;
1007
case AF_INET6:
1008
h = murmur3_32_hash32((uint32_t *)&addr->v6,
1009
sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
1010
break;
1011
default:
1012
unhandled_af(af);
1013
}
1014
1015
return (h & V_pf_srchashmask);
1016
}
1017
1018
static inline uint32_t
1019
pf_hashudpendpoint(struct pf_udp_endpoint *endpoint)
1020
{
1021
uint32_t h;
1022
1023
h = murmur3_32_hash32((uint32_t *)endpoint,
1024
sizeof(struct pf_udp_endpoint_cmp)/sizeof(uint32_t),
1025
V_pf_hashseed);
1026
return (h & V_pf_udpendpointhashmask);
1027
}
1028
1029
#ifdef ALTQ
1030
static int
1031
pf_state_hash(struct pf_kstate *s)
1032
{
1033
u_int32_t hv = (intptr_t)s / sizeof(*s);
1034
1035
hv ^= crc32(&s->src, sizeof(s->src));
1036
hv ^= crc32(&s->dst, sizeof(s->dst));
1037
if (hv == 0)
1038
hv = 1;
1039
return (hv);
1040
}
1041
#endif /* ALTQ */
1042
1043
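/*
* pf_set_protostate() updates the tracked protocol state of one or
* both peers. For TCP states created by this host it also keeps
* V_pf_status.states_halfopen in sync, decrementing the counter when
* the source peer leaves the half-open range.
*/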
static __inline void
1044
pf_set_protostate(struct pf_kstate *s, int which, u_int8_t newstate)
1045
{
1046
if (which == PF_PEER_DST || which == PF_PEER_BOTH)
1047
s->dst.state = newstate;
1048
if (which == PF_PEER_DST)
1049
return;
1050
if (s->src.state == newstate)
1051
return;
1052
if (s->creatorid == V_pf_status.hostid &&
1053
s->key[PF_SK_STACK] != NULL &&
1054
s->key[PF_SK_STACK]->proto == IPPROTO_TCP &&
1055
!(TCPS_HAVEESTABLISHED(s->src.state) ||
1056
s->src.state == TCPS_CLOSED) &&
1057
(TCPS_HAVEESTABLISHED(newstate) || newstate == TCPS_CLOSED))
1058
atomic_add_32(&V_pf_status.states_halfopen, -1);
1059
1060
s->src.state = newstate;
1061
}
1062
1063
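/*
* Thresholds (e.g. max-src-conn-rate) are backed by counter_rate
* objects: pf_init_threshold() allocates one for "limit" events per
* "seconds", and pf_check_threshold() registers an event and returns
* non-zero once that limit has been exceeded within the window.
*/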
bool
1064
pf_init_threshold(struct pf_kthreshold *threshold,
1065
u_int32_t limit, u_int32_t seconds)
1066
{
1067
threshold->limit = limit;
1068
threshold->seconds = seconds;
1069
threshold->cr = counter_rate_alloc(M_NOWAIT, seconds);
1070
1071
return (threshold->cr != NULL);
1072
}
1073
1074
static int
1075
pf_check_threshold(struct pf_kthreshold *threshold)
1076
{
1077
return (counter_ratecheck(threshold->cr, threshold->limit) < 0);
1078
}
1079
1080
static bool
1081
pf_src_connlimit(struct pf_kstate *state)
1082
{
1083
struct pf_overload_entry *pfoe;
1084
struct pf_ksrc_node *src_node = state->sns[PF_SN_LIMIT];
1085
bool limited = false;
1086
1087
PF_STATE_LOCK_ASSERT(state);
1088
PF_SRC_NODE_LOCK(src_node);
1089
1090
src_node->conn++;
1091
state->src.tcp_est = 1;
1092
1093
if (state->rule->max_src_conn &&
1094
state->rule->max_src_conn <
1095
src_node->conn) {
1096
counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
1097
limited = true;
1098
}
1099
1100
if (state->rule->max_src_conn_rate.limit &&
1101
pf_check_threshold(&src_node->conn_rate)) {
1102
counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
1103
limited = true;
1104
}
1105
1106
if (!limited)
1107
goto done;
1108
1109
/* Kill this state. */
1110
state->timeout = PFTM_PURGE;
1111
pf_set_protostate(state, PF_PEER_BOTH, TCPS_CLOSED);
1112
1113
if (state->rule->overload_tbl == NULL)
1114
goto done;
1115
1116
/* Schedule overloading and flushing task. */
1117
pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
1118
if (pfoe == NULL)
1119
goto done; /* too bad :( */
1120
1121
bcopy(&src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
1122
pfoe->af = state->key[PF_SK_WIRE]->af;
1123
pfoe->rule = state->rule;
1124
pfoe->dir = state->direction;
1125
PF_OVERLOADQ_LOCK();
1126
SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
1127
PF_OVERLOADQ_UNLOCK();
1128
taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);
1129
1130
done:
1131
PF_SRC_NODE_UNLOCK(src_node);
1132
return (limited);
1133
}
1134
1135
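/*
* pf_overload_task() drains the overload queue filled by
* pf_src_connlimit(): each offending address is inserted into the
* rule's overload table and, when the rule requests flushing, states
* from that source are marked PFTM_PURGE so they are expired.
*/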
static void
1136
pf_overload_task(void *v, int pending)
1137
{
1138
struct pf_overload_head queue;
1139
struct pfr_addr p;
1140
struct pf_overload_entry *pfoe, *pfoe1;
1141
uint32_t killed = 0;
1142
1143
CURVNET_SET((struct vnet *)v);
1144
1145
PF_OVERLOADQ_LOCK();
1146
queue = V_pf_overloadqueue;
1147
SLIST_INIT(&V_pf_overloadqueue);
1148
PF_OVERLOADQ_UNLOCK();
1149
1150
bzero(&p, sizeof(p));
1151
SLIST_FOREACH(pfoe, &queue, next) {
1152
counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
1153
if (V_pf_status.debug >= PF_DEBUG_MISC) {
1154
printf("%s: blocking address ", __func__);
1155
pf_print_host(&pfoe->addr, 0, pfoe->af);
1156
printf("\n");
1157
}
1158
1159
p.pfra_af = pfoe->af;
1160
switch (pfoe->af) {
1161
#ifdef INET
1162
case AF_INET:
1163
p.pfra_net = 32;
1164
p.pfra_ip4addr = pfoe->addr.v4;
1165
break;
1166
#endif /* INET */
1167
#ifdef INET6
1168
case AF_INET6:
1169
p.pfra_net = 128;
1170
p.pfra_ip6addr = pfoe->addr.v6;
1171
break;
1172
#endif /* INET6 */
1173
default:
1174
unhandled_af(pfoe->af);
1175
}
1176
1177
PF_RULES_WLOCK();
1178
pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
1179
PF_RULES_WUNLOCK();
1180
}
1181
1182
/*
1183
* Remove those entries that don't need flushing.
1184
*/
1185
SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
1186
if (pfoe->rule->flush == 0) {
1187
SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
1188
free(pfoe, M_PFTEMP);
1189
} else
1190
counter_u64_add(
1191
V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);
1192
1193
/* If nothing to flush, return. */
1194
if (SLIST_EMPTY(&queue)) {
1195
CURVNET_RESTORE();
1196
return;
1197
}
1198
1199
for (int i = 0; i <= V_pf_hashmask; i++) {
1200
struct pf_idhash *ih = &V_pf_idhash[i];
1201
struct pf_state_key *sk;
1202
struct pf_kstate *s;
1203
1204
PF_HASHROW_LOCK(ih);
1205
LIST_FOREACH(s, &ih->states, entry) {
1206
sk = s->key[PF_SK_WIRE];
1207
SLIST_FOREACH(pfoe, &queue, next)
1208
if (sk->af == pfoe->af &&
1209
((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
1210
pfoe->rule == s->rule) &&
1211
((pfoe->dir == PF_OUT &&
1212
PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
1213
(pfoe->dir == PF_IN &&
1214
PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
1215
s->timeout = PFTM_PURGE;
1216
pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED);
1217
killed++;
1218
}
1219
}
1220
PF_HASHROW_UNLOCK(ih);
1221
}
1222
SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
1223
free(pfoe, M_PFTEMP);
1224
if (V_pf_status.debug >= PF_DEBUG_MISC)
1225
printf("%s: %u states killed", __func__, killed);
1226
1227
CURVNET_RESTORE();
1228
}
1229
1230
/*
1231
* On node found, always returns locked. On not found, locking is configurable.
1232
*/
1233
struct pf_ksrc_node *
1234
pf_find_src_node(struct pf_addr *src, struct pf_krule *rule, sa_family_t af,
1235
struct pf_srchash **sh, pf_sn_types_t sn_type, bool returnlocked)
1236
{
1237
struct pf_ksrc_node *n;
1238
1239
counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
1240
1241
*sh = &V_pf_srchash[pf_hashsrc(src, af)];
1242
PF_HASHROW_LOCK(*sh);
1243
LIST_FOREACH(n, &(*sh)->nodes, entry)
1244
if (n->rule == rule && n->af == af && n->type == sn_type &&
1245
((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
1246
(af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
1247
break;
1248
1249
if (n == NULL && !returnlocked)
1250
PF_HASHROW_UNLOCK(*sh);
1251
1252
return (n);
1253
}
1254
1255
bool
1256
pf_src_node_exists(struct pf_ksrc_node **sn, struct pf_srchash *sh)
1257
{
1258
struct pf_ksrc_node *cur;
1259
1260
if ((*sn) == NULL)
1261
return (false);
1262
1263
KASSERT(sh != NULL, ("%s: sh is NULL", __func__));
1264
1265
counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
1266
PF_HASHROW_LOCK(sh);
1267
LIST_FOREACH(cur, &(sh->nodes), entry) {
1268
if (cur == (*sn) &&
1269
cur->expire != 1) /* Ignore nodes being killed */
1270
return (true);
1271
}
1272
PF_HASHROW_UNLOCK(sh);
1273
(*sn) = NULL;
1274
return (false);
1275
}
1276
1277
void
1278
pf_free_src_node(struct pf_ksrc_node *sn)
1279
{
1280
1281
for (int i = 0; i < 2; i++) {
1282
counter_u64_free(sn->bytes[i]);
1283
counter_u64_free(sn->packets[i]);
1284
}
1285
counter_rate_free(sn->conn_rate.cr);
1286
uma_zfree(V_pf_sources_z, sn);
1287
}
1288
1289
static u_short
1290
pf_insert_src_node(struct pf_ksrc_node *sns[PF_SN_MAX],
1291
struct pf_srchash *snhs[PF_SN_MAX], struct pf_krule *rule,
1292
struct pf_addr *src, sa_family_t af, struct pf_addr *raddr,
1293
struct pfi_kkif *rkif, sa_family_t raf, pf_sn_types_t sn_type)
1294
{
1295
u_short reason = 0;
1296
struct pf_krule *r_track = rule;
1297
struct pf_ksrc_node **sn = &(sns[sn_type]);
1298
struct pf_srchash **sh = &(snhs[sn_type]);
1299
1300
KASSERT(sn_type != PF_SN_LIMIT || (raddr == NULL && rkif == NULL),
1301
("%s: raddr and rkif must be NULL for PF_SN_LIMIT", __func__));
1302
1303
KASSERT(sn_type != PF_SN_LIMIT || (rule->rule_flag & PFRULE_SRCTRACK),
1304
("%s: PF_SN_LIMIT only valid for rules with PFRULE_SRCTRACK", __func__));
1305
1306
/*
1307
* XXX: There could be a KASSERT for
1308
* sn_type == PF_SN_LIMIT || (pool->opts & PF_POOL_STICKYADDR)
1309
* but we'd need to pass pool *only* for this KASSERT.
1310
*/
1311
1312
if ( (rule->rule_flag & PFRULE_SRCTRACK) &&
1313
!(rule->rule_flag & PFRULE_RULESRCTRACK))
1314
r_track = &V_pf_default_rule;
1315
1316
/*
1317
* Request the sh to always be locked, as we might insert a new sn.
1318
*/
1319
if (*sn == NULL)
1320
*sn = pf_find_src_node(src, r_track, af, sh, sn_type, true);
1321
1322
if (*sn == NULL) {
1323
PF_HASHROW_ASSERT(*sh);
1324
1325
if (sn_type == PF_SN_LIMIT && rule->max_src_nodes &&
1326
counter_u64_fetch(r_track->src_nodes[sn_type]) >= rule->max_src_nodes) {
1327
counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES], 1);
1328
reason = PFRES_SRCLIMIT;
1329
goto done;
1330
}
1331
1332
(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
1333
if ((*sn) == NULL) {
1334
reason = PFRES_MEMORY;
1335
goto done;
1336
}
1337
1338
for (int i = 0; i < 2; i++) {
1339
(*sn)->bytes[i] = counter_u64_alloc(M_NOWAIT);
1340
(*sn)->packets[i] = counter_u64_alloc(M_NOWAIT);
1341
1342
if ((*sn)->bytes[i] == NULL || (*sn)->packets[i] == NULL) {
1343
pf_free_src_node(*sn);
1344
reason = PFRES_MEMORY;
1345
goto done;
1346
}
1347
}
1348
1349
if (sn_type == PF_SN_LIMIT)
1350
if (! pf_init_threshold(&(*sn)->conn_rate,
1351
rule->max_src_conn_rate.limit,
1352
rule->max_src_conn_rate.seconds)) {
1353
pf_free_src_node(*sn);
1354
reason = PFRES_MEMORY;
1355
goto done;
1356
}
1357
1358
MPASS((*sn)->lock == NULL);
1359
(*sn)->lock = &(*sh)->lock;
1360
1361
(*sn)->af = af;
1362
(*sn)->rule = r_track;
1363
pf_addrcpy(&(*sn)->addr, src, af);
1364
if (raddr != NULL)
1365
pf_addrcpy(&(*sn)->raddr, raddr, raf);
1366
(*sn)->rkif = rkif;
1367
(*sn)->raf = raf;
1368
LIST_INSERT_HEAD(&(*sh)->nodes, *sn, entry);
1369
(*sn)->creation = time_uptime;
1370
(*sn)->ruletype = rule->action;
1371
(*sn)->type = sn_type;
1372
counter_u64_add(r_track->src_nodes[sn_type], 1);
1373
counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
1374
} else {
1375
if (sn_type == PF_SN_LIMIT && rule->max_src_states &&
1376
(*sn)->states >= rule->max_src_states) {
1377
counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
1378
1);
1379
reason = PFRES_SRCLIMIT;
1380
goto done;
1381
}
1382
}
1383
done:
1384
if (reason == 0)
1385
(*sn)->states++;
1386
else
1387
(*sn) = NULL;
1388
1389
PF_HASHROW_UNLOCK(*sh);
1390
return (reason);
1391
}
1392
1393
void
1394
pf_unlink_src_node(struct pf_ksrc_node *src)
1395
{
1396
PF_SRC_NODE_LOCK_ASSERT(src);
1397
1398
LIST_REMOVE(src, entry);
1399
if (src->rule)
1400
counter_u64_add(src->rule->src_nodes[src->type], -1);
1401
}
1402
1403
u_int
1404
pf_free_src_nodes(struct pf_ksrc_node_list *head)
1405
{
1406
struct pf_ksrc_node *sn, *tmp;
1407
u_int count = 0;
1408
1409
LIST_FOREACH_SAFE(sn, head, entry, tmp) {
1410
pf_free_src_node(sn);
1411
count++;
1412
}
1413
1414
counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count);
1415
1416
return (count);
1417
}
1418
1419
void
1420
pf_mtag_initialize(void)
1421
{
1422
1423
pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
1424
sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
1425
UMA_ALIGN_PTR, 0);
1426
}
1427
1428
/* Per-vnet data storage structures initialization. */
1429
void
1430
pf_initialize(void)
1431
{
1432
struct pf_keyhash *kh;
1433
struct pf_idhash *ih;
1434
struct pf_srchash *sh;
1435
struct pf_udpendpointhash *uh;
1436
u_int i;
1437
1438
if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
1439
V_pf_hashsize = PF_HASHSIZ;
1440
if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
1441
V_pf_srchashsize = PF_SRCHASHSIZ;
1442
if (V_pf_udpendpointhashsize == 0 || !powerof2(V_pf_udpendpointhashsize))
1443
V_pf_udpendpointhashsize = PF_UDPENDHASHSIZ;
1444
1445
V_pf_hashseed = arc4random();
1446
1447
/* States and state keys storage. */
1448
V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_kstate),
1449
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1450
V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
1451
uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
1452
uma_zone_set_warning(V_pf_state_z, "PF states limit reached");
1453
1454
V_pf_state_key_z = uma_zcreate("pf state keys",
1455
sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
1456
UMA_ALIGN_PTR, 0);
1457
1458
V_pf_keyhash = mallocarray(V_pf_hashsize, sizeof(struct pf_keyhash),
1459
M_PFHASH, M_NOWAIT | M_ZERO);
1460
V_pf_idhash = mallocarray(V_pf_hashsize, sizeof(struct pf_idhash),
1461
M_PFHASH, M_NOWAIT | M_ZERO);
1462
if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
1463
printf("pf: Unable to allocate memory for "
1464
"state_hashsize %lu.\n", V_pf_hashsize);
1465
1466
free(V_pf_keyhash, M_PFHASH);
1467
free(V_pf_idhash, M_PFHASH);
1468
1469
V_pf_hashsize = PF_HASHSIZ;
1470
V_pf_keyhash = mallocarray(V_pf_hashsize,
1471
sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
1472
V_pf_idhash = mallocarray(V_pf_hashsize,
1473
sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
1474
}
1475
1476
V_pf_hashmask = V_pf_hashsize - 1;
1477
for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
1478
i++, kh++, ih++) {
1479
mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
1480
mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
1481
}
1482
1483
/* Source nodes. */
1484
V_pf_sources_z = uma_zcreate("pf source nodes",
1485
sizeof(struct pf_ksrc_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1486
0);
1487
V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
1488
uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
1489
uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
1490
1491
V_pf_srchash = mallocarray(V_pf_srchashsize,
1492
sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
1493
if (V_pf_srchash == NULL) {
1494
printf("pf: Unable to allocate memory for "
1495
"source_hashsize %lu.\n", V_pf_srchashsize);
1496
1497
V_pf_srchashsize = PF_SRCHASHSIZ;
1498
V_pf_srchash = mallocarray(V_pf_srchashsize,
1499
sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
1500
}
1501
1502
V_pf_srchashmask = V_pf_srchashsize - 1;
1503
for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
1504
mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
1505
1506
1507
/* UDP endpoint mappings. */
1508
V_pf_udp_mapping_z = uma_zcreate("pf UDP mappings",
1509
sizeof(struct pf_udp_mapping), NULL, NULL, NULL, NULL,
1510
UMA_ALIGN_PTR, 0);
1511
V_pf_udpendpointhash = mallocarray(V_pf_udpendpointhashsize,
1512
sizeof(struct pf_udpendpointhash), M_PFHASH, M_NOWAIT | M_ZERO);
1513
if (V_pf_udpendpointhash == NULL) {
1514
printf("pf: Unable to allocate memory for "
1515
"udpendpoint_hashsize %lu.\n", V_pf_udpendpointhashsize);
1516
1517
V_pf_udpendpointhashsize = PF_UDPENDHASHSIZ;
1518
V_pf_udpendpointhash = mallocarray(V_pf_udpendpointhashsize,
1519
sizeof(struct pf_udpendpointhash), M_PFHASH, M_WAITOK | M_ZERO);
1520
}
1521
1522
V_pf_udpendpointhashmask = V_pf_udpendpointhashsize - 1;
1523
for (i = 0, uh = V_pf_udpendpointhash;
1524
i <= V_pf_udpendpointhashmask;
1525
i++, uh++) {
1526
mtx_init(&uh->lock, "pf_udpendpointhash", NULL,
1527
MTX_DEF | MTX_DUPOK);
1528
}
1529
1530
/* Anchors */
1531
V_pf_anchor_z = uma_zcreate("pf anchors",
1532
sizeof(struct pf_kanchor), NULL, NULL, NULL, NULL,
1533
UMA_ALIGN_PTR, 0);
1534
V_pf_limits[PF_LIMIT_ANCHORS].zone = V_pf_anchor_z;
1535
uma_zone_set_max(V_pf_anchor_z, PF_ANCHOR_HIWAT);
1536
uma_zone_set_warning(V_pf_anchor_z, "PF anchor limit reached");
1537
1538
V_pf_eth_anchor_z = uma_zcreate("pf Ethernet anchors",
1539
sizeof(struct pf_keth_anchor), NULL, NULL, NULL, NULL,
1540
UMA_ALIGN_PTR, 0);
1541
V_pf_limits[PF_LIMIT_ETH_ANCHORS].zone = V_pf_eth_anchor_z;
1542
uma_zone_set_max(V_pf_eth_anchor_z, PF_ANCHOR_HIWAT);
1543
uma_zone_set_warning(V_pf_eth_anchor_z, "PF Ethernet anchor limit reached");
1544
1545
/* ALTQ */
1546
TAILQ_INIT(&V_pf_altqs[0]);
1547
TAILQ_INIT(&V_pf_altqs[1]);
1548
TAILQ_INIT(&V_pf_altqs[2]);
1549
TAILQ_INIT(&V_pf_altqs[3]);
1550
TAILQ_INIT(&V_pf_pabuf[0]);
1551
TAILQ_INIT(&V_pf_pabuf[1]);
1552
TAILQ_INIT(&V_pf_pabuf[2]);
1553
V_pf_altqs_active = &V_pf_altqs[0];
1554
V_pf_altq_ifs_active = &V_pf_altqs[1];
1555
V_pf_altqs_inactive = &V_pf_altqs[2];
1556
V_pf_altq_ifs_inactive = &V_pf_altqs[3];
1557
1558
/* Send & overload+flush queues. */
1559
STAILQ_INIT(&V_pf_sendqueue);
1560
SLIST_INIT(&V_pf_overloadqueue);
1561
TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
1562
1563
/* Unlinked, but may be referenced rules. */
1564
TAILQ_INIT(&V_pf_unlinked_rules);
1565
1566
/* State limiters */
1567
RB_INIT(&V_pf_statelim_id_tree_inactive);
1568
RB_INIT(&V_pf_statelim_nm_tree_inactive);
1569
TAILQ_INIT(&V_pf_statelim_list_inactive);
1570
1571
RB_INIT(&V_pf_statelim_id_tree_active);
1572
TAILQ_INIT(&V_pf_statelim_list_active);
1573
1574
/* Source limiters */
1575
RB_INIT(&V_pf_sourcelim_id_tree_active);
1576
TAILQ_INIT(&V_pf_sourcelim_list_active);
1577
1578
RB_INIT(&V_pf_sourcelim_id_tree_inactive);
1579
RB_INIT(&V_pf_sourcelim_nm_tree_inactive);
1580
TAILQ_INIT(&V_pf_sourcelim_list_inactive);
1581
}
1582
1583
void
1584
pf_mtag_cleanup(void)
1585
{
1586
1587
uma_zdestroy(pf_mtag_z);
1588
}
1589
1590
void
1591
pf_cleanup(void)
1592
{
1593
struct pf_keyhash *kh;
1594
struct pf_idhash *ih;
1595
struct pf_srchash *sh;
1596
struct pf_udpendpointhash *uh;
1597
struct pf_send_entry *pfse, *next;
1598
u_int i;
1599
1600
for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash;
1601
i <= V_pf_hashmask;
1602
i++, kh++, ih++) {
1603
KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
1604
__func__));
1605
KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
1606
__func__));
1607
mtx_destroy(&kh->lock);
1608
mtx_destroy(&ih->lock);
1609
}
1610
free(V_pf_keyhash, M_PFHASH);
1611
free(V_pf_idhash, M_PFHASH);
1612
1613
for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
1614
KASSERT(LIST_EMPTY(&sh->nodes),
1615
("%s: source node hash not empty", __func__));
1616
mtx_destroy(&sh->lock);
1617
}
1618
free(V_pf_srchash, M_PFHASH);
1619
1620
for (i = 0, uh = V_pf_udpendpointhash;
1621
i <= V_pf_udpendpointhashmask;
1622
i++, uh++) {
1623
KASSERT(LIST_EMPTY(&uh->endpoints),
1624
("%s: udp endpoint hash not empty", __func__));
1625
mtx_destroy(&uh->lock);
1626
}
1627
free(V_pf_udpendpointhash, M_PFHASH);
1628
1629
STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
1630
m_freem(pfse->pfse_m);
1631
free(pfse, M_PFTEMP);
1632
}
1633
MPASS(RB_EMPTY(&V_pf_sctp_endpoints));
1634
1635
uma_zdestroy(V_pf_sources_z);
1636
uma_zdestroy(V_pf_state_z);
1637
uma_zdestroy(V_pf_state_key_z);
1638
uma_zdestroy(V_pf_udp_mapping_z);
1639
uma_zdestroy(V_pf_anchor_z);
1640
uma_zdestroy(V_pf_eth_anchor_z);
1641
}
1642
1643
static int
1644
pf_mtag_uminit(void *mem, int size, int how)
1645
{
1646
struct m_tag *t;
1647
1648
t = (struct m_tag *)mem;
1649
t->m_tag_cookie = MTAG_ABI_COMPAT;
1650
t->m_tag_id = PACKET_TAG_PF;
1651
t->m_tag_len = sizeof(struct pf_mtag);
1652
t->m_tag_free = pf_mtag_free;
1653
1654
return (0);
1655
}
1656
1657
static void
1658
pf_mtag_free(struct m_tag *t)
1659
{
1660
1661
uma_zfree(pf_mtag_z, t);
1662
}
1663
1664
struct pf_mtag *
1665
pf_get_mtag(struct mbuf *m)
1666
{
1667
struct m_tag *mtag;
1668
1669
if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
1670
return ((struct pf_mtag *)(mtag + 1));
1671
1672
mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
1673
if (mtag == NULL)
1674
return (NULL);
1675
bzero(mtag + 1, sizeof(struct pf_mtag));
1676
m_tag_prepend(m, mtag);
1677
1678
return ((struct pf_mtag *)(mtag + 1));
1679
}
1680
1681
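/*
* Attach a state to its wire- and stack-side keys. skw describes the
* connection as seen on the wire, sks as seen by the local stack;
* callers pass the same key when no translation applies. On success
* the state is linked into the key hash (reusing an existing key where
* one matches) and the function returns with the state's ID hash row
* locked.
*/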
static int
1682
pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
1683
struct pf_kstate *s)
1684
{
1685
struct pf_keyhash *khs, *khw, *kh;
1686
struct pf_state_key *sk, *cur;
1687
struct pf_kstate *si, *olds = NULL;
1688
int idx;
1689
1690
NET_EPOCH_ASSERT();
1691
KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
1692
KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
1693
KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
1694
1695
/*
1696
* We need to lock hash slots of both keys. To avoid deadlock
1697
* we always lock the slot with lower address first. Unlock order
1698
* isn't important.
1699
*
1700
* We also need to lock ID hash slot before dropping key
1701
* locks. On success we return with ID hash slot locked.
1702
*/
1703
1704
if (skw == sks) {
1705
khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
1706
PF_HASHROW_LOCK(khs);
1707
} else {
1708
khs = &V_pf_keyhash[pf_hashkey(sks)];
1709
khw = &V_pf_keyhash[pf_hashkey(skw)];
1710
if (khs == khw) {
1711
PF_HASHROW_LOCK(khs);
1712
} else if (khs < khw) {
1713
PF_HASHROW_LOCK(khs);
1714
PF_HASHROW_LOCK(khw);
1715
} else {
1716
PF_HASHROW_LOCK(khw);
1717
PF_HASHROW_LOCK(khs);
1718
}
1719
}
1720
1721
#define KEYS_UNLOCK() do { \
1722
if (khs != khw) { \
1723
PF_HASHROW_UNLOCK(khs); \
1724
PF_HASHROW_UNLOCK(khw); \
1725
} else \
1726
PF_HASHROW_UNLOCK(khs); \
1727
} while (0)
1728
1729
/*
1730
* First run: start with wire key.
1731
*/
1732
sk = skw;
1733
kh = khw;
1734
idx = PF_SK_WIRE;
1735
1736
MPASS(s->lock == NULL);
1737
s->lock = &V_pf_idhash[PF_IDHASH(s)].lock;
1738
1739
keyattach:
1740
LIST_FOREACH(cur, &kh->keys, entry)
1741
if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
1742
break;
1743
1744
if (cur != NULL) {
1745
/* Key exists. Check for same kif, if none, add to key. */
1746
TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
1747
struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
1748
1749
PF_HASHROW_LOCK(ih);
1750
if (si->kif == s->kif &&
1751
((si->key[PF_SK_WIRE]->af == sk->af &&
1752
si->direction == s->direction) ||
1753
(si->key[PF_SK_WIRE]->af !=
1754
si->key[PF_SK_STACK]->af &&
1755
sk->af == si->key[PF_SK_STACK]->af &&
1756
si->direction != s->direction))) {
1757
bool reuse = false;
1758
1759
if (sk->proto == IPPROTO_TCP &&
1760
si->src.state >= TCPS_FIN_WAIT_2 &&
1761
si->dst.state >= TCPS_FIN_WAIT_2)
1762
reuse = true;
1763
1764
if (V_pf_status.debug >= PF_DEBUG_MISC) {
1765
printf("pf: %s key attach "
1766
"%s on %s: ",
1767
(idx == PF_SK_WIRE) ?
1768
"wire" : "stack",
1769
reuse ? "reuse" : "failed",
1770
s->kif->pfik_name);
1771
pf_print_state_parts(s,
1772
(idx == PF_SK_WIRE) ?
1773
sk : NULL,
1774
(idx == PF_SK_STACK) ?
1775
sk : NULL);
1776
printf(", existing: ");
1777
pf_print_state_parts(si,
1778
(idx == PF_SK_WIRE) ?
1779
sk : NULL,
1780
(idx == PF_SK_STACK) ?
1781
sk : NULL);
1782
printf("\n");
1783
}
1784
1785
if (reuse) {
1786
/*
1787
* New state matches an old >FIN_WAIT_2
1788
* state. We can't drop key hash locks,
1789
* thus we can't unlink it properly.
1790
*
1791
* As a workaround we drop it into
1792
* TCPS_CLOSED state, schedule purge
1793
* ASAP and push it into the very end
1794
* of the slot TAILQ, so that it won't
1795
* conflict with our new state.
1796
*/
1797
pf_set_protostate(si, PF_PEER_BOTH,
1798
TCPS_CLOSED);
1799
si->timeout = PFTM_PURGE;
1800
olds = si;
1801
} else {
1802
s->timeout = PFTM_UNLINKED;
1803
if (idx == PF_SK_STACK)
1804
/*
1805
* Remove the wire key from
1806
* the hash. Other threads
1807
* can't be referencing it
1808
* because we still hold the
1809
* hash lock.
1810
*/
1811
pf_state_key_detach(s,
1812
PF_SK_WIRE);
1813
PF_HASHROW_UNLOCK(ih);
1814
KEYS_UNLOCK();
1815
if (idx == PF_SK_WIRE)
1816
/*
1817
* We've not inserted either key.
1818
* Free both.
1819
*/
1820
uma_zfree(V_pf_state_key_z, skw);
1821
if (skw != sks)
1822
uma_zfree(
1823
V_pf_state_key_z,
1824
sks);
1825
return (EEXIST); /* collision! */
1826
}
1827
}
1828
PF_HASHROW_UNLOCK(ih);
1829
}
1830
uma_zfree(V_pf_state_key_z, sk);
1831
s->key[idx] = cur;
1832
} else {
1833
LIST_INSERT_HEAD(&kh->keys, sk, entry);
1834
s->key[idx] = sk;
1835
}
1836
1837
stateattach:
1838
/* List is sorted, if-bound states before floating. */
1839
if (s->kif == V_pfi_all)
1840
TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
1841
else
1842
TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
1843
1844
if (olds) {
1845
TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
1846
TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
1847
key_list[idx]);
1848
olds = NULL;
1849
}
1850
1851
/*
1852
* Attach done. Now decide whether (and how) we should
1853
* attach a second key.
1854
*/
1855
if (sks == skw) {
1856
s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
1857
idx = PF_SK_STACK;
1858
sks = NULL;
1859
goto stateattach;
1860
} else if (sks != NULL) {
1861
/*
1862
* Continue attaching with stack key.
1863
*/
1864
sk = sks;
1865
kh = khs;
1866
idx = PF_SK_STACK;
1867
sks = NULL;
1868
goto keyattach;
1869
}
1870
1871
PF_STATE_LOCK(s);
1872
KEYS_UNLOCK();
1873
1874
KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
1875
("%s failure", __func__));
1876
1877
return (0);
1878
#undef KEYS_UNLOCK
1879
}
1880
1881
static void
1882
pf_detach_state(struct pf_kstate *s)
1883
{
1884
struct pf_state_key *sks = s->key[PF_SK_STACK];
1885
struct pf_keyhash *kh;
1886
1887
NET_EPOCH_ASSERT();
1888
MPASS(s->timeout >= PFTM_MAX);
1889
1890
pf_sctp_multihome_detach_addr(s);
1891
1892
if ((s->state_flags & PFSTATE_PFLOW) && V_pflow_export_state_ptr)
1893
V_pflow_export_state_ptr(s);
1894
1895
if (sks != NULL) {
1896
kh = &V_pf_keyhash[pf_hashkey(sks)];
1897
PF_HASHROW_LOCK(kh);
1898
if (s->key[PF_SK_STACK] != NULL)
1899
pf_state_key_detach(s, PF_SK_STACK);
1900
/*
1901
* If both point to same key, then we are done.
1902
*/
1903
if (sks == s->key[PF_SK_WIRE]) {
1904
pf_state_key_detach(s, PF_SK_WIRE);
1905
PF_HASHROW_UNLOCK(kh);
1906
return;
1907
}
1908
PF_HASHROW_UNLOCK(kh);
1909
}
1910
1911
if (s->key[PF_SK_WIRE] != NULL) {
1912
kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
1913
PF_HASHROW_LOCK(kh);
1914
if (s->key[PF_SK_WIRE] != NULL)
1915
pf_state_key_detach(s, PF_SK_WIRE);
1916
PF_HASHROW_UNLOCK(kh);
1917
}
1918
}
1919
1920
static void
1921
pf_state_key_detach(struct pf_kstate *s, int idx)
1922
{
1923
struct pf_state_key *sk = s->key[idx];
1924
#ifdef INVARIANTS
1925
struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
1926
1927
PF_HASHROW_ASSERT(kh);
1928
#endif /* INVARIANTS */
1929
TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
1930
s->key[idx] = NULL;
1931
1932
if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
1933
LIST_REMOVE(sk, entry);
1934
uma_zfree(V_pf_state_key_z, sk);
1935
}
1936
}
1937
1938
static int
1939
pf_state_key_ctor(void *mem, int size, void *arg, int flags)
1940
{
1941
struct pf_state_key *sk = mem;
1942
1943
bzero(sk, sizeof(struct pf_state_key_cmp));
1944
TAILQ_INIT(&sk->states[PF_SK_WIRE]);
1945
TAILQ_INIT(&sk->states[PF_SK_STACK]);
1946
1947
return (0);
1948
}
1949
1950
static int
1951
pf_state_key_addr_setup(struct pf_pdesc *pd,
1952
struct pf_state_key_cmp *key, int multi)
1953
{
1954
struct pf_addr *saddr = pd->src;
1955
struct pf_addr *daddr = pd->dst;
1956
#ifdef INET6
1957
struct nd_neighbor_solicit nd;
1958
struct pf_addr *target;
1959
1960
if (pd->af == AF_INET || pd->proto != IPPROTO_ICMPV6)
1961
goto copy;
1962
1963
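/*
 * Descriptive note (not from upstream): neighbor solicitations and
 * advertisements are keyed on the ND target address rather than the
 * (possibly multicast) packet addresses, so a solicitation and the
 * corresponding advertisement can match the same state.
 */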
switch (pd->hdr.icmp6.icmp6_type) {
1964
case ND_NEIGHBOR_SOLICIT:
1965
if (multi)
1966
return (-1);
1967
if (!pf_pull_hdr(pd->m, pd->off, &nd, sizeof(nd), NULL,
1968
pd->af))
1969
return (-1);
1970
target = (struct pf_addr *)&nd.nd_ns_target;
1971
daddr = target;
1972
break;
1973
case ND_NEIGHBOR_ADVERT:
1974
if (multi)
1975
return (-1);
1976
if (!pf_pull_hdr(pd->m, pd->off, &nd, sizeof(nd), NULL,
1977
pd->af))
1978
return (-1);
1979
target = (struct pf_addr *)&nd.nd_ns_target;
1980
saddr = target;
1981
if (IN6_IS_ADDR_MULTICAST(&pd->dst->v6)) {
1982
key->addr[pd->didx].addr32[0] = 0;
1983
key->addr[pd->didx].addr32[1] = 0;
1984
key->addr[pd->didx].addr32[2] = 0;
1985
key->addr[pd->didx].addr32[3] = 0;
1986
daddr = NULL; /* overwritten */
1987
}
1988
break;
1989
default:
1990
if (multi) {
1991
key->addr[pd->sidx].addr32[0] = IPV6_ADDR_INT32_MLL;
1992
key->addr[pd->sidx].addr32[1] = 0;
1993
key->addr[pd->sidx].addr32[2] = 0;
1994
key->addr[pd->sidx].addr32[3] = IPV6_ADDR_INT32_ONE;
1995
saddr = NULL; /* overwritten */
1996
}
1997
}
1998
copy:
1999
#endif /* INET6 */
2000
if (saddr)
2001
pf_addrcpy(&key->addr[pd->sidx], saddr, pd->af);
2002
if (daddr)
2003
pf_addrcpy(&key->addr[pd->didx], daddr, pd->af);
2004
2005
return (0);
2006
}
2007
2008
int
2009
pf_state_key_setup(struct pf_pdesc *pd, u_int16_t sport, u_int16_t dport,
2010
struct pf_state_key **sk, struct pf_state_key **nk)
2011
{
2012
*sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
2013
if (*sk == NULL)
2014
return (ENOMEM);
2015
2016
if (pf_state_key_addr_setup(pd, (struct pf_state_key_cmp *)*sk,
2017
0)) {
2018
uma_zfree(V_pf_state_key_z, *sk);
2019
*sk = NULL;
2020
return (ENOMEM);
2021
}
2022
2023
(*sk)->port[pd->sidx] = sport;
2024
(*sk)->port[pd->didx] = dport;
2025
(*sk)->proto = pd->proto;
2026
(*sk)->af = pd->af;
2027
2028
*nk = pf_state_key_clone(*sk);
2029
if (*nk == NULL) {
2030
uma_zfree(V_pf_state_key_z, *sk);
2031
*sk = NULL;
2032
return (ENOMEM);
2033
}
2034
2035
if (pd->af != pd->naf) {
2036
(*sk)->port[pd->sidx] = pd->osport;
2037
(*sk)->port[pd->didx] = pd->odport;
2038
2039
(*nk)->af = pd->naf;
2040
2041
/*
2042
* We're overwriting an address here, so there may be bits of an IPv6
2043
* address left in here. Clear that out first.
2044
*/
2045
bzero(&(*nk)->addr[0], sizeof((*nk)->addr[0]));
2046
bzero(&(*nk)->addr[1], sizeof((*nk)->addr[1]));
2047
if (pd->dir == PF_IN) {
2048
pf_addrcpy(&(*nk)->addr[pd->didx], &pd->nsaddr,
2049
pd->naf);
2050
pf_addrcpy(&(*nk)->addr[pd->sidx], &pd->ndaddr,
2051
pd->naf);
2052
(*nk)->port[pd->didx] = pd->nsport;
2053
(*nk)->port[pd->sidx] = pd->ndport;
2054
} else {
2055
pf_addrcpy(&(*nk)->addr[pd->sidx], &pd->nsaddr,
2056
pd->naf);
2057
pf_addrcpy(&(*nk)->addr[pd->didx], &pd->ndaddr,
2058
pd->naf);
2059
(*nk)->port[pd->sidx] = pd->nsport;
2060
(*nk)->port[pd->didx] = pd->ndport;
2061
}
2062
2063
switch (pd->proto) {
2064
case IPPROTO_ICMP:
2065
(*nk)->proto = IPPROTO_ICMPV6;
2066
break;
2067
case IPPROTO_ICMPV6:
2068
(*nk)->proto = IPPROTO_ICMP;
2069
break;
2070
default:
2071
(*nk)->proto = pd->proto;
2072
}
2073
}
2074
2075
return (0);
2076
}
2077
2078
struct pf_state_key *
2079
pf_state_key_clone(const struct pf_state_key *orig)
2080
{
2081
struct pf_state_key *sk;
2082
2083
sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
2084
if (sk == NULL)
2085
return (NULL);
2086
2087
bcopy(orig, sk, sizeof(struct pf_state_key_cmp));
2088
2089
return (sk);
2090
}
2091
2092
int
2093
pf_state_insert(struct pfi_kkif *kif, struct pfi_kkif *orig_kif,
2094
struct pf_state_key *skw, struct pf_state_key *sks, struct pf_kstate *s)
2095
{
2096
struct pf_idhash *ih;
2097
struct pf_kstate *cur;
2098
int error;
2099
2100
NET_EPOCH_ASSERT();
2101
2102
KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
2103
("%s: sks not pristine", __func__));
2104
KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
2105
("%s: skw not pristine", __func__));
2106
KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
2107
2108
s->kif = kif;
2109
s->orig_kif = orig_kif;
2110
2111
if (s->id == 0 && s->creatorid == 0) {
2112
s->id = alloc_unr64(&V_pf_stateid);
2113
s->id = htobe64(s->id);
2114
s->creatorid = V_pf_status.hostid;
2115
}
2116
2117
/* Returns with ID locked on success. */
2118
if ((error = pf_state_key_attach(skw, sks, s)) != 0)
2119
return (error);
2120
skw = sks = NULL;
2121
2122
ih = &V_pf_idhash[PF_IDHASH(s)];
2123
PF_HASHROW_ASSERT(ih);
2124
LIST_FOREACH(cur, &ih->states, entry)
2125
if (cur->id == s->id && cur->creatorid == s->creatorid)
2126
break;
2127
2128
if (cur != NULL) {
2129
s->timeout = PFTM_UNLINKED;
2130
PF_HASHROW_UNLOCK(ih);
2131
if (V_pf_status.debug >= PF_DEBUG_MISC) {
2132
printf("pf: state ID collision: "
2133
"id: %016llx creatorid: %08x\n",
2134
(unsigned long long)be64toh(s->id),
2135
ntohl(s->creatorid));
2136
}
2137
pf_detach_state(s);
2138
return (EEXIST);
2139
}
2140
LIST_INSERT_HEAD(&ih->states, s, entry);
2141
/* One for keys, one for ID hash. */
2142
refcount_init(&s->refs, 2);
2143
2144
pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
2145
if (V_pfsync_insert_state_ptr != NULL)
2146
V_pfsync_insert_state_ptr(s);
2147
2148
/* Returns locked. */
2149
return (0);
2150
}
2151
2152
/*
2153
* Find state by ID: returns with locked row on success.
2154
*/
2155
struct pf_kstate *
2156
pf_find_state_byid(uint64_t id, uint32_t creatorid)
2157
{
2158
struct pf_idhash *ih;
2159
struct pf_kstate *s;
2160
2161
pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
2162
2163
ih = &V_pf_idhash[PF_IDHASHID(id)];
2164
2165
PF_HASHROW_LOCK(ih);
2166
LIST_FOREACH(s, &ih->states, entry)
2167
if (s->id == id && s->creatorid == creatorid)
2168
break;
2169
2170
if (s == NULL)
2171
PF_HASHROW_UNLOCK(ih);
2172
2173
return (s);
2174
}
2175
2176
/*
2177
* Find state by key.
2178
* Returns with ID hash slot locked on success.
2179
*/
2180
static int
2181
pf_find_state(struct pf_pdesc *pd, const struct pf_state_key_cmp *key,
2182
struct pf_kstate **state)
2183
{
2184
struct pf_keyhash *kh;
2185
struct pf_state_key *sk;
2186
struct pf_kstate *s;
2187
int idx;
2188
2189
*state = NULL;
2190
2191
pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
2192
2193
kh = &V_pf_keyhash[pf_hashkey((const struct pf_state_key *)key)];
2194
2195
PF_HASHROW_LOCK(kh);
2196
LIST_FOREACH(sk, &kh->keys, entry)
2197
if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
2198
break;
2199
if (sk == NULL) {
2200
PF_HASHROW_UNLOCK(kh);
2201
return (PF_DROP);
2202
}
2203
2204
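/* Inbound packets match on the wire-side key, outbound on the stack-side key. */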
idx = (pd->dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
2205
2206
/* List is sorted, if-bound states before floating ones. */
2207
TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
2208
if (s->kif == V_pfi_all || s->kif == pd->kif ||
2209
s->orig_kif == pd->kif) {
2210
PF_STATE_LOCK(s);
2211
PF_HASHROW_UNLOCK(kh);
2212
if (__predict_false(s->timeout >= PFTM_MAX)) {
2213
/*
2214
* State is either being processed by
2215
* pf_remove_state() in an other thread, or
2216
* is scheduled for immediate expiry.
2217
*/
2218
PF_STATE_UNLOCK(s);
2219
SDT_PROBE5(pf, ip, state, lookup, pd->kif,
2220
key, (pd->dir), pd, *state);
2221
return (PF_DROP);
2222
}
2223
goto out;
2224
}
2225
2226
/* Look through the other list, in case of AF-TO */
2227
idx = idx == PF_SK_WIRE ? PF_SK_STACK : PF_SK_WIRE;
2228
TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
2229
if (s->key[PF_SK_WIRE]->af == s->key[PF_SK_STACK]->af)
2230
continue;
2231
if (s->kif == V_pfi_all || s->kif == pd->kif ||
2232
s->orig_kif == pd->kif) {
2233
PF_STATE_LOCK(s);
2234
PF_HASHROW_UNLOCK(kh);
2235
if (__predict_false(s->timeout >= PFTM_MAX)) {
2236
/*
2237
* State is either being processed by
2238
* pf_remove_state() in another thread, or
2239
* is scheduled for immediate expiry.
2240
*/
2241
PF_STATE_UNLOCK(s);
2242
SDT_PROBE5(pf, ip, state, lookup, pd->kif,
2243
key, (pd->dir), pd, NULL);
2244
return (PF_DROP);
2245
}
2246
goto out;
2247
}
2248
}
2249
2250
PF_HASHROW_UNLOCK(kh);
2251
2252
out:
2253
SDT_PROBE5(pf, ip, state, lookup, pd->kif, key, (pd->dir), pd, *state);
2254
2255
if (s == NULL || s->timeout == PFTM_PURGE) {
2256
if (s)
2257
PF_STATE_UNLOCK(s);
2258
return (PF_DROP);
2259
}
2260
2261
if ((s)->rule->pktrate.limit && pd->dir == (s)->direction) {
2262
if (pf_check_threshold(&(s)->rule->pktrate)) {
2263
PF_STATE_UNLOCK(s);
2264
return (PF_DROP);
2265
}
2266
}
2267
if (PACKET_LOOPED(pd)) {
2268
PF_STATE_UNLOCK(s);
2269
return (PF_PASS);
2270
}
2271
2272
*state = s;
2273
2274
return (PF_MATCH);
2275
}
2276
2277
/*
2278
* Returns with ID hash slot locked on success.
2279
*/
2280
struct pf_kstate *
2281
pf_find_state_all(const struct pf_state_key_cmp *key, u_int dir, int *more)
2282
{
2283
struct pf_keyhash *kh;
2284
struct pf_state_key *sk;
2285
struct pf_kstate *s, *ret = NULL;
2286
int idx, inout = 0;
2287
2288
if (more != NULL)
2289
*more = 0;
2290
2291
pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
2292
2293
kh = &V_pf_keyhash[pf_hashkey((const struct pf_state_key *)key)];
2294
2295
PF_HASHROW_LOCK(kh);
2296
LIST_FOREACH(sk, &kh->keys, entry)
2297
if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
2298
break;
2299
if (sk == NULL) {
2300
PF_HASHROW_UNLOCK(kh);
2301
return (NULL);
2302
}
2303
switch (dir) {
2304
case PF_IN:
2305
idx = PF_SK_WIRE;
2306
break;
2307
case PF_OUT:
2308
idx = PF_SK_STACK;
2309
break;
2310
case PF_INOUT:
2311
idx = PF_SK_WIRE;
2312
inout = 1;
2313
break;
2314
default:
2315
panic("%s: dir %u", __func__, dir);
2316
}
2317
second_run:
2318
TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
2319
if (more == NULL) {
2320
PF_STATE_LOCK(s);
2321
PF_HASHROW_UNLOCK(kh);
2322
return (s);
2323
}
2324
2325
if (ret)
2326
(*more)++;
2327
else {
2328
ret = s;
2329
PF_STATE_LOCK(s);
2330
}
2331
}
2332
if (inout == 1) {
2333
inout = 0;
2334
idx = PF_SK_STACK;
2335
goto second_run;
2336
}
2337
PF_HASHROW_UNLOCK(kh);
2338
2339
return (ret);
2340
}
2341
2342
/*
2343
* FIXME
2344
* This routine is inefficient -- locks the state only to unlock immediately on
2345
* return.
2346
* It is racy -- after the state is unlocked nothing stops other threads from
2347
* removing it.
2348
*/
2349
bool
2350
pf_find_state_all_exists(const struct pf_state_key_cmp *key, u_int dir)
2351
{
2352
struct pf_kstate *s;
2353
2354
s = pf_find_state_all(key, dir, NULL);
2355
if (s != NULL) {
2356
PF_STATE_UNLOCK(s);
2357
return (true);
2358
}
2359
return (false);
2360
}
2361
2362
void
2363
pf_state_peer_hton(const struct pf_state_peer *s, struct pf_state_peer_export *d)
2364
{
2365
d->seqlo = htonl(s->seqlo);
2366
d->seqhi = htonl(s->seqhi);
2367
d->seqdiff = htonl(s->seqdiff);
2368
d->max_win = htons(s->max_win);
2369
d->mss = htons(s->mss);
2370
d->state = s->state;
2371
d->wscale = s->wscale;
2372
if (s->scrub) {
2373
d->scrub.pfss_flags = htons(
2374
s->scrub->pfss_flags & PFSS_TIMESTAMP);
2375
d->scrub.pfss_ttl = (s)->scrub->pfss_ttl;
2376
d->scrub.pfss_ts_mod = htonl((s)->scrub->pfss_ts_mod);
2377
d->scrub.scrub_flag = PF_SCRUB_FLAG_VALID;
2378
}
2379
}
2380
2381
void
2382
pf_state_peer_ntoh(const struct pf_state_peer_export *s, struct pf_state_peer *d)
2383
{
2384
d->seqlo = ntohl(s->seqlo);
2385
d->seqhi = ntohl(s->seqhi);
2386
d->seqdiff = ntohl(s->seqdiff);
2387
d->max_win = ntohs(s->max_win);
2388
d->mss = ntohs(s->mss);
2389
d->state = s->state;
2390
d->wscale = s->wscale;
2391
if (s->scrub.scrub_flag == PF_SCRUB_FLAG_VALID &&
2392
d->scrub != NULL) {
2393
d->scrub->pfss_flags = ntohs(s->scrub.pfss_flags) &
2394
PFSS_TIMESTAMP;
2395
d->scrub->pfss_ttl = s->scrub.pfss_ttl;
2396
d->scrub->pfss_ts_mod = ntohl(s->scrub.pfss_ts_mod);
2397
}
2398
}
2399
2400
struct pf_udp_mapping *
2401
pf_udp_mapping_create(sa_family_t af, struct pf_addr *src_addr, uint16_t src_port,
2402
struct pf_addr *nat_addr, uint16_t nat_port)
2403
{
2404
struct pf_udp_mapping *mapping;
2405
2406
mapping = uma_zalloc(V_pf_udp_mapping_z, M_NOWAIT | M_ZERO);
2407
if (mapping == NULL)
2408
return (NULL);
2409
pf_addrcpy(&mapping->endpoints[0].addr, src_addr, af);
2410
mapping->endpoints[0].port = src_port;
2411
mapping->endpoints[0].af = af;
2412
mapping->endpoints[0].mapping = mapping;
2413
pf_addrcpy(&mapping->endpoints[1].addr, nat_addr, af);
2414
mapping->endpoints[1].port = nat_port;
2415
mapping->endpoints[1].af = af;
2416
mapping->endpoints[1].mapping = mapping;
2417
refcount_init(&mapping->refs, 1);
2418
return (mapping);
2419
}
2420
2421
int
2422
pf_udp_mapping_insert(struct pf_udp_mapping *mapping)
2423
{
2424
struct pf_udpendpointhash *h0, *h1;
2425
struct pf_udp_endpoint *endpoint;
2426
int ret = EEXIST;
2427
2428
h0 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[0])];
2429
h1 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[1])];
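/*
 * Lock both endpoint hash rows; take them in ascending address order
 * (as with the state key hash) so concurrent inserts cannot deadlock.
 */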
2430
if (h0 == h1) {
2431
PF_HASHROW_LOCK(h0);
2432
} else if (h0 < h1) {
2433
PF_HASHROW_LOCK(h0);
2434
PF_HASHROW_LOCK(h1);
2435
} else {
2436
PF_HASHROW_LOCK(h1);
2437
PF_HASHROW_LOCK(h0);
2438
}
2439
2440
LIST_FOREACH(endpoint, &h0->endpoints, entry) {
2441
if (bcmp(endpoint, &mapping->endpoints[0],
2442
sizeof(struct pf_udp_endpoint_cmp)) == 0)
2443
break;
2444
}
2445
if (endpoint != NULL)
2446
goto cleanup;
2447
LIST_FOREACH(endpoint, &h1->endpoints, entry) {
2448
if (bcmp(endpoint, &mapping->endpoints[1],
2449
sizeof(struct pf_udp_endpoint_cmp)) == 0)
2450
break;
2451
}
2452
if (endpoint != NULL)
2453
goto cleanup;
2454
LIST_INSERT_HEAD(&h0->endpoints, &mapping->endpoints[0], entry);
2455
LIST_INSERT_HEAD(&h1->endpoints, &mapping->endpoints[1], entry);
2456
ret = 0;
2457
2458
cleanup:
2459
if (h0 != h1) {
2460
PF_HASHROW_UNLOCK(h0);
2461
PF_HASHROW_UNLOCK(h1);
2462
} else {
2463
PF_HASHROW_UNLOCK(h0);
2464
}
2465
return (ret);
2466
}
2467
2468
void
2469
pf_udp_mapping_release(struct pf_udp_mapping *mapping)
2470
{
2471
/* refcount is synchronized on the source endpoint's row lock */
2472
struct pf_udpendpointhash *h0, *h1;
2473
2474
if (mapping == NULL)
2475
return;
2476
2477
h0 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[0])];
2478
PF_HASHROW_LOCK(h0);
2479
if (refcount_release(&mapping->refs)) {
2480
LIST_REMOVE(&mapping->endpoints[0], entry);
2481
PF_HASHROW_UNLOCK(h0);
2482
h1 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[1])];
2483
PF_HASHROW_LOCK(h1);
2484
LIST_REMOVE(&mapping->endpoints[1], entry);
2485
PF_HASHROW_UNLOCK(h1);
2486
2487
uma_zfree(V_pf_udp_mapping_z, mapping);
2488
} else {
2489
PF_HASHROW_UNLOCK(h0);
2490
}
2491
}
2492
2493
2494
struct pf_udp_mapping *
2495
pf_udp_mapping_find(struct pf_udp_endpoint_cmp *key)
2496
{
2497
struct pf_udpendpointhash *uh;
2498
struct pf_udp_endpoint *endpoint;
2499
2500
uh = &V_pf_udpendpointhash[pf_hashudpendpoint((struct pf_udp_endpoint*)key)];
2501
2502
PF_HASHROW_LOCK(uh);
2503
LIST_FOREACH(endpoint, &uh->endpoints, entry) {
2504
if (bcmp(endpoint, key, sizeof(struct pf_udp_endpoint_cmp)) == 0 &&
2505
bcmp(endpoint, &endpoint->mapping->endpoints[0],
2506
sizeof(struct pf_udp_endpoint_cmp)) == 0)
2507
break;
2508
}
2509
if (endpoint == NULL) {
2510
PF_HASHROW_UNLOCK(uh);
2511
return (NULL);
2512
}
2513
refcount_acquire(&endpoint->mapping->refs);
2514
PF_HASHROW_UNLOCK(uh);
2515
return (endpoint->mapping);
2516
}
2517
/* END state table stuff */
2518
2519
static void
2520
pf_send(struct pf_send_entry *pfse)
2521
{
2522
2523
PF_SENDQ_LOCK();
2524
STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
2525
PF_SENDQ_UNLOCK();
2526
swi_sched(V_pf_swi_cookie, 0);
2527
}
2528
2529
static bool
2530
pf_isforlocal(struct mbuf *m, int af)
2531
{
2532
switch (af) {
2533
#ifdef INET
2534
case AF_INET: {
2535
struct ip *ip = mtod(m, struct ip *);
2536
2537
return (in_localip(ip->ip_dst));
2538
}
2539
#endif /* INET */
2540
#ifdef INET6
2541
case AF_INET6: {
2542
struct ip6_hdr *ip6;
2543
struct in6_ifaddr *ia;
2544
ip6 = mtod(m, struct ip6_hdr *);
2545
ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
2546
if (ia == NULL)
2547
return (false);
2548
return (! (ia->ia6_flags & IN6_IFF_NOTREADY));
2549
}
2550
#endif /* INET6 */
2551
default:
2552
unhandled_af(af);
2553
}
2554
2555
return (false);
2556
}
2557
2558
int
2559
pf_icmp_mapping(struct pf_pdesc *pd, u_int8_t type,
2560
int *icmp_dir, u_int16_t *virtual_id, u_int16_t *virtual_type)
2561
{
2562
/*
2563
* ICMP types marked with PF_OUT are typically responses to
2564
* PF_IN, and will match states in the opposite direction.
2565
* PF_IN ICMP types need to match a state with that type.
2566
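*
* The virtual type and id give a query and its reply a common key,
* so both directions of an ICMP exchange map onto a single state.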
*/
2567
*icmp_dir = PF_OUT;
2568
2569
/* Queries (and responses) */
2570
switch (pd->af) {
2571
#ifdef INET
2572
case AF_INET:
2573
switch (type) {
2574
case ICMP_ECHO:
2575
*icmp_dir = PF_IN;
2576
/* FALLTHROUGH */
2577
case ICMP_ECHOREPLY:
2578
*virtual_type = ICMP_ECHO;
2579
*virtual_id = pd->hdr.icmp.icmp_id;
2580
break;
2581
2582
case ICMP_TSTAMP:
2583
*icmp_dir = PF_IN;
2584
/* FALLTHROUGH */
2585
case ICMP_TSTAMPREPLY:
2586
*virtual_type = ICMP_TSTAMP;
2587
*virtual_id = pd->hdr.icmp.icmp_id;
2588
break;
2589
2590
case ICMP_IREQ:
2591
*icmp_dir = PF_IN;
2592
/* FALLTHROUGH */
2593
case ICMP_IREQREPLY:
2594
*virtual_type = ICMP_IREQ;
2595
*virtual_id = pd->hdr.icmp.icmp_id;
2596
break;
2597
2598
case ICMP_MASKREQ:
2599
*icmp_dir = PF_IN;
2600
/* FALLTHROUGH */
2601
case ICMP_MASKREPLY:
2602
*virtual_type = ICMP_MASKREQ;
2603
*virtual_id = pd->hdr.icmp.icmp_id;
2604
break;
2605
2606
case ICMP_IPV6_WHEREAREYOU:
2607
*icmp_dir = PF_IN;
2608
/* FALLTHROUGH */
2609
case ICMP_IPV6_IAMHERE:
2610
*virtual_type = ICMP_IPV6_WHEREAREYOU;
2611
*virtual_id = 0; /* Nothing sane to match on! */
2612
break;
2613
2614
case ICMP_MOBILE_REGREQUEST:
2615
*icmp_dir = PF_IN;
2616
/* FALLTHROUGH */
2617
case ICMP_MOBILE_REGREPLY:
2618
*virtual_type = ICMP_MOBILE_REGREQUEST;
2619
*virtual_id = 0; /* Nothing sane to match on! */
2620
break;
2621
2622
case ICMP_ROUTERSOLICIT:
2623
*icmp_dir = PF_IN;
2624
/* FALLTHROUGH */
2625
case ICMP_ROUTERADVERT:
2626
*virtual_type = ICMP_ROUTERSOLICIT;
2627
*virtual_id = 0; /* Nothing sane to match on! */
2628
break;
2629
2630
/* These ICMP types map to other connections */
2631
case ICMP_UNREACH:
2632
case ICMP_SOURCEQUENCH:
2633
case ICMP_REDIRECT:
2634
case ICMP_TIMXCEED:
2635
case ICMP_PARAMPROB:
2636
/* These will not be used, but set them anyway */
2637
*icmp_dir = PF_IN;
2638
*virtual_type = type;
2639
*virtual_id = 0;
2640
*virtual_type = htons(*virtual_type);
2641
return (1); /* These types match to another state */
2642
2643
/*
2644
* All remaining ICMP types get their own states,
2645
* and will only match in one direction.
2646
*/
2647
default:
2648
*icmp_dir = PF_IN;
2649
*virtual_type = type;
2650
*virtual_id = 0;
2651
break;
2652
}
2653
break;
2654
#endif /* INET */
2655
#ifdef INET6
2656
case AF_INET6:
2657
switch (type) {
2658
case ICMP6_ECHO_REQUEST:
2659
*icmp_dir = PF_IN;
2660
/* FALLTHROUGH */
2661
case ICMP6_ECHO_REPLY:
2662
*virtual_type = ICMP6_ECHO_REQUEST;
2663
*virtual_id = pd->hdr.icmp6.icmp6_id;
2664
break;
2665
2666
case MLD_LISTENER_QUERY:
2667
case MLD_LISTENER_REPORT: {
2668
/*
2669
* Listener Report can be sent by clients
2670
* without an associated Listener Query.
2671
* In addition to that, when Report is sent as a
2672
* reply to a Query its source and destination
2673
* address are different.
2674
*/
2675
*icmp_dir = PF_IN;
2676
*virtual_type = MLD_LISTENER_QUERY;
2677
*virtual_id = 0;
2678
break;
2679
}
2680
case MLD_MTRACE:
2681
*icmp_dir = PF_IN;
2682
/* FALLTHROUGH */
2683
case MLD_MTRACE_RESP:
2684
*virtual_type = MLD_MTRACE;
2685
*virtual_id = 0; /* Nothing sane to match on! */
2686
break;
2687
2688
case ND_NEIGHBOR_SOLICIT:
2689
*icmp_dir = PF_IN;
2690
/* FALLTHROUGH */
2691
case ND_NEIGHBOR_ADVERT: {
2692
*virtual_type = ND_NEIGHBOR_SOLICIT;
2693
*virtual_id = 0;
2694
break;
2695
}
2696
2697
/*
2698
* These ICMP types map to other connections.
2699
* ND_REDIRECT can't be in this list because the triggering
2700
* packet header is optional.
2701
*/
2702
case ICMP6_DST_UNREACH:
2703
case ICMP6_PACKET_TOO_BIG:
2704
case ICMP6_TIME_EXCEEDED:
2705
case ICMP6_PARAM_PROB:
2706
/* These will not be used, but set them anyway */
2707
*icmp_dir = PF_IN;
2708
*virtual_type = type;
2709
*virtual_id = 0;
2710
*virtual_type = htons(*virtual_type);
2711
return (1); /* These types match to another state */
2712
/*
2713
* All remaining ICMP6 types get their own states,
2714
* and will only match in one direction.
2715
*/
2716
default:
2717
*icmp_dir = PF_IN;
2718
*virtual_type = type;
2719
*virtual_id = 0;
2720
break;
2721
}
2722
break;
2723
#endif /* INET6 */
2724
default:
2725
unhandled_af(pd->af);
2726
}
2727
*virtual_type = htons(*virtual_type);
2728
return (0); /* These types match to their own state */
2729
}
2730
2731
void
2732
pf_intr(void *v)
2733
{
2734
struct epoch_tracker et;
2735
struct pf_send_head queue;
2736
struct pf_send_entry *pfse, *next;
2737
2738
CURVNET_SET((struct vnet *)v);
2739
2740
PF_SENDQ_LOCK();
2741
queue = V_pf_sendqueue;
2742
STAILQ_INIT(&V_pf_sendqueue);
2743
PF_SENDQ_UNLOCK();
2744
2745
NET_EPOCH_ENTER(et);
2746
2747
STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
2748
switch (pfse->pfse_type) {
2749
#ifdef INET
2750
case PFSE_IP: {
2751
if (pf_isforlocal(pfse->pfse_m, AF_INET)) {
2752
KASSERT(pfse->pfse_m->m_pkthdr.rcvif == V_loif,
2753
("%s: rcvif != loif", __func__));
2754
2755
pfse->pfse_m->m_flags |= M_SKIP_FIREWALL;
2756
pfse->pfse_m->m_pkthdr.csum_flags |=
2757
CSUM_IP_VALID | CSUM_IP_CHECKED |
2758
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2759
pfse->pfse_m->m_pkthdr.csum_data = 0xffff;
2760
ip_input(pfse->pfse_m);
2761
} else {
2762
ip_output(pfse->pfse_m, NULL, NULL, 0, NULL,
2763
NULL);
2764
}
2765
break;
2766
}
2767
case PFSE_ICMP:
2768
icmp_error(pfse->pfse_m, pfse->icmpopts.type,
2769
pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
2770
break;
2771
#endif /* INET */
2772
#ifdef INET6
2773
case PFSE_IP6:
2774
if (pf_isforlocal(pfse->pfse_m, AF_INET6)) {
2775
KASSERT(pfse->pfse_m->m_pkthdr.rcvif == V_loif,
2776
("%s: rcvif != loif", __func__));
2777
2778
pfse->pfse_m->m_flags |= M_SKIP_FIREWALL |
2779
M_LOOP;
2780
pfse->pfse_m->m_pkthdr.csum_flags |=
2781
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2782
pfse->pfse_m->m_pkthdr.csum_data = 0xffff;
2783
ip6_input(pfse->pfse_m);
2784
} else {
2785
ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL,
2786
NULL, NULL);
2787
}
2788
break;
2789
case PFSE_ICMP6:
2790
icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
2791
pfse->icmpopts.code, pfse->icmpopts.mtu);
2792
break;
2793
#endif /* INET6 */
2794
default:
2795
panic("%s: unknown type", __func__);
2796
}
2797
free(pfse, M_PFTEMP);
2798
}
2799
NET_EPOCH_EXIT(et);
2800
CURVNET_RESTORE();
2801
}
2802
2803
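/* The purge thread wakes up ten times per second (every hz / 10 ticks). */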
#define pf_purge_thread_period (hz / 10)
2804
2805
#ifdef PF_WANT_32_TO_64_COUNTER
2806
static void
2807
pf_status_counter_u64_periodic(void)
2808
{
2809
2810
PF_RULES_RASSERT();
2811
2812
if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 60)) != 0) {
2813
return;
2814
}
2815
2816
for (int i = 0; i < FCNT_MAX; i++) {
2817
pf_counter_u64_periodic(&V_pf_status.fcounters[i]);
2818
}
2819
}
2820
2821
static void
2822
pf_kif_counter_u64_periodic(void)
2823
{
2824
struct pfi_kkif *kif;
2825
size_t r, run;
2826
2827
PF_RULES_RASSERT();
2828
2829
if (__predict_false(V_pf_allkifcount == 0)) {
2830
return;
2831
}
2832
2833
if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0) {
2834
return;
2835
}
2836
2837
run = V_pf_allkifcount / 10;
2838
if (run < 5)
2839
run = 5;
2840
2841
for (r = 0; r < run; r++) {
2842
kif = LIST_NEXT(V_pf_kifmarker, pfik_allkiflist);
2843
if (kif == NULL) {
2844
LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist);
2845
LIST_INSERT_HEAD(&V_pf_allkiflist, V_pf_kifmarker, pfik_allkiflist);
2846
break;
2847
}
2848
2849
LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist);
2850
LIST_INSERT_AFTER(kif, V_pf_kifmarker, pfik_allkiflist);
2851
2852
for (int i = 0; i < 2; i++) {
2853
for (int j = 0; j < 2; j++) {
2854
for (int k = 0; k < 2; k++) {
2855
pf_counter_u64_periodic(&kif->pfik_packets[i][j][k]);
2856
pf_counter_u64_periodic(&kif->pfik_bytes[i][j][k]);
2857
}
2858
}
2859
}
2860
}
2861
}
2862
2863
static void
2864
pf_rule_counter_u64_periodic(void)
2865
{
2866
struct pf_krule *rule;
2867
size_t r, run;
2868
2869
PF_RULES_RASSERT();
2870
2871
if (__predict_false(V_pf_allrulecount == 0)) {
2872
return;
2873
}
2874
2875
if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0) {
2876
return;
2877
}
2878
2879
run = V_pf_allrulecount / 10;
2880
if (run < 5)
2881
run = 5;
2882
2883
for (r = 0; r < run; r++) {
2884
rule = LIST_NEXT(V_pf_rulemarker, allrulelist);
2885
if (rule == NULL) {
2886
LIST_REMOVE(V_pf_rulemarker, allrulelist);
2887
LIST_INSERT_HEAD(&V_pf_allrulelist, V_pf_rulemarker, allrulelist);
2888
break;
2889
}
2890
2891
LIST_REMOVE(V_pf_rulemarker, allrulelist);
2892
LIST_INSERT_AFTER(rule, V_pf_rulemarker, allrulelist);
2893
2894
pf_counter_u64_periodic(&rule->evaluations);
2895
for (int i = 0; i < 2; i++) {
2896
pf_counter_u64_periodic(&rule->packets[i]);
2897
pf_counter_u64_periodic(&rule->bytes[i]);
2898
}
2899
}
2900
}
2901
2902
static void
2903
pf_counter_u64_periodic_main(void)
2904
{
2905
PF_RULES_RLOCK_TRACKER;
2906
2907
V_pf_counter_periodic_iter++;
2908
2909
PF_RULES_RLOCK();
2910
pf_counter_u64_critical_enter();
2911
pf_status_counter_u64_periodic();
2912
pf_kif_counter_u64_periodic();
2913
pf_rule_counter_u64_periodic();
2914
pf_counter_u64_critical_exit();
2915
PF_RULES_RUNLOCK();
2916
}
2917
#else
2918
#define pf_counter_u64_periodic_main() do { } while (0)
2919
#endif
2920
2921
void
2922
pf_purge_thread(void *unused __unused)
2923
{
2924
struct epoch_tracker et;
2925
2926
VNET_ITERATOR_DECL(vnet_iter);
2927
2928
sx_xlock(&pf_end_lock);
2929
while (pf_end_threads == 0) {
2930
sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", pf_purge_thread_period);
2931
2932
VNET_LIST_RLOCK();
2933
NET_EPOCH_ENTER(et);
2934
VNET_FOREACH(vnet_iter) {
2935
CURVNET_SET(vnet_iter);
2936
2937
/* Wait until V_pf_default_rule is initialized. */
2938
if (V_pf_vnet_active == 0) {
2939
CURVNET_RESTORE();
2940
continue;
2941
}
2942
2943
pf_counter_u64_periodic_main();
2944
2945
/*
2946
* Process 1/interval fraction of the state
2947
* table every run.
2948
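* With the thread waking ten times per second, the whole
* hash is covered once every PFTM_INTERVAL seconds.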
*/
2949
V_pf_purge_idx =
2950
pf_purge_expired_states(V_pf_purge_idx, V_pf_hashmask /
2951
(V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
2952
2953
/*
2954
* Purge other expired types every
2955
* PFTM_INTERVAL seconds.
2956
*/
2957
if (V_pf_purge_idx == 0) {
2958
/*
2959
* Order is important:
2960
* - states and src nodes reference rules
2961
* - states and rules reference kifs
2962
*/
2963
pf_purge_expired_fragments();
2964
pf_purge_expired_src_nodes();
2965
pf_purge_unlinked_rules();
2966
pf_source_purge();
2967
pfi_kkif_purge();
2968
}
2969
CURVNET_RESTORE();
2970
}
2971
NET_EPOCH_EXIT(et);
2972
VNET_LIST_RUNLOCK();
2973
}
2974
2975
pf_end_threads++;
2976
sx_xunlock(&pf_end_lock);
2977
kproc_exit(0);
2978
}
2979
2980
void
2981
pf_unload_vnet_purge(void)
2982
{
2983
2984
/*
2985
* To clean up all kifs and rules we need
2986
* two runs: the first one clears reference flags,
2987
* then pf_purge_expired_states() doesn't
2988
* raise them again, and the second run frees.
2989
*/
2990
pf_purge_unlinked_rules();
2991
pfi_kkif_purge();
2992
2993
/*
2994
* Now purge everything.
2995
*/
2996
pf_purge_expired_states(0, V_pf_hashmask);
2997
pf_purge_fragments(UINT_MAX);
2998
pf_purge_expired_src_nodes();
2999
pf_source_purge();
3000
3001
/*
3002
* Now all kifs & rules should be unreferenced,
3003
* thus should be successfully freed.
3004
*/
3005
pf_purge_unlinked_rules();
3006
pfi_kkif_purge();
3007
}
3008
3009
u_int32_t
3010
pf_state_expires(const struct pf_kstate *state)
3011
{
3012
u_int32_t timeout;
3013
u_int32_t start;
3014
u_int32_t end;
3015
u_int32_t states;
3016
3017
/* handle all PFTM_* > PFTM_MAX here */
3018
if (state->timeout == PFTM_PURGE)
3019
return (time_uptime);
3020
KASSERT(state->timeout != PFTM_UNLINKED,
3021
("pf_state_expires: timeout == PFTM_UNLINKED"));
3022
KASSERT((state->timeout < PFTM_MAX),
3023
("pf_state_expires: timeout > PFTM_MAX"));
3024
timeout = state->rule->timeout[state->timeout];
3025
if (!timeout)
3026
timeout = V_pf_default_rule.timeout[state->timeout];
3027
start = state->rule->timeout[PFTM_ADAPTIVE_START];
3028
if (start && state->rule != &V_pf_default_rule) {
3029
end = state->rule->timeout[PFTM_ADAPTIVE_END];
3030
states = counter_u64_fetch(state->rule->states_cur);
3031
} else {
3032
start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
3033
end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
3034
states = V_pf_status.states;
3035
}
3036
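/*
 * Adaptive timeouts: once the state count exceeds adaptive.start the
 * timeout is scaled down linearly, reaching zero at adaptive.end.
 * E.g. with start 6000, end 12000 and 9000 states the remaining
 * timeout is halved; at or above 12000 states it expires immediately.
 */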
if (end && states > start && start < end) {
3037
if (states < end) {
3038
timeout = (u_int64_t)timeout * (end - states) /
3039
(end - start);
3040
return ((state->expire / 1000) + timeout);
3041
}
3042
else
3043
return (time_uptime);
3044
}
3045
return ((state->expire / 1000) + timeout);
3046
}
3047
3048
void
3049
pf_purge_expired_src_nodes(void)
3050
{
3051
struct pf_ksrc_node_list freelist;
3052
struct pf_srchash *sh;
3053
struct pf_ksrc_node *cur, *next;
3054
int i;
3055
3056
LIST_INIT(&freelist);
3057
for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
3058
PF_HASHROW_LOCK(sh);
3059
LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
3060
if (cur->states == 0 && cur->expire <= time_uptime) {
3061
pf_unlink_src_node(cur);
3062
LIST_INSERT_HEAD(&freelist, cur, entry);
3063
} else if (cur->rule != NULL)
3064
cur->rule->rule_ref |= PFRULE_REFS;
3065
PF_HASHROW_UNLOCK(sh);
3066
}
3067
3068
pf_free_src_nodes(&freelist);
3069
3070
V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
3071
}
3072
3073
static void
3074
pf_src_tree_remove_state(struct pf_kstate *s)
3075
{
3076
uint32_t timeout;
3077
3078
timeout = s->rule->timeout[PFTM_SRC_NODE] ?
3079
s->rule->timeout[PFTM_SRC_NODE] :
3080
V_pf_default_rule.timeout[PFTM_SRC_NODE];
3081
3082
for (pf_sn_types_t sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
3083
if (s->sns[sn_type] == NULL)
3084
continue;
3085
PF_SRC_NODE_LOCK(s->sns[sn_type]);
3086
if (sn_type == PF_SN_LIMIT && s->src.tcp_est)
3087
--(s->sns[sn_type]->conn);
3088
if (--(s->sns[sn_type]->states) == 0)
3089
s->sns[sn_type]->expire = time_uptime + timeout;
3090
PF_SRC_NODE_UNLOCK(s->sns[sn_type]);
3091
s->sns[sn_type] = NULL;
3092
}
3093
3094
}
3095
3096
/*
3097
* Unlink and potentially free a state. The function may be
3098
* called with ID hash row locked, but always returns
3099
* unlocked, since it needs to go through key hash locking.
3100
*/
3101
int
3102
pf_remove_state(struct pf_kstate *s)
3103
{
3104
struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
3105
struct pf_state_link *pfl;
3106
3107
NET_EPOCH_ASSERT();
3108
PF_HASHROW_ASSERT(ih);
3109
3110
if (s->timeout == PFTM_UNLINKED) {
3111
/*
3112
* State is being processed
3113
* by pf_remove_state() in
3114
* another thread.
3115
*/
3116
PF_HASHROW_UNLOCK(ih);
3117
return (0); /* XXXGL: undefined actually */
3118
}
3119
3120
if (s->src.state == PF_TCPS_PROXY_DST) {
3121
/* XXX wire key the right one? */
3122
pf_send_tcp(s->rule, s->key[PF_SK_WIRE]->af,
3123
&s->key[PF_SK_WIRE]->addr[1],
3124
&s->key[PF_SK_WIRE]->addr[0],
3125
s->key[PF_SK_WIRE]->port[1],
3126
s->key[PF_SK_WIRE]->port[0],
3127
s->src.seqhi, s->src.seqlo + 1,
3128
TH_RST|TH_ACK, 0, 0, 0, M_SKIP_FIREWALL, s->tag, 0,
3129
s->act.rtableid, NULL);
3130
}
3131
3132
LIST_REMOVE(s, entry);
3133
pf_src_tree_remove_state(s);
3134
3135
if (V_pfsync_delete_state_ptr != NULL)
3136
V_pfsync_delete_state_ptr(s);
3137
3138
STATE_DEC_COUNTERS(s);
3139
3140
s->timeout = PFTM_UNLINKED;
3141
3142
/* Ensure we remove it from the list of halfopen states, if needed. */
3143
if (s->key[PF_SK_STACK] != NULL &&
3144
s->key[PF_SK_STACK]->proto == IPPROTO_TCP)
3145
pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED);
3146
3147
while ((pfl = SLIST_FIRST(&s->linkage)) != NULL) {
3148
struct pf_state_link_list *list;
3149
unsigned int gen;
3150
3151
SLIST_REMOVE_HEAD(&s->linkage, pfl_linkage);
3152
3153
switch (pfl->pfl_type) {
3154
case PF_STATE_LINK_TYPE_STATELIM: {
3155
struct pf_statelim *stlim;
3156
3157
stlim = pf_statelim_find(s->statelim);
3158
KASSERT(stlim != NULL,
3159
("pf_state %p pfl %p cannot find statelim %u", s,
3160
pfl, s->statelim));
3161
3162
gen = pf_statelim_enter(stlim);
3163
stlim->pfstlim_inuse--;
3164
pf_statelim_leave(stlim, gen);
3165
3166
list = &stlim->pfstlim_states;
3167
break;
3168
}
3169
case PF_STATE_LINK_TYPE_SOURCELIM: {
3170
struct pf_sourcelim *srlim;
3171
struct pf_source key, *sr;
3172
3173
srlim = pf_sourcelim_find(s->sourcelim);
3174
KASSERT(srlim != NULL,
3175
("pf_state %p pfl %p cannot find sourcelim %u", s,
3176
pfl, s->sourcelim));
3177
3178
pf_source_key(srlim, &key, s->key[PF_SK_WIRE]->af,
3179
&s->key[PF_SK_WIRE]->addr[0 /* XXX or 1? */]);
3180
3181
sr = pf_source_find(srlim, &key);
3182
KASSERT(sr != NULL,
3183
("pf_state %p pfl %p cannot find source in %u", s,
3184
pfl, s->sourcelim));
3185
3186
gen = pf_sourcelim_enter(srlim);
3187
srlim->pfsrlim_counters.inuse--;
3188
pf_sourcelim_leave(srlim, gen);
3189
pf_source_rele(sr);
3190
3191
list = &sr->pfsr_states;
3192
break;
3193
}
3194
default:
3195
panic("%s: unexpected link type on pfl %p", __func__,
3196
pfl);
3197
}
3198
3199
PF_STATE_LOCK_ASSERT(s);
3200
TAILQ_REMOVE(list, pfl, pfl_link);
3201
free(pfl, M_PF_STATE_LINK);
3202
}
3203
3204
PF_HASHROW_UNLOCK(ih);
3205
3206
pf_detach_state(s);
3207
3208
pf_udp_mapping_release(s->udp_mapping);
3209
3210
/* pf_state_insert() initialises refs to 2 */
3211
return (pf_release_staten(s, 2));
3212
}
3213
3214
struct pf_kstate *
3215
pf_alloc_state(int flags)
3216
{
3217
3218
return (uma_zalloc(V_pf_state_z, flags | M_ZERO));
3219
}
3220
3221
static __inline void
3222
pf_free_match_rules(struct pf_krule_slist *match_rules) {
3223
struct pf_krule_item *ri;
3224
3225
while ((ri = SLIST_FIRST(match_rules))) {
3226
SLIST_REMOVE_HEAD(match_rules, entry);
3227
free(ri, M_PF_RULE_ITEM);
3228
}
3229
}
3230
3231
void
3232
pf_free_state(struct pf_kstate *cur)
3233
{
3234
KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
3235
KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
3236
cur->timeout));
3237
3238
pf_free_match_rules(&(cur->match_rules));
3239
pf_normalize_tcp_cleanup(cur);
3240
uma_zfree(V_pf_state_z, cur);
3241
pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
3242
}
3243
3244
/*
3245
* Called only from pf_purge_thread(), thus serialized.
3246
*/
3247
static u_int
3248
pf_purge_expired_states(u_int i, int maxcheck)
3249
{
3250
struct pf_idhash *ih;
3251
struct pf_kstate *s;
3252
struct pf_krule_item *mrm;
3253
size_t count __unused;
3254
3255
V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
3256
3257
/*
3258
* Go through hash and unlink states that expire now.
3259
*/
3260
while (maxcheck > 0) {
3261
count = 0;
3262
ih = &V_pf_idhash[i];
3263
3264
/* only take the lock if we expect to do work */
3265
if (!LIST_EMPTY(&ih->states)) {
3266
relock:
3267
PF_HASHROW_LOCK(ih);
3268
LIST_FOREACH(s, &ih->states, entry) {
3269
if (pf_state_expires(s) <= time_uptime) {
3270
V_pf_status.states -=
3271
pf_remove_state(s);
3272
goto relock;
3273
}
3274
s->rule->rule_ref |= PFRULE_REFS;
3275
if (s->nat_rule != NULL)
3276
s->nat_rule->rule_ref |= PFRULE_REFS;
3277
if (s->anchor != NULL)
3278
s->anchor->rule_ref |= PFRULE_REFS;
3279
s->kif->pfik_flags |= PFI_IFLAG_REFS;
3280
SLIST_FOREACH(mrm, &s->match_rules, entry)
3281
mrm->r->rule_ref |= PFRULE_REFS;
3282
if (s->act.rt_kif)
3283
s->act.rt_kif->pfik_flags |= PFI_IFLAG_REFS;
3284
count++;
3285
}
3286
PF_HASHROW_UNLOCK(ih);
3287
}
3288
3289
SDT_PROBE2(pf, purge, state, rowcount, i, count);
3290
3291
/* Return when we hit end of hash. */
3292
if (++i > V_pf_hashmask) {
3293
V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
3294
return (0);
3295
}
3296
3297
maxcheck--;
3298
}
3299
3300
V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
3301
3302
return (i);
3303
}
3304
3305
static void
3306
pf_purge_unlinked_rules(void)
3307
{
3308
struct pf_krulequeue tmpq;
3309
struct pf_krule *r, *r1;
3310
3311
/*
3312
* If we have an overloading task pending, then we'd
3313
* better skip purging this time. There is a tiny
3314
* probability that the overloading task references
3315
* an already unlinked rule.
3316
*/
3317
PF_OVERLOADQ_LOCK();
3318
if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
3319
PF_OVERLOADQ_UNLOCK();
3320
return;
3321
}
3322
PF_OVERLOADQ_UNLOCK();
3323
3324
/*
3325
* Do naive mark-and-sweep garbage collecting of old rules.
3326
* Reference flag is raised by pf_purge_expired_states()
3327
* and pf_purge_expired_src_nodes().
3328
*
3329
* To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
3330
* use a temporary queue.
3331
*/
3332
TAILQ_INIT(&tmpq);
3333
PF_UNLNKDRULES_LOCK();
3334
TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
3335
if (!(r->rule_ref & PFRULE_REFS)) {
3336
TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
3337
TAILQ_INSERT_TAIL(&tmpq, r, entries);
3338
} else
3339
r->rule_ref &= ~PFRULE_REFS;
3340
}
3341
PF_UNLNKDRULES_UNLOCK();
3342
3343
if (!TAILQ_EMPTY(&tmpq)) {
3344
PF_CONFIG_LOCK();
3345
PF_RULES_WLOCK();
3346
TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
3347
TAILQ_REMOVE(&tmpq, r, entries);
3348
pf_free_rule(r);
3349
}
3350
PF_RULES_WUNLOCK();
3351
PF_CONFIG_UNLOCK();
3352
}
3353
}
3354
3355
void
3356
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
3357
{
3358
switch (af) {
3359
#ifdef INET
3360
case AF_INET: {
3361
u_int32_t a = ntohl(addr->addr32[0]);
3362
printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
3363
(a>>8)&255, a&255);
3364
if (p) {
3365
p = ntohs(p);
3366
printf(":%u", p);
3367
}
3368
break;
3369
}
3370
#endif /* INET */
3371
#ifdef INET6
3372
case AF_INET6: {
3373
u_int16_t b;
3374
u_int8_t i, curstart, curend, maxstart, maxend;
3375
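/*
 * Find the longest run of zero 16-bit groups so it can be printed
 * in the compressed "::" form below.
 */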
curstart = curend = maxstart = maxend = 255;
3376
for (i = 0; i < 8; i++) {
3377
if (!addr->addr16[i]) {
3378
if (curstart == 255)
3379
curstart = i;
3380
curend = i;
3381
} else {
3382
if ((curend - curstart) >
3383
(maxend - maxstart)) {
3384
maxstart = curstart;
3385
maxend = curend;
3386
}
3387
curstart = curend = 255;
3388
}
3389
}
3390
if ((curend - curstart) >
3391
(maxend - maxstart)) {
3392
maxstart = curstart;
3393
maxend = curend;
3394
}
3395
for (i = 0; i < 8; i++) {
3396
if (i >= maxstart && i <= maxend) {
3397
if (i == 0)
3398
printf(":");
3399
if (i == maxend)
3400
printf(":");
3401
} else {
3402
b = ntohs(addr->addr16[i]);
3403
printf("%x", b);
3404
if (i < 7)
3405
printf(":");
3406
}
3407
}
3408
if (p) {
3409
p = ntohs(p);
3410
printf("[%u]", p);
3411
}
3412
break;
3413
}
3414
#endif /* INET6 */
3415
default:
3416
unhandled_af(af);
3417
}
3418
}
3419
3420
void
3421
pf_print_state(struct pf_kstate *s)
3422
{
3423
pf_print_state_parts(s, NULL, NULL);
3424
}
3425
3426
static void
3427
pf_print_state_parts(struct pf_kstate *s,
3428
struct pf_state_key *skwp, struct pf_state_key *sksp)
3429
{
3430
struct pf_state_key *skw, *sks;
3431
u_int8_t proto, dir;
3432
3433
/* Do our best to fill these, but they're skipped if NULL */
3434
skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
3435
sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
3436
proto = skw ? skw->proto : (sks ? sks->proto : 0);
3437
dir = s ? s->direction : 0;
3438
3439
switch (proto) {
3440
case IPPROTO_IPV4:
3441
printf("IPv4");
3442
break;
3443
case IPPROTO_IPV6:
3444
printf("IPv6");
3445
break;
3446
case IPPROTO_TCP:
3447
printf("TCP");
3448
break;
3449
case IPPROTO_UDP:
3450
printf("UDP");
3451
break;
3452
case IPPROTO_ICMP:
3453
printf("ICMP");
3454
break;
3455
case IPPROTO_ICMPV6:
3456
printf("ICMPv6");
3457
break;
3458
default:
3459
printf("%u", proto);
3460
break;
3461
}
3462
switch (dir) {
3463
case PF_IN:
3464
printf(" in");
3465
break;
3466
case PF_OUT:
3467
printf(" out");
3468
break;
3469
}
3470
if (skw) {
3471
printf(" wire: ");
3472
pf_print_host(&skw->addr[0], skw->port[0], skw->af);
3473
printf(" ");
3474
pf_print_host(&skw->addr[1], skw->port[1], skw->af);
3475
}
3476
if (sks) {
3477
printf(" stack: ");
3478
if (sks != skw) {
3479
pf_print_host(&sks->addr[0], sks->port[0], sks->af);
3480
printf(" ");
3481
pf_print_host(&sks->addr[1], sks->port[1], sks->af);
3482
} else
3483
printf("-");
3484
}
3485
if (s) {
3486
if (proto == IPPROTO_TCP) {
3487
printf(" [lo=%u high=%u win=%u modulator=%u",
3488
s->src.seqlo, s->src.seqhi,
3489
s->src.max_win, s->src.seqdiff);
3490
if (s->src.wscale && s->dst.wscale)
3491
printf(" wscale=%u",
3492
s->src.wscale & PF_WSCALE_MASK);
3493
printf("]");
3494
printf(" [lo=%u high=%u win=%u modulator=%u",
3495
s->dst.seqlo, s->dst.seqhi,
3496
s->dst.max_win, s->dst.seqdiff);
3497
if (s->src.wscale && s->dst.wscale)
3498
printf(" wscale=%u",
3499
s->dst.wscale & PF_WSCALE_MASK);
3500
printf("]");
3501
}
3502
printf(" %u:%u", s->src.state, s->dst.state);
3503
if (s->rule)
3504
printf(" @%d", s->rule->nr);
3505
}
3506
}
3507
3508
void
3509
pf_print_flags(uint16_t f)
3510
{
3511
if (f)
3512
printf(" ");
3513
if (f & TH_FIN)
3514
printf("F");
3515
if (f & TH_SYN)
3516
printf("S");
3517
if (f & TH_RST)
3518
printf("R");
3519
if (f & TH_PUSH)
3520
printf("P");
3521
if (f & TH_ACK)
3522
printf("A");
3523
if (f & TH_URG)
3524
printf("U");
3525
if (f & TH_ECE)
3526
printf("E");
3527
if (f & TH_CWR)
3528
printf("W");
3529
if (f & TH_AE)
3530
printf("e");
3531
}
3532
3533
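/*
 * Compute skip steps for a rule queue: for each skip field (interface,
 * direction, af, proto, src/dst address and port) every rule points at
 * the next rule with a different value in that field, so the evaluator
 * can jump over an entire run of rules once that field fails to match.
 */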
#define PF_SET_SKIP_STEPS(i) \
3534
do { \
3535
while (head[i] != cur) { \
3536
head[i]->skip[i] = cur; \
3537
head[i] = TAILQ_NEXT(head[i], entries); \
3538
} \
3539
} while (0)
3540
3541
void
3542
pf_calc_skip_steps(struct pf_krulequeue *rules)
3543
{
3544
struct pf_krule *cur, *prev, *head[PF_SKIP_COUNT];
3545
int i;
3546
3547
cur = TAILQ_FIRST(rules);
3548
prev = cur;
3549
for (i = 0; i < PF_SKIP_COUNT; ++i)
3550
head[i] = cur;
3551
while (cur != NULL) {
3552
if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
3553
PF_SET_SKIP_STEPS(PF_SKIP_IFP);
3554
if (cur->direction != prev->direction)
3555
PF_SET_SKIP_STEPS(PF_SKIP_DIR);
3556
if (cur->af != prev->af)
3557
PF_SET_SKIP_STEPS(PF_SKIP_AF);
3558
if (cur->proto != prev->proto)
3559
PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
3560
if (cur->src.neg != prev->src.neg ||
3561
pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
3562
PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
3563
if (cur->dst.neg != prev->dst.neg ||
3564
pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
3565
PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
3566
if (cur->src.port[0] != prev->src.port[0] ||
3567
cur->src.port[1] != prev->src.port[1] ||
3568
cur->src.port_op != prev->src.port_op)
3569
PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
3570
if (cur->dst.port[0] != prev->dst.port[0] ||
3571
cur->dst.port[1] != prev->dst.port[1] ||
3572
cur->dst.port_op != prev->dst.port_op)
3573
PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
3574
3575
prev = cur;
3576
cur = TAILQ_NEXT(cur, entries);
3577
}
3578
for (i = 0; i < PF_SKIP_COUNT; ++i)
3579
PF_SET_SKIP_STEPS(i);
3580
}
3581
3582
int
3583
pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
3584
{
3585
if (aw1->type != aw2->type)
3586
return (1);
3587
switch (aw1->type) {
3588
case PF_ADDR_ADDRMASK:
3589
case PF_ADDR_RANGE:
3590
if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
3591
return (1);
3592
if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
3593
return (1);
3594
return (0);
3595
case PF_ADDR_DYNIFTL:
3596
return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
3597
case PF_ADDR_NONE:
3598
case PF_ADDR_NOROUTE:
3599
case PF_ADDR_URPFFAILED:
3600
return (0);
3601
case PF_ADDR_TABLE:
3602
return (aw1->p.tbl != aw2->p.tbl);
3603
default:
3604
printf("invalid address type: %d\n", aw1->type);
3605
return (1);
3606
}
3607
}
3608
3609
/**
3610
* Checksum updates are a little complicated because the checksum in the TCP/UDP
3611
* header isn't always a full checksum. In some cases (i.e. output) it's a
3612
* pseudo-header checksum, which is a partial checksum over src/dst IP
3613
* addresses, protocol number and length.
3614
*
3615
* That means we have the following cases:
3616
* * Input or forwarding: we don't have TSO, the checksum fields are full
3617
* checksums, we need to update the checksum whenever we change anything.
3618
* * Output (i.e. the checksum is a pseudo-header checksum):
3619
* x The field being updated is src/dst address or affects the length of
3620
* the packet. We need to update the pseudo-header checksum (note that this
3621
* checksum is not ones' complement).
3622
* x Some other field is being modified (e.g. src/dst port numbers): We
3623
* don't have to update anything.
3624
**/
3625
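/*
 * Incrementally patch a ones' complement checksum when a single 16-bit
 * word changes from 'old' to 'new': add the old word, subtract the new
 * one and fold the carry back in.  For UDP a checksum of 0x0000 means
 * "no checksum" and is left alone, and a computed 0x0000 is written as
 * 0xffff instead.
 */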
u_int16_t
3626
pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
3627
{
3628
u_int32_t x;
3629
3630
x = cksum + old - new;
3631
x = (x + (x >> 16)) & 0xffff;
3632
3633
/* optimise: eliminate a branch when not udp */
3634
if (udp && cksum == 0x0000)
3635
return cksum;
3636
if (udp && x == 0x0000)
3637
x = 0xffff;
3638
3639
return (u_int16_t)(x);
3640
}
3641
3642
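/*
 * Patch one byte in the packet and keep the protocol checksum in sync.
 * The 'hi' flag tells whether the byte is the high or low octet of its
 * 16-bit checksum word; the 16- and 32-bit variants below are composed
 * of byte patches with alternating 'hi'.
 */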
static int
3643
pf_patch_8(struct pf_pdesc *pd, u_int8_t *f, u_int8_t v, bool hi)
3644
{
3645
int rewrite = 0;
3646
3647
if (*f != v) {
3648
uint16_t old = htons(hi ? (*f << 8) : *f);
3649
uint16_t new = htons(hi ? ( v << 8) : v);
3650
3651
*f = v;
3652
3653
if (! (pd->m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
3654
CSUM_DELAY_DATA_IPV6)))
3655
*pd->pcksum = pf_cksum_fixup(*pd->pcksum, old, new,
3656
pd->proto == IPPROTO_UDP);
3657
3658
rewrite = 1;
3659
}
3660
3661
return (rewrite);
3662
}
3663
3664
int
3665
pf_patch_16(struct pf_pdesc *pd, void *f, u_int16_t v, bool hi)
3666
{
3667
int rewrite = 0;
3668
u_int8_t *fb = (u_int8_t *)f;
3669
u_int8_t *vb = (u_int8_t *)&v;
3670
3671
rewrite += pf_patch_8(pd, fb++, *vb++, hi);
3672
rewrite += pf_patch_8(pd, fb++, *vb++, !hi);
3673
3674
return (rewrite);
3675
}
3676
3677
int
3678
pf_patch_32(struct pf_pdesc *pd, void *f, u_int32_t v, bool hi)
3679
{
3680
int rewrite = 0;
3681
u_int8_t *fb = (u_int8_t *)f;
3682
u_int8_t *vb = (u_int8_t *)&v;
3683
3684
rewrite += pf_patch_8(pd, fb++, *vb++, hi);
3685
rewrite += pf_patch_8(pd, fb++, *vb++, !hi);
3686
rewrite += pf_patch_8(pd, fb++, *vb++, hi);
3687
rewrite += pf_patch_8(pd, fb++, *vb++, !hi);
3688
3689
return (rewrite);
3690
}
3691
3692
u_int16_t
3693
pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
3694
u_int16_t new, u_int8_t udp)
3695
{
3696
if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
3697
return (cksum);
3698
3699
return (pf_cksum_fixup(cksum, old, new, udp));
3700
}
3701
3702
static void
3703
pf_change_ap(struct pf_pdesc *pd, struct pf_addr *a, u_int16_t *p,
3704
struct pf_addr *an, u_int16_t pn)
3705
{
3706
struct pf_addr ao;
3707
u_int16_t po;
3708
uint8_t u = pd->virtual_proto == IPPROTO_UDP;
3709
3710
MPASS(pd->pcksum != NULL);
3711
if (pd->af == AF_INET) {
3712
MPASS(pd->ip_sum);
3713
}
3714
3715
pf_addrcpy(&ao, a, pd->af);
3716
if (pd->af == pd->naf)
3717
pf_addrcpy(a, an, pd->af);
3718
3719
if (pd->m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
3720
*pd->pcksum = ~*pd->pcksum;
3721
3722
if (p == NULL) /* no port -> done. no cksum to worry about. */
3723
return;
3724
po = *p;
3725
*p = pn;
3726
3727
switch (pd->af) {
3728
#ifdef INET
3729
case AF_INET:
3730
switch (pd->naf) {
3731
case AF_INET:
3732
*pd->ip_sum = pf_cksum_fixup(pf_cksum_fixup(*pd->ip_sum,
3733
ao.addr16[0], an->addr16[0], 0),
3734
ao.addr16[1], an->addr16[1], 0);
3735
*p = pn;
3736
3737
*pd->pcksum = pf_cksum_fixup(pf_cksum_fixup(*pd->pcksum,
3738
ao.addr16[0], an->addr16[0], u),
3739
ao.addr16[1], an->addr16[1], u);
3740
3741
*pd->pcksum = pf_proto_cksum_fixup(pd->m, *pd->pcksum, po, pn, u);
3742
break;
3743
#ifdef INET6
3744
case AF_INET6:
3745
*pd->pcksum = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3746
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3747
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pd->pcksum,
3748
ao.addr16[0], an->addr16[0], u),
3749
ao.addr16[1], an->addr16[1], u),
3750
0, an->addr16[2], u),
3751
0, an->addr16[3], u),
3752
0, an->addr16[4], u),
3753
0, an->addr16[5], u),
3754
0, an->addr16[6], u),
3755
0, an->addr16[7], u),
3756
po, pn, u);
3757
break;
3758
#endif /* INET6 */
3759
default:
3760
unhandled_af(pd->naf);
3761
}
3762
break;
3763
#endif /* INET */
3764
#ifdef INET6
3765
case AF_INET6:
3766
switch (pd->naf) {
3767
#ifdef INET
3768
case AF_INET:
3769
*pd->pcksum = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3770
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3771
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pd->pcksum,
3772
ao.addr16[0], an->addr16[0], u),
3773
ao.addr16[1], an->addr16[1], u),
3774
ao.addr16[2], 0, u),
3775
ao.addr16[3], 0, u),
3776
ao.addr16[4], 0, u),
3777
ao.addr16[5], 0, u),
3778
ao.addr16[6], 0, u),
3779
ao.addr16[7], 0, u),
3780
po, pn, u);
3781
break;
3782
#endif /* INET */
3783
case AF_INET6:
3784
*pd->pcksum = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3785
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3786
pf_cksum_fixup(pf_cksum_fixup(*pd->pcksum,
3787
ao.addr16[0], an->addr16[0], u),
3788
ao.addr16[1], an->addr16[1], u),
3789
ao.addr16[2], an->addr16[2], u),
3790
ao.addr16[3], an->addr16[3], u),
3791
ao.addr16[4], an->addr16[4], u),
3792
ao.addr16[5], an->addr16[5], u),
3793
ao.addr16[6], an->addr16[6], u),
3794
ao.addr16[7], an->addr16[7], u);
3795
3796
*pd->pcksum = pf_proto_cksum_fixup(pd->m, *pd->pcksum, po, pn, u);
3797
break;
3798
default:
3799
unhandled_af(pd->naf);
3800
}
3801
break;
3802
#endif /* INET6 */
3803
default:
3804
unhandled_af(pd->af);
3805
}
3806
3807
if (pd->m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
3808
CSUM_DELAY_DATA_IPV6)) {
3809
*pd->pcksum = ~*pd->pcksum;
3810
if (! *pd->pcksum)
3811
*pd->pcksum = 0xffff;
3812
}
3813
}
3814
3815
/* Changes a u_int32_t. Uses a void * so there are no align restrictions */
3816
void
3817
pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
3818
{
3819
u_int32_t ao;
3820
3821
memcpy(&ao, a, sizeof(ao));
3822
memcpy(a, &an, sizeof(u_int32_t));
3823
*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
3824
ao % 65536, an % 65536, u);
3825
}
3826
3827
void
3828
pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
3829
{
3830
u_int32_t ao;
3831
3832
memcpy(&ao, a, sizeof(ao));
3833
memcpy(a, &an, sizeof(u_int32_t));
3834
3835
*c = pf_proto_cksum_fixup(m,
3836
pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp),
3837
ao % 65536, an % 65536, udp);
3838
}
3839
3840
#ifdef INET6
3841
static void
3842
pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
3843
{
3844
struct pf_addr ao;
3845
3846
pf_addrcpy(&ao, a, AF_INET6);
3847
pf_addrcpy(a, an, AF_INET6);
3848
3849
*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3850
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3851
pf_cksum_fixup(pf_cksum_fixup(*c,
3852
ao.addr16[0], an->addr16[0], u),
3853
ao.addr16[1], an->addr16[1], u),
3854
ao.addr16[2], an->addr16[2], u),
3855
ao.addr16[3], an->addr16[3], u),
3856
ao.addr16[4], an->addr16[4], u),
3857
ao.addr16[5], an->addr16[5], u),
3858
ao.addr16[6], an->addr16[6], u),
3859
ao.addr16[7], an->addr16[7], u);
3860
}
3861
#endif /* INET6 */
3862
3863
static void
3864
pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
3865
struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
3866
u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
3867
{
3868
struct pf_addr oia, ooa;
3869
3870
pf_addrcpy(&oia, ia, af);
3871
if (oa)
3872
pf_addrcpy(&ooa, oa, af);
3873
3874
/* Change inner protocol port, fix inner protocol checksum. */
3875
if (ip != NULL) {
3876
u_int16_t oip = *ip;
3877
u_int16_t opc;
3878
3879
if (pc != NULL)
3880
opc = *pc;
3881
*ip = np;
3882
if (pc != NULL)
3883
*pc = pf_cksum_fixup(*pc, oip, *ip, u);
3884
*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
3885
if (pc != NULL)
3886
*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
3887
}
3888
/* Change inner ip address, fix inner ip and icmp checksums. */
3889
pf_addrcpy(ia, na, af);
3890
switch (af) {
3891
#ifdef INET
3892
case AF_INET: {
3893
u_int16_t oh2c = *h2c;
3894
3895
*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
3896
oia.addr16[0], ia->addr16[0], 0),
3897
oia.addr16[1], ia->addr16[1], 0);
3898
*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
3899
oia.addr16[0], ia->addr16[0], 0),
3900
oia.addr16[1], ia->addr16[1], 0);
3901
*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
3902
break;
3903
}
3904
#endif /* INET */
3905
#ifdef INET6
3906
case AF_INET6:
3907
*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3908
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3909
pf_cksum_fixup(pf_cksum_fixup(*ic,
3910
oia.addr16[0], ia->addr16[0], u),
3911
oia.addr16[1], ia->addr16[1], u),
3912
oia.addr16[2], ia->addr16[2], u),
3913
oia.addr16[3], ia->addr16[3], u),
3914
oia.addr16[4], ia->addr16[4], u),
3915
oia.addr16[5], ia->addr16[5], u),
3916
oia.addr16[6], ia->addr16[6], u),
3917
oia.addr16[7], ia->addr16[7], u);
3918
break;
3919
#endif /* INET6 */
3920
}
3921
/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
3922
if (oa) {
3923
pf_addrcpy(oa, na, af);
3924
switch (af) {
3925
#ifdef INET
3926
case AF_INET:
3927
*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
3928
ooa.addr16[0], oa->addr16[0], 0),
3929
ooa.addr16[1], oa->addr16[1], 0);
3930
break;
3931
#endif /* INET */
3932
#ifdef INET6
3933
case AF_INET6:
3934
*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3935
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
3936
pf_cksum_fixup(pf_cksum_fixup(*ic,
3937
ooa.addr16[0], oa->addr16[0], u),
3938
ooa.addr16[1], oa->addr16[1], u),
3939
ooa.addr16[2], oa->addr16[2], u),
3940
ooa.addr16[3], oa->addr16[3], u),
3941
ooa.addr16[4], oa->addr16[4], u),
3942
ooa.addr16[5], oa->addr16[5], u),
3943
ooa.addr16[6], oa->addr16[6], u),
3944
ooa.addr16[7], oa->addr16[7], u);
3945
break;
3946
#endif /* INET6 */
3947
}
3948
}
3949
}
3950
3951
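/*
 * Rewrite the outer IP header of a packet to the other address family
 * (af-to translation): strip the old header, prepend the new one, adjust
 * checksum offload flags and recalculate the ICMP/ICMP6 checksum.
 * Returns 0 on success, -1 on failure.
 */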
static int
3952
pf_translate_af(struct pf_pdesc *pd, struct pf_krule *r)
3953
{
3954
#if defined(INET) && defined(INET6)
3955
struct mbuf *mp;
3956
struct ip *ip4;
3957
struct ip6_hdr *ip6;
3958
struct icmp6_hdr *icmp;
3959
struct m_tag *mtag;
3960
struct pf_fragment_tag *ftag;
3961
int hlen;
3962
3963
if (pd->ttl == 1) {
3964
/* We'd generate an ICMP error. Do so now rather than after af translation. */
3965
if (pd->af == AF_INET) {
3966
pf_send_icmp(pd->m, ICMP_TIMXCEED,
3967
ICMP_TIMXCEED_INTRANS, 0, pd->af, r,
3968
pd->act.rtableid);
3969
} else {
3970
pf_send_icmp(pd->m, ICMP6_TIME_EXCEEDED,
3971
ICMP6_TIME_EXCEED_TRANSIT, 0, pd->af, r,
3972
pd->act.rtableid);
3973
}
3974
3975
return (-1);
3976
}
3977
3978
hlen = pd->naf == AF_INET ? sizeof(*ip4) : sizeof(*ip6);
3979
3980
/* trim the old header */
3981
m_adj(pd->m, pd->off);
3982
3983
/* prepend a new one */
3984
M_PREPEND(pd->m, hlen, M_NOWAIT);
3985
if (pd->m == NULL)
3986
return (-1);
3987
3988
switch (pd->naf) {
3989
case AF_INET:
3990
ip4 = mtod(pd->m, struct ip *);
3991
bzero(ip4, hlen);
3992
ip4->ip_v = IPVERSION;
3993
ip4->ip_hl = hlen >> 2;
3994
ip4->ip_tos = pd->tos;
3995
ip4->ip_len = htons(hlen + (pd->tot_len - pd->off));
3996
ip_fillid(ip4, V_ip_random_id);
3997
ip4->ip_ttl = pd->ttl;
3998
ip4->ip_p = pd->proto;
3999
ip4->ip_src = pd->nsaddr.v4;
4000
ip4->ip_dst = pd->ndaddr.v4;
4001
pd->src = (struct pf_addr *)&ip4->ip_src;
4002
pd->dst = (struct pf_addr *)&ip4->ip_dst;
4003
pd->off = sizeof(struct ip);
4004
if (pd->m->m_pkthdr.csum_flags & CSUM_TCP_IPV6) {
4005
pd->m->m_pkthdr.csum_flags &= ~CSUM_TCP_IPV6;
4006
pd->m->m_pkthdr.csum_flags |= CSUM_TCP;
4007
}
4008
if (pd->m->m_pkthdr.csum_flags & CSUM_UDP_IPV6) {
4009
pd->m->m_pkthdr.csum_flags &= ~CSUM_UDP_IPV6;
4010
pd->m->m_pkthdr.csum_flags |= CSUM_UDP;
4011
}
4012
if (pd->m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
4013
pd->m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
4014
pd->m->m_pkthdr.csum_flags |= CSUM_SCTP;
4015
}
4016
break;
4017
case AF_INET6:
4018
ip6 = mtod(pd->m, struct ip6_hdr *);
4019
bzero(ip6, hlen);
4020
ip6->ip6_vfc = IPV6_VERSION;
4021
ip6->ip6_flow |= htonl((u_int32_t)pd->tos << 20);
4022
ip6->ip6_plen = htons(pd->tot_len - pd->off);
4023
ip6->ip6_nxt = pd->proto;
4024
if (!pd->ttl || pd->ttl > IPV6_DEFHLIM)
4025
ip6->ip6_hlim = IPV6_DEFHLIM;
4026
else
4027
ip6->ip6_hlim = pd->ttl;
4028
ip6->ip6_src = pd->nsaddr.v6;
4029
ip6->ip6_dst = pd->ndaddr.v6;
4030
pd->src = (struct pf_addr *)&ip6->ip6_src;
4031
pd->dst = (struct pf_addr *)&ip6->ip6_dst;
4032
pd->off = sizeof(struct ip6_hdr);
4033
if (pd->m->m_pkthdr.csum_flags & CSUM_TCP) {
4034
pd->m->m_pkthdr.csum_flags &= ~CSUM_TCP;
4035
pd->m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
4036
}
4037
if (pd->m->m_pkthdr.csum_flags & CSUM_UDP) {
4038
pd->m->m_pkthdr.csum_flags &= ~CSUM_UDP;
4039
pd->m->m_pkthdr.csum_flags |= CSUM_UDP_IPV6;
4040
}
4041
if (pd->m->m_pkthdr.csum_flags & CSUM_SCTP) {
4042
pd->m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
4043
pd->m->m_pkthdr.csum_flags |= CSUM_SCTP_IPV6;
4044
}
4045
4046
/*
4047
* If we're dealing with a reassembled packet we need to adjust
4048
* the header length from the IPv4 header size to IPv6 header
4049
* size.
4050
*/
4051
mtag = m_tag_find(pd->m, PACKET_TAG_PF_REASSEMBLED, NULL);
4052
if (mtag) {
4053
ftag = (struct pf_fragment_tag *)(mtag + 1);
4054
ftag->ft_hdrlen = sizeof(*ip6);
4055
ftag->ft_maxlen -= sizeof(struct ip6_hdr) -
4056
sizeof(struct ip) + sizeof(struct ip6_frag);
4057
}
4058
break;
4059
default:
4060
return (-1);
4061
}
4062
4063
/* recalculate icmp/icmp6 checksums */
4064
if (pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) {
4065
int off;
4066
if ((mp = m_pulldown(pd->m, hlen, sizeof(*icmp), &off)) ==
4067
NULL) {
4068
pd->m = NULL;
4069
return (-1);
4070
}
4071
icmp = (struct icmp6_hdr *)(mp->m_data + off);
4072
icmp->icmp6_cksum = 0;
4073
icmp->icmp6_cksum = pd->naf == AF_INET ?
4074
in4_cksum(pd->m, 0, hlen, ntohs(ip4->ip_len) - hlen) :
4075
in6_cksum(pd->m, IPPROTO_ICMPV6, hlen,
4076
ntohs(ip6->ip6_plen));
4077
}
4078
#endif /* INET && INET6 */
4079
4080
return (0);
4081
}
4082
4083
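/*
 * Translate the inner IP header carried inside an ICMP error message to the
 * other address family and splice it back into the packet.
 */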
int
4084
pf_change_icmp_af(struct mbuf *m, int off, struct pf_pdesc *pd,
4085
struct pf_pdesc *pd2, struct pf_addr *src, struct pf_addr *dst,
4086
sa_family_t af, sa_family_t naf)
4087
{
4088
#if defined(INET) && defined(INET6)
4089
struct mbuf *n = NULL;
4090
struct ip *ip4;
4091
struct ip6_hdr *ip6;
4092
int hlen, olen, mlen;
4093
4094
if (af == naf || (af != AF_INET && af != AF_INET6) ||
4095
(naf != AF_INET && naf != AF_INET6))
4096
return (-1);
4097
4098
/* split the mbuf chain on the inner ip/ip6 header boundary */
4099
if ((n = m_split(m, off, M_NOWAIT)) == NULL)
4100
return (-1);
4101
4102
/* old header */
4103
olen = pd2->off - off;
4104
/* new header */
4105
hlen = naf == AF_INET ? sizeof(*ip4) : sizeof(*ip6);
4106
4107
/* trim old header */
4108
m_adj(n, olen);
4109
4110
/* prepend a new one */
4111
M_PREPEND(n, hlen, M_NOWAIT);
4112
if (n == NULL)
4113
return (-1);
4114
4115
/* translate inner ip/ip6 header */
4116
switch (naf) {
4117
case AF_INET:
4118
ip4 = mtod(n, struct ip *);
4119
bzero(ip4, sizeof(*ip4));
4120
ip4->ip_v = IPVERSION;
4121
ip4->ip_hl = sizeof(*ip4) >> 2;
4122
ip4->ip_len = htons(sizeof(*ip4) + pd2->tot_len - olen);
4123
ip_fillid(ip4, V_ip_random_id);
4124
ip4->ip_off = htons(IP_DF);
4125
ip4->ip_ttl = pd2->ttl;
4126
if (pd2->proto == IPPROTO_ICMPV6)
4127
ip4->ip_p = IPPROTO_ICMP;
4128
else
4129
ip4->ip_p = pd2->proto;
4130
ip4->ip_src = src->v4;
4131
ip4->ip_dst = dst->v4;
4132
ip4->ip_sum = in_cksum(n, ip4->ip_hl << 2);
4133
break;
4134
case AF_INET6:
4135
ip6 = mtod(n, struct ip6_hdr *);
4136
bzero(ip6, sizeof(*ip6));
4137
ip6->ip6_vfc = IPV6_VERSION;
4138
ip6->ip6_plen = htons(pd2->tot_len - olen);
4139
if (pd2->proto == IPPROTO_ICMP)
4140
ip6->ip6_nxt = IPPROTO_ICMPV6;
4141
else
4142
ip6->ip6_nxt = pd2->proto;
4143
if (!pd2->ttl || pd2->ttl > IPV6_DEFHLIM)
4144
ip6->ip6_hlim = IPV6_DEFHLIM;
4145
else
4146
ip6->ip6_hlim = pd2->ttl;
4147
ip6->ip6_src = src->v6;
4148
ip6->ip6_dst = dst->v6;
4149
break;
4150
default:
4151
unhandled_af(naf);
4152
}
4153
4154
/* adjust payload offset and total packet length */
4155
pd2->off += hlen - olen;
4156
pd->tot_len += hlen - olen;
4157
4158
/* merge modified inner packet with the original header */
4159
mlen = n->m_pkthdr.len;
4160
m_cat(m, n);
4161
m->m_pkthdr.len += mlen;
4162
#endif /* INET && INET6 */
4163
4164
return (0);
4165
}
4166
4167
#define PTR_IP(field) (offsetof(struct ip, field))
4168
#define PTR_IP6(field) (offsetof(struct ip6_hdr, field))
4169
4170
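/*
 * Map ICMP type, code, MTU and pointer values between ICMPv4 and ICMPv6,
 * fixing up the checksum for each field that changes. Returns -1 for
 * messages that have no equivalent in the other family.
 */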
int
4171
pf_translate_icmp_af(int af, void *arg)
4172
{
4173
#if defined(INET) && defined(INET6)
4174
struct icmp *icmp4;
4175
struct icmp6_hdr *icmp6;
4176
u_int32_t mtu;
4177
int32_t ptr = -1;
4178
u_int8_t type;
4179
u_int8_t code;
4180
4181
switch (af) {
4182
case AF_INET:
4183
icmp6 = arg;
4184
type = icmp6->icmp6_type;
4185
code = icmp6->icmp6_code;
4186
mtu = ntohl(icmp6->icmp6_mtu);
4187
4188
switch (type) {
4189
case ICMP6_ECHO_REQUEST:
4190
type = ICMP_ECHO;
4191
break;
4192
case ICMP6_ECHO_REPLY:
4193
type = ICMP_ECHOREPLY;
4194
break;
4195
case ICMP6_DST_UNREACH:
4196
type = ICMP_UNREACH;
4197
switch (code) {
4198
case ICMP6_DST_UNREACH_NOROUTE:
4199
case ICMP6_DST_UNREACH_BEYONDSCOPE:
4200
case ICMP6_DST_UNREACH_ADDR:
4201
code = ICMP_UNREACH_HOST;
4202
break;
4203
case ICMP6_DST_UNREACH_ADMIN:
4204
code = ICMP_UNREACH_HOST_PROHIB;
4205
break;
4206
case ICMP6_DST_UNREACH_NOPORT:
4207
code = ICMP_UNREACH_PORT;
4208
break;
4209
default:
4210
return (-1);
4211
}
4212
break;
4213
case ICMP6_PACKET_TOO_BIG:
4214
type = ICMP_UNREACH;
4215
code = ICMP_UNREACH_NEEDFRAG;
4216
mtu -= 20;
4217
break;
4218
case ICMP6_TIME_EXCEEDED:
4219
type = ICMP_TIMXCEED;
4220
break;
4221
case ICMP6_PARAM_PROB:
4222
switch (code) {
4223
case ICMP6_PARAMPROB_HEADER:
4224
type = ICMP_PARAMPROB;
4225
code = ICMP_PARAMPROB_ERRATPTR;
4226
ptr = ntohl(icmp6->icmp6_pptr);
4227
4228
if (ptr == PTR_IP6(ip6_vfc))
4229
; /* preserve */
4230
else if (ptr == PTR_IP6(ip6_vfc) + 1)
4231
ptr = PTR_IP(ip_tos);
4232
else if (ptr == PTR_IP6(ip6_plen) ||
4233
ptr == PTR_IP6(ip6_plen) + 1)
4234
ptr = PTR_IP(ip_len);
4235
else if (ptr == PTR_IP6(ip6_nxt))
4236
ptr = PTR_IP(ip_p);
4237
else if (ptr == PTR_IP6(ip6_hlim))
4238
ptr = PTR_IP(ip_ttl);
4239
else if (ptr >= PTR_IP6(ip6_src) &&
4240
ptr < PTR_IP6(ip6_dst))
4241
ptr = PTR_IP(ip_src);
4242
else if (ptr >= PTR_IP6(ip6_dst) &&
4243
ptr < sizeof(struct ip6_hdr))
4244
ptr = PTR_IP(ip_dst);
4245
else {
4246
return (-1);
4247
}
4248
break;
4249
case ICMP6_PARAMPROB_NEXTHEADER:
4250
type = ICMP_UNREACH;
4251
code = ICMP_UNREACH_PROTOCOL;
4252
break;
4253
default:
4254
return (-1);
4255
}
4256
break;
4257
default:
4258
return (-1);
4259
}
4260
if (icmp6->icmp6_type != type) {
4261
icmp6->icmp6_cksum = pf_cksum_fixup(icmp6->icmp6_cksum,
4262
icmp6->icmp6_type, type, 0);
4263
icmp6->icmp6_type = type;
4264
}
4265
if (icmp6->icmp6_code != code) {
4266
icmp6->icmp6_cksum = pf_cksum_fixup(icmp6->icmp6_cksum,
4267
icmp6->icmp6_code, code, 0);
4268
icmp6->icmp6_code = code;
4269
}
4270
if (icmp6->icmp6_mtu != htonl(mtu)) {
4271
icmp6->icmp6_cksum = pf_cksum_fixup(icmp6->icmp6_cksum,
4272
htons(ntohl(icmp6->icmp6_mtu)), htons(mtu), 0);
4273
/* aligns well with an icmpv4 nextmtu */
4274
icmp6->icmp6_mtu = htonl(mtu);
4275
}
4276
if (ptr >= 0 && icmp6->icmp6_pptr != htonl(ptr)) {
4277
icmp6->icmp6_cksum = pf_cksum_fixup(icmp6->icmp6_cksum,
4278
htons(ntohl(icmp6->icmp6_pptr)), htons(ptr), 0);
4279
/* icmpv4 pptr is a single byte, carried in the most significant byte */
4280
icmp6->icmp6_pptr = htonl(ptr << 24);
4281
}
4282
break;
4283
case AF_INET6:
4284
icmp4 = arg;
4285
type = icmp4->icmp_type;
4286
code = icmp4->icmp_code;
4287
mtu = ntohs(icmp4->icmp_nextmtu);
4288
4289
switch (type) {
4290
case ICMP_ECHO:
4291
type = ICMP6_ECHO_REQUEST;
4292
break;
4293
case ICMP_ECHOREPLY:
4294
type = ICMP6_ECHO_REPLY;
4295
break;
4296
case ICMP_UNREACH:
4297
type = ICMP6_DST_UNREACH;
4298
switch (code) {
4299
case ICMP_UNREACH_NET:
4300
case ICMP_UNREACH_HOST:
4301
case ICMP_UNREACH_NET_UNKNOWN:
4302
case ICMP_UNREACH_HOST_UNKNOWN:
4303
case ICMP_UNREACH_ISOLATED:
4304
case ICMP_UNREACH_TOSNET:
4305
case ICMP_UNREACH_TOSHOST:
4306
code = ICMP6_DST_UNREACH_NOROUTE;
4307
break;
4308
case ICMP_UNREACH_PORT:
4309
code = ICMP6_DST_UNREACH_NOPORT;
4310
break;
4311
case ICMP_UNREACH_NET_PROHIB:
4312
case ICMP_UNREACH_HOST_PROHIB:
4313
case ICMP_UNREACH_FILTER_PROHIB:
4314
case ICMP_UNREACH_PRECEDENCE_CUTOFF:
4315
code = ICMP6_DST_UNREACH_ADMIN;
4316
break;
4317
case ICMP_UNREACH_PROTOCOL:
4318
type = ICMP6_PARAM_PROB;
4319
code = ICMP6_PARAMPROB_NEXTHEADER;
4320
ptr = offsetof(struct ip6_hdr, ip6_nxt);
4321
break;
4322
case ICMP_UNREACH_NEEDFRAG:
4323
type = ICMP6_PACKET_TOO_BIG;
4324
code = 0;
4325
mtu += 20;
4326
break;
4327
default:
4328
return (-1);
4329
}
4330
break;
4331
case ICMP_TIMXCEED:
4332
type = ICMP6_TIME_EXCEEDED;
4333
break;
4334
case ICMP_PARAMPROB:
4335
type = ICMP6_PARAM_PROB;
4336
switch (code) {
4337
case ICMP_PARAMPROB_ERRATPTR:
4338
code = ICMP6_PARAMPROB_HEADER;
4339
break;
4340
case ICMP_PARAMPROB_LENGTH:
4341
code = ICMP6_PARAMPROB_HEADER;
4342
break;
4343
default:
4344
return (-1);
4345
}
4346
4347
ptr = icmp4->icmp_pptr;
4348
if (ptr == 0 || ptr == PTR_IP(ip_tos))
4349
; /* preserve */
4350
else if (ptr == PTR_IP(ip_len) ||
4351
ptr == PTR_IP(ip_len) + 1)
4352
ptr = PTR_IP6(ip6_plen);
4353
else if (ptr == PTR_IP(ip_ttl))
4354
ptr = PTR_IP6(ip6_hlim);
4355
else if (ptr == PTR_IP(ip_p))
4356
ptr = PTR_IP6(ip6_nxt);
4357
else if (ptr >= PTR_IP(ip_src) && ptr < PTR_IP(ip_dst))
4358
ptr = PTR_IP6(ip6_src);
4359
else if (ptr >= PTR_IP(ip_dst) &&
4360
ptr < sizeof(struct ip))
4361
ptr = PTR_IP6(ip6_dst);
4362
else {
4363
return (-1);
4364
}
4365
break;
4366
default:
4367
return (-1);
4368
}
4369
if (icmp4->icmp_type != type) {
4370
icmp4->icmp_cksum = pf_cksum_fixup(icmp4->icmp_cksum,
4371
icmp4->icmp_type, type, 0);
4372
icmp4->icmp_type = type;
4373
}
4374
if (icmp4->icmp_code != code) {
4375
icmp4->icmp_cksum = pf_cksum_fixup(icmp4->icmp_cksum,
4376
icmp4->icmp_code, code, 0);
4377
icmp4->icmp_code = code;
4378
}
4379
if (icmp4->icmp_nextmtu != htons(mtu)) {
4380
icmp4->icmp_cksum = pf_cksum_fixup(icmp4->icmp_cksum,
4381
icmp4->icmp_nextmtu, htons(mtu), 0);
4382
icmp4->icmp_nextmtu = htons(mtu);
4383
}
4384
if (ptr >= 0 && icmp4->icmp_void != ptr) {
4385
icmp4->icmp_cksum = pf_cksum_fixup(icmp4->icmp_cksum,
4386
htons(icmp4->icmp_pptr), htons(ptr), 0);
4387
icmp4->icmp_void = htonl(ptr);
4388
}
4389
break;
4390
default:
4391
unhandled_af(af);
4392
}
4393
#endif /* INET && INET6 */
4394
4395
return (0);
4396
}
4397
4398
/*
4399
* Need to modulate the sequence numbers in the TCP SACK option
4400
* (credits to Krzysztof Pfaff for report and patch)
4401
*/
4402
static int
4403
pf_modulate_sack(struct pf_pdesc *pd, struct tcphdr *th,
4404
struct pf_state_peer *dst)
4405
{
4406
struct sackblk sack;
4407
int copyback = 0, i;
4408
int olen, optsoff;
4409
uint8_t opts[MAX_TCPOPTLEN], *opt, *eoh;
4410
4411
olen = (pd->hdr.tcp.th_off << 2) - sizeof(struct tcphdr);
4412
optsoff = pd->off + sizeof(struct tcphdr);
4413
#define TCPOLEN_MINSACK (TCPOLEN_SACK + 2)
4414
if (olen < TCPOLEN_MINSACK ||
4415
!pf_pull_hdr(pd->m, optsoff, opts, olen, NULL, pd->af))
4416
return (0);
4417
4418
eoh = opts + olen;
4419
opt = opts;
4420
while ((opt = pf_find_tcpopt(opt, opts, olen,
4421
TCPOPT_SACK, TCPOLEN_MINSACK)) != NULL)
4422
{
4423
size_t safelen = MIN(opt[1], (eoh - opt));
4424
for (i = 2; i + TCPOLEN_SACK <= safelen; i += TCPOLEN_SACK) {
4425
size_t startoff = (opt + i) - opts;
4426
memcpy(&sack, &opt[i], sizeof(sack));
4427
pf_patch_32(pd, &sack.start,
4428
htonl(ntohl(sack.start) - dst->seqdiff),
4429
PF_ALGNMNT(startoff));
4430
pf_patch_32(pd, &sack.end,
4431
htonl(ntohl(sack.end) - dst->seqdiff),
4432
PF_ALGNMNT(startoff + sizeof(sack.start)));
4433
memcpy(&opt[i], &sack, sizeof(sack));
4434
}
4435
copyback = 1;
4436
opt += opt[1];
4437
}
4438
4439
if (copyback)
4440
m_copyback(pd->m, optsoff, olen, (caddr_t)opts);
4441
4442
return (copyback);
4443
}
4444
4445
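/*
 * Allocate an mbuf and construct a TCP segment with the given addresses,
 * ports, sequence numbers, flags and optional MSS and SACK-permitted options.
 */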
struct mbuf *
4446
pf_build_tcp(const struct pf_krule *r, sa_family_t af,
4447
const struct pf_addr *saddr, const struct pf_addr *daddr,
4448
u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
4449
u_int8_t tcp_flags, u_int16_t win, u_int16_t mss, u_int8_t ttl,
4450
int mbuf_flags, u_int16_t mtag_tag, u_int16_t mtag_flags, u_int sack,
4451
int rtableid, u_short *reason)
4452
{
4453
struct mbuf *m;
4454
int len, tlen;
4455
#ifdef INET
4456
struct ip *h = NULL;
4457
#endif /* INET */
4458
#ifdef INET6
4459
struct ip6_hdr *h6 = NULL;
4460
#endif /* INET6 */
4461
struct tcphdr *th;
4462
char *opt;
4463
struct pf_mtag *pf_mtag;
4464
4465
len = 0;
4466
th = NULL;
4467
4468
/* maximum segment size tcp option */
4469
tlen = sizeof(struct tcphdr);
4470
if (mss)
4471
tlen += 4;
4472
if (sack)
4473
tlen += 2;
4474
4475
switch (af) {
4476
#ifdef INET
4477
case AF_INET:
4478
len = sizeof(struct ip) + tlen;
4479
break;
4480
#endif /* INET */
4481
#ifdef INET6
4482
case AF_INET6:
4483
len = sizeof(struct ip6_hdr) + tlen;
4484
break;
4485
#endif /* INET6 */
4486
default:
4487
unhandled_af(af);
4488
}
4489
4490
m = m_gethdr(M_NOWAIT, MT_DATA);
4491
if (m == NULL) {
4492
REASON_SET(reason, PFRES_MEMORY);
4493
return (NULL);
4494
}
4495
4496
#ifdef MAC
4497
mac_netinet_firewall_send(m);
4498
#endif
4499
if ((pf_mtag = pf_get_mtag(m)) == NULL) {
4500
REASON_SET(reason, PFRES_MEMORY);
4501
m_freem(m);
4502
return (NULL);
4503
}
4504
m->m_flags |= mbuf_flags;
4505
pf_mtag->tag = mtag_tag;
4506
pf_mtag->flags = mtag_flags;
4507
4508
if (rtableid >= 0)
4509
M_SETFIB(m, rtableid);
4510
4511
#ifdef ALTQ
4512
if (r != NULL && r->qid) {
4513
pf_mtag->qid = r->qid;
4514
4515
/* add hints for ecn */
4516
pf_mtag->hdr = mtod(m, struct ip *);
4517
}
4518
#endif /* ALTQ */
4519
m->m_data += max_linkhdr;
4520
m->m_pkthdr.len = m->m_len = len;
4521
/* The rest of the stack assumes a rcvif, so provide one.
4522
* This is a locally generated packet, so .. close enough. */
4523
m->m_pkthdr.rcvif = V_loif;
4524
bzero(m->m_data, len);
4525
switch (af) {
4526
#ifdef INET
4527
case AF_INET:
4528
m->m_pkthdr.csum_flags |= CSUM_TCP;
4529
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
4530
4531
h = mtod(m, struct ip *);
4532
4533
h->ip_p = IPPROTO_TCP;
4534
h->ip_len = htons(tlen);
4535
h->ip_v = 4;
4536
h->ip_hl = sizeof(*h) >> 2;
4537
h->ip_tos = IPTOS_LOWDELAY;
4538
h->ip_len = htons(len);
4539
h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
4540
h->ip_ttl = ttl ? ttl : V_ip_defttl;
4541
h->ip_sum = 0;
4542
h->ip_src.s_addr = saddr->v4.s_addr;
4543
h->ip_dst.s_addr = daddr->v4.s_addr;
4544
4545
th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
4546
th->th_sum = in_pseudo(h->ip_src.s_addr, h->ip_dst.s_addr,
4547
htons(len - sizeof(struct ip) + IPPROTO_TCP));
4548
break;
4549
#endif /* INET */
4550
#ifdef INET6
4551
case AF_INET6:
4552
m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
4553
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
4554
4555
h6 = mtod(m, struct ip6_hdr *);
4556
4557
/* IP header fields included in the TCP checksum */
4558
h6->ip6_nxt = IPPROTO_TCP;
4559
h6->ip6_plen = htons(tlen);
4560
h6->ip6_vfc |= IPV6_VERSION;
4561
h6->ip6_hlim = V_ip6_defhlim;
4562
memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
4563
memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
4564
4565
th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
4566
th->th_sum = in6_cksum_pseudo(h6, len - sizeof(struct ip6_hdr),
4567
IPPROTO_TCP, 0);
4568
break;
4569
#endif /* INET6 */
4570
}
4571
4572
/* TCP header */
4573
th->th_sport = sport;
4574
th->th_dport = dport;
4575
th->th_seq = htonl(seq);
4576
th->th_ack = htonl(ack);
4577
th->th_off = tlen >> 2;
4578
tcp_set_flags(th, tcp_flags);
4579
th->th_win = htons(win);
4580
4581
opt = (char *)(th + 1);
4582
if (mss) {
4583
opt = (char *)(th + 1);
4584
opt[0] = TCPOPT_MAXSEG;
4585
opt[1] = 4;
4586
mss = htons(mss);
4587
memcpy((opt + 2), &mss, 2);
4588
opt += 4;
4589
}
4590
if (sack) {
4591
opt[0] = TCPOPT_SACK_PERMITTED;
4592
opt[1] = 2;
4593
opt += 2;
4594
}
4595
4596
return (m);
4597
}
4598
4599
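/* Build and transmit an SCTP ABORT in reply to the packet described by pd */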
static void
4600
pf_send_sctp_abort(sa_family_t af, struct pf_pdesc *pd,
4601
uint8_t ttl, int rtableid)
4602
{
4603
struct mbuf *m;
4604
#ifdef INET
4605
struct ip *h = NULL;
4606
#endif /* INET */
4607
#ifdef INET6
4608
struct ip6_hdr *h6 = NULL;
4609
#endif /* INET6 */
4610
struct sctphdr *hdr;
4611
struct sctp_chunkhdr *chunk;
4612
struct pf_send_entry *pfse;
4613
int off = 0;
4614
4615
MPASS(af == pd->af);
4616
4617
m = m_gethdr(M_NOWAIT, MT_DATA);
4618
if (m == NULL)
4619
return;
4620
4621
m->m_data += max_linkhdr;
4622
m->m_flags |= M_SKIP_FIREWALL;
4623
/* The rest of the stack assumes a rcvif, so provide one.
4624
* This is a locally generated packet, so .. close enough. */
4625
m->m_pkthdr.rcvif = V_loif;
4626
4627
/* IPv4|6 header */
4628
switch (af) {
4629
#ifdef INET
4630
case AF_INET:
4631
bzero(m->m_data, sizeof(struct ip) + sizeof(*hdr) + sizeof(*chunk));
4632
4633
h = mtod(m, struct ip *);
4634
4635
/* IPv4 header */
4636
4637
h->ip_p = IPPROTO_SCTP;
4638
h->ip_len = htons(sizeof(*h) + sizeof(*hdr) + sizeof(*chunk));
4639
h->ip_ttl = ttl ? ttl : V_ip_defttl;
4640
h->ip_src = pd->dst->v4;
4641
h->ip_dst = pd->src->v4;
4642
4643
off += sizeof(struct ip);
4644
break;
4645
#endif /* INET */
4646
#ifdef INET6
4647
case AF_INET6:
4648
bzero(m->m_data, sizeof(struct ip6_hdr) + sizeof(*hdr) + sizeof(*chunk));
4649
4650
h6 = mtod(m, struct ip6_hdr *);
4651
4652
/* IPv6 header */
4653
h6->ip6_vfc |= IPV6_VERSION;
4654
h6->ip6_nxt = IPPROTO_SCTP;
4655
h6->ip6_plen = htons(sizeof(*h6) + sizeof(*hdr) + sizeof(*chunk));
4656
h6->ip6_hlim = ttl ? ttl : V_ip6_defhlim;
4657
memcpy(&h6->ip6_src, &pd->dst->v6, sizeof(struct in6_addr));
4658
memcpy(&h6->ip6_dst, &pd->src->v6, sizeof(struct in6_addr));
4659
4660
off += sizeof(struct ip6_hdr);
4661
break;
4662
#endif /* INET6 */
4663
default:
4664
unhandled_af(af);
4665
}
4666
4667
/* SCTP header */
4668
hdr = mtodo(m, off);
4669
4670
hdr->src_port = pd->hdr.sctp.dest_port;
4671
hdr->dest_port = pd->hdr.sctp.src_port;
4672
hdr->v_tag = pd->sctp_initiate_tag;
4673
hdr->checksum = 0;
4674
4675
/* Abort chunk. */
4676
off += sizeof(struct sctphdr);
4677
chunk = mtodo(m, off);
4678
4679
chunk->chunk_type = SCTP_ABORT_ASSOCIATION;
4680
chunk->chunk_length = htons(sizeof(*chunk));
4681
4682
/* SCTP checksum */
4683
off += sizeof(*chunk);
4684
m->m_pkthdr.len = m->m_len = off;
4685
4686
pf_sctp_checksum(m, off - sizeof(*hdr) - sizeof(*chunk));
4687
4688
if (rtableid >= 0)
4689
M_SETFIB(m, rtableid);
4690
4691
/* Allocate outgoing queue entry, mbuf and mbuf tag. */
4692
pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
4693
if (pfse == NULL) {
4694
m_freem(m);
4695
return;
4696
}
4697
4698
switch (af) {
4699
#ifdef INET
4700
case AF_INET:
4701
pfse->pfse_type = PFSE_IP;
4702
break;
4703
#endif /* INET */
4704
#ifdef INET6
4705
case AF_INET6:
4706
pfse->pfse_type = PFSE_IP6;
4707
break;
4708
#endif /* INET6 */
4709
}
4710
4711
pfse->pfse_m = m;
4712
pf_send(pfse);
4713
}
4714
4715
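/* Build a TCP segment via pf_build_tcp() and queue it for transmission */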
void
4716
pf_send_tcp(const struct pf_krule *r, sa_family_t af,
4717
const struct pf_addr *saddr, const struct pf_addr *daddr,
4718
u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
4719
u_int8_t tcp_flags, u_int16_t win, u_int16_t mss, u_int8_t ttl,
4720
int mbuf_flags, u_int16_t mtag_tag, u_int16_t mtag_flags, int rtableid,
4721
u_short *reason)
4722
{
4723
struct pf_send_entry *pfse;
4724
struct mbuf *m;
4725
4726
m = pf_build_tcp(r, af, saddr, daddr, sport, dport, seq, ack, tcp_flags,
4727
win, mss, ttl, mbuf_flags, mtag_tag, mtag_flags, 0, rtableid, reason);
4728
if (m == NULL)
4729
return;
4730
4731
/* Allocate outgoing queue entry, mbuf and mbuf tag. */
4732
pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
4733
if (pfse == NULL) {
4734
m_freem(m);
4735
REASON_SET(reason, PFRES_MEMORY);
4736
return;
4737
}
4738
4739
switch (af) {
4740
#ifdef INET
4741
case AF_INET:
4742
pfse->pfse_type = PFSE_IP;
4743
break;
4744
#endif /* INET */
4745
#ifdef INET6
4746
case AF_INET6:
4747
pfse->pfse_type = PFSE_IP6;
4748
break;
4749
#endif /* INET6 */
4750
default:
4751
unhandled_af(af);
4752
}
4753
4754
pfse->pfse_m = m;
4755
pf_send(pfse);
4756
}
4757
4758
static void
4759
pf_undo_nat(struct pf_krule *nr, struct pf_pdesc *pd, uint16_t bip_sum)
4760
{
4761
/* undo NAT changes, if they have taken place */
4762
if (nr != NULL) {
4763
pf_addrcpy(pd->src, &pd->osrc, pd->af);
4764
pf_addrcpy(pd->dst, &pd->odst, pd->af);
4765
if (pd->sport)
4766
*pd->sport = pd->osport;
4767
if (pd->dport)
4768
*pd->dport = pd->odport;
4769
if (pd->ip_sum)
4770
*pd->ip_sum = bip_sum;
4771
m_copyback(pd->m, pd->off, pd->hdrlen, pd->hdr.any);
4772
}
4773
}
4774
4775
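/*
 * Implement the return/return-rst/return-icmp rule actions: undo any NAT
 * rewrite and answer with a TCP RST, SCTP ABORT or ICMP error, as appropriate
 * for the protocol and rule flags.
 */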
static void
4776
pf_return(struct pf_krule *r, struct pf_krule *nr, struct pf_pdesc *pd,
4777
struct tcphdr *th, u_int16_t bproto_sum, u_int16_t bip_sum,
4778
u_short *reason, int rtableid)
4779
{
4780
pf_undo_nat(nr, pd, bip_sum);
4781
4782
if (pd->proto == IPPROTO_TCP &&
4783
((r->rule_flag & PFRULE_RETURNRST) ||
4784
(r->rule_flag & PFRULE_RETURN)) &&
4785
!(tcp_get_flags(th) & TH_RST)) {
4786
u_int32_t ack = ntohl(th->th_seq) + pd->p_len;
4787
4788
if (pf_check_proto_cksum(pd->m, pd->off, pd->tot_len - pd->off,
4789
IPPROTO_TCP, pd->af))
4790
REASON_SET(reason, PFRES_PROTCKSUM);
4791
else {
4792
if (tcp_get_flags(th) & TH_SYN)
4793
ack++;
4794
if (tcp_get_flags(th) & TH_FIN)
4795
ack++;
4796
pf_send_tcp(r, pd->af, pd->dst,
4797
pd->src, th->th_dport, th->th_sport,
4798
ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
4799
r->return_ttl, M_SKIP_FIREWALL, 0, 0, rtableid,
4800
reason);
4801
}
4802
} else if (pd->proto == IPPROTO_SCTP &&
4803
(r->rule_flag & PFRULE_RETURN)) {
4804
pf_send_sctp_abort(pd->af, pd, r->return_ttl, rtableid);
4805
} else if (pd->proto != IPPROTO_ICMP && pd->af == AF_INET &&
4806
r->return_icmp)
4807
pf_send_icmp(pd->m, r->return_icmp >> 8,
4808
r->return_icmp & 255, 0, pd->af, r, rtableid);
4809
else if (pd->proto != IPPROTO_ICMPV6 && pd->af == AF_INET6 &&
4810
r->return_icmp6)
4811
pf_send_icmp(pd->m, r->return_icmp6 >> 8,
4812
r->return_icmp6 & 255, 0, pd->af, r, rtableid);
4813
}
4814
4815
static int
4816
pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
4817
{
4818
struct m_tag *mtag;
4819
u_int8_t mpcp;
4820
4821
mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
4822
if (mtag == NULL)
4823
return (0);
4824
4825
if (prio == PF_PRIO_ZERO)
4826
prio = 0;
4827
4828
mpcp = *(uint8_t *)(mtag + 1);
4829
4830
return (mpcp == prio);
4831
}
4832
4833
static int
4834
pf_icmp_to_bandlim(uint8_t type)
4835
{
4836
switch (type) {
4837
case ICMP_ECHO:
4838
case ICMP_ECHOREPLY:
4839
return (BANDLIM_ICMP_ECHO);
4840
case ICMP_TSTAMP:
4841
case ICMP_TSTAMPREPLY:
4842
return (BANDLIM_ICMP_TSTAMP);
4843
case ICMP_UNREACH:
4844
default:
4845
return (BANDLIM_ICMP_UNREACH);
4846
}
4847
}
4848
4849
static void
4850
pf_send_challenge_ack(struct pf_pdesc *pd, struct pf_kstate *s,
4851
struct pf_state_peer *src, struct pf_state_peer *dst,
4852
u_short *reason)
4853
{
4854
/*
4855
* We are sending a challenge ACK as a response to a SYN packet, which
4856
* matches an existing state (modulo the TCP window check). Therefore the
4857
* packet must be sent on behalf of the destination.
4858
*
4859
* We expect the sender to either remain silent or send an RST packet,
4860
* so that both the firewall and the remote peer can purge dead state from
4861
* memory.
4862
*/
4863
pf_send_tcp(s->rule, pd->af, pd->dst, pd->src,
4864
pd->hdr.tcp.th_dport, pd->hdr.tcp.th_sport, dst->seqlo,
4865
src->seqlo, TH_ACK, 0, 0, s->rule->return_ttl, 0, 0, 0,
4866
s->rule->rtableid, reason);
4867
}
4868
4869
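/* Rate-limit, build and queue an ICMP or ICMP6 error in response to mbuf m */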
static void
4870
pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, int mtu,
4871
sa_family_t af, struct pf_krule *r, int rtableid)
4872
{
4873
struct pf_send_entry *pfse;
4874
struct mbuf *m0;
4875
struct pf_mtag *pf_mtag;
4876
4877
/* ICMP packet rate limitation. */
4878
switch (af) {
4879
#ifdef INET6
4880
case AF_INET6:
4881
if (icmp6_ratelimit(NULL, type, code))
4882
return;
4883
break;
4884
#endif /* INET6 */
4885
#ifdef INET
4886
case AF_INET:
4887
if (badport_bandlim(pf_icmp_to_bandlim(type)) != 0)
4888
return;
4889
break;
4890
#endif /* INET */
4891
}
4892
4893
/* Allocate outgoing queue entry, mbuf and mbuf tag. */
4894
pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
4895
if (pfse == NULL)
4896
return;
4897
4898
if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
4899
free(pfse, M_PFTEMP);
4900
return;
4901
}
4902
4903
if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
4904
free(pfse, M_PFTEMP);
4905
return;
4906
}
4907
/* XXX: revisit */
4908
m0->m_flags |= M_SKIP_FIREWALL;
4909
4910
if (rtableid >= 0)
4911
M_SETFIB(m0, rtableid);
4912
4913
#ifdef ALTQ
4914
if (r->qid) {
4915
pf_mtag->qid = r->qid;
4916
/* add hints for ecn */
4917
pf_mtag->hdr = mtod(m0, struct ip *);
4918
}
4919
#endif /* ALTQ */
4920
4921
switch (af) {
4922
#ifdef INET
4923
case AF_INET:
4924
pfse->pfse_type = PFSE_ICMP;
4925
break;
4926
#endif /* INET */
4927
#ifdef INET6
4928
case AF_INET6:
4929
pfse->pfse_type = PFSE_ICMP6;
4930
break;
4931
#endif /* INET6 */
4932
}
4933
pfse->pfse_m = m0;
4934
pfse->icmpopts.type = type;
4935
pfse->icmpopts.code = code;
4936
pfse->icmpopts.mtu = mtu;
4937
pf_send(pfse);
4938
}
4939
4940
/*
4941
* Return ((n = 0) == (a = b [with mask m]))
4942
* Note: n != 0 => returns (a != b [with mask m])
4943
*/
4944
int
4945
pf_match_addr(u_int8_t n, const struct pf_addr *a, const struct pf_addr *m,
4946
const struct pf_addr *b, sa_family_t af)
4947
{
4948
switch (af) {
4949
#ifdef INET
4950
case AF_INET:
4951
if (IN_ARE_MASKED_ADDR_EQUAL(a->v4, b->v4, m->v4))
4952
return (n == 0);
4953
break;
4954
#endif /* INET */
4955
#ifdef INET6
4956
case AF_INET6:
4957
if (IN6_ARE_MASKED_ADDR_EQUAL(&a->v6, &b->v6, &m->v6))
4958
return (n == 0);
4959
break;
4960
#endif /* INET6 */
4961
}
4962
4963
return (n != 0);
4964
}
4965
4966
/*
4967
* Return 1 if b <= a <= e, otherwise return 0.
4968
*/
4969
int
4970
pf_match_addr_range(const struct pf_addr *b, const struct pf_addr *e,
4971
const struct pf_addr *a, sa_family_t af)
4972
{
4973
switch (af) {
4974
#ifdef INET
4975
case AF_INET:
4976
if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
4977
(ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
4978
return (0);
4979
break;
4980
#endif /* INET */
4981
#ifdef INET6
4982
case AF_INET6: {
4983
int i;
4984
4985
/* check a >= b */
4986
for (i = 0; i < 4; ++i)
4987
if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
4988
break;
4989
else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
4990
return (0);
4991
/* check a <= e */
4992
for (i = 0; i < 4; ++i)
4993
if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
4994
break;
4995
else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
4996
return (0);
4997
break;
4998
}
4999
#endif /* INET6 */
5000
}
5001
return (1);
5002
}
5003
5004
static int
5005
pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
5006
{
5007
switch (op) {
5008
case PF_OP_IRG:
5009
return ((p > a1) && (p < a2));
5010
case PF_OP_XRG:
5011
return ((p < a1) || (p > a2));
5012
case PF_OP_RRG:
5013
return ((p >= a1) && (p <= a2));
5014
case PF_OP_EQ:
5015
return (p == a1);
5016
case PF_OP_NE:
5017
return (p != a1);
5018
case PF_OP_LT:
5019
return (p < a1);
5020
case PF_OP_LE:
5021
return (p <= a1);
5022
case PF_OP_GT:
5023
return (p > a1);
5024
case PF_OP_GE:
5025
return (p >= a1);
5026
}
5027
return (0); /* never reached */
5028
}
5029
5030
int
5031
pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
5032
{
5033
return (pf_match(op, ntohs(a1), ntohs(a2), ntohs(p)));
5034
}
5035
5036
static int
5037
pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
5038
{
5039
if (u == -1 && op != PF_OP_EQ && op != PF_OP_NE)
5040
return (0);
5041
return (pf_match(op, a1, a2, u));
5042
}
5043
5044
static int
5045
pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
5046
{
5047
if (g == -1 && op != PF_OP_EQ && op != PF_OP_NE)
5048
return (0);
5049
return (pf_match(op, a1, a2, g));
5050
}
5051
5052
int
5053
pf_match_tag(struct mbuf *m, struct pf_krule *r, int *tag, int mtag)
5054
{
5055
if (*tag == -1)
5056
*tag = mtag;
5057
5058
return ((!r->match_tag_not && r->match_tag == *tag) ||
5059
(r->match_tag_not && r->match_tag != *tag));
5060
}
5061
5062
static int
5063
pf_match_rcvif(struct mbuf *m, struct pf_krule *r)
5064
{
5065
struct ifnet *ifp = m->m_pkthdr.rcvif;
5066
struct pfi_kkif *kif;
5067
5068
if (ifp == NULL)
5069
return (0);
5070
5071
kif = (struct pfi_kkif *)ifp->if_pf_kif;
5072
5073
if (kif == NULL) {
5074
DPFPRINTF(PF_DEBUG_URGENT,
5075
"%s: kif == NULL, @%d via %s", __func__, r->nr,
5076
r->rcv_ifname);
5077
return (0);
5078
}
5079
5080
return (pfi_kkif_match(r->rcv_kif, kif));
5081
}
5082
5083
int
5084
pf_tag_packet(struct pf_pdesc *pd, int tag)
5085
{
5086
5087
KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
5088
5089
if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(pd->m)) == NULL))
5090
return (ENOMEM);
5091
5092
pd->pf_mtag->tag = tag;
5093
5094
return (0);
5095
}
5096
5097
/*
5098
* XXX: We rely on malloc(9) returning pointer aligned addresses.
5099
*/
5100
#define PF_ANCHORSTACK_MATCH 0x00000001
5101
#define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH)
5102
5103
#define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
5104
#define PF_ANCHOR_RULE(f) (struct pf_krule *) \
5105
((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
5106
#define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \
5107
((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \
5108
} while (0)
5109
5110
enum pf_test_status
5111
pf_step_into_anchor(struct pf_test_ctx *ctx, struct pf_krule *r)
5112
{
5113
enum pf_test_status rv;
5114
5115
PF_RULES_RASSERT();
5116
5117
if (ctx->depth >= PF_ANCHOR_STACK_MAX) {
5118
printf("%s: anchor stack overflow on %s\n",
5119
__func__, r->anchor->name);
5120
return (PF_TEST_FAIL);
5121
}
5122
5123
ctx->depth++;
5124
5125
if (r->anchor_wildcard) {
5126
struct pf_kanchor *child;
5127
rv = PF_TEST_OK;
5128
RB_FOREACH(child, pf_kanchor_node, &r->anchor->children) {
5129
rv = pf_match_rule(ctx, &child->ruleset);
5130
if ((rv == PF_TEST_QUICK) || (rv == PF_TEST_FAIL)) {
5131
/*
5132
* we either hit a rule with quick action
5133
* (more likely), or hit some runtime
5134
* error (e.g. pool_get() failure).
5135
*/
5136
break;
5137
}
5138
}
5139
} else {
5140
rv = pf_match_rule(ctx, &r->anchor->ruleset);
5141
/*
5142
* Unless errors occurred, stop iff any rule matched
5143
* within quick anchors.
5144
*/
5145
if (rv != PF_TEST_FAIL && r->quick == PF_TEST_QUICK &&
5146
*ctx->am == r)
5147
rv = PF_TEST_QUICK;
5148
}
5149
5150
ctx->depth--;
5151
5152
return (rv);
5153
}
5154
5155
struct pf_keth_anchor_stackframe {
5156
struct pf_keth_ruleset *rs;
5157
struct pf_keth_rule *r; /* XXX: + match bit */
5158
struct pf_keth_anchor *child;
5159
};
5160
5161
#define PF_ETH_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
5162
#define PF_ETH_ANCHOR_RULE(f) (struct pf_keth_rule *) \
5163
((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
5164
#define PF_ETH_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \
5165
((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \
5166
} while (0)
5167
5168
void
5169
pf_step_into_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
5170
struct pf_keth_ruleset **rs, struct pf_keth_rule **r,
5171
struct pf_keth_rule **a, int *match)
5172
{
5173
struct pf_keth_anchor_stackframe *f;
5174
5175
NET_EPOCH_ASSERT();
5176
5177
if (match)
5178
*match = 0;
5179
if (*depth >= PF_ANCHOR_STACK_MAX) {
5180
printf("%s: anchor stack overflow on %s\n",
5181
__func__, (*r)->anchor->name);
5182
*r = TAILQ_NEXT(*r, entries);
5183
return;
5184
} else if (*depth == 0 && a != NULL)
5185
*a = *r;
5186
f = stack + (*depth)++;
5187
f->rs = *rs;
5188
f->r = *r;
5189
if ((*r)->anchor_wildcard) {
5190
struct pf_keth_anchor_node *parent = &(*r)->anchor->children;
5191
5192
if ((f->child = RB_MIN(pf_keth_anchor_node, parent)) == NULL) {
5193
*r = NULL;
5194
return;
5195
}
5196
*rs = &f->child->ruleset;
5197
} else {
5198
f->child = NULL;
5199
*rs = &(*r)->anchor->ruleset;
5200
}
5201
*r = TAILQ_FIRST((*rs)->active.rules);
5202
}
5203
5204
int
5205
pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
5206
struct pf_keth_ruleset **rs, struct pf_keth_rule **r,
5207
struct pf_keth_rule **a, int *match)
5208
{
5209
struct pf_keth_anchor_stackframe *f;
5210
struct pf_keth_rule *fr;
5211
int quick = 0;
5212
5213
NET_EPOCH_ASSERT();
5214
5215
do {
5216
if (*depth <= 0)
5217
break;
5218
f = stack + *depth - 1;
5219
fr = PF_ETH_ANCHOR_RULE(f);
5220
if (f->child != NULL) {
5221
/*
5222
* This block traverses through
5223
* a wildcard anchor.
5224
*/
5225
if (match != NULL && *match) {
5226
/*
5227
* If any of "*" matched, then
5228
* "foo/ *" matched, mark frame
5229
* appropriately.
5230
*/
5231
PF_ETH_ANCHOR_SET_MATCH(f);
5232
*match = 0;
5233
}
5234
f->child = RB_NEXT(pf_keth_anchor_node,
5235
&fr->anchor->children, f->child);
5236
if (f->child != NULL) {
5237
*rs = &f->child->ruleset;
5238
*r = TAILQ_FIRST((*rs)->active.rules);
5239
if (*r == NULL)
5240
continue;
5241
else
5242
break;
5243
}
5244
}
5245
(*depth)--;
5246
if (*depth == 0 && a != NULL)
5247
*a = NULL;
5248
*rs = f->rs;
5249
if (PF_ETH_ANCHOR_MATCH(f) || (match != NULL && *match))
5250
quick = fr->quick;
5251
*r = TAILQ_NEXT(fr, entries);
5252
} while (*r == NULL);
5253
5254
return (quick);
5255
}
5256
5257
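/* Merge two addresses under a mask: network bits come from raddr, host bits from saddr */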
void
5258
pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
5259
struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
5260
{
5261
switch (af) {
5262
#ifdef INET
5263
case AF_INET:
5264
naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
5265
((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
5266
break;
5267
#endif /* INET */
5268
#ifdef INET6
5269
case AF_INET6:
5270
naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
5271
((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
5272
naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
5273
((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
5274
naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
5275
((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
5276
naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
5277
((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
5278
break;
5279
#endif /* INET6 */
5280
}
5281
}
5282
5283
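/* Increment an address by one, carrying across the 32-bit words of an IPv6 address */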
void
5284
pf_addr_inc(struct pf_addr *addr, sa_family_t af)
5285
{
5286
switch (af) {
5287
#ifdef INET
5288
case AF_INET:
5289
addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
5290
break;
5291
#endif /* INET */
5292
#ifdef INET6
5293
case AF_INET6:
5294
if (addr->addr32[3] == 0xffffffff) {
5295
addr->addr32[3] = 0;
5296
if (addr->addr32[2] == 0xffffffff) {
5297
addr->addr32[2] = 0;
5298
if (addr->addr32[1] == 0xffffffff) {
5299
addr->addr32[1] = 0;
5300
addr->addr32[0] =
5301
htonl(ntohl(addr->addr32[0]) + 1);
5302
} else
5303
addr->addr32[1] =
5304
htonl(ntohl(addr->addr32[1]) + 1);
5305
} else
5306
addr->addr32[2] =
5307
htonl(ntohl(addr->addr32[2]) + 1);
5308
} else
5309
addr->addr32[3] =
5310
htonl(ntohl(addr->addr32[3]) + 1);
5311
break;
5312
#endif /* INET6 */
5313
}
5314
}
5315
5316
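/*
 * Copy the action parameters (queues, dummynet pipes, TOS, priority, ...)
 * of a matching rule into the per-packet action structure.
 */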
void
5317
pf_rule_to_actions(struct pf_krule *r, struct pf_rule_actions *a)
5318
{
5319
/*
5320
* Modern rules use the same flags in rules as they do in states.
5321
*/
5322
a->flags |= (r->scrub_flags & (PFSTATE_NODF|PFSTATE_RANDOMID|
5323
PFSTATE_SCRUB_TCP|PFSTATE_SETPRIO));
5324
5325
/*
5326
* Old-style scrub rules have different flags which need to be translated.
5327
*/
5328
if (r->rule_flag & PFRULE_RANDOMID)
5329
a->flags |= PFSTATE_RANDOMID;
5330
if (r->scrub_flags & PFSTATE_SETTOS || r->rule_flag & PFRULE_SET_TOS ) {
5331
a->flags |= PFSTATE_SETTOS;
5332
a->set_tos = r->set_tos;
5333
}
5334
5335
if (r->qid)
5336
a->qid = r->qid;
5337
if (r->pqid)
5338
a->pqid = r->pqid;
5339
if (r->rtableid >= 0)
5340
a->rtableid = r->rtableid;
5341
a->log |= r->log;
5342
if (r->min_ttl)
5343
a->min_ttl = r->min_ttl;
5344
if (r->max_mss)
5345
a->max_mss = r->max_mss;
5346
if (r->dnpipe)
5347
a->dnpipe = r->dnpipe;
5348
if (r->dnrpipe)
5349
a->dnrpipe = r->dnrpipe;
5350
if (r->dnpipe || r->dnrpipe) {
5351
if (r->free_flags & PFRULE_DN_IS_PIPE)
5352
a->flags |= PFSTATE_DN_IS_PIPE;
5353
else
5354
a->flags &= ~PFSTATE_DN_IS_PIPE;
5355
}
5356
if (r->scrub_flags & PFSTATE_SETPRIO) {
5357
a->set_prio[0] = r->set_prio[0];
5358
a->set_prio[1] = r->set_prio[1];
5359
}
5360
if (r->allow_opts)
5361
a->allow_opts = r->allow_opts;
5362
if (r->max_pkt_size)
5363
a->max_pkt_size = r->max_pkt_size;
5364
}
5365
5366
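/*
 * Find the local socket for this connection and record its uid/gid in
 * pd->lookup. Returns 1 on success, -1 if no matching PCB is found.
 */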
int
5367
pf_socket_lookup(struct pf_pdesc *pd)
5368
{
5369
struct pf_addr *saddr, *daddr;
5370
u_int16_t sport, dport;
5371
struct inpcbinfo *pi;
5372
struct inpcb *inp;
5373
5374
pd->lookup.uid = -1;
5375
pd->lookup.gid = -1;
5376
5377
switch (pd->proto) {
5378
case IPPROTO_TCP:
5379
sport = pd->hdr.tcp.th_sport;
5380
dport = pd->hdr.tcp.th_dport;
5381
pi = &V_tcbinfo;
5382
break;
5383
case IPPROTO_UDP:
5384
sport = pd->hdr.udp.uh_sport;
5385
dport = pd->hdr.udp.uh_dport;
5386
pi = &V_udbinfo;
5387
break;
5388
default:
5389
return (-1);
5390
}
5391
if (pd->dir == PF_IN) {
5392
saddr = pd->src;
5393
daddr = pd->dst;
5394
} else {
5395
u_int16_t p;
5396
5397
p = sport;
5398
sport = dport;
5399
dport = p;
5400
saddr = pd->dst;
5401
daddr = pd->src;
5402
}
5403
switch (pd->af) {
5404
#ifdef INET
5405
case AF_INET:
5406
inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
5407
dport, INPLOOKUP_RLOCKPCB, NULL, pd->m);
5408
if (inp == NULL) {
5409
inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
5410
daddr->v4, dport, INPLOOKUP_WILDCARD |
5411
INPLOOKUP_RLOCKPCB, NULL, pd->m);
5412
if (inp == NULL)
5413
return (-1);
5414
}
5415
break;
5416
#endif /* INET */
5417
#ifdef INET6
5418
case AF_INET6:
5419
inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
5420
dport, INPLOOKUP_RLOCKPCB, NULL, pd->m);
5421
if (inp == NULL) {
5422
inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
5423
&daddr->v6, dport, INPLOOKUP_WILDCARD |
5424
INPLOOKUP_RLOCKPCB, NULL, pd->m);
5425
if (inp == NULL)
5426
return (-1);
5427
}
5428
break;
5429
#endif /* INET6 */
5430
default:
5431
unhandled_af(pd->af);
5432
}
5433
INP_RLOCK_ASSERT(inp);
5434
pd->lookup.uid = inp->inp_cred->cr_uid;
5435
pd->lookup.gid = inp->inp_cred->cr_gid;
5436
INP_RUNLOCK(inp);
5437
5438
return (1);
5439
}
5440
5441
/* post: r => (r[0] == type /\ r[1] >= min_typelen >= 2 "validity"
5442
* /\ (eoh - r) >= min_typelen >= 2 "safety" )
5443
*
5444
* warning: r + r[1] may exceed opts bounds for r[1] > min_typelen
5445
*/
5446
uint8_t*
5447
pf_find_tcpopt(u_int8_t *opt, u_int8_t *opts, size_t hlen, u_int8_t type,
5448
u_int8_t min_typelen)
5449
{
5450
uint8_t *eoh = opts + hlen;
5451
5452
if (min_typelen < 2)
5453
return (NULL);
5454
5455
while ((eoh - opt) >= min_typelen) {
5456
switch (*opt) {
5457
case TCPOPT_EOL:
5458
/* FALLTHROUGH - Workaround the failure of some
5459
systems to NOP-pad their bzero'd option buffers,
5460
producing spurious EOLs */
5461
case TCPOPT_NOP:
5462
opt++;
5463
continue;
5464
default:
5465
if (opt[0] == type &&
5466
opt[1] >= min_typelen)
5467
return (opt);
5468
}
5469
5470
opt += MAX(opt[1], 2); /* evade infinite loops */
5471
}
5472
5473
return (NULL);
5474
}
5475
5476
u_int8_t
5477
pf_get_wscale(struct pf_pdesc *pd)
5478
{
5479
int olen;
5480
uint8_t opts[MAX_TCPOPTLEN], *opt;
5481
uint8_t wscale = 0;
5482
5483
olen = (pd->hdr.tcp.th_off << 2) - sizeof(struct tcphdr);
5484
if (olen < TCPOLEN_WINDOW || !pf_pull_hdr(pd->m,
5485
pd->off + sizeof(struct tcphdr), opts, olen, NULL, pd->af))
5486
return (0);
5487
5488
opt = opts;
5489
while ((opt = pf_find_tcpopt(opt, opts, olen,
5490
TCPOPT_WINDOW, TCPOLEN_WINDOW)) != NULL) {
5491
wscale = opt[2];
5492
wscale = MIN(wscale, TCP_MAX_WINSHIFT);
5493
wscale |= PF_WSCALE_FLAG;
5494
5495
opt += opt[1];
5496
}
5497
5498
return (wscale);
5499
}
5500
5501
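/*
 * Extract the MSS option from a TCP header; returns V_tcp_mssdflt when the
 * option is absent and 0 when the options cannot be pulled.
 */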
u_int16_t
5502
pf_get_mss(struct pf_pdesc *pd)
5503
{
5504
int olen;
5505
uint8_t opts[MAX_TCPOPTLEN], *opt;
5506
u_int16_t mss = V_tcp_mssdflt;
5507
5508
olen = (pd->hdr.tcp.th_off << 2) - sizeof(struct tcphdr);
5509
if (olen < TCPOLEN_MAXSEG || !pf_pull_hdr(pd->m,
5510
pd->off + sizeof(struct tcphdr), opts, olen, NULL, pd->af))
5511
return (0);
5512
5513
opt = opts;
5514
while ((opt = pf_find_tcpopt(opt, opts, olen,
5515
TCPOPT_MAXSEG, TCPOLEN_MAXSEG)) != NULL) {
5516
memcpy(&mss, (opt + 2), 2);
5517
mss = ntohs(mss);
5518
opt += opt[1];
5519
}
5520
5521
return (mss);
5522
}
5523
5524
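/*
 * Derive an MSS for the peer from the route MTU towards addr, clamped to the
 * offered MSS and to a 64-byte minimum.
 */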
static u_int16_t
5525
pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
5526
{
5527
struct nhop_object *nh;
5528
#ifdef INET6
5529
struct in6_addr dst6;
5530
uint32_t scopeid;
5531
#endif /* INET6 */
5532
int hlen = 0;
5533
uint16_t mss = 0;
5534
5535
NET_EPOCH_ASSERT();
5536
5537
switch (af) {
5538
#ifdef INET
5539
case AF_INET:
5540
hlen = sizeof(struct ip);
5541
nh = fib4_lookup(rtableid, addr->v4, 0, 0, 0);
5542
if (nh != NULL)
5543
mss = nh->nh_mtu - hlen - sizeof(struct tcphdr);
5544
break;
5545
#endif /* INET */
5546
#ifdef INET6
5547
case AF_INET6:
5548
hlen = sizeof(struct ip6_hdr);
5549
in6_splitscope(&addr->v6, &dst6, &scopeid);
5550
nh = fib6_lookup(rtableid, &dst6, scopeid, 0, 0);
5551
if (nh != NULL)
5552
mss = nh->nh_mtu - hlen - sizeof(struct tcphdr);
5553
break;
5554
#endif /* INET6 */
5555
}
5556
5557
mss = max(V_tcp_mssdflt, mss);
5558
mss = min(mss, offer);
5559
mss = max(mss, 64); /* sanity - at least max opt space */
5560
return (mss);
5561
}
5562
5563
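/*
 * Generate a TCP initial sequence number from a keyed SHA-512 hash of the
 * connection tuple, plus a random increment.
 */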
static u_int32_t
5564
pf_tcp_iss(struct pf_pdesc *pd)
5565
{
5566
SHA512_CTX ctx;
5567
union {
5568
uint8_t bytes[SHA512_DIGEST_LENGTH];
5569
uint32_t words[1];
5570
} digest;
5571
5572
if (V_pf_tcp_secret_init == 0) {
5573
arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
5574
SHA512_Init(&V_pf_tcp_secret_ctx);
5575
SHA512_Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
5576
sizeof(V_pf_tcp_secret));
5577
V_pf_tcp_secret_init = 1;
5578
}
5579
5580
ctx = V_pf_tcp_secret_ctx;
5581
5582
SHA512_Update(&ctx, &pd->hdr.tcp.th_sport, sizeof(u_short));
5583
SHA512_Update(&ctx, &pd->hdr.tcp.th_dport, sizeof(u_short));
5584
switch (pd->af) {
5585
case AF_INET6:
5586
SHA512_Update(&ctx, &pd->src->v6, sizeof(struct in6_addr));
5587
SHA512_Update(&ctx, &pd->dst->v6, sizeof(struct in6_addr));
5588
break;
5589
case AF_INET:
5590
SHA512_Update(&ctx, &pd->src->v4, sizeof(struct in_addr));
5591
SHA512_Update(&ctx, &pd->dst->v4, sizeof(struct in_addr));
5592
break;
5593
}
5594
SHA512_Final(digest.bytes, &ctx);
5595
V_pf_tcp_iss_off += 4096;
5596
#define ISN_RANDOM_INCREMENT (4096 - 1)
5597
return (digest.words[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
5598
V_pf_tcp_iss_off);
5599
#undef ISN_RANDOM_INCREMENT
5600
}
5601
5602
static bool
5603
pf_match_eth_addr(const uint8_t *a, const struct pf_keth_rule_addr *r)
5604
{
5605
bool match = true;
5606
5607
/* Always matches if not set */
5608
if (! r->isset)
5609
return (!r->neg);
5610
5611
for (int i = 0; i < ETHER_ADDR_LEN; i++) {
5612
if ((a[i] & r->mask[i]) != (r->addr[i] & r->mask[i])) {
5613
match = false;
5614
break;
5615
}
5616
}
5617
5618
return (match ^ r->neg);
5619
}
5620
5621
static int
5622
pf_match_eth_tag(struct mbuf *m, struct pf_keth_rule *r, int *tag, int mtag)
5623
{
5624
if (*tag == -1)
5625
*tag = mtag;
5626
5627
return ((!r->match_tag_not && r->match_tag == *tag) ||
5628
(r->match_tag_not && r->match_tag != *tag));
5629
}
5630
5631
static void
5632
pf_bridge_to(struct ifnet *ifp, struct mbuf *m)
5633
{
5634
/* If we don't have the interface, drop the packet. */
5635
if (ifp == NULL) {
5636
m_freem(m);
5637
return;
5638
}
5639
5640
switch (ifp->if_type) {
5641
case IFT_ETHER:
5642
case IFT_XETHER:
5643
case IFT_L2VLAN:
5644
case IFT_BRIDGE:
5645
case IFT_IEEE8023ADLAG:
5646
break;
5647
default:
5648
m_freem(m);
5649
return;
5650
}
5651
5652
ifp->if_transmit(ifp, m);
5653
}
5654
5655
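/*
 * Evaluate the Ethernet (layer 2) ruleset against a packet and carry out the
 * resulting action: tagging, queue assignment, dummynet and bridge-to.
 */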
static int
5656
pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
5657
{
5658
#ifdef INET
5659
struct ip ip;
5660
#endif /* INET */
5661
#ifdef INET6
5662
struct ip6_hdr ip6;
5663
#endif /* INET6 */
5664
struct mbuf *m = *m0;
5665
struct ether_header *e;
5666
struct pf_keth_rule *r, *rm, *a = NULL;
5667
struct pf_keth_ruleset *ruleset = NULL;
5668
struct pf_mtag *mtag;
5669
struct pf_keth_ruleq *rules;
5670
struct pf_addr *src = NULL, *dst = NULL;
5671
struct pfi_kkif *bridge_to;
5672
sa_family_t af = 0;
5673
uint16_t proto;
5674
int asd = 0, match = 0;
5675
int tag = -1;
5676
uint8_t action;
5677
struct pf_keth_anchor_stackframe anchor_stack[PF_ANCHOR_STACK_MAX];
5678
5679
MPASS(kif->pfik_ifp->if_vnet == curvnet);
5680
NET_EPOCH_ASSERT();
5681
5682
PF_RULES_RLOCK_TRACKER;
5683
5684
SDT_PROBE3(pf, eth, test_rule, entry, dir, kif->pfik_ifp, m);
5685
5686
mtag = pf_find_mtag(m);
5687
if (mtag != NULL && mtag->flags & PF_MTAG_FLAG_DUMMYNET) {
5688
/* Dummynet re-injects packets after they've
5689
* completed their delay. We've already
5690
* processed them, so pass unconditionally. */
5691
5692
/* But only once. We may see the packet multiple times (e.g.
5693
* PFIL_IN/PFIL_OUT). */
5694
pf_dummynet_flag_remove(m, mtag);
5695
5696
return (PF_PASS);
5697
}
5698
5699
if (__predict_false(m->m_len < sizeof(struct ether_header)) &&
5700
(m = *m0 = m_pullup(*m0, sizeof(struct ether_header))) == NULL) {
5701
DPFPRINTF(PF_DEBUG_URGENT,
5702
"%s: m_len < sizeof(struct ether_header)"
5703
", pullup failed", __func__);
5704
return (PF_DROP);
5705
}
5706
e = mtod(m, struct ether_header *);
5707
proto = ntohs(e->ether_type);
5708
5709
switch (proto) {
5710
#ifdef INET
5711
case ETHERTYPE_IP: {
5712
if (m_length(m, NULL) < (sizeof(struct ether_header) +
5713
sizeof(ip)))
5714
return (PF_DROP);
5715
5716
af = AF_INET;
5717
m_copydata(m, sizeof(struct ether_header), sizeof(ip),
5718
(caddr_t)&ip);
5719
src = (struct pf_addr *)&ip.ip_src;
5720
dst = (struct pf_addr *)&ip.ip_dst;
5721
break;
5722
}
5723
#endif /* INET */
5724
#ifdef INET6
5725
case ETHERTYPE_IPV6: {
5726
if (m_length(m, NULL) < (sizeof(struct ether_header) +
5727
sizeof(ip6)))
5728
return (PF_DROP);
5729
5730
af = AF_INET6;
5731
m_copydata(m, sizeof(struct ether_header), sizeof(ip6),
5732
(caddr_t)&ip6);
5733
src = (struct pf_addr *)&ip6.ip6_src;
5734
dst = (struct pf_addr *)&ip6.ip6_dst;
5735
break;
5736
}
5737
#endif /* INET6 */
5738
}
5739
5740
PF_RULES_RLOCK();
5741
5742
ruleset = V_pf_keth;
5743
rules = atomic_load_ptr(&ruleset->active.rules);
5744
for (r = TAILQ_FIRST(rules), rm = NULL; r != NULL;) {
5745
counter_u64_add(r->evaluations, 1);
5746
SDT_PROBE2(pf, eth, test_rule, test, r->nr, r);
5747
5748
if (pfi_kkif_match(r->kif, kif) == r->ifnot) {
5749
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5750
"kif");
5751
r = r->skip[PFE_SKIP_IFP].ptr;
5752
}
5753
else if (r->direction && r->direction != dir) {
5754
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5755
"dir");
5756
r = r->skip[PFE_SKIP_DIR].ptr;
5757
}
5758
else if (r->proto && r->proto != proto) {
5759
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5760
"proto");
5761
r = r->skip[PFE_SKIP_PROTO].ptr;
5762
}
5763
else if (! pf_match_eth_addr(e->ether_shost, &r->src)) {
5764
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5765
"src");
5766
r = r->skip[PFE_SKIP_SRC_ADDR].ptr;
5767
}
5768
else if (! pf_match_eth_addr(e->ether_dhost, &r->dst)) {
5769
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5770
"dst");
5771
r = r->skip[PFE_SKIP_DST_ADDR].ptr;
5772
}
5773
else if (src != NULL && PF_MISMATCHAW(&r->ipsrc.addr, src, af,
5774
r->ipsrc.neg, kif, M_GETFIB(m))) {
5775
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5776
"ip_src");
5777
r = r->skip[PFE_SKIP_SRC_IP_ADDR].ptr;
5778
}
5779
else if (dst != NULL && PF_MISMATCHAW(&r->ipdst.addr, dst, af,
5780
r->ipdst.neg, kif, M_GETFIB(m))) {
5781
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5782
"ip_dst");
5783
r = r->skip[PFE_SKIP_DST_IP_ADDR].ptr;
5784
}
5785
else if (r->match_tag && !pf_match_eth_tag(m, r, &tag,
5786
mtag ? mtag->tag : 0)) {
5787
SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
5788
"match_tag");
5789
r = TAILQ_NEXT(r, entries);
5790
}
5791
else {
5792
if (r->tag)
5793
tag = r->tag;
5794
if (r->anchor == NULL) {
5795
/* Rule matches */
5796
rm = r;
5797
5798
SDT_PROBE2(pf, eth, test_rule, match, r->nr, r);
5799
5800
if (r->quick)
5801
break;
5802
5803
r = TAILQ_NEXT(r, entries);
5804
} else {
5805
pf_step_into_keth_anchor(anchor_stack, &asd,
5806
&ruleset, &r, &a, &match);
5807
}
5808
}
5809
if (r == NULL && pf_step_out_of_keth_anchor(anchor_stack, &asd,
5810
&ruleset, &r, &a, &match))
5811
break;
5812
}
5813
5814
r = rm;
5815
5816
SDT_PROBE2(pf, eth, test_rule, final_match, (r != NULL ? r->nr : -1), r);
5817
5818
/* Default to pass. */
5819
if (r == NULL) {
5820
PF_RULES_RUNLOCK();
5821
return (PF_PASS);
5822
}
5823
5824
/* Execute action. */
5825
counter_u64_add(r->packets[dir == PF_OUT], 1);
5826
counter_u64_add(r->bytes[dir == PF_OUT], m_length(m, NULL));
5827
pf_update_timestamp(r);
5828
5829
/* Shortcut. Don't tag if we're just going to drop anyway. */
5830
if (r->action == PF_DROP) {
5831
PF_RULES_RUNLOCK();
5832
return (PF_DROP);
5833
}
5834
5835
if (tag > 0) {
5836
if (mtag == NULL)
5837
mtag = pf_get_mtag(m);
5838
if (mtag == NULL) {
5839
PF_RULES_RUNLOCK();
5840
counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
5841
return (PF_DROP);
5842
}
5843
mtag->tag = tag;
5844
}
5845
5846
if (r->qid != 0) {
5847
if (mtag == NULL)
5848
mtag = pf_get_mtag(m);
5849
if (mtag == NULL) {
5850
PF_RULES_RUNLOCK();
5851
counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
5852
return (PF_DROP);
5853
}
5854
mtag->qid = r->qid;
5855
}
5856
5857
action = r->action;
5858
bridge_to = r->bridge_to;
5859
5860
/* Dummynet */
5861
if (r->dnpipe) {
5862
struct ip_fw_args dnflow;
5863
5864
/* Drop packet if dummynet is not loaded. */
5865
if (ip_dn_io_ptr == NULL) {
5866
PF_RULES_RUNLOCK();
5867
m_freem(m);
5868
counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
5869
return (PF_DROP);
5870
}
5871
if (mtag == NULL)
5872
mtag = pf_get_mtag(m);
5873
if (mtag == NULL) {
5874
PF_RULES_RUNLOCK();
5875
counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
5876
return (PF_DROP);
5877
}
5878
5879
bzero(&dnflow, sizeof(dnflow));
5880
5881
/* We don't have port numbers here, so we set 0. That means
5882
* that we'll be somewhat limited in distinguishing flows (i.e.
5883
* only based on IP addresses, not based on port numbers), but
5884
* it's better than nothing. */
5885
dnflow.f_id.dst_port = 0;
5886
dnflow.f_id.src_port = 0;
5887
dnflow.f_id.proto = 0;
5888
5889
dnflow.rule.info = r->dnpipe;
5890
dnflow.rule.info |= IPFW_IS_DUMMYNET;
5891
if (r->dnflags & PFRULE_DN_IS_PIPE)
5892
dnflow.rule.info |= IPFW_IS_PIPE;
5893
5894
dnflow.f_id.extra = dnflow.rule.info;
5895
5896
dnflow.flags = dir == PF_IN ? IPFW_ARGS_IN : IPFW_ARGS_OUT;
5897
dnflow.flags |= IPFW_ARGS_ETHER;
5898
dnflow.ifp = kif->pfik_ifp;
5899
5900
switch (af) {
5901
case AF_INET:
5902
dnflow.f_id.addr_type = 4;
5903
dnflow.f_id.src_ip = src->v4.s_addr;
5904
dnflow.f_id.dst_ip = dst->v4.s_addr;
5905
break;
5906
case AF_INET6:
5907
dnflow.flags |= IPFW_ARGS_IP6;
5908
dnflow.f_id.addr_type = 6;
5909
dnflow.f_id.src_ip6 = src->v6;
5910
dnflow.f_id.dst_ip6 = dst->v6;
5911
break;
5912
}
5913
5914
PF_RULES_RUNLOCK();
5915
5916
mtag->flags |= PF_MTAG_FLAG_DUMMYNET;
5917
ip_dn_io_ptr(m0, &dnflow);
5918
if (*m0 != NULL)
5919
pf_dummynet_flag_remove(m, mtag);
5920
} else {
5921
PF_RULES_RUNLOCK();
5922
}
5923
5924
if (action == PF_PASS && bridge_to) {
5925
pf_bridge_to(bridge_to->pfik_ifp, *m0);
5926
*m0 = NULL; /* We've eaten the packet. */
5927
}
5928
5929
return (action);
5930
}
5931
5932
#define PF_TEST_ATTRIB(t, a) \
5933
if (t) { \
5934
r = a; \
5935
continue; \
5936
} else do { \
5937
} while (0)
5938
5939
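/*
 * Apply a rule's af-to or nat-to/rdr-to translation and remember it as the
 * packet's translation rule. Returns PFRES_MATCH on success, PFRES_MAX when
 * the rule specifies no translation.
 */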
static __inline u_short
5940
pf_rule_apply_nat(struct pf_test_ctx *ctx, struct pf_krule *r)
5941
{
5942
struct pf_pdesc *pd = ctx->pd;
5943
u_short transerror;
5944
u_int8_t nat_action;
5945
5946
if (r->rule_flag & PFRULE_AFTO) {
5947
/* Don't translate if there was an old style NAT rule */
5948
if (ctx->nr != NULL)
5949
return (PFRES_TRANSLATE);
5950
5951
/* pass af-to rules, unsupported on match rules */
5952
KASSERT(r->action != PF_MATCH, ("%s: af-to on match rule", __func__));
5953
/* XXX I can imagine scenarios where we have both NAT and RDR source tracking */
5954
ctx->nat_pool = &(r->nat);
5955
ctx->nr = r;
5956
pd->naf = r->naf;
5957
if (pf_get_transaddr_af(ctx->nr, pd) == -1) {
5958
return (PFRES_TRANSLATE);
5959
}
5960
return (PFRES_MATCH);
5961
} else if (r->rdr.cur || r->nat.cur) {
5962
/* Don't translate if there was an old style NAT rule */
5963
if (ctx->nr != NULL)
5964
return (PFRES_TRANSLATE);
5965
5966
/* match/pass nat-to/rdr-to rules */
5967
ctx->nr = r;
5968
if (r->nat.cur) {
5969
nat_action = PF_NAT;
5970
ctx->nat_pool = &(r->nat);
5971
} else {
5972
nat_action = PF_RDR;
5973
ctx->nat_pool = &(r->rdr);
5974
}
5975
5976
transerror = pf_get_transaddr(ctx, ctx->nr,
5977
nat_action, ctx->nat_pool);
5978
if (transerror == PFRES_MATCH) {
5979
ctx->rewrite += pf_translate_compat(ctx);
5980
return(PFRES_MATCH);
5981
}
5982
return (transerror);
5983
}
5984
5985
return (PFRES_MAX);
5986
}
5987
5988
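/*
 * Evaluate the given filter ruleset against the packet described in ctx,
 * using the rule skip steps to bypass rules that cannot match.
 */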
enum pf_test_status
5989
pf_match_rule(struct pf_test_ctx *ctx, struct pf_kruleset *ruleset)
5990
{
5991
struct pf_krule_item *ri;
5992
struct pf_krule *r;
5993
struct pf_krule *save_a;
5994
struct pf_kruleset *save_aruleset;
5995
struct pf_pdesc *pd = ctx->pd;
5996
u_short transerror;
5997
5998
r = TAILQ_FIRST(ruleset->rules[PF_RULESET_FILTER].active.ptr);
5999
while (r != NULL) {
6000
struct pf_statelim *stlim = NULL;
6001
struct pf_sourcelim *srlim = NULL;
6002
struct pf_source *sr = NULL;
6003
unsigned int gen;
6004
6005
if (ctx->pd->related_rule) {
6006
*ctx->rm = ctx->pd->related_rule;
6007
break;
6008
}
6009
PF_TEST_ATTRIB(r->rule_flag & PFRULE_EXPIRED,
6010
TAILQ_NEXT(r, entries));
6011
/* Don't count expired rule evaluations. */
6012
pf_counter_u64_add(&r->evaluations, 1);
6013
PF_TEST_ATTRIB(pfi_kkif_match(r->kif, pd->kif) == r->ifnot,
6014
r->skip[PF_SKIP_IFP]);
6015
PF_TEST_ATTRIB(r->direction && r->direction != pd->dir,
6016
r->skip[PF_SKIP_DIR]);
6017
PF_TEST_ATTRIB(r->af && r->af != pd->af,
6018
r->skip[PF_SKIP_AF]);
6019
PF_TEST_ATTRIB(r->proto && r->proto != pd->proto,
6020
r->skip[PF_SKIP_PROTO]);
6021
PF_TEST_ATTRIB(PF_MISMATCHAW(&r->src.addr, &pd->nsaddr, pd->naf,
6022
r->src.neg, pd->kif, M_GETFIB(pd->m)),
6023
r->skip[PF_SKIP_SRC_ADDR]);
6024
PF_TEST_ATTRIB(PF_MISMATCHAW(&r->dst.addr, &pd->ndaddr, pd->af,
6025
r->dst.neg, NULL, M_GETFIB(pd->m)),
6026
r->skip[PF_SKIP_DST_ADDR]);
6027
switch (pd->virtual_proto) {
6028
case PF_VPROTO_FRAGMENT:
6029
/* tcp/udp only. port_op always 0 in other cases */
6030
PF_TEST_ATTRIB((r->src.port_op || r->dst.port_op),
6031
TAILQ_NEXT(r, entries));
6032
PF_TEST_ATTRIB((pd->proto == IPPROTO_TCP && r->flagset),
6033
TAILQ_NEXT(r, entries));
6034
/* icmp only. type/code always 0 in other cases */
6035
PF_TEST_ATTRIB((r->type || r->code),
6036
TAILQ_NEXT(r, entries));
6037
/* tcp/udp only. {uid|gid}.op always 0 in other cases */
6038
PF_TEST_ATTRIB((r->gid.op || r->uid.op),
6039
TAILQ_NEXT(r, entries));
6040
break;
6041
6042
case IPPROTO_TCP:
6043
PF_TEST_ATTRIB((r->flagset & tcp_get_flags(ctx->th))
6044
!= r->flags,
6045
TAILQ_NEXT(r, entries));
6046
/* FALLTHROUGH */
6047
case IPPROTO_SCTP:
6048
case IPPROTO_UDP:
6049
/* tcp/udp only. port_op always 0 in other cases */
6050
PF_TEST_ATTRIB(r->src.port_op && !pf_match_port(r->src.port_op,
6051
r->src.port[0], r->src.port[1], pd->nsport),
6052
r->skip[PF_SKIP_SRC_PORT]);
6053
/* tcp/udp only. port_op always 0 in other cases */
6054
PF_TEST_ATTRIB(r->dst.port_op && !pf_match_port(r->dst.port_op,
6055
r->dst.port[0], r->dst.port[1], pd->ndport),
6056
r->skip[PF_SKIP_DST_PORT]);
6057
/* tcp/udp only. uid.op always 0 in other cases */
6058
PF_TEST_ATTRIB(r->uid.op && (pd->lookup.done || (pd->lookup.done =
6059
pf_socket_lookup(pd), 1)) &&
6060
!pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
6061
pd->lookup.uid),
6062
TAILQ_NEXT(r, entries));
6063
/* tcp/udp only. gid.op always 0 in other cases */
6064
PF_TEST_ATTRIB(r->gid.op && (pd->lookup.done || (pd->lookup.done =
6065
pf_socket_lookup(pd), 1)) &&
6066
!pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
6067
pd->lookup.gid),
6068
TAILQ_NEXT(r, entries));
6069
break;
6070
6071
case IPPROTO_ICMP:
6072
case IPPROTO_ICMPV6:
6073
/* icmp only. type always 0 in other cases */
6074
PF_TEST_ATTRIB(r->type && r->type != ctx->icmptype + 1,
6075
TAILQ_NEXT(r, entries));
6076
/* icmp only. code always 0 in other cases */
6077
PF_TEST_ATTRIB(r->code && r->code != ctx->icmpcode + 1,
6078
TAILQ_NEXT(r, entries));
6079
break;
6080
6081
default:
6082
break;
6083
}
6084
PF_TEST_ATTRIB(r->tos && !(r->tos == pd->tos),
6085
TAILQ_NEXT(r, entries));
6086
PF_TEST_ATTRIB(r->prio &&
6087
!pf_match_ieee8021q_pcp(r->prio, pd->m),
6088
TAILQ_NEXT(r, entries));
6089
PF_TEST_ATTRIB(r->prob &&
6090
r->prob <= arc4random(),
6091
TAILQ_NEXT(r, entries));
6092
PF_TEST_ATTRIB(r->match_tag && !pf_match_tag(pd->m, r,
6093
&ctx->tag, pd->pf_mtag ? pd->pf_mtag->tag : 0),
6094
TAILQ_NEXT(r, entries));
6095
PF_TEST_ATTRIB((r->rcv_kif && pf_match_rcvif(pd->m, r) ==
6096
r->rcvifnot),
6097
TAILQ_NEXT(r, entries));
6098
PF_TEST_ATTRIB((r->rule_flag & PFRULE_FRAGMENT &&
6099
pd->virtual_proto != PF_VPROTO_FRAGMENT),
6100
TAILQ_NEXT(r, entries));
6101
PF_TEST_ATTRIB(r->os_fingerprint != PF_OSFP_ANY &&
6102
(pd->virtual_proto != IPPROTO_TCP || !pf_osfp_match(
6103
pf_osfp_fingerprint(pd, ctx->th),
6104
r->os_fingerprint)),
6105
TAILQ_NEXT(r, entries));
6106
if (r->statelim.id != PF_STATELIM_ID_NONE) {
6107
stlim = pf_statelim_find(r->statelim.id);
6108
6109
/*
 * Treat a missing limiter like an exhausted limiter.
 * There is no "backend" to get a resource out of,
 * so the rule can't create state.
 */
6114
PF_TEST_ATTRIB(stlim == NULL, TAILQ_NEXT(r, entries));
6115
6116
/*
6117
* An overcommitted pool means this rule
6118
* can't create state.
6119
*/
6120
if (stlim->pfstlim_inuse >= stlim->pfstlim_limit) {
6121
gen = pf_statelim_enter(stlim);
6122
stlim->pfstlim_counters.hardlimited++;
6123
pf_statelim_leave(stlim, gen);
6124
if (r->statelim.limiter_action == PF_LIMITER_BLOCK) {
6125
ctx->limiter_drop = 1;
6126
REASON_SET(&ctx->reason, PFRES_MAXSTATES);
6127
break; /* stop rule processing */
6128
}
6129
r = TAILQ_NEXT(r, entries);
6130
continue;
6131
}
6132
6133
/*
6134
* Is access to the pool rate limited?
6135
*/
6136
if (stlim->pfstlim_rate.limit != 0) {
6137
struct timespec ts;
6138
getnanouptime(&ts);
6139
uint64_t diff = SEC_TO_NSEC(ts.tv_sec) +
6140
ts.tv_nsec - stlim->pfstlim_rate_ts;
6141
6142
if (diff < stlim->pfstlim_rate_token) {
6143
gen = pf_statelim_enter(stlim);
6144
stlim->pfstlim_counters.ratelimited++;
6145
pf_statelim_leave(stlim, gen);
6146
if (r->statelim.limiter_action ==
6147
PF_LIMITER_BLOCK) {
6148
ctx->limiter_drop = 1;
6149
REASON_SET(&ctx->reason,
6150
PFRES_MAXSTATES);
6151
/* stop rule processing */
6152
break;
6153
}
6154
r = TAILQ_NEXT(r, entries);
6155
continue;
6156
}
6157
6158
if (diff > stlim->pfstlim_rate_bucket) {
6159
stlim->pfstlim_rate_ts =
6160
SEC_TO_NSEC(ts.tv_sec) + ts.tv_nsec -
6161
stlim->pfstlim_rate_bucket;
6162
}
6163
}
6164
}
6165
6166
if (r->sourcelim.id != PF_SOURCELIM_ID_NONE) {
6167
struct pf_source key;
6168
6169
srlim = pf_sourcelim_find(r->sourcelim.id);
6170
6171
/*
 * Treat a missing pool like an overcommitted pool.
 * There is no "backend" to get a resource out of,
 * so the rule can't create state.
 */
6176
PF_TEST_ATTRIB(srlim == NULL, TAILQ_NEXT(r, entries));
6177
6178
pf_source_key(srlim, &key, ctx->pd->af,
6179
ctx->pd->src);
6180
sr = pf_source_find(srlim, &key);
6181
if (sr != NULL) {
6182
/*
6183
* An overcommitted limiter means this rule
6184
* can't create state.
6185
*/
6186
if (sr->pfsr_inuse >= srlim->pfsrlim_limit) {
6187
sr->pfsr_counters.hardlimited++;
6188
gen = pf_sourcelim_enter(srlim);
6189
srlim->pfsrlim_counters.hardlimited++;
6190
pf_sourcelim_leave(srlim, gen);
6191
if (r->sourcelim.limiter_action ==
6192
PF_LIMITER_BLOCK) {
6193
ctx->limiter_drop = 1;
6194
REASON_SET(&ctx->reason,
6195
PFRES_SRCLIMIT);
6196
/* stop rule processing */
6197
break;
6198
}
6199
r = TAILQ_NEXT(r, entries);
6200
continue;
6201
}
6202
6203
/*
6204
* Is access to the pool rate limited?
6205
*/
6206
if (srlim->pfsrlim_rate.limit != 0) {
6207
struct timespec ts;
6208
getnanouptime(&ts);
6209
uint64_t diff = SEC_TO_NSEC(ts.tv_sec) +
6210
ts.tv_nsec - sr->pfsr_rate_ts;
6211
6212
if (diff < srlim->pfsrlim_rate_token) {
6213
sr->pfsr_counters.ratelimited++;
6214
gen = pf_sourcelim_enter(srlim);
6215
srlim->pfsrlim_counters
6216
.ratelimited++;
6217
pf_sourcelim_leave(srlim, gen);
6218
if (r->sourcelim.limiter_action ==
6219
PF_LIMITER_BLOCK) {
6220
ctx->limiter_drop = 1;
6221
REASON_SET(&ctx->reason,
6222
PFRES_SRCLIMIT);
6223
/* stop rules */
6224
break;
6225
}
6226
r = TAILQ_NEXT(r, entries);
6227
continue;
6228
}
6229
6230
if (diff > srlim->pfsrlim_rate_bucket) {
6231
sr->pfsr_rate_ts =
6232
SEC_TO_NSEC(ts.tv_sec) + ts.tv_nsec -
6233
srlim->pfsrlim_rate_bucket;
6234
}
6235
}
6236
} else {
6237
/*
 * There is no source entry yet; a freshly created one
 * should be able to admit a state, provided we are
 * still allowed to allocate another source entry.
 */
6241
6242
if (srlim->pfsrlim_nsources >=
6243
srlim->pfsrlim_entries) {
6244
gen = pf_sourcelim_enter(srlim);
6245
srlim->pfsrlim_counters.addrlimited++;
6246
pf_sourcelim_leave(srlim, gen);
6247
r = TAILQ_NEXT(r, entries);
6248
continue;
6249
}
6250
}
6251
}
6252
6253
/* must be last! */
6254
if (r->pktrate.limit) {
6255
PF_TEST_ATTRIB((pf_check_threshold(&r->pktrate)),
6256
TAILQ_NEXT(r, entries));
6257
}
6258
/* FALLTHROUGH */
6259
if (r->tag)
6260
ctx->tag = r->tag;
6261
if (r->anchor == NULL) {
6262
6263
if (r->rule_flag & PFRULE_ONCE) {
6264
uint32_t rule_flag;
6265
6266
rule_flag = r->rule_flag;
6267
if ((rule_flag & PFRULE_EXPIRED) == 0 &&
6268
atomic_cmpset_int(&r->rule_flag, rule_flag,
6269
rule_flag | PFRULE_EXPIRED)) {
6270
r->exptime = time_uptime;
6271
} else {
6272
r = TAILQ_NEXT(r, entries);
6273
continue;
6274
}
6275
}
6276
6277
if (r->action == PF_MATCH) {
6278
/*
 * Apply translations before increasing counters,
 * in case the translation fails.
 */
6282
transerror = pf_rule_apply_nat(ctx, r);
6283
switch (transerror) {
6284
case PFRES_MATCH:
6285
/* Translation action found in rule and applied successfully */
6286
case PFRES_MAX:
6287
/* No translation action found in rule */
6288
break;
6289
default:
6290
/* Translation action found in rule but failed to apply */
6291
REASON_SET(&ctx->reason, transerror);
6292
return (PF_TEST_FAIL);
6293
}
6294
ri = malloc(sizeof(struct pf_krule_item), M_PF_RULE_ITEM, M_NOWAIT | M_ZERO);
6295
if (ri == NULL) {
6296
REASON_SET(&ctx->reason, PFRES_MEMORY);
6297
return (PF_TEST_FAIL);
6298
}
6299
ri->r = r;
6300
6301
if (SLIST_EMPTY(ctx->match_rules)) {
6302
SLIST_INSERT_HEAD(ctx->match_rules, ri, entry);
6303
} else {
6304
SLIST_INSERT_AFTER(ctx->last_match_rule, ri, entry);
6305
}
6306
ctx->last_match_rule = ri;
6307
6308
pf_rule_to_actions(r, &pd->act);
6309
if (r->log)
6310
PFLOG_PACKET(r->action, PFRES_MATCH, r,
6311
ctx->a, ruleset, pd, 1, NULL);
6312
} else {
6313
/* found matching rule r */
*ctx->rm = r;
/* the anchor rule (with its ruleset) that r belongs to */
*ctx->am = ctx->a;
/* the ruleset that r belongs to */
*ctx->rsm = ruleset;
/* the ruleset that the anchor belongs to */
ctx->arsm = ctx->aruleset;
/* state/source limiter pools */
6332
6333
ctx->statelim = stlim;
6334
ctx->sourcelim = srlim;
6335
ctx->source = sr;
6336
}
6337
if (pd->act.log & PF_LOG_MATCHES)
6338
pf_log_matches(pd, r, ctx->a, ruleset, ctx->match_rules);
6339
if (r->quick) {
6340
ctx->test_status = PF_TEST_QUICK;
6341
break;
6342
}
6343
} else {
6344
save_a = ctx->a;
6345
save_aruleset = ctx->aruleset;
6346
6347
ctx->a = r; /* remember anchor */
6348
ctx->aruleset = ruleset; /* and its ruleset */
6349
if (ctx->a->quick)
6350
ctx->test_status = PF_TEST_QUICK;
6351
/*
6352
* Note: we don't need to restore if we are not going
6353
* to continue with ruleset evaluation.
6354
*/
6355
if (pf_step_into_anchor(ctx, r) != PF_TEST_OK) {
6356
break;
6357
}
6358
ctx->a = save_a;
6359
ctx->aruleset = save_aruleset;
6360
}
6361
r = TAILQ_NEXT(r, entries);
6362
}
6363
6364
6365
return (ctx->test_status);
6366
}
6367
6368
static int
6369
pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
6370
struct pf_pdesc *pd, struct pf_krule **am,
6371
struct pf_kruleset **rsm, u_short *reason, struct inpcb *inp,
6372
struct pf_krule_slist *match_rules)
6373
{
6374
struct pf_krule *r = NULL;
6375
struct pf_kruleset *ruleset = NULL;
6376
struct pf_test_ctx ctx;
6377
u_short transerror;
6378
int action = PF_PASS;
6379
u_int16_t bproto_sum = 0, bip_sum = 0;
6380
enum pf_test_status rv;
6381
6382
PF_RULES_RASSERT();
6383
6384
bzero(&ctx, sizeof(ctx));
6385
ctx.tag = -1;
6386
ctx.pd = pd;
6387
ctx.rm = rm;
6388
ctx.am = am;
6389
ctx.rsm = rsm;
6390
ctx.th = &pd->hdr.tcp;
6391
ctx.reason = *reason;
6392
ctx.match_rules = match_rules;
6393
6394
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
6395
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
6396
6397
if (inp != NULL) {
6398
INP_LOCK_ASSERT(inp);
6399
pd->lookup.uid = inp->inp_cred->cr_uid;
6400
pd->lookup.gid = inp->inp_cred->cr_gid;
6401
pd->lookup.done = 1;
6402
}
6403
6404
if (pd->ip_sum)
6405
bip_sum = *pd->ip_sum;
6406
6407
switch (pd->virtual_proto) {
6408
case IPPROTO_TCP:
6409
bproto_sum = ctx.th->th_sum;
6410
pd->nsport = ctx.th->th_sport;
6411
pd->ndport = ctx.th->th_dport;
6412
break;
6413
case IPPROTO_UDP:
6414
bproto_sum = pd->hdr.udp.uh_sum;
6415
pd->nsport = pd->hdr.udp.uh_sport;
6416
pd->ndport = pd->hdr.udp.uh_dport;
6417
break;
6418
case IPPROTO_SCTP:
6419
pd->nsport = pd->hdr.sctp.src_port;
6420
pd->ndport = pd->hdr.sctp.dest_port;
6421
break;
6422
#ifdef INET
6423
case IPPROTO_ICMP:
6424
MPASS(pd->af == AF_INET);
6425
ctx.icmptype = pd->hdr.icmp.icmp_type;
6426
ctx.icmpcode = pd->hdr.icmp.icmp_code;
6427
ctx.state_icmp = pf_icmp_mapping(pd, ctx.icmptype,
6428
&ctx.icmp_dir, &ctx.virtual_id, &ctx.virtual_type);
6429
if (ctx.icmp_dir == PF_IN) {
6430
pd->nsport = ctx.virtual_id;
6431
pd->ndport = ctx.virtual_type;
6432
} else {
6433
pd->nsport = ctx.virtual_type;
6434
pd->ndport = ctx.virtual_id;
6435
}
6436
break;
6437
#endif /* INET */
6438
#ifdef INET6
6439
case IPPROTO_ICMPV6:
6440
MPASS(pd->af == AF_INET6);
6441
ctx.icmptype = pd->hdr.icmp6.icmp6_type;
6442
ctx.icmpcode = pd->hdr.icmp6.icmp6_code;
6443
ctx.state_icmp = pf_icmp_mapping(pd, ctx.icmptype,
6444
&ctx.icmp_dir, &ctx.virtual_id, &ctx.virtual_type);
6445
if (ctx.icmp_dir == PF_IN) {
6446
pd->nsport = ctx.virtual_id;
6447
pd->ndport = ctx.virtual_type;
6448
} else {
6449
pd->nsport = ctx.virtual_type;
6450
pd->ndport = ctx.virtual_id;
6451
}
6452
6453
break;
6454
#endif /* INET6 */
6455
default:
6456
pd->nsport = pd->ndport = 0;
6457
break;
6458
}
6459
pd->osport = pd->nsport;
6460
pd->odport = pd->ndport;
6461
6462
/* check packet for BINAT/NAT/RDR */
6463
transerror = pf_get_translation(&ctx);
6464
switch (transerror) {
6465
default:
6466
/* A translation error occurred. */
6467
REASON_SET(&ctx.reason, transerror);
6468
goto cleanup;
6469
case PFRES_MAX:
6470
/* No match. */
6471
break;
6472
case PFRES_MATCH:
6473
KASSERT(ctx.sk != NULL, ("%s: null sk", __func__));
6474
KASSERT(ctx.nk != NULL, ("%s: null nk", __func__));
6475
if (ctx.nr->log) {
6476
PFLOG_PACKET(ctx.nr->action, PFRES_MATCH, ctx.nr, ctx.a,
6477
ruleset, pd, 1, NULL);
6478
}
6479
6480
ctx.rewrite += pf_translate_compat(&ctx);
6481
ctx.nat_pool = &(ctx.nr->rdr);
6482
}
6483
6484
*ctx.rm = &V_pf_default_rule;
6485
if (ctx.nr && ctx.nr->natpass) {
6486
r = ctx.nr;
6487
ruleset = *ctx.rsm;
6488
} else {
6489
ruleset = &pf_main_ruleset;
6490
rv = pf_match_rule(&ctx, ruleset);
6491
if (rv == PF_TEST_FAIL || ctx.limiter_drop == 1) {
6492
REASON_SET(reason, ctx.reason);
6493
goto cleanup;
6494
}
6495
6496
r = *ctx.rm; /* matching rule */
6497
ctx.a = *ctx.am; /* rule that defines an anchor containing 'r' */
6498
ruleset = *ctx.rsm; /* ruleset of the anchor defined by the rule 'a' */
6499
ctx.aruleset = ctx.arsm; /* ruleset of the 'a' rule itself */
6500
6501
/* apply actions for last matching pass/block rule */
6502
pf_rule_to_actions(r, &pd->act);
6503
transerror = pf_rule_apply_nat(&ctx, r);
6504
switch (transerror) {
6505
case PFRES_MATCH:
6506
/* Translation action found in rule and applied successfully */
6507
case PFRES_MAX:
6508
/* No translation action found in rule */
6509
break;
6510
default:
6511
/* Translation action found in rule but failed to apply */
6512
REASON_SET(&ctx.reason, transerror);
6513
goto cleanup;
6514
}
6515
}
6516
6517
REASON_SET(&ctx.reason, PFRES_MATCH);
6518
6519
if (r->log) {
6520
if (ctx.rewrite)
6521
m_copyback(pd->m, pd->off, pd->hdrlen, pd->hdr.any);
6522
PFLOG_PACKET(r->action, ctx.reason, r, ctx.a, ruleset, pd, 1, NULL);
6523
}
6524
if (pd->act.log & PF_LOG_MATCHES)
6525
pf_log_matches(pd, r, ctx.a, ruleset, ctx.match_rules);
6526
if (pd->virtual_proto != PF_VPROTO_FRAGMENT &&
6527
(r->action == PF_DROP) &&
6528
((r->rule_flag & PFRULE_RETURNRST) ||
6529
(r->rule_flag & PFRULE_RETURNICMP) ||
6530
(r->rule_flag & PFRULE_RETURN))) {
6531
pf_return(r, ctx.nr, pd, ctx.th, bproto_sum,
6532
bip_sum, &ctx.reason, r->rtableid);
6533
}
6534
6535
if (r->action == PF_DROP)
6536
goto cleanup;
6537
6538
if (ctx.tag > 0 && pf_tag_packet(pd, ctx.tag)) {
6539
REASON_SET(&ctx.reason, PFRES_MEMORY);
6540
goto cleanup;
6541
}
6542
if (pd->act.rtableid >= 0)
6543
M_SETFIB(pd->m, pd->act.rtableid);
6544
6545
if (r->rt) {
6546
/*
 * Set act.rt here instead of in pf_rule_to_actions() because
 * it is applied only from the last pass rule. For rules
 * with the prefer-ipv6-nexthop option act.rt_af is a hint
 * about the AF of the forwarded packet and might be changed.
 */
6552
pd->act.rt = r->rt;
6553
if (r->rt == PF_REPLYTO)
6554
pd->act.rt_af = pd->af;
6555
else
6556
pd->act.rt_af = pd->naf;
6557
if ((transerror = pf_map_addr_sn(pd->af, r, pd->src,
6558
&pd->act.rt_addr, &pd->act.rt_af, &pd->act.rt_kif, NULL,
6559
&(r->route), PF_SN_ROUTE)) != PFRES_MATCH) {
6560
REASON_SET(&ctx.reason, transerror);
6561
goto cleanup;
6562
}
6563
}
6564
6565
if (pd->virtual_proto != PF_VPROTO_FRAGMENT &&
6566
(!ctx.state_icmp && (r->keep_state || ctx.nr != NULL ||
6567
(pd->flags & PFDESC_TCP_NORM)))) {
6568
bool nat64;
6569
6570
action = pf_create_state(r, &ctx, sm, bproto_sum, bip_sum);
6571
ctx.sk = ctx.nk = NULL;
6572
if (action != PF_PASS) {
6573
pf_udp_mapping_release(ctx.udp_mapping);
6574
if (r->log || (ctx.nr != NULL && ctx.nr->log) ||
6575
ctx.reason == PFRES_MEMORY)
6576
pd->act.log |= PF_LOG_FORCE;
6577
if (action == PF_DROP &&
6578
(r->rule_flag & PFRULE_RETURN))
6579
pf_return(r, ctx.nr, pd, ctx.th,
6580
bproto_sum, bip_sum, &ctx.reason,
6581
pd->act.rtableid);
6582
*reason = ctx.reason;
6583
return (action);
6584
}
6585
6586
if (pd->proto == IPPROTO_TCP &&
6587
r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) {
6588
action = pf_synproxy_ack(r, pd, sm, &ctx.act);
6589
if (action != PF_PASS)
6590
goto cleanup; /* PF_SYNPROXY_DROP */
6591
}
6592
6593
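/*
 * af-to (NAT64/NAT46): the post-translation address family differs
 * from the wire family, so rewrite the headers towards the state key
 * addresses and set the action to PF_AFRT so the caller handles the
 * address family change.
 */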
nat64 = pd->af != pd->naf;
6594
if (nat64) {
6595
int ret;
6596
6597
if (ctx.sk == NULL)
6598
ctx.sk = (*sm)->key[pd->dir == PF_IN ? PF_SK_STACK : PF_SK_WIRE];
6599
if (ctx.nk == NULL)
6600
ctx.nk = (*sm)->key[pd->dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK];
6601
6602
if (pd->dir == PF_IN) {
6603
ret = pf_translate(pd, &ctx.sk->addr[pd->didx],
6604
ctx.sk->port[pd->didx], &ctx.sk->addr[pd->sidx],
6605
ctx.sk->port[pd->sidx], ctx.virtual_type,
6606
ctx.icmp_dir);
6607
} else {
6608
ret = pf_translate(pd, &ctx.sk->addr[pd->sidx],
6609
ctx.sk->port[pd->sidx], &ctx.sk->addr[pd->didx],
6610
ctx.sk->port[pd->didx], ctx.virtual_type,
6611
ctx.icmp_dir);
6612
}
6613
6614
if (ret < 0)
6615
goto cleanup;
6616
6617
ctx.rewrite += ret;
6618
6619
if (ctx.rewrite && ctx.sk->af != ctx.nk->af)
6620
action = PF_AFRT;
6621
}
6622
} else {
6623
uma_zfree(V_pf_state_key_z, ctx.sk);
6624
uma_zfree(V_pf_state_key_z, ctx.nk);
6625
ctx.sk = ctx.nk = NULL;
6626
pf_udp_mapping_release(ctx.udp_mapping);
6627
}
6628
6629
/* copy back packet headers if we performed NAT operations */
6630
if (ctx.rewrite)
6631
m_copyback(pd->m, pd->off, pd->hdrlen, pd->hdr.any);
6632
6633
if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
6634
pd->dir == PF_OUT &&
6635
V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, pd->m)) {
6636
/*
 * We want the state created, but we don't want to
 * forward the packet yet, in case a partner firewall
 * has to learn about the state first in order to
 * allow replies through.
 */
6642
*reason = ctx.reason;
6643
return (PF_DEFER);
6644
}
6645
6646
*reason = ctx.reason;
6647
return (action);
6648
6649
cleanup:
6650
uma_zfree(V_pf_state_key_z, ctx.sk);
6651
uma_zfree(V_pf_state_key_z, ctx.nk);
6652
pf_udp_mapping_release(ctx.udp_mapping);
6653
*reason = ctx.reason;
6654
6655
return (PF_DROP);
6656
}
6657
6658
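/*
 * Create a state for the rule matched in pf_test_rule(): allocate
 * source nodes and limiter links as needed, initialize the
 * protocol-specific peer tracking, and insert the state keys.
 * Returns PF_PASS, PF_SYNPROXY_DROP or PF_DROP.
 */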
static int
6659
pf_create_state(struct pf_krule *r, struct pf_test_ctx *ctx,
6660
struct pf_kstate **sm, u_int16_t bproto_sum, u_int16_t bip_sum)
6661
{
6662
struct pf_pdesc *pd = ctx->pd;
6663
struct pf_kstate *s = NULL;
6664
struct pf_statelim *stlim = NULL;
6665
struct pf_sourcelim *srlim = NULL;
6666
struct pf_source *sr = NULL;
6667
struct pf_state_link *pfl;
6668
struct pf_ksrc_node *sns[PF_SN_MAX] = { NULL };
6669
/*
6670
* XXXKS: The hash for PF_SN_LIMIT and PF_SN_ROUTE should be the same
6671
* but for PF_SN_NAT it is different. Don't try optimizing it,
6672
* just store all 3 hashes.
6673
*/
6674
struct pf_srchash *snhs[PF_SN_MAX] = { NULL };
6675
struct tcphdr *th = &pd->hdr.tcp;
6676
u_int16_t mss = V_tcp_mssdflt;
6677
u_short sn_reason;
6678
6679
/* check maximums */
6680
if (r->max_states &&
6681
(counter_u64_fetch(r->states_cur) >= r->max_states)) {
6682
counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
6683
REASON_SET(&ctx->reason, PFRES_MAXSTATES);
6684
goto csfailed;
6685
}
6686
/* src node for limits */
6687
if ((r->rule_flag & PFRULE_SRCTRACK) &&
6688
(sn_reason = pf_insert_src_node(sns, snhs, r, pd->src, pd->af,
6689
NULL, NULL, pd->af, PF_SN_LIMIT)) != 0) {
6690
REASON_SET(&ctx->reason, sn_reason);
6691
goto csfailed;
6692
}
6693
/* src node for route-to rule */
6694
if (r->rt) {
6695
if ((r->route.opts & PF_POOL_STICKYADDR) &&
6696
(sn_reason = pf_insert_src_node(sns, snhs, r, pd->src,
6697
pd->af, &pd->act.rt_addr, pd->act.rt_kif, pd->act.rt_af,
6698
PF_SN_ROUTE)) != 0) {
6699
REASON_SET(&ctx->reason, sn_reason);
6700
goto csfailed;
6701
}
6702
}
6703
/* src node for translation rule */
6704
if (ctx->nr != NULL) {
6705
KASSERT(ctx->nat_pool != NULL, ("%s: nat_pool is NULL", __func__));
6706
/*
6707
* The NAT addresses are chosen during ruleset parsing.
6708
* The new afto code stores post-nat addresses in nsaddr.
6709
* The old nat code (also used for new nat-to rules) creates
6710
* state keys and stores addresses in them.
6711
*/
6712
if ((ctx->nat_pool->opts & PF_POOL_STICKYADDR) &&
6713
(sn_reason = pf_insert_src_node(sns, snhs, ctx->nr,
6714
ctx->sk ? &(ctx->sk->addr[pd->sidx]) : pd->src, pd->af,
6715
ctx->nk ? &(ctx->nk->addr[1]) : &(pd->nsaddr), NULL,
6716
pd->naf, PF_SN_NAT)) != 0) {
6717
REASON_SET(&ctx->reason, sn_reason);
6718
goto csfailed;
6719
}
6720
}
6721
s = pf_alloc_state(M_NOWAIT);
6722
if (s == NULL) {
6723
REASON_SET(&ctx->reason, PFRES_MEMORY);
6724
goto csfailed;
6725
}
6726
s->rule = r;
6727
s->nat_rule = ctx->nr;
6728
s->anchor = ctx->a;
6729
s->match_rules = *ctx->match_rules;
6730
SLIST_INIT(&s->linkage);
6731
memcpy(&s->act, &pd->act, sizeof(struct pf_rule_actions));
6732
6733
if (pd->act.allow_opts)
6734
s->state_flags |= PFSTATE_ALLOWOPTS;
6735
if (r->rule_flag & PFRULE_STATESLOPPY)
6736
s->state_flags |= PFSTATE_SLOPPY;
6737
if (pd->flags & PFDESC_TCP_NORM) /* Set by old-style scrub rules */
6738
s->state_flags |= PFSTATE_SCRUB_TCP;
6739
if ((r->rule_flag & PFRULE_PFLOW) ||
6740
(ctx->nr != NULL && ctx->nr->rule_flag & PFRULE_PFLOW))
6741
s->state_flags |= PFSTATE_PFLOW;
6742
6743
s->act.log = pd->act.log & PF_LOG_ALL;
6744
s->sync_state = PFSYNC_S_NONE;
6745
s->state_flags |= pd->act.flags; /* Only needed for pfsync and state export */
6746
6747
if (ctx->nr != NULL)
6748
s->act.log |= ctx->nr->log & PF_LOG_ALL;
6749
switch (pd->proto) {
6750
case IPPROTO_TCP:
6751
s->src.seqlo = ntohl(th->th_seq);
6752
s->src.seqhi = s->src.seqlo + pd->p_len + 1;
6753
if ((tcp_get_flags(th) & (TH_SYN|TH_ACK)) == TH_SYN &&
6754
r->keep_state == PF_STATE_MODULATE) {
6755
/* Generate sequence number modulator */
6756
if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
6757
0)
6758
s->src.seqdiff = 1;
6759
pf_change_proto_a(pd->m, &th->th_seq, &th->th_sum,
6760
htonl(s->src.seqlo + s->src.seqdiff), 0);
6761
ctx->rewrite = 1;
6762
} else
6763
s->src.seqdiff = 0;
6764
if (tcp_get_flags(th) & TH_SYN) {
6765
s->src.seqhi++;
6766
s->src.wscale = pf_get_wscale(pd);
6767
}
6768
s->src.max_win = MAX(ntohs(th->th_win), 1);
6769
if (s->src.wscale & PF_WSCALE_MASK) {
6770
/* Remove scale factor from initial window */
6771
int win = s->src.max_win;
6772
win += 1 << (s->src.wscale & PF_WSCALE_MASK);
6773
s->src.max_win = (win - 1) >>
6774
(s->src.wscale & PF_WSCALE_MASK);
6775
}
6776
if (tcp_get_flags(th) & TH_FIN)
6777
s->src.seqhi++;
6778
s->dst.seqhi = 1;
6779
s->dst.max_win = 1;
6780
pf_set_protostate(s, PF_PEER_SRC, TCPS_SYN_SENT);
6781
pf_set_protostate(s, PF_PEER_DST, TCPS_CLOSED);
6782
s->timeout = PFTM_TCP_FIRST_PACKET;
6783
atomic_add_32(&V_pf_status.states_halfopen, 1);
6784
break;
6785
case IPPROTO_UDP:
6786
pf_set_protostate(s, PF_PEER_SRC, PFUDPS_SINGLE);
6787
pf_set_protostate(s, PF_PEER_DST, PFUDPS_NO_TRAFFIC);
6788
s->timeout = PFTM_UDP_FIRST_PACKET;
6789
break;
6790
case IPPROTO_SCTP:
6791
pf_set_protostate(s, PF_PEER_SRC, SCTP_COOKIE_WAIT);
6792
pf_set_protostate(s, PF_PEER_DST, SCTP_CLOSED);
6793
s->timeout = PFTM_SCTP_FIRST_PACKET;
6794
break;
6795
case IPPROTO_ICMP:
6796
#ifdef INET6
6797
case IPPROTO_ICMPV6:
6798
#endif /* INET6 */
6799
s->timeout = PFTM_ICMP_FIRST_PACKET;
6800
break;
6801
default:
6802
pf_set_protostate(s, PF_PEER_SRC, PFOTHERS_SINGLE);
6803
pf_set_protostate(s, PF_PEER_DST, PFOTHERS_NO_TRAFFIC);
6804
s->timeout = PFTM_OTHER_FIRST_PACKET;
6805
}
6806
6807
s->creation = s->expire = pf_get_uptime();
6808
6809
if (pd->proto == IPPROTO_TCP) {
6810
if (s->state_flags & PFSTATE_SCRUB_TCP &&
6811
pf_normalize_tcp_init(pd, th, &s->src)) {
6812
REASON_SET(&ctx->reason, PFRES_MEMORY);
6813
goto csfailed;
6814
}
6815
if (s->state_flags & PFSTATE_SCRUB_TCP && s->src.scrub &&
6816
pf_normalize_tcp_stateful(pd, &ctx->reason, th, s,
6817
&s->src, &s->dst, &ctx->rewrite)) {
6818
/* This really shouldn't happen!!! */
6819
DPFPRINTF(PF_DEBUG_URGENT,
6820
"%s: tcp normalize failed on first "
6821
"pkt", __func__);
6822
goto csfailed;
6823
}
6824
} else if (pd->proto == IPPROTO_SCTP) {
6825
if (pf_normalize_sctp_init(pd, &s->src, &s->dst))
6826
goto csfailed;
6827
if (! (pd->sctp_flags & (PFDESC_SCTP_INIT | PFDESC_SCTP_ADD_IP)))
6828
goto csfailed;
6829
}
6830
s->direction = pd->dir;
6831
6832
/*
 * sk/nk may already have been set up by pf_get_translation().
 */
6835
if (ctx->sk == NULL && ctx->nk == NULL) {
6836
MPASS(pd->sport == NULL || (pd->osport == *pd->sport));
6837
MPASS(pd->dport == NULL || (pd->odport == *pd->dport));
6838
if (pf_state_key_setup(pd, pd->nsport, pd->ndport,
6839
&ctx->sk, &ctx->nk)) {
6840
goto csfailed;
6841
}
6842
} else
6843
KASSERT((ctx->sk != NULL && ctx->nk != NULL), ("%s: nr %p sk %p, nk %p",
6844
__func__, ctx->nr, ctx->sk, ctx->nk));
6845
6846
stlim = ctx->statelim;
6847
if (stlim != NULL) {
6848
unsigned int gen;
6849
6850
pfl = malloc(sizeof(*pfl), M_PF_STATE_LINK, M_NOWAIT);
6851
if (pfl == NULL) {
6852
REASON_SET(&ctx->reason, PFRES_MEMORY);
6853
goto csfailed;
6854
}
6855
6856
gen = pf_statelim_enter(stlim);
6857
stlim->pfstlim_counters.admitted++;
6858
stlim->pfstlim_inuse++;
6859
pf_statelim_leave(stlim, gen);
6860
6861
stlim->pfstlim_rate_ts += stlim->pfstlim_rate_token;
6862
6863
s->statelim = stlim->pfstlim_id;
6864
pfl->pfl_state = s;
6865
pfl->pfl_type = PF_STATE_LINK_TYPE_STATELIM;
6866
6867
TAILQ_INSERT_TAIL(&stlim->pfstlim_states, pfl, pfl_link);
6868
SLIST_INSERT_HEAD(&s->linkage, pfl, pfl_linkage);
6869
}
6870
6871
srlim = ctx->sourcelim;
6872
if (srlim != NULL) {
6873
unsigned int gen;
6874
6875
sr = ctx->source;
6876
if (sr == NULL) {
6877
sr = malloc(sizeof(*sr), M_PF_SOURCE_LIM, M_NOWAIT | M_ZERO);
6878
if (sr == NULL) {
6879
gen = pf_sourcelim_enter(srlim);
6880
srlim->pfsrlim_counters.addrnomem++;
6881
pf_sourcelim_leave(srlim, gen);
6882
REASON_SET(&ctx->reason, PFRES_MEMORY);
6883
goto csfailed;
6884
}
6885
6886
sr->pfsr_parent = srlim;
6887
pf_source_key(srlim, sr, ctx->pd->af, ctx->pd->src);
6888
TAILQ_INIT(&sr->pfsr_states);
6889
6890
if (RB_INSERT(pf_source_tree, &srlim->pfsrlim_sources,
6891
sr) != NULL) {
6892
panic("%s: source pool %u (%p) "
6893
"insert collision %p?!",
6894
__func__, srlim->pfsrlim_id, srlim, sr);
6895
}
6896
6897
if (RB_INSERT(pf_source_ioc_tree,
6898
&srlim->pfsrlim_ioc_sources, sr) != NULL) {
6899
panic("%s: source pool %u (%p) ioc "
6900
"insert collision (%p)?!",
6901
__func__, srlim->pfsrlim_id, srlim, sr);
6902
}
6903
6904
sr->pfsr_empty_ts = time_uptime;
6905
TAILQ_INSERT_TAIL(&pf_source_gc, sr, pfsr_empty_gc);
6906
6907
gen = pf_sourcelim_enter(srlim);
6908
srlim->pfsrlim_nsources++;
6909
srlim->pfsrlim_counters.addrallocs++;
6910
pf_sourcelim_leave(srlim, gen);
6911
} else {
6912
MPASS(sr->pfsr_parent == srlim);
6913
}
6914
6915
pfl = malloc(sizeof(*pfl), M_PF_STATE_LINK, M_NOWAIT);
6916
if (pfl == NULL) {
6917
REASON_SET(&ctx->reason, PFRES_MEMORY);
6918
goto csfailed;
6919
}
6920
6921
pf_source_used(sr);
6922
6923
sr->pfsr_counters.admitted++;
6924
6925
gen = pf_sourcelim_enter(srlim);
6926
srlim->pfsrlim_counters.inuse++;
6927
srlim->pfsrlim_counters.admitted++;
6928
pf_sourcelim_leave(srlim, gen);
6929
6930
s->sourcelim = srlim->pfsrlim_id;
6931
pfl->pfl_state = s;
6932
pfl->pfl_type = PF_STATE_LINK_TYPE_SOURCELIM;
6933
6934
TAILQ_INSERT_TAIL(&sr->pfsr_states, pfl, pfl_link);
6935
SLIST_INSERT_HEAD(&s->linkage, pfl, pfl_linkage);
6936
}
6937
6938
/* Swap sk/nk for PF_OUT. */
6939
if (pf_state_insert(BOUND_IFACE(s, pd), pd->kif,
6940
(pd->dir == PF_IN) ? ctx->sk : ctx->nk,
6941
(pd->dir == PF_IN) ? ctx->nk : ctx->sk, s)) {
6942
REASON_SET(&ctx->reason, PFRES_STATEINS);
6943
goto drop;
6944
} else
6945
*sm = s;
6946
ctx->sk = ctx->nk = NULL;
6947
6948
STATE_INC_COUNTERS(s);
6949
6950
/*
6951
* Lock order is important: first state, then source node.
6952
*/
6953
for (pf_sn_types_t sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
6954
if (pf_src_node_exists(&sns[sn_type], snhs[sn_type])) {
6955
s->sns[sn_type] = sns[sn_type];
6956
PF_HASHROW_UNLOCK(snhs[sn_type]);
6957
}
6958
}
6959
6960
if (ctx->tag > 0)
6961
s->tag = ctx->tag;
6962
if (pd->proto == IPPROTO_TCP && (tcp_get_flags(th) & (TH_SYN|TH_ACK)) ==
6963
TH_SYN && r->keep_state == PF_STATE_SYNPROXY && pd->dir == PF_IN) {
6964
pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
6965
pf_undo_nat(ctx->nr, pd, bip_sum);
6966
s->src.seqhi = arc4random();
6967
/* Find mss option */
6968
int rtid = M_GETFIB(pd->m);
6969
mss = pf_get_mss(pd);
6970
mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
6971
mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
6972
s->src.mss = mss;
6973
pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
6974
th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
6975
TH_SYN|TH_ACK, 0, s->src.mss, 0, M_SKIP_FIREWALL, 0, 0,
6976
pd->act.rtableid, &ctx->reason);
6977
REASON_SET(&ctx->reason, PFRES_SYNPROXY);
6978
return (PF_SYNPROXY_DROP);
6979
}
6980
6981
s->udp_mapping = ctx->udp_mapping;
6982
6983
return (PF_PASS);
6984
6985
csfailed:
6986
uma_zfree(V_pf_state_key_z, ctx->sk);
6987
uma_zfree(V_pf_state_key_z, ctx->nk);
6988
6989
for (pf_sn_types_t sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
6990
if (pf_src_node_exists(&sns[sn_type], snhs[sn_type])) {
6991
if (--sns[sn_type]->states == 0 &&
6992
sns[sn_type]->expire == 0) {
6993
pf_unlink_src_node(sns[sn_type]);
6994
pf_free_src_node(sns[sn_type]);
6995
counter_u64_add(
6996
V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
6997
}
6998
PF_HASHROW_UNLOCK(snhs[sn_type]);
6999
}
7000
}
7001
7002
drop:
7003
if (s != NULL) {
7004
struct pf_state_link *npfl;
7005
7006
SLIST_FOREACH_SAFE(pfl, &s->linkage, pfl_linkage, npfl) {
7007
struct pf_state_link_list *list;
7008
unsigned int gen;
7009
7010
/* who needs KASSERTS when we have NULL derefs */
7011
7012
switch (pfl->pfl_type) {
7013
case PF_STATE_LINK_TYPE_STATELIM:
7014
gen = pf_statelim_enter(stlim);
7015
stlim->pfstlim_inuse--;
7016
pf_statelim_leave(stlim, gen);
7017
7018
stlim->pfstlim_rate_ts -=
7019
stlim->pfstlim_rate_token;
7020
list = &stlim->pfstlim_states;
7021
break;
7022
case PF_STATE_LINK_TYPE_SOURCELIM:
7023
gen = pf_sourcelim_enter(srlim);
7024
srlim->pfsrlim_counters.inuse--;
7025
pf_sourcelim_leave(srlim, gen);
7026
7027
sr->pfsr_rate_ts -= srlim->pfsrlim_rate_token;
7028
pf_source_rele(sr);
7029
7030
list = &sr->pfsr_states;
7031
break;
7032
default:
7033
panic("%s: unexpected link type on pfl %p",
7034
__func__, pfl);
7035
}
7036
7037
TAILQ_REMOVE(list, pfl, pfl_link);
7038
PF_STATE_LOCK_ASSERT(s);
7039
free(pfl, M_PF_STATE_LINK);
7040
}
7041
7042
pf_src_tree_remove_state(s);
7043
s->timeout = PFTM_UNLINKED;
7044
pf_free_state(s);
7045
}
7046
7047
return (PF_DROP);
7048
}
7049
7050
int
7051
pf_translate(struct pf_pdesc *pd, struct pf_addr *saddr, u_int16_t sport,
7052
struct pf_addr *daddr, u_int16_t dport, u_int16_t virtual_type,
7053
int icmp_dir)
7054
{
7055
/*
 * pf_translate() implements OpenBSD's "new" NAT approach.
 * We don't follow it, because it involves a breaking syntax change
 * (removing nat/rdr rules, moving them into regular pf rules).
 * It also moves NAT processing to after normal rule evaluation,
 * whereas in FreeBSD that's done before rule processing.
 *
 * We adopt the function only for nat64, and keep other NAT processing
 * before rule processing.
 */
7065
int rewrite = 0;
7066
int afto = pd->af != pd->naf;
7067
7068
MPASS(afto);
7069
7070
switch (pd->proto) {
7071
case IPPROTO_TCP:
7072
case IPPROTO_UDP:
7073
case IPPROTO_SCTP:
7074
if (afto || *pd->sport != sport) {
7075
pf_change_ap(pd, pd->src, pd->sport,
7076
saddr, sport);
7077
rewrite = 1;
7078
}
7079
if (afto || *pd->dport != dport) {
7080
pf_change_ap(pd, pd->dst, pd->dport,
7081
daddr, dport);
7082
rewrite = 1;
7083
}
7084
break;
7085
7086
#ifdef INET
7087
case IPPROTO_ICMP:
7088
/* pf_translate() is also used when logging invalid packets */
7089
if (pd->af != AF_INET)
7090
return (0);
7091
7092
if (afto) {
7093
if (pf_translate_icmp_af(AF_INET6, &pd->hdr.icmp))
7094
return (-1);
7095
pd->proto = IPPROTO_ICMPV6;
7096
rewrite = 1;
7097
}
7098
if (virtual_type == htons(ICMP_ECHO)) {
7099
u_int16_t icmpid = (icmp_dir == PF_IN) ? sport : dport;
7100
7101
if (icmpid != pd->hdr.icmp.icmp_id) {
7102
pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
7103
pd->hdr.icmp.icmp_cksum,
7104
pd->hdr.icmp.icmp_id, icmpid, 0);
7105
pd->hdr.icmp.icmp_id = icmpid;
7106
/* XXX TODO copyback. */
7107
rewrite = 1;
7108
}
7109
}
7110
break;
7111
#endif /* INET */
7112
7113
#ifdef INET6
7114
case IPPROTO_ICMPV6:
7115
/* pf_translate() is also used when logging invalid packets */
7116
if (pd->af != AF_INET6)
7117
return (0);
7118
7119
if (afto) {
7120
/* ip_sum will be recalculated in pf_translate_af */
7121
if (pf_translate_icmp_af(AF_INET, &pd->hdr.icmp6))
7122
return (0);
7123
pd->proto = IPPROTO_ICMP;
7124
rewrite = 1;
7125
}
7126
break;
7127
#endif /* INET6 */
7128
7129
default:
7130
break;
7131
}
7132
7133
return (rewrite);
7134
}
7135
7136
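/*
 * Rewrite the packet headers according to the state key 'nk' chosen by
 * the pre-rules translation code.  Returns non-zero if the rewritten
 * protocol header still needs to be copied back into the mbuf by the
 * caller.
 */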
int
7137
pf_translate_compat(struct pf_test_ctx *ctx)
7138
{
7139
struct pf_pdesc *pd = ctx->pd;
7140
struct pf_state_key *nk = ctx->nk;
7141
struct tcphdr *th = &pd->hdr.tcp;
7142
int rewrite = 0;
7143
7144
KASSERT(ctx->sk != NULL, ("%s: null sk", __func__));
7145
KASSERT(ctx->nk != NULL, ("%s: null nk", __func__));
7146
7147
switch (pd->virtual_proto) {
7148
case IPPROTO_TCP:
7149
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], pd->af) ||
7150
nk->port[pd->sidx] != pd->nsport) {
7151
pf_change_ap(pd, pd->src, &th->th_sport,
7152
&nk->addr[pd->sidx], nk->port[pd->sidx]);
7153
pd->sport = &th->th_sport;
7154
pd->nsport = th->th_sport;
7155
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
7156
}
7157
7158
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) ||
7159
nk->port[pd->didx] != pd->ndport) {
7160
pf_change_ap(pd, pd->dst, &th->th_dport,
7161
&nk->addr[pd->didx], nk->port[pd->didx]);
7162
pd->dport = &th->th_dport;
7163
pd->ndport = th->th_dport;
7164
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
7165
}
7166
rewrite++;
7167
break;
7168
case IPPROTO_UDP:
7169
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], pd->af) ||
7170
nk->port[pd->sidx] != pd->nsport) {
7171
pf_change_ap(pd, pd->src,
7172
&pd->hdr.udp.uh_sport,
7173
&nk->addr[pd->sidx],
7174
nk->port[pd->sidx]);
7175
pd->sport = &pd->hdr.udp.uh_sport;
7176
pd->nsport = pd->hdr.udp.uh_sport;
7177
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
7178
}
7179
7180
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) ||
7181
nk->port[pd->didx] != pd->ndport) {
7182
pf_change_ap(pd, pd->dst,
7183
&pd->hdr.udp.uh_dport,
7184
&nk->addr[pd->didx],
7185
nk->port[pd->didx]);
7186
pd->dport = &pd->hdr.udp.uh_dport;
7187
pd->ndport = pd->hdr.udp.uh_dport;
7188
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
7189
}
7190
rewrite++;
7191
break;
7192
case IPPROTO_SCTP: {
7193
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], pd->af) ||
7194
nk->port[pd->sidx] != pd->nsport) {
7195
pf_change_ap(pd, pd->src,
7196
&pd->hdr.sctp.src_port,
7197
&nk->addr[pd->sidx],
7198
nk->port[pd->sidx]);
7199
pd->sport = &pd->hdr.sctp.src_port;
7200
pd->nsport = pd->hdr.sctp.src_port;
7201
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
7202
}
7203
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], pd->af) ||
7204
nk->port[pd->didx] != pd->ndport) {
7205
pf_change_ap(pd, pd->dst,
7206
&pd->hdr.sctp.dest_port,
7207
&nk->addr[pd->didx],
7208
nk->port[pd->didx]);
7209
pd->dport = &pd->hdr.sctp.dest_port;
7210
pd->ndport = pd->hdr.sctp.dest_port;
7211
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
7212
}
7213
break;
7214
}
7215
#ifdef INET
7216
case IPPROTO_ICMP:
7217
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET)) {
7218
pf_change_a(&pd->src->v4.s_addr, pd->ip_sum,
7219
nk->addr[pd->sidx].v4.s_addr, 0);
7220
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
7221
}
7222
7223
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET)) {
7224
pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum,
7225
nk->addr[pd->didx].v4.s_addr, 0);
7226
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
7227
}
7228
7229
if (ctx->virtual_type == htons(ICMP_ECHO) &&
7230
nk->port[pd->sidx] != pd->hdr.icmp.icmp_id) {
7231
pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
7232
pd->hdr.icmp.icmp_cksum, pd->nsport,
7233
nk->port[pd->sidx], 0);
7234
pd->hdr.icmp.icmp_id = nk->port[pd->sidx];
7235
pd->sport = &pd->hdr.icmp.icmp_id;
7236
}
7237
m_copyback(pd->m, pd->off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp);
7238
break;
7239
#endif /* INET */
7240
#ifdef INET6
7241
case IPPROTO_ICMPV6:
7242
if (PF_ANEQ(&pd->nsaddr, &nk->addr[pd->sidx], AF_INET6)) {
7243
pf_change_a6(pd->src, &pd->hdr.icmp6.icmp6_cksum,
7244
&nk->addr[pd->sidx], 0);
7245
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
7246
}
7247
7248
if (PF_ANEQ(&pd->ndaddr, &nk->addr[pd->didx], AF_INET6)) {
7249
pf_change_a6(pd->dst, &pd->hdr.icmp6.icmp6_cksum,
7250
&nk->addr[pd->didx], 0);
7251
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
7252
}
7253
rewrite++;
7254
break;
7255
#endif /* INET6 */
7256
default:
7257
switch (pd->af) {
7258
#ifdef INET
7259
case AF_INET:
7260
if (PF_ANEQ(&pd->nsaddr,
7261
&nk->addr[pd->sidx], AF_INET)) {
7262
pf_change_a(&pd->src->v4.s_addr,
7263
pd->ip_sum,
7264
nk->addr[pd->sidx].v4.s_addr, 0);
7265
pf_addrcpy(&pd->nsaddr, pd->src, pd->af);
7266
}
7267
7268
if (PF_ANEQ(&pd->ndaddr,
7269
&nk->addr[pd->didx], AF_INET)) {
7270
pf_change_a(&pd->dst->v4.s_addr,
7271
pd->ip_sum,
7272
nk->addr[pd->didx].v4.s_addr, 0);
7273
pf_addrcpy(&pd->ndaddr, pd->dst, pd->af);
7274
}
7275
break;
7276
#endif /* INET */
7277
#ifdef INET6
7278
case AF_INET6:
7279
if (PF_ANEQ(&pd->nsaddr,
7280
&nk->addr[pd->sidx], AF_INET6)) {
7281
pf_addrcpy(&pd->nsaddr, &nk->addr[pd->sidx],
7282
pd->af);
7283
pf_addrcpy(pd->src, &nk->addr[pd->sidx], pd->af);
7284
}
7285
7286
if (PF_ANEQ(&pd->ndaddr,
7287
&nk->addr[pd->didx], AF_INET6)) {
7288
pf_addrcpy(&pd->ndaddr, &nk->addr[pd->didx],
7289
pd->af);
7290
pf_addrcpy(pd->dst, &nk->addr[pd->didx],
7291
pd->af);
7292
}
7293
break;
7294
#endif /* INET6 */
7295
}
7296
break;
7297
}
7298
return (rewrite);
7299
}
7300
7301
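/*
 * Full TCP tracking: validate sequence numbers, ACKs and advertised
 * windows for both peers before advancing the connection state, and
 * fall back to a loose match for connections picked up mid-stream.
 */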
static int
7302
pf_tcp_track_full(struct pf_kstate *state, struct pf_pdesc *pd,
7303
u_short *reason, int *copyback, struct pf_state_peer *src,
7304
struct pf_state_peer *dst, u_int8_t psrc, u_int8_t pdst)
7305
{
7306
struct tcphdr *th = &pd->hdr.tcp;
7307
u_int16_t win = ntohs(th->th_win);
7308
u_int32_t ack, end, data_end, seq, orig_seq;
7309
u_int8_t sws, dws;
7310
int ackskew;
7311
7312
if (src->wscale && dst->wscale && !(tcp_get_flags(th) & TH_SYN)) {
7313
sws = src->wscale & PF_WSCALE_MASK;
7314
dws = dst->wscale & PF_WSCALE_MASK;
7315
} else
7316
sws = dws = 0;
7317
7318
/*
7319
* Sequence tracking algorithm from Guido van Rooij's paper:
7320
* http://www.madison-gurkha.com/publications/tcp_filtering/
7321
* tcp_filtering.ps
7322
*/
7323
7324
orig_seq = seq = ntohl(th->th_seq);
7325
if (src->seqlo == 0) {
7326
/* First packet from this end. Set its state */
7327
7328
if ((state->state_flags & PFSTATE_SCRUB_TCP || dst->scrub) &&
7329
src->scrub == NULL) {
7330
if (pf_normalize_tcp_init(pd, th, src)) {
7331
REASON_SET(reason, PFRES_MEMORY);
7332
return (PF_DROP);
7333
}
7334
}
7335
7336
/* Deferred generation of sequence number modulator */
7337
if (dst->seqdiff && !src->seqdiff) {
7338
/* use random iss for the TCP server */
7339
while ((src->seqdiff = arc4random() - seq) == 0)
7340
;
7341
ack = ntohl(th->th_ack) - dst->seqdiff;
7342
pf_change_proto_a(pd->m, &th->th_seq, &th->th_sum, htonl(seq +
7343
src->seqdiff), 0);
7344
pf_change_proto_a(pd->m, &th->th_ack, &th->th_sum, htonl(ack), 0);
7345
*copyback = 1;
7346
} else {
7347
ack = ntohl(th->th_ack);
7348
}
7349
7350
end = seq + pd->p_len;
7351
if (tcp_get_flags(th) & TH_SYN) {
7352
end++;
7353
if (dst->wscale & PF_WSCALE_FLAG) {
7354
src->wscale = pf_get_wscale(pd);
7355
if (src->wscale & PF_WSCALE_FLAG) {
7356
/* Remove scale factor from initial
7357
* window */
7358
sws = src->wscale & PF_WSCALE_MASK;
7359
win = ((u_int32_t)win + (1 << sws) - 1)
7360
>> sws;
7361
dws = dst->wscale & PF_WSCALE_MASK;
7362
} else {
7363
/* fixup other window */
7364
dst->max_win = MIN(TCP_MAXWIN,
7365
(u_int32_t)dst->max_win <<
7366
(dst->wscale & PF_WSCALE_MASK));
7367
/* in case of a retrans SYN|ACK */
7368
dst->wscale = 0;
7369
}
7370
}
7371
}
7372
data_end = end;
7373
if (tcp_get_flags(th) & TH_FIN)
7374
end++;
7375
7376
src->seqlo = seq;
7377
if (src->state < TCPS_SYN_SENT)
7378
pf_set_protostate(state, psrc, TCPS_SYN_SENT);
7379
7380
/*
7381
* May need to slide the window (seqhi may have been set by
7382
* the crappy stack check or if we picked up the connection
7383
* after establishment)
7384
*/
7385
if (src->seqhi == 1 ||
7386
SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
7387
src->seqhi = end + MAX(1, dst->max_win << dws);
7388
if (win > src->max_win)
7389
src->max_win = win;
7390
7391
} else {
7392
ack = ntohl(th->th_ack) - dst->seqdiff;
7393
if (src->seqdiff) {
7394
/* Modulate sequence numbers */
7395
pf_change_proto_a(pd->m, &th->th_seq, &th->th_sum, htonl(seq +
7396
src->seqdiff), 0);
7397
pf_change_proto_a(pd->m, &th->th_ack, &th->th_sum, htonl(ack), 0);
7398
*copyback = 1;
7399
}
7400
end = seq + pd->p_len;
7401
if (tcp_get_flags(th) & TH_SYN)
7402
end++;
7403
data_end = end;
7404
if (tcp_get_flags(th) & TH_FIN)
7405
end++;
7406
}
7407
7408
if ((tcp_get_flags(th) & TH_ACK) == 0) {
7409
/* Let it pass through the ack skew check */
7410
ack = dst->seqlo;
7411
} else if ((ack == 0 &&
7412
(tcp_get_flags(th) & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
7413
/* broken tcp stacks do not set ack */
7414
(dst->state < TCPS_SYN_SENT)) {
7415
/*
7416
* Many stacks (ours included) will set the ACK number in an
7417
* FIN|ACK if the SYN times out -- no sequence to ACK.
7418
*/
7419
ack = dst->seqlo;
7420
}
7421
7422
if (seq == end) {
7423
/* Ease sequencing restrictions on no data packets */
7424
seq = src->seqlo;
7425
data_end = end = seq;
7426
}
7427
7428
ackskew = dst->seqlo - ack;
7429
7430
/*
7431
* Need to demodulate the sequence numbers in any TCP SACK options
7432
* (Selective ACK). We could optionally validate the SACK values
7433
* against the current ACK window, either forwards or backwards, but
7434
* I'm not confident that SACK has been implemented properly
7435
* everywhere. It wouldn't surprise me if several stacks accidentally
7436
* SACK too far backwards of previously ACKed data. There really aren't
7437
* any security implications of bad SACKing unless the target stack
7438
* doesn't validate the option length correctly. Someone trying to
7439
* spoof into a TCP connection won't bother blindly sending SACK
7440
* options anyway.
7441
*/
7442
if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
7443
if (pf_modulate_sack(pd, th, dst))
7444
*copyback = 1;
7445
}
7446
7447
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
7448
if (SEQ_GEQ(src->seqhi, data_end) &&
7449
/* Last octet inside other's window space */
7450
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
7451
/* Retrans: not more than one window back */
7452
(ackskew >= -MAXACKWINDOW) &&
7453
/* Acking not more than one reassembled fragment backwards */
7454
(ackskew <= (MAXACKWINDOW << sws)) &&
7455
/* Acking not more than one window forward */
7456
((tcp_get_flags(th) & TH_RST) == 0 || orig_seq == src->seqlo ||
7457
(orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
7458
/* Require an exact/+1 sequence match on resets when possible */
7459
(SEQ_GEQ(orig_seq, src->seqlo - (dst->max_win << dws)) &&
7460
SEQ_LEQ(orig_seq, src->seqlo + 1) && ackskew == 0 &&
7461
(th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)))) {
7462
/* Allow resets to match sequence window if ack is perfect match */
7463
7464
if (dst->scrub || src->scrub) {
7465
if (pf_normalize_tcp_stateful(pd, reason, th,
7466
state, src, dst, copyback))
7467
return (PF_DROP);
7468
}
7469
7470
/* update max window */
7471
if (src->max_win < win)
7472
src->max_win = win;
7473
/* synchronize sequencing */
7474
if (SEQ_GT(end, src->seqlo))
7475
src->seqlo = end;
7476
/* slide the window of what the other end can send */
7477
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
7478
dst->seqhi = ack + MAX((win << sws), 1);
7479
7480
/* update states */
7481
if (tcp_get_flags(th) & TH_SYN)
7482
if (src->state < TCPS_SYN_SENT)
7483
pf_set_protostate(state, psrc, TCPS_SYN_SENT);
7484
if (tcp_get_flags(th) & TH_FIN)
7485
if (src->state < TCPS_CLOSING)
7486
pf_set_protostate(state, psrc, TCPS_CLOSING);
7487
if (tcp_get_flags(th) & TH_ACK) {
7488
if (dst->state == TCPS_SYN_SENT) {
7489
pf_set_protostate(state, pdst,
7490
TCPS_ESTABLISHED);
7491
if (src->state == TCPS_ESTABLISHED &&
7492
state->sns[PF_SN_LIMIT] != NULL &&
7493
pf_src_connlimit(state)) {
7494
REASON_SET(reason, PFRES_SRCLIMIT);
7495
return (PF_DROP);
7496
}
7497
} else if (dst->state == TCPS_CLOSING)
7498
pf_set_protostate(state, pdst,
7499
TCPS_FIN_WAIT_2);
7500
}
7501
if (tcp_get_flags(th) & TH_RST)
7502
pf_set_protostate(state, PF_PEER_BOTH, TCPS_TIME_WAIT);
7503
7504
/* update expire time */
7505
state->expire = pf_get_uptime();
7506
if (src->state >= TCPS_FIN_WAIT_2 &&
7507
dst->state >= TCPS_FIN_WAIT_2)
7508
state->timeout = PFTM_TCP_CLOSED;
7509
else if (src->state >= TCPS_CLOSING &&
7510
dst->state >= TCPS_CLOSING)
7511
state->timeout = PFTM_TCP_FIN_WAIT;
7512
else if (src->state < TCPS_ESTABLISHED ||
7513
dst->state < TCPS_ESTABLISHED)
7514
state->timeout = PFTM_TCP_OPENING;
7515
else if (src->state >= TCPS_CLOSING ||
7516
dst->state >= TCPS_CLOSING)
7517
state->timeout = PFTM_TCP_CLOSING;
7518
else
7519
state->timeout = PFTM_TCP_ESTABLISHED;
7520
7521
/* Fall through to PASS packet */
7522
7523
} else if ((dst->state < TCPS_SYN_SENT ||
7524
dst->state >= TCPS_FIN_WAIT_2 ||
7525
src->state >= TCPS_FIN_WAIT_2) &&
7526
SEQ_GEQ(src->seqhi + MAXACKWINDOW, data_end) &&
7527
/* Within a window forward of the originating packet */
7528
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
7529
/* Within a window backward of the originating packet */
7530
7531
/*
7532
* This currently handles three situations:
7533
* 1) Stupid stacks will shotgun SYNs before their peer
7534
* replies.
7535
* 2) When PF catches an already established stream (the
7536
* firewall rebooted, the state table was flushed, routes
7537
* changed...)
7538
* 3) Packets get funky immediately after the connection
7539
* closes (this should catch Solaris spurious ACK|FINs
7540
* that web servers like to spew after a close)
7541
*
7542
* This must be a little more careful than the above code
7543
* since packet floods will also be caught here. We don't
7544
* update the TTL here to mitigate the damage of a packet
7545
* flood and so the same code can handle awkward establishment
7546
* and a loosened connection close.
7547
* In the establishment case, a correct peer response will
7548
* validate the connection, go through the normal state code
7549
* and keep updating the state TTL.
7550
*/
7551
7552
if (V_pf_status.debug >= PF_DEBUG_MISC) {
7553
printf("pf: loose state match: ");
7554
pf_print_state(state);
7555
pf_print_flags(tcp_get_flags(th));
7556
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
7557
"pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
7558
pd->p_len, ackskew, (unsigned long long)state->packets[0],
7559
(unsigned long long)state->packets[1],
7560
pd->dir == PF_IN ? "in" : "out",
7561
pd->dir == state->direction ? "fwd" : "rev");
7562
}
7563
7564
if (dst->scrub || src->scrub) {
7565
if (pf_normalize_tcp_stateful(pd, reason, th,
7566
state, src, dst, copyback))
7567
return (PF_DROP);
7568
}
7569
7570
/* update max window */
7571
if (src->max_win < win)
7572
src->max_win = win;
7573
/* synchronize sequencing */
7574
if (SEQ_GT(end, src->seqlo))
7575
src->seqlo = end;
7576
/* slide the window of what the other end can send */
7577
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
7578
dst->seqhi = ack + MAX((win << sws), 1);
7579
7580
/*
7581
* Cannot set dst->seqhi here since this could be a shotgunned
7582
* SYN and not an already established connection.
7583
*/
7584
7585
if (tcp_get_flags(th) & TH_FIN)
7586
if (src->state < TCPS_CLOSING)
7587
pf_set_protostate(state, psrc, TCPS_CLOSING);
7588
if (tcp_get_flags(th) & TH_RST)
7589
pf_set_protostate(state, PF_PEER_BOTH, TCPS_TIME_WAIT);
7590
7591
/* Fall through to PASS packet */
7592
7593
} else {
7594
if (state->dst.state == TCPS_SYN_SENT &&
7595
state->src.state == TCPS_SYN_SENT) {
7596
/* Send RST for state mismatches during handshake */
7597
if (!(tcp_get_flags(th) & TH_RST))
7598
pf_send_tcp(state->rule, pd->af,
7599
pd->dst, pd->src, th->th_dport,
7600
th->th_sport, ntohl(th->th_ack), 0,
7601
TH_RST, 0, 0,
7602
state->rule->return_ttl, M_SKIP_FIREWALL,
7603
0, 0, state->act.rtableid, reason);
7604
src->seqlo = 0;
7605
src->seqhi = 1;
7606
src->max_win = 1;
7607
} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
7608
printf("pf: BAD state: ");
7609
pf_print_state(state);
7610
pf_print_flags(tcp_get_flags(th));
7611
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
7612
"pkts=%llu:%llu dir=%s,%s\n",
7613
seq, orig_seq, ack, pd->p_len, ackskew,
7614
(unsigned long long)state->packets[0],
7615
(unsigned long long)state->packets[1],
7616
pd->dir == PF_IN ? "in" : "out",
7617
pd->dir == state->direction ? "fwd" : "rev");
7618
printf("pf: State failure on: %c %c %c %c | %c %c\n",
7619
SEQ_GEQ(src->seqhi, data_end) ? ' ' : '1',
7620
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
7621
' ': '2',
7622
(ackskew >= -MAXACKWINDOW) ? ' ' : '3',
7623
(ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
7624
SEQ_GEQ(src->seqhi + MAXACKWINDOW, data_end) ?' ' :'5',
7625
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
7626
}
7627
REASON_SET(reason, PFRES_BADSTATE);
7628
return (PF_DROP);
7629
}
7630
7631
return (PF_PASS);
7632
}
7633
7634
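/*
 * Sloppy TCP tracking: advance the peer states and timeouts based on
 * the TCP flags only, without sequence number or window validation.
 */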
static int
7635
pf_tcp_track_sloppy(struct pf_kstate *state, struct pf_pdesc *pd,
7636
u_short *reason, struct pf_state_peer *src, struct pf_state_peer *dst,
7637
u_int8_t psrc, u_int8_t pdst)
7638
{
7639
struct tcphdr *th = &pd->hdr.tcp;
7640
7641
if (tcp_get_flags(th) & TH_SYN)
7642
if (src->state < TCPS_SYN_SENT)
7643
pf_set_protostate(state, psrc, TCPS_SYN_SENT);
7644
if (tcp_get_flags(th) & TH_FIN)
7645
if (src->state < TCPS_CLOSING)
7646
pf_set_protostate(state, psrc, TCPS_CLOSING);
7647
if (tcp_get_flags(th) & TH_ACK) {
7648
if (dst->state == TCPS_SYN_SENT) {
7649
pf_set_protostate(state, pdst, TCPS_ESTABLISHED);
7650
if (src->state == TCPS_ESTABLISHED &&
7651
state->sns[PF_SN_LIMIT] != NULL &&
7652
pf_src_connlimit(state)) {
7653
REASON_SET(reason, PFRES_SRCLIMIT);
7654
return (PF_DROP);
7655
}
7656
} else if (dst->state == TCPS_CLOSING) {
7657
pf_set_protostate(state, pdst, TCPS_FIN_WAIT_2);
7658
} else if (src->state == TCPS_SYN_SENT &&
7659
dst->state < TCPS_SYN_SENT) {
7660
/*
7661
* Handle a special sloppy case where we only see one
7662
* half of the connection. If there is a ACK after
7663
* the initial SYN without ever seeing a packet from
7664
* the destination, set the connection to established.
7665
*/
7666
pf_set_protostate(state, PF_PEER_BOTH,
7667
TCPS_ESTABLISHED);
7668
dst->state = src->state = TCPS_ESTABLISHED;
7669
if (state->sns[PF_SN_LIMIT] != NULL &&
7670
pf_src_connlimit(state)) {
7671
REASON_SET(reason, PFRES_SRCLIMIT);
7672
return (PF_DROP);
7673
}
7674
} else if (src->state == TCPS_CLOSING &&
7675
dst->state == TCPS_ESTABLISHED &&
7676
dst->seqlo == 0) {
7677
/*
7678
* Handle the closing of half connections where we
7679
* don't see the full bidirectional FIN/ACK+ACK
7680
* handshake.
7681
*/
7682
pf_set_protostate(state, pdst, TCPS_CLOSING);
7683
}
7684
}
7685
if (tcp_get_flags(th) & TH_RST)
7686
pf_set_protostate(state, PF_PEER_BOTH, TCPS_TIME_WAIT);
7687
7688
/* update expire time */
7689
state->expire = pf_get_uptime();
7690
if (src->state >= TCPS_FIN_WAIT_2 &&
7691
dst->state >= TCPS_FIN_WAIT_2)
7692
state->timeout = PFTM_TCP_CLOSED;
7693
else if (src->state >= TCPS_CLOSING &&
7694
dst->state >= TCPS_CLOSING)
7695
state->timeout = PFTM_TCP_FIN_WAIT;
7696
else if (src->state < TCPS_ESTABLISHED ||
7697
dst->state < TCPS_ESTABLISHED)
7698
state->timeout = PFTM_TCP_OPENING;
7699
else if (src->state >= TCPS_CLOSING ||
7700
dst->state >= TCPS_CLOSING)
7701
state->timeout = PFTM_TCP_CLOSING;
7702
else
7703
state->timeout = PFTM_TCP_ESTABLISHED;
7704
7705
return (PF_PASS);
7706
}
7707
7708
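/*
 * Drive the synproxy handshake: first complete the three-way handshake
 * with the original source (PF_TCPS_PROXY_SRC), then open the
 * connection to the destination and splice the two halves together by
 * recording the sequence number offsets (PF_TCPS_PROXY_DST).
 */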
static int
7709
pf_synproxy(struct pf_pdesc *pd, struct pf_kstate *state, u_short *reason)
7710
{
7711
struct pf_state_key *sk = state->key[pd->didx];
7712
struct tcphdr *th = &pd->hdr.tcp;
7713
7714
if (state->src.state == PF_TCPS_PROXY_SRC) {
7715
if (pd->dir != state->direction) {
7716
REASON_SET(reason, PFRES_SYNPROXY);
7717
return (PF_SYNPROXY_DROP);
7718
}
7719
if (tcp_get_flags(th) & TH_SYN) {
7720
if (ntohl(th->th_seq) != state->src.seqlo) {
7721
REASON_SET(reason, PFRES_SYNPROXY);
7722
return (PF_DROP);
7723
}
7724
pf_send_tcp(state->rule, pd->af, pd->dst,
7725
pd->src, th->th_dport, th->th_sport,
7726
state->src.seqhi, ntohl(th->th_seq) + 1,
7727
TH_SYN|TH_ACK, 0, state->src.mss, 0,
7728
M_SKIP_FIREWALL, 0, 0, state->act.rtableid,
7729
reason);
7730
REASON_SET(reason, PFRES_SYNPROXY);
7731
return (PF_SYNPROXY_DROP);
7732
} else if ((tcp_get_flags(th) & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK ||
7733
(ntohl(th->th_ack) != state->src.seqhi + 1) ||
7734
(ntohl(th->th_seq) != state->src.seqlo + 1)) {
7735
REASON_SET(reason, PFRES_SYNPROXY);
7736
return (PF_DROP);
7737
} else if (state->sns[PF_SN_LIMIT] != NULL &&
7738
pf_src_connlimit(state)) {
7739
REASON_SET(reason, PFRES_SRCLIMIT);
7740
return (PF_DROP);
7741
} else
7742
pf_set_protostate(state, PF_PEER_SRC,
7743
PF_TCPS_PROXY_DST);
7744
}
7745
if (state->src.state == PF_TCPS_PROXY_DST) {
7746
if (pd->dir == state->direction) {
7747
if (((tcp_get_flags(th) & (TH_SYN|TH_ACK)) != TH_ACK) ||
7748
(ntohl(th->th_ack) != state->src.seqhi + 1) ||
7749
(ntohl(th->th_seq) != state->src.seqlo + 1)) {
7750
REASON_SET(reason, PFRES_SYNPROXY);
7751
return (PF_DROP);
7752
}
7753
state->src.max_win = MAX(ntohs(th->th_win), 1);
7754
if (state->dst.seqhi == 1)
7755
state->dst.seqhi = arc4random();
7756
pf_send_tcp(state->rule, pd->af,
7757
&sk->addr[pd->sidx], &sk->addr[pd->didx],
7758
sk->port[pd->sidx], sk->port[pd->didx],
7759
state->dst.seqhi, 0, TH_SYN, 0,
7760
state->src.mss, 0,
7761
state->orig_kif->pfik_ifp == V_loif ? M_LOOP : 0,
7762
state->tag, 0, state->act.rtableid,
7763
reason);
7764
REASON_SET(reason, PFRES_SYNPROXY);
7765
return (PF_SYNPROXY_DROP);
7766
} else if (((tcp_get_flags(th) & (TH_SYN|TH_ACK)) !=
7767
(TH_SYN|TH_ACK)) ||
7768
(ntohl(th->th_ack) != state->dst.seqhi + 1)) {
7769
REASON_SET(reason, PFRES_SYNPROXY);
7770
return (PF_DROP);
7771
} else {
7772
state->dst.max_win = MAX(ntohs(th->th_win), 1);
7773
state->dst.seqlo = ntohl(th->th_seq);
7774
pf_send_tcp(state->rule, pd->af, pd->dst,
7775
pd->src, th->th_dport, th->th_sport,
7776
ntohl(th->th_ack), ntohl(th->th_seq) + 1,
7777
TH_ACK, state->src.max_win, 0, 0, 0,
7778
state->tag, 0, state->act.rtableid,
7779
reason);
7780
pf_send_tcp(state->rule, pd->af,
7781
&sk->addr[pd->sidx], &sk->addr[pd->didx],
7782
sk->port[pd->sidx], sk->port[pd->didx],
7783
state->src.seqhi + 1, state->src.seqlo + 1,
7784
TH_ACK, state->dst.max_win, 0, 0,
7785
M_SKIP_FIREWALL, 0, 0, state->act.rtableid,
7786
reason);
7787
state->src.seqdiff = state->dst.seqhi -
7788
state->src.seqlo;
7789
state->dst.seqdiff = state->src.seqhi -
7790
state->dst.seqlo;
7791
state->src.seqhi = state->src.seqlo +
7792
state->dst.max_win;
7793
state->dst.seqhi = state->dst.seqlo +
7794
state->src.max_win;
7795
state->src.wscale = state->dst.wscale = 0;
7796
pf_set_protostate(state, PF_PEER_BOTH,
7797
TCPS_ESTABLISHED);
7798
REASON_SET(reason, PFRES_SYNPROXY);
7799
return (PF_SYNPROXY_DROP);
7800
}
7801
}
7802
7803
return (PF_PASS);
7804
}
7805
7806
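/*
 * Begin a synproxy exchange for a freshly created state: on the
 * initial SYN we move the source peer to PF_TCPS_PROXY_SRC, pick a
 * random ISN, derive an MSS and answer with our own SYN|ACK while the
 * original segment is dropped.
 */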
static __inline int
7807
pf_synproxy_ack(struct pf_krule *r, struct pf_pdesc *pd, struct pf_kstate **sm,
7808
struct pf_rule_actions *act)
7809
{
7810
struct tcphdr *th = &pd->hdr.tcp;
7811
struct pf_kstate *s;
7812
u_int16_t mss;
7813
int rtid;
7814
u_short reason;
7815
7816
if ((th->th_flags & (TH_SYN | TH_ACK)) != TH_SYN)
7817
return (PF_PASS);
7818
7819
s = *sm;
7820
rtid = act->rtableid;
7821
7822
pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
7823
s->src.seqhi = arc4random();
7824
/* Find mss option */
7825
mss = pf_get_mss(pd);
7826
mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
7827
mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
7828
s->src.mss = mss;
7829
7830
pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
7831
th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
7832
TH_SYN | TH_ACK, 0, s->src.mss, 0, 1, 0, 0, r->rtableid, NULL);
7833
7834
REASON_SET(&reason, PFRES_SYNPROXY);
7835
return (PF_SYNPROXY_DROP);
7836
}
7837
7838
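/*
 * Match a packet against an existing state and run the per-protocol
 * state machine (TCP full or sloppy tracking, UDP, SCTP, other).
 * The state's expire/timeout values are refreshed and any translation
 * stored in the state keys (NAT or af-to) is applied to the packet
 * headers before returning the verdict.
 */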
static int
7839
pf_test_state(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason)
7840
{
7841
struct pf_state_key_cmp key;
7842
int copyback = 0;
7843
struct pf_state_peer *src, *dst;
7844
uint8_t psrc, pdst;
7845
int action;
7846
7847
bzero(&key, sizeof(key));
7848
key.af = pd->af;
7849
key.proto = pd->virtual_proto;
7850
pf_addrcpy(&key.addr[pd->sidx], pd->src, key.af);
7851
pf_addrcpy(&key.addr[pd->didx], pd->dst, key.af);
7852
key.port[pd->sidx] = pd->osport;
7853
key.port[pd->didx] = pd->odport;
7854
7855
action = pf_find_state(pd, &key, state);
7856
if (action != PF_MATCH)
7857
return (action);
7858
7859
action = PF_PASS;
7860
if (pd->dir == (*state)->direction) {
7861
if (PF_REVERSED_KEY(*state, pd->af)) {
7862
src = &(*state)->dst;
7863
dst = &(*state)->src;
7864
psrc = PF_PEER_DST;
7865
pdst = PF_PEER_SRC;
7866
} else {
7867
src = &(*state)->src;
7868
dst = &(*state)->dst;
7869
psrc = PF_PEER_SRC;
7870
pdst = PF_PEER_DST;
7871
}
7872
} else {
7873
if (PF_REVERSED_KEY(*state, pd->af)) {
7874
src = &(*state)->src;
7875
dst = &(*state)->dst;
7876
psrc = PF_PEER_SRC;
7877
pdst = PF_PEER_DST;
7878
} else {
7879
src = &(*state)->dst;
7880
dst = &(*state)->src;
7881
psrc = PF_PEER_DST;
7882
pdst = PF_PEER_SRC;
7883
}
7884
}
7885
7886
switch (pd->virtual_proto) {
7887
case IPPROTO_TCP: {
7888
struct tcphdr *th = &pd->hdr.tcp;
7889
7890
if ((action = pf_synproxy(pd, *state, reason)) != PF_PASS)
7891
return (action);
7892
if (((tcp_get_flags(th) & (TH_SYN | TH_ACK)) == TH_SYN) ||
7893
((th->th_flags & (TH_SYN | TH_ACK | TH_RST)) == TH_ACK &&
7894
pf_syncookie_check(pd) && pd->dir == PF_IN)) {
7895
if ((*state)->src.state >= TCPS_FIN_WAIT_2 &&
7896
(*state)->dst.state >= TCPS_FIN_WAIT_2) {
7897
if (V_pf_status.debug >= PF_DEBUG_MISC) {
7898
printf("pf: state reuse ");
7899
pf_print_state(*state);
7900
pf_print_flags(tcp_get_flags(th));
7901
printf("\n");
7902
}
7903
/* XXX make sure it's the same direction ?? */
7904
pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED);
7905
pf_remove_state(*state);
7906
*state = NULL;
7907
return (PF_DROP);
7908
} else if ((*state)->src.state >= TCPS_ESTABLISHED &&
7909
(*state)->dst.state >= TCPS_ESTABLISHED) {
7910
/*
7911
* SYN matches existing state???
7912
* Typically happens when sender boots up after
7913
* sudden panic. Certain protocols (NFSv3) are
7914
* always using same port numbers. Challenge
7915
* ACK enables all parties (firewall and peers)
7916
* to get in sync again.
7917
*/
7918
pf_send_challenge_ack(pd, *state, src, dst, reason);
7919
return (PF_DROP);
7920
}
7921
}
7922
if ((*state)->state_flags & PFSTATE_SLOPPY) {
7923
if (pf_tcp_track_sloppy(*state, pd, reason, src, dst,
7924
psrc, pdst) == PF_DROP)
7925
return (PF_DROP);
7926
} else {
7927
int ret;
7928
7929
ret = pf_tcp_track_full(*state, pd, reason,
7930
&copyback, src, dst, psrc, pdst);
7931
if (ret == PF_DROP)
7932
return (PF_DROP);
7933
}
7934
break;
7935
}
7936
case IPPROTO_UDP:
7937
/* update states */
7938
if (src->state < PFUDPS_SINGLE)
7939
pf_set_protostate(*state, psrc, PFUDPS_SINGLE);
7940
if (dst->state == PFUDPS_SINGLE)
7941
pf_set_protostate(*state, pdst, PFUDPS_MULTIPLE);
7942
7943
/* update expire time */
7944
(*state)->expire = pf_get_uptime();
7945
if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
7946
(*state)->timeout = PFTM_UDP_MULTIPLE;
7947
else
7948
(*state)->timeout = PFTM_UDP_SINGLE;
7949
break;
7950
case IPPROTO_SCTP:
7951
if ((src->state >= SCTP_SHUTDOWN_SENT || src->state == SCTP_CLOSED) &&
7952
(dst->state >= SCTP_SHUTDOWN_SENT || dst->state == SCTP_CLOSED) &&
7953
pd->sctp_flags & PFDESC_SCTP_INIT) {
7954
pf_set_protostate(*state, PF_PEER_BOTH, SCTP_CLOSED);
7955
pf_remove_state(*state);
7956
*state = NULL;
7957
return (PF_DROP);
7958
}
7959
7960
if (pf_sctp_track(*state, pd, reason) != PF_PASS)
7961
return (PF_DROP);
7962
7963
/* Track state. */
7964
if (pd->sctp_flags & PFDESC_SCTP_INIT) {
7965
if (src->state < SCTP_COOKIE_WAIT) {
7966
pf_set_protostate(*state, psrc, SCTP_COOKIE_WAIT);
7967
(*state)->timeout = PFTM_SCTP_OPENING;
7968
}
7969
}
7970
if (pd->sctp_flags & PFDESC_SCTP_INIT_ACK) {
7971
MPASS(dst->scrub != NULL);
7972
if (dst->scrub->pfss_v_tag == 0)
7973
dst->scrub->pfss_v_tag = pd->sctp_initiate_tag;
7974
}
7975
7976
/*
7977
* Bind to the correct interface if we're if-bound. For multihomed
7978
* extra associations we don't know which interface that will be until
7979
* here, so we've inserted the state on V_pf_all. Fix that now.
7980
*/
7981
if ((*state)->kif == V_pfi_all &&
7982
(*state)->rule->rule_flag & PFRULE_IFBOUND)
7983
(*state)->kif = pd->kif;
7984
7985
if (pd->sctp_flags & (PFDESC_SCTP_COOKIE | PFDESC_SCTP_HEARTBEAT_ACK)) {
7986
if (src->state < SCTP_ESTABLISHED) {
7987
pf_set_protostate(*state, psrc, SCTP_ESTABLISHED);
7988
(*state)->timeout = PFTM_SCTP_ESTABLISHED;
7989
}
7990
}
7991
if (pd->sctp_flags & (PFDESC_SCTP_SHUTDOWN |
7992
PFDESC_SCTP_SHUTDOWN_COMPLETE)) {
7993
if (src->state < SCTP_SHUTDOWN_PENDING) {
7994
pf_set_protostate(*state, psrc, SCTP_SHUTDOWN_PENDING);
7995
(*state)->timeout = PFTM_SCTP_CLOSING;
7996
}
7997
}
7998
if (pd->sctp_flags & (PFDESC_SCTP_SHUTDOWN_COMPLETE | PFDESC_SCTP_ABORT)) {
7999
pf_set_protostate(*state, psrc, SCTP_CLOSED);
8000
(*state)->timeout = PFTM_SCTP_CLOSED;
8001
}
8002
8003
(*state)->expire = pf_get_uptime();
8004
break;
8005
default:
8006
/* update states */
8007
if (src->state < PFOTHERS_SINGLE)
8008
pf_set_protostate(*state, psrc, PFOTHERS_SINGLE);
8009
if (dst->state == PFOTHERS_SINGLE)
8010
pf_set_protostate(*state, pdst, PFOTHERS_MULTIPLE);
8011
8012
/* update expire time */
8013
(*state)->expire = pf_get_uptime();
8014
if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
8015
(*state)->timeout = PFTM_OTHER_MULTIPLE;
8016
else
8017
(*state)->timeout = PFTM_OTHER_SINGLE;
8018
break;
8019
}
8020
8021
/* translate source/destination address, if necessary */
8022
if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
8023
struct pf_state_key *nk;
8024
int afto, sidx, didx;
8025
8026
if (PF_REVERSED_KEY(*state, pd->af))
8027
nk = (*state)->key[pd->sidx];
8028
else
8029
nk = (*state)->key[pd->didx];
8030
8031
afto = pd->af != nk->af;
8032
8033
if (afto && (*state)->direction == PF_IN) {
8034
sidx = pd->didx;
8035
didx = pd->sidx;
8036
} else {
8037
sidx = pd->sidx;
8038
didx = pd->didx;
8039
}
8040
8041
if (afto) {
8042
pf_addrcpy(&pd->nsaddr, &nk->addr[sidx], nk->af);
8043
pf_addrcpy(&pd->ndaddr, &nk->addr[didx], nk->af);
8044
pd->naf = nk->af;
8045
action = PF_AFRT;
8046
}
8047
8048
if (afto || PF_ANEQ(pd->src, &nk->addr[sidx], pd->af) ||
8049
nk->port[sidx] != pd->osport)
8050
pf_change_ap(pd, pd->src, pd->sport,
8051
&nk->addr[sidx], nk->port[sidx]);
8052
8053
if (afto || PF_ANEQ(pd->dst, &nk->addr[didx], pd->af) ||
8054
nk->port[didx] != pd->odport)
8055
pf_change_ap(pd, pd->dst, pd->dport,
8056
&nk->addr[didx], nk->port[didx]);
8057
8058
copyback = 1;
8059
}
8060
8061
if (copyback && pd->hdrlen > 0)
8062
m_copyback(pd->m, pd->off, pd->hdrlen, pd->hdr.any);
8063
8064
return (action);
8065
}
8066
8067
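/*
 * Check the SCTP verification tag of the sending peer against the
 * value recorded in its scrub state.  While the association is still
 * being set up the tag may be (re)learned, to cope with retransmitted
 * INIT/INIT_ACK chunks.
 */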
static int
8068
pf_sctp_track(struct pf_kstate *state, struct pf_pdesc *pd,
8069
u_short *reason)
8070
{
8071
struct pf_state_peer *src;
8072
if (pd->dir == state->direction) {
8073
if (PF_REVERSED_KEY(state, pd->af))
8074
src = &state->dst;
8075
else
8076
src = &state->src;
8077
} else {
8078
if (PF_REVERSED_KEY(state, pd->af))
8079
src = &state->src;
8080
else
8081
src = &state->dst;
8082
}
8083
8084
if (src->scrub != NULL) {
8085
/*
8086
* Allow tags to be updated, in case of retransmission of
8087
* INIT/INIT_ACK chunks.
8088
*/
8089
if (src->state <= SCTP_COOKIE_WAIT)
8090
src->scrub->pfss_v_tag = pd->hdr.sctp.v_tag;
8091
else if (src->scrub->pfss_v_tag != pd->hdr.sctp.v_tag)
8092
return (PF_DROP);
8093
}
8094
8095
return (PF_PASS);
8096
}
8097
8098
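/*
 * Remove this state's wire address from the global SCTP endpoint
 * table, for the verification tags of both peers, and free endpoint
 * entries that end up without any sources.
 */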
static void
8099
pf_sctp_multihome_detach_addr(const struct pf_kstate *s)
8100
{
8101
struct pf_sctp_endpoint key;
8102
struct pf_sctp_endpoint *ep;
8103
struct pf_state_key *sks = s->key[PF_SK_STACK];
8104
struct pf_sctp_source *i, *tmp;
8105
8106
if (sks == NULL || sks->proto != IPPROTO_SCTP || s->dst.scrub == NULL)
8107
return;
8108
8109
PF_SCTP_ENDPOINTS_LOCK();
8110
8111
key.v_tag = s->dst.scrub->pfss_v_tag;
8112
ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
8113
if (ep != NULL) {
8114
TAILQ_FOREACH_SAFE(i, &ep->sources, entry, tmp) {
8115
if (pf_addr_cmp(&i->addr,
8116
&s->key[PF_SK_WIRE]->addr[s->direction == PF_OUT],
8117
s->key[PF_SK_WIRE]->af) == 0) {
8118
SDT_PROBE3(pf, sctp, multihome, remove,
8119
key.v_tag, s, i);
8120
TAILQ_REMOVE(&ep->sources, i, entry);
8121
free(i, M_PFTEMP);
8122
break;
8123
}
8124
}
8125
8126
if (TAILQ_EMPTY(&ep->sources)) {
8127
RB_REMOVE(pf_sctp_endpoints, &V_pf_sctp_endpoints, ep);
8128
free(ep, M_PFTEMP);
8129
}
8130
}
8131
8132
/* Other direction. */
8133
key.v_tag = s->src.scrub->pfss_v_tag;
8134
ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
8135
if (ep != NULL) {
8136
TAILQ_FOREACH_SAFE(i, &ep->sources, entry, tmp) {
8137
if (pf_addr_cmp(&i->addr,
8138
&s->key[PF_SK_WIRE]->addr[s->direction == PF_IN],
8139
s->key[PF_SK_WIRE]->af) == 0) {
8140
SDT_PROBE3(pf, sctp, multihome, remove,
8141
key.v_tag, s, i);
8142
TAILQ_REMOVE(&ep->sources, i, entry);
8143
free(i, M_PFTEMP);
8144
break;
8145
}
8146
}
8147
8148
if (TAILQ_EMPTY(&ep->sources)) {
8149
RB_REMOVE(pf_sctp_endpoints, &V_pf_sctp_endpoints, ep);
8150
free(ep, M_PFTEMP);
8151
}
8152
}
8153
8154
PF_SCTP_ENDPOINTS_UNLOCK();
8155
}
8156
8157
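/*
 * Record an additional source address for the SCTP endpoint
 * identified by v_tag.  Duplicate addresses are ignored and the
 * number of addresses per endpoint is capped.
 */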
static void
8158
pf_sctp_multihome_add_addr(struct pf_pdesc *pd, struct pf_addr *a, uint32_t v_tag)
8159
{
8160
struct pf_sctp_endpoint key = {
8161
.v_tag = v_tag,
8162
};
8163
struct pf_sctp_source *i;
8164
struct pf_sctp_endpoint *ep;
8165
int count;
8166
8167
PF_SCTP_ENDPOINTS_LOCK();
8168
8169
ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
8170
if (ep == NULL) {
8171
ep = malloc(sizeof(struct pf_sctp_endpoint),
8172
M_PFTEMP, M_NOWAIT);
8173
if (ep == NULL) {
8174
PF_SCTP_ENDPOINTS_UNLOCK();
8175
return;
8176
}
8177
8178
ep->v_tag = v_tag;
8179
TAILQ_INIT(&ep->sources);
8180
RB_INSERT(pf_sctp_endpoints, &V_pf_sctp_endpoints, ep);
8181
}
8182
8183
/* Avoid inserting duplicates. */
8184
count = 0;
8185
TAILQ_FOREACH(i, &ep->sources, entry) {
8186
count++;
8187
if (pf_addr_cmp(&i->addr, a, pd->af) == 0) {
8188
PF_SCTP_ENDPOINTS_UNLOCK();
8189
return;
8190
}
8191
}
8192
8193
/* Limit the number of addresses per endpoint. */
8194
if (count >= PF_SCTP_MAX_ENDPOINTS) {
8195
PF_SCTP_ENDPOINTS_UNLOCK();
8196
return;
8197
}
8198
8199
i = malloc(sizeof(*i), M_PFTEMP, M_NOWAIT);
8200
if (i == NULL) {
8201
PF_SCTP_ENDPOINTS_UNLOCK();
8202
return;
8203
}
8204
8205
i->af = pd->af;
8206
memcpy(&i->addr, a, sizeof(*a));
8207
TAILQ_INSERT_TAIL(&ep->sources, i, entry);
8208
SDT_PROBE2(pf, sctp, multihome, add, v_tag, i);
8209
8210
PF_SCTP_ENDPOINTS_UNLOCK();
8211
}
8212
8213
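/*
 * Work off the multihome jobs queued by pf_multihome_scan().  For
 * SCTP_ADD_IP_ADDRESS we run the ruleset to create states for the
 * advertised address (and queue further jobs for the other addresses
 * already known for this endpoint); for SCTP_DEL_IP_ADDRESS we push
 * the matching state towards shutdown.
 */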
static void
8214
pf_sctp_multihome_delayed(struct pf_pdesc *pd, struct pfi_kkif *kif,
8215
struct pf_kstate *s, int action)
8216
{
8217
struct pf_krule_slist match_rules;
8218
struct pf_sctp_multihome_job *j, *tmp;
8219
struct pf_sctp_source *i;
8220
int ret;
8221
struct pf_kstate *sm = NULL;
8222
struct pf_krule *ra = NULL;
8223
struct pf_krule *r = &V_pf_default_rule;
8224
struct pf_kruleset *rs = NULL;
8225
u_short reason;
8226
bool do_extra = true;
8227
8228
PF_RULES_RLOCK_TRACKER;
8229
8230
again:
8231
TAILQ_FOREACH_SAFE(j, &pd->sctp_multihome_jobs, next, tmp) {
8232
if (s == NULL || action != PF_PASS)
8233
goto free;
8234
8235
/* Confirm we don't recurse here. */
8236
MPASS(! (pd->sctp_flags & PFDESC_SCTP_ADD_IP));
8237
8238
switch (j->op) {
8239
case SCTP_ADD_IP_ADDRESS: {
8240
uint32_t v_tag = pd->sctp_initiate_tag;
8241
8242
if (v_tag == 0) {
8243
if (s->direction == pd->dir)
8244
v_tag = s->src.scrub->pfss_v_tag;
8245
else
8246
v_tag = s->dst.scrub->pfss_v_tag;
8247
}
8248
8249
/*
8250
* Avoid duplicating states. We'll already have
8251
* created a state based on the source address of
8252
* the packet, but SCTP endpoints may also list this
8253
* address again in the INIT(_ACK) parameters.
8254
*/
8255
if (pf_addr_cmp(&j->src, pd->src, pd->af) == 0) {
8256
break;
8257
}
8258
8259
j->pd.sctp_flags |= PFDESC_SCTP_ADD_IP;
8260
PF_RULES_RLOCK();
8261
sm = NULL;
8262
if (s->rule->rule_flag & PFRULE_ALLOW_RELATED) {
8263
j->pd.related_rule = s->rule;
8264
}
8265
SLIST_INIT(&match_rules);
8266
ret = pf_test_rule(&r, &sm,
8267
&j->pd, &ra, &rs, &reason, NULL, &match_rules);
8268
/*
8269
* Nothing to do about match rules, the processed
8270
* packet has already increased the counters.
8271
*/
8272
pf_free_match_rules(&match_rules);
8273
PF_RULES_RUNLOCK();
8274
SDT_PROBE4(pf, sctp, multihome, test, kif, r, j->pd.m, ret);
8275
if (ret != PF_DROP && sm != NULL) {
8276
/* Inherit v_tag values. */
8277
if (sm->direction == s->direction) {
8278
sm->src.scrub->pfss_v_tag = s->src.scrub->pfss_v_tag;
8279
sm->dst.scrub->pfss_v_tag = s->dst.scrub->pfss_v_tag;
8280
} else {
8281
sm->src.scrub->pfss_v_tag = s->dst.scrub->pfss_v_tag;
8282
sm->dst.scrub->pfss_v_tag = s->src.scrub->pfss_v_tag;
8283
}
8284
PF_STATE_UNLOCK(sm);
8285
} else {
8286
/* Rule lookup dropped the packet or no state was created, e.g. on a duplicate insert; skip this address. */
8287
break;
8288
}
8289
8290
/* Only add the address if we've actually allowed the state. */
8291
pf_sctp_multihome_add_addr(pd, &j->src, v_tag);
8292
8293
if (! do_extra) {
8294
break;
8295
}
8296
/*
8297
* We need to do this for each of our source addresses.
8298
* Find those based on the verification tag.
8299
*/
8300
struct pf_sctp_endpoint key = {
8301
.v_tag = pd->hdr.sctp.v_tag,
8302
};
8303
struct pf_sctp_endpoint *ep;
8304
8305
PF_SCTP_ENDPOINTS_LOCK();
8306
ep = RB_FIND(pf_sctp_endpoints, &V_pf_sctp_endpoints, &key);
8307
if (ep == NULL) {
8308
PF_SCTP_ENDPOINTS_UNLOCK();
8309
break;
8310
}
8311
MPASS(ep != NULL);
8312
8313
TAILQ_FOREACH(i, &ep->sources, entry) {
8314
struct pf_sctp_multihome_job *nj;
8315
8316
/* SCTP can intermingle IPv4 and IPv6. */
8317
if (i->af != pd->af)
8318
continue;
8319
8320
nj = malloc(sizeof(*nj), M_PFTEMP, M_NOWAIT | M_ZERO);
8321
if (! nj) {
8322
continue;
8323
}
8324
memcpy(&nj->pd, &j->pd, sizeof(j->pd));
8325
memcpy(&nj->src, &j->src, sizeof(nj->src));
8326
nj->pd.src = &nj->src;
8327
/* New destination address. */
8328
memcpy(&nj->dst, &i->addr, sizeof(nj->dst));
8329
nj->pd.dst = &nj->dst;
8330
nj->pd.m = j->pd.m;
8331
nj->op = j->op;
8332
8333
MPASS(nj->pd.pcksum);
8334
TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, nj, next);
8335
}
8336
PF_SCTP_ENDPOINTS_UNLOCK();
8337
8338
break;
8339
}
8340
case SCTP_DEL_IP_ADDRESS: {
8341
struct pf_state_key_cmp key;
8342
uint8_t psrc;
8343
int action;
8344
8345
bzero(&key, sizeof(key));
8346
key.af = j->pd.af;
8347
key.proto = IPPROTO_SCTP;
8348
if (j->pd.dir == PF_IN) { /* wire side, straight */
8349
pf_addrcpy(&key.addr[0], j->pd.src, key.af);
8350
pf_addrcpy(&key.addr[1], j->pd.dst, key.af);
8351
key.port[0] = j->pd.hdr.sctp.src_port;
8352
key.port[1] = j->pd.hdr.sctp.dest_port;
8353
} else { /* stack side, reverse */
8354
pf_addrcpy(&key.addr[1], j->pd.src, key.af);
8355
pf_addrcpy(&key.addr[0], j->pd.dst, key.af);
8356
key.port[1] = j->pd.hdr.sctp.src_port;
8357
key.port[0] = j->pd.hdr.sctp.dest_port;
8358
}
8359
8360
action = pf_find_state(&j->pd, &key, &sm);
8361
if (action == PF_MATCH) {
8362
PF_STATE_LOCK_ASSERT(sm);
8363
if (j->pd.dir == sm->direction) {
8364
psrc = PF_PEER_SRC;
8365
} else {
8366
psrc = PF_PEER_DST;
8367
}
8368
pf_set_protostate(sm, psrc, SCTP_SHUTDOWN_PENDING);
8369
sm->timeout = PFTM_SCTP_CLOSING;
8370
PF_STATE_UNLOCK(sm);
8371
}
8372
break;
8373
default:
8374
panic("Unknown op %#x", j->op);
8375
}
8376
}
8377
8378
free:
8379
TAILQ_REMOVE(&pd->sctp_multihome_jobs, j, next);
8380
free(j, M_PFTEMP);
8381
}
8382
8383
/* We may have inserted extra work while processing the list. */
8384
if (! TAILQ_EMPTY(&pd->sctp_multihome_jobs)) {
8385
do_extra = false;
8386
goto again;
8387
}
8388
}
8389
8390
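/*
 * Walk the SCTP parameters of an INIT/INIT_ACK or ASCONF chunk and
 * queue a multihome job for every IPv4/IPv6 address parameter found;
 * ADD/DEL address parameters are scanned recursively.  The jobs are
 * processed later by pf_sctp_multihome_delayed(), because the state
 * lock is still held at this point.
 */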
static int
8391
pf_multihome_scan(int start, int len, struct pf_pdesc *pd, int op)
8392
{
8393
int off = 0;
8394
struct pf_sctp_multihome_job *job;
8395
8396
SDT_PROBE4(pf, sctp, multihome_scan, entry, start, len, pd, op);
8397
8398
while (off < len) {
8399
struct sctp_paramhdr h;
8400
8401
if (!pf_pull_hdr(pd->m, start + off, &h, sizeof(h), NULL,
8402
pd->af))
8403
return (PF_DROP);
8404
8405
/* Parameters are at least 4 bytes. */
8406
if (ntohs(h.param_length) < 4)
8407
return (PF_DROP);
8408
8409
SDT_PROBE2(pf, sctp, multihome_scan, param, ntohs(h.param_type),
8410
ntohs(h.param_length));
8411
8412
switch (ntohs(h.param_type)) {
8413
case SCTP_IPV4_ADDRESS: {
8414
struct in_addr t;
8415
8416
if (ntohs(h.param_length) !=
8417
(sizeof(struct sctp_paramhdr) + sizeof(t)))
8418
return (PF_DROP);
8419
8420
if (!pf_pull_hdr(pd->m, start + off + sizeof(h), &t, sizeof(t),
8421
NULL, pd->af))
8422
return (PF_DROP);
8423
8424
if (in_nullhost(t))
8425
t.s_addr = pd->src->v4.s_addr;
8426
8427
/*
8428
* We hold the state lock (idhash) here, which means
8429
* that we can't acquire the keyhash, or we'll get a
8430
* LOR (and potentially double-lock things too). We also
8431
* can't release the state lock here, so instead we'll
8432
* enqueue this for async handling.
8433
* There's a relatively small race here, in that a
8434
* packet using the new addresses could arrive already,
8435
* but that's just tough luck for it.
8436
*/
8437
job = malloc(sizeof(*job), M_PFTEMP, M_NOWAIT | M_ZERO);
8438
if (! job)
8439
return (PF_DROP);
8440
8441
SDT_PROBE2(pf, sctp, multihome_scan, ipv4, &t, op);
8442
8443
memcpy(&job->pd, pd, sizeof(*pd));
8444
8445
/* New source address. */
8446
memcpy(&job->src, &t, sizeof(t));
8447
job->pd.src = &job->src;
8448
memcpy(&job->dst, pd->dst, sizeof(job->dst));
8449
job->pd.dst = &job->dst;
8450
job->pd.m = pd->m;
8451
job->op = op;
8452
8453
MPASS(job->pd.pcksum);
8454
TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, job, next);
8455
break;
8456
}
8457
#ifdef INET6
8458
case SCTP_IPV6_ADDRESS: {
8459
struct in6_addr t;
8460
8461
if (ntohs(h.param_length) !=
8462
(sizeof(struct sctp_paramhdr) + sizeof(t)))
8463
return (PF_DROP);
8464
8465
if (!pf_pull_hdr(pd->m, start + off + sizeof(h), &t, sizeof(t),
8466
NULL, pd->af))
8467
return (PF_DROP);
8468
if (memcmp(&t, &pd->src->v6, sizeof(t)) == 0)
8469
break;
8470
if (memcmp(&t, &in6addr_any, sizeof(t)) == 0)
8471
memcpy(&t, &pd->src->v6, sizeof(t));
8472
8473
job = malloc(sizeof(*job), M_PFTEMP, M_NOWAIT | M_ZERO);
8474
if (! job)
8475
return (PF_DROP);
8476
8477
SDT_PROBE2(pf, sctp, multihome_scan, ipv6, &t, op);
8478
8479
memcpy(&job->pd, pd, sizeof(*pd));
8480
memcpy(&job->src, &t, sizeof(t));
8481
job->pd.src = &job->src;
8482
memcpy(&job->dst, pd->dst, sizeof(job->dst));
8483
job->pd.dst = &job->dst;
8484
job->pd.m = pd->m;
8485
job->op = op;
8486
8487
MPASS(job->pd.pcksum);
8488
TAILQ_INSERT_TAIL(&pd->sctp_multihome_jobs, job, next);
8489
break;
8490
}
8491
#endif /* INET6 */
8492
case SCTP_ADD_IP_ADDRESS: {
8493
int ret;
8494
struct sctp_asconf_paramhdr ah;
8495
8496
if (!pf_pull_hdr(pd->m, start + off, &ah, sizeof(ah),
8497
NULL, pd->af))
8498
return (PF_DROP);
8499
8500
ret = pf_multihome_scan(start + off + sizeof(ah),
8501
ntohs(ah.ph.param_length) - sizeof(ah), pd,
8502
SCTP_ADD_IP_ADDRESS);
8503
if (ret != PF_PASS)
8504
return (ret);
8505
break;
8506
}
8507
case SCTP_DEL_IP_ADDRESS: {
8508
int ret;
8509
struct sctp_asconf_paramhdr ah;
8510
8511
if (!pf_pull_hdr(pd->m, start + off, &ah, sizeof(ah),
8512
NULL, pd->af))
8513
return (PF_DROP);
8514
ret = pf_multihome_scan(start + off + sizeof(ah),
8515
ntohs(ah.ph.param_length) - sizeof(ah), pd,
8516
SCTP_DEL_IP_ADDRESS);
8517
if (ret != PF_PASS)
8518
return (ret);
8519
break;
8520
}
8521
default:
8522
break;
8523
}
8524
8525
off += roundup(ntohs(h.param_length), 4);
8526
}
8527
8528
return (PF_PASS);
8529
}
8530
8531
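/*
 * Entry points for the SCTP parameter scan: skip the fixed INIT or
 * ASCONF chunk header and hand the remaining parameters to
 * pf_multihome_scan().
 */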
int
8532
pf_multihome_scan_init(int start, int len, struct pf_pdesc *pd)
8533
{
8534
start += sizeof(struct sctp_init_chunk);
8535
len -= sizeof(struct sctp_init_chunk);
8536
8537
return (pf_multihome_scan(start, len, pd, SCTP_ADD_IP_ADDRESS));
8538
}
8539
8540
int
8541
pf_multihome_scan_asconf(int start, int len, struct pf_pdesc *pd)
8542
{
8543
start += sizeof(struct sctp_asconf_chunk);
8544
len -= sizeof(struct sctp_asconf_chunk);
8545
8546
return (pf_multihome_scan(start, len, pd, SCTP_ADD_IP_ADDRESS));
8547
}
8548
8549
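/*
 * Look up the state for an ICMP query/reply using the virtual id and
 * type computed by pf_icmp_mapping() and verify that the message
 * flows in the expected direction.  Returns -1 when a state was found
 * and the caller should continue processing it, otherwise a verdict
 * such as PF_DROP.
 */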
int
8550
pf_icmp_state_lookup(struct pf_state_key_cmp *key, struct pf_pdesc *pd,
8551
struct pf_kstate **state, u_int16_t icmpid, u_int16_t type, int icmp_dir,
8552
int *iidx, int multi, int inner)
8553
{
8554
int action, direction = pd->dir;
8555
8556
key->af = pd->af;
8557
key->proto = pd->proto;
8558
if (icmp_dir == PF_IN) {
8559
*iidx = pd->sidx;
8560
key->port[pd->sidx] = icmpid;
8561
key->port[pd->didx] = type;
8562
} else {
8563
*iidx = pd->didx;
8564
key->port[pd->sidx] = type;
8565
key->port[pd->didx] = icmpid;
8566
}
8567
if (pf_state_key_addr_setup(pd, key, multi))
8568
return (PF_DROP);
8569
8570
action = pf_find_state(pd, key, state);
8571
if (action != PF_MATCH)
8572
return (action);
8573
8574
if ((*state)->state_flags & PFSTATE_SLOPPY)
8575
return (-1);
8576
8577
/* Is this ICMP message flowing in right direction? */
8578
if ((*state)->key[PF_SK_WIRE]->af != (*state)->key[PF_SK_STACK]->af)
8579
direction = (pd->af == (*state)->key[PF_SK_WIRE]->af) ?
8580
PF_IN : PF_OUT;
8581
else
8582
direction = (*state)->direction;
8583
if ((*state)->rule->type &&
8584
(((!inner && direction == pd->dir) ||
8585
(inner && direction != pd->dir)) ?
8586
PF_IN : PF_OUT) != icmp_dir) {
8587
if (V_pf_status.debug >= PF_DEBUG_MISC) {
8588
printf("pf: icmp type %d in wrong direction (%d): ",
8589
ntohs(type), icmp_dir);
8590
pf_print_state(*state);
8591
printf("\n");
8592
}
8593
PF_STATE_UNLOCK(*state);
8594
*state = NULL;
8595
return (PF_DROP);
8596
}
8597
return (-1);
8598
}
8599
8600
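/*
 * State matching for ICMP and ICMPv6.  Queries and replies are
 * matched directly against an ICMP state; error messages are matched
 * by pulling up the embedded header of the offending packet and
 * looking up the state of that inner connection, rewriting the quoted
 * header when the state carries a translation.
 */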
static int
8601
pf_test_state_icmp(struct pf_kstate **state, struct pf_pdesc *pd,
8602
u_short *reason)
8603
{
8604
struct pf_addr *saddr = pd->src, *daddr = pd->dst;
8605
u_int16_t *icmpsum, virtual_id, virtual_type;
8606
u_int8_t icmptype, icmpcode;
8607
int icmp_dir, iidx, ret;
8608
struct pf_state_key_cmp key;
8609
#ifdef INET
8610
u_int16_t icmpid;
8611
#endif /* INET */
8612
8613
MPASS(*state == NULL);
8614
8615
bzero(&key, sizeof(key));
8616
switch (pd->proto) {
8617
#ifdef INET
8618
case IPPROTO_ICMP:
8619
icmptype = pd->hdr.icmp.icmp_type;
8620
icmpcode = pd->hdr.icmp.icmp_code;
8621
icmpid = pd->hdr.icmp.icmp_id;
8622
icmpsum = &pd->hdr.icmp.icmp_cksum;
8623
break;
8624
#endif /* INET */
8625
#ifdef INET6
8626
case IPPROTO_ICMPV6:
8627
icmptype = pd->hdr.icmp6.icmp6_type;
8628
icmpcode = pd->hdr.icmp6.icmp6_code;
8629
#ifdef INET
8630
icmpid = pd->hdr.icmp6.icmp6_id;
8631
#endif /* INET */
8632
icmpsum = &pd->hdr.icmp6.icmp6_cksum;
8633
break;
8634
#endif /* INET6 */
8635
default:
8636
panic("unhandled proto %d", pd->proto);
8637
}
8638
8639
if (pf_icmp_mapping(pd, icmptype, &icmp_dir, &virtual_id,
8640
&virtual_type) == 0) {
8641
/*
8642
* ICMP query/reply message not related to a TCP/UDP/SCTP
8643
* packet. Search for an ICMP state.
8644
*/
8645
ret = pf_icmp_state_lookup(&key, pd, state, virtual_id,
8646
virtual_type, icmp_dir, &iidx, 0, 0);
8647
/* IPv6? try matching a multicast address */
8648
if (ret == PF_DROP && pd->af == AF_INET6 && icmp_dir == PF_OUT) {
8649
MPASS(*state == NULL);
8650
ret = pf_icmp_state_lookup(&key, pd, state,
8651
virtual_id, virtual_type,
8652
icmp_dir, &iidx, 1, 0);
8653
}
8654
if (ret >= 0) {
8655
MPASS(*state == NULL);
8656
return (ret);
8657
}
8658
8659
(*state)->expire = pf_get_uptime();
8660
(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
8661
8662
/* translate source/destination address, if necessary */
8663
if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
8664
struct pf_state_key *nk;
8665
int afto, sidx, didx;
8666
8667
if (PF_REVERSED_KEY(*state, pd->af))
8668
nk = (*state)->key[pd->sidx];
8669
else
8670
nk = (*state)->key[pd->didx];
8671
8672
afto = pd->af != nk->af;
8673
8674
if (afto && (*state)->direction == PF_IN) {
8675
sidx = pd->didx;
8676
didx = pd->sidx;
8677
iidx = !iidx;
8678
} else {
8679
sidx = pd->sidx;
8680
didx = pd->didx;
8681
}
8682
8683
switch (pd->af) {
8684
#ifdef INET
8685
case AF_INET:
8686
#ifdef INET6
8687
if (afto) {
8688
if (pf_translate_icmp_af(AF_INET6,
8689
&pd->hdr.icmp))
8690
return (PF_DROP);
8691
pd->proto = IPPROTO_ICMPV6;
8692
}
8693
#endif /* INET6 */
8694
if (!afto &&
8695
PF_ANEQ(pd->src, &nk->addr[sidx], AF_INET))
8696
pf_change_a(&saddr->v4.s_addr,
8697
pd->ip_sum,
8698
nk->addr[sidx].v4.s_addr,
8699
0);
8700
8701
if (!afto && PF_ANEQ(pd->dst,
8702
&nk->addr[didx], AF_INET))
8703
pf_change_a(&daddr->v4.s_addr,
8704
pd->ip_sum,
8705
nk->addr[didx].v4.s_addr, 0);
8706
8707
if (nk->port[iidx] !=
8708
pd->hdr.icmp.icmp_id) {
8709
pd->hdr.icmp.icmp_cksum =
8710
pf_cksum_fixup(
8711
pd->hdr.icmp.icmp_cksum, icmpid,
8712
nk->port[iidx], 0);
8713
pd->hdr.icmp.icmp_id =
8714
nk->port[iidx];
8715
}
8716
8717
m_copyback(pd->m, pd->off, ICMP_MINLEN,
8718
(caddr_t )&pd->hdr.icmp);
8719
break;
8720
#endif /* INET */
8721
#ifdef INET6
8722
case AF_INET6:
8723
#ifdef INET
8724
if (afto) {
8725
if (pf_translate_icmp_af(AF_INET,
8726
&pd->hdr.icmp6))
8727
return (PF_DROP);
8728
pd->proto = IPPROTO_ICMP;
8729
}
8730
#endif /* INET */
8731
if (!afto &&
8732
PF_ANEQ(pd->src, &nk->addr[sidx], AF_INET6))
8733
pf_change_a6(saddr,
8734
&pd->hdr.icmp6.icmp6_cksum,
8735
&nk->addr[sidx], 0);
8736
8737
if (!afto && PF_ANEQ(pd->dst,
8738
&nk->addr[didx], AF_INET6))
8739
pf_change_a6(daddr,
8740
&pd->hdr.icmp6.icmp6_cksum,
8741
&nk->addr[didx], 0);
8742
8743
if (nk->port[iidx] != pd->hdr.icmp6.icmp6_id)
8744
pd->hdr.icmp6.icmp6_id =
8745
nk->port[iidx];
8746
8747
m_copyback(pd->m, pd->off, sizeof(struct icmp6_hdr),
8748
(caddr_t )&pd->hdr.icmp6);
8749
break;
8750
#endif /* INET6 */
8751
}
8752
if (afto) {
8753
pf_addrcpy(&pd->nsaddr, &nk->addr[sidx],
8754
nk->af);
8755
pf_addrcpy(&pd->ndaddr, &nk->addr[didx],
8756
nk->af);
8757
pd->naf = nk->af;
8758
return (PF_AFRT);
8759
}
8760
}
8761
return (PF_PASS);
8762
8763
} else {
8764
/*
8765
* ICMP error message in response to a TCP/UDP packet.
8766
* Extract the inner TCP/UDP header and search for that state.
8767
*/
8768
8769
struct pf_pdesc pd2;
8770
bzero(&pd2, sizeof pd2);
8771
#ifdef INET
8772
struct ip h2;
8773
#endif /* INET */
8774
#ifdef INET6
8775
struct ip6_hdr h2_6;
8776
#endif /* INET6 */
8777
int ipoff2 = 0;
8778
8779
pd2.af = pd->af;
8780
pd2.dir = pd->dir;
8781
/* Payload packet is from the opposite direction. */
8782
pd2.sidx = (pd->dir == PF_IN) ? 1 : 0;
8783
pd2.didx = (pd->dir == PF_IN) ? 0 : 1;
8784
pd2.m = pd->m;
8785
pd2.pf_mtag = pd->pf_mtag;
8786
pd2.kif = pd->kif;
8787
switch (pd->af) {
8788
#ifdef INET
8789
case AF_INET:
8790
/* offset of h2 in mbuf chain */
8791
ipoff2 = pd->off + ICMP_MINLEN;
8792
8793
if (!pf_pull_hdr(pd->m, ipoff2, &h2, sizeof(h2),
8794
reason, pd2.af)) {
8795
DPFPRINTF(PF_DEBUG_MISC,
8796
"pf: ICMP error message too short "
8797
"(ip)");
8798
return (PF_DROP);
8799
}
8800
/*
8801
* ICMP error messages don't refer to non-first
8802
* fragments
8803
*/
8804
if (h2.ip_off & htons(IP_OFFMASK)) {
8805
REASON_SET(reason, PFRES_FRAG);
8806
return (PF_DROP);
8807
}
8808
8809
/* offset of protocol header that follows h2 */
8810
pd2.off = ipoff2;
8811
if (pf_walk_header(&pd2, &h2, reason) != PF_PASS)
8812
return (PF_DROP);
8813
8814
pd2.tot_len = ntohs(h2.ip_len);
8815
pd2.ttl = h2.ip_ttl;
8816
pd2.src = (struct pf_addr *)&h2.ip_src;
8817
pd2.dst = (struct pf_addr *)&h2.ip_dst;
8818
pd2.ip_sum = &h2.ip_sum;
8819
break;
8820
#endif /* INET */
8821
#ifdef INET6
8822
case AF_INET6:
8823
ipoff2 = pd->off + sizeof(struct icmp6_hdr);
8824
8825
if (!pf_pull_hdr(pd->m, ipoff2, &h2_6, sizeof(h2_6),
8826
reason, pd2.af)) {
8827
DPFPRINTF(PF_DEBUG_MISC,
8828
"pf: ICMP error message too short "
8829
"(ip6)");
8830
return (PF_DROP);
8831
}
8832
pd2.off = ipoff2;
8833
if (pf_walk_header6(&pd2, &h2_6, reason) != PF_PASS)
8834
return (PF_DROP);
8835
8836
pd2.tot_len = ntohs(h2_6.ip6_plen) +
8837
sizeof(struct ip6_hdr);
8838
pd2.ttl = h2_6.ip6_hlim;
8839
pd2.src = (struct pf_addr *)&h2_6.ip6_src;
8840
pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
8841
pd2.ip_sum = NULL;
8842
break;
8843
#endif /* INET6 */
8844
default:
8845
unhandled_af(pd->af);
8846
}
8847
8848
if (PF_ANEQ(pd->dst, pd2.src, pd->af)) {
8849
if (V_pf_status.debug >= PF_DEBUG_MISC) {
8850
printf("pf: BAD ICMP %d:%d outer dst: ",
8851
icmptype, icmpcode);
8852
pf_print_host(pd->src, 0, pd->af);
8853
printf(" -> ");
8854
pf_print_host(pd->dst, 0, pd->af);
8855
printf(" inner src: ");
8856
pf_print_host(pd2.src, 0, pd2.af);
8857
printf(" -> ");
8858
pf_print_host(pd2.dst, 0, pd2.af);
8859
printf("\n");
8860
}
8861
REASON_SET(reason, PFRES_BADSTATE);
8862
return (PF_DROP);
8863
}
8864
8865
switch (pd2.proto) {
8866
case IPPROTO_TCP: {
8867
struct tcphdr *th = &pd2.hdr.tcp;
8868
u_int32_t seq;
8869
struct pf_state_peer *src, *dst;
8870
u_int8_t dws;
8871
int copyback = 0;
8872
int action;
8873
8874
/*
8875
* Only the first 8 bytes of the TCP header can be
8876
* expected. Don't access any TCP header fields after
8877
* th_seq, so an ackskew test is not possible.
8878
*/
8879
if (!pf_pull_hdr(pd->m, pd2.off, th, 8, reason,
8880
pd2.af)) {
8881
DPFPRINTF(PF_DEBUG_MISC,
8882
"pf: ICMP error message too short "
8883
"(tcp)");
8884
return (PF_DROP);
8885
}
8886
pd2.pcksum = &pd2.hdr.tcp.th_sum;
8887
8888
key.af = pd2.af;
8889
key.proto = IPPROTO_TCP;
8890
pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
8891
pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
8892
key.port[pd2.sidx] = th->th_sport;
8893
key.port[pd2.didx] = th->th_dport;
8894
8895
action = pf_find_state(&pd2, &key, state);
8896
if (action != PF_MATCH)
8897
return (action);
8898
8899
if (pd->dir == (*state)->direction) {
8900
if (PF_REVERSED_KEY(*state, pd->af)) {
8901
src = &(*state)->src;
8902
dst = &(*state)->dst;
8903
} else {
8904
src = &(*state)->dst;
8905
dst = &(*state)->src;
8906
}
8907
} else {
8908
if (PF_REVERSED_KEY(*state, pd->af)) {
8909
src = &(*state)->dst;
8910
dst = &(*state)->src;
8911
} else {
8912
src = &(*state)->src;
8913
dst = &(*state)->dst;
8914
}
8915
}
8916
8917
if (src->wscale && dst->wscale)
8918
dws = dst->wscale & PF_WSCALE_MASK;
8919
else
8920
dws = 0;
8921
8922
/* Demodulate sequence number */
8923
seq = ntohl(th->th_seq) - src->seqdiff;
8924
if (src->seqdiff) {
8925
pf_change_a(&th->th_seq, icmpsum,
8926
htonl(seq), 0);
8927
copyback = 1;
8928
}
8929
8930
if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
8931
(!SEQ_GEQ(src->seqhi, seq) ||
8932
!SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
8933
if (V_pf_status.debug >= PF_DEBUG_MISC) {
8934
printf("pf: BAD ICMP %d:%d ",
8935
icmptype, icmpcode);
8936
pf_print_host(pd->src, 0, pd->af);
8937
printf(" -> ");
8938
pf_print_host(pd->dst, 0, pd->af);
8939
printf(" state: ");
8940
pf_print_state(*state);
8941
printf(" seq=%u\n", seq);
8942
}
8943
REASON_SET(reason, PFRES_BADSTATE);
8944
return (PF_DROP);
8945
} else {
8946
if (V_pf_status.debug >= PF_DEBUG_MISC) {
8947
printf("pf: OK ICMP %d:%d ",
8948
icmptype, icmpcode);
8949
pf_print_host(pd->src, 0, pd->af);
8950
printf(" -> ");
8951
pf_print_host(pd->dst, 0, pd->af);
8952
printf(" state: ");
8953
pf_print_state(*state);
8954
printf(" seq=%u\n", seq);
8955
}
8956
}
8957
8958
/* translate source/destination address, if necessary */
8959
if ((*state)->key[PF_SK_WIRE] !=
8960
(*state)->key[PF_SK_STACK]) {
8961
8962
struct pf_state_key *nk;
8963
8964
if (PF_REVERSED_KEY(*state, pd->af))
8965
nk = (*state)->key[pd->sidx];
8966
else
8967
nk = (*state)->key[pd->didx];
8968
8969
#if defined(INET) && defined(INET6)
8970
int afto, sidx, didx;
8971
8972
afto = pd->af != nk->af;
8973
8974
if (afto && (*state)->direction == PF_IN) {
8975
sidx = pd2.didx;
8976
didx = pd2.sidx;
8977
} else {
8978
sidx = pd2.sidx;
8979
didx = pd2.didx;
8980
}
8981
8982
if (afto) {
8983
if (pf_translate_icmp_af(nk->af,
8984
&pd->hdr.icmp))
8985
return (PF_DROP);
8986
m_copyback(pd->m, pd->off,
8987
sizeof(struct icmp6_hdr),
8988
(c_caddr_t)&pd->hdr.icmp6);
8989
if (pf_change_icmp_af(pd->m, ipoff2, pd,
8990
&pd2, &nk->addr[sidx],
8991
&nk->addr[didx], pd->af,
8992
nk->af))
8993
return (PF_DROP);
8994
pf_addrcpy(&pd->nsaddr,
8995
&nk->addr[pd2.sidx], nk->af);
8996
pf_addrcpy(&pd->ndaddr,
8997
&nk->addr[pd2.didx], nk->af);
8998
if (nk->af == AF_INET) {
8999
pd->proto = IPPROTO_ICMP;
9000
} else {
9001
pd->proto = IPPROTO_ICMPV6;
9002
/*
9003
* IPv4 becomes IPv6 so we must
9004
* copy IPv4 src addr to least
9005
* 32bits in IPv6 address to
9006
* keep traceroute/icmp
9007
* working.
9008
*/
9009
pd->nsaddr.addr32[3] =
9010
pd->src->addr32[0];
9011
}
9012
pd->naf = pd2.naf = nk->af;
9013
pf_change_ap(&pd2, pd2.src, &th->th_sport,
9014
&nk->addr[pd2.sidx], nk->port[sidx]);
9015
pf_change_ap(&pd2, pd2.dst, &th->th_dport,
9016
&nk->addr[pd2.didx], nk->port[didx]);
9017
m_copyback(pd2.m, pd2.off, 8, (c_caddr_t)th);
9018
return (PF_AFRT);
9019
}
9020
#endif /* INET && INET6 */
9021
9022
if (PF_ANEQ(pd2.src,
9023
&nk->addr[pd2.sidx], pd2.af) ||
9024
nk->port[pd2.sidx] != th->th_sport)
9025
pf_change_icmp(pd2.src, &th->th_sport,
9026
daddr, &nk->addr[pd2.sidx],
9027
nk->port[pd2.sidx], NULL,
9028
pd2.ip_sum, icmpsum,
9029
pd->ip_sum, 0, pd2.af);
9030
9031
if (PF_ANEQ(pd2.dst,
9032
&nk->addr[pd2.didx], pd2.af) ||
9033
nk->port[pd2.didx] != th->th_dport)
9034
pf_change_icmp(pd2.dst, &th->th_dport,
9035
saddr, &nk->addr[pd2.didx],
9036
nk->port[pd2.didx], NULL,
9037
pd2.ip_sum, icmpsum,
9038
pd->ip_sum, 0, pd2.af);
9039
copyback = 1;
9040
}
9041
9042
if (copyback) {
9043
switch (pd2.af) {
9044
#ifdef INET
9045
case AF_INET:
9046
m_copyback(pd->m, pd->off, ICMP_MINLEN,
9047
(caddr_t )&pd->hdr.icmp);
9048
m_copyback(pd->m, ipoff2, sizeof(h2),
9049
(caddr_t )&h2);
9050
break;
9051
#endif /* INET */
9052
#ifdef INET6
9053
case AF_INET6:
9054
m_copyback(pd->m, pd->off,
9055
sizeof(struct icmp6_hdr),
9056
(caddr_t )&pd->hdr.icmp6);
9057
m_copyback(pd->m, ipoff2, sizeof(h2_6),
9058
(caddr_t )&h2_6);
9059
break;
9060
#endif /* INET6 */
9061
default:
9062
unhandled_af(pd->af);
9063
}
9064
m_copyback(pd->m, pd2.off, 8, (caddr_t)th);
9065
}
9066
9067
return (PF_PASS);
9068
break;
9069
}
9070
case IPPROTO_UDP: {
9071
struct udphdr *uh = &pd2.hdr.udp;
9072
int action;
9073
9074
if (!pf_pull_hdr(pd->m, pd2.off, uh, sizeof(*uh),
9075
reason, pd2.af)) {
9076
DPFPRINTF(PF_DEBUG_MISC,
9077
"pf: ICMP error message too short "
9078
"(udp)");
9079
return (PF_DROP);
9080
}
9081
pd2.pcksum = &pd2.hdr.udp.uh_sum;
9082
9083
key.af = pd2.af;
9084
key.proto = IPPROTO_UDP;
9085
pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
9086
pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
9087
key.port[pd2.sidx] = uh->uh_sport;
9088
key.port[pd2.didx] = uh->uh_dport;
9089
9090
action = pf_find_state(&pd2, &key, state);
9091
if (action != PF_MATCH)
9092
return (action);
9093
9094
/* translate source/destination address, if necessary */
9095
if ((*state)->key[PF_SK_WIRE] !=
9096
(*state)->key[PF_SK_STACK]) {
9097
struct pf_state_key *nk;
9098
9099
if (PF_REVERSED_KEY(*state, pd->af))
9100
nk = (*state)->key[pd->sidx];
9101
else
9102
nk = (*state)->key[pd->didx];
9103
9104
#if defined(INET) && defined(INET6)
9105
int afto, sidx, didx;
9106
9107
afto = pd->af != nk->af;
9108
9109
if (afto && (*state)->direction == PF_IN) {
9110
sidx = pd2.didx;
9111
didx = pd2.sidx;
9112
} else {
9113
sidx = pd2.sidx;
9114
didx = pd2.didx;
9115
}
9116
9117
if (afto) {
9118
if (pf_translate_icmp_af(nk->af,
9119
&pd->hdr.icmp))
9120
return (PF_DROP);
9121
m_copyback(pd->m, pd->off,
9122
sizeof(struct icmp6_hdr),
9123
(c_caddr_t)&pd->hdr.icmp6);
9124
if (pf_change_icmp_af(pd->m, ipoff2, pd,
9125
&pd2, &nk->addr[sidx],
9126
&nk->addr[didx], pd->af,
9127
nk->af))
9128
return (PF_DROP);
9129
pf_addrcpy(&pd->nsaddr,
9130
&nk->addr[pd2.sidx], nk->af);
9131
pf_addrcpy(&pd->ndaddr,
9132
&nk->addr[pd2.didx], nk->af);
9133
if (nk->af == AF_INET) {
9134
pd->proto = IPPROTO_ICMP;
9135
} else {
9136
pd->proto = IPPROTO_ICMPV6;
9137
/*
9138
* IPv4 becomes IPv6 so we must
9139
* copy IPv4 src addr to least
9140
* 32bits in IPv6 address to
9141
* keep traceroute/icmp
9142
* working.
9143
*/
9144
pd->nsaddr.addr32[3] =
9145
pd->src->addr32[0];
9146
}
9147
pd->naf = pd2.naf = nk->af;
9148
pf_change_ap(&pd2, pd2.src, &uh->uh_sport,
9149
&nk->addr[pd2.sidx], nk->port[sidx]);
9150
pf_change_ap(&pd2, pd2.dst, &uh->uh_dport,
9151
&nk->addr[pd2.didx], nk->port[didx]);
9152
m_copyback(pd2.m, pd2.off, sizeof(*uh),
9153
(c_caddr_t)uh);
9154
return (PF_AFRT);
9155
}
9156
#endif /* INET && INET6 */
9157
9158
if (PF_ANEQ(pd2.src,
9159
&nk->addr[pd2.sidx], pd2.af) ||
9160
nk->port[pd2.sidx] != uh->uh_sport)
9161
pf_change_icmp(pd2.src, &uh->uh_sport,
9162
daddr, &nk->addr[pd2.sidx],
9163
nk->port[pd2.sidx], &uh->uh_sum,
9164
pd2.ip_sum, icmpsum,
9165
pd->ip_sum, 1, pd2.af);
9166
9167
if (PF_ANEQ(pd2.dst,
9168
&nk->addr[pd2.didx], pd2.af) ||
9169
nk->port[pd2.didx] != uh->uh_dport)
9170
pf_change_icmp(pd2.dst, &uh->uh_dport,
9171
saddr, &nk->addr[pd2.didx],
9172
nk->port[pd2.didx], &uh->uh_sum,
9173
pd2.ip_sum, icmpsum,
9174
pd->ip_sum, 1, pd2.af);
9175
9176
switch (pd2.af) {
9177
#ifdef INET
9178
case AF_INET:
9179
m_copyback(pd->m, pd->off, ICMP_MINLEN,
9180
(caddr_t )&pd->hdr.icmp);
9181
m_copyback(pd->m, ipoff2, sizeof(h2), (caddr_t)&h2);
9182
break;
9183
#endif /* INET */
9184
#ifdef INET6
9185
case AF_INET6:
9186
m_copyback(pd->m, pd->off,
9187
sizeof(struct icmp6_hdr),
9188
(caddr_t )&pd->hdr.icmp6);
9189
m_copyback(pd->m, ipoff2, sizeof(h2_6),
9190
(caddr_t )&h2_6);
9191
break;
9192
#endif /* INET6 */
9193
}
9194
m_copyback(pd->m, pd2.off, sizeof(*uh), (caddr_t)uh);
9195
}
9196
return (PF_PASS);
9197
break;
9198
}
9199
#ifdef INET
9200
case IPPROTO_SCTP: {
9201
struct sctphdr *sh = &pd2.hdr.sctp;
9202
struct pf_state_peer *src;
9203
int copyback = 0;
9204
int action;
9205
9206
if (! pf_pull_hdr(pd->m, pd2.off, sh, sizeof(*sh), reason,
9207
pd2.af)) {
9208
DPFPRINTF(PF_DEBUG_MISC,
9209
"pf: ICMP error message too short "
9210
"(sctp)");
9211
return (PF_DROP);
9212
}
9213
pd2.pcksum = &pd2.sctp_dummy_sum;
9214
9215
key.af = pd2.af;
9216
key.proto = IPPROTO_SCTP;
9217
pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
9218
pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
9219
key.port[pd2.sidx] = sh->src_port;
9220
key.port[pd2.didx] = sh->dest_port;
9221
9222
action = pf_find_state(&pd2, &key, state);
9223
if (action != PF_MATCH)
9224
return (action);
9225
9226
if (pd->dir == (*state)->direction) {
9227
if (PF_REVERSED_KEY(*state, pd->af))
9228
src = &(*state)->src;
9229
else
9230
src = &(*state)->dst;
9231
} else {
9232
if (PF_REVERSED_KEY(*state, pd->af))
9233
src = &(*state)->dst;
9234
else
9235
src = &(*state)->src;
9236
}
9237
9238
if (src->scrub->pfss_v_tag != sh->v_tag) {
9239
DPFPRINTF(PF_DEBUG_MISC,
9240
"pf: ICMP error message has incorrect "
9241
"SCTP v_tag");
9242
return (PF_DROP);
9243
}
9244
9245
/* translate source/destination address, if necessary */
9246
if ((*state)->key[PF_SK_WIRE] !=
9247
(*state)->key[PF_SK_STACK]) {
9248
9249
struct pf_state_key *nk;
9250
9251
if (PF_REVERSED_KEY(*state, pd->af))
9252
nk = (*state)->key[pd->sidx];
9253
else
9254
nk = (*state)->key[pd->didx];
9255
9256
#if defined(INET) && defined(INET6)
9257
int afto, sidx, didx;
9258
9259
afto = pd->af != nk->af;
9260
9261
if (afto && (*state)->direction == PF_IN) {
9262
sidx = pd2.didx;
9263
didx = pd2.sidx;
9264
} else {
9265
sidx = pd2.sidx;
9266
didx = pd2.didx;
9267
}
9268
9269
if (afto) {
9270
if (pf_translate_icmp_af(nk->af,
9271
&pd->hdr.icmp))
9272
return (PF_DROP);
9273
m_copyback(pd->m, pd->off,
9274
sizeof(struct icmp6_hdr),
9275
(c_caddr_t)&pd->hdr.icmp6);
9276
if (pf_change_icmp_af(pd->m, ipoff2, pd,
9277
&pd2, &nk->addr[sidx],
9278
&nk->addr[didx], pd->af,
9279
nk->af))
9280
return (PF_DROP);
9281
sh->src_port = nk->port[sidx];
9282
sh->dest_port = nk->port[didx];
9283
m_copyback(pd2.m, pd2.off, sizeof(*sh), (c_caddr_t)sh);
9284
pf_addrcpy(&pd->nsaddr,
9285
&nk->addr[pd2.sidx], nk->af);
9286
pf_addrcpy(&pd->ndaddr,
9287
&nk->addr[pd2.didx], nk->af);
9288
if (nk->af == AF_INET) {
9289
pd->proto = IPPROTO_ICMP;
9290
} else {
9291
pd->proto = IPPROTO_ICMPV6;
9292
/*
9293
* IPv4 becomes IPv6 so we must
9294
* copy IPv4 src addr to least
9295
* 32bits in IPv6 address to
9296
* keep traceroute/icmp
9297
* working.
9298
*/
9299
pd->nsaddr.addr32[3] =
9300
pd->src->addr32[0];
9301
}
9302
pd->naf = nk->af;
9303
return (PF_AFRT);
9304
}
9305
#endif /* INET && INET6 */
9306
9307
if (PF_ANEQ(pd2.src,
9308
&nk->addr[pd2.sidx], pd2.af) ||
9309
nk->port[pd2.sidx] != sh->src_port)
9310
pf_change_icmp(pd2.src, &sh->src_port,
9311
daddr, &nk->addr[pd2.sidx],
9312
nk->port[pd2.sidx], NULL,
9313
pd2.ip_sum, icmpsum,
9314
pd->ip_sum, 0, pd2.af);
9315
9316
if (PF_ANEQ(pd2.dst,
9317
&nk->addr[pd2.didx], pd2.af) ||
9318
nk->port[pd2.didx] != sh->dest_port)
9319
pf_change_icmp(pd2.dst, &sh->dest_port,
9320
saddr, &nk->addr[pd2.didx],
9321
nk->port[pd2.didx], NULL,
9322
pd2.ip_sum, icmpsum,
9323
pd->ip_sum, 0, pd2.af);
9324
copyback = 1;
9325
}
9326
9327
if (copyback) {
9328
switch (pd2.af) {
9329
#ifdef INET
9330
case AF_INET:
9331
m_copyback(pd->m, pd->off, ICMP_MINLEN,
9332
(caddr_t )&pd->hdr.icmp);
9333
m_copyback(pd->m, ipoff2, sizeof(h2),
9334
(caddr_t )&h2);
9335
break;
9336
#endif /* INET */
9337
#ifdef INET6
9338
case AF_INET6:
9339
m_copyback(pd->m, pd->off,
9340
sizeof(struct icmp6_hdr),
9341
(caddr_t )&pd->hdr.icmp6);
9342
m_copyback(pd->m, ipoff2, sizeof(h2_6),
9343
(caddr_t )&h2_6);
9344
break;
9345
#endif /* INET6 */
9346
}
9347
m_copyback(pd->m, pd2.off, sizeof(*sh), (caddr_t)sh);
9348
}
9349
9350
return (PF_PASS);
9351
break;
9352
}
9353
case IPPROTO_ICMP: {
9354
struct icmp *iih = &pd2.hdr.icmp;
9355
9356
if (pd2.af != AF_INET) {
9357
REASON_SET(reason, PFRES_NORM);
9358
return (PF_DROP);
9359
}
9360
9361
if (!pf_pull_hdr(pd->m, pd2.off, iih, ICMP_MINLEN,
9362
reason, pd2.af)) {
9363
DPFPRINTF(PF_DEBUG_MISC,
9364
"pf: ICMP error message too short i"
9365
"(icmp)");
9366
return (PF_DROP);
9367
}
9368
pd2.pcksum = &pd2.hdr.icmp.icmp_cksum;
9369
9370
icmpid = iih->icmp_id;
9371
pf_icmp_mapping(&pd2, iih->icmp_type,
9372
&icmp_dir, &virtual_id, &virtual_type);
9373
9374
ret = pf_icmp_state_lookup(&key, &pd2, state,
9375
virtual_id, virtual_type, icmp_dir, &iidx, 0, 1);
9376
if (ret >= 0) {
9377
MPASS(*state == NULL);
9378
return (ret);
9379
}
9380
9381
/* translate source/destination address, if necessary */
9382
if ((*state)->key[PF_SK_WIRE] !=
9383
(*state)->key[PF_SK_STACK]) {
9384
struct pf_state_key *nk;
9385
9386
if (PF_REVERSED_KEY(*state, pd->af))
9387
nk = (*state)->key[pd->sidx];
9388
else
9389
nk = (*state)->key[pd->didx];
9390
9391
#if defined(INET) && defined(INET6)
9392
int afto, sidx, didx;
9393
9394
afto = pd->af != nk->af;
9395
9396
if (afto && (*state)->direction == PF_IN) {
9397
sidx = pd2.didx;
9398
didx = pd2.sidx;
9399
iidx = !iidx;
9400
} else {
9401
sidx = pd2.sidx;
9402
didx = pd2.didx;
9403
}
9404
9405
if (afto) {
9406
if (nk->af != AF_INET6)
9407
return (PF_DROP);
9408
if (pf_translate_icmp_af(nk->af,
9409
&pd->hdr.icmp))
9410
return (PF_DROP);
9411
m_copyback(pd->m, pd->off,
9412
sizeof(struct icmp6_hdr),
9413
(c_caddr_t)&pd->hdr.icmp6);
9414
if (pf_change_icmp_af(pd->m, ipoff2, pd,
9415
&pd2, &nk->addr[sidx],
9416
&nk->addr[didx], pd->af,
9417
nk->af))
9418
return (PF_DROP);
9419
pd->proto = IPPROTO_ICMPV6;
9420
if (pf_translate_icmp_af(nk->af, iih))
9421
return (PF_DROP);
9422
if (virtual_type == htons(ICMP_ECHO) &&
9423
nk->port[iidx] != iih->icmp_id)
9424
iih->icmp_id = nk->port[iidx];
9425
m_copyback(pd2.m, pd2.off, ICMP_MINLEN,
9426
(c_caddr_t)iih);
9427
pf_addrcpy(&pd->nsaddr,
9428
&nk->addr[pd2.sidx], nk->af);
9429
pf_addrcpy(&pd->ndaddr,
9430
&nk->addr[pd2.didx], nk->af);
9431
/*
9432
* IPv4 becomes IPv6 so we must copy
9433
* IPv4 src addr to least 32bits in
9434
* IPv6 address to keep traceroute
9435
* working.
9436
*/
9437
pd->nsaddr.addr32[3] =
9438
pd->src->addr32[0];
9439
pd->naf = nk->af;
9440
return (PF_AFRT);
9441
}
9442
#endif /* INET && INET6 */
9443
9444
if (PF_ANEQ(pd2.src,
9445
&nk->addr[pd2.sidx], pd2.af) ||
9446
(virtual_type == htons(ICMP_ECHO) &&
9447
nk->port[iidx] != iih->icmp_id))
9448
pf_change_icmp(pd2.src,
9449
(virtual_type == htons(ICMP_ECHO)) ?
9450
&iih->icmp_id : NULL,
9451
daddr, &nk->addr[pd2.sidx],
9452
(virtual_type == htons(ICMP_ECHO)) ?
9453
nk->port[iidx] : 0, NULL,
9454
pd2.ip_sum, icmpsum,
9455
pd->ip_sum, 0, AF_INET);
9456
9457
if (PF_ANEQ(pd2.dst,
9458
&nk->addr[pd2.didx], pd2.af))
9459
pf_change_icmp(pd2.dst, NULL, NULL,
9460
&nk->addr[pd2.didx], 0, NULL,
9461
pd2.ip_sum, icmpsum, pd->ip_sum, 0,
9462
AF_INET);
9463
9464
m_copyback(pd->m, pd->off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp);
9465
m_copyback(pd->m, ipoff2, sizeof(h2), (caddr_t)&h2);
9466
m_copyback(pd->m, pd2.off, ICMP_MINLEN, (caddr_t)iih);
9467
}
9468
return (PF_PASS);
9469
break;
9470
}
9471
#endif /* INET */
9472
#ifdef INET6
9473
case IPPROTO_ICMPV6: {
9474
struct icmp6_hdr *iih = &pd2.hdr.icmp6;
9475
9476
if (pd2.af != AF_INET6) {
9477
REASON_SET(reason, PFRES_NORM);
9478
return (PF_DROP);
9479
}
9480
9481
if (!pf_pull_hdr(pd->m, pd2.off, iih,
9482
sizeof(struct icmp6_hdr), reason, pd2.af)) {
9483
DPFPRINTF(PF_DEBUG_MISC,
9484
"pf: ICMP error message too short "
9485
"(icmp6)");
9486
return (PF_DROP);
9487
}
9488
pd2.pcksum = &pd2.hdr.icmp6.icmp6_cksum;
9489
9490
pf_icmp_mapping(&pd2, iih->icmp6_type,
9491
&icmp_dir, &virtual_id, &virtual_type);
9492
9493
ret = pf_icmp_state_lookup(&key, &pd2, state,
9494
virtual_id, virtual_type, icmp_dir, &iidx, 0, 1);
9495
/* IPv6? try matching a multicast address */
9496
if (ret == PF_DROP && pd2.af == AF_INET6 &&
9497
icmp_dir == PF_OUT) {
9498
MPASS(*state == NULL);
9499
ret = pf_icmp_state_lookup(&key, &pd2,
9500
state, virtual_id, virtual_type,
9501
icmp_dir, &iidx, 1, 1);
9502
}
9503
if (ret >= 0) {
9504
MPASS(*state == NULL);
9505
return (ret);
9506
}
9507
9508
/* translate source/destination address, if necessary */
9509
if ((*state)->key[PF_SK_WIRE] !=
9510
(*state)->key[PF_SK_STACK]) {
9511
struct pf_state_key *nk;
9512
9513
if (PF_REVERSED_KEY(*state, pd->af))
9514
nk = (*state)->key[pd->sidx];
9515
else
9516
nk = (*state)->key[pd->didx];
9517
9518
#if defined(INET) && defined(INET6)
9519
int afto, sidx, didx;
9520
9521
afto = pd->af != nk->af;
9522
9523
if (afto && (*state)->direction == PF_IN) {
9524
sidx = pd2.didx;
9525
didx = pd2.sidx;
9526
iidx = !iidx;
9527
} else {
9528
sidx = pd2.sidx;
9529
didx = pd2.didx;
9530
}
9531
9532
if (afto) {
9533
if (nk->af != AF_INET)
9534
return (PF_DROP);
9535
if (pf_translate_icmp_af(nk->af,
9536
&pd->hdr.icmp))
9537
return (PF_DROP);
9538
m_copyback(pd->m, pd->off,
9539
sizeof(struct icmp6_hdr),
9540
(c_caddr_t)&pd->hdr.icmp6);
9541
if (pf_change_icmp_af(pd->m, ipoff2, pd,
9542
&pd2, &nk->addr[sidx],
9543
&nk->addr[didx], pd->af,
9544
nk->af))
9545
return (PF_DROP);
9546
pd->proto = IPPROTO_ICMP;
9547
if (pf_translate_icmp_af(nk->af, iih))
9548
return (PF_DROP);
9549
if (virtual_type ==
9550
htons(ICMP6_ECHO_REQUEST) &&
9551
nk->port[iidx] != iih->icmp6_id)
9552
iih->icmp6_id = nk->port[iidx];
9553
m_copyback(pd2.m, pd2.off,
9554
sizeof(struct icmp6_hdr), (c_caddr_t)iih);
9555
pf_addrcpy(&pd->nsaddr,
9556
&nk->addr[pd2.sidx], nk->af);
9557
pf_addrcpy(&pd->ndaddr,
9558
&nk->addr[pd2.didx], nk->af);
9559
pd->naf = nk->af;
9560
return (PF_AFRT);
9561
}
9562
#endif /* INET && INET6 */
9563
9564
if (PF_ANEQ(pd2.src,
9565
&nk->addr[pd2.sidx], pd2.af) ||
9566
((virtual_type == htons(ICMP6_ECHO_REQUEST)) &&
9567
nk->port[pd2.sidx] != iih->icmp6_id))
9568
pf_change_icmp(pd2.src,
9569
(virtual_type == htons(ICMP6_ECHO_REQUEST))
9570
? &iih->icmp6_id : NULL,
9571
daddr, &nk->addr[pd2.sidx],
9572
(virtual_type == htons(ICMP6_ECHO_REQUEST))
9573
? nk->port[iidx] : 0, NULL,
9574
pd2.ip_sum, icmpsum,
9575
pd->ip_sum, 0, AF_INET6);
9576
9577
if (PF_ANEQ(pd2.dst,
9578
&nk->addr[pd2.didx], pd2.af))
9579
pf_change_icmp(pd2.dst, NULL, NULL,
9580
&nk->addr[pd2.didx], 0, NULL,
9581
pd2.ip_sum, icmpsum,
9582
pd->ip_sum, 0, AF_INET6);
9583
9584
m_copyback(pd->m, pd->off, sizeof(struct icmp6_hdr),
9585
(caddr_t)&pd->hdr.icmp6);
9586
m_copyback(pd->m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
9587
m_copyback(pd->m, pd2.off, sizeof(struct icmp6_hdr),
9588
(caddr_t)iih);
9589
}
9590
return (PF_PASS);
9591
break;
9592
}
9593
#endif /* INET6 */
9594
default: {
9595
int action;
9596
9597
/*
9598
* Placeholder value, so future calls to pf_change_ap()
9599
* don't try to update a NULL checksum pointer.
9600
*/
9601
pd->pcksum = &pd->sctp_dummy_sum;
9602
key.af = pd2.af;
9603
key.proto = pd2.proto;
9604
pf_addrcpy(&key.addr[pd2.sidx], pd2.src, key.af);
9605
pf_addrcpy(&key.addr[pd2.didx], pd2.dst, key.af);
9606
key.port[0] = key.port[1] = 0;
9607
9608
action = pf_find_state(&pd2, &key, state);
9609
if (action != PF_MATCH)
9610
return (action);
9611
9612
/* translate source/destination address, if necessary */
9613
if ((*state)->key[PF_SK_WIRE] !=
9614
(*state)->key[PF_SK_STACK]) {
9615
struct pf_state_key *nk =
9616
(*state)->key[pd->didx];
9617
9618
if (PF_ANEQ(pd2.src,
9619
&nk->addr[pd2.sidx], pd2.af))
9620
pf_change_icmp(pd2.src, NULL, daddr,
9621
&nk->addr[pd2.sidx], 0, NULL,
9622
pd2.ip_sum, icmpsum,
9623
pd->ip_sum, 0, pd2.af);
9624
9625
if (PF_ANEQ(pd2.dst,
9626
&nk->addr[pd2.didx], pd2.af))
9627
pf_change_icmp(pd2.dst, NULL, saddr,
9628
&nk->addr[pd2.didx], 0, NULL,
9629
pd2.ip_sum, icmpsum,
9630
pd->ip_sum, 0, pd2.af);
9631
9632
switch (pd2.af) {
9633
#ifdef INET
9634
case AF_INET:
9635
m_copyback(pd->m, pd->off, ICMP_MINLEN,
9636
(caddr_t)&pd->hdr.icmp);
9637
m_copyback(pd->m, ipoff2, sizeof(h2), (caddr_t)&h2);
9638
break;
9639
#endif /* INET */
9640
#ifdef INET6
9641
case AF_INET6:
9642
m_copyback(pd->m, pd->off,
9643
sizeof(struct icmp6_hdr),
9644
(caddr_t )&pd->hdr.icmp6);
9645
m_copyback(pd->m, ipoff2, sizeof(h2_6),
9646
(caddr_t )&h2_6);
9647
break;
9648
#endif /* INET6 */
9649
}
9650
}
9651
return (PF_PASS);
9652
break;
9653
}
9654
}
9655
}
9656
}
9657
9658
/*
9659
* off is measured from the start of the mbuf chain;
9660
* the IP header must be at the start of the first mbuf.
9661
*/
9662
void *
9663
pf_pull_hdr(const struct mbuf *m, int off, void *p, int len,
9664
u_short *reasonp, sa_family_t af)
9665
{
9666
int iplen = 0;
9667
switch (af) {
9668
#ifdef INET
9669
case AF_INET: {
9670
const struct ip *h = mtod(m, struct ip *);
9671
u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
9672
9673
if (fragoff) {
9674
REASON_SET(reasonp, PFRES_FRAG);
9675
return (NULL);
9676
}
9677
iplen = ntohs(h->ip_len);
9678
break;
9679
}
9680
#endif /* INET */
9681
#ifdef INET6
9682
case AF_INET6: {
9683
const struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
9684
9685
iplen = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
9686
break;
9687
}
9688
#endif /* INET6 */
9689
}
9690
if (m->m_pkthdr.len < off + len || iplen < off + len) {
9691
REASON_SET(reasonp, PFRES_SHORT);
9692
return (NULL);
9693
}
9694
m_copydata(m, off, len, p);
9695
return (p);
9696
}
9697
9698
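/*
 * Reverse-path check: is the address reachable in the given routing
 * table, optionally restricted to the interface behind 'kif'?
 * Addresses with embedded scope, ipsec (enc) interfaces and the
 * "any" kif always pass.
 */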
int
9699
pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kkif *kif,
9700
int rtableid)
9701
{
9702
struct ifnet *ifp;
9703
9704
/*
9705
* Skip check for addresses with embedded interface scope,
9706
* as they would always match anyway.
9707
*/
9708
if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
9709
return (1);
9710
9711
if (af != AF_INET && af != AF_INET6)
9712
return (0);
9713
9714
if (kif == V_pfi_all)
9715
return (1);
9716
9717
/* Skip checks for ipsec interfaces */
9718
if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
9719
return (1);
9720
9721
ifp = (kif != NULL) ? kif->pfik_ifp : NULL;
9722
9723
switch (af) {
9724
#ifdef INET6
9725
case AF_INET6:
9726
return (fib6_check_urpf(rtableid, &addr->v6, 0, NHR_NONE,
9727
ifp));
9728
#endif /* INET6 */
9729
#ifdef INET
9730
case AF_INET:
9731
return (fib4_check_urpf(rtableid, addr->v4, 0, NHR_NONE,
9732
ifp));
9733
#endif /* INET */
9734
}
9735
9736
return (0);
9737
}
9738
9739
#ifdef INET
9740
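/*
 * IPv4 output path for route-to, reply-to and dup-to rules as well as
 * for af-to translated packets: select the outgoing interface and
 * gateway, decrement the TTL on forwarded packets and, unless the
 * check is skipped, run the packet through pf_test() once more in the
 * outbound direction before sending it.
 */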
static int
9741
pf_route(struct pf_krule *r, struct ifnet *oifp,
9742
struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
9743
{
9744
struct mbuf *m0, *m1, *md;
9745
struct route_in6 ro;
9746
union sockaddr_union rt_gw;
9747
const union sockaddr_union *gw = (const union sockaddr_union *)&ro.ro_dst;
9748
union sockaddr_union *dst;
9749
struct ip *ip;
9750
struct ifnet *ifp = NULL;
9751
int error = 0;
9752
uint16_t ip_len, ip_off;
9753
uint16_t tmp;
9754
int r_dir;
9755
bool skip_test = false;
9756
int action = PF_PASS;
9757
9758
KASSERT(pd->m && r && oifp, ("%s: invalid parameters", __func__));
9759
9760
SDT_PROBE4(pf, ip, route_to, entry, pd->m, pd, s, oifp);
9761
9762
if (s) {
9763
r_dir = s->direction;
9764
} else {
9765
r_dir = r->direction;
9766
}
9767
9768
KASSERT(pd->dir == PF_IN || pd->dir == PF_OUT ||
9769
r_dir == PF_IN || r_dir == PF_OUT, ("%s: invalid direction",
9770
__func__));
9771
9772
if ((pd->pf_mtag == NULL &&
9773
((pd->pf_mtag = pf_get_mtag(pd->m)) == NULL)) ||
9774
pd->pf_mtag->routed++ > 3) {
9775
m0 = pd->m;
9776
pd->m = NULL;
9777
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
9778
action = PF_DROP;
9779
goto bad_locked;
9780
}
9781
9782
if (pd->act.rt_kif != NULL)
9783
ifp = pd->act.rt_kif->pfik_ifp;
9784
9785
if (pd->act.rt == PF_DUPTO) {
9786
if ((pd->pf_mtag->flags & PF_MTAG_FLAG_DUPLICATED)) {
9787
if (s != NULL) {
9788
PF_STATE_UNLOCK(s);
9789
}
9790
if (ifp == oifp) {
9791
/* When the 2nd interface is not skipped */
9792
return (action);
9793
} else {
9794
m0 = pd->m;
9795
pd->m = NULL;
9796
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
9797
action = PF_DROP;
9798
goto bad;
9799
}
9800
} else {
9801
pd->pf_mtag->flags |= PF_MTAG_FLAG_DUPLICATED;
9802
if (((m0 = m_dup(pd->m, M_NOWAIT)) == NULL)) {
9803
if (s)
9804
PF_STATE_UNLOCK(s);
9805
return (action);
9806
}
9807
}
9808
} else {
9809
if ((pd->act.rt == PF_REPLYTO) == (r_dir == pd->dir)) {
9810
if (pd->af == pd->naf) {
9811
pf_dummynet(pd, s, r, &pd->m);
9812
if (s)
9813
PF_STATE_UNLOCK(s);
9814
return (action);
9815
} else {
9816
if (r_dir == PF_IN) {
9817
skip_test = true;
9818
}
9819
}
9820
}
9821
9822
/*
9823
* If we're actually doing route-to and af-to and are in the
9824
* reply direction.
9825
*/
9826
if (pd->act.rt_kif && pd->act.rt_kif->pfik_ifp &&
9827
pd->af != pd->naf) {
9828
if (pd->act.rt == PF_ROUTETO && r->naf != AF_INET) {
9829
/* Un-set ifp so we do a plain route lookup. */
9830
ifp = NULL;
9831
}
9832
if (pd->act.rt == PF_REPLYTO && r->naf != AF_INET6) {
9833
/* Un-set ifp so we do a plain route lookup. */
9834
ifp = NULL;
9835
}
9836
}
9837
m0 = pd->m;
9838
}
9839
9840
ip = mtod(m0, struct ip *);
9841
9842
bzero(&ro, sizeof(ro));
9843
dst = (union sockaddr_union *)&ro.ro_dst;
9844
dst->sin.sin_family = AF_INET;
9845
dst->sin.sin_len = sizeof(struct sockaddr_in);
9846
dst->sin.sin_addr = ip->ip_dst;
9847
if (ifp) { /* Only needed in forward direction and route-to */
9848
bzero(&rt_gw, sizeof(rt_gw));
9849
ro.ro_flags |= RT_HAS_GW;
9850
gw = &rt_gw;
9851
switch (pd->act.rt_af) {
9852
#ifdef INET
9853
case AF_INET:
9854
rt_gw.sin.sin_family = AF_INET;
9855
rt_gw.sin.sin_len = sizeof(struct sockaddr_in);
9856
rt_gw.sin.sin_addr.s_addr = pd->act.rt_addr.v4.s_addr;
9857
break;
9858
#endif /* INET */
9859
#ifdef INET6
9860
case AF_INET6:
9861
rt_gw.sin6.sin6_family = AF_INET6;
9862
rt_gw.sin6.sin6_len = sizeof(struct sockaddr_in6);
9863
pf_addrcpy((struct pf_addr *)&rt_gw.sin6.sin6_addr,
9864
&pd->act.rt_addr, AF_INET6);
9865
break;
9866
#endif /* INET6 */
9867
default:
9868
/* Normal af-to without route-to */
9869
break;
9870
}
9871
}
9872
9873
if (pd->dir == PF_IN) {
9874
if (ip->ip_ttl <= IPTTLDEC) {
9875
if (r->rt != PF_DUPTO && pd->naf == pd->af)
9876
pf_send_icmp(m0, ICMP_TIMXCEED,
9877
ICMP_TIMXCEED_INTRANS, 0, pd->af, r,
9878
pd->act.rtableid);
9879
action = PF_DROP;
9880
goto bad_locked;
9881
}
9882
ip->ip_ttl -= IPTTLDEC;
9883
}
9884
9885
if (s != NULL) {
9886
if (ifp == NULL && (pd->af != pd->naf)) {
9887
/* We're in the AFTO case. Do a route lookup. */
9888
const struct nhop_object *nh;
9889
nh = fib4_lookup(M_GETFIB(m0), ip->ip_dst, 0, NHR_NONE, 0);
9890
if (nh) {
9891
ifp = nh->nh_ifp;
9892
9893
/* Use the gateway if needed. */
9894
if (nh->nh_flags & NHF_GATEWAY) {
9895
gw = (const union sockaddr_union *)&nh->gw_sa;
9896
ro.ro_flags |= RT_HAS_GW;
9897
} else {
9898
dst->sin.sin_addr = ip->ip_dst;
9899
}
9900
}
9901
}
9902
PF_STATE_UNLOCK(s);
9903
}
9904
9905
/* It must have been either set from rt_af or from fib4_lookup */
9906
KASSERT(gw->sin.sin_family != 0, ("%s: gw address family undetermined", __func__));
9907
9908
if (ifp == NULL) {
9909
m0 = pd->m;
9910
pd->m = NULL;
9911
action = PF_DROP;
9912
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
9913
goto bad;
9914
}
9915
9916
/*
9917
* Bind to the correct interface if we're if-bound. We don't know which
9918
* interface that will be until here, so we've inserted the state
9919
* on V_pfi_all. Fix that now.
9920
*/
9921
if (s != NULL && s->kif == V_pfi_all && r->rule_flag & PFRULE_IFBOUND) {
9922
/* Verify that we're here because of BOUND_IFACE */
9923
MPASS(r->rt == PF_REPLYTO || (pd->af != pd->naf && s->direction == PF_IN));
9924
s->kif = ifp->if_pf_kif;
9925
if (pd->act.rt == PF_REPLYTO) {
9926
s->orig_kif = oifp->if_pf_kif;
9927
}
9928
}
9929
9930
if (r->rt == PF_DUPTO || (pd->af != pd->naf && s->direction == PF_IN))
9931
skip_test = true;
9932
9933
if (pd->dir == PF_IN) {
9934
if (skip_test) {
9935
struct pfi_kkif *out_kif = (struct pfi_kkif *)ifp->if_pf_kif;
9936
MPASS(s != NULL);
9937
pf_counter_u64_critical_enter();
9938
pf_counter_u64_add_protected(
9939
&out_kif->pfik_bytes[pd->naf == AF_INET6][1]
9940
[action != PF_PASS && action != PF_AFRT], pd->tot_len);
9941
pf_counter_u64_add_protected(
9942
&out_kif->pfik_packets[pd->naf == AF_INET6][1]
9943
[action != PF_PASS && action != PF_AFRT], 1);
9944
pf_counter_u64_critical_exit();
9945
} else {
9946
if (pf_test(AF_INET, PF_OUT, PFIL_FWD, ifp, &m0, inp,
9947
&pd->act) != PF_PASS) {
9948
action = PF_DROP;
9949
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
9950
goto bad;
9951
} else if (m0 == NULL) {
9952
action = PF_DROP;
9953
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
9954
goto done;
9955
}
9956
if (m0->m_len < sizeof(struct ip)) {
9957
DPFPRINTF(PF_DEBUG_URGENT,
9958
"%s: m0->m_len < sizeof(struct ip)", __func__);
9959
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
9960
action = PF_DROP;
9961
goto bad;
9962
}
9963
ip = mtod(m0, struct ip *);
9964
}
9965
}
9966
9967
if (ifp->if_flags & IFF_LOOPBACK)
9968
m0->m_flags |= M_SKIP_FIREWALL;
9969
9970
ip_len = ntohs(ip->ip_len);
9971
ip_off = ntohs(ip->ip_off);
9972
9973
/* Copied from FreeBSD 10.0-CURRENT ip_output. */
9974
m0->m_pkthdr.csum_flags |= CSUM_IP;
9975
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
9976
in_delayed_cksum(m0);
9977
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
9978
}
9979
if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
9980
pf_sctp_checksum(m0, (uint32_t)(ip->ip_hl << 2));
9981
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
9982
}
9983
9984
if (pd->dir == PF_IN) {
9985
/*
9986
* Make sure dummynet gets the correct direction, in case it needs to
9987
* re-inject later.
9988
*/
9989
pd->dir = PF_OUT;
9990
9991
/*
9992
* The following processing is actually the rest of the inbound processing, even
9993
* though we've marked it as outbound (so we don't look through dummynet) and it
9994
* happens after the outbound processing (pf_test(PF_OUT) above).
9995
* Swap the dummynet pipe numbers, because it's going to come to the wrong
9996
* conclusion about what direction it's processing, and we can't fix it or it
9997
* will re-inject incorrectly. Swapping the pipe numbers means that its incorrect
9998
* decision will pick the right pipe, and everything will mostly work as expected.
9999
*/
10000
tmp = pd->act.dnrpipe;
10001
pd->act.dnrpipe = pd->act.dnpipe;
10002
pd->act.dnpipe = tmp;
10003
}
10004
10005
/*
10006
* If small enough for interface, or the interface will take
10007
* care of the fragmentation for us, we can just send directly.
10008
*/
10009
if (ip_len <= ifp->if_mtu ||
10010
(m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
10011
ip->ip_sum = 0;
10012
if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
10013
ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
10014
m0->m_pkthdr.csum_flags &= ~CSUM_IP;
10015
}
10016
m_clrprotoflags(m0); /* Avoid confusing lower layers. */
10017
10018
md = m0;
10019
error = pf_dummynet_route(pd, s, r, ifp,
10020
(const struct sockaddr *)gw, &md);
10021
if (md != NULL) {
10022
error = (*ifp->if_output)(ifp, md,
10023
(const struct sockaddr *)gw, (struct route *)&ro);
10024
SDT_PROBE2(pf, ip, route_to, output, ifp, error);
10025
}
10026
goto done;
10027
}
10028
10029
/* Balk when the DF bit is set or the interface doesn't support TSO. */
10030
if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
10031
error = EMSGSIZE;
10032
KMOD_IPSTAT_INC(ips_cantfrag);
10033
if (pd->act.rt != PF_DUPTO) {
10034
if (s && s->nat_rule != NULL) {
10035
MPASS(m0 == pd->m);
10036
PACKET_UNDO_NAT(pd,
10037
(ip->ip_hl << 2) + (ip_off & IP_OFFMASK),
10038
s);
10039
}
10040
10041
pf_send_icmp(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
10042
ifp->if_mtu, pd->af, r, pd->act.rtableid);
10043
}
10044
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
10045
/* Return pass, so we return PFIL_CONSUMED to the stack. */
10046
action = PF_PASS;
10047
goto bad;
10048
}
10049
10050
error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
10051
if (error) {
10052
SDT_PROBE1(pf, ip, route_to, drop, __LINE__);
10053
action = PF_DROP;
10054
goto bad;
10055
}
10056
10057
for (; m0; m0 = m1) {
10058
m1 = m0->m_nextpkt;
10059
m0->m_nextpkt = NULL;
10060
if (error == 0) {
10061
m_clrprotoflags(m0);
10062
md = m0;
10063
pd->pf_mtag = pf_find_mtag(md);
10064
error = pf_dummynet_route(pd, s, r, ifp,
10065
(const struct sockaddr *)gw, &md);
10066
if (md != NULL) {
10067
error = (*ifp->if_output)(ifp, md,
10068
(const struct sockaddr *)gw,
10069
(struct route *)&ro);
10070
SDT_PROBE2(pf, ip, route_to, output, ifp, error);
10071
}
10072
} else
10073
m_freem(m0);
10074
}
10075
10076
if (error == 0)
10077
KMOD_IPSTAT_INC(ips_fragmented);
10078
10079
done:
10080
if (pd->act.rt != PF_DUPTO)
10081
pd->m = NULL;
10082
else
10083
action = PF_PASS;
10084
return (action);
10085
10086
bad_locked:
10087
if (s)
10088
PF_STATE_UNLOCK(s);
10089
bad:
10090
m_freem(m0);
10091
goto done;
10092
}
10093
#endif /* INET */
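/*
 * Illustrative sketch, not part of the original source: the dummynet
 * pipe swap done in the PF_IN path of pf_route() above (and in
 * pf_route6() below).  Because the packet has already been re-marked as
 * outbound before it reaches dummynet, the forward and reverse pipe
 * numbers are exchanged so that dummynet's (now wrong) idea of the
 * direction still selects the pipe the rule intended.  The helper name
 * below is hypothetical.
 */
#if 0
static void
example_swap_dummynet_pipes(struct pf_rule_actions *act)
{
	uint16_t tmp;

	/* Exchange the forward and reverse pipe/queue numbers. */
	tmp = act->dnrpipe;
	act->dnrpipe = act->dnpipe;
	act->dnpipe = tmp;
}
#endif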
10094
10095
#ifdef INET6
10096
static int
10097
pf_route6(struct pf_krule *r, struct ifnet *oifp,
10098
struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
10099
{
10100
struct mbuf *m0, *md;
10101
struct m_tag *mtag;
10102
struct sockaddr_in6 dst;
10103
struct ip6_hdr *ip6;
10104
struct ifnet *ifp = NULL;
10105
int r_dir;
10106
bool skip_test = false;
10107
int action = PF_PASS;
10108
10109
KASSERT(pd->m && r && oifp, ("%s: invalid parameters", __func__));
10110
10111
SDT_PROBE4(pf, ip6, route_to, entry, pd->m, pd, s, oifp);
10112
10113
if (s) {
10114
r_dir = s->direction;
10115
} else {
10116
r_dir = r->direction;
10117
}
10118
10119
KASSERT(pd->dir == PF_IN || pd->dir == PF_OUT ||
10120
r_dir == PF_IN || r_dir == PF_OUT, ("%s: invalid direction",
10121
__func__));
10122
10123
if ((pd->pf_mtag == NULL &&
10124
((pd->pf_mtag = pf_get_mtag(pd->m)) == NULL)) ||
10125
pd->pf_mtag->routed++ > 3) {
10126
m0 = pd->m;
10127
pd->m = NULL;
10128
action = PF_DROP;
10129
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10130
goto bad_locked;
10131
}
10132
10133
if (pd->act.rt_kif != NULL)
10134
ifp = pd->act.rt_kif->pfik_ifp;
10135
10136
if (pd->act.rt == PF_DUPTO) {
10137
if ((pd->pf_mtag->flags & PF_MTAG_FLAG_DUPLICATED)) {
10138
if (s != NULL) {
10139
PF_STATE_UNLOCK(s);
10140
}
10141
if (ifp == oifp) {
10142
/* When the 2nd interface is not skipped */
10143
return (action);
10144
} else {
10145
m0 = pd->m;
10146
pd->m = NULL;
10147
action = PF_DROP;
10148
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10149
goto bad;
10150
}
10151
} else {
10152
pd->pf_mtag->flags |= PF_MTAG_FLAG_DUPLICATED;
10153
if (((m0 = m_dup(pd->m, M_NOWAIT)) == NULL)) {
10154
if (s)
10155
PF_STATE_UNLOCK(s);
10156
return (action);
10157
}
10158
}
10159
} else {
10160
if ((pd->act.rt == PF_REPLYTO) == (r_dir == pd->dir)) {
10161
if (pd->af == pd->naf) {
10162
pf_dummynet(pd, s, r, &pd->m);
10163
if (s)
10164
PF_STATE_UNLOCK(s);
10165
return (action);
10166
} else {
10167
if (r_dir == PF_IN) {
10168
skip_test = true;
10169
}
10170
}
10171
}
10172
10173
/*
10174
* Handle the case where we're actually doing route-to and af-to
10175
* while in the reply direction.
10176
*/
10177
if (pd->act.rt_kif && pd->act.rt_kif->pfik_ifp &&
10178
pd->af != pd->naf) {
10179
if (pd->act.rt == PF_ROUTETO && r->naf != AF_INET6) {
10180
/* Un-set ifp so we do a plain route lookup. */
10181
ifp = NULL;
10182
}
10183
if (pd->act.rt == PF_REPLYTO && r->naf != AF_INET) {
10184
/* Un-set ifp so we do a plain route lookup. */
10185
ifp = NULL;
10186
}
10187
}
10188
m0 = pd->m;
10189
}
10190
10191
ip6 = mtod(m0, struct ip6_hdr *);
10192
10193
bzero(&dst, sizeof(dst));
10194
dst.sin6_family = AF_INET6;
10195
dst.sin6_len = sizeof(dst);
10196
pf_addrcpy((struct pf_addr *)&dst.sin6_addr, &pd->act.rt_addr,
10197
AF_INET6);
10198
10199
if (pd->dir == PF_IN) {
10200
if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
10201
if (r->rt != PF_DUPTO && pd->naf == pd->af)
10202
pf_send_icmp(m0, ICMP6_TIME_EXCEEDED,
10203
ICMP6_TIME_EXCEED_TRANSIT, 0, pd->af, r,
10204
pd->act.rtableid);
10205
action = PF_DROP;
10206
goto bad_locked;
10207
}
10208
ip6->ip6_hlim -= IPV6_HLIMDEC;
10209
}
10210
10211
if (s != NULL) {
10212
if (ifp == NULL && (pd->af != pd->naf)) {
10213
const struct nhop_object *nh;
10214
nh = fib6_lookup(M_GETFIB(m0), &ip6->ip6_dst, 0, NHR_NONE, 0);
10215
if (nh) {
10216
ifp = nh->nh_ifp;
10217
10218
/* Use the gateway if needed. */
10219
if (nh->nh_flags & NHF_GATEWAY)
10220
bcopy(&nh->gw6_sa.sin6_addr, &dst.sin6_addr,
10221
sizeof(dst.sin6_addr));
10222
else
10223
dst.sin6_addr = ip6->ip6_dst;
10224
}
10225
}
10226
PF_STATE_UNLOCK(s);
10227
}
10228
10229
if (pd->af != pd->naf) {
10230
struct udphdr *uh = &pd->hdr.udp;
10231
10232
if (pd->proto == IPPROTO_UDP && uh->uh_sum == 0) {
10233
uh->uh_sum = in6_cksum_pseudo(ip6,
10234
ntohs(uh->uh_ulen), IPPROTO_UDP, 0);
10235
m_copyback(m0, pd->off, sizeof(*uh), pd->hdr.any);
10236
}
10237
}
10238
10239
if (ifp == NULL) {
10240
m0 = pd->m;
10241
pd->m = NULL;
10242
action = PF_DROP;
10243
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10244
goto bad;
10245
}
10246
10247
/*
10248
* Bind to the correct interface if we're if-bound. We don't know which
10249
* interface that will be until here, so we've inserted the state
10250
* on V_pfi_all. Fix that now.
10251
*/
10252
if (s != NULL && s->kif == V_pfi_all && r->rule_flag & PFRULE_IFBOUND) {
10253
/* Verify that we're here because of BOUND_IFACE */
10254
MPASS(r->rt == PF_REPLYTO || (pd->af != pd->naf && s->direction == PF_IN));
10255
s->kif = ifp->if_pf_kif;
10256
if (pd->act.rt == PF_REPLYTO) {
10257
s->orig_kif = oifp->if_pf_kif;
10258
}
10259
}
10260
10261
if (r->rt == PF_DUPTO || (pd->af != pd->naf && s->direction == PF_IN))
10262
skip_test = true;
10263
10264
if (pd->dir == PF_IN) {
10265
if (skip_test) {
10266
struct pfi_kkif *out_kif = (struct pfi_kkif *)ifp->if_pf_kif;
10267
MPASS(s != NULL);
10268
pf_counter_u64_critical_enter();
10269
pf_counter_u64_add_protected(
10270
&out_kif->pfik_bytes[pd->naf == AF_INET6][1]
10271
[action != PF_PASS && action != PF_AFRT], pd->tot_len);
10272
pf_counter_u64_add_protected(
10273
&out_kif->pfik_packets[pd->naf == AF_INET6][1]
10274
[action != PF_PASS && action != PF_AFRT], 1);
10275
pf_counter_u64_critical_exit();
10276
} else {
10277
if (pf_test(AF_INET6, PF_OUT, PFIL_FWD | PF_PFIL_NOREFRAGMENT,
10278
ifp, &m0, inp, &pd->act) != PF_PASS) {
10279
action = PF_DROP;
10280
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10281
goto bad;
10282
} else if (m0 == NULL) {
10283
action = PF_DROP;
10284
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10285
goto done;
10286
}
10287
if (m0->m_len < sizeof(struct ip6_hdr)) {
10288
DPFPRINTF(PF_DEBUG_URGENT,
10289
"%s: m0->m_len < sizeof(struct ip6_hdr)",
10290
__func__);
10291
action = PF_DROP;
10292
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10293
goto bad;
10294
}
10295
ip6 = mtod(m0, struct ip6_hdr *);
10296
}
10297
}
10298
10299
if (ifp->if_flags & IFF_LOOPBACK)
10300
m0->m_flags |= M_SKIP_FIREWALL;
10301
10302
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
10303
~ifp->if_hwassist) {
10304
uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6);
10305
in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr));
10306
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
10307
}
10308
10309
if (pd->dir == PF_IN) {
10310
uint16_t tmp;
10311
/*
10312
* Make sure dummynet gets the correct direction, in case it needs to
10313
* re-inject later.
10314
*/
10315
pd->dir = PF_OUT;
10316
10317
/*
10318
* The following processing is actually the rest of the inbound processing, even
10319
* though we've marked it as outbound (so we don't look through dummynet) and it
10320
* happens after the outbound processing (pf_test(PF_OUT) above).
10321
* Swap the dummynet pipe numbers, because it's going to come to the wrong
10322
* conclusion about what direction it's processing, and we can't fix it or it
10323
* will re-inject incorrectly. Swapping the pipe numbers means that its incorrect
10324
* decision will pick the right pipe, and everything will mostly work as expected.
10325
*/
10326
tmp = pd->act.dnrpipe;
10327
pd->act.dnrpipe = pd->act.dnpipe;
10328
pd->act.dnpipe = tmp;
10329
}
10330
10331
/*
10332
* If the packet is too large for the outgoing interface,
10333
* send back an icmp6 error.
10334
*/
10335
if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
10336
dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
10337
mtag = m_tag_find(m0, PACKET_TAG_PF_REASSEMBLED, NULL);
10338
if (mtag != NULL) {
10339
int ret __sdt_used;
10340
ret = pf_refragment6(ifp, &m0, mtag, ifp, true);
10341
SDT_PROBE2(pf, ip6, route_to, output, ifp, ret);
10342
goto done;
10343
}
10344
10345
if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
10346
md = m0;
10347
pf_dummynet_route(pd, s, r, ifp, sintosa(&dst), &md);
10348
if (md != NULL) {
10349
int ret __sdt_used;
10350
ret = nd6_output_ifp(ifp, ifp, md, &dst, NULL);
10351
SDT_PROBE2(pf, ip6, route_to, output, ifp, ret);
10352
}
10353
}
10354
else {
10355
in6_ifstat_inc(ifp, ifs6_in_toobig);
10356
if (pd->act.rt != PF_DUPTO) {
10357
if (s && s->nat_rule != NULL) {
10358
MPASS(m0 == pd->m);
10359
PACKET_UNDO_NAT(pd,
10360
((caddr_t)ip6 - m0->m_data) +
10361
sizeof(struct ip6_hdr), s);
10362
}
10363
10364
if (r->rt != PF_DUPTO)
10365
pf_send_icmp(m0, ICMP6_PACKET_TOO_BIG, 0,
10366
ifp->if_mtu, pd->af, r, pd->act.rtableid);
10367
}
10368
/* Return pass, so we return PFIL_CONSUMED to the stack. */
10369
action = PF_PASS;
10370
SDT_PROBE1(pf, ip6, route_to, drop, __LINE__);
10371
goto bad;
10372
}
10373
10374
done:
10375
if (pd->act.rt != PF_DUPTO)
10376
pd->m = NULL;
10377
else
10378
action = PF_PASS;
10379
return (action);
10380
10381
bad_locked:
10382
if (s)
10383
PF_STATE_UNLOCK(s);
10384
bad:
10385
m_freem(m0);
10386
goto done;
10387
}
10388
#endif /* INET6 */
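/*
 * Illustrative sketch, not part of the original source: why pf_route6()
 * fills in a zero UDP checksum during af-to.  IPv6 forbids UDP datagrams
 * with a checksum of zero, so a packet translated from IPv4 (where a
 * zero checksum is legal) must at least carry the pseudo-header sum
 * before it is emitted as IPv6.  The helper name is hypothetical; the
 * kernel routine in6_cksum_pseudo() is the one used above.
 */
#if 0
static void
example_fix_udp_cksum_for_afto(struct ip6_hdr *ip6, struct udphdr *uh)
{
	/* Only datagrams that legally carried no checksum need fixing. */
	if (uh->uh_sum == 0)
		uh->uh_sum = in6_cksum_pseudo(ip6, ntohs(uh->uh_ulen),
		    IPPROTO_UDP, 0);
}
#endif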
10389
10390
/*
10391
* FreeBSD supports cksum offloads for the following drivers.
10392
* em(4), fxp(4), lge(4), nge(4), re(4), ti(4), txp(4), xl(4)
10393
*
10394
* CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
10395
* network driver performed cksum including pseudo header; we need to verify
10396
* csum_data
10397
* CSUM_DATA_VALID :
10398
* network driver performed cksum, but an additional pseudo-header
10399
* cksum computation with the partial csum_data is needed (i.e. lack of
10400
* H/W support for the pseudo header, for instance sk(4) and possibly gem(4))
10401
*
10402
* After validating the cksum of packet, set both flag CSUM_DATA_VALID and
10403
* CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper
10404
* TCP/UDP layer.
10405
* Also, set csum_data to 0xffff to force cksum validation.
10406
*/
10407
static int
10408
pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
10409
{
10410
u_int16_t sum = 0;
10411
int hw_assist = 0;
10412
struct ip *ip;
10413
10414
if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
10415
return (1);
10416
if (m->m_pkthdr.len < off + len)
10417
return (1);
10418
10419
switch (p) {
10420
case IPPROTO_TCP:
10421
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
10422
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
10423
sum = m->m_pkthdr.csum_data;
10424
} else {
10425
ip = mtod(m, struct ip *);
10426
sum = in_pseudo(ip->ip_src.s_addr,
10427
ip->ip_dst.s_addr, htonl((u_short)len +
10428
m->m_pkthdr.csum_data + IPPROTO_TCP));
10429
}
10430
sum ^= 0xffff;
10431
++hw_assist;
10432
}
10433
break;
10434
case IPPROTO_UDP:
10435
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
10436
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
10437
sum = m->m_pkthdr.csum_data;
10438
} else {
10439
ip = mtod(m, struct ip *);
10440
sum = in_pseudo(ip->ip_src.s_addr,
10441
ip->ip_dst.s_addr, htonl((u_short)len +
10442
m->m_pkthdr.csum_data + IPPROTO_UDP));
10443
}
10444
sum ^= 0xffff;
10445
++hw_assist;
10446
}
10447
break;
10448
case IPPROTO_ICMP:
10449
#ifdef INET6
10450
case IPPROTO_ICMPV6:
10451
#endif /* INET6 */
10452
break;
10453
default:
10454
return (1);
10455
}
10456
10457
if (!hw_assist) {
10458
switch (af) {
10459
case AF_INET:
10460
if (m->m_len < sizeof(struct ip))
10461
return (1);
10462
sum = in4_cksum(m, (p == IPPROTO_ICMP ? 0 : p), off, len);
10463
break;
10464
#ifdef INET6
10465
case AF_INET6:
10466
if (m->m_len < sizeof(struct ip6_hdr))
10467
return (1);
10468
sum = in6_cksum(m, p, off, len);
10469
break;
10470
#endif /* INET6 */
10471
}
10472
}
10473
if (sum) {
10474
switch (p) {
10475
case IPPROTO_TCP:
10476
{
10477
KMOD_TCPSTAT_INC(tcps_rcvbadsum);
10478
break;
10479
}
10480
case IPPROTO_UDP:
10481
{
10482
KMOD_UDPSTAT_INC(udps_badsum);
10483
break;
10484
}
10485
#ifdef INET
10486
case IPPROTO_ICMP:
10487
{
10488
KMOD_ICMPSTAT_INC(icps_checksum);
10489
break;
10490
}
10491
#endif
10492
#ifdef INET6
10493
case IPPROTO_ICMPV6:
10494
{
10495
KMOD_ICMP6STAT_INC(icp6s_checksum);
10496
break;
10497
}
10498
#endif /* INET6 */
10499
}
10500
return (1);
10501
} else {
10502
if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
10503
m->m_pkthdr.csum_flags |=
10504
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
10505
m->m_pkthdr.csum_data = 0xffff;
10506
}
10507
}
10508
return (0);
10509
}
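/*
 * Illustrative sketch, not part of the original source: how a partial
 * hardware checksum (CSUM_DATA_VALID without CSUM_PSEUDO_HDR) is
 * completed in pf_check_proto_cksum() above.  The driver has summed the
 * TCP/UDP data only, so the pseudo header is folded in with in_pseudo()
 * and the result is inverted; a final value of zero means the checksum
 * verifies.  The helper name is hypothetical.
 */
#if 0
static int
example_complete_partial_cksum(struct mbuf *m, int len, u_int8_t proto)
{
	struct ip *ip = mtod(m, struct ip *);
	u_int16_t sum;

	/* Fold the pseudo header into the driver-provided partial sum. */
	sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htonl((u_short)len + m->m_pkthdr.csum_data + proto));
	sum ^= 0xffff;

	return (sum == 0);	/* a non-zero residue means a bad checksum */
}
#endif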
10510
10511
static bool
10512
pf_pdesc_to_dnflow(const struct pf_pdesc *pd, const struct pf_krule *r,
10513
const struct pf_kstate *s, struct ip_fw_args *dnflow)
10514
{
10515
int dndir = r->direction;
10516
sa_family_t af = pd->naf;
10517
10518
if (s && dndir == PF_INOUT) {
10519
dndir = s->direction;
10520
} else if (dndir == PF_INOUT) {
10521
/* Assume primary direction. Happens when we've set dnpipe in
10522
* the ethernet level code. */
10523
dndir = pd->dir;
10524
}
10525
10526
if (pd->pf_mtag->flags & PF_MTAG_FLAG_DUMMYNETED)
10527
return (false);
10528
10529
memset(dnflow, 0, sizeof(*dnflow));
10530
10531
if (pd->dport != NULL)
10532
dnflow->f_id.dst_port = ntohs(*pd->dport);
10533
if (pd->sport != NULL)
10534
dnflow->f_id.src_port = ntohs(*pd->sport);
10535
10536
if (pd->dir == PF_IN)
10537
dnflow->flags |= IPFW_ARGS_IN;
10538
else
10539
dnflow->flags |= IPFW_ARGS_OUT;
10540
10541
if (pd->dir != dndir && pd->act.dnrpipe) {
10542
dnflow->rule.info = pd->act.dnrpipe;
10543
}
10544
else if (pd->dir == dndir && pd->act.dnpipe) {
10545
dnflow->rule.info = pd->act.dnpipe;
10546
}
10547
else {
10548
return (false);
10549
}
10550
10551
dnflow->rule.info |= IPFW_IS_DUMMYNET;
10552
if (r->free_flags & PFRULE_DN_IS_PIPE || pd->act.flags & PFSTATE_DN_IS_PIPE)
10553
dnflow->rule.info |= IPFW_IS_PIPE;
10554
10555
dnflow->f_id.proto = pd->proto;
10556
dnflow->f_id.extra = dnflow->rule.info;
10557
if (s)
10558
af = s->key[PF_SK_STACK]->af;
10559
10560
switch (af) {
10561
case AF_INET:
10562
dnflow->f_id.addr_type = 4;
10563
if (s) {
10564
dnflow->f_id.src_ip = htonl(
10565
s->key[PF_SK_STACK]->addr[pd->sidx].v4.s_addr);
10566
dnflow->f_id.dst_ip = htonl(
10567
s->key[PF_SK_STACK]->addr[pd->didx].v4.s_addr);
10568
} else {
10569
dnflow->f_id.src_ip = ntohl(pd->src->v4.s_addr);
10570
dnflow->f_id.dst_ip = ntohl(pd->dst->v4.s_addr);
10571
}
10572
break;
10573
case AF_INET6:
10574
dnflow->f_id.addr_type = 6;
10575
10576
if (s) {
10577
dnflow->f_id.src_ip6 =
10578
s->key[PF_SK_STACK]->addr[pd->sidx].v6;
10579
dnflow->f_id.dst_ip6 =
10580
s->key[PF_SK_STACK]->addr[pd->didx].v6;
10581
} else {
10582
dnflow->f_id.src_ip6 = pd->src->v6;
10583
dnflow->f_id.dst_ip6 = pd->dst->v6;
10584
}
10585
break;
10586
}
10587
10588
/*
10589
* Separate this out, because while we pass the pre-NAT addresses to
10590
* dummynet we want the post-nat address family in case of nat64.
10591
* Dummynet may call ip_output/ip6_output itself, and we need it to
10592
* call the correct one.
10593
*/
10594
if (pd->naf == AF_INET6)
10595
dnflow->flags |= IPFW_ARGS_IP6;
10596
10597
return (true);
10598
}
10599
10600
int
10601
pf_test_eth(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0,
10602
struct inpcb *inp)
10603
{
10604
struct pfi_kkif *kif;
10605
struct mbuf *m = *m0;
10606
10607
M_ASSERTPKTHDR(m);
10608
MPASS(ifp->if_vnet == curvnet);
10609
NET_EPOCH_ASSERT();
10610
10611
if (!V_pf_status.running)
10612
return (PF_PASS);
10613
10614
kif = (struct pfi_kkif *)ifp->if_pf_kif;
10615
10616
if (kif == NULL) {
10617
DPFPRINTF(PF_DEBUG_URGENT,
10618
"%s: kif == NULL, if_xname %s", __func__, ifp->if_xname);
10619
return (PF_DROP);
10620
}
10621
if (kif->pfik_flags & PFI_IFLAG_SKIP)
10622
return (PF_PASS);
10623
10624
if (m->m_flags & M_SKIP_FIREWALL)
10625
return (PF_PASS);
10626
10627
if (__predict_false(! M_WRITABLE(*m0))) {
10628
m = *m0 = m_unshare(*m0, M_NOWAIT);
10629
if (*m0 == NULL)
10630
return (PF_DROP);
10631
}
10632
10633
/* Stateless! */
10634
return (pf_test_eth_rule(dir, kif, m0));
10635
}
10636
10637
static __inline void
10638
pf_dummynet_flag_remove(struct mbuf *m, struct pf_mtag *pf_mtag)
10639
{
10640
struct m_tag *mtag;
10641
10642
pf_mtag->flags &= ~PF_MTAG_FLAG_DUMMYNET;
10643
10644
/* dummynet adds this tag, but pf does not need it,
10645
* and keeping it creates unexpected behavior,
10646
* e.g. in case of divert(4) usage right after dummynet. */
10647
mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
10648
if (mtag != NULL)
10649
m_tag_delete(m, mtag);
10650
}
10651
10652
static int
10653
pf_dummynet(struct pf_pdesc *pd, struct pf_kstate *s,
10654
struct pf_krule *r, struct mbuf **m0)
10655
{
10656
return (pf_dummynet_route(pd, s, r, NULL, NULL, m0));
10657
}
10658
10659
static int
10660
pf_dummynet_route(struct pf_pdesc *pd, struct pf_kstate *s,
10661
struct pf_krule *r, struct ifnet *ifp, const struct sockaddr *sa,
10662
struct mbuf **m0)
10663
{
10664
struct ip_fw_args dnflow;
10665
10666
NET_EPOCH_ASSERT();
10667
10668
if (pd->act.dnpipe == 0 && pd->act.dnrpipe == 0)
10669
return (0);
10670
10671
if (ip_dn_io_ptr == NULL) {
10672
m_freem(*m0);
10673
*m0 = NULL;
10674
return (ENOMEM);
10675
}
10676
10677
if (pd->pf_mtag == NULL &&
10678
((pd->pf_mtag = pf_get_mtag(*m0)) == NULL)) {
10679
m_freem(*m0);
10680
*m0 = NULL;
10681
return (ENOMEM);
10682
}
10683
10684
if (ifp != NULL) {
10685
pd->pf_mtag->flags |= PF_MTAG_FLAG_ROUTE_TO;
10686
10687
pd->pf_mtag->if_index = ifp->if_index;
10688
pd->pf_mtag->if_idxgen = ifp->if_idxgen;
10689
10690
MPASS(sa != NULL);
10691
10692
switch (sa->sa_family) {
10693
case AF_INET:
10694
memcpy(&pd->pf_mtag->dst, sa,
10695
sizeof(struct sockaddr_in));
10696
break;
10697
case AF_INET6:
10698
memcpy(&pd->pf_mtag->dst, sa,
10699
sizeof(struct sockaddr_in6));
10700
break;
10701
}
10702
}
10703
10704
if (s != NULL && s->nat_rule != NULL &&
10705
s->nat_rule->action == PF_RDR &&
10706
(
10707
#ifdef INET
10708
(pd->af == AF_INET && IN_LOOPBACK(ntohl(pd->dst->v4.s_addr))) ||
10709
#endif /* INET */
10710
(pd->af == AF_INET6 && IN6_IS_ADDR_LOOPBACK(&pd->dst->v6)))) {
10711
/*
10712
* If we're redirecting to loopback mark this packet
10713
* as being local. Otherwise it might get dropped
10714
* if dummynet re-injects.
10715
*/
10716
(*m0)->m_pkthdr.rcvif = V_loif;
10717
}
10718
10719
if (pf_pdesc_to_dnflow(pd, r, s, &dnflow)) {
10720
pd->pf_mtag->flags |= PF_MTAG_FLAG_DUMMYNET;
10721
pd->pf_mtag->flags |= PF_MTAG_FLAG_DUMMYNETED;
10722
ip_dn_io_ptr(m0, &dnflow);
10723
if (*m0 != NULL) {
10724
pd->pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO;
10725
pf_dummynet_flag_remove(*m0, pd->pf_mtag);
10726
}
10727
}
10728
10729
return (0);
10730
}
10731
10732
static int
10733
pf_walk_option(struct pf_pdesc *pd, struct ip *h, int off, int end,
10734
u_short *reason)
10735
{
10736
uint8_t type, length, opts[15 * 4 - sizeof(struct ip)];
10737
10738
/* IP header in payload of ICMP packet may be too short */
10739
if (pd->m->m_pkthdr.len < end) {
10740
DPFPRINTF(PF_DEBUG_MISC, "IP option too short");
10741
REASON_SET(reason, PFRES_SHORT);
10742
return (PF_DROP);
10743
}
10744
10745
MPASS(end - off <= sizeof(opts));
10746
m_copydata(pd->m, off, end - off, opts);
10747
end -= off;
10748
off = 0;
10749
10750
while (off < end) {
10751
type = opts[off];
10752
if (type == IPOPT_EOL)
10753
break;
10754
if (type == IPOPT_NOP) {
10755
off++;
10756
continue;
10757
}
10758
if (off + 2 > end) {
10759
DPFPRINTF(PF_DEBUG_MISC, "IP length opt");
10760
REASON_SET(reason, PFRES_IPOPTIONS);
10761
return (PF_DROP);
10762
}
10763
length = opts[off + 1];
10764
if (length < 2) {
10765
DPFPRINTF(PF_DEBUG_MISC, "IP short opt");
10766
REASON_SET(reason, PFRES_IPOPTIONS);
10767
return (PF_DROP);
10768
}
10769
if (off + length > end) {
10770
DPFPRINTF(PF_DEBUG_MISC, "IP long opt");
10771
REASON_SET(reason, PFRES_IPOPTIONS);
10772
return (PF_DROP);
10773
}
10774
switch (type) {
10775
case IPOPT_RA:
10776
pd->badopts |= PF_OPT_ROUTER_ALERT;
10777
break;
10778
default:
10779
pd->badopts |= PF_OPT_OTHER;
10780
break;
10781
}
10782
off += length;
10783
}
10784
10785
return (PF_PASS);
10786
}
10787
10788
static int
10789
pf_walk_header(struct pf_pdesc *pd, struct ip *h, u_short *reason)
10790
{
10791
struct ah ext;
10792
u_int32_t hlen, end;
10793
int hdr_cnt;
10794
10795
hlen = h->ip_hl << 2;
10796
if (hlen < sizeof(struct ip) || hlen > ntohs(h->ip_len)) {
10797
REASON_SET(reason, PFRES_SHORT);
10798
return (PF_DROP);
10799
}
10800
if (hlen != sizeof(struct ip)) {
10801
if (pf_walk_option(pd, h, pd->off + sizeof(struct ip),
10802
pd->off + hlen, reason) != PF_PASS)
10803
return (PF_DROP);
10804
/* header options which contain only padding are fishy */
10805
if (pd->badopts == 0)
10806
pd->badopts |= PF_OPT_OTHER;
10807
}
10808
end = pd->off + ntohs(h->ip_len);
10809
pd->off += hlen;
10810
pd->proto = h->ip_p;
10811
/* IGMP packets have router alert options, allow them */
10812
if (pd->proto == IPPROTO_IGMP) {
10813
/*
10814
* According to RFC 1112 ttl must be set to 1 in all IGMP
10815
* packets sent to 224.0.0.1
10816
*/
10817
if ((h->ip_ttl != 1) &&
10818
(h->ip_dst.s_addr == INADDR_ALLHOSTS_GROUP)) {
10819
DPFPRINTF(PF_DEBUG_MISC, "Invalid IGMP");
10820
REASON_SET(reason, PFRES_IPOPTIONS);
10821
return (PF_DROP);
10822
}
10823
pd->badopts &= ~PF_OPT_ROUTER_ALERT;
10824
}
10825
/* stop walking over non initial fragments */
10826
if ((h->ip_off & htons(IP_OFFMASK)) != 0)
10827
return (PF_PASS);
10828
for (hdr_cnt = 0; hdr_cnt < PF_HDR_LIMIT; hdr_cnt++) {
10829
switch (pd->proto) {
10830
case IPPROTO_AH:
10831
/* fragments may be short */
10832
if ((h->ip_off & htons(IP_MF | IP_OFFMASK)) != 0 &&
10833
end < pd->off + sizeof(ext))
10834
return (PF_PASS);
10835
if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext),
10836
reason, AF_INET)) {
10837
DPFPRINTF(PF_DEBUG_MISC, "IP short exthdr");
10838
return (PF_DROP);
10839
}
10840
pd->off += (ext.ah_len + 2) * 4;
10841
pd->proto = ext.ah_nxt;
10842
break;
10843
default:
10844
return (PF_PASS);
10845
}
10846
}
10847
DPFPRINTF(PF_DEBUG_MISC, "IPv4 nested authentication header limit");
10848
REASON_SET(reason, PFRES_IPOPTIONS);
10849
return (PF_DROP);
10850
}
10851
10852
#ifdef INET6
10853
static int
10854
pf_walk_option6(struct pf_pdesc *pd, struct ip6_hdr *h, int off, int end,
10855
u_short *reason)
10856
{
10857
struct ip6_opt opt;
10858
struct ip6_opt_jumbo jumbo;
10859
10860
while (off < end) {
10861
if (!pf_pull_hdr(pd->m, off, &opt.ip6o_type,
10862
sizeof(opt.ip6o_type), reason, AF_INET6)) {
10863
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short opt type");
10864
return (PF_DROP);
10865
}
10866
if (opt.ip6o_type == IP6OPT_PAD1) {
10867
off++;
10868
continue;
10869
}
10870
if (!pf_pull_hdr(pd->m, off, &opt, sizeof(opt),
10871
reason, AF_INET6)) {
10872
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short opt");
10873
return (PF_DROP);
10874
}
10875
if (off + sizeof(opt) + opt.ip6o_len > end) {
10876
DPFPRINTF(PF_DEBUG_MISC, "IPv6 long opt");
10877
REASON_SET(reason, PFRES_IPOPTIONS);
10878
return (PF_DROP);
10879
}
10880
switch (opt.ip6o_type) {
10881
case IP6OPT_PADN:
10882
break;
10883
case IP6OPT_JUMBO:
10884
pd->badopts |= PF_OPT_JUMBO;
10885
if (pd->jumbolen != 0) {
10886
DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple jumbo");
10887
REASON_SET(reason, PFRES_IPOPTIONS);
10888
return (PF_DROP);
10889
}
10890
if (ntohs(h->ip6_plen) != 0) {
10891
DPFPRINTF(PF_DEBUG_MISC, "IPv6 bad jumbo plen");
10892
REASON_SET(reason, PFRES_IPOPTIONS);
10893
return (PF_DROP);
10894
}
10895
if (!pf_pull_hdr(pd->m, off, &jumbo, sizeof(jumbo),
10896
reason, AF_INET6)) {
10897
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short jumbo");
10898
return (PF_DROP);
10899
}
10900
memcpy(&pd->jumbolen, jumbo.ip6oj_jumbo_len,
10901
sizeof(pd->jumbolen));
10902
pd->jumbolen = ntohl(pd->jumbolen);
10903
if (pd->jumbolen < IPV6_MAXPACKET) {
10904
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short jumbolen");
10905
REASON_SET(reason, PFRES_IPOPTIONS);
10906
return (PF_DROP);
10907
}
10908
break;
10909
case IP6OPT_ROUTER_ALERT:
10910
pd->badopts |= PF_OPT_ROUTER_ALERT;
10911
break;
10912
default:
10913
pd->badopts |= PF_OPT_OTHER;
10914
break;
10915
}
10916
off += sizeof(opt) + opt.ip6o_len;
10917
}
10918
10919
return (PF_PASS);
10920
}
10921
10922
int
10923
pf_walk_header6(struct pf_pdesc *pd, struct ip6_hdr *h, u_short *reason)
10924
{
10925
struct ip6_frag frag;
10926
struct ip6_ext ext;
10927
struct icmp6_hdr icmp6;
10928
struct ip6_rthdr rthdr;
10929
uint32_t end;
10930
int hdr_cnt, fraghdr_cnt = 0, rthdr_cnt = 0;
10931
10932
pd->off += sizeof(struct ip6_hdr);
10933
end = pd->off + ntohs(h->ip6_plen);
10934
pd->fragoff = pd->extoff = pd->jumbolen = 0;
10935
pd->proto = h->ip6_nxt;
10936
for (hdr_cnt = 0; hdr_cnt < PF_HDR_LIMIT; hdr_cnt++) {
10937
switch (pd->proto) {
10938
case IPPROTO_ROUTING:
10939
case IPPROTO_DSTOPTS:
10940
pd->badopts |= PF_OPT_OTHER;
10941
break;
10942
case IPPROTO_HOPOPTS:
10943
if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext),
10944
reason, AF_INET6)) {
10945
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short exthdr");
10946
return (PF_DROP);
10947
}
10948
if (pf_walk_option6(pd, h, pd->off + sizeof(ext),
10949
pd->off + (ext.ip6e_len + 1) * 8,
10950
reason) != PF_PASS)
10951
return (PF_DROP);
10952
/* option header which contains only padding is fishy */
10953
if (pd->badopts == 0)
10954
pd->badopts |= PF_OPT_OTHER;
10955
break;
10956
}
10957
switch (pd->proto) {
10958
case IPPROTO_FRAGMENT:
10959
if (fraghdr_cnt++) {
10960
DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple fragment");
10961
REASON_SET(reason, PFRES_FRAG);
10962
return (PF_DROP);
10963
}
10964
/* jumbo payload packets cannot be fragmented */
10965
if (pd->jumbolen != 0) {
10966
DPFPRINTF(PF_DEBUG_MISC, "IPv6 fragmented jumbo");
10967
REASON_SET(reason, PFRES_FRAG);
10968
return (PF_DROP);
10969
}
10970
if (!pf_pull_hdr(pd->m, pd->off, &frag, sizeof(frag),
10971
reason, AF_INET6)) {
10972
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short fragment");
10973
return (PF_DROP);
10974
}
10975
/* stop walking over non initial fragments */
10976
if (ntohs((frag.ip6f_offlg & IP6F_OFF_MASK)) != 0) {
10977
pd->fragoff = pd->off;
10978
return (PF_PASS);
10979
}
10980
/* RFC6946: reassemble only non atomic fragments */
10981
if (frag.ip6f_offlg & IP6F_MORE_FRAG)
10982
pd->fragoff = pd->off;
10983
pd->off += sizeof(frag);
10984
pd->proto = frag.ip6f_nxt;
10985
break;
10986
case IPPROTO_ROUTING:
10987
if (rthdr_cnt++) {
10988
DPFPRINTF(PF_DEBUG_MISC, "IPv6 multiple rthdr");
10989
REASON_SET(reason, PFRES_IPOPTIONS);
10990
return (PF_DROP);
10991
}
10992
/* fragments may be short */
10993
if (pd->fragoff != 0 && end < pd->off + sizeof(rthdr)) {
10994
pd->off = pd->fragoff;
10995
pd->proto = IPPROTO_FRAGMENT;
10996
return (PF_PASS);
10997
}
10998
if (!pf_pull_hdr(pd->m, pd->off, &rthdr, sizeof(rthdr),
10999
reason, AF_INET6)) {
11000
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short rthdr");
11001
return (PF_DROP);
11002
}
11003
if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
11004
DPFPRINTF(PF_DEBUG_MISC, "IPv6 rthdr0");
11005
REASON_SET(reason, PFRES_IPOPTIONS);
11006
return (PF_DROP);
11007
}
11008
/* FALLTHROUGH */
11009
case IPPROTO_HOPOPTS:
11010
/* RFC2460 4.1: Hop-by-Hop only after IPv6 header */
11011
if (pd->proto == IPPROTO_HOPOPTS && hdr_cnt > 0) {
11012
DPFPRINTF(PF_DEBUG_MISC, "IPv6 hopopts not first");
11013
REASON_SET(reason, PFRES_IPOPTIONS);
11014
return (PF_DROP);
11015
}
11016
/* FALLTHROUGH */
11017
case IPPROTO_AH:
11018
case IPPROTO_DSTOPTS:
11019
if (!pf_pull_hdr(pd->m, pd->off, &ext, sizeof(ext),
11020
reason, AF_INET6)) {
11021
DPFPRINTF(PF_DEBUG_MISC, "IPv6 short exthdr");
11022
return (PF_DROP);
11023
}
11024
/* fragments may be short */
11025
if (pd->fragoff != 0 && end < pd->off + sizeof(ext)) {
11026
pd->off = pd->fragoff;
11027
pd->proto = IPPROTO_FRAGMENT;
11028
return (PF_PASS);
11029
}
11030
/* reassembly needs the ext header before the frag */
11031
if (pd->fragoff == 0)
11032
pd->extoff = pd->off;
11033
if (pd->proto == IPPROTO_HOPOPTS && pd->fragoff == 0 &&
11034
ntohs(h->ip6_plen) == 0 && pd->jumbolen != 0) {
11035
DPFPRINTF(PF_DEBUG_MISC, "IPv6 missing jumbo");
11036
REASON_SET(reason, PFRES_IPOPTIONS);
11037
return (PF_DROP);
11038
}
11039
if (pd->proto == IPPROTO_AH)
11040
pd->off += (ext.ip6e_len + 2) * 4;
11041
else
11042
pd->off += (ext.ip6e_len + 1) * 8;
11043
pd->proto = ext.ip6e_nxt;
11044
break;
11045
case IPPROTO_ICMPV6:
11046
/* fragments may be short, ignore inner header then */
11047
if (pd->fragoff != 0 && end < pd->off + sizeof(icmp6)) {
11048
pd->off = pd->fragoff;
11049
pd->proto = IPPROTO_FRAGMENT;
11050
return (PF_PASS);
11051
}
11052
if (!pf_pull_hdr(pd->m, pd->off, &icmp6, sizeof(icmp6),
11053
reason, AF_INET6)) {
11054
DPFPRINTF(PF_DEBUG_MISC,
11055
"IPv6 short icmp6hdr");
11056
return (PF_DROP);
11057
}
11058
/* ICMP multicast packets have router alert options */
11059
switch (icmp6.icmp6_type) {
11060
case MLD_LISTENER_QUERY:
11061
case MLD_LISTENER_REPORT:
11062
case MLD_LISTENER_DONE:
11063
case MLDV2_LISTENER_REPORT:
11064
/*
11065
* According to RFC 2710 all MLD messages are
11066
* sent with hop-limit (ttl) set to 1, and a link-
11067
* local source address. If either one is
11068
* missing then the MLD message is invalid and
11069
* should be discarded.
11070
*/
11071
if ((h->ip6_hlim != 1) ||
11072
!IN6_IS_ADDR_LINKLOCAL(&h->ip6_src)) {
11073
DPFPRINTF(PF_DEBUG_MISC, "Invalid MLD");
11074
REASON_SET(reason, PFRES_IPOPTIONS);
11075
return (PF_DROP);
11076
}
11077
pd->badopts &= ~PF_OPT_ROUTER_ALERT;
11078
break;
11079
}
11080
return (PF_PASS);
11081
case IPPROTO_TCP:
11082
case IPPROTO_UDP:
11083
case IPPROTO_SCTP:
11084
/* fragments may be short, ignore inner header then */
11085
if (pd->fragoff != 0 && end < pd->off +
11086
(pd->proto == IPPROTO_TCP ? sizeof(struct tcphdr) :
11087
pd->proto == IPPROTO_UDP ? sizeof(struct udphdr) :
11088
pd->proto == IPPROTO_SCTP ? sizeof(struct sctphdr) :
11089
sizeof(struct icmp6_hdr))) {
11090
pd->off = pd->fragoff;
11091
pd->proto = IPPROTO_FRAGMENT;
11092
}
11093
/* FALLTHROUGH */
11094
default:
11095
return (PF_PASS);
11096
}
11097
}
11098
DPFPRINTF(PF_DEBUG_MISC, "IPv6 nested extension header limit");
11099
REASON_SET(reason, PFRES_IPOPTIONS);
11100
return (PF_DROP);
11101
}
11102
#endif /* INET6 */
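/*
 * Illustrative sketch, not part of the original source: the two length
 * units used by the header walkers above.  An AH header's length field
 * counts 32-bit words minus two, while the generic IPv6 extension
 * header length counts 8-octet units minus one, which is why the code
 * advances by (ah_len + 2) * 4 and (ip6e_len + 1) * 8 respectively.
 * The helper name is hypothetical.
 */
#if 0
static u_int
example_ext_hdr_octets(u_int8_t proto, u_int8_t len_field)
{
	/* AH: (len + 2) 32-bit words; other extensions: (len + 1) * 8 octets. */
	return (proto == IPPROTO_AH ?
	    (len_field + 2) * 4 : (len_field + 1) * 8);
}
#endif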
11103
11104
static void
11105
pf_init_pdesc(struct pf_pdesc *pd, struct mbuf *m)
11106
{
11107
memset(pd, 0, sizeof(*pd));
11108
pd->pf_mtag = pf_find_mtag(m);
11109
pd->m = m;
11110
}
11111
11112
static int
11113
pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
11114
u_short *action, u_short *reason, struct pfi_kkif *kif,
11115
struct pf_rule_actions *default_actions)
11116
{
11117
pd->dir = dir;
11118
pd->kif = kif;
11119
pd->m = *m0;
11120
pd->sidx = (dir == PF_IN) ? 0 : 1;
11121
pd->didx = (dir == PF_IN) ? 1 : 0;
11122
pd->af = pd->naf = af;
11123
11124
PF_RULES_ASSERT();
11125
11126
TAILQ_INIT(&pd->sctp_multihome_jobs);
11127
if (default_actions != NULL)
11128
memcpy(&pd->act, default_actions, sizeof(pd->act));
11129
11130
if (pd->pf_mtag && pd->pf_mtag->dnpipe) {
11131
pd->act.dnpipe = pd->pf_mtag->dnpipe;
11132
pd->act.flags = pd->pf_mtag->dnflags;
11133
}
11134
11135
switch (af) {
11136
#ifdef INET
11137
case AF_INET: {
11138
struct ip *h;
11139
11140
if (__predict_false((*m0)->m_len < sizeof(struct ip)) &&
11141
(pd->m = *m0 = m_pullup(*m0, sizeof(struct ip))) == NULL) {
11142
DPFPRINTF(PF_DEBUG_URGENT,
11143
"%s: m_len < sizeof(struct ip), pullup failed",
11144
__func__);
11145
*action = PF_DROP;
11146
REASON_SET(reason, PFRES_SHORT);
11147
return (PF_DROP);
11148
}
11149
11150
h = mtod(pd->m, struct ip *);
11151
if (pd->m->m_pkthdr.len < ntohs(h->ip_len)) {
11152
*action = PF_DROP;
11153
REASON_SET(reason, PFRES_SHORT);
11154
return (PF_DROP);
11155
}
11156
11157
if (pf_normalize_ip(reason, pd) != PF_PASS) {
11158
/* We do IP header normalization and packet reassembly here */
11159
*m0 = pd->m;
11160
*action = PF_DROP;
11161
return (PF_DROP);
11162
}
11163
*m0 = pd->m;
11164
h = mtod(pd->m, struct ip *);
11165
11166
if (pf_walk_header(pd, h, reason) != PF_PASS) {
11167
*action = PF_DROP;
11168
return (PF_DROP);
11169
}
11170
11171
pd->src = (struct pf_addr *)&h->ip_src;
11172
pd->dst = (struct pf_addr *)&h->ip_dst;
11173
pf_addrcpy(&pd->osrc, pd->src, af);
11174
pf_addrcpy(&pd->odst, pd->dst, af);
11175
pd->ip_sum = &h->ip_sum;
11176
pd->tos = h->ip_tos & ~IPTOS_ECN_MASK;
11177
pd->ttl = h->ip_ttl;
11178
pd->tot_len = ntohs(h->ip_len);
11179
pd->act.rtableid = -1;
11180
pd->df = h->ip_off & htons(IP_DF);
11181
pd->virtual_proto = (h->ip_off & htons(IP_MF | IP_OFFMASK)) ?
11182
PF_VPROTO_FRAGMENT : pd->proto;
11183
11184
break;
11185
}
11186
#endif /* INET */
11187
#ifdef INET6
11188
case AF_INET6: {
11189
struct ip6_hdr *h;
11190
11191
if (__predict_false((*m0)->m_len < sizeof(struct ip6_hdr)) &&
11192
(pd->m = *m0 = m_pullup(*m0, sizeof(struct ip6_hdr))) == NULL) {
11193
DPFPRINTF(PF_DEBUG_URGENT,
11194
"%s: m_len < sizeof(struct ip6_hdr)"
11195
", pullup failed", __func__);
11196
*action = PF_DROP;
11197
REASON_SET(reason, PFRES_SHORT);
11198
return (PF_DROP);
11199
}
11200
11201
h = mtod(pd->m, struct ip6_hdr *);
11202
if (pd->m->m_pkthdr.len <
11203
sizeof(struct ip6_hdr) + ntohs(h->ip6_plen)) {
11204
*action = PF_DROP;
11205
REASON_SET(reason, PFRES_SHORT);
11206
return (PF_DROP);
11207
}
11208
11209
/*
11210
* We do not support jumbograms. If we keep going, a zero ip6_plen
11211
* will do something bad, so drop the packet for now.
11212
*/
11213
if (htons(h->ip6_plen) == 0) {
11214
*action = PF_DROP;
11215
return (PF_DROP);
11216
}
11217
11218
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
11219
*action = PF_DROP;
11220
return (PF_DROP);
11221
}
11222
11223
h = mtod(pd->m, struct ip6_hdr *);
11224
pd->src = (struct pf_addr *)&h->ip6_src;
11225
pd->dst = (struct pf_addr *)&h->ip6_dst;
11226
pf_addrcpy(&pd->osrc, pd->src, af);
11227
pf_addrcpy(&pd->odst, pd->dst, af);
11228
pd->ip_sum = NULL;
11229
pd->tos = IPV6_DSCP(h);
11230
pd->ttl = h->ip6_hlim;
11231
pd->tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
11232
pd->act.rtableid = -1;
11233
11234
pd->virtual_proto = (pd->fragoff != 0) ?
11235
PF_VPROTO_FRAGMENT : pd->proto;
11236
11237
/* We do IP header normalization and packet reassembly here */
11238
if (pf_normalize_ip6(pd->fragoff, reason, pd) !=
11239
PF_PASS) {
11240
*m0 = pd->m;
11241
*action = PF_DROP;
11242
return (PF_DROP);
11243
}
11244
*m0 = pd->m;
11245
if (pd->m == NULL) {
11246
/* packet sits in reassembly queue, no error */
11247
*action = PF_PASS;
11248
return (PF_DROP);
11249
}
11250
11251
/* Update pointers into the packet. */
11252
h = mtod(pd->m, struct ip6_hdr *);
11253
pd->src = (struct pf_addr *)&h->ip6_src;
11254
pd->dst = (struct pf_addr *)&h->ip6_dst;
11255
11256
pd->off = 0;
11257
11258
if (pf_walk_header6(pd, h, reason) != PF_PASS) {
11259
*action = PF_DROP;
11260
return (PF_DROP);
11261
}
11262
11263
if (m_tag_find(pd->m, PACKET_TAG_PF_REASSEMBLED, NULL) != NULL) {
11264
/*
11265
* Reassembly may have changed the next protocol from
11266
* fragment to something else, so update.
11267
*/
11268
pd->virtual_proto = pd->proto;
11269
MPASS(pd->fragoff == 0);
11270
}
11271
11272
if (pd->fragoff != 0)
11273
pd->virtual_proto = PF_VPROTO_FRAGMENT;
11274
11275
break;
11276
}
11277
#endif /* INET6 */
11278
default:
11279
panic("pf_setup_pdesc called with illegal af %u", af);
11280
}
11281
11282
switch (pd->virtual_proto) {
11283
case IPPROTO_TCP: {
11284
struct tcphdr *th = &pd->hdr.tcp;
11285
11286
if (!pf_pull_hdr(pd->m, pd->off, th, sizeof(*th),
11287
reason, af)) {
11288
*action = PF_DROP;
11289
REASON_SET(reason, PFRES_SHORT);
11290
return (PF_DROP);
11291
}
11292
pd->hdrlen = sizeof(*th);
11293
pd->p_len = pd->tot_len - pd->off - (th->th_off << 2);
11294
pd->sport = &th->th_sport;
11295
pd->dport = &th->th_dport;
11296
pd->pcksum = &th->th_sum;
11297
break;
11298
}
11299
case IPPROTO_UDP: {
11300
struct udphdr *uh = &pd->hdr.udp;
11301
11302
if (!pf_pull_hdr(pd->m, pd->off, uh, sizeof(*uh),
11303
reason, af)) {
11304
*action = PF_DROP;
11305
REASON_SET(reason, PFRES_SHORT);
11306
return (PF_DROP);
11307
}
11308
pd->hdrlen = sizeof(*uh);
11309
if (uh->uh_dport == 0 ||
11310
ntohs(uh->uh_ulen) > pd->m->m_pkthdr.len - pd->off ||
11311
ntohs(uh->uh_ulen) < sizeof(struct udphdr)) {
11312
*action = PF_DROP;
11313
REASON_SET(reason, PFRES_SHORT);
11314
return (PF_DROP);
11315
}
11316
pd->sport = &uh->uh_sport;
11317
pd->dport = &uh->uh_dport;
11318
pd->pcksum = &uh->uh_sum;
11319
break;
11320
}
11321
case IPPROTO_SCTP: {
11322
if (!pf_pull_hdr(pd->m, pd->off, &pd->hdr.sctp, sizeof(pd->hdr.sctp),
11323
reason, af)) {
11324
*action = PF_DROP;
11325
REASON_SET(reason, PFRES_SHORT);
11326
return (PF_DROP);
11327
}
11328
pd->hdrlen = sizeof(pd->hdr.sctp);
11329
pd->p_len = pd->tot_len - pd->off;
11330
11331
pd->sport = &pd->hdr.sctp.src_port;
11332
pd->dport = &pd->hdr.sctp.dest_port;
11333
if (pd->hdr.sctp.src_port == 0 || pd->hdr.sctp.dest_port == 0) {
11334
*action = PF_DROP;
11335
REASON_SET(reason, PFRES_SHORT);
11336
return (PF_DROP);
11337
}
11338
11339
/*
11340
* Placeholder. The SCTP checksum is 32-bits, but
11341
* pf_test_state() expects to update a 16-bit checksum.
11342
* Provide a dummy value which we'll subsequently ignore.
11343
* Do this before pf_scan_sctp() so any jobs we enqueue
11344
* have a pcksum set.
11345
*/
11346
pd->pcksum = &pd->sctp_dummy_sum;
11347
11348
if (pf_scan_sctp(pd) != PF_PASS) {
11349
*action = PF_DROP;
11350
REASON_SET(reason, PFRES_SHORT);
11351
return (PF_DROP);
11352
}
11353
break;
11354
}
11355
case IPPROTO_ICMP: {
11356
if (!pf_pull_hdr(pd->m, pd->off, &pd->hdr.icmp, ICMP_MINLEN,
11357
reason, af)) {
11358
*action = PF_DROP;
11359
REASON_SET(reason, PFRES_SHORT);
11360
return (PF_DROP);
11361
}
11362
pd->pcksum = &pd->hdr.icmp.icmp_cksum;
11363
pd->hdrlen = ICMP_MINLEN;
11364
break;
11365
}
11366
#ifdef INET6
11367
case IPPROTO_ICMPV6: {
11368
size_t icmp_hlen = sizeof(struct icmp6_hdr);
11369
11370
if (!pf_pull_hdr(pd->m, pd->off, &pd->hdr.icmp6, icmp_hlen,
11371
reason, af)) {
11372
*action = PF_DROP;
11373
REASON_SET(reason, PFRES_SHORT);
11374
return (PF_DROP);
11375
}
11376
/* ICMP headers we look further into to match state */
11377
switch (pd->hdr.icmp6.icmp6_type) {
11378
case MLD_LISTENER_QUERY:
11379
case MLD_LISTENER_REPORT:
11380
icmp_hlen = sizeof(struct mld_hdr);
11381
break;
11382
case ND_NEIGHBOR_SOLICIT:
11383
case ND_NEIGHBOR_ADVERT:
11384
icmp_hlen = sizeof(struct nd_neighbor_solicit);
11385
/* FALLTHROUGH */
11386
case ND_ROUTER_SOLICIT:
11387
case ND_ROUTER_ADVERT:
11388
case ND_REDIRECT:
11389
if (pd->ttl != 255) {
11390
REASON_SET(reason, PFRES_NORM);
11391
return (PF_DROP);
11392
}
11393
break;
11394
}
11395
if (icmp_hlen > sizeof(struct icmp6_hdr) &&
11396
!pf_pull_hdr(pd->m, pd->off, &pd->hdr.icmp6, icmp_hlen,
11397
reason, af)) {
11398
*action = PF_DROP;
11399
REASON_SET(reason, PFRES_SHORT);
11400
return (PF_DROP);
11401
}
11402
pd->hdrlen = icmp_hlen;
11403
pd->pcksum = &pd->hdr.icmp6.icmp6_cksum;
11404
break;
11405
}
11406
#endif /* INET6 */
11407
default:
11408
/*
11409
* Placeholder value, so future calls to pf_change_ap() don't
11410
* try to update a NULL checksum pointer.
11411
*/
11412
pd->pcksum = &pd->sctp_dummy_sum;
11413
break;
11414
}
11415
11416
if (pd->sport)
11417
pd->osport = pd->nsport = *pd->sport;
11418
if (pd->dport)
11419
pd->odport = pd->ndport = *pd->dport;
11420
11421
MPASS(pd->pcksum != NULL);
11422
11423
return (PF_PASS);
11424
}
11425
11426
static __inline void
11427
pf_rule_counters_inc(struct pf_pdesc *pd, struct pf_krule *r, int dir_out,
11428
int op_pass, sa_family_t af, struct pf_addr *src_host,
11429
struct pf_addr *dst_host)
11430
{
11431
pf_counter_u64_add_protected(&(r->packets[dir_out]), 1);
11432
pf_counter_u64_add_protected(&(r->bytes[dir_out]), pd->tot_len);
11433
pf_update_timestamp(r);
11434
11435
if (r->src.addr.type == PF_ADDR_TABLE)
11436
pfr_update_stats(r->src.addr.p.tbl, src_host, af,
11437
pd->tot_len, dir_out, op_pass, r->src.neg);
11438
if (r->dst.addr.type == PF_ADDR_TABLE)
11439
pfr_update_stats(r->dst.addr.p.tbl, dst_host, af,
11440
pd->tot_len, dir_out, op_pass, r->dst.neg);
11441
}
11442
11443
static void
11444
pf_counters_inc(int action, struct pf_pdesc *pd, struct pf_kstate *s,
11445
struct pf_krule *r, struct pf_krule *a, struct pf_krule_slist *match_rules)
11446
{
11447
struct pf_krule_slist *mr = match_rules;
11448
struct pf_krule_item *ri;
11449
struct pf_krule *nr = NULL;
11450
struct pf_addr *src_host = pd->src;
11451
struct pf_addr *dst_host = pd->dst;
11452
struct pf_state_key *key;
11453
int dir_out = (pd->dir == PF_OUT);
11454
int op_r_pass = (r->action == PF_PASS);
11455
int op_pass = (action == PF_PASS || action == PF_AFRT);
11456
int s_dir_in, s_dir_out, s_dir_rev;
11457
sa_family_t af = pd->af;
11458
11459
pf_counter_u64_critical_enter();
11460
11461
/*
11462
* Set the AF for interface counters; it will later be overwritten for
11463
* rule and state counters with the value from the proper state key.
11464
*/
11465
if (action == PF_AFRT) {
11466
MPASS(s != NULL);
11467
if (s->direction == PF_OUT && dir_out)
11468
af = pd->naf;
11469
}
11470
11471
pf_counter_u64_add_protected(
11472
&pd->kif->pfik_bytes[af == AF_INET6][dir_out][!op_pass],
11473
pd->tot_len);
11474
pf_counter_u64_add_protected(
11475
&pd->kif->pfik_packets[af == AF_INET6][dir_out][!op_pass],
11476
1);
11477
11478
/* If the rule has failed to apply, don't increase its counters */
11479
if (!(op_pass || r->action == PF_DROP)) {
11480
pf_counter_u64_critical_exit();
11481
return;
11482
}
11483
11484
if (s != NULL) {
11485
PF_STATE_LOCK_ASSERT(s);
11486
mr = &(s->match_rules);
11487
11488
/*
11489
* For af-to on the inbound direction we can determine
11490
* the direction of the passing packet only by checking the direction
11491
* of AF translation. The af-to in "in" direction covers both
11492
* the inbound and the outbound side of state tracking,
11493
* so pd->dir is always PF_IN. We set dir_out and s_dir_rev
11494
* in a way to count packets as if the state was outbound,
11495
* because pfctl -ss shows the state with "->", as if it was
11496
* outbound.
11497
*/
11498
if (action == PF_AFRT && s->direction == PF_IN) {
11499
dir_out = (pd->naf == s->rule->naf);
11500
s_dir_in = 1;
11501
s_dir_out = 0;
11502
s_dir_rev = (pd->naf == s->rule->af);
11503
} else {
11504
dir_out = (pd->dir == PF_OUT);
11505
s_dir_in = (s->direction == PF_IN);
11506
s_dir_out = (s->direction == PF_OUT);
11507
s_dir_rev = (pd->dir != s->direction);
11508
}
11509
11510
/* pd->tot_len is problematic with af-to rules. Sure, we can
11511
* agree that it's the post-af-to packet length that was
11512
* forwarded through a state, but what about tables which match
11513
* on pre-af-to addresses? We don't have access to the original
11514
* packet length anymore.
11515
*/
11516
s->packets[s_dir_rev]++;
11517
s->bytes[s_dir_rev] += pd->tot_len;
11518
11519
/*
11520
* Source nodes are accessed unlocked here. But since we are
11521
* operating with stateful tracking and the state is locked,
11522
* those SNs could not have been freed.
11523
*/
11524
for (pf_sn_types_t sn_type=0; sn_type<PF_SN_MAX; sn_type++) {
11525
if (s->sns[sn_type] != NULL) {
11526
counter_u64_add(
11527
s->sns[sn_type]->packets[dir_out],
11528
1);
11529
counter_u64_add(
11530
s->sns[sn_type]->bytes[dir_out],
11531
pd->tot_len);
11532
}
11533
}
11534
11535
/* Start with pre-NAT addresses */
11536
key = s->key[(s->direction == PF_OUT)];
11537
src_host = &(key->addr[s_dir_out]);
11538
dst_host = &(key->addr[s_dir_in]);
11539
af = key->af;
11540
if (s->nat_rule) {
11541
/* Old-style NAT rules */
11542
if (s->nat_rule->action == PF_NAT ||
11543
s->nat_rule->action == PF_RDR ||
11544
s->nat_rule->action == PF_BINAT) {
11545
nr = s->nat_rule;
11546
pf_rule_counters_inc(pd, s->nat_rule, dir_out,
11547
op_r_pass, af, src_host, dst_host);
11548
/* Use post-NAT addresses from now on */
11549
key = s->key[s_dir_in];
11550
src_host = &(key->addr[s_dir_out]);
11551
dst_host = &(key->addr[s_dir_in]);
11552
af = key->af;
11553
}
11554
}
11555
}
11556
11557
SLIST_FOREACH(ri, mr, entry) {
11558
pf_rule_counters_inc(pd, ri->r, dir_out, op_r_pass, af,
11559
src_host, dst_host);
11560
if (s && s->nat_rule == ri->r) {
11561
/* Use post-NAT addresses after a match NAT rule */
11562
key = s->key[s_dir_in];
11563
src_host = &(key->addr[s_dir_out]);
11564
dst_host = &(key->addr[s_dir_in]);
11565
af = key->af;
11566
}
11567
}
11568
11569
if (s == NULL) {
11570
pf_free_match_rules(mr);
11571
}
11572
11573
if (a != NULL) {
11574
pf_rule_counters_inc(pd, a, dir_out, op_r_pass, af,
11575
src_host, dst_host);
11576
}
11577
11578
if (r != nr) {
11579
pf_rule_counters_inc(pd, r, dir_out, op_r_pass, af,
11580
src_host, dst_host);
11581
}
11582
11583
pf_counter_u64_critical_exit();
11584
}
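/*
 * Illustrative sketch, not part of the original source: the interface
 * counters updated in pf_counters_inc() are small multi-dimensional
 * arrays indexed by address family (0 = IPv4, 1 = IPv6), direction
 * (0 = in, 1 = out) and outcome (0 = pass, 1 = block), which is why
 * boolean expressions are used directly as indices.  The helper name
 * is hypothetical.
 */
#if 0
static void
example_kif_account(struct pfi_kkif *kif, sa_family_t af, int dir_out,
    int op_pass, uint64_t len)
{
	pf_counter_u64_critical_enter();
	pf_counter_u64_add_protected(
	    &kif->pfik_bytes[af == AF_INET6][dir_out][!op_pass], len);
	pf_counter_u64_add_protected(
	    &kif->pfik_packets[af == AF_INET6][dir_out][!op_pass], 1);
	pf_counter_u64_critical_exit();
}
#endif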
11585
11586
static void
11587
pf_log_matches(struct pf_pdesc *pd, struct pf_krule *rm,
11588
struct pf_krule *am, struct pf_kruleset *ruleset,
11589
struct pf_krule_slist *match_rules)
11590
{
11591
struct pf_krule_item *ri;
11592
11593
/* if this is the log(matches) rule, packet has been logged already */
11594
if (rm->log & PF_LOG_MATCHES)
11595
return;
11596
11597
SLIST_FOREACH(ri, match_rules, entry)
11598
if (ri->r->log & PF_LOG_MATCHES)
11599
PFLOG_PACKET(rm->action, PFRES_MATCH, rm, am,
11600
ruleset, pd, 1, ri->r);
11601
}
11602
11603
#if defined(INET) || defined(INET6)
11604
int
11605
pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0,
11606
struct inpcb *inp, struct pf_rule_actions *default_actions)
11607
{
11608
struct pfi_kkif *kif;
11609
u_short action, reason = 0;
11610
struct m_tag *mtag;
11611
struct pf_krule *a = NULL, *r = &V_pf_default_rule;
11612
struct pf_kstate *s = NULL;
11613
struct pf_kruleset *ruleset = NULL;
11614
struct pf_krule_item *ri;
11615
struct pf_krule_slist match_rules;
11616
struct pf_pdesc pd;
11617
int use_2nd_queue = 0;
11618
uint16_t tag;
11619
11620
PF_RULES_RLOCK_TRACKER;
11621
KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
11622
M_ASSERTPKTHDR(*m0);
11623
NET_EPOCH_ASSERT();
11624
11625
if (!V_pf_status.running)
11626
		return (PF_PASS);

	kif = (struct pfi_kkif *)ifp->if_pf_kif;

	if (__predict_false(kif == NULL)) {
		DPFPRINTF(PF_DEBUG_URGENT,
		    "%s: kif == NULL, if_xname %s",
		    __func__, ifp->if_xname);
		return (PF_DROP);
	}
	if (kif->pfik_flags & PFI_IFLAG_SKIP) {
		return (PF_PASS);
	}

	if ((*m0)->m_flags & M_SKIP_FIREWALL) {
		return (PF_PASS);
	}

	if (__predict_false(! M_WRITABLE(*m0))) {
		*m0 = m_unshare(*m0, M_NOWAIT);
		if (*m0 == NULL) {
			return (PF_DROP);
		}
	}

	pf_init_pdesc(&pd, *m0);
	SLIST_INIT(&match_rules);

	if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_MTAG_FLAG_ROUTE_TO)) {
		pd.pf_mtag->flags &= ~PF_MTAG_FLAG_ROUTE_TO;

		ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
		    pd.pf_mtag->if_idxgen);
		if (ifp == NULL || ifp->if_flags & IFF_DYING) {
			m_freem(*m0);
			*m0 = NULL;
			return (PF_PASS);
		}
		(ifp->if_output)(ifp, *m0, sintosa(&pd.pf_mtag->dst), NULL);
		*m0 = NULL;
		return (PF_PASS);
	}

	if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL &&
	    pd.pf_mtag->flags & PF_MTAG_FLAG_DUMMYNET) {
		/* Dummynet re-injects packets after they've
		 * completed their delay. We've already
		 * processed them, so pass unconditionally. */

		/* But only once. We may see the packet multiple times (e.g.
		 * PFIL_IN/PFIL_OUT). */
		pf_dummynet_flag_remove(pd.m, pd.pf_mtag);

		return (PF_PASS);
	}

	PF_RULES_RLOCK();

	if (pf_setup_pdesc(af, dir, &pd, m0, &action, &reason,
	    kif, default_actions) != PF_PASS) {
		if (action != PF_PASS)
			pd.act.log |= PF_LOG_FORCE;
		goto done;
	}

#ifdef INET
	if (af == AF_INET && dir == PF_OUT && pflags & PFIL_FWD &&
	    pd.df && (*m0)->m_pkthdr.len > ifp->if_mtu) {
		PF_RULES_RUNLOCK();
		icmp_error(*m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
		    0, ifp->if_mtu);
		*m0 = NULL;
		return (PF_DROP);
	}
#endif /* INET */
#ifdef INET6
	/*
	 * If we end up changing IP addresses (e.g. binat) the stack may get
	 * confused and fail to send the icmp6 packet too big error. Just send
	 * it here, before we do any NAT.
	 */
	if (af == AF_INET6 && dir == PF_OUT && pflags & PFIL_FWD &&
	    in6_ifmtu(ifp) < pf_max_frag_size(*m0)) {
		PF_RULES_RUNLOCK();
		icmp6_error(*m0, ICMP6_PACKET_TOO_BIG, 0, in6_ifmtu(ifp));
		*m0 = NULL;
		return (PF_DROP);
	}
#endif /* INET6 */

	if (__predict_false(ip_divert_ptr != NULL) &&
	    ((mtag = m_tag_locate(pd.m, MTAG_PF_DIVERT, 0, NULL)) != NULL)) {
		struct pf_divert_mtag *dt = (struct pf_divert_mtag *)(mtag+1);
		if ((dt->idir == PF_DIVERT_MTAG_DIR_IN && dir == PF_IN) ||
		    (dt->idir == PF_DIVERT_MTAG_DIR_OUT && dir == PF_OUT)) {
			if (pd.pf_mtag == NULL &&
			    ((pd.pf_mtag = pf_get_mtag(pd.m)) == NULL)) {
				action = PF_DROP;
				goto done;
			}
			pd.pf_mtag->flags |= PF_MTAG_FLAG_PACKET_LOOPED;
		}
		if (pd.pf_mtag && pd.pf_mtag->flags & PF_MTAG_FLAG_FASTFWD_OURS_PRESENT) {
			pd.m->m_flags |= M_FASTFWD_OURS;
			pd.pf_mtag->flags &= ~PF_MTAG_FLAG_FASTFWD_OURS_PRESENT;
		}
		m_tag_delete(pd.m, mtag);

		mtag = m_tag_locate(pd.m, MTAG_IPFW_RULE, 0, NULL);
		if (mtag != NULL)
			m_tag_delete(pd.m, mtag);
	}

	switch (pd.virtual_proto) {
	case PF_VPROTO_FRAGMENT:
		/*
		 * handle fragments that aren't reassembled by
		 * normalization
		 */
		if (kif == NULL || r == NULL) /* pflog */
			action = PF_DROP;
		else
			action = pf_test_rule(&r, &s, &pd, &a,
			    &ruleset, &reason, inp, &match_rules);
		if (action != PF_PASS)
			REASON_SET(&reason, PFRES_FRAG);
		break;

	case IPPROTO_TCP: {
		/* Respond to SYN with a syncookie. */
		if ((tcp_get_flags(&pd.hdr.tcp) & (TH_SYN|TH_ACK|TH_RST)) == TH_SYN &&
		    pd.dir == PF_IN && pf_synflood_check(&pd)) {
			pf_syncookie_send(&pd, &reason);
			action = PF_DROP;
			break;
		}

		if ((tcp_get_flags(&pd.hdr.tcp) & TH_ACK) && pd.p_len == 0)
			use_2nd_queue = 1;
		action = pf_normalize_tcp(&pd);
		if (action == PF_DROP)
			break;
		action = pf_test_state(&s, &pd, &reason);
		if (action == PF_PASS || action == PF_AFRT) {
			if (s != NULL) {
				if (V_pfsync_update_state_ptr != NULL)
					V_pfsync_update_state_ptr(s);
				r = s->rule;
				a = s->anchor;
			}
		} else if (s == NULL) {
			/* Validate remote SYN|ACK, re-create original SYN if
			 * valid. */
			if ((tcp_get_flags(&pd.hdr.tcp) & (TH_SYN|TH_ACK|TH_RST)) ==
			    TH_ACK && pf_syncookie_validate(&pd) &&
			    pd.dir == PF_IN) {
				struct mbuf *msyn;

				msyn = pf_syncookie_recreate_syn(&pd, &reason);
				if (msyn == NULL) {
					action = PF_DROP;
					break;
				}

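				/*
				 * Run the rebuilt SYN back through pf_test()
				 * so state is created as if the original SYN
				 * had been seen, then discard the copy.
				 */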
				action = pf_test(af, dir, pflags, ifp, &msyn, inp,
				    &pd.act);
				m_freem(msyn);
				if (action != PF_PASS)
					break;

				action = pf_test_state(&s, &pd, &reason);
				if (action != PF_PASS || s == NULL) {
					action = PF_DROP;
					break;
				}

				s->src.seqhi = ntohl(pd.hdr.tcp.th_ack) - 1;
				s->src.seqlo = ntohl(pd.hdr.tcp.th_seq) - 1;
				pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_DST);
				action = pf_synproxy(&pd, s, &reason);
				break;
			} else {
				action = pf_test_rule(&r, &s, &pd,
				    &a, &ruleset, &reason, inp, &match_rules);
			}
		}
		break;
	}

	case IPPROTO_SCTP:
		action = pf_normalize_sctp(&pd);
		if (action == PF_DROP)
			break;
		/* fallthrough */
	case IPPROTO_UDP:
	default:
		action = pf_test_state(&s, &pd, &reason);
		if (action == PF_PASS || action == PF_AFRT) {
			if (s != NULL) {
				if (V_pfsync_update_state_ptr != NULL)
					V_pfsync_update_state_ptr(s);
				r = s->rule;
				a = s->anchor;
			}
		} else if (s == NULL) {
			action = pf_test_rule(&r, &s,
			    &pd, &a, &ruleset, &reason, inp, &match_rules);
		}
		break;

	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6: {
		if (pd.virtual_proto == IPPROTO_ICMP && af != AF_INET) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_NORM);
			DPFPRINTF(PF_DEBUG_MISC,
			    "pf: dropping IPv6 packet with ICMPv4 payload");
			break;
		}
		if (pd.virtual_proto == IPPROTO_ICMPV6 && af != AF_INET6) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_NORM);
			DPFPRINTF(PF_DEBUG_MISC,
			    "pf: dropping IPv4 packet with ICMPv6 payload");
			break;
		}
		action = pf_test_state_icmp(&s, &pd, &reason);
		if (action == PF_PASS || action == PF_AFRT) {
			if (s != NULL) {
				if (V_pfsync_update_state_ptr != NULL)
					V_pfsync_update_state_ptr(s);
				r = s->rule;
				a = s->anchor;
			}
		} else if (s == NULL)
			action = pf_test_rule(&r, &s, &pd,
			    &a, &ruleset, &reason, inp, &match_rules);
		break;
	}

	}

done:
	PF_RULES_RUNLOCK();

	/* if packet sits in reassembly queue, return without error */
	if (pd.m == NULL) {
		pf_free_match_rules(&match_rules);
		goto eat_pkt;
	}

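	/*
	 * If a state was found, use the rule actions cached in it, so the
	 * checks below (IP options, maximum packet size) see the options of
	 * the rule that created the state.
	 */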
	if (s)
		memcpy(&pd.act, &s->act, sizeof(s->act));

	if (action == PF_PASS && pd.badopts != 0 && !pd.act.allow_opts) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_IPOPTIONS);
		pd.act.log = PF_LOG_FORCE;
		DPFPRINTF(PF_DEBUG_MISC,
		    "pf: dropping packet with dangerous headers");
	}

	if (pd.act.max_pkt_size && pd.tot_len > pd.act.max_pkt_size) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_NORM);
		pd.act.log = PF_LOG_FORCE;
		DPFPRINTF(PF_DEBUG_MISC,
		    "pf: dropping overly long packet");
	}

	if (s) {
		uint8_t log = pd.act.log;
		memcpy(&pd.act, &s->act, sizeof(struct pf_rule_actions));
		pd.act.log |= log;
		tag = s->tag;
	} else {
		tag = r->tag;
	}

	if (tag > 0 && pf_tag_packet(&pd, tag)) {
		action = PF_DROP;
		REASON_SET(&reason, PFRES_MEMORY);
	}

	pf_scrub(&pd);
	if (pd.proto == IPPROTO_TCP && pd.act.max_mss)
		pf_normalize_mss(&pd);

	if (pd.act.rtableid >= 0)
		M_SETFIB(pd.m, pd.act.rtableid);

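	/*
	 * "set prio" maps to the 802.1q priority (PCP); low-delay TOS selects
	 * the alternate priority value.
	 */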
	if (pd.act.flags & PFSTATE_SETPRIO) {
		if (pd.tos & IPTOS_LOWDELAY)
			use_2nd_queue = 1;
		if (vlan_set_pcp(pd.m, pd.act.set_prio[use_2nd_queue])) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
			pd.act.log = PF_LOG_FORCE;
			DPFPRINTF(PF_DEBUG_MISC,
			    "pf: failed to allocate 802.1q mtag");
		}
	}

#ifdef ALTQ
	if (action == PF_PASS && pd.act.qid) {
		if (pd.pf_mtag == NULL &&
		    ((pd.pf_mtag = pf_get_mtag(pd.m)) == NULL)) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
		} else {
			if (s != NULL)
				pd.pf_mtag->qid_hash = pf_state_hash(s);
			if (use_2nd_queue || (pd.tos & IPTOS_LOWDELAY))
				pd.pf_mtag->qid = pd.act.pqid;
			else
				pd.pf_mtag->qid = pd.act.qid;
			/* Add hints for ecn. */
			pd.pf_mtag->hdr = mtod(pd.m, void *);
		}
	}
#endif /* ALTQ */

	/*
	 * connections redirected to loopback should not match sockets
	 * bound specifically to loopback due to security implications,
	 * see tcp_input() and in_pcblookup_listen().
	 */
	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule != NULL &&
	    (s->nat_rule->action == PF_RDR ||
	    s->nat_rule->action == PF_BINAT) &&
	    pf_is_loopback(af, pd.dst))
		pd.m->m_flags |= M_SKIP_FIREWALL;

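	/*
	 * Hand matching packets to divert(4), unless they have already been
	 * looped through it (PACKET_LOOPED).
	 */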
	if (action == PF_PASS && r->divert.port && !PACKET_LOOPED(&pd)) {
		mtag = m_tag_alloc(MTAG_PF_DIVERT, 0,
		    sizeof(struct pf_divert_mtag), M_NOWAIT | M_ZERO);
		if (__predict_true(mtag != NULL && ip_divert_ptr != NULL)) {
			((struct pf_divert_mtag *)(mtag+1))->port =
			    ntohs(r->divert.port);
			((struct pf_divert_mtag *)(mtag+1))->idir =
			    (dir == PF_IN) ? PF_DIVERT_MTAG_DIR_IN :
			    PF_DIVERT_MTAG_DIR_OUT;

			pf_counters_inc(action, &pd, s, r, a, &match_rules);

			if (s)
				PF_STATE_UNLOCK(s);

			m_tag_prepend(pd.m, mtag);
			if (pd.m->m_flags & M_FASTFWD_OURS) {
				if (pd.pf_mtag == NULL &&
				    ((pd.pf_mtag = pf_get_mtag(pd.m)) == NULL)) {
					action = PF_DROP;
					REASON_SET(&reason, PFRES_MEMORY);
					pd.act.log = PF_LOG_FORCE;
					DPFPRINTF(PF_DEBUG_MISC,
					    "pf: failed to allocate tag");
				} else {
					pd.pf_mtag->flags |=
					    PF_MTAG_FLAG_FASTFWD_OURS_PRESENT;
					pd.m->m_flags &= ~M_FASTFWD_OURS;
				}
			}
			ip_divert_ptr(*m0, dir == PF_IN);
			*m0 = NULL;
			return (action);
		} else if (mtag == NULL) {
			/* XXX: ipfw has the same behaviour! */
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
			pd.act.log = PF_LOG_FORCE;
			DPFPRINTF(PF_DEBUG_MISC,
			    "pf: failed to allocate divert tag");
		} else {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MATCH);
			pd.act.log = PF_LOG_FORCE;
			DPFPRINTF(PF_DEBUG_MISC,
			    "pf: divert(4) is not loaded");
		}
	}

	/* this flag will need revising if the pkt is forwarded */
	if (pd.pf_mtag)
		pd.pf_mtag->flags &= ~PF_MTAG_FLAG_PACKET_LOOPED;

	if (pd.act.log) {
		struct pf_krule *lr;

		if (s != NULL && s->nat_rule != NULL &&
		    s->nat_rule->log & PF_LOG_ALL)
			lr = s->nat_rule;
		else
			lr = r;

		if (pd.act.log & PF_LOG_FORCE || lr->log & PF_LOG_ALL)
			PFLOG_PACKET(action, reason, lr, a,
			    ruleset, &pd, (s == NULL), NULL);
		if (s) {
			SLIST_FOREACH(ri, &s->match_rules, entry)
				if (ri->r->log & PF_LOG_ALL)
					PFLOG_PACKET(action,
					    reason, ri->r, a, ruleset, &pd, 0, NULL);
		}
	}

	pf_counters_inc(action, &pd, s, r, a, &match_rules);

	switch (action) {
	case PF_SYNPROXY_DROP:
		m_freem(*m0);
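		/* FALLTHROUGH: PF_DEFER also clears *m0 and reports PF_PASS. */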
	case PF_DEFER:
		*m0 = NULL;
		action = PF_PASS;
		break;
	case PF_DROP:
		m_freem(*m0);
		*m0 = NULL;
		break;
	case PF_AFRT:
		if (pf_translate_af(&pd, r)) {
			*m0 = pd.m;
			action = PF_DROP;
			break;
		}
#ifdef INET
		if (pd.naf == AF_INET) {
			action = pf_route(r, kif->pfik_ifp, s, &pd,
			    inp);
		}
#endif /* INET */
#ifdef INET6
		if (pd.naf == AF_INET6) {
			action = pf_route6(r, kif->pfik_ifp, s, &pd,
			    inp);
		}
#endif /* INET6 */
		*m0 = pd.m;
		goto out;
		break;
	default:
		if (pd.act.rt) {
			switch (af) {
#ifdef INET
			case AF_INET:
				/* pf_route() returns unlocked. */
				action = pf_route(r, kif->pfik_ifp, s, &pd,
				    inp);
				break;
#endif /* INET */
#ifdef INET6
			case AF_INET6:
				/* pf_route6() returns unlocked. */
				action = pf_route6(r, kif->pfik_ifp, s, &pd,
				    inp);
				break;
#endif /* INET6 */
			}
			*m0 = pd.m;
			goto out;
		}
		if (pf_dummynet(&pd, s, r, m0) != 0) {
			action = PF_DROP;
			REASON_SET(&reason, PFRES_MEMORY);
		}
		break;
	}

eat_pkt:
	SDT_PROBE4(pf, ip, test, done, action, reason, r, s);

	if (s && action != PF_DROP) {
		if (!s->if_index_in && dir == PF_IN)
			s->if_index_in = ifp->if_index;
		else if (!s->if_index_out && dir == PF_OUT)
			s->if_index_out = ifp->if_index;
	}

	if (s)
		PF_STATE_UNLOCK(s);

out:
#ifdef INET6
	/* If reassembled packet passed, create new fragments. */
	if (af == AF_INET6 && action == PF_PASS && *m0 && dir == PF_OUT &&
	    (! (pflags & PF_PFIL_NOREFRAGMENT)) &&
	    (mtag = m_tag_find(pd.m, PACKET_TAG_PF_REASSEMBLED, NULL)) != NULL)
		action = pf_refragment6(ifp, m0, mtag, NULL, pflags & PFIL_FWD);
#endif /* INET6 */

	pf_sctp_multihome_delayed(&pd, kif, s, action);

	return (action);
}
#endif /* INET || INET6 */
