GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netlink/netlink_domain.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2021 Ng Peng Nam Sean
 * Copyright (c) 2022 Alexander V. Chernikov <[email protected]>
 * Copyright (c) 2023 Gleb Smirnoff <[email protected]>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This file contains socket and protocol bindings for netlink.
 */
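
/*
 * Illustrative sketch (not part of the original sources): the userland
 * view of these bindings.  socket(2) lands in nl_attach() and bind(2) in
 * nl_bind() below.  NETLINK_ROUTE is used as the protocol here and error
 * handling is omitted.
 *
 *	#include <sys/socket.h>
 *	#include <netlink/netlink.h>
 *
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *	struct sockaddr_nl snl = {
 *		.nl_len = sizeof(snl),
 *		.nl_family = AF_NETLINK,
 *		.nl_pid = 0,
 *	};
 *	bind(fd, (struct sockaddr *)&snl, sizeof(snl));
 *
 * With nl_pid == 0 the kernel picks a port id itself, see nl_find_port()
 * and nl_autobind_port() below.
 */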

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/domain.h>
#include <sys/jail.h>
#include <sys/mbuf.h>
#include <sys/osd.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/ck.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/priv.h>
#include <sys/uio.h>

#include <netlink/netlink.h>
#include <netlink/netlink_ctl.h>
#include <netlink/netlink_var.h>

#define DEBUG_MOD_NAME	nl_domain
#define DEBUG_MAX_LEVEL	LOG_DEBUG3
#include <netlink/netlink_debug.h>
_DECLARE_DEBUG(LOG_INFO);

_Static_assert((NLP_MAX_GROUPS % 64) == 0,
    "NLP_MAX_GROUPS has to be multiple of 64");
_Static_assert(NLP_MAX_GROUPS >= 64,
    "NLP_MAX_GROUPS has to be at least 64");

#define NLCTL_TRACKER		struct rm_priotracker nl_tracker
#define NLCTL_RLOCK()		rm_rlock(&V_nl_ctl.ctl_lock, &nl_tracker)
#define NLCTL_RUNLOCK()		rm_runlock(&V_nl_ctl.ctl_lock, &nl_tracker)
#define NLCTL_LOCK_ASSERT()	rm_assert(&V_nl_ctl.ctl_lock, RA_LOCKED)

#define NLCTL_WLOCK()		rm_wlock(&V_nl_ctl.ctl_lock)
#define NLCTL_WUNLOCK()		rm_wunlock(&V_nl_ctl.ctl_lock)
#define NLCTL_WLOCK_ASSERT()	rm_assert(&V_nl_ctl.ctl_lock, RA_WLOCKED)

static u_long nl_sendspace = NLSNDQ;
SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
    "Default netlink socket send space");

static u_long nl_recvspace = NLSNDQ;
SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
    "Default netlink socket receive space");

extern u_long sb_max_adj;
static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf,
    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0,
    sysctl_handle_nl_maxsockbuf, "LU",
    "Maximum Netlink socket buffer size");

static unsigned int osd_slot_id = 0;

void
nl_osd_register(void)
{
	osd_slot_id = osd_register(OSD_THREAD, NULL, NULL);
}

void
nl_osd_unregister(void)
{
	osd_deregister(OSD_THREAD, osd_slot_id);
}

struct nlpcb *
_nl_get_thread_nlp(struct thread *td)
{
	return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id));
}

void
nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp)
{
	NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id);
	if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0)
		return;
	/* Failed, need to realloc */
	void **rsv = osd_reserve(osd_slot_id);
	osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp);
}

/*
 * Looks up an nlpcb struct based on the @port_id.  The caller must hold
 * the NLCTL lock.
 * Returns the nlpcb pointer if present, otherwise NULL.
 */
static struct nlpcb *
nl_port_lookup(uint32_t port_id)
{
	struct nlpcb *nlp;

	CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_port_head, nl_port_next) {
		if (nlp->nl_port == port_id)
			return (nlp);
	}
	return (NULL);
}

static void
nlp_join_group(struct nlpcb *nlp, unsigned int group_id)
{
	MPASS(group_id < NLP_MAX_GROUPS);
	NLCTL_WLOCK_ASSERT();

	/* TODO: add family handler callback */
	if (!nlp_unconstrained_vnet(nlp))
		return;

	BIT_SET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups);
}

static void
nlp_leave_group(struct nlpcb *nlp, unsigned int group_id)
{
	MPASS(group_id < NLP_MAX_GROUPS);
	NLCTL_WLOCK_ASSERT();

	BIT_CLR(NLP_MAX_GROUPS, group_id, &nlp->nl_groups);
}

static bool
nlp_memberof_group(struct nlpcb *nlp, unsigned int group_id)
{
	MPASS(group_id < NLP_MAX_GROUPS);
	NLCTL_LOCK_ASSERT();

	return (BIT_ISSET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups));
}

/*
 * Returns the memberships of the first 32 groups as a legacy bitmask,
 * with group N mapped to bit N - 1.
 */
static uint32_t
nlp_get_groups_compat(struct nlpcb *nlp)
{
	uint32_t groups_mask = 0;

	NLCTL_LOCK_ASSERT();

	for (int i = 0; i < 32; i++) {
		if (nlp_memberof_group(nlp, i + 1))
			groups_mask |= (1 << i);
	}

	return (groups_mask);
}

static struct nl_buf *
nl_buf_copy(struct nl_buf *nb)
{
	struct nl_buf *copy;

	copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
	if (__predict_false(copy == NULL))
		return (NULL);
	memcpy(copy, nb, sizeof(*nb) + nb->buflen);

	return (copy);
}

/*
 * Broadcasts the message in the writer's buffer to all sockets that are
 * members of the destination group.
 */
bool
nl_send_group(struct nl_writer *nw)
{
	struct nl_buf *nb = nw->buf;
	struct nlpcb *nlp_last = NULL;
	struct nlpcb *nlp;
	NLCTL_TRACKER;

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
		struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data;
		NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
		    nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
		    nw->group.proto, nw->group.id);
	}

	nw->buf = NULL;

	/*
	 * Deliver a copy of the buffer to every matching member except the
	 * last one found, which receives the original buffer.
	 */
	NLCTL_RLOCK();
	CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next) {
		if ((nw->group.priv == 0 || priv_check_cred(
		    nlp->nl_socket->so_cred, nw->group.priv) == 0) &&
		    nlp->nl_proto == nw->group.proto &&
		    nlp_memberof_group(nlp, nw->group.id)) {
			if (nlp_last != NULL) {
				struct nl_buf *copy;

				copy = nl_buf_copy(nb);
				if (copy != NULL) {
					nw->buf = copy;
					(void)nl_send(nw, nlp_last);
				} else {
					NLP_LOCK(nlp_last);
					if (nlp_last->nl_socket != NULL)
						sorwakeup(nlp_last->nl_socket);
					NLP_UNLOCK(nlp_last);
				}
			}
			nlp_last = nlp;
		}
	}
	if (nlp_last != NULL) {
		nw->buf = nb;
		(void)nl_send(nw, nlp_last);
	} else
		nl_buf_free(nb);

	NLCTL_RUNLOCK();

	return (true);
}

void
nl_clear_group(u_int group)
{
	struct nlpcb *nlp;

	NLCTL_WLOCK();
	CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next)
		if (nlp_memberof_group(nlp, group))
			nlp_leave_group(nlp, group);
	NLCTL_WUNLOCK();
}

static uint32_t
nl_find_port(void)
{
	/*
	 * An application can open multiple netlink sockets.
	 * Start with the current pid; if it is already taken,
	 * try random numbers in the 256K..320K range,
	 * avoiding clashes with pids.
	 */
	if (nl_port_lookup(curproc->p_pid) == NULL)
		return (curproc->p_pid);
	for (int i = 0; i < 16; i++) {
		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
		if (nl_port_lookup(nl_port) == NULL)
			return (nl_port);
		NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port);
	}
	return (curproc->p_pid);
}

static int
nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
{
	if (nlp->nl_bound) {
		if (nlp->nl_port != snl->nl_pid) {
			NL_LOG(LOG_DEBUG,
			    "bind() failed: program pid %d "
			    "is different from provided pid %d",
			    nlp->nl_port, snl->nl_pid);
			return (EINVAL); /* XXX: better error */
		}
	} else {
		if (snl->nl_pid == 0)
			snl->nl_pid = nl_find_port();
		if (nl_port_lookup(snl->nl_pid) != NULL)
			return (EADDRINUSE);
		nlp->nl_port = snl->nl_pid;
		nlp->nl_bound = true;
		CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_port_head, nlp, nl_port_next);
	}
	/* Bit i of the legacy nl_groups bitmask selects group i + 1. */
	for (int i = 0; i < 32; i++) {
		if (snl->nl_groups & ((uint32_t)1 << i))
			nlp_join_group(nlp, i + 1);
		else
			nlp_leave_group(nlp, i + 1);
	}

	return (0);
}

static int
nl_attach(struct socket *so, int proto, struct thread *td)
{
	struct nlpcb *nlp;
	int error;

	if (__predict_false(netlink_unloading != 0))
		return (EAFNOSUPPORT);

	error = nl_verify_proto(proto);
	if (error != 0)
		return (error);

	bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
	NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
	    so, is_linux ? "(linux) " : "", curproc->p_pid,
	    nl_get_proto_name(proto));

	nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
	error = soreserve(so, nl_sendspace, nl_recvspace);
	if (error != 0) {
		free(nlp, M_PCB);
		return (error);
	}
	TAILQ_INIT(&so->so_rcv.nl_queue);
	TAILQ_INIT(&so->so_snd.nl_queue);
	so->so_pcb = nlp;
	nlp->nl_socket = so;
	nlp->nl_proto = proto;
	nlp->nl_process_id = curproc->p_pid;
	nlp->nl_linux = is_linux;
	nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred);
	nlp->nl_need_thread_setup = true;
	NLP_LOCK_INIT(nlp);
	refcount_init(&nlp->nl_refcount, 1);

	nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
	    taskqueue_thread_enqueue, &nlp->nl_taskqueue);
	TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
	taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
	    "netlink_socket (PID %u)", nlp->nl_process_id);

	NLCTL_WLOCK();
	CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_pcb_head, nlp, nl_next);
	NLCTL_WUNLOCK();

	soisconnected(so);

	return (0);
}

static int
nl_bind(struct socket *so, struct sockaddr *sa, struct thread *td)
{
	struct nlpcb *nlp = sotonlpcb(so);
	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
	int error;

	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	if (snl->nl_len != sizeof(*snl)) {
		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
		return (EINVAL);
	}

	NLCTL_WLOCK();
	NLP_LOCK(nlp);
	error = nl_bind_locked(nlp, snl);
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK();
	NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
	    snl->nl_pid, snl->nl_groups, error);

	return (error);
}
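
/*
 * Illustrative sketch (not part of the original sources): joining the
 * first 32 multicast groups at bind(2) time via the legacy nl_groups
 * bitmask, where bit i selects group i + 1, matching the loop in
 * nl_bind_locked() above.  The RTNLGRP_IPV4_ROUTE constant is one
 * plausible rtnetlink group; error handling is omitted.
 *
 *	struct sockaddr_nl snl = {
 *		.nl_len = sizeof(snl),
 *		.nl_family = AF_NETLINK,
 *		.nl_pid = 0,
 *		.nl_groups = 1U << (RTNLGRP_IPV4_ROUTE - 1),
 *	};
 *	bind(fd, (struct sockaddr *)&snl, sizeof(snl));
 */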

static int
nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
{
	struct sockaddr_nl snl = {
		.nl_pid = port_id,
	};
	int error;

	NLCTL_WLOCK();
	NLP_LOCK(nlp);
	snl.nl_groups = nlp_get_groups_compat(nlp);
	error = nl_bind_locked(nlp, &snl);
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK();

	NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error);
	return (error);
}

/*
 * nl_autobind_port binds an unused port id to @nlp.
 * @nlp: pcb data for the netlink socket
 * @candidate_id: first id to consider
 */
static int
nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
{
	uint32_t port_id = candidate_id;
	NLCTL_TRACKER;
	bool exist;
	int error = EADDRINUSE;

	for (int i = 0; i < 10; i++) {
		NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
		NLCTL_RLOCK();
		exist = nl_port_lookup(port_id) != NULL;
		NLCTL_RUNLOCK();
		if (!exist) {
			error = nl_assign_port(nlp, port_id);
			if (error != EADDRINUSE)
				break;
		}
		port_id++;
	}
	NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
	return (error);
}

static int
nl_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
{
	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
	struct nlpcb *nlp;

	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	if (snl->nl_len != sizeof(*snl)) {
		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
		return (EINVAL);
	}

	nlp = sotonlpcb(so);
	if (!nlp->nl_bound) {
		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
		if (error != 0) {
			NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
			return (error);
		}
	}
	/* XXX: Handle socket flags & multicast */
	soisconnected(so);

	NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);

	return (0);
}

static void
destroy_nlpcb_epoch(epoch_context_t ctx)
{
	struct nlpcb *nlp;

	nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);

	NLP_LOCK_DESTROY(nlp);
	free(nlp, M_PCB);
}

static void
nl_close(struct socket *so)
{
	MPASS(sotonlpcb(so) != NULL);
	struct nlpcb *nlp;
	struct nl_buf *nb;

	NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
	nlp = sotonlpcb(so);

	/* Mark as inactive so no new work can be enqueued */
	NLP_LOCK(nlp);
	bool was_bound = nlp->nl_bound;
	NLP_UNLOCK(nlp);

	/* Wait until all scheduled work has completed */
	taskqueue_drain_all(nlp->nl_taskqueue);
	taskqueue_free(nlp->nl_taskqueue);

	NLCTL_WLOCK();
	NLP_LOCK(nlp);
	if (was_bound) {
		CK_LIST_REMOVE(nlp, nl_port_next);
		NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
	}
	CK_LIST_REMOVE(nlp, nl_next);
	nlp->nl_socket = NULL;
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK();

	so->so_pcb = NULL;

	while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
		TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
		nl_buf_free(nb);
	}
	while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
		TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
		nl_buf_free(nb);
	}

	NL_LOG(LOG_DEBUG3, "socket %p, detached", so);

	/* XXX: is delayed free needed? */
	NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
}

static int
nl_disconnect(struct socket *so)
{
	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	MPASS(sotonlpcb(so) != NULL);
	return (ENOTCONN);
}

static int
nl_sockaddr(struct socket *so, struct sockaddr *sa)
{

	*(struct sockaddr_nl *)sa = (struct sockaddr_nl){
		/* TODO: set other fields */
		.nl_len = sizeof(struct sockaddr_nl),
		.nl_family = AF_NETLINK,
		.nl_pid = sotonlpcb(so)->nl_port,
	};

	return (0);
}

static int
nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *m, struct mbuf *control, int flags, struct thread *td)
{
	struct nlpcb *nlp = sotonlpcb(so);
	struct sockbuf *sb = &so->so_snd;
	struct nl_buf *nb;
	size_t len;
	int error;

	MPASS(m == NULL && uio != NULL);

	if (__predict_false(control != NULL)) {
		m_freem(control);
		return (EINVAL);
	}

	if (__predict_false(flags & MSG_OOB))	/* XXXGL: or just ignore? */
		return (EOPNOTSUPP);

	if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr)))
		return (ENOBUFS);		/* XXXGL: any better error? */

	if (__predict_false(uio->uio_resid > sb->sb_hiwat))
		return (EMSGSIZE);

	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);

	len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
	if (nlp->nl_linux)
		len += roundup2(uio->uio_resid, 8);
	nb = nl_buf_alloc(len, M_WAITOK);
	nb->datalen = uio->uio_resid;
	error = uiomove(&nb->data[0], uio->uio_resid, uio);
	if (__predict_false(error))
		goto out;

	NL_LOG(LOG_DEBUG2, "sending message to kernel %u bytes", nb->datalen);

	SOCK_SENDBUF_LOCK(so);
restart:
	/* Queue the buffer if there is space, otherwise block or bail out. */
	if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) {
		TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
		sb->sb_acc += nb->datalen;
		sb->sb_ccc += nb->datalen;
		nb = NULL;
	} else if ((so->so_state & SS_NBIO) ||
	    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
		SOCK_SENDBUF_UNLOCK(so);
		error = EWOULDBLOCK;
		goto out;
	} else {
		if ((error = sbwait(so, SO_SND)) != 0) {
			SOCK_SENDBUF_UNLOCK(so);
			goto out;
		} else
			goto restart;
	}
	SOCK_SENDBUF_UNLOCK(so);

	if (nb == NULL) {
		NL_LOG(LOG_DEBUG3, "success");
		NLP_LOCK(nlp);
		nl_schedule_taskqueue(nlp);
		NLP_UNLOCK(nlp);
	}

out:
	SOCK_IO_SEND_UNLOCK(so);
	if (nb != NULL) {
		NL_LOG(LOG_DEBUG3, "failure, error %d", error);
		nl_buf_free(nb);
	}
	return (error);
}
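
/*
 * Illustrative sketch (not part of the original sources): what a single
 * write into this socket looks like from userland.  The buffer must start
 * with a valid struct nlmsghdr, since nl_sosend() rejects writes shorter
 * than the header.  An RTM_GETLINK dump request is one plausible message;
 * error handling is omitted.
 *
 *	struct {
 *		struct nlmsghdr hdr;
 *		struct ifinfomsg ifm;
 *	} req = {
 *		.hdr.nlmsg_len = sizeof(req),
 *		.hdr.nlmsg_type = RTM_GETLINK,
 *		.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
 *		.hdr.nlmsg_seq = 1,
 *	};
 *	send(fd, &req, sizeof(req), 0);
 */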

/* Create control data for recvmsg(2) on a Netlink socket. */
static struct mbuf *
nl_createcontrol(struct nlpcb *nlp)
{
	struct {
		struct nlattr nla;
		uint32_t val;
	} data[] = {
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
			.val = nlp->nl_process_id,
		},
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
			.val = nlp->nl_port,
		},
	};

	return (sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO,
	    SOL_NETLINK, M_WAITOK));
}
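
/*
 * Illustrative sketch (not part of the original sources): consuming the
 * NETLINK_MSG_INFO control data that nl_createcontrol() builds.  The
 * option must be enabled on the socket first; the cmsg payload is the
 * nlattr array laid out above.
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_NETLINK, NETLINK_MSG_INFO, &one, sizeof(one));
 *	...
 *	recvmsg(fd, &msg, 0);
 *	for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
 *	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_NETLINK &&
 *		    cmsg->cmsg_type == NETLINK_MSG_INFO) {
 *			struct nlattr *nla = (struct nlattr *)CMSG_DATA(cmsg);
 *			...
 *		}
 *	}
 */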

static int
nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp, struct mbuf **controlp, int *flagsp)
{
	static const struct sockaddr_nl nl_empty_src = {
		.nl_len = sizeof(struct sockaddr_nl),
		.nl_family = PF_NETLINK,
		.nl_pid = 0 /* comes from the kernel */
	};
	struct sockbuf *sb = &so->so_rcv;
	struct nlpcb *nlp = sotonlpcb(so);
	struct nl_buf *first, *last, *nb, *next;
	struct nlmsghdr *hdr;
	int flags, error;
	u_int len, overflow, partoff, partlen, msgrcv, datalen;
	bool nonblock, trunc, peek;

	MPASS(mp == NULL && uio != NULL);

	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);

	if (psa != NULL)
		*psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src,
		    M_WAITOK);

	if (controlp != NULL && (nlp->nl_flags & NLF_MSG_INFO))
		*controlp = nl_createcontrol(nlp);

	flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
	trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
	nonblock = (so->so_state & SS_NBIO) ||
	    (flags & (MSG_DONTWAIT | MSG_NBIO));
	peek = flags & MSG_PEEK;

	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (__predict_false(error))
		return (error);

	len = 0;
	overflow = 0;
	msgrcv = 0;
	datalen = 0;

	SOCK_RECVBUF_LOCK(so);
	while ((first = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
		if (nonblock) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (EWOULDBLOCK);
		}
		error = sbwait(so, SO_RCV);
		if (error) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (error);
		}
	}

	/*
	 * The Netlink socket buffer consists of a queue of nl_bufs, but to
	 * userland it should present no boundaries.  However, there are
	 * Netlink messages that shouldn't be split.  The internal invariant
	 * is that a message never spans two nl_bufs.
	 * If a large userland buffer is provided, we traverse the queue
	 * until either the queue end is reached or the buffer is filled.  If
	 * an application provides a buffer that can't fit even a single
	 * message, we truncate the message and lose its tail.  This is the
	 * only condition where we lose data.  If the buffer can fit at least
	 * one message, we return it and won't truncate the next one.
	 *
	 * We use the same code for the normal and the MSG_PEEK case.  On the
	 * first queue pass we scan the nl_bufs and count the length.  If we
	 * can copy out the entire queue in one pass, everything is trivial.
	 * If we can not, we save a pointer to the last (or partial) nl_buf
	 * and in the !peek case split the queue into two pieces.  We can
	 * safely drop the queue lock, as the kernel only appends nl_bufs to
	 * the end of the queue, and we are the exclusive owner of the queue
	 * beginning due to the sleepable lock.
	 * On the second pass we copy data out and in the !peek case free the
	 * nl_bufs.
	 */
	TAILQ_FOREACH(nb, &sb->nl_queue, tailq) {
		u_int offset;

		MPASS(nb->offset < nb->datalen);
		offset = nb->offset;
		while (offset < nb->datalen) {
			hdr = (struct nlmsghdr *)&nb->data[offset];
			MPASS(nb->offset + hdr->nlmsg_len <= nb->datalen);
			if (uio->uio_resid < len + hdr->nlmsg_len) {
				overflow = len + hdr->nlmsg_len -
				    uio->uio_resid;
				partoff = nb->offset;
				if (offset > partoff) {
					partlen = offset - partoff;
					if (!peek) {
						nb->offset = offset;
						datalen += partlen;
					}
				} else if (len == 0 && uio->uio_resid > 0) {
					flags |= MSG_TRUNC;
					partlen = uio->uio_resid;
					if (peek)
						goto nospace;
					datalen += hdr->nlmsg_len;
					if (nb->offset + hdr->nlmsg_len ==
					    nb->datalen) {
						/*
						 * Avoid leaving empty nb.
						 * Process last nb normally.
						 * Trust uiomove() to handle
						 * negative uio_resid.
						 */
						nb = TAILQ_NEXT(nb, tailq);
						overflow = 0;
						partlen = 0;
					} else
						nb->offset += hdr->nlmsg_len;
					msgrcv++;
				} else
					partlen = 0;
				goto nospace;
			}
			len += hdr->nlmsg_len;
			offset += hdr->nlmsg_len;
			MPASS(offset <= nb->buflen);
			msgrcv++;
		}
		MPASS(offset == nb->datalen);
		datalen += nb->datalen - nb->offset;
	}
nospace:
	last = nb;
	if (!peek) {
		if (last == NULL)
			TAILQ_INIT(&sb->nl_queue);
		else {
			/* XXXGL: create TAILQ_SPLIT */
			TAILQ_FIRST(&sb->nl_queue) = last;
			last->tailq.tqe_prev = &TAILQ_FIRST(&sb->nl_queue);
		}
		MPASS(sb->sb_acc >= datalen);
		sb->sb_acc -= datalen;
		sb->sb_ccc -= datalen;
	}
	SOCK_RECVBUF_UNLOCK(so);

	for (nb = first; nb != last; nb = next) {
		next = TAILQ_NEXT(nb, tailq);
		if (__predict_true(error == 0))
			error = uiomove(&nb->data[nb->offset],
			    (int)(nb->datalen - nb->offset), uio);
		if (!peek)
			nl_buf_free(nb);
	}
	if (last != NULL && partlen > 0 && __predict_true(error == 0))
		error = uiomove(&nb->data[partoff], (int)partlen, uio);

	if (trunc && overflow > 0) {
		uio->uio_resid -= overflow;
		MPASS(uio->uio_resid < 0);
	} else
		MPASS(uio->uio_resid >= 0);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv += msgrcv;

	if (flagsp != NULL)
		*flagsp |= flags;

	SOCK_IO_RECV_UNLOCK(so);

	nl_on_transmit(sotonlpcb(so));

	return (error);
}
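
/*
 * Illustrative sketch (not part of the original sources): the MSG_PEEK /
 * MSG_TRUNC handling above lets userland size its buffer before reading.
 * With MSG_TRUNC the returned length is the full size of the next message
 * even if the copy was truncated; error handling is omitted.
 *
 *	char probe[1];
 *	ssize_t need = recv(fd, probe, sizeof(probe), MSG_PEEK | MSG_TRUNC);
 *	char *buf = malloc(need);
 *	ssize_t got = recv(fd, buf, need, 0);
 */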

static int
nl_getoptflag(int sopt_name)
{
	switch (sopt_name) {
	case NETLINK_CAP_ACK:
		return (NLF_CAP_ACK);
	case NETLINK_EXT_ACK:
		return (NLF_EXT_ACK);
	case NETLINK_GET_STRICT_CHK:
		return (NLF_STRICT);
	case NETLINK_MSG_INFO:
		return (NLF_MSG_INFO);
	}

	return (0);
}

static int
nl_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct nlpcb *nlp = sotonlpcb(so);
	uint32_t flag;
	int optval, error = 0;
	NLCTL_TRACKER;

	NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
	    so, sopt->sopt_name);

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case NETLINK_ADD_MEMBERSHIP:
		case NETLINK_DROP_MEMBERSHIP:
			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
			if (error != 0)
				break;
			if (optval <= 0 || optval >= NLP_MAX_GROUPS) {
				error = ERANGE;
				break;
			}
			NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval);

			NLCTL_WLOCK();
			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
				nlp_join_group(nlp, optval);
			else
				nlp_leave_group(nlp, optval);
			NLCTL_WUNLOCK();
			break;
		case NETLINK_CAP_ACK:
		case NETLINK_EXT_ACK:
		case NETLINK_GET_STRICT_CHK:
		case NETLINK_MSG_INFO:
			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
			if (error != 0)
				break;

			flag = nl_getoptflag(sopt->sopt_name);

			if ((flag == NLF_MSG_INFO) && nlp->nl_linux) {
				error = EINVAL;
				break;
			}

			NLCTL_WLOCK();
			if (optval != 0)
				nlp->nl_flags |= flag;
			else
				nlp->nl_flags &= ~flag;
			NLCTL_WUNLOCK();
			break;
		default:
			error = ENOPROTOOPT;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case NETLINK_LIST_MEMBERSHIPS:
			NLCTL_RLOCK();
			optval = nlp_get_groups_compat(nlp);
			NLCTL_RUNLOCK();
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		case NETLINK_CAP_ACK:
		case NETLINK_EXT_ACK:
		case NETLINK_GET_STRICT_CHK:
		case NETLINK_MSG_INFO:
			NLCTL_RLOCK();
			optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0;
			NLCTL_RUNLOCK();
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			error = ENOPROTOOPT;
		}
		break;
	default:
		error = ENOPROTOOPT;
	}

	return (error);
}
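
/*
 * Illustrative sketch (not part of the original sources): group
 * membership management via setsockopt(2), served by the SOPT_SET path
 * above.  Unlike the 32-bit nl_groups bitmask used at bind(2) time, this
 * interface addresses any group up to NLP_MAX_GROUPS by number.
 * RTNLGRP_NEIGH is one plausible group; error handling is omitted.
 *
 *	int group = RTNLGRP_NEIGH;
 *	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
 *	    &group, sizeof(group));
 *	...
 *	setsockopt(fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP,
 *	    &group, sizeof(group));
 */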

static int
sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	u_long tmp_maxsockbuf = nl_maxsockbuf;

	error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req);
	if (error || !req->newptr)
		return (error);
	if (tmp_maxsockbuf < MSIZE + MCLBYTES)
		return (EINVAL);
	nl_maxsockbuf = tmp_maxsockbuf;

	return (0);
}

static int
nl_setsbopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	bool result;

	if (sopt->sopt_name != SO_RCVBUF)
		return (sbsetopt(so, sopt));

	/* Allow overriding the maximum buffer size under certain conditions. */

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
	if (error != 0)
		return (error);
	NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval);
	if (optval > sb_max_adj) {
		if (priv_check(curthread, PRIV_NET_ROUTE) != 0)
			return (EPERM);
	}

	SOCK_RECVBUF_LOCK(so);
	result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread);
	SOCK_RECVBUF_UNLOCK(so);

	return (result ? 0 : ENOBUFS);
}
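
/*
 * Illustrative sketch (not part of the original sources): the override
 * that nl_setsbopt() implements.  A process holding PRIV_NET_ROUTE may
 * raise SO_RCVBUF past the global sb_max_adj limit, up to nl_maxsockbuf,
 * which helps sockets that receive large route table dumps.
 *
 *	int rcvbuf = 128 * 1024 * 1024;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
 */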

#define NETLINK_PROTOSW						\
	.pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF,		\
	.pr_ctloutput = nl_ctloutput,				\
	.pr_setsbopt = nl_setsbopt,				\
	.pr_attach = nl_attach,					\
	.pr_bind = nl_bind,					\
	.pr_connect = nl_connect,				\
	.pr_disconnect = nl_disconnect,				\
	.pr_sosend = nl_sosend,					\
	.pr_soreceive = nl_soreceive,				\
	.pr_sockaddr = nl_sockaddr,				\
	.pr_close = nl_close

static struct protosw netlink_raw_sw = {
	.pr_type = SOCK_RAW,
	NETLINK_PROTOSW
};

static struct protosw netlink_dgram_sw = {
	.pr_type = SOCK_DGRAM,
	NETLINK_PROTOSW
};

static struct domain netlinkdomain = {
	.dom_family = PF_NETLINK,
	.dom_name = "netlink",
	.dom_flags = DOMF_UNLOADABLE,
	.dom_nprotosw = 2,
	.dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw },
};

DOMAIN_SET(netlink);