Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/netlink/netlink_domain.c
104521 views
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 2021 Ng Peng Nam Sean
5
* Copyright (c) 2022 Alexander V. Chernikov <[email protected]>
6
* Copyright (c) 2023 Gleb Smirnoff <[email protected]>
7
*
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions
10
* are met:
11
* 1. Redistributions of source code must retain the above copyright
12
* notice, this list of conditions and the following disclaimer.
13
* 2. Redistributions in binary form must reproduce the above copyright
14
* notice, this list of conditions and the following disclaimer in the
15
* documentation and/or other materials provided with the distribution.
16
*
17
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27
* SUCH DAMAGE.
28
*/
29
30
/*
31
* This file contains socket and protocol bindings for netlink.
32
*/
33
34
#include <sys/param.h>
35
#include <sys/kernel.h>
36
#include <sys/malloc.h>
37
#include <sys/lock.h>
38
#include <sys/rmlock.h>
39
#include <sys/domain.h>
40
#include <sys/jail.h>
41
#include <sys/mbuf.h>
42
#include <sys/osd.h>
43
#include <sys/protosw.h>
44
#include <sys/proc.h>
45
#include <sys/ck.h>
46
#include <sys/socket.h>
47
#include <sys/socketvar.h>
48
#include <sys/sysent.h>
49
#include <sys/syslog.h>
50
#include <sys/priv.h>
51
#include <sys/uio.h>
52
53
#include <netlink/netlink.h>
54
#include <netlink/netlink_ctl.h>
55
#include <netlink/netlink_var.h>
56
57
#define DEBUG_MOD_NAME nl_domain
58
#define DEBUG_MAX_LEVEL LOG_DEBUG3
59
#include <netlink/netlink_debug.h>
60
_DECLARE_DEBUG(LOG_INFO);
61
62
_Static_assert((NLP_MAX_GROUPS % 64) == 0,
63
"NLP_MAX_GROUPS has to be multiple of 64");
64
_Static_assert(NLP_MAX_GROUPS >= 64,
65
"NLP_MAX_GROUPS has to be at least 64");
66
67
#define NLCTL_TRACKER struct rm_priotracker nl_tracker
68
#define NLCTL_RLOCK() rm_rlock(&V_nl_ctl.ctl_lock, &nl_tracker)
69
#define NLCTL_RUNLOCK() rm_runlock(&V_nl_ctl.ctl_lock, &nl_tracker)
70
#define NLCTL_LOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_LOCKED)
71
72
#define NLCTL_WLOCK() rm_wlock(&V_nl_ctl.ctl_lock)
73
#define NLCTL_WUNLOCK() rm_wunlock(&V_nl_ctl.ctl_lock)
74
#define NLCTL_WLOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_WLOCKED)
75
76
static u_long nl_sendspace = NLSNDQ;
77
SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
78
"Default netlink socket send space");
79
80
static u_long nl_recvspace = NLSNDQ;
81
SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
82
"Default netlink socket receive space");
83
84
extern u_long sb_max_adj;
85
static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
86
static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS);
87
SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf,
88
CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0,
89
sysctl_handle_nl_maxsockbuf, "LU",
90
"Maximum Netlink socket buffer size");
91
92
93
static unsigned int osd_slot_id = 0;
94
95
void
96
nl_osd_register(void)
97
{
98
osd_slot_id = osd_register(OSD_THREAD, NULL, NULL);
99
}
100
101
void
102
nl_osd_unregister(void)
103
{
104
osd_deregister(OSD_THREAD, osd_slot_id);
105
}
106
107
struct nlpcb *
108
_nl_get_thread_nlp(struct thread *td)
109
{
110
return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id));
111
}
112
113
void
114
nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp)
115
{
116
NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id);
117
if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0)
118
return;
119
/* Failed, need to realloc */
120
void **rsv = osd_reserve(osd_slot_id);
121
osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp);
122
}
123
124
/*
125
* Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx.
126
* Returns nlpcb pointer if present else NULL
127
*/
128
static struct nlpcb *
129
nl_port_lookup(uint32_t port_id)
130
{
131
struct nlpcb *nlp;
132
133
CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_port_head, nl_port_next) {
134
if (nlp->nl_port == port_id)
135
return (nlp);
136
}
137
return (NULL);
138
}
139
140
static void
141
nlp_join_group(struct nlpcb *nlp, unsigned int group_id)
142
{
143
MPASS(group_id < NLP_MAX_GROUPS);
144
NLCTL_WLOCK_ASSERT();
145
146
/* TODO: add family handler callback */
147
if (!nlp_unconstrained_vnet(nlp))
148
return;
149
150
BIT_SET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups);
151
}
152
153
static void
154
nlp_leave_group(struct nlpcb *nlp, unsigned int group_id)
155
{
156
MPASS(group_id < NLP_MAX_GROUPS);
157
NLCTL_WLOCK_ASSERT();
158
159
BIT_CLR(NLP_MAX_GROUPS, group_id, &nlp->nl_groups);
160
}
161
162
static bool
163
nlp_memberof_group(struct nlpcb *nlp, unsigned int group_id)
164
{
165
MPASS(group_id < NLP_MAX_GROUPS);
166
NLCTL_LOCK_ASSERT();
167
168
return (BIT_ISSET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups));
169
}
170
171
/*
 * Builds the 32-bit group mask used by the sockaddr_nl / setsockopt
 * compatibility interfaces: bit i of the result is set when @nlp is a
 * member of group i + 1.  Groups above 32 cannot be represented and are
 * not reported.  The control lock must be held (read or write).
 */
static uint32_t
nlp_get_groups_compat(struct nlpcb *nlp)
{
	uint32_t groups_mask = 0;

	NLCTL_LOCK_ASSERT();

	for (unsigned int i = 0; i < 32; i++) {
		/*
		 * Shift an unsigned 1: with a plain int, i == 31 would move
		 * a bit into the sign position, which is undefined.
		 */
		if (nlp_memberof_group(nlp, i + 1))
			groups_mask |= (uint32_t)1 << i;
	}

	return (groups_mask);
}
185
186
static struct nl_buf *
187
nl_buf_copy(struct nl_buf *nb)
188
{
189
struct nl_buf *copy;
190
191
copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
192
if (__predict_false(copy == NULL))
193
return (NULL);
194
memcpy(copy, nb, sizeof(*nb) + nb->buflen);
195
196
return (copy);
197
}
198
199
/*
200
* Broadcasts in the writer's buffer.
201
*/
202
bool
203
nl_send_group(struct nl_writer *nw)
204
{
205
struct nl_buf *nb = nw->buf;
206
struct nlpcb *nlp_last = NULL;
207
struct nlpcb *nlp;
208
NLCTL_TRACKER;
209
210
IF_DEBUG_LEVEL(LOG_DEBUG2) {
211
struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data;
212
NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
213
nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
214
nw->group.proto, nw->group.id);
215
}
216
217
nw->buf = NULL;
218
219
NLCTL_RLOCK();
220
CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next) {
221
if ((nw->group.priv == 0 || priv_check_cred(
222
nlp->nl_socket->so_cred, nw->group.priv) == 0) &&
223
nlp->nl_proto == nw->group.proto &&
224
nlp_memberof_group(nlp, nw->group.id)) {
225
if (nlp_last != NULL) {
226
struct nl_buf *copy;
227
228
copy = nl_buf_copy(nb);
229
if (copy != NULL) {
230
nw->buf = copy;
231
(void)nl_send(nw, nlp_last);
232
} else {
233
NLP_LOCK(nlp_last);
234
if (nlp_last->nl_socket != NULL)
235
sorwakeup(nlp_last->nl_socket);
236
NLP_UNLOCK(nlp_last);
237
}
238
}
239
nlp_last = nlp;
240
}
241
}
242
if (nlp_last != NULL) {
243
nw->buf = nb;
244
(void)nl_send(nw, nlp_last);
245
} else
246
nl_buf_free(nb);
247
248
NLCTL_RUNLOCK();
249
250
return (true);
251
}
252
253
void
254
nl_clear_group(u_int group)
255
{
256
struct nlpcb *nlp;
257
258
NLCTL_WLOCK();
259
CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next)
260
if (nlp_memberof_group(nlp, group))
261
nlp_leave_group(nlp, group);
262
NLCTL_WUNLOCK();
263
}
264
265
static uint32_t
266
nl_find_port(void)
267
{
268
/*
269
* app can open multiple netlink sockets.
270
* Start with current pid, if already taken,
271
* try random numbers in 65k..256k+65k space,
272
* avoiding clash with pids.
273
*/
274
if (nl_port_lookup(curproc->p_pid) == NULL)
275
return (curproc->p_pid);
276
for (int i = 0; i < 16; i++) {
277
uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
278
if (nl_port_lookup(nl_port) == 0)
279
return (nl_port);
280
NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port);
281
}
282
return (curproc->p_pid);
283
}
284
285
static int
286
nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
287
{
288
if (nlp->nl_bound) {
289
if (nlp->nl_port != snl->nl_pid) {
290
NL_LOG(LOG_DEBUG,
291
"bind() failed: program pid %d "
292
"is different from provided pid %d",
293
nlp->nl_port, snl->nl_pid);
294
return (EINVAL); // XXX: better error
295
}
296
} else {
297
if (snl->nl_pid == 0)
298
snl->nl_pid = nl_find_port();
299
if (nl_port_lookup(snl->nl_pid) != NULL)
300
return (EADDRINUSE);
301
nlp->nl_port = snl->nl_pid;
302
nlp->nl_bound = true;
303
CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_port_head, nlp, nl_port_next);
304
}
305
for (int i = 0; i < 32; i++) {
306
if (snl->nl_groups & ((uint32_t)1 << i))
307
nlp_join_group(nlp, i + 1);
308
else
309
nlp_leave_group(nlp, i + 1);
310
}
311
312
return (0);
313
}
314
315
static int
316
nl_attach(struct socket *so, int proto, struct thread *td)
317
{
318
struct nlpcb *nlp;
319
int error;
320
321
if (__predict_false(netlink_unloading != 0))
322
return (EAFNOSUPPORT);
323
324
error = nl_verify_proto(proto);
325
if (error != 0)
326
return (error);
327
328
bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
329
NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
330
so, is_linux ? "(linux) " : "", curproc->p_pid,
331
nl_get_proto_name(proto));
332
333
mtx_init(&so->so_snd_mtx, "netlink so_snd", NULL, MTX_DEF);
334
mtx_init(&so->so_rcv_mtx, "netlink so_rcv", NULL, MTX_DEF);
335
error = soreserve(so, nl_sendspace, nl_recvspace);
336
if (error != 0) {
337
mtx_destroy(&so->so_snd_mtx);
338
mtx_destroy(&so->so_rcv_mtx);
339
return (error);
340
}
341
TAILQ_INIT(&so->so_rcv.nl_queue);
342
TAILQ_INIT(&so->so_snd.nl_queue);
343
nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
344
so->so_pcb = nlp;
345
nlp->nl_socket = so;
346
nlp->nl_proto = proto;
347
nlp->nl_process_id = curproc->p_pid;
348
nlp->nl_linux = is_linux;
349
nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred);
350
nlp->nl_need_thread_setup = true;
351
NLP_LOCK_INIT(nlp);
352
refcount_init(&nlp->nl_refcount, 1);
353
354
nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
355
taskqueue_thread_enqueue, &nlp->nl_taskqueue);
356
TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
357
taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
358
"netlink_socket (PID %u)", nlp->nl_process_id);
359
360
NLCTL_WLOCK();
361
CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_pcb_head, nlp, nl_next);
362
NLCTL_WUNLOCK();
363
364
soisconnected(so);
365
366
return (0);
367
}
368
369
static int
370
nl_bind(struct socket *so, struct sockaddr *sa, struct thread *td)
371
{
372
struct nlpcb *nlp = sotonlpcb(so);
373
struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
374
int error;
375
376
NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
377
if (snl->nl_len != sizeof(*snl)) {
378
NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
379
return (EINVAL);
380
}
381
382
383
NLCTL_WLOCK();
384
NLP_LOCK(nlp);
385
error = nl_bind_locked(nlp, snl);
386
NLP_UNLOCK(nlp);
387
NLCTL_WUNLOCK();
388
NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
389
snl->nl_pid, snl->nl_groups, error);
390
391
return (error);
392
}
393
394
395
static int
396
nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
397
{
398
struct sockaddr_nl snl = {
399
.nl_pid = port_id,
400
};
401
int error;
402
403
NLCTL_WLOCK();
404
NLP_LOCK(nlp);
405
snl.nl_groups = nlp_get_groups_compat(nlp);
406
error = nl_bind_locked(nlp, &snl);
407
NLP_UNLOCK(nlp);
408
NLCTL_WUNLOCK();
409
410
NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error);
411
return (error);
412
}
413
414
/*
415
* nl_autobind_port binds a unused portid to @nlp
416
* @nlp: pcb data for the netlink socket
417
* @candidate_id: first id to consider
418
*/
419
static int
420
nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
421
{
422
uint32_t port_id = candidate_id;
423
NLCTL_TRACKER;
424
bool exist;
425
int error = EADDRINUSE;
426
427
for (int i = 0; i < 10; i++) {
428
NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
429
NLCTL_RLOCK();
430
exist = nl_port_lookup(port_id) != 0;
431
NLCTL_RUNLOCK();
432
if (!exist) {
433
error = nl_assign_port(nlp, port_id);
434
if (error != EADDRINUSE)
435
break;
436
}
437
port_id++;
438
}
439
NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
440
return (error);
441
}
442
443
static int
444
nl_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
445
{
446
struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
447
struct nlpcb *nlp;
448
449
NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
450
if (snl->nl_len != sizeof(*snl)) {
451
NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
452
return (EINVAL);
453
}
454
455
nlp = sotonlpcb(so);
456
if (!nlp->nl_bound) {
457
int error = nl_autobind_port(nlp, td->td_proc->p_pid);
458
if (error != 0) {
459
NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
460
return (error);
461
}
462
}
463
/* XXX: Handle socket flags & multicast */
464
soisconnected(so);
465
466
NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
467
468
return (0);
469
}
470
471
static void
472
destroy_nlpcb_epoch(epoch_context_t ctx)
473
{
474
struct nlpcb *nlp;
475
476
nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);
477
478
NLP_LOCK_DESTROY(nlp);
479
free(nlp, M_PCB);
480
}
481
482
static void
483
nl_close(struct socket *so)
484
{
485
MPASS(sotonlpcb(so) != NULL);
486
struct nlpcb *nlp;
487
struct nl_buf *nb;
488
489
NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
490
nlp = sotonlpcb(so);
491
492
/* Mark as inactive so no new work can be enqueued */
493
NLP_LOCK(nlp);
494
bool was_bound = nlp->nl_bound;
495
NLP_UNLOCK(nlp);
496
497
/* Wait till all scheduled work has been completed */
498
taskqueue_drain_all(nlp->nl_taskqueue);
499
taskqueue_free(nlp->nl_taskqueue);
500
501
NLCTL_WLOCK();
502
NLP_LOCK(nlp);
503
if (was_bound) {
504
CK_LIST_REMOVE(nlp, nl_port_next);
505
NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
506
}
507
CK_LIST_REMOVE(nlp, nl_next);
508
nlp->nl_socket = NULL;
509
NLP_UNLOCK(nlp);
510
NLCTL_WUNLOCK();
511
512
so->so_pcb = NULL;
513
514
while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
515
TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
516
nl_buf_free(nb);
517
}
518
while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
519
TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
520
nl_buf_free(nb);
521
}
522
523
mtx_destroy(&so->so_snd_mtx);
524
mtx_destroy(&so->so_rcv_mtx);
525
526
NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
527
528
/* XXX: is delayed free needed? */
529
NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
530
}
531
532
static int
533
nl_disconnect(struct socket *so)
534
{
535
NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
536
MPASS(sotonlpcb(so) != NULL);
537
return (ENOTCONN);
538
}
539
540
static int
541
nl_sockaddr(struct socket *so, struct sockaddr *sa)
542
{
543
544
*(struct sockaddr_nl *)sa = (struct sockaddr_nl ){
545
/* TODO: set other fields */
546
.nl_len = sizeof(struct sockaddr_nl),
547
.nl_family = AF_NETLINK,
548
.nl_pid = sotonlpcb(so)->nl_port,
549
};
550
551
return (0);
552
}
553
554
static int
555
nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
556
struct mbuf *m, struct mbuf *control, int flags, struct thread *td)
557
{
558
struct nlpcb *nlp = sotonlpcb(so);
559
struct sockbuf *sb = &so->so_snd;
560
struct nl_buf *nb;
561
size_t len;
562
int error;
563
564
MPASS(m == NULL && uio != NULL);
565
566
if (__predict_false(control != NULL)) {
567
m_freem(control);
568
return (EINVAL);
569
}
570
571
if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? */
572
return (EOPNOTSUPP);
573
574
if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr)))
575
return (ENOBUFS); /* XXXGL: any better error? */
576
577
if (__predict_false(uio->uio_resid > sb->sb_hiwat))
578
return (EMSGSIZE);
579
580
error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
581
if (error)
582
return (error);
583
584
len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
585
if (nlp->nl_linux)
586
len += roundup2(uio->uio_resid, 8);
587
nb = nl_buf_alloc(len, M_WAITOK);
588
nb->datalen = uio->uio_resid;
589
error = uiomove(&nb->data[0], uio->uio_resid, uio);
590
if (__predict_false(error))
591
goto out;
592
593
NL_LOG(LOG_DEBUG2, "sending message to kernel %u bytes", nb->datalen);
594
595
SOCK_SENDBUF_LOCK(so);
596
restart:
597
if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) {
598
TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
599
sb->sb_acc += nb->datalen;
600
sb->sb_ccc += nb->datalen;
601
nb = NULL;
602
} else if ((so->so_state & SS_NBIO) ||
603
(flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
604
SOCK_SENDBUF_UNLOCK(so);
605
error = EWOULDBLOCK;
606
goto out;
607
} else {
608
if ((error = sbwait(so, SO_SND)) != 0) {
609
SOCK_SENDBUF_UNLOCK(so);
610
goto out;
611
} else
612
goto restart;
613
}
614
SOCK_SENDBUF_UNLOCK(so);
615
616
if (nb == NULL) {
617
NL_LOG(LOG_DEBUG3, "success");
618
NLP_LOCK(nlp);
619
nl_schedule_taskqueue(nlp);
620
NLP_UNLOCK(nlp);
621
}
622
623
out:
624
SOCK_IO_SEND_UNLOCK(so);
625
if (nb != NULL) {
626
NL_LOG(LOG_DEBUG3, "failure, error %d", error);
627
nl_buf_free(nb);
628
}
629
return (error);
630
}
631
632
/* Create control data for recvmsg(2) on Netlink socket. */
633
static struct mbuf *
634
nl_createcontrol(struct nlpcb *nlp)
635
{
636
struct {
637
struct nlattr nla;
638
uint32_t val;
639
} data[] = {
640
{
641
.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
642
.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
643
.val = nlp->nl_process_id,
644
},
645
{
646
.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
647
.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
648
.val = nlp->nl_port,
649
},
650
};
651
652
return (sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO,
653
SOL_NETLINK, M_WAITOK));
654
}
655
656
static int
657
nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
658
struct mbuf **mp, struct mbuf **controlp, int *flagsp)
659
{
660
static const struct sockaddr_nl nl_empty_src = {
661
.nl_len = sizeof(struct sockaddr_nl),
662
.nl_family = PF_NETLINK,
663
.nl_pid = 0 /* comes from the kernel */
664
};
665
struct sockbuf *sb = &so->so_rcv;
666
struct nlpcb *nlp = sotonlpcb(so);
667
struct nl_buf *first, *last, *nb, *next;
668
struct nlmsghdr *hdr;
669
int flags, error;
670
u_int len, overflow, partoff, partlen, msgrcv, datalen;
671
bool nonblock, trunc, peek;
672
673
MPASS(mp == NULL && uio != NULL);
674
675
NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
676
677
if (psa != NULL)
678
*psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src,
679
M_WAITOK);
680
681
if (controlp != NULL && (nlp->nl_flags & NLF_MSG_INFO))
682
*controlp = nl_createcontrol(nlp);
683
684
flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
685
trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
686
nonblock = (so->so_state & SS_NBIO) ||
687
(flags & (MSG_DONTWAIT | MSG_NBIO));
688
peek = flags & MSG_PEEK;
689
690
error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
691
if (__predict_false(error))
692
return (error);
693
694
len = 0;
695
overflow = 0;
696
msgrcv = 0;
697
datalen = 0;
698
699
SOCK_RECVBUF_LOCK(so);
700
while ((first = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
701
if (nonblock) {
702
SOCK_RECVBUF_UNLOCK(so);
703
SOCK_IO_RECV_UNLOCK(so);
704
return (EWOULDBLOCK);
705
}
706
error = sbwait(so, SO_RCV);
707
if (error) {
708
SOCK_RECVBUF_UNLOCK(so);
709
SOCK_IO_RECV_UNLOCK(so);
710
return (error);
711
}
712
}
713
714
/*
715
* Netlink socket buffer consists of a queue of nl_bufs, but for the
716
* userland there should be no boundaries. However, there are Netlink
717
* messages, that shouldn't be split. Internal invariant is that a
718
* message never spans two nl_bufs.
719
* If a large userland buffer is provided, we would traverse the queue
720
* until either queue end is reached or the buffer is fulfilled. If
721
* an application provides a buffer that isn't able to fit a single
722
* message, we would truncate it and lose its tail. This is the only
723
* condition where we would lose data. If buffer is able to fit at
724
* least one message, we would return it and won't truncate the next.
725
*
726
* We use same code for normal and MSG_PEEK case. At first queue pass
727
* we scan nl_bufs and count lenght. In case we can read entire buffer
728
* at one write everything is trivial. In case we can not, we save
729
* pointer to the last (or partial) nl_buf and in the !peek case we
730
* split the queue into two pieces. We can safely drop the queue lock,
731
* as kernel would only append nl_bufs to the end of the queue, and
732
* we are the exclusive owner of queue beginning due to sleepable lock.
733
* At the second pass we copy data out and in !peek case free nl_bufs.
734
*/
735
TAILQ_FOREACH(nb, &sb->nl_queue, tailq) {
736
u_int offset;
737
738
MPASS(nb->offset < nb->datalen);
739
offset = nb->offset;
740
while (offset < nb->datalen) {
741
hdr = (struct nlmsghdr *)&nb->data[offset];
742
MPASS(nb->offset + hdr->nlmsg_len <= nb->datalen);
743
if (uio->uio_resid < len + hdr->nlmsg_len) {
744
overflow = len + hdr->nlmsg_len -
745
uio->uio_resid;
746
partoff = nb->offset;
747
if (offset > partoff) {
748
partlen = offset - partoff;
749
if (!peek) {
750
nb->offset = offset;
751
datalen += partlen;
752
}
753
} else if (len == 0 && uio->uio_resid > 0) {
754
flags |= MSG_TRUNC;
755
partlen = uio->uio_resid;
756
if (peek)
757
goto nospace;
758
datalen += hdr->nlmsg_len;
759
if (nb->offset + hdr->nlmsg_len ==
760
nb->datalen) {
761
/*
762
* Avoid leaving empty nb.
763
* Process last nb normally.
764
* Trust uiomove() to care
765
* about negative uio_resid.
766
*/
767
nb = TAILQ_NEXT(nb, tailq);
768
overflow = 0;
769
partlen = 0;
770
} else
771
nb->offset += hdr->nlmsg_len;
772
msgrcv++;
773
} else
774
partlen = 0;
775
goto nospace;
776
}
777
len += hdr->nlmsg_len;
778
offset += hdr->nlmsg_len;
779
MPASS(offset <= nb->buflen);
780
msgrcv++;
781
}
782
MPASS(offset == nb->datalen);
783
datalen += nb->datalen - nb->offset;
784
}
785
nospace:
786
last = nb;
787
if (!peek) {
788
if (last == NULL)
789
TAILQ_INIT(&sb->nl_queue);
790
else {
791
/* XXXGL: create TAILQ_SPLIT */
792
TAILQ_FIRST(&sb->nl_queue) = last;
793
last->tailq.tqe_prev = &TAILQ_FIRST(&sb->nl_queue);
794
}
795
MPASS(sb->sb_acc >= datalen);
796
sb->sb_acc -= datalen;
797
sb->sb_ccc -= datalen;
798
}
799
SOCK_RECVBUF_UNLOCK(so);
800
801
for (nb = first; nb != last; nb = next) {
802
next = TAILQ_NEXT(nb, tailq);
803
if (__predict_true(error == 0))
804
error = uiomove(&nb->data[nb->offset],
805
(int)(nb->datalen - nb->offset), uio);
806
if (!peek)
807
nl_buf_free(nb);
808
}
809
if (last != NULL && partlen > 0 && __predict_true(error == 0))
810
error = uiomove(&nb->data[partoff], (int)partlen, uio);
811
812
if (trunc && overflow > 0) {
813
uio->uio_resid -= overflow;
814
MPASS(uio->uio_resid < 0);
815
} else
816
MPASS(uio->uio_resid >= 0);
817
818
if (uio->uio_td)
819
uio->uio_td->td_ru.ru_msgrcv += msgrcv;
820
821
if (flagsp != NULL)
822
*flagsp |= flags;
823
824
SOCK_IO_RECV_UNLOCK(so);
825
826
nl_on_transmit(sotonlpcb(so));
827
828
return (error);
829
}
830
831
static int
832
nl_getoptflag(int sopt_name)
833
{
834
switch (sopt_name) {
835
case NETLINK_CAP_ACK:
836
return (NLF_CAP_ACK);
837
case NETLINK_EXT_ACK:
838
return (NLF_EXT_ACK);
839
case NETLINK_GET_STRICT_CHK:
840
return (NLF_STRICT);
841
case NETLINK_MSG_INFO:
842
return (NLF_MSG_INFO);
843
}
844
845
return (0);
846
}
847
848
static int
849
nl_ctloutput(struct socket *so, struct sockopt *sopt)
850
{
851
struct nlpcb *nlp = sotonlpcb(so);
852
uint32_t flag;
853
int optval, error = 0;
854
NLCTL_TRACKER;
855
856
NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
857
so, sopt->sopt_name);
858
859
switch (sopt->sopt_dir) {
860
case SOPT_SET:
861
switch (sopt->sopt_name) {
862
case NETLINK_ADD_MEMBERSHIP:
863
case NETLINK_DROP_MEMBERSHIP:
864
error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
865
if (error != 0)
866
break;
867
if (optval <= 0 || optval >= NLP_MAX_GROUPS) {
868
error = ERANGE;
869
break;
870
}
871
NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval);
872
873
NLCTL_WLOCK();
874
if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
875
nlp_join_group(nlp, optval);
876
else
877
nlp_leave_group(nlp, optval);
878
NLCTL_WUNLOCK();
879
break;
880
case NETLINK_CAP_ACK:
881
case NETLINK_EXT_ACK:
882
case NETLINK_GET_STRICT_CHK:
883
case NETLINK_MSG_INFO:
884
error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
885
if (error != 0)
886
break;
887
888
flag = nl_getoptflag(sopt->sopt_name);
889
890
if ((flag == NLF_MSG_INFO) && nlp->nl_linux) {
891
error = EINVAL;
892
break;
893
}
894
895
NLCTL_WLOCK();
896
if (optval != 0)
897
nlp->nl_flags |= flag;
898
else
899
nlp->nl_flags &= ~flag;
900
NLCTL_WUNLOCK();
901
break;
902
default:
903
error = ENOPROTOOPT;
904
}
905
break;
906
case SOPT_GET:
907
switch (sopt->sopt_name) {
908
case NETLINK_LIST_MEMBERSHIPS:
909
NLCTL_RLOCK();
910
optval = nlp_get_groups_compat(nlp);
911
NLCTL_RUNLOCK();
912
error = sooptcopyout(sopt, &optval, sizeof(optval));
913
break;
914
case NETLINK_CAP_ACK:
915
case NETLINK_EXT_ACK:
916
case NETLINK_GET_STRICT_CHK:
917
case NETLINK_MSG_INFO:
918
NLCTL_RLOCK();
919
optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0;
920
NLCTL_RUNLOCK();
921
error = sooptcopyout(sopt, &optval, sizeof(optval));
922
break;
923
default:
924
error = ENOPROTOOPT;
925
}
926
break;
927
default:
928
error = ENOPROTOOPT;
929
}
930
931
return (error);
932
}
933
934
static int
935
sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS)
936
{
937
int error = 0;
938
u_long tmp_maxsockbuf = nl_maxsockbuf;
939
940
error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req);
941
if (error || !req->newptr)
942
return (error);
943
if (tmp_maxsockbuf < MSIZE + MCLBYTES)
944
return (EINVAL);
945
nl_maxsockbuf = tmp_maxsockbuf;
946
947
return (0);
948
}
949
950
static int
951
nl_setsbopt(struct socket *so, struct sockopt *sopt)
952
{
953
int error, optval;
954
bool result;
955
956
if (sopt->sopt_name != SO_RCVBUF)
957
return (sbsetopt(so, sopt));
958
959
/* Allow to override max buffer size in certain conditions */
960
961
error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
962
if (error != 0)
963
return (error);
964
NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval);
965
if (optval > sb_max_adj) {
966
if (priv_check(curthread, PRIV_NET_ROUTE) != 0)
967
return (EPERM);
968
}
969
970
SOCK_RECVBUF_LOCK(so);
971
result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread);
972
SOCK_RECVBUF_UNLOCK(so);
973
974
return (result ? 0 : ENOBUFS);
975
}
976
977
#define NETLINK_PROTOSW \
978
.pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \
979
.pr_ctloutput = nl_ctloutput, \
980
.pr_setsbopt = nl_setsbopt, \
981
.pr_attach = nl_attach, \
982
.pr_bind = nl_bind, \
983
.pr_connect = nl_connect, \
984
.pr_disconnect = nl_disconnect, \
985
.pr_sosend = nl_sosend, \
986
.pr_soreceive = nl_soreceive, \
987
.pr_sockaddr = nl_sockaddr, \
988
.pr_close = nl_close
989
990
static struct protosw netlink_raw_sw = {
991
.pr_type = SOCK_RAW,
992
NETLINK_PROTOSW
993
};
994
995
static struct protosw netlink_dgram_sw = {
996
.pr_type = SOCK_DGRAM,
997
NETLINK_PROTOSW
998
};
999
1000
static struct domain netlinkdomain = {
1001
.dom_family = PF_NETLINK,
1002
.dom_name = "netlink",
1003
.dom_flags = DOMF_UNLOADABLE,
1004
.dom_nprotosw = 2,
1005
.dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw },
1006
};
1007
1008
DOMAIN_SET(netlink);
1009
1010