Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/kern/kern_jail.c
39475 views
1
/*-
2
* SPDX-License-Identifier: BSD-2-Clause
3
*
4
* Copyright (c) 1999 Poul-Henning Kamp.
5
* Copyright (c) 2008 Bjoern A. Zeeb.
6
* Copyright (c) 2009 James Gritton.
7
* All rights reserved.
8
*
9
* Redistribution and use in source and binary forms, with or without
10
* modification, are permitted provided that the following conditions
11
* are met:
12
* 1. Redistributions of source code must retain the above copyright
13
* notice, this list of conditions and the following disclaimer.
14
* 2. Redistributions in binary form must reproduce the above copyright
15
* notice, this list of conditions and the following disclaimer in the
16
* documentation and/or other materials provided with the distribution.
17
*
18
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28
* SUCH DAMAGE.
29
*/
30
31
#include <sys/cdefs.h>
32
#include "opt_ddb.h"
33
#include "opt_inet.h"
34
#include "opt_inet6.h"
35
#include "opt_nfs.h"
36
37
#include <sys/param.h>
38
#include <sys/types.h>
39
#include <sys/kernel.h>
40
#include <sys/systm.h>
41
#include <sys/errno.h>
42
#include <sys/file.h>
43
#include <sys/sysproto.h>
44
#include <sys/malloc.h>
45
#include <sys/osd.h>
46
#include <sys/priv.h>
47
#include <sys/proc.h>
48
#include <sys/epoch.h>
49
#include <sys/event.h>
50
#include <sys/taskqueue.h>
51
#include <sys/fcntl.h>
52
#include <sys/jail.h>
53
#include <sys/jaildesc.h>
54
#include <sys/linker.h>
55
#include <sys/lock.h>
56
#include <sys/mman.h>
57
#include <sys/mutex.h>
58
#include <sys/racct.h>
59
#include <sys/rctl.h>
60
#include <sys/refcount.h>
61
#include <sys/sx.h>
62
#include <sys/sysent.h>
63
#include <sys/namei.h>
64
#include <sys/mount.h>
65
#include <sys/queue.h>
66
#include <sys/socket.h>
67
#include <sys/syscallsubr.h>
68
#include <sys/sysctl.h>
69
#include <sys/uuid.h>
70
#include <sys/vnode.h>
71
72
#include <net/if.h>
73
#include <net/vnet.h>
74
75
#include <netinet/in.h>
76
77
#ifdef DDB
78
#include <ddb/ddb.h>
79
#endif /* DDB */
80
81
#include <security/mac/mac_framework.h>
82
83
/* Preload type that prison0_init() searches for to find a loader-supplied host UUID. */
#define PRISON0_HOSTUUID_MODULE "hostuuid"
84
85
/* malloc type for prison allocations; the prison_ip address lists in this file use it. */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
86
/* File-local malloc type; per its description string, for prison racct structures. */
static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
87
88
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL expands to the source-address-selection flag(s) of the
 * address families compiled in (INET and/or INET6), or to 0 when neither is.
 * The two-family expansion is not parenthesized, so only combine it with
 * other flags using '|'.
 */
89
#ifdef INET
90
#ifdef INET6
91
#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
92
#else
93
#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
94
#endif
95
#else /* !INET */
96
#ifdef INET6
97
#define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
98
#else
99
#define _PR_IP_SADDRSEL 0
100
#endif
101
#endif
102
103
/*
 * prison0 describes what is "real" about the system.
 *
 * Only fields with constant initializers are set here; prison0_init()
 * fills in the cpuset, OS release data and (optionally) the host UUID.
 */
104
struct prison prison0 = {
105
.pr_id = 0,
106
.pr_name = "0",
107
.pr_ref = 1,
108
.pr_uref = 1,
109
.pr_path = "/",
110
.pr_securelevel = -1,
111
.pr_devfs_rsnum = 0,
112
.pr_state = PRISON_STATE_ALIVE,
113
.pr_childmax = JAIL_MAX,
114
.pr_hostuuid = DEFAULT_HOSTUUID,
115
.pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
116
#ifdef VIMAGE
117
.pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
118
#else
119
.pr_flags = PR_HOST|_PR_IP_SADDRSEL,
120
#endif
121
.pr_allow = PR_ALLOW_PRISON0,
122
};
123
/* Every allow bit granted to prison0 must be one of the statically reserved bits. */
_Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0,
124
"Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved");
125
126
/* Initialize prison0's pr_mtx as a default mutex via SYSINIT. */
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
127
128
/*
 * Describes one boolean flag bit: "name" is the parameter name used when
 * the bit is set and "noname" the one used when it is clear (kern_jail()
 * picks between them that way).  "flag" is volatile because table entries
 * are read locklessly with atomic_load_int().
 */
struct bool_flags {
129
const char *name;
130
const char *noname;
131
volatile u_int flag;
132
};
133
/*
 * Describes one "jailsys" parameter by name together with two flag masks.
 * NOTE(review): the exact meaning of "disable" and "new" is established by
 * code outside this part of the file (presumably kern_jail_set) -- confirm.
 */
struct jailsys_flags {
134
const char *name;
135
unsigned disable;
136
unsigned new;
137
};
138
139
/*
 * Handle jail teardown in a dedicated thread to avoid deadlocks from
 * vnet_destroy().
 */
143
TASKQUEUE_DEFINE_THREAD(jail_remove);
144
145
/*
 * allprison, allprison_racct and lastprid are protected by allprison_lock.
 * NOTE(review): lastdeadid is not named above; presumably it is covered by
 * the same lock -- confirm.
 */
146
struct sx allprison_lock;
147
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
148
struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
149
LIST_HEAD(, prison_racct) allprison_racct;
150
int lastprid = 0;
151
int lastdeadid = 0;
152
153
/*
 * Forward declarations of file-local helpers.  Their definitions are not
 * part of this section of the file.  The prison_racct_* helpers exist only
 * when the kernel is built with RACCT.
 */
static int get_next_prid(struct prison **insprp);
154
static int get_next_deadid(struct prison **insprp);
155
static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
156
static void prison_complete(void *context, int pending);
157
static void prison_deref(struct prison *pr, int flags);
158
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
159
static int prison_lock_xlock(struct prison *pr, int flags);
160
static void prison_cleanup_locked(struct prison *pr);
161
static void prison_cleanup_unlocked(struct prison *pr);
162
static void prison_free_not_last(struct prison *pr);
163
static void prison_proc_free_not_last(struct prison *pr);
164
static void prison_proc_relink(struct prison *opr, struct prison *npr,
165
struct proc *p);
166
static void prison_set_allow_locked(struct prison *pr, unsigned flag,
167
int enable);
168
static char *prison_path(struct prison *pr1, struct prison *pr2);
169
#ifdef RACCT
170
static void prison_racct_attach(struct prison *pr);
171
static void prison_racct_modify(struct prison *pr);
172
static void prison_racct_detach(struct prison *pr);
173
#endif
174
static void prison_knote(struct prison *pr, long hint);
175
176
/*
 * Flags for prison_deref.  The low bits select the operation(s) to perform,
 * the 0x10..0x40 bits describe which locks the caller already holds.
 */
177
#define PD_DEREF 0x01 /* Decrement pr_ref */
178
#define PD_DEUREF 0x02 /* Decrement pr_uref */
179
#define PD_KILL 0x04 /* Remove jail, kill processes, etc */
180
#define PD_LOCKED 0x10 /* pr_mtx is held */
181
#define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */
182
#define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */
183
#define PD_OP_FLAGS 0x07 /* Mask of operation flags: PD_DEREF|PD_DEUREF|PD_KILL */
184
#define PD_LOCK_FLAGS 0x70 /* Mask of lock status flags: PD_LOCKED|PD_LIST_SLOCKED|PD_LIST_XLOCKED */
185
186
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.
 *
 * Each entry is {name when set, name when clear, flag bit}; the address
 * family entries exist only when that family is compiled in.
 */
191
static struct bool_flags pr_flag_bool[] = {
192
{"persist", "nopersist", PR_PERSIST},
193
#ifdef INET
194
{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
195
#endif
196
#ifdef INET6
197
{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
198
#endif
199
};
200
/* Size of pr_flag_bool in bytes (not an element count). */
const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
201
202
/*
 * Table of "jailsys" parameters: {name, disable mask, new mask}.  Entries
 * for vnet, ip4 and ip6 exist only when VIMAGE, INET and INET6 respectively
 * are compiled in.
 */
static struct jailsys_flags pr_flag_jailsys[] = {
203
{"host", 0, PR_HOST},
204
#ifdef VIMAGE
205
{"vnet", 0, PR_VNET},
206
#endif
207
#ifdef INET
208
{"ip4", PR_IP4_USER, PR_IP4_USER},
209
#endif
210
#ifdef INET6
211
{"ip6", PR_IP6_USER, PR_IP6_USER},
212
#endif
213
};
214
/* Size of pr_flag_jailsys in bytes (not an element count). */
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
215
216
/*
 * Make this array full-size so dynamic parameters can be added.
 * It is protected by prison0.mtx, but lockless reading is allowed
 * with an atomic check of the flag values.
 *
 * Unused trailing slots are zero-initialized, so a reader can stop at the
 * first entry whose flag is 0 (kern_jail() iterates this way).
 */
221
static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
222
{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
223
{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
224
{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
225
{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
226
{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
227
{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
228
{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
229
{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
230
{"allow.reserved_ports", "allow.noreserved_ports",
231
PR_ALLOW_RESERVED_PORTS},
232
{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
233
{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
234
PR_ALLOW_UNPRIV_DEBUG},
235
{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
236
#ifdef VIMAGE
237
{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
238
#endif
239
{"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
240
{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
241
{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
242
{"allow.routing", "allow.norouting", PR_ALLOW_ROUTING},
243
{"allow.unprivileged_parent_tampering",
244
"allow.nounprivileged_parent_tampering",
245
PR_ALLOW_UNPRIV_PARENT_TAMPER},
246
#ifdef AUDIT
247
{"allow.setaudit", "allow.nosetaudit", PR_ALLOW_SETAUDIT},
248
#endif
249
};
250
/* Starts out as the set of statically reserved allow bits. */
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
251
/* Size of pr_flag_allow in bytes (not an element count). */
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
252
253
/*
 * Compile-time defaults and the variables initialized from them.
 * kern_jail() applies jail_default_allow and jail_default_enforce_statfs
 * when the calling thread is not itself jailed, and rejects requests with
 * more than jail_max_af_ips addresses per family.
 */
#define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \
254
PR_ALLOW_RESERVED_PORTS | \
255
PR_ALLOW_UNPRIV_DEBUG | \
256
PR_ALLOW_SUSER)
257
#define JAIL_DEFAULT_ENFORCE_STATFS 2
258
#define JAIL_DEFAULT_DEVFS_RSNUM 0
259
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
260
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
261
static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
262
#if defined(INET) || defined(INET6)
263
/* Upper bound on the number of addresses per family accepted by kern_jail(). */
static unsigned jail_max_af_ips = 255;
264
#endif
265
266
/*
267
* Initialize the parts of prison0 that can't be static-initialized with
268
* constants. This is called from proc0_init() after creating thread0 cpuset.
269
*/
270
void
271
prison0_init(void)
272
{
273
uint8_t *file, *data;
274
size_t size;
275
char buf[sizeof(prison0.pr_hostuuid)];
276
bool valid;
277
278
prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
279
prison0.pr_osreldate = osreldate;
280
strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
281
282
/* If we have a preloaded hostuuid, use it. */
283
file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
284
if (file != NULL) {
285
data = preload_fetch_addr(file);
286
size = preload_fetch_size(file);
287
if (data != NULL) {
288
/*
289
* The preloaded data may include trailing whitespace, almost
290
* certainly a newline; skip over any whitespace or
291
* non-printable characters to be safe.
292
*/
293
while (size > 0 && data[size - 1] <= 0x20) {
294
size--;
295
}
296
297
valid = false;
298
299
/*
300
* Not NUL-terminated when passed from loader, but
301
* validate_uuid requires that due to using sscanf (as
302
* does the subsequent strlcpy, since it still reads
303
* past the given size to return the true length);
304
* bounce to a temporary buffer to fix.
305
*/
306
if (size >= sizeof(buf))
307
goto done;
308
309
memcpy(buf, data, size);
310
buf[size] = '\0';
311
312
if (validate_uuid(buf, size, NULL, 0) != 0)
313
goto done;
314
315
valid = true;
316
(void)strlcpy(prison0.pr_hostuuid, buf,
317
sizeof(prison0.pr_hostuuid));
318
319
done:
320
if (bootverbose && !valid) {
321
printf("hostuuid: preload data malformed: '%.*s'\n",
322
(int)size, data);
323
}
324
}
325
}
326
if (bootverbose)
327
printf("hostuuid: using %s\n", prison0.pr_hostuuid);
328
}
329
330
/*
331
* struct jail_args {
332
* struct jail *jail;
333
* };
334
*/
335
int
336
sys_jail(struct thread *td, struct jail_args *uap)
337
{
338
uint32_t version;
339
int error;
340
struct jail j;
341
342
error = copyin(uap->jail, &version, sizeof(uint32_t));
343
if (error)
344
return (error);
345
346
switch (version) {
347
case 0:
348
{
349
struct jail_v0 j0;
350
351
/* FreeBSD single IPv4 jails. */
352
bzero(&j, sizeof(struct jail));
353
error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
354
if (error)
355
return (error);
356
j.version = j0.version;
357
j.path = j0.path;
358
j.hostname = j0.hostname;
359
j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */
360
break;
361
}
362
363
case 1:
364
/*
365
* Version 1 was used by multi-IPv4 jail implementations
366
* that never made it into the official kernel.
367
*/
368
return (EINVAL);
369
370
case 2: /* JAIL_API_VERSION */
371
/* FreeBSD multi-IPv4/IPv6,noIP jails. */
372
error = copyin(uap->jail, &j, sizeof(struct jail));
373
if (error)
374
return (error);
375
break;
376
377
default:
378
/* Sci-Fi jails are not supported, sorry. */
379
return (EINVAL);
380
}
381
return (kern_jail(td, &j));
382
}
383
384
int
385
kern_jail(struct thread *td, struct jail *j)
386
{
387
struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
388
#ifdef INET
389
+ 1
390
#endif
391
#ifdef INET6
392
+ 1
393
#endif
394
)];
395
struct uio opt;
396
char *u_path, *u_hostname, *u_name;
397
struct bool_flags *bf;
398
#ifdef INET
399
uint32_t ip4s;
400
struct in_addr *u_ip4;
401
#endif
402
#ifdef INET6
403
struct in6_addr *u_ip6;
404
#endif
405
size_t tmplen;
406
int error, enforce_statfs;
407
408
bzero(&optiov, sizeof(optiov));
409
opt.uio_iov = optiov;
410
opt.uio_iovcnt = 0;
411
opt.uio_offset = -1;
412
opt.uio_resid = -1;
413
opt.uio_segflg = UIO_SYSSPACE;
414
opt.uio_rw = UIO_READ;
415
opt.uio_td = td;
416
417
/* Set permissions for top-level jails from sysctls. */
418
if (!jailed(td->td_ucred)) {
419
for (bf = pr_flag_allow;
420
bf < pr_flag_allow + nitems(pr_flag_allow) &&
421
atomic_load_int(&bf->flag) != 0;
422
bf++) {
423
optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
424
(jail_default_allow & bf->flag)
425
? bf->name : bf->noname);
426
optiov[opt.uio_iovcnt].iov_len =
427
strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
428
opt.uio_iovcnt += 2;
429
}
430
optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
431
optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
432
opt.uio_iovcnt++;
433
enforce_statfs = jail_default_enforce_statfs;
434
optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
435
optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
436
opt.uio_iovcnt++;
437
}
438
439
tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
440
#ifdef INET
441
ip4s = (j->version == 0) ? 1 : j->ip4s;
442
if (ip4s > jail_max_af_ips)
443
return (EINVAL);
444
tmplen += ip4s * sizeof(struct in_addr);
445
#else
446
if (j->ip4s > 0)
447
return (EINVAL);
448
#endif
449
#ifdef INET6
450
if (j->ip6s > jail_max_af_ips)
451
return (EINVAL);
452
tmplen += j->ip6s * sizeof(struct in6_addr);
453
#else
454
if (j->ip6s > 0)
455
return (EINVAL);
456
#endif
457
u_path = malloc(tmplen, M_TEMP, M_WAITOK);
458
u_hostname = u_path + MAXPATHLEN;
459
u_name = u_hostname + MAXHOSTNAMELEN;
460
#ifdef INET
461
u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
462
#endif
463
#ifdef INET6
464
#ifdef INET
465
u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
466
#else
467
u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
468
#endif
469
#endif
470
optiov[opt.uio_iovcnt].iov_base = "path";
471
optiov[opt.uio_iovcnt].iov_len = sizeof("path");
472
opt.uio_iovcnt++;
473
optiov[opt.uio_iovcnt].iov_base = u_path;
474
error = copyinstr(j->path, u_path, MAXPATHLEN,
475
&optiov[opt.uio_iovcnt].iov_len);
476
if (error) {
477
free(u_path, M_TEMP);
478
return (error);
479
}
480
opt.uio_iovcnt++;
481
optiov[opt.uio_iovcnt].iov_base = "host.hostname";
482
optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
483
opt.uio_iovcnt++;
484
optiov[opt.uio_iovcnt].iov_base = u_hostname;
485
error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
486
&optiov[opt.uio_iovcnt].iov_len);
487
if (error) {
488
free(u_path, M_TEMP);
489
return (error);
490
}
491
opt.uio_iovcnt++;
492
if (j->jailname != NULL) {
493
optiov[opt.uio_iovcnt].iov_base = "name";
494
optiov[opt.uio_iovcnt].iov_len = sizeof("name");
495
opt.uio_iovcnt++;
496
optiov[opt.uio_iovcnt].iov_base = u_name;
497
error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
498
&optiov[opt.uio_iovcnt].iov_len);
499
if (error) {
500
free(u_path, M_TEMP);
501
return (error);
502
}
503
opt.uio_iovcnt++;
504
}
505
#ifdef INET
506
optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
507
optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
508
opt.uio_iovcnt++;
509
optiov[opt.uio_iovcnt].iov_base = u_ip4;
510
optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
511
if (j->version == 0)
512
u_ip4->s_addr = j->ip4s;
513
else {
514
error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
515
if (error) {
516
free(u_path, M_TEMP);
517
return (error);
518
}
519
}
520
opt.uio_iovcnt++;
521
#endif
522
#ifdef INET6
523
optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
524
optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
525
opt.uio_iovcnt++;
526
optiov[opt.uio_iovcnt].iov_base = u_ip6;
527
optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
528
error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
529
if (error) {
530
free(u_path, M_TEMP);
531
return (error);
532
}
533
opt.uio_iovcnt++;
534
#endif
535
KASSERT(opt.uio_iovcnt <= nitems(optiov),
536
("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
537
error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
538
free(u_path, M_TEMP);
539
return (error);
540
}
541
542
/*
543
* struct jail_set_args {
544
* struct iovec *iovp;
545
* unsigned int iovcnt;
546
* int flags;
547
* };
548
*/
549
int
550
sys_jail_set(struct thread *td, struct jail_set_args *uap)
551
{
552
struct uio *auio;
553
int error;
554
555
/* Check that we have an even number of iovecs. */
556
if (uap->iovcnt & 1)
557
return (EINVAL);
558
559
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
560
if (error)
561
return (error);
562
error = kern_jail_set(td, auio, uap->flags);
563
freeuio(auio);
564
return (error);
565
}
566
567
#if defined(INET) || defined(INET6)
568
/* Comparison callback for two addresses of one family (used with qsort). */
typedef int prison_addr_cmp_t(const void *, const void *);
568
/* Validity predicate for a single address. */
typedef bool prison_addr_valid_t(const void *);
569
/*
 * Per-address-family description, indexed by pr_family_t: the size in bytes
 * of one address, its comparison and validation callbacks, and the pr_flags
 * bit tested for that family.  Entries exist only for families compiled in.
 */
static const struct pr_family {
570
size_t size;
571
prison_addr_cmp_t *cmp;
572
prison_addr_valid_t *valid;
573
int ip_flag;
574
} pr_families[PR_FAMILY_MAX] = {
575
#ifdef INET
576
[PR_INET] = {
577
.size = sizeof(struct in_addr),
578
.cmp = prison_qcmp_v4,
579
.valid = prison_valid_v4,
580
.ip_flag = PR_IP4_USER,
581
},
582
#endif
583
#ifdef INET6
584
[PR_INET6] = {
585
.size = sizeof(struct in6_addr),
586
.cmp = prison_qcmp_v6,
587
.valid = prison_valid_v6,
588
.ip_flag = PR_IP6_USER,
589
},
590
#endif
591
};
593
594
/*
 * Network address lists (pr_addrs) allocation for jails.  The addresses
 * are accessed locklessly by the network stack, thus need to be protected by
 * the network epoch.
 *
 * ctx must stay the first member (a _Static_assert in this file checks it)
 * because the deferred free passes the epoch context pointer to free().
 * ips is the number of addresses stored in the trailing pr_ip byte array;
 * the element size depends on the address family (see pr_families).
 */
599
struct prison_ip {
600
struct epoch_context ctx;
601
uint32_t ips;
602
#ifdef FUTURE_C
603
/*
 * XXX Variable-length automatic arrays in union may be
 * supported in future C.
 */
607
union {
608
char pr_ip[];
609
struct in_addr pr_ip4[];
610
struct in6_addr pr_ip6[];
611
};
612
#else /* No future C :( */
613
char pr_ip[];
614
#endif
615
};
616
617
static char *
618
PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
619
{
620
MPASS(pip);
621
MPASS(af < PR_FAMILY_MAX);
622
MPASS(idx >= 0 && idx < pip->ips);
623
624
return (pip->pr_ip + pr_families[af].size * idx);
625
}
626
627
static struct prison_ip *
628
prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
629
{
630
struct prison_ip *pip;
631
632
pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
633
M_PRISON, flags);
634
if (pip != NULL)
635
pip->ips = cnt;
636
return (pip);
637
}
638
639
/*
640
* Allocate and copyin user supplied address list, sorting and validating.
641
* kern_jail_set() helper.
642
*/
643
static struct prison_ip *
644
prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
645
{
646
prison_addr_cmp_t *const cmp = pr_families[af].cmp;
647
const size_t size = pr_families[af].size;
648
struct prison_ip *pip;
649
650
pip = prison_ip_alloc(af, cnt, M_WAITOK);
651
bcopy(op, pip->pr_ip, cnt * size);
652
/*
653
* IP addresses are all sorted but ip[0] to preserve
654
* the primary IP address as given from userland.
655
* This special IP is used for unbound outgoing
656
* connections as well for "loopback" traffic in case
657
* source address selection cannot find any more fitting
658
* address to connect from.
659
*/
660
if (cnt > 1)
661
qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);
662
/*
663
* Check for duplicate addresses and do some simple
664
* zero and broadcast checks. If users give other bogus
665
* addresses it is their problem.
666
*/
667
for (int i = 0; i < cnt; i++) {
668
if (!pr_families[af].valid(PR_IP(pip, af, i))) {
669
free(pip, M_PRISON);
670
return (NULL);
671
}
672
if (i + 1 < cnt &&
673
(cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
674
cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) {
675
free(pip, M_PRISON);
676
return (NULL);
677
}
678
}
679
680
return (pip);
681
}
682
683
/*
684
* Allocate and dup parent prison address list.
685
* kern_jail_set() helper.
686
*/
687
static void
688
prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
689
{
690
const struct prison_ip *ppip = ppr->pr_addrs[af];
691
struct prison_ip *pip;
692
693
if (ppip != NULL) {
694
pip = prison_ip_alloc(af, ppip->ips, M_WAITOK);
695
bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size);
696
pr->pr_addrs[af] = pip;
697
}
698
}
699
700
/*
701
* Make sure the new set of IP addresses is a subset of the parent's list.
702
* Don't worry about the parent being unlocked, as any setting is done with
703
* allprison_lock held.
704
* kern_jail_set() helper.
705
*/
706
static bool
707
prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
708
const pr_family_t af)
709
{
710
prison_addr_cmp_t *const cmp = pr_families[af].cmp;
711
int i, j;
712
713
if (ppip == NULL)
714
return (false);
715
716
for (i = 0; i < ppip->ips; i++)
717
if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
718
break;
719
720
if (i == ppip->ips)
721
/* Main address not present in parent. */
722
return (false);
723
724
if (pip->ips > 1) {
725
for (i = j = 1; i < pip->ips; i++) {
726
if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
727
/* Equals to parent primary address. */
728
continue;
729
for (; j < ppip->ips; j++)
730
if (cmp(PR_IP(pip, af, i),
731
PR_IP(ppip, af, j)) == 0)
732
break;
733
if (j == ppip->ips)
734
break;
735
}
736
if (j == ppip->ips)
737
/* Address not present in parent. */
738
return (false);
739
}
740
return (true);
741
}
742
743
/*
744
* Check for conflicting IP addresses. We permit them if there is no more
745
* than one IP on each jail. If there is a duplicate on a jail with more
746
* than one IP stop checking and return error.
747
* kern_jail_set() helper.
748
*/
749
static bool
750
prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
751
struct prison_ip *pip, pr_family_t af)
752
{
753
const struct prison *tppr, *tpr;
754
int descend;
755
756
#ifdef VIMAGE
757
for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
758
if (tppr->pr_flags & PR_VNET)
759
break;
760
#else
761
tppr = &prison0;
762
#endif
763
FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
764
if (tpr == pr ||
765
#ifdef VIMAGE
766
(tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
767
#endif
768
!prison_isalive(tpr)) {
769
descend = 0;
770
continue;
771
}
772
if (!(tpr->pr_flags & pr_families[af].ip_flag))
773
continue;
774
descend = 0;
775
if (tpr->pr_addrs[af] == NULL ||
776
(pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
777
continue;
778
for (int i = 0; i < pip->ips; i++)
779
if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
780
return (false);
781
}
782
783
return (true);
784
}
785
786
_Static_assert(offsetof(struct prison_ip, ctx) == 0,
787
"prison must start with epoch context");
788
static void
789
prison_ip_free_deferred(epoch_context_t ctx)
790
{
791
792
free(ctx, M_PRISON);
793
}
794
795
static void
796
prison_ip_free(struct prison_ip *pip)
797
{
798
799
if (pip != NULL)
800
NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
801
}
802
803
static void
804
prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
805
{
806
struct prison_ip **mem, *old;
807
808
mtx_assert(&pr->pr_mtx, MA_OWNED);
809
810
mem = &pr->pr_addrs[af];
811
812
old = *mem;
813
atomic_store_ptr(mem, new);
814
prison_ip_free(old);
815
}
816
817
/*
818
* Restrict a prison's IP address list with its parent's, possibly replacing
819
* it. Return true if succeed, otherwise should redo.
820
* kern_jail_set() helper.
821
*/
822
static bool
823
prison_ip_restrict(struct prison *pr, const pr_family_t af,
824
struct prison_ip **newp)
825
{
826
struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
827
struct prison_ip *pip = pr->pr_addrs[af];
828
int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
829
const size_t size = pr_families[af].size;
830
struct prison_ip *new = newp != NULL ? *newp : NULL;
831
uint32_t ips;
832
833
mtx_assert(&pr->pr_mtx, MA_OWNED);
834
835
/*
836
* Due to epoch-synchronized access to the IP address lists we always
837
* allocate a new list even if the old one has enough space. We could
838
* atomically update an IPv4 address inside a list, but that would
839
* screw up sorting, and in case of IPv6 we can't even atomically write
840
* one.
841
*/
842
if (ppip == NULL) {
843
if (pip != NULL)
844
prison_ip_set(pr, af, NULL);
845
return (true);
846
}
847
848
if (!(pr->pr_flags & pr_families[af].ip_flag)) {
849
if (new == NULL) {
850
new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
851
if (new == NULL)
852
return (false); /* Redo */
853
}
854
/* This has no user settings, so just copy the parent's list. */
855
MPASS(new->ips == ppip->ips);
856
bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
857
prison_ip_set(pr, af, new);
858
if (newp != NULL)
859
*newp = NULL; /* Used */
860
} else if (pip != NULL) {
861
/* Remove addresses that aren't in the parent. */
862
int i;
863
864
i = 0; /* index in pip */
865
ips = 0; /* index in new */
866
867
if (new == NULL) {
868
new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
869
if (new == NULL)
870
return (false); /* Redo */
871
}
872
873
for (int pi = 0; pi < ppip->ips; pi++)
874
if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
875
/* Found our primary address in parent. */
876
bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
877
size);
878
i++;
879
ips++;
880
break;
881
}
882
for (int pi = 1; i < pip->ips; ) {
883
/* Check against primary, which is unsorted. */
884
if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
885
/* Matches parent's primary address. */
886
bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
887
size);
888
i++;
889
ips++;
890
continue;
891
}
892
/* The rest are sorted. */
893
switch (pi >= ppip->ips ? -1 :
894
cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
895
case -1:
896
i++;
897
break;
898
case 0:
899
bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
900
size);
901
i++;
902
pi++;
903
ips++;
904
break;
905
case 1:
906
pi++;
907
break;
908
}
909
}
910
if (ips == 0) {
911
if (newp == NULL || *newp == NULL)
912
prison_ip_free(new);
913
new = NULL;
914
} else {
915
/* Shrink to real size */
916
KASSERT((new->ips >= ips),
917
("Out-of-bounds write to prison_ip %p", new));
918
new->ips = ips;
919
}
920
prison_ip_set(pr, af, new);
921
if (newp != NULL)
922
*newp = NULL; /* Used */
923
}
924
return (true);
925
}
926
927
/*
928
* Fast-path check if an address belongs to a prison.
929
*/
930
int
931
prison_ip_check(const struct prison *pr, const pr_family_t af,
932
const void *addr)
933
{
934
int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
935
struct prison_ip *pip;
936
int i, a, z, d;
937
938
MPASS(mtx_owned(&pr->pr_mtx) ||
939
in_epoch(net_epoch_preempt) ||
940
sx_xlocked(&allprison_lock));
941
942
pip = atomic_load_ptr(&pr->pr_addrs[af]);
943
if (__predict_false(pip == NULL))
944
return (EAFNOSUPPORT);
945
946
/* Check the primary IP. */
947
if (cmp(PR_IP(pip, af, 0), addr) == 0)
948
return (0);
949
950
/*
951
* All the other IPs are sorted so we can do a binary search.
952
*/
953
a = 0;
954
z = pip->ips - 2;
955
while (a <= z) {
956
i = (a + z) / 2;
957
d = cmp(PR_IP(pip, af, i + 1), addr);
958
if (d > 0)
959
z = i - 1;
960
else if (d < 0)
961
a = i + 1;
962
else
963
return (0);
964
}
965
966
return (EADDRNOTAVAIL);
967
}
968
969
/*
970
* Grab primary IP. Historically required mutex, but nothing prevents
971
* us to support epoch-protected access. Is it used in fast path?
972
* in{6}_jail.c helper
973
*/
974
const void *
975
prison_ip_get0(const struct prison *pr, const pr_family_t af)
976
{
977
const struct prison_ip *pip = pr->pr_addrs[af];
978
979
mtx_assert(&pr->pr_mtx, MA_OWNED);
980
MPASS(pip);
981
982
return (pip->pr_ip);
983
}
984
985
u_int
986
prison_ip_cnt(const struct prison *pr, const pr_family_t af)
987
{
988
989
return (pr->pr_addrs[af]->ips);
990
}
991
#endif /* defined(INET) || defined(INET6) */
992
993
int
994
kern_jail_set(struct thread *td, struct uio *optuio, int flags)
995
{
996
struct file *jfp_out;
997
struct nameidata nd;
998
#ifdef INET
999
struct prison_ip *ip4;
1000
#endif
1001
#ifdef INET6
1002
struct prison_ip *ip6;
1003
#endif
1004
struct vfsopt *opt;
1005
struct vfsoptlist *opts;
1006
struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
1007
struct ucred *jdcred;
1008
struct vnode *root;
1009
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
1010
char *g_path, *osrelstr;
1011
struct bool_flags *bf;
1012
struct jailsys_flags *jsf;
1013
#if defined(INET) || defined(INET6)
1014
void *op;
1015
#endif
1016
unsigned long hid;
1017
size_t namelen, onamelen, pnamelen;
1018
int created, cuflags, descend, drflags, enforce;
1019
int error, errmsg_len, errmsg_pos;
1020
int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
1021
int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level;
1022
int childmax, osreldt, rsnum, slevel;
1023
#ifdef INET
1024
int ip4s;
1025
bool redo_ip4;
1026
#endif
1027
#ifdef INET6
1028
int ip6s;
1029
bool redo_ip6;
1030
#endif
1031
bool maybe_changed;
1032
uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
1033
uint64_t pr_allow_diff;
1034
unsigned tallow;
1035
char numbuf[12];
1036
1037
mypr = td->td_ucred->cr_prison;
1038
if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) &&
1039
mypr->pr_childmax == 0)
1040
return (EPERM);
1041
if (flags & ~JAIL_SET_MASK)
1042
return (EINVAL);
1043
if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) ==
1044
(JAIL_USE_DESC | JAIL_AT_DESC))
1045
return (EINVAL);
1046
prison_hold(mypr);
1047
1048
#ifdef INET
1049
ip4 = NULL;
1050
#endif
1051
#ifdef INET6
1052
ip6 = NULL;
1053
#endif
1054
g_path = NULL;
1055
jfp_out = NULL;
1056
jfd_out = -1;
1057
/*
1058
* Check all the parameters before committing to anything. Not all
1059
* errors can be caught early, but we may as well try. Also, this
1060
* takes care of some expensive stuff (path lookup) before getting
1061
* the allprison lock.
1062
*
1063
* XXX Jails are not filesystems, and jail parameters are not mount
1064
* options. But it makes more sense to re-use the vfsopt code
1065
* than duplicate it under a different name.
1066
*/
1067
error = vfs_buildopts(optuio, &opts);
1068
if (error)
1069
goto done_free;
1070
1071
cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1072
if (!cuflags) {
1073
error = EINVAL;
1074
vfs_opterror(opts, "no valid operation (create or update)");
1075
goto done_errmsg;
1076
}
1077
1078
error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
1079
if (error == ENOENT) {
1080
if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
1081
JAIL_OWN_DESC)) {
1082
vfs_opterror(opts, "missing desc");
1083
goto done_errmsg;
1084
}
1085
jfd_in = -1;
1086
} else if (error != 0)
1087
goto done_free;
1088
else {
1089
if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
1090
JAIL_OWN_DESC))) {
1091
vfs_opterror(opts, "unexpected desc");
1092
goto done_errmsg;
1093
}
1094
if (flags & JAIL_AT_DESC) {
1095
/*
1096
* Look up and create jails based on the
1097
* descriptor's prison.
1098
*/
1099
prison_free(mypr);
1100
error = jaildesc_find(td, jfd_in, &mypr, NULL);
1101
if (error != 0) {
1102
vfs_opterror(opts, error == ENOENT ?
1103
"descriptor to dead jail" :
1104
"not a jail descriptor");
1105
goto done_errmsg;
1106
}
1107
if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) {
1108
error = EPERM;
1109
goto done_free;
1110
}
1111
}
1112
if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
1113
/* Allocate a jail descriptor to return later. */
1114
error = jaildesc_alloc(td, &jfp_out, &jfd_out,
1115
flags & JAIL_OWN_DESC);
1116
if (error)
1117
goto done_free;
1118
}
1119
}
1120
1121
/*
1122
* Delay the permission check if using a jail descriptor,
1123
* until we get the descriptor's credentials.
1124
*/
1125
if (!(flags & JAIL_USE_DESC)) {
1126
error = priv_check(td, PRIV_JAIL_SET);
1127
if (error == 0 && (flags & JAIL_ATTACH))
1128
error = priv_check(td, PRIV_JAIL_ATTACH);
1129
if (error)
1130
goto done_free;
1131
}
1132
1133
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1134
if (error == ENOENT)
1135
jid = 0;
1136
else if (error != 0)
1137
goto done_free;
1138
1139
error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
1140
if (error == ENOENT)
1141
gotslevel = 0;
1142
else if (error != 0)
1143
goto done_free;
1144
else
1145
gotslevel = 1;
1146
1147
error =
1148
vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
1149
if (error == ENOENT)
1150
gotchildmax = 0;
1151
else if (error != 0)
1152
goto done_free;
1153
else
1154
gotchildmax = 1;
1155
1156
error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
1157
if (error == ENOENT)
1158
gotenforce = 0;
1159
else if (error != 0)
1160
goto done_free;
1161
else if (enforce < 0 || enforce > 2) {
1162
error = EINVAL;
1163
goto done_free;
1164
} else
1165
gotenforce = 1;
1166
1167
error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
1168
if (error == ENOENT)
1169
gotrsnum = 0;
1170
else if (error != 0)
1171
goto done_free;
1172
else
1173
gotrsnum = 1;
1174
1175
pr_flags = ch_flags = 0;
1176
for (bf = pr_flag_bool;
1177
bf < pr_flag_bool + nitems(pr_flag_bool);
1178
bf++) {
1179
vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
1180
vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
1181
}
1182
ch_flags |= pr_flags;
1183
for (jsf = pr_flag_jailsys;
1184
jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
1185
jsf++) {
1186
error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
1187
if (error == ENOENT)
1188
continue;
1189
if (error != 0)
1190
goto done_free;
1191
switch (jsys) {
1192
case JAIL_SYS_DISABLE:
1193
if (!jsf->disable) {
1194
error = EINVAL;
1195
goto done_free;
1196
}
1197
pr_flags |= jsf->disable;
1198
break;
1199
case JAIL_SYS_NEW:
1200
pr_flags |= jsf->new;
1201
break;
1202
case JAIL_SYS_INHERIT:
1203
break;
1204
default:
1205
error = EINVAL;
1206
goto done_free;
1207
}
1208
ch_flags |= jsf->new | jsf->disable;
1209
}
1210
if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
1211
&& !(pr_flags & PR_PERSIST)) {
1212
error = EINVAL;
1213
vfs_opterror(opts, "new jail must persist or attach");
1214
goto done_errmsg;
1215
}
1216
#ifdef VIMAGE
1217
if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
1218
error = EINVAL;
1219
vfs_opterror(opts, "vnet cannot be changed after creation");
1220
goto done_errmsg;
1221
}
1222
#endif
1223
#ifdef INET
1224
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
1225
error = EINVAL;
1226
vfs_opterror(opts, "ip4 cannot be changed after creation");
1227
goto done_errmsg;
1228
}
1229
#endif
1230
#ifdef INET6
1231
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
1232
error = EINVAL;
1233
vfs_opterror(opts, "ip6 cannot be changed after creation");
1234
goto done_errmsg;
1235
}
1236
#endif
1237
1238
pr_allow = ch_allow = 0;
1239
for (bf = pr_flag_allow;
1240
bf < pr_flag_allow + nitems(pr_flag_allow) &&
1241
atomic_load_int(&bf->flag) != 0;
1242
bf++) {
1243
vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
1244
vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
1245
}
1246
ch_allow |= pr_allow;
1247
1248
error = vfs_getopt(opts, "name", (void **)&name, &len);
1249
if (error == ENOENT)
1250
name = NULL;
1251
else if (error != 0)
1252
goto done_free;
1253
else {
1254
if (len == 0 || name[len - 1] != '\0') {
1255
error = EINVAL;
1256
goto done_free;
1257
}
1258
if (len > MAXHOSTNAMELEN) {
1259
error = ENAMETOOLONG;
1260
goto done_free;
1261
}
1262
}
1263
1264
error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
1265
if (error == ENOENT)
1266
host = NULL;
1267
else if (error != 0)
1268
goto done_free;
1269
else {
1270
ch_flags |= PR_HOST;
1271
pr_flags |= PR_HOST;
1272
if (len == 0 || host[len - 1] != '\0') {
1273
error = EINVAL;
1274
goto done_free;
1275
}
1276
if (len > MAXHOSTNAMELEN) {
1277
error = ENAMETOOLONG;
1278
goto done_free;
1279
}
1280
}
1281
1282
error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
1283
if (error == ENOENT)
1284
domain = NULL;
1285
else if (error != 0)
1286
goto done_free;
1287
else {
1288
ch_flags |= PR_HOST;
1289
pr_flags |= PR_HOST;
1290
if (len == 0 || domain[len - 1] != '\0') {
1291
error = EINVAL;
1292
goto done_free;
1293
}
1294
if (len > MAXHOSTNAMELEN) {
1295
error = ENAMETOOLONG;
1296
goto done_free;
1297
}
1298
}
1299
1300
error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
1301
if (error == ENOENT)
1302
uuid = NULL;
1303
else if (error != 0)
1304
goto done_free;
1305
else {
1306
ch_flags |= PR_HOST;
1307
pr_flags |= PR_HOST;
1308
if (len == 0 || uuid[len - 1] != '\0') {
1309
error = EINVAL;
1310
goto done_free;
1311
}
1312
if (len > HOSTUUIDLEN) {
1313
error = ENAMETOOLONG;
1314
goto done_free;
1315
}
1316
}
1317
1318
#ifdef COMPAT_FREEBSD32
1319
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1320
uint32_t hid32;
1321
1322
error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
1323
hid = hid32;
1324
} else
1325
#endif
1326
error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
1327
if (error == ENOENT)
1328
gothid = 0;
1329
else if (error != 0)
1330
goto done_free;
1331
else {
1332
gothid = 1;
1333
ch_flags |= PR_HOST;
1334
pr_flags |= PR_HOST;
1335
}
1336
1337
#ifdef INET
1338
error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
1339
if (error == ENOENT)
1340
ip4s = 0;
1341
else if (error != 0)
1342
goto done_free;
1343
else if (ip4s & (sizeof(struct in_addr) - 1)) {
1344
error = EINVAL;
1345
goto done_free;
1346
} else {
1347
ch_flags |= PR_IP4_USER;
1348
pr_flags |= PR_IP4_USER;
1349
if (ip4s > 0) {
1350
ip4s /= sizeof(struct in_addr);
1351
if (ip4s > jail_max_af_ips) {
1352
error = EINVAL;
1353
vfs_opterror(opts, "too many IPv4 addresses");
1354
goto done_errmsg;
1355
}
1356
ip4 = prison_ip_copyin(PR_INET, op, ip4s);
1357
if (ip4 == NULL) {
1358
error = EINVAL;
1359
goto done_free;
1360
}
1361
}
1362
}
1363
#endif
1364
1365
#ifdef INET6
1366
error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
1367
if (error == ENOENT)
1368
ip6s = 0;
1369
else if (error != 0)
1370
goto done_free;
1371
else if (ip6s & (sizeof(struct in6_addr) - 1)) {
1372
error = EINVAL;
1373
goto done_free;
1374
} else {
1375
ch_flags |= PR_IP6_USER;
1376
pr_flags |= PR_IP6_USER;
1377
if (ip6s > 0) {
1378
ip6s /= sizeof(struct in6_addr);
1379
if (ip6s > jail_max_af_ips) {
1380
error = EINVAL;
1381
vfs_opterror(opts, "too many IPv6 addresses");
1382
goto done_errmsg;
1383
}
1384
ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
1385
if (ip6 == NULL) {
1386
error = EINVAL;
1387
goto done_free;
1388
}
1389
}
1390
}
1391
#endif
1392
1393
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1394
if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1395
error = EINVAL;
1396
vfs_opterror(opts,
1397
"vnet jails cannot have IP address restrictions");
1398
goto done_errmsg;
1399
}
1400
#endif
1401
1402
error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
1403
if (error == ENOENT)
1404
osrelstr = NULL;
1405
else if (error != 0)
1406
goto done_free;
1407
else {
1408
if (flags & JAIL_UPDATE) {
1409
error = EINVAL;
1410
vfs_opterror(opts,
1411
"osrelease cannot be changed after creation");
1412
goto done_errmsg;
1413
}
1414
if (len == 0 || osrelstr[len - 1] != '\0') {
1415
error = EINVAL;
1416
goto done_free;
1417
}
1418
if (len >= OSRELEASELEN) {
1419
error = ENAMETOOLONG;
1420
vfs_opterror(opts,
1421
"osrelease string must be 1-%d bytes long",
1422
OSRELEASELEN - 1);
1423
goto done_errmsg;
1424
}
1425
}
1426
1427
error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1428
if (error == ENOENT)
1429
osreldt = 0;
1430
else if (error != 0)
1431
goto done_free;
1432
else {
1433
if (flags & JAIL_UPDATE) {
1434
error = EINVAL;
1435
vfs_opterror(opts,
1436
"osreldate cannot be changed after creation");
1437
goto done_errmsg;
1438
}
1439
if (osreldt == 0) {
1440
error = EINVAL;
1441
vfs_opterror(opts, "osreldate cannot be 0");
1442
goto done_errmsg;
1443
}
1444
}
1445
1446
root = NULL;
1447
error = vfs_getopt(opts, "path", (void **)&path, &len);
1448
if (error == ENOENT)
1449
path = NULL;
1450
else if (error != 0)
1451
goto done_free;
1452
else {
1453
if (flags & JAIL_UPDATE) {
1454
error = EINVAL;
1455
vfs_opterror(opts,
1456
"path cannot be changed after creation");
1457
goto done_errmsg;
1458
}
1459
if (len == 0 || path[len - 1] != '\0') {
1460
error = EINVAL;
1461
goto done_free;
1462
}
1463
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
1464
error = namei(&nd);
1465
if (error)
1466
goto done_free;
1467
root = nd.ni_vp;
1468
NDFREE_PNBUF(&nd);
1469
g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1470
strlcpy(g_path, path, MAXPATHLEN);
1471
error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1472
if (error == 0) {
1473
path = g_path;
1474
} else {
1475
/* exit on other errors */
1476
goto done_free;
1477
}
1478
if (root->v_type != VDIR) {
1479
error = ENOTDIR;
1480
vput(root);
1481
goto done_free;
1482
}
1483
VOP_UNLOCK(root);
1484
}
1485
1486
/*
1487
* Find the specified jail, or at least its parent.
1488
* This abuses the file error codes ENOENT and EEXIST.
1489
*/
1490
pr = NULL;
1491
inspr = NULL;
1492
deadpr = NULL;
1493
maybe_changed = false;
1494
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1495
namelc = strrchr(name, '.');
1496
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1497
if (*p != '\0')
1498
jid = 0;
1499
}
1500
sx_xlock(&allprison_lock);
1501
drflags = PD_LIST_XLOCKED;
1502
ppr = mypr;
1503
if (!prison_isalive(ppr)) {
1504
/* This jail is dying. This process will surely follow. */
1505
error = EAGAIN;
1506
goto done_deref;
1507
}
1508
if (flags & JAIL_USE_DESC) {
1509
/* Get the jail from its descriptor. */
1510
error = jaildesc_find(td, jfd_in, &pr, &jdcred);
1511
if (error) {
1512
vfs_opterror(opts, error == ENOENT ?
1513
"descriptor to dead jail" :
1514
"not a jail descriptor");
1515
goto done_deref;
1516
}
1517
drflags |= PD_DEREF;
1518
error = priv_check_cred(jdcred, PRIV_JAIL_SET);
1519
if (error == 0 && (flags & JAIL_ATTACH))
1520
error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
1521
crfree(jdcred);
1522
if (error)
1523
goto done_deref;
1524
mtx_lock(&pr->pr_mtx);
1525
drflags |= PD_LOCKED;
1526
if (cuflags == JAIL_CREATE) {
1527
error = EEXIST;
1528
vfs_opterror(opts, "jail %d already exists",
1529
pr->pr_id);
1530
goto done_deref;
1531
}
1532
if (!prison_isalive(pr)) {
1533
/* While a jid can be resurrected, the prison
1534
* itself cannot.
1535
*/
1536
error = ENOENT;
1537
vfs_opterror(opts, "jail %d is dying", pr->pr_id);
1538
goto done_deref;
1539
}
1540
if (jid != 0 && jid != pr->pr_id) {
1541
error = EINVAL;
1542
vfs_opterror(opts, "cannot change jid");
1543
goto done_deref;
1544
}
1545
jid = pr->pr_id;
1546
} else if (jid != 0) {
1547
if (jid < 0) {
1548
error = EINVAL;
1549
vfs_opterror(opts, "negative jid");
1550
goto done_deref;
1551
}
1552
/*
1553
* See if a requested jid already exists. Keep track of
1554
* where it can be inserted later.
1555
*/
1556
TAILQ_FOREACH(inspr, &allprison, pr_list) {
1557
if (inspr->pr_id < jid)
1558
continue;
1559
if (inspr->pr_id > jid)
1560
break;
1561
if (prison_isalive(inspr)) {
1562
pr = inspr;
1563
mtx_lock(&pr->pr_mtx);
1564
drflags |= PD_LOCKED;
1565
} else {
1566
/* Note a dying jail to handle later. */
1567
deadpr = inspr;
1568
}
1569
inspr = NULL;
1570
break;
1571
}
1572
if (cuflags == JAIL_CREATE && pr != NULL) {
1573
/*
1574
* Even creators that cannot see the jail will
1575
* get EEXIST.
1576
*/
1577
error = EEXIST;
1578
vfs_opterror(opts, "jail %d already exists", jid);
1579
goto done_deref;
1580
}
1581
if ((pr == NULL)
1582
? cuflags == JAIL_UPDATE
1583
: !prison_ischild(mypr, pr)) {
1584
/*
1585
* Updaters get ENOENT for nonexistent jails,
1586
* or for jails they cannot see. The latter
1587
* case is true even for CREATE | UPDATE,
1588
* which normally cannot give this error.
1589
*/
1590
error = ENOENT;
1591
vfs_opterror(opts, "jail %d not found", jid);
1592
goto done_deref;
1593
}
1594
}
1595
/*
1596
* If the caller provided a name, look for a jail by that name.
1597
* This has different semantics for creates and updates keyed by jid
1598
* (where the name must not already exist in a different jail),
1599
* and updates keyed by the name itself (where the name must exist
1600
* because that is the jail being updated).
1601
*/
1602
namelc = NULL;
1603
if (name != NULL) {
1604
namelc = strrchr(name, '.');
1605
if (namelc == NULL)
1606
namelc = name;
1607
else {
1608
/*
1609
* This is a hierarchical name. Split it into the
1610
* parent and child names, and make sure the parent
1611
* exists or matches an already found jail.
1612
*/
1613
if (pr != NULL) {
1614
if (strncmp(name, ppr->pr_name, namelc - name)
1615
|| ppr->pr_name[namelc - name] != '\0') {
1616
error = EINVAL;
1617
vfs_opterror(opts,
1618
"cannot change jail's parent");
1619
goto done_deref;
1620
}
1621
} else {
1622
*namelc = '\0';
1623
ppr = prison_find_name(mypr, name);
1624
if (ppr == NULL) {
1625
error = ENOENT;
1626
vfs_opterror(opts,
1627
"jail \"%s\" not found", name);
1628
goto done_deref;
1629
}
1630
mtx_unlock(&ppr->pr_mtx);
1631
if (!prison_isalive(ppr)) {
1632
error = ENOENT;
1633
vfs_opterror(opts,
1634
"jail \"%s\" is dying", name);
1635
goto done_deref;
1636
}
1637
*namelc = '.';
1638
}
1639
namelc++;
1640
}
1641
if (namelc[0] != '\0') {
1642
pnamelen =
1643
(ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1644
FOREACH_PRISON_CHILD(ppr, tpr) {
1645
if (tpr == pr || !prison_isalive(tpr) ||
1646
strcmp(tpr->pr_name + pnamelen, namelc))
1647
continue;
1648
if (cuflags == JAIL_CREATE || pr != NULL) {
1649
/*
1650
* Create, or update(jid): name must
1651
* not exist in an active sibling jail.
1652
*/
1653
error = EEXIST;
1654
vfs_opterror(opts,
1655
"jail \"%s\" already exists", name);
1656
goto done_deref;
1657
}
1658
/* Use this jail for updates. */
1659
pr = tpr;
1660
mtx_lock(&pr->pr_mtx);
1661
drflags |= PD_LOCKED;
1662
break;
1663
}
1664
/*
1665
* Update: name must exist if no jid is specified.
1666
* As with the jid case, the jail must be currently
1667
* visible, or else even CREATE | UPDATE will get
1668
* an error.
1669
*/
1670
if ((pr == NULL)
1671
? cuflags == JAIL_UPDATE
1672
: !prison_isalive(pr)) {
1673
error = ENOENT;
1674
vfs_opterror(opts, "jail \"%s\" not found",
1675
name);
1676
goto done_deref;
1677
}
1678
}
1679
}
1680
/* Update: must provide a desc, jid, or name. */
1681
else if (cuflags == JAIL_UPDATE && pr == NULL) {
1682
error = ENOENT;
1683
vfs_opterror(opts, "update specified no jail");
1684
goto done_deref;
1685
}
1686
1687
/* If there's no prison to update, create a new one and link it in. */
1688
created = pr == NULL;
1689
if (created) {
1690
for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1691
if (tpr->pr_childcount >= tpr->pr_childmax) {
1692
error = EPERM;
1693
vfs_opterror(opts, "prison limit exceeded");
1694
goto done_deref;
1695
}
1696
1697
if (deadpr != NULL) {
1698
/*
1699
* The prison being created has the same ID as a dying
1700
* one. Handle this by giving the dying jail a new ID.
1701
* This may cause some confusion to user space, but
1702
* only to those listing dying jails.
1703
*/
1704
deadid = get_next_deadid(&dinspr);
1705
if (deadid == 0) {
1706
error = EAGAIN;
1707
vfs_opterror(opts, "no available jail IDs");
1708
goto done_deref;
1709
}
1710
mtx_lock(&deadpr->pr_mtx);
1711
deadpr->pr_id = deadid;
1712
mtx_unlock(&deadpr->pr_mtx);
1713
if (dinspr == deadpr)
1714
inspr = deadpr;
1715
else {
1716
inspr = TAILQ_NEXT(deadpr, pr_list);
1717
TAILQ_REMOVE(&allprison, deadpr, pr_list);
1718
if (dinspr != NULL)
1719
TAILQ_INSERT_AFTER(&allprison, dinspr,
1720
deadpr, pr_list);
1721
else
1722
TAILQ_INSERT_HEAD(&allprison, deadpr,
1723
pr_list);
1724
}
1725
}
1726
if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1727
error = EAGAIN;
1728
vfs_opterror(opts, "no available jail IDs");
1729
goto done_deref;
1730
}
1731
1732
pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1733
pr->pr_state = PRISON_STATE_INVALID;
1734
refcount_init(&pr->pr_ref, 1);
1735
refcount_init(&pr->pr_uref, 0);
1736
drflags |= PD_DEREF;
1737
LIST_INIT(&pr->pr_children);
1738
mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1739
TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1740
1741
pr->pr_id = jid;
1742
if (inspr != NULL)
1743
TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1744
else
1745
TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1746
1747
pr->pr_parent = ppr;
1748
prison_hold(ppr);
1749
prison_proc_hold(ppr);
1750
LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1751
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1752
tpr->pr_childcount++;
1753
pr->pr_klist = knlist_alloc(&pr->pr_mtx);
1754
1755
/* Set some default values, and inherit some from the parent. */
1756
if (namelc == NULL)
1757
namelc = "";
1758
if (path == NULL) {
1759
path = "/";
1760
root = mypr->pr_root;
1761
vref(root);
1762
}
1763
strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1764
pr->pr_flags |= PR_HOST;
1765
#if defined(INET) || defined(INET6)
1766
#ifdef VIMAGE
1767
if (!(pr_flags & PR_VNET))
1768
#endif
1769
{
1770
#ifdef INET
1771
if (!(ch_flags & PR_IP4_USER))
1772
pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1773
else if (!(pr_flags & PR_IP4_USER)) {
1774
pr->pr_flags |= ppr->pr_flags & PR_IP4;
1775
prison_ip_dup(ppr, pr, PR_INET);
1776
}
1777
#endif
1778
#ifdef INET6
1779
if (!(ch_flags & PR_IP6_USER))
1780
pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1781
else if (!(pr_flags & PR_IP6_USER)) {
1782
pr->pr_flags |= ppr->pr_flags & PR_IP6;
1783
prison_ip_dup(ppr, pr, PR_INET6);
1784
}
1785
#endif
1786
}
1787
#endif
1788
/* Source address selection is always on by default. */
1789
pr->pr_flags |= _PR_IP_SADDRSEL;
1790
1791
pr->pr_securelevel = ppr->pr_securelevel;
1792
pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1793
pr->pr_enforce_statfs = jail_default_enforce_statfs;
1794
pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1795
1796
pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1797
if (osrelstr == NULL)
1798
strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1799
sizeof(pr->pr_osrelease));
1800
else
1801
strlcpy(pr->pr_osrelease, osrelstr,
1802
sizeof(pr->pr_osrelease));
1803
1804
#ifdef VIMAGE
1805
/*
1806
* Allocate a new vnet if specified.
1807
*
1808
* Set PR_VNET now if so, so that the vnet is disposed of
1809
* properly when the jail is destroyed.
1810
*/
1811
if (pr_flags & PR_VNET) {
1812
pr->pr_flags |= PR_VNET;
1813
pr->pr_vnet = vnet_alloc();
1814
} else {
1815
pr->pr_vnet = ppr->pr_vnet;
1816
}
1817
#endif
1818
/*
1819
* Allocate a dedicated cpuset for each jail.
1820
* Unlike other initial settings, this may return an error.
1821
*/
1822
error = cpuset_create_root(ppr, &pr->pr_cpuset);
1823
if (error)
1824
goto done_deref;
1825
1826
mtx_lock(&pr->pr_mtx);
1827
drflags |= PD_LOCKED;
1828
} else {
1829
/*
1830
* Grab a reference for existing prisons, to ensure they
1831
* continue to exist for the duration of the call.
1832
*/
1833
if (!(drflags & PD_DEREF)) {
1834
prison_hold(pr);
1835
drflags |= PD_DEREF;
1836
}
1837
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
1838
if ((pr->pr_flags & PR_VNET) &&
1839
(ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1840
error = EINVAL;
1841
vfs_opterror(opts,
1842
"vnet jails cannot have IP address restrictions");
1843
goto done_deref;
1844
}
1845
#endif
1846
#ifdef INET
1847
if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1848
error = EINVAL;
1849
vfs_opterror(opts,
1850
"ip4 cannot be changed after creation");
1851
goto done_deref;
1852
}
1853
#endif
1854
#ifdef INET6
1855
if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1856
error = EINVAL;
1857
vfs_opterror(opts,
1858
"ip6 cannot be changed after creation");
1859
goto done_deref;
1860
}
1861
#endif
1862
}
1863
1864
/* Do final error checking before setting anything. */
1865
if (gotslevel) {
1866
if (slevel < ppr->pr_securelevel) {
1867
error = EPERM;
1868
goto done_deref;
1869
}
1870
}
1871
if (gotchildmax) {
1872
if (childmax >= ppr->pr_childmax) {
1873
error = EPERM;
1874
goto done_deref;
1875
}
1876
}
1877
if (gotenforce) {
1878
if (enforce < ppr->pr_enforce_statfs) {
1879
error = EPERM;
1880
goto done_deref;
1881
}
1882
}
1883
if (gotrsnum) {
1884
/*
1885
* devfs_rsnum is a uint16_t
1886
*/
1887
if (rsnum < 0 || rsnum > 65535) {
1888
error = EINVAL;
1889
goto done_deref;
1890
}
1891
/*
1892
* Nested jails always inherit parent's devfs ruleset
1893
*/
1894
if (jailed(td->td_ucred)) {
1895
if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1896
error = EPERM;
1897
goto done_deref;
1898
} else
1899
rsnum = ppr->pr_devfs_rsnum;
1900
}
1901
}
1902
#ifdef INET
1903
if (ip4s > 0) {
1904
if ((ppr->pr_flags & PR_IP4) &&
1905
!prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
1906
PR_INET)) {
1907
error = EPERM;
1908
goto done_deref;
1909
}
1910
if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
1911
error = EADDRINUSE;
1912
vfs_opterror(opts, "IPv4 addresses clash");
1913
goto done_deref;
1914
}
1915
}
1916
#endif
1917
#ifdef INET6
1918
if (ip6s > 0) {
1919
if ((ppr->pr_flags & PR_IP6) &&
1920
!prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
1921
PR_INET6)) {
1922
error = EPERM;
1923
goto done_deref;
1924
}
1925
if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
1926
error = EADDRINUSE;
1927
vfs_opterror(opts, "IPv6 addresses clash");
1928
goto done_deref;
1929
}
1930
}
1931
#endif
1932
onamelen = namelen = 0;
1933
if (namelc != NULL) {
1934
/* Give a default name of the jid. Also allow the name to be
1935
* explicitly the jid - but not any other number, and only in
1936
* normal form (no leading zero/etc).
1937
*/
1938
if (namelc[0] == '\0')
1939
snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1940
else if ((strtoul(namelc, &p, 10) != jid ||
1941
namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1942
error = EINVAL;
1943
vfs_opterror(opts,
1944
"name cannot be numeric (unless it is the jid)");
1945
goto done_deref;
1946
}
1947
/*
1948
* Make sure the name isn't too long for the prison or its
1949
* children.
1950
*/
1951
pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1952
onamelen = strlen(pr->pr_name + pnamelen);
1953
namelen = strlen(namelc);
1954
if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1955
error = ENAMETOOLONG;
1956
goto done_deref;
1957
}
1958
FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1959
if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1960
sizeof(pr->pr_name)) {
1961
error = ENAMETOOLONG;
1962
goto done_deref;
1963
}
1964
}
1965
}
1966
pr_allow_diff = pr_allow & ~ppr->pr_allow;
1967
if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1968
error = EPERM;
1969
goto done_deref;
1970
}
1971
1972
/*
1973
* Let modules check their parameters. This requires unlocking and
1974
* then re-locking the prison, but this is still a valid state as long
1975
* as allprison_lock remains xlocked.
1976
*/
1977
mtx_unlock(&pr->pr_mtx);
1978
drflags &= ~PD_LOCKED;
1979
error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1980
if (error != 0)
1981
goto done_deref;
1982
mtx_lock(&pr->pr_mtx);
1983
drflags |= PD_LOCKED;
1984
1985
/* At this point, all valid parameters should have been noted. */
1986
TAILQ_FOREACH(opt, opts, link) {
1987
if (!opt->seen && strcmp(opt->name, "errmsg")) {
1988
error = EINVAL;
1989
vfs_opterror(opts, "unknown parameter: %s", opt->name);
1990
goto done_deref;
1991
}
1992
}
1993
maybe_changed = true;
1994
1995
/* Set the parameters of the prison. */
1996
#ifdef INET
1997
redo_ip4 = false;
1998
if (pr_flags & PR_IP4_USER) {
1999
pr->pr_flags |= PR_IP4;
2000
prison_ip_set(pr, PR_INET, ip4);
2001
ip4 = NULL;
2002
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2003
#ifdef VIMAGE
2004
if (tpr->pr_flags & PR_VNET) {
2005
descend = 0;
2006
continue;
2007
}
2008
#endif
2009
if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
2010
redo_ip4 = true;
2011
descend = 0;
2012
}
2013
}
2014
}
2015
#endif
2016
#ifdef INET6
2017
redo_ip6 = false;
2018
if (pr_flags & PR_IP6_USER) {
2019
pr->pr_flags |= PR_IP6;
2020
prison_ip_set(pr, PR_INET6, ip6);
2021
ip6 = NULL;
2022
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2023
#ifdef VIMAGE
2024
if (tpr->pr_flags & PR_VNET) {
2025
descend = 0;
2026
continue;
2027
}
2028
#endif
2029
if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
2030
redo_ip6 = true;
2031
descend = 0;
2032
}
2033
}
2034
}
2035
#endif
2036
if (gotslevel) {
2037
pr->pr_securelevel = slevel;
2038
/* Set all child jails to be at least this level. */
2039
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
2040
if (tpr->pr_securelevel < slevel)
2041
tpr->pr_securelevel = slevel;
2042
}
2043
if (gotchildmax) {
2044
pr->pr_childmax = childmax;
2045
/* Set all child jails to under this limit. */
2046
FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
2047
if (tpr->pr_childmax > childmax - level)
2048
tpr->pr_childmax = childmax > level
2049
? childmax - level : 0;
2050
}
2051
if (gotenforce) {
2052
pr->pr_enforce_statfs = enforce;
2053
/* Pass this restriction on to the children. */
2054
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
2055
if (tpr->pr_enforce_statfs < enforce)
2056
tpr->pr_enforce_statfs = enforce;
2057
}
2058
if (gotrsnum) {
2059
pr->pr_devfs_rsnum = rsnum;
2060
/* Pass this restriction on to the children. */
2061
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
2062
tpr->pr_devfs_rsnum = rsnum;
2063
}
2064
if (namelc != NULL) {
2065
if (ppr == &prison0)
2066
strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
2067
else
2068
snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
2069
ppr->pr_name, namelc);
2070
/* Change this component of child names. */
2071
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2072
bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
2073
strlen(tpr->pr_name + onamelen) + 1);
2074
bcopy(pr->pr_name, tpr->pr_name, namelen);
2075
}
2076
}
2077
if (path != NULL) {
2078
/* Try to keep a real-rooted full pathname. */
2079
strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
2080
pr->pr_root = root;
2081
root = NULL;
2082
}
2083
if (PR_HOST & ch_flags & ~pr_flags) {
2084
if (pr->pr_flags & PR_HOST) {
2085
/*
2086
* Copy the parent's host info. As with pr_ip4 above,
2087
* the lack of a lock on the parent is not a problem;
2088
* it is always set with allprison_lock at least
2089
* shared, and is held exclusively here.
2090
*/
2091
strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
2092
sizeof(pr->pr_hostname));
2093
strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
2094
sizeof(pr->pr_domainname));
2095
strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
2096
sizeof(pr->pr_hostuuid));
2097
pr->pr_hostid = pr->pr_parent->pr_hostid;
2098
}
2099
} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
2100
/* Set this prison, and any descendants without PR_HOST. */
2101
if (host != NULL)
2102
strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
2103
if (domain != NULL)
2104
strlcpy(pr->pr_domainname, domain,
2105
sizeof(pr->pr_domainname));
2106
if (uuid != NULL)
2107
strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
2108
if (gothid)
2109
pr->pr_hostid = hid;
2110
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2111
if (tpr->pr_flags & PR_HOST)
2112
descend = 0;
2113
else {
2114
if (host != NULL)
2115
strlcpy(tpr->pr_hostname,
2116
pr->pr_hostname,
2117
sizeof(tpr->pr_hostname));
2118
if (domain != NULL)
2119
strlcpy(tpr->pr_domainname,
2120
pr->pr_domainname,
2121
sizeof(tpr->pr_domainname));
2122
if (uuid != NULL)
2123
strlcpy(tpr->pr_hostuuid,
2124
pr->pr_hostuuid,
2125
sizeof(tpr->pr_hostuuid));
2126
if (gothid)
2127
tpr->pr_hostid = hid;
2128
}
2129
}
2130
}
2131
pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
2132
if ((tallow = ch_allow & ~pr_allow))
2133
prison_set_allow_locked(pr, tallow, 0);
2134
/*
2135
* Persistent prisons get an extra reference, and prisons losing their
2136
* persist flag lose that reference.
2137
*/
2138
if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
2139
if (pr_flags & PR_PERSIST) {
2140
prison_hold(pr);
2141
/*
2142
* This may be a new prison's first user reference,
2143
* but wait to call it alive until after OSD calls
2144
* have had a chance to run (and perhaps to fail).
2145
*/
2146
refcount_acquire(&pr->pr_uref);
2147
} else {
2148
drflags |= PD_DEUREF;
2149
prison_free_not_last(pr);
2150
}
2151
}
2152
pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
2153
mtx_unlock(&pr->pr_mtx);
2154
drflags &= ~PD_LOCKED;
2155
/*
2156
* Any errors past this point will need to de-persist newly created
2157
* prisons, as well as call remove methods.
2158
*/
2159
if (created)
2160
drflags |= PD_KILL;
2161
2162
#ifdef RACCT
2163
if (racct_enable && created)
2164
prison_racct_attach(pr);
2165
#endif
2166
2167
/* Locks may have prevented a complete restriction of child IP
2168
* addresses. If so, allocate some more memory and try again.
2169
*/
2170
#ifdef INET
2171
while (redo_ip4) {
2172
ip4s = pr->pr_addrs[PR_INET]->ips;
2173
MPASS(ip4 == NULL);
2174
ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
2175
mtx_lock(&pr->pr_mtx);
2176
redo_ip4 = false;
2177
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2178
#ifdef VIMAGE
2179
if (tpr->pr_flags & PR_VNET) {
2180
descend = 0;
2181
continue;
2182
}
2183
#endif
2184
if (!prison_ip_restrict(tpr, PR_INET, &ip4))
2185
redo_ip4 = true;
2186
}
2187
mtx_unlock(&pr->pr_mtx);
2188
}
2189
#endif
2190
#ifdef INET6
2191
while (redo_ip6) {
2192
ip6s = pr->pr_addrs[PR_INET6]->ips;
2193
MPASS(ip6 == NULL);
2194
ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
2195
mtx_lock(&pr->pr_mtx);
2196
redo_ip6 = false;
2197
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2198
#ifdef VIMAGE
2199
if (tpr->pr_flags & PR_VNET) {
2200
descend = 0;
2201
continue;
2202
}
2203
#endif
2204
if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
2205
redo_ip6 = true;
2206
}
2207
mtx_unlock(&pr->pr_mtx);
2208
}
2209
#endif
2210
2211
/* Let the modules do their work. */
2212
if (created) {
2213
error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
2214
if (error)
2215
goto done_deref;
2216
}
2217
error = osd_jail_call(pr, PR_METHOD_SET, opts);
2218
if (error)
2219
goto done_deref;
2220
2221
/*
2222
* A new prison is now ready to be seen; either it has gained a user
2223
* reference via persistence, or is about to gain one via attachment.
2224
*/
2225
if (created) {
2226
sx_assert(&allprison_lock, SX_XLOCKED);
2227
prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id);
2228
mtx_lock(&pr->pr_mtx);
2229
drflags |= PD_LOCKED;
2230
pr->pr_state = PRISON_STATE_ALIVE;
2231
}
2232
2233
/* Attach this process to the prison if requested. */
2234
if (flags & JAIL_ATTACH) {
2235
error = do_jail_attach(td, pr,
2236
prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
2237
drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
2238
if (error) {
2239
vfs_opterror(opts, "attach failed");
2240
goto done_deref;
2241
}
2242
}
2243
2244
#ifdef RACCT
2245
if (racct_enable && !created) {
2246
if (drflags & PD_LOCKED) {
2247
mtx_unlock(&pr->pr_mtx);
2248
drflags &= ~PD_LOCKED;
2249
}
2250
if (drflags & PD_LIST_XLOCKED) {
2251
sx_xunlock(&allprison_lock);
2252
drflags &= ~PD_LIST_XLOCKED;
2253
}
2254
prison_racct_modify(pr);
2255
}
2256
#endif
2257
2258
if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
2259
(pr->pr_root->v_vflag & VV_ROOT) == 0)
2260
printf("Warning jail jid=%d: mountd/nfsd requires a separate"
2261
" file system\n", pr->pr_id);
2262
2263
/*
2264
* Now that the prison is fully created without error, set the
2265
* jail descriptor if one was requested. This is the only
2266
* parameter that is returned to the caller (except the error
2267
* message).
2268
*/
2269
if (jfd_out >= 0) {
2270
if (!(drflags & PD_LOCKED)) {
2271
mtx_lock(&pr->pr_mtx);
2272
drflags |= PD_LOCKED;
2273
}
2274
jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1;
2275
if (optuio->uio_segflg == UIO_SYSSPACE)
2276
*(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out;
2277
else
2278
(void)copyout(&jfd_out,
2279
optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out));
2280
jaildesc_set_prison(jfp_out, pr);
2281
}
2282
2283
drflags &= ~PD_KILL;
2284
td->td_retval[0] = pr->pr_id;
2285
2286
done_deref:
2287
/*
2288
* Report changes to kevent. This can happen even if the
2289
* system call fails, as changes might have been made before
2290
* the failure.
2291
*/
2292
if (maybe_changed && !created)
2293
prison_knote(pr, NOTE_JAIL_SET);
2294
/* Release any temporary prison holds and/or locks. */
2295
if (pr != NULL)
2296
prison_deref(pr, drflags);
2297
else if (drflags & PD_LIST_SLOCKED)
2298
sx_sunlock(&allprison_lock);
2299
else if (drflags & PD_LIST_XLOCKED)
2300
sx_xunlock(&allprison_lock);
2301
if (root != NULL)
2302
vrele(root);
2303
done_errmsg:
2304
if (error) {
2305
/* Write the error message back to userspace. */
2306
if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
2307
&errmsg_len) == 0 && errmsg_len > 0) {
2308
errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
2309
if (optuio->uio_segflg == UIO_SYSSPACE)
2310
bcopy(errmsg,
2311
optuio->uio_iov[errmsg_pos].iov_base,
2312
errmsg_len);
2313
else
2314
(void)copyout(errmsg,
2315
optuio->uio_iov[errmsg_pos].iov_base,
2316
errmsg_len);
2317
}
2318
}
2319
done_free:
2320
/* Clean up other resources. */
2321
#ifdef INET
2322
prison_ip_free(ip4);
2323
#endif
2324
#ifdef INET6
2325
prison_ip_free(ip6);
2326
#endif
2327
if (jfp_out != NULL)
2328
fdrop(jfp_out, td);
2329
if (error && jfd_out >= 0)
2330
(void)kern_close(td, jfd_out);
2331
if (g_path != NULL)
2332
free(g_path, M_TEMP);
2333
vfs_freeopts(opts);
2334
prison_free(mypr);
2335
return (error);
2336
}
2337
2338
/*
2339
* Find the next available prison ID. Return the ID on success, or zero
2340
* on failure. Also set a pointer to the allprison list entry the prison
2341
* should be inserted before.
2342
*/
2343
static int
2344
get_next_prid(struct prison **insprp)
2345
{
2346
struct prison *inspr;
2347
int jid, maxid;
2348
2349
jid = lastprid % JAIL_MAX + 1;
2350
if (TAILQ_EMPTY(&allprison) ||
2351
TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
2352
/*
2353
* A common case is for all jails to be implicitly numbered,
2354
* which means they'll go on the end of the list, at least
2355
* for the first JAIL_MAX times.
2356
*/
2357
inspr = NULL;
2358
} else {
2359
/*
2360
* Take two passes through the allprison list: first starting
2361
* with the proposed jid, then ending with it.
2362
*/
2363
for (maxid = JAIL_MAX; maxid != 0; ) {
2364
TAILQ_FOREACH(inspr, &allprison, pr_list) {
2365
if (inspr->pr_id < jid)
2366
continue;
2367
if (inspr->pr_id > jid) {
2368
/* Found an opening. */
2369
maxid = 0;
2370
break;
2371
}
2372
if (++jid > maxid) {
2373
if (lastprid == maxid || lastprid == 0)
2374
{
2375
/*
2376
* The entire legal range
2377
* has been traversed
2378
*/
2379
return 0;
2380
}
2381
/* Try again from the start. */
2382
jid = 1;
2383
maxid = lastprid;
2384
break;
2385
}
2386
}
2387
if (inspr == NULL) {
2388
/* Found room at the end of the list. */
2389
break;
2390
}
2391
}
2392
}
2393
*insprp = inspr;
2394
lastprid = jid;
2395
return (jid);
2396
}
2397
2398
/*
2399
* Find the next available ID for a renumbered dead prison. This is the same
2400
* as get_next_prid, but counting backward from the end of the range.
2401
*/
2402
static int
2403
get_next_deadid(struct prison **dinsprp)
2404
{
2405
struct prison *dinspr;
2406
int deadid, minid;
2407
2408
deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX;
2409
/*
2410
* Take two reverse passes through the allprison list: first
2411
* starting with the proposed deadid, then ending with it.
2412
*/
2413
for (minid = 1; minid != 0; ) {
2414
TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) {
2415
if (dinspr->pr_id > deadid)
2416
continue;
2417
if (dinspr->pr_id < deadid) {
2418
/* Found an opening. */
2419
minid = 0;
2420
break;
2421
}
2422
if (--deadid < minid) {
2423
if (lastdeadid == minid || lastdeadid == 0)
2424
{
2425
/*
2426
* The entire legal range
2427
* has been traversed
2428
*/
2429
return 0;
2430
}
2431
/* Try again from the end. */
2432
deadid = JAIL_MAX;
2433
minid = lastdeadid;
2434
break;
2435
}
2436
}
2437
if (dinspr == NULL) {
2438
/* Found room at the beginning of the list. */
2439
break;
2440
}
2441
}
2442
*dinsprp = dinspr;
2443
lastdeadid = deadid;
2444
return (deadid);
2445
}
2446
2447
/*
2448
* struct jail_get_args {
2449
* struct iovec *iovp;
2450
* unsigned int iovcnt;
2451
* int flags;
2452
* };
2453
*/
2454
int
2455
sys_jail_get(struct thread *td, struct jail_get_args *uap)
2456
{
2457
struct uio *auio;
2458
int error;
2459
2460
/* Check that we have an even number of iovecs. */
2461
if (uap->iovcnt & 1)
2462
return (EINVAL);
2463
2464
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2465
if (error)
2466
return (error);
2467
error = kern_jail_get(td, auio, uap->flags);
2468
if (error == 0)
2469
error = copyout(auio->uio_iov, uap->iovp,
2470
uap->iovcnt * sizeof(struct iovec));
2471
freeuio(auio);
2472
return (error);
2473
}
2474
2475
int
2476
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2477
{
2478
struct bool_flags *bf;
2479
struct file *jfp_out;
2480
struct jailsys_flags *jsf;
2481
struct prison *pr, *mypr;
2482
struct vfsopt *opt;
2483
struct vfsoptlist *opts;
2484
char *errmsg, *name;
2485
int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2486
int jfd_in, jfd_out;
2487
unsigned f;
2488
2489
if (flags & ~JAIL_GET_MASK)
2490
return (EINVAL);
2491
if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) ==
2492
(JAIL_USE_DESC | JAIL_AT_DESC))
2493
return (EINVAL);
2494
2495
/* Get the parameter list. */
2496
error = vfs_buildopts(optuio, &opts);
2497
if (error)
2498
return (error);
2499
errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2500
mypr = td->td_ucred->cr_prison;
2501
prison_hold(mypr);
2502
pr = NULL;
2503
jfp_out = NULL;
2504
jfd_out = -1;
2505
2506
/*
2507
* Find the prison specified by one of: desc, lastjid, jid, name.
2508
*/
2509
sx_slock(&allprison_lock);
2510
drflags = PD_LIST_SLOCKED;
2511
2512
error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
2513
if (error == ENOENT) {
2514
if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) {
2515
vfs_opterror(opts, "missing desc");
2516
goto done;
2517
}
2518
} else if (error == 0) {
2519
if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
2520
JAIL_OWN_DESC))) {
2521
vfs_opterror(opts, "unexpected desc");
2522
goto done;
2523
}
2524
if (flags & JAIL_USE_DESC) {
2525
/* Get the jail from its descriptor. */
2526
error = jaildesc_find(td, jfd_in, &pr, NULL);
2527
if (error) {
2528
vfs_opterror(opts, error == ENOENT ?
2529
"descriptor to dead jail" :
2530
"not a jail descriptor");
2531
goto done;
2532
}
2533
drflags |= PD_DEREF;
2534
mtx_lock(&pr->pr_mtx);
2535
drflags |= PD_LOCKED;
2536
if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2537
error = ENOENT;
2538
vfs_opterror(opts, "jail %d is dying",
2539
pr->pr_id);
2540
goto done;
2541
}
2542
goto found_prison;
2543
}
2544
if (flags & JAIL_AT_DESC) {
2545
/* Look up jails based on the descriptor's prison. */
2546
prison_free(mypr);
2547
error = jaildesc_find(td, jfd_in, &mypr, NULL);
2548
if (error != 0) {
2549
vfs_opterror(opts, error == ENOENT ?
2550
"descriptor to dead jail" :
2551
"not a jail descriptor");
2552
goto done;
2553
}
2554
}
2555
if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
2556
/* Allocate a jail descriptor to return later. */
2557
error = jaildesc_alloc(td, &jfp_out, &jfd_out,
2558
flags & JAIL_OWN_DESC);
2559
if (error)
2560
goto done;
2561
}
2562
} else
2563
goto done;
2564
2565
error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2566
if (error == 0) {
2567
TAILQ_FOREACH(pr, &allprison, pr_list) {
2568
if (pr->pr_id > jid &&
2569
((flags & JAIL_DYING) || prison_isalive(pr)) &&
2570
prison_ischild(mypr, pr)) {
2571
mtx_lock(&pr->pr_mtx);
2572
drflags |= PD_LOCKED;
2573
goto found_prison;
2574
}
2575
}
2576
error = ENOENT;
2577
vfs_opterror(opts, "no jail after %d", jid);
2578
goto done;
2579
} else if (error != ENOENT)
2580
goto done;
2581
2582
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2583
if (error == 0) {
2584
if (jid != 0) {
2585
pr = prison_find_child(mypr, jid);
2586
if (pr != NULL) {
2587
drflags |= PD_LOCKED;
2588
if (!(prison_isalive(pr) ||
2589
(flags & JAIL_DYING))) {
2590
error = ENOENT;
2591
vfs_opterror(opts, "jail %d is dying",
2592
jid);
2593
goto done;
2594
}
2595
goto found_prison;
2596
}
2597
error = ENOENT;
2598
vfs_opterror(opts, "jail %d not found", jid);
2599
goto done;
2600
}
2601
} else if (error != ENOENT)
2602
goto done;
2603
2604
error = vfs_getopt(opts, "name", (void **)&name, &len);
2605
if (error == 0) {
2606
if (len == 0 || name[len - 1] != '\0') {
2607
error = EINVAL;
2608
goto done;
2609
}
2610
pr = prison_find_name(mypr, name);
2611
if (pr != NULL) {
2612
drflags |= PD_LOCKED;
2613
if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2614
error = ENOENT;
2615
vfs_opterror(opts, "jail \"%s\" is dying",
2616
name);
2617
goto done;
2618
}
2619
goto found_prison;
2620
}
2621
error = ENOENT;
2622
vfs_opterror(opts, "jail \"%s\" not found", name);
2623
goto done;
2624
} else if (error != ENOENT)
2625
goto done;
2626
2627
vfs_opterror(opts, "no jail specified");
2628
error = ENOENT;
2629
goto done;
2630
2631
found_prison:
2632
/* Get the parameters of the prison. */
2633
if (!(drflags & PD_DEREF)) {
2634
prison_hold(pr);
2635
drflags |= PD_DEREF;
2636
}
2637
td->td_retval[0] = pr->pr_id;
2638
if (jfd_out >= 0) {
2639
error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out));
2640
if (error != 0 && error != ENOENT)
2641
goto done;
2642
jaildesc_set_prison(jfp_out, pr);
2643
}
2644
error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2645
if (error != 0 && error != ENOENT)
2646
goto done;
2647
i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2648
error = vfs_setopt(opts, "parent", &i, sizeof(i));
2649
if (error != 0 && error != ENOENT)
2650
goto done;
2651
error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2652
if (error != 0 && error != ENOENT)
2653
goto done;
2654
error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2655
sizeof(pr->pr_cpuset->cs_id));
2656
if (error != 0 && error != ENOENT)
2657
goto done;
2658
error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2659
if (error != 0 && error != ENOENT)
2660
goto done;
2661
#ifdef INET
2662
error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip,
2663
pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
2664
pr_families[PR_INET].size : 0 );
2665
if (error != 0 && error != ENOENT)
2666
goto done;
2667
#endif
2668
#ifdef INET6
2669
error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip,
2670
pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
2671
pr_families[PR_INET6].size : 0 );
2672
if (error != 0 && error != ENOENT)
2673
goto done;
2674
#endif
2675
error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2676
sizeof(pr->pr_securelevel));
2677
if (error != 0 && error != ENOENT)
2678
goto done;
2679
error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2680
sizeof(pr->pr_childcount));
2681
if (error != 0 && error != ENOENT)
2682
goto done;
2683
error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2684
sizeof(pr->pr_childmax));
2685
if (error != 0 && error != ENOENT)
2686
goto done;
2687
error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2688
if (error != 0 && error != ENOENT)
2689
goto done;
2690
error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2691
if (error != 0 && error != ENOENT)
2692
goto done;
2693
error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2694
if (error != 0 && error != ENOENT)
2695
goto done;
2696
#ifdef COMPAT_FREEBSD32
2697
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2698
uint32_t hid32 = pr->pr_hostid;
2699
2700
error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2701
} else
2702
#endif
2703
error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2704
sizeof(pr->pr_hostid));
2705
if (error != 0 && error != ENOENT)
2706
goto done;
2707
error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2708
sizeof(pr->pr_enforce_statfs));
2709
if (error != 0 && error != ENOENT)
2710
goto done;
2711
error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2712
sizeof(pr->pr_devfs_rsnum));
2713
if (error != 0 && error != ENOENT)
2714
goto done;
2715
for (bf = pr_flag_bool;
2716
bf < pr_flag_bool + nitems(pr_flag_bool);
2717
bf++) {
2718
i = (pr->pr_flags & bf->flag) ? 1 : 0;
2719
error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2720
if (error != 0 && error != ENOENT)
2721
goto done;
2722
i = !i;
2723
error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2724
if (error != 0 && error != ENOENT)
2725
goto done;
2726
}
2727
for (jsf = pr_flag_jailsys;
2728
jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2729
jsf++) {
2730
f = pr->pr_flags & (jsf->disable | jsf->new);
2731
i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2732
: (f == jsf->new) ? JAIL_SYS_NEW
2733
: JAIL_SYS_INHERIT;
2734
error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2735
if (error != 0 && error != ENOENT)
2736
goto done;
2737
}
2738
for (bf = pr_flag_allow;
2739
bf < pr_flag_allow + nitems(pr_flag_allow) &&
2740
atomic_load_int(&bf->flag) != 0;
2741
bf++) {
2742
i = (pr->pr_allow & bf->flag) ? 1 : 0;
2743
error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2744
if (error != 0 && error != ENOENT)
2745
goto done;
2746
i = !i;
2747
error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2748
if (error != 0 && error != ENOENT)
2749
goto done;
2750
}
2751
i = !prison_isalive(pr);
2752
error = vfs_setopt(opts, "dying", &i, sizeof(i));
2753
if (error != 0 && error != ENOENT)
2754
goto done;
2755
i = !i;
2756
error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2757
if (error != 0 && error != ENOENT)
2758
goto done;
2759
error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2760
sizeof(pr->pr_osreldate));
2761
if (error != 0 && error != ENOENT)
2762
goto done;
2763
error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2764
if (error != 0 && error != ENOENT)
2765
goto done;
2766
2767
/* Get the module parameters. */
2768
mtx_unlock(&pr->pr_mtx);
2769
drflags &= ~PD_LOCKED;
2770
error = osd_jail_call(pr, PR_METHOD_GET, opts);
2771
if (error)
2772
goto done;
2773
prison_deref(pr, drflags);
2774
pr = NULL;
2775
drflags = 0;
2776
2777
/* By now, all parameters should have been noted. */
2778
TAILQ_FOREACH(opt, opts, link) {
2779
if (!opt->seen &&
2780
(strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name ||
2781
strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) {
2782
/* Communicate back a missing key. */
2783
free(opt->value, M_MOUNT);
2784
opt->value = NULL;
2785
opt->len = 0;
2786
continue;
2787
}
2788
if (!opt->seen && strcmp(opt->name, "errmsg")) {
2789
error = EINVAL;
2790
vfs_opterror(opts, "unknown parameter: %s", opt->name);
2791
goto done;
2792
}
2793
}
2794
2795
/* Write the fetched parameters back to userspace. */
2796
error = 0;
2797
TAILQ_FOREACH(opt, opts, link) {
2798
if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2799
pos = 2 * opt->pos + 1;
2800
optuio->uio_iov[pos].iov_len = opt->len;
2801
if (opt->value != NULL) {
2802
if (optuio->uio_segflg == UIO_SYSSPACE) {
2803
bcopy(opt->value,
2804
optuio->uio_iov[pos].iov_base,
2805
opt->len);
2806
} else {
2807
error = copyout(opt->value,
2808
optuio->uio_iov[pos].iov_base,
2809
opt->len);
2810
if (error)
2811
break;
2812
}
2813
}
2814
}
2815
}
2816
2817
done:
2818
/* Release any temporary prison holds and/or locks. */
2819
if (pr != NULL)
2820
prison_deref(pr, drflags);
2821
else if (drflags & PD_LIST_SLOCKED)
2822
sx_sunlock(&allprison_lock);
2823
else if (drflags & PD_LIST_XLOCKED)
2824
sx_xunlock(&allprison_lock);
2825
/* Clean up other resources. */
2826
if (jfp_out != NULL)
2827
(void)fdrop(jfp_out, td);
2828
if (error && jfd_out >= 0)
2829
(void)kern_close(td, jfd_out);
2830
if (error && errmsg_pos >= 0) {
2831
/* Write the error message back to userspace. */
2832
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2833
errmsg_pos = 2 * errmsg_pos + 1;
2834
if (errmsg_len > 0) {
2835
if (optuio->uio_segflg == UIO_SYSSPACE)
2836
bcopy(errmsg,
2837
optuio->uio_iov[errmsg_pos].iov_base,
2838
errmsg_len);
2839
else
2840
(void)copyout(errmsg,
2841
optuio->uio_iov[errmsg_pos].iov_base,
2842
errmsg_len);
2843
}
2844
}
2845
vfs_freeopts(opts);
2846
prison_free(mypr);
2847
return (error);
2848
}
2849
2850
/*
2851
* struct jail_remove_args {
2852
* int jid;
2853
* };
2854
*/
2855
int
2856
sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2857
{
2858
struct prison *pr;
2859
int error;
2860
2861
error = priv_check(td, PRIV_JAIL_REMOVE);
2862
if (error)
2863
return (error);
2864
2865
sx_xlock(&allprison_lock);
2866
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2867
if (pr == NULL) {
2868
sx_xunlock(&allprison_lock);
2869
return (EINVAL);
2870
}
2871
prison_hold(pr);
2872
prison_remove(pr);
2873
return (0);
2874
}
2875
2876
/*
2877
* struct jail_remove_jd_args {
2878
* int fd;
2879
* };
2880
*/
2881
int
2882
sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap)
2883
{
2884
struct prison *pr;
2885
struct ucred *jdcred;
2886
int error;
2887
2888
error = jaildesc_find(td, uap->fd, &pr, &jdcred);
2889
if (error)
2890
return (error);
2891
error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE);
2892
crfree(jdcred);
2893
if (error) {
2894
prison_free(pr);
2895
return (error);
2896
}
2897
sx_xlock(&allprison_lock);
2898
mtx_lock(&pr->pr_mtx);
2899
prison_remove(pr);
2900
return (0);
2901
}
2902
2903
/*
2904
* Begin the removal process for a prison. The allprison lock should
2905
* be held exclusively, and the prison should be both locked and held.
2906
*/
2907
void
2908
prison_remove(struct prison *pr)
2909
{
2910
sx_assert(&allprison_lock, SA_XLOCKED);
2911
mtx_assert(&pr->pr_mtx, MA_OWNED);
2912
if (!prison_isalive(pr)) {
2913
/* Silently ignore already-dying prisons. */
2914
mtx_unlock(&pr->pr_mtx);
2915
sx_xunlock(&allprison_lock);
2916
return;
2917
}
2918
prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2919
}
2920
2921
/*
2922
* struct jail_attach_args {
2923
* int jid;
2924
* };
2925
*/
2926
int
2927
sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2928
{
2929
struct prison *pr;
2930
int error;
2931
2932
error = priv_check(td, PRIV_JAIL_ATTACH);
2933
if (error)
2934
return (error);
2935
2936
sx_slock(&allprison_lock);
2937
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2938
if (pr == NULL) {
2939
sx_sunlock(&allprison_lock);
2940
return (EINVAL);
2941
}
2942
2943
/* Do not allow a process to attach to a prison that is not alive. */
2944
if (!prison_isalive(pr)) {
2945
mtx_unlock(&pr->pr_mtx);
2946
sx_sunlock(&allprison_lock);
2947
return (EINVAL);
2948
}
2949
2950
return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2951
}
2952
2953
/*
2954
* struct jail_attach_jd_args {
2955
* int fd;
2956
* };
2957
*/
2958
int
2959
sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap)
2960
{
2961
struct prison *pr;
2962
struct ucred *jdcred;
2963
int drflags, error;
2964
2965
sx_slock(&allprison_lock);
2966
drflags = PD_LIST_SLOCKED;
2967
error = jaildesc_find(td, uap->fd, &pr, &jdcred);
2968
if (error)
2969
goto fail;
2970
drflags |= PD_DEREF;
2971
error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
2972
crfree(jdcred);
2973
if (error)
2974
goto fail;
2975
mtx_lock(&pr->pr_mtx);
2976
drflags |= PD_LOCKED;
2977
2978
/* Do not allow a process to attach to a prison that is not alive. */
2979
if (!prison_isalive(pr)) {
2980
error = EINVAL;
2981
goto fail;
2982
}
2983
2984
return (do_jail_attach(td, pr, drflags));
2985
2986
fail:
2987
prison_deref(pr, drflags);
2988
return (error);
2989
}
2990
2991
static int
2992
do_jail_attach(struct thread *td, struct prison *pr, int drflags)
2993
{
2994
struct proc *p;
2995
struct ucred *newcred, *oldcred;
2996
int error;
2997
2998
mtx_assert(&pr->pr_mtx, MA_OWNED);
2999
sx_assert(&allprison_lock, SX_LOCKED);
3000
drflags &= PD_LOCK_FLAGS;
3001
/*
3002
* XXX: Note that there is a slight race here if two threads
3003
* in the same privileged process attempt to attach to two
3004
* different jails at the same time. It is important for
3005
* user processes not to do this, or they might end up with
3006
* a process root from one prison, but attached to the jail
3007
* of another.
3008
*/
3009
if (!(drflags & PD_DEREF)) {
3010
prison_hold(pr);
3011
drflags |= PD_DEREF;
3012
}
3013
refcount_acquire(&pr->pr_uref);
3014
drflags |= PD_DEUREF;
3015
mtx_unlock(&pr->pr_mtx);
3016
drflags &= ~PD_LOCKED;
3017
3018
/* Let modules do whatever they need to prepare for attaching. */
3019
error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
3020
if (error) {
3021
prison_deref(pr, drflags);
3022
return (error);
3023
}
3024
sx_unlock(&allprison_lock);
3025
drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
3026
3027
/*
3028
* Reparent the newly attached process to this jail.
3029
*/
3030
p = td->td_proc;
3031
error = cpuset_setproc_update_set(p, pr->pr_cpuset);
3032
if (error)
3033
goto e_revert_osd;
3034
3035
vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
3036
if ((error = change_dir(pr->pr_root, td)) != 0)
3037
goto e_unlock;
3038
#ifdef MAC
3039
if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
3040
goto e_unlock;
3041
#endif
3042
VOP_UNLOCK(pr->pr_root);
3043
if ((error = pwd_chroot_chdir(td, pr->pr_root)))
3044
goto e_revert_osd;
3045
3046
newcred = crget();
3047
PROC_LOCK(p);
3048
oldcred = crcopysafe(p, newcred);
3049
newcred->cr_prison = pr;
3050
proc_set_cred(p, newcred);
3051
setsugid(p);
3052
#ifdef RACCT
3053
racct_proc_ucred_changed(p, oldcred, newcred);
3054
crhold(newcred);
3055
#endif
3056
PROC_UNLOCK(p);
3057
#ifdef RCTL
3058
rctl_proc_ucred_changed(p, newcred);
3059
crfree(newcred);
3060
#endif
3061
prison_proc_relink(oldcred->cr_prison, pr, p);
3062
prison_deref(oldcred->cr_prison, drflags);
3063
crfree(oldcred);
3064
prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
3065
3066
/*
3067
* If the prison was killed while changing credentials, die along
3068
* with it.
3069
*/
3070
if (!prison_isalive(pr)) {
3071
PROC_LOCK(p);
3072
kern_psignal(p, SIGKILL);
3073
PROC_UNLOCK(p);
3074
}
3075
3076
return (0);
3077
3078
e_unlock:
3079
VOP_UNLOCK(pr->pr_root);
3080
e_revert_osd:
3081
/* Tell modules this thread is still in its old jail after all. */
3082
sx_slock(&allprison_lock);
3083
drflags |= PD_LIST_SLOCKED;
3084
(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
3085
prison_deref(pr, drflags);
3086
return (error);
3087
}
3088
3089
/*
3090
* Returns a locked prison instance, or NULL on failure.
3091
*/
3092
struct prison *
3093
prison_find(int prid)
3094
{
3095
struct prison *pr;
3096
3097
sx_assert(&allprison_lock, SX_LOCKED);
3098
TAILQ_FOREACH(pr, &allprison, pr_list) {
3099
if (pr->pr_id < prid)
3100
continue;
3101
if (pr->pr_id > prid)
3102
break;
3103
KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
3104
mtx_lock(&pr->pr_mtx);
3105
return (pr);
3106
}
3107
return (NULL);
3108
}
3109
3110
/*
3111
* Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
3112
*/
3113
struct prison *
3114
prison_find_child(struct prison *mypr, int prid)
3115
{
3116
struct prison *pr;
3117
int descend;
3118
3119
sx_assert(&allprison_lock, SX_LOCKED);
3120
FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
3121
if (pr->pr_id == prid) {
3122
KASSERT(prison_isvalid(pr),
3123
("Found invalid prison %p", pr));
3124
mtx_lock(&pr->pr_mtx);
3125
return (pr);
3126
}
3127
}
3128
return (NULL);
3129
}
3130
3131
/*
3132
* Look for the name relative to mypr. Returns a locked prison or NULL.
3133
*/
3134
struct prison *
3135
prison_find_name(struct prison *mypr, const char *name)
3136
{
3137
struct prison *pr, *deadpr;
3138
size_t mylen;
3139
int descend;
3140
3141
sx_assert(&allprison_lock, SX_LOCKED);
3142
mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
3143
deadpr = NULL;
3144
FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
3145
if (!strcmp(pr->pr_name + mylen, name)) {
3146
KASSERT(prison_isvalid(pr),
3147
("Found invalid prison %p", pr));
3148
if (prison_isalive(pr)) {
3149
mtx_lock(&pr->pr_mtx);
3150
return (pr);
3151
}
3152
deadpr = pr;
3153
}
3154
}
3155
/* There was no valid prison - perhaps there was a dying one. */
3156
if (deadpr != NULL)
3157
mtx_lock(&deadpr->pr_mtx);
3158
return (deadpr);
3159
}
3160
3161
/*
3162
* See if a prison has the specific flag set. The prison should be locked,
3163
* unless checking for flags that are only set at jail creation (such as
3164
* PR_IP4 and PR_IP6), or only the single bit is examined, without regard
3165
* to any other prison data.
3166
*/
3167
bool
3168
prison_flag(struct ucred *cred, unsigned flag)
3169
{
3170
3171
return ((cred->cr_prison->pr_flags & flag) != 0);
3172
}
3173
3174
/*
3175
* See if a prison has the specific allow flag set.
3176
* The prison *should* be locked, or only a single bit is examined, without
3177
* regard to any other prison data.
3178
*/
3179
bool
3180
prison_allow(struct ucred *cred, unsigned flag)
3181
{
3182
3183
return ((cred->cr_prison->pr_allow & flag) != 0);
3184
}
3185
3186
/*
 * Hold a prison reference, by incrementing pr_ref.  It is generally
 * an error to hold a prison that does not already have a reference.
 * A prison record will remain valid as long as it has at least one
 * reference, and will not be removed as long as either the prison
 * mutex or the allprison lock is held (allprison_lock may be shared).
 *
 * This variant is a plain wrapper around prison_hold().
 */
void
prison_hold_locked(struct prison *pr)
{

	/* The lock is no longer needed for this; just take the hold. */
	prison_hold(pr);
}
3200
3201
void
3202
prison_hold(struct prison *pr)
3203
{
3204
#ifdef INVARIANTS
3205
int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
3206
3207
KASSERT(was_valid,
3208
("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
3209
#else
3210
refcount_acquire(&pr->pr_ref);
3211
#endif
3212
}
3213
3214
/*
3215
* Remove a prison reference. If that was the last reference, the
3216
* prison will be removed (at a later time).
3217
*/
3218
void
3219
prison_free_locked(struct prison *pr)
3220
{
3221
3222
mtx_assert(&pr->pr_mtx, MA_OWNED);
3223
/*
3224
* Locking is no longer required, but unlock because the caller
3225
* expects it.
3226
*/
3227
mtx_unlock(&pr->pr_mtx);
3228
prison_free(pr);
3229
}
3230
3231
void
3232
prison_free(struct prison *pr)
3233
{
3234
3235
KASSERT(refcount_load(&pr->pr_ref) > 0,
3236
("Trying to free dead prison %p (jid=%d).",
3237
pr, pr->pr_id));
3238
if (!refcount_release_if_not_last(&pr->pr_ref)) {
3239
/*
3240
* Don't remove the last reference in this context,
3241
* in case there are locks held.
3242
*/
3243
taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
3244
}
3245
}
3246
3247
static void
3248
prison_free_not_last(struct prison *pr)
3249
{
3250
#ifdef INVARIANTS
3251
int lastref;
3252
3253
KASSERT(refcount_load(&pr->pr_ref) > 0,
3254
("Trying to free dead prison %p (jid=%d).",
3255
pr, pr->pr_id));
3256
lastref = refcount_release(&pr->pr_ref);
3257
KASSERT(!lastref,
3258
("prison_free_not_last freed last ref on prison %p (jid=%d).",
3259
pr, pr->pr_id));
3260
#else
3261
refcount_release(&pr->pr_ref);
3262
#endif
3263
}
3264
3265
/*
3266
* Hold a prison for user visibility, by incrementing pr_uref.
3267
* It is generally an error to hold a prison that isn't already
3268
* user-visible, except through the jail system calls. It is also
3269
* an error to hold an invalid prison. A prison record will remain
3270
* alive as long as it has at least one user reference, and will not
3271
* be set to the dying state until the prison mutex and allprison_lock
3272
* are both freed.
3273
*/
3274
void
3275
prison_proc_hold(struct prison *pr)
3276
{
3277
#ifdef INVARIANTS
3278
int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
3279
3280
KASSERT(was_alive,
3281
("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
3282
#else
3283
refcount_acquire(&pr->pr_uref);
3284
#endif
3285
}
3286
3287
/*
3288
* Remove a prison user reference. If it was the last reference, the
3289
* prison will be considered "dying", and may be removed once all of
3290
* its references are dropped.
3291
*/
3292
void
3293
prison_proc_free(struct prison *pr)
3294
{
3295
3296
/*
3297
* Locking is only required when releasing the last reference.
3298
* This allows assurance that a locked prison will remain alive
3299
* until it is unlocked.
3300
*/
3301
KASSERT(refcount_load(&pr->pr_uref) > 0,
3302
("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
3303
if (!refcount_release_if_not_last(&pr->pr_uref)) {
3304
/*
3305
* Don't remove the last user reference in this context,
3306
* which is expected to be a process that is not only locked,
3307
* but also half dead. Add a reference so any calls to
3308
* prison_free() won't re-submit the task.
3309
*/
3310
prison_hold(pr);
3311
mtx_lock(&pr->pr_mtx);
3312
KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
3313
("Redundant last reference in prison_proc_free (jid=%d)",
3314
pr->pr_id));
3315
pr->pr_flags |= PR_COMPLETE_PROC;
3316
mtx_unlock(&pr->pr_mtx);
3317
taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
3318
}
3319
}
3320
3321
static void
3322
prison_proc_free_not_last(struct prison *pr)
3323
{
3324
#ifdef INVARIANTS
3325
int lastref;
3326
3327
KASSERT(refcount_load(&pr->pr_uref) > 0,
3328
("Trying to free dead prison %p (jid=%d).",
3329
pr, pr->pr_id));
3330
lastref = refcount_release(&pr->pr_uref);
3331
KASSERT(!lastref,
3332
("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
3333
pr, pr->pr_id));
3334
#else
3335
refcount_release(&pr->pr_uref);
3336
#endif
3337
}
3338
3339
void
3340
prison_proc_link(struct prison *pr, struct proc *p)
3341
{
3342
3343
sx_assert(&allproc_lock, SA_XLOCKED);
3344
LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
3345
}
3346
3347
void
3348
prison_proc_unlink(struct prison *pr, struct proc *p)
3349
{
3350
3351
sx_assert(&allproc_lock, SA_XLOCKED);
3352
LIST_REMOVE(p, p_jaillist);
3353
}
3354
3355
static void
3356
prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
3357
{
3358
3359
sx_xlock(&allproc_lock);
3360
prison_proc_unlink(opr, p);
3361
prison_proc_link(npr, p);
3362
sx_xunlock(&allproc_lock);
3363
}
3364
3365
/*
3366
* Complete a call to either prison_free or prison_proc_free.
3367
*/
3368
static void
3369
prison_complete(void *context, int pending)
3370
{
3371
struct prison *pr = context;
3372
int drflags;
3373
3374
/*
3375
* This could be called to release the last reference, or the last
3376
* user reference (plus the reference held in prison_proc_free).
3377
*/
3378
drflags = prison_lock_xlock(pr, PD_DEREF);
3379
if (pr->pr_flags & PR_COMPLETE_PROC) {
3380
pr->pr_flags &= ~PR_COMPLETE_PROC;
3381
drflags |= PD_DEUREF;
3382
}
3383
prison_deref(pr, drflags);
3384
}
3385
3386
static void
3387
prison_kill_processes_cb(struct proc *p, void *arg __unused)
3388
{
3389
3390
kern_psignal(p, SIGKILL);
3391
}
3392
3393
/*
3394
* Note the iteration does not guarantee acting on all processes.
3395
* Most notably there may be fork or jail_attach in progress.
3396
*/
3397
void
3398
prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
3399
void *cbarg)
3400
{
3401
struct prison *ppr;
3402
struct proc *p;
3403
3404
if (atomic_load_int(&pr->pr_childcount) == 0) {
3405
sx_slock(&allproc_lock);
3406
LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
3407
if (p->p_state == PRS_NEW)
3408
continue;
3409
PROC_LOCK(p);
3410
cb(p, cbarg);
3411
PROC_UNLOCK(p);
3412
}
3413
sx_sunlock(&allproc_lock);
3414
if (atomic_load_int(&pr->pr_childcount) == 0)
3415
return;
3416
/*
3417
* Some jails popped up during the iteration, fall through to a
3418
* system-wide search.
3419
*/
3420
}
3421
3422
sx_slock(&allproc_lock);
3423
FOREACH_PROC_IN_SYSTEM(p) {
3424
PROC_LOCK(p);
3425
if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
3426
for (ppr = p->p_ucred->cr_prison; ppr != NULL;
3427
ppr = ppr->pr_parent) {
3428
if (ppr == pr) {
3429
cb(p, cbarg);
3430
break;
3431
}
3432
}
3433
}
3434
PROC_UNLOCK(p);
3435
}
3436
sx_sunlock(&allproc_lock);
3437
}
3438
3439
/*
3440
* Remove a prison reference and/or user reference (usually).
3441
* This assumes context that allows sleeping (for allprison_lock),
3442
* with no non-sleeping locks held, except perhaps the prison itself.
3443
* If there are no more references, release and delist the prison.
3444
* On completion, the prison lock and the allprison lock are both
3445
* unlocked.
3446
*/
3447
static void
3448
prison_deref(struct prison *pr, int flags)
3449
{
3450
struct prisonlist freeprison;
3451
struct prison *killpr, *rpr, *ppr, *tpr;
3452
3453
killpr = NULL;
3454
TAILQ_INIT(&freeprison);
3455
/*
3456
* Release this prison as requested, which may cause its parent
3457
* to be released, and then maybe its grandparent, etc.
3458
*/
3459
for (;;) {
3460
if (flags & PD_KILL) {
3461
/* Kill the prison and its descendents. */
3462
KASSERT(pr != &prison0,
3463
("prison_deref trying to kill prison0"));
3464
if (!(flags & PD_DEREF)) {
3465
prison_hold(pr);
3466
flags |= PD_DEREF;
3467
}
3468
flags = prison_lock_xlock(pr, flags);
3469
prison_deref_kill(pr, &freeprison);
3470
}
3471
if (flags & PD_DEUREF) {
3472
/* Drop a user reference. */
3473
KASSERT(refcount_load(&pr->pr_uref) > 0,
3474
("prison_deref PD_DEUREF on a dead prison (jid=%d)",
3475
pr->pr_id));
3476
if (!refcount_release_if_not_last(&pr->pr_uref)) {
3477
if (!(flags & PD_DEREF)) {
3478
prison_hold(pr);
3479
flags |= PD_DEREF;
3480
}
3481
flags = prison_lock_xlock(pr, flags);
3482
if (refcount_release(&pr->pr_uref) &&
3483
pr->pr_state == PRISON_STATE_ALIVE) {
3484
/*
3485
* When the last user references goes,
3486
* this becomes a dying prison.
3487
*/
3488
KASSERT(
3489
refcount_load(&prison0.pr_uref) > 0,
3490
("prison0 pr_uref=0"));
3491
pr->pr_state = PRISON_STATE_DYING;
3492
prison_cleanup_locked(pr);
3493
mtx_unlock(&pr->pr_mtx);
3494
flags &= ~PD_LOCKED;
3495
prison_cleanup_unlocked(pr);
3496
}
3497
}
3498
}
3499
if (flags & PD_KILL) {
3500
/*
3501
* Any remaining user references are probably processes
3502
* that need to be killed, either in this prison or its
3503
* descendants.
3504
*/
3505
if (refcount_load(&pr->pr_uref) > 0)
3506
killpr = pr;
3507
/* Make sure the parent prison doesn't get killed. */
3508
flags &= ~PD_KILL;
3509
}
3510
if (flags & PD_DEREF) {
3511
/* Drop a reference. */
3512
KASSERT(refcount_load(&pr->pr_ref) > 0,
3513
("prison_deref PD_DEREF on a dead prison (jid=%d)",
3514
pr->pr_id));
3515
if (!refcount_release_if_not_last(&pr->pr_ref)) {
3516
flags = prison_lock_xlock(pr, flags);
3517
if (refcount_release(&pr->pr_ref)) {
3518
/*
3519
* When the last reference goes,
3520
* unlink the prison and set it aside.
3521
*/
3522
KASSERT(
3523
refcount_load(&pr->pr_uref) == 0,
3524
("prison_deref: last ref, "
3525
"but still has %d urefs (jid=%d)",
3526
pr->pr_uref, pr->pr_id));
3527
KASSERT(
3528
refcount_load(&prison0.pr_ref) != 0,
3529
("prison0 pr_ref=0"));
3530
pr->pr_state = PRISON_STATE_INVALID;
3531
TAILQ_REMOVE(&allprison, pr, pr_list);
3532
LIST_REMOVE(pr, pr_sibling);
3533
TAILQ_INSERT_TAIL(&freeprison, pr,
3534
pr_list);
3535
for (ppr = pr->pr_parent;
3536
ppr != NULL;
3537
ppr = ppr->pr_parent)
3538
ppr->pr_childcount--;
3539
/*
3540
* Removing a prison frees references
3541
* from its parent.
3542
*/
3543
ppr = pr->pr_parent;
3544
pr->pr_parent = NULL;
3545
mtx_unlock(&pr->pr_mtx);
3546
3547
pr = ppr;
3548
flags &= ~PD_LOCKED;
3549
flags |= PD_DEREF | PD_DEUREF;
3550
continue;
3551
}
3552
}
3553
}
3554
break;
3555
}
3556
3557
/* Release all the prison locks. */
3558
if (flags & PD_LOCKED)
3559
mtx_unlock(&pr->pr_mtx);
3560
if (flags & PD_LIST_SLOCKED)
3561
sx_sunlock(&allprison_lock);
3562
else if (flags & PD_LIST_XLOCKED)
3563
sx_xunlock(&allprison_lock);
3564
3565
/* Kill any processes attached to a killed prison. */
3566
if (killpr != NULL)
3567
prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);
3568
3569
/*
3570
* Finish removing any unreferenced prisons, which couldn't happen
3571
* while allprison_lock was held (to avoid a LOR on vrele).
3572
*/
3573
TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
3574
#ifdef VIMAGE
3575
if (rpr->pr_flags & PR_VNET)
3576
vnet_destroy(rpr->pr_vnet);
3577
#endif
3578
if (rpr->pr_root != NULL)
3579
vrele(rpr->pr_root);
3580
mtx_destroy(&rpr->pr_mtx);
3581
#ifdef INET
3582
prison_ip_free(rpr->pr_addrs[PR_INET]);
3583
#endif
3584
#ifdef INET6
3585
prison_ip_free(rpr->pr_addrs[PR_INET6]);
3586
#endif
3587
if (rpr->pr_cpuset != NULL)
3588
cpuset_rel(rpr->pr_cpuset);
3589
osd_jail_exit(rpr);
3590
#ifdef RACCT
3591
if (racct_enable)
3592
prison_racct_detach(rpr);
3593
#endif
3594
TAILQ_REMOVE(&freeprison, rpr, pr_list);
3595
free(rpr, M_PRISON);
3596
}
3597
}
3598
3599
/*
3600
* Kill the prison and its descendants. Mark them as dying, clear the
3601
* persist flag, and call module remove methods.
3602
*/
3603
static void
3604
prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
3605
{
3606
struct prison *cpr, *ppr, *rpr;
3607
bool descend;
3608
3609
/*
3610
* Unlike the descendants, the target prison can be killed
3611
* even if it is currently dying. This is useful for failed
3612
* creation in jail_set(2).
3613
*/
3614
KASSERT(refcount_load(&pr->pr_ref) > 0,
3615
("Trying to kill dead prison %p (jid=%d).",
3616
pr, pr->pr_id));
3617
refcount_acquire(&pr->pr_uref);
3618
pr->pr_state = PRISON_STATE_DYING;
3619
mtx_unlock(&pr->pr_mtx);
3620
3621
rpr = NULL;
3622
FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
3623
if (descend) {
3624
if (!prison_isalive(cpr)) {
3625
descend = false;
3626
continue;
3627
}
3628
prison_hold(cpr);
3629
prison_proc_hold(cpr);
3630
mtx_lock(&cpr->pr_mtx);
3631
cpr->pr_state = PRISON_STATE_DYING;
3632
cpr->pr_flags |= PR_REMOVE;
3633
mtx_unlock(&cpr->pr_mtx);
3634
continue;
3635
}
3636
if (!(cpr->pr_flags & PR_REMOVE))
3637
continue;
3638
prison_cleanup_unlocked(cpr);
3639
mtx_lock(&cpr->pr_mtx);
3640
prison_cleanup_locked(cpr);
3641
cpr->pr_flags &= ~PR_REMOVE;
3642
if (cpr->pr_flags & PR_PERSIST) {
3643
cpr->pr_flags &= ~PR_PERSIST;
3644
prison_proc_free_not_last(cpr);
3645
prison_free_not_last(cpr);
3646
}
3647
(void)refcount_release(&cpr->pr_uref);
3648
if (refcount_release(&cpr->pr_ref)) {
3649
/*
3650
* When the last reference goes, unlink the prison
3651
* and set it aside for prison_deref() to handle.
3652
* Delay unlinking the sibling list to keep the loop
3653
* safe.
3654
*/
3655
if (rpr != NULL)
3656
LIST_REMOVE(rpr, pr_sibling);
3657
rpr = cpr;
3658
rpr->pr_state = PRISON_STATE_INVALID;
3659
TAILQ_REMOVE(&allprison, rpr, pr_list);
3660
TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
3661
/*
3662
* Removing a prison frees references from its parent.
3663
*/
3664
ppr = rpr->pr_parent;
3665
prison_proc_free_not_last(ppr);
3666
prison_free_not_last(ppr);
3667
for (; ppr != NULL; ppr = ppr->pr_parent)
3668
ppr->pr_childcount--;
3669
}
3670
mtx_unlock(&cpr->pr_mtx);
3671
}
3672
if (rpr != NULL)
3673
LIST_REMOVE(rpr, pr_sibling);
3674
3675
prison_cleanup_unlocked(pr);
3676
mtx_lock(&pr->pr_mtx);
3677
prison_cleanup_locked(pr);
3678
if (pr->pr_flags & PR_PERSIST) {
3679
pr->pr_flags &= ~PR_PERSIST;
3680
prison_proc_free_not_last(pr);
3681
prison_free_not_last(pr);
3682
}
3683
(void)refcount_release(&pr->pr_uref);
3684
}
3685
3686
/*
3687
* Given the current locking state in the flags, make sure allprison_lock
3688
* is held exclusive, and the prison is locked. Return flags indicating
3689
* the new state.
3690
*/
3691
static int
3692
prison_lock_xlock(struct prison *pr, int flags)
3693
{
3694
3695
if (!(flags & PD_LIST_XLOCKED)) {
3696
/*
3697
* Get allprison_lock, which may be an upgrade,
3698
* and may require unlocking the prison.
3699
*/
3700
if (flags & PD_LOCKED) {
3701
mtx_unlock(&pr->pr_mtx);
3702
flags &= ~PD_LOCKED;
3703
}
3704
if (flags & PD_LIST_SLOCKED) {
3705
if (!sx_try_upgrade(&allprison_lock)) {
3706
sx_sunlock(&allprison_lock);
3707
sx_xlock(&allprison_lock);
3708
}
3709
flags &= ~PD_LIST_SLOCKED;
3710
} else
3711
sx_xlock(&allprison_lock);
3712
flags |= PD_LIST_XLOCKED;
3713
}
3714
if (!(flags & PD_LOCKED)) {
3715
/* Lock the prison mutex. */
3716
mtx_lock(&pr->pr_mtx);
3717
flags |= PD_LOCKED;
3718
}
3719
return flags;
3720
}
3721
3722
/*
3723
* Release a prison's resources when it starts dying (when the last user
3724
* reference is dropped, or when it is killed). Two functions are called,
3725
* for work that requires a locked prison or an unlocked one.
3726
*/
3727
static void
3728
prison_cleanup_locked(struct prison *pr)
3729
{
3730
sx_assert(&allprison_lock, SA_XLOCKED);
3731
mtx_assert(&pr->pr_mtx, MA_OWNED);
3732
prison_knote(pr, NOTE_JAIL_REMOVE);
3733
knlist_detach(pr->pr_klist);
3734
jaildesc_prison_cleanup(pr);
3735
pr->pr_klist = NULL;
3736
}
3737
3738
static void
3739
prison_cleanup_unlocked(struct prison *pr)
3740
{
3741
sx_assert(&allprison_lock, SA_XLOCKED);
3742
mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
3743
vfs_exjail_delete(pr);
3744
shm_remove_prison(pr);
3745
(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
3746
}
3747
3748
/*
3749
* Set or clear a permission bit in the pr_allow field, passing restrictions
3750
* (cleared permission) down to child jails.
3751
*/
3752
void
3753
prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3754
{
3755
struct prison *pr;
3756
3757
pr = cred->cr_prison;
3758
sx_slock(&allprison_lock);
3759
mtx_lock(&pr->pr_mtx);
3760
prison_set_allow_locked(pr, flag, enable);
3761
mtx_unlock(&pr->pr_mtx);
3762
sx_sunlock(&allprison_lock);
3763
}
3764
3765
static void
3766
prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3767
{
3768
struct prison *cpr;
3769
int descend;
3770
3771
if (enable != 0)
3772
pr->pr_allow |= flag;
3773
else {
3774
pr->pr_allow &= ~flag;
3775
FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3776
cpr->pr_allow &= ~flag;
3777
}
3778
}
3779
3780
/*
3781
* Check if a jail supports the given address family.
3782
*
3783
* Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3784
* if not.
3785
*/
3786
int
3787
prison_check_af(struct ucred *cred, int af)
3788
{
3789
struct prison *pr;
3790
int error;
3791
3792
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3793
3794
pr = cred->cr_prison;
3795
#ifdef VIMAGE
3796
/* Prisons with their own network stack are not limited. */
3797
if (prison_owns_vnet(pr))
3798
return (0);
3799
#endif
3800
3801
error = 0;
3802
switch (af)
3803
{
3804
#ifdef INET
3805
case AF_INET:
3806
if (pr->pr_flags & PR_IP4)
3807
{
3808
mtx_lock(&pr->pr_mtx);
3809
if ((pr->pr_flags & PR_IP4) &&
3810
pr->pr_addrs[PR_INET] == NULL)
3811
error = EAFNOSUPPORT;
3812
mtx_unlock(&pr->pr_mtx);
3813
}
3814
break;
3815
#endif
3816
#ifdef INET6
3817
case AF_INET6:
3818
if (pr->pr_flags & PR_IP6)
3819
{
3820
mtx_lock(&pr->pr_mtx);
3821
if ((pr->pr_flags & PR_IP6) &&
3822
pr->pr_addrs[PR_INET6] == NULL)
3823
error = EAFNOSUPPORT;
3824
mtx_unlock(&pr->pr_mtx);
3825
}
3826
break;
3827
#endif
3828
case AF_LOCAL:
3829
case AF_ROUTE:
3830
case AF_NETLINK:
3831
break;
3832
default:
3833
if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3834
error = EAFNOSUPPORT;
3835
}
3836
return (error);
3837
}
3838
3839
/*
3840
* Check if given address belongs to the jail referenced by cred (wrapper to
3841
* prison_check_ip[46]).
3842
*
3843
* Returns 0 if jail doesn't restrict the address family or if address belongs
3844
* to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3845
* the jail doesn't allow the address family. IPv4 Address passed in in NBO.
3846
*/
3847
int
3848
prison_if(struct ucred *cred, const struct sockaddr *sa)
3849
{
3850
#ifdef INET
3851
const struct sockaddr_in *sai;
3852
#endif
3853
#ifdef INET6
3854
const struct sockaddr_in6 *sai6;
3855
#endif
3856
int error;
3857
3858
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3859
KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3860
3861
#ifdef VIMAGE
3862
if (prison_owns_vnet(cred->cr_prison))
3863
return (0);
3864
#endif
3865
3866
error = 0;
3867
switch (sa->sa_family)
3868
{
3869
#ifdef INET
3870
case AF_INET:
3871
sai = (const struct sockaddr_in *)sa;
3872
error = prison_check_ip4(cred, &sai->sin_addr);
3873
break;
3874
#endif
3875
#ifdef INET6
3876
case AF_INET6:
3877
sai6 = (const struct sockaddr_in6 *)sa;
3878
error = prison_check_ip6(cred, &sai6->sin6_addr);
3879
break;
3880
#endif
3881
default:
3882
if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3883
error = EAFNOSUPPORT;
3884
}
3885
return (error);
3886
}
3887
3888
/*
3889
* Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3890
*/
3891
int
3892
prison_check(struct ucred *cred1, struct ucred *cred2)
3893
{
3894
3895
return ((cred1->cr_prison == cred2->cr_prison ||
3896
prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3897
}
3898
3899
/*
3900
* For mountd/nfsd to run within a prison, it must be:
3901
* - A vnet prison.
3902
* - PR_ALLOW_NFSD must be set on it.
3903
* - The root directory (pr_root) of the prison must be
3904
* a file system mount point, so the mountd can hang
3905
* export information on it.
3906
* - The prison's enforce_statfs cannot be 0, so that
3907
* mountd(8) can do exports.
3908
*/
3909
bool
3910
prison_check_nfsd(struct ucred *cred)
3911
{
3912
3913
if (jailed_without_vnet(cred))
3914
return (false);
3915
if (!prison_allow(cred, PR_ALLOW_NFSD))
3916
return (false);
3917
if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3918
return (false);
3919
if (cred->cr_prison->pr_enforce_statfs == 0)
3920
return (false);
3921
return (true);
3922
}
3923
3924
/*
3925
* Return true if p2 is a child of p1, otherwise false.
3926
*/
3927
bool
3928
prison_ischild(struct prison *pr1, struct prison *pr2)
3929
{
3930
3931
for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3932
if (pr1 == pr2)
3933
return (true);
3934
return (false);
3935
}
3936
3937
/*
3938
* Return true if the prison is currently alive. A prison is alive if it
3939
* holds user references and it isn't being removed.
3940
*/
3941
bool
3942
prison_isalive(const struct prison *pr)
3943
{
3944
3945
if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3946
return (false);
3947
return (true);
3948
}
3949
3950
/*
3951
* Return true if the prison is currently valid. A prison is valid if it has
3952
* been fully created, and is not being destroyed. Note that dying prisons
3953
* are still considered valid. Invalid prisons won't be found under normal
3954
* circumstances, as they're only put in that state by functions that have
3955
* an exclusive hold on allprison_lock.
3956
*/
3957
bool
3958
prison_isvalid(struct prison *pr)
3959
{
3960
3961
if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3962
return (false);
3963
if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3964
return (false);
3965
return (true);
3966
}
3967
3968
/*
 * Return true if the passed credential is in a jail and that jail does not
 * have its own virtual network stack, otherwise false.  Without VIMAGE this
 * reduces to whether the credential is jailed.
 */
bool
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return (jailed(cred) && !prison_owns_vnet(cred->cr_prison));
#else
	return (jailed(cred));
#endif
}
3985
3986
/*
3987
* Return the correct hostname (domainname, et al) for the passed credential.
3988
*/
3989
void
3990
getcredhostname(struct ucred *cred, char *buf, size_t size)
3991
{
3992
struct prison *pr;
3993
3994
/*
3995
* A NULL credential can be used to shortcut to the physical
3996
* system's hostname.
3997
*/
3998
pr = (cred != NULL) ? cred->cr_prison : &prison0;
3999
mtx_lock(&pr->pr_mtx);
4000
strlcpy(buf, pr->pr_hostname, size);
4001
mtx_unlock(&pr->pr_mtx);
4002
}
4003
4004
void
4005
getcreddomainname(struct ucred *cred, char *buf, size_t size)
4006
{
4007
4008
mtx_lock(&cred->cr_prison->pr_mtx);
4009
strlcpy(buf, cred->cr_prison->pr_domainname, size);
4010
mtx_unlock(&cred->cr_prison->pr_mtx);
4011
}
4012
4013
void
4014
getcredhostuuid(struct ucred *cred, char *buf, size_t size)
4015
{
4016
4017
mtx_lock(&cred->cr_prison->pr_mtx);
4018
strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
4019
mtx_unlock(&cred->cr_prison->pr_mtx);
4020
}
4021
4022
void
4023
getcredhostid(struct ucred *cred, unsigned long *hostid)
4024
{
4025
4026
mtx_lock(&cred->cr_prison->pr_mtx);
4027
*hostid = cred->cr_prison->pr_hostid;
4028
mtx_unlock(&cred->cr_prison->pr_mtx);
4029
}
4030
4031
void
4032
getjailname(struct ucred *cred, char *name, size_t len)
4033
{
4034
4035
mtx_lock(&cred->cr_prison->pr_mtx);
4036
strlcpy(name, cred->cr_prison->pr_name, len);
4037
mtx_unlock(&cred->cr_prison->pr_mtx);
4038
}
4039
4040
#ifdef VIMAGE
/*
 * Determine whether the prison owns its VNET, i.e. whether PR_VNET is set
 * in its flags.
 */
bool
prison_owns_vnet(struct prison *pr)
{

	/*
	 * vnets cannot be added/removed after jail creation,
	 * so no need to lock here.
	 */
	if ((pr->pr_flags & PR_VNET) == 0)
		return (false);
	return (true);
}
#endif
4055
4056
/*
4057
* Determine whether the subject represented by cred can "see"
4058
* status of a mount point.
4059
* Returns: 0 for permitted, ENOENT otherwise.
4060
* XXX: This function should be called cr_canseemount() and should be
4061
* placed in kern_prot.c.
4062
*/
4063
int
4064
prison_canseemount(struct ucred *cred, struct mount *mp)
4065
{
4066
struct prison *pr;
4067
struct statfs *sp;
4068
size_t len;
4069
4070
pr = cred->cr_prison;
4071
if (pr->pr_enforce_statfs == 0)
4072
return (0);
4073
if (pr->pr_root->v_mount == mp)
4074
return (0);
4075
if (pr->pr_enforce_statfs == 2)
4076
return (ENOENT);
4077
/*
4078
* If jail's chroot directory is set to "/" we should be able to see
4079
* all mount-points from inside a jail.
4080
* This is ugly check, but this is the only situation when jail's
4081
* directory ends with '/'.
4082
*/
4083
if (strcmp(pr->pr_path, "/") == 0)
4084
return (0);
4085
len = strlen(pr->pr_path);
4086
sp = &mp->mnt_stat;
4087
if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
4088
return (ENOENT);
4089
/*
4090
* Be sure that we don't have situation where jail's root directory
4091
* is "/some/path" and mount point is "/some/pathpath".
4092
*/
4093
if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
4094
return (ENOENT);
4095
return (0);
4096
}
4097
4098
void
4099
prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
4100
{
4101
char jpath[MAXPATHLEN];
4102
struct prison *pr;
4103
size_t len;
4104
4105
pr = cred->cr_prison;
4106
if (pr->pr_enforce_statfs == 0)
4107
return;
4108
if (prison_canseemount(cred, mp) != 0) {
4109
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4110
strlcpy(sp->f_mntonname, "[restricted]",
4111
sizeof(sp->f_mntonname));
4112
return;
4113
}
4114
if (pr->pr_root->v_mount == mp) {
4115
/*
4116
* Clear current buffer data, so we are sure nothing from
4117
* the valid path left there.
4118
*/
4119
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4120
*sp->f_mntonname = '/';
4121
return;
4122
}
4123
/*
4124
* If jail's chroot directory is set to "/" we should be able to see
4125
* all mount-points from inside a jail.
4126
*/
4127
if (strcmp(pr->pr_path, "/") == 0)
4128
return;
4129
len = strlen(pr->pr_path);
4130
strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
4131
/*
4132
* Clear current buffer data, so we are sure nothing from
4133
* the valid path left there.
4134
*/
4135
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4136
if (*jpath == '\0') {
4137
/* Should never happen. */
4138
*sp->f_mntonname = '/';
4139
} else {
4140
strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
4141
}
4142
}
4143
4144
/*
4145
* Check with permission for a specific privilege is granted within jail. We
4146
* have a specific list of accepted privileges; the rest are denied.
4147
*/
4148
int
4149
prison_priv_check(struct ucred *cred, int priv)
4150
{
4151
struct prison *pr;
4152
int error;
4153
4154
/*
4155
* Some policies have custom handlers. This routine should not be
4156
* called for them. See priv_check_cred().
4157
*/
4158
switch (priv) {
4159
case PRIV_VFS_LOOKUP:
4160
case PRIV_VFS_GENERATION:
4161
KASSERT(0, ("prison_priv_check instead of a custom handler "
4162
"called for %d\n", priv));
4163
}
4164
4165
if (!jailed(cred))
4166
return (0);
4167
4168
#ifdef VIMAGE
4169
/*
4170
* Privileges specific to prisons with a virtual network stack.
4171
* There might be a duplicate entry here in case the privilege
4172
* is only granted conditionally in the legacy jail case.
4173
*/
4174
switch (priv) {
4175
/*
4176
* NFS-specific privileges.
4177
*/
4178
case PRIV_NFS_DAEMON:
4179
case PRIV_VFS_GETFH:
4180
case PRIV_VFS_MOUNT_EXPORTED:
4181
if (!prison_check_nfsd(cred))
4182
return (EPERM);
4183
#ifdef notyet
4184
case PRIV_NFS_LOCKD:
4185
#endif
4186
/*
4187
* Network stack privileges.
4188
*/
4189
case PRIV_NET_BRIDGE:
4190
case PRIV_NET_GRE:
4191
case PRIV_NET_BPF:
4192
case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
4193
case PRIV_NET_ROUTE:
4194
case PRIV_NET_TAP:
4195
case PRIV_NET_SETIFMTU:
4196
case PRIV_NET_SETIFFLAGS:
4197
case PRIV_NET_SETIFCAP:
4198
case PRIV_NET_SETIFDESCR:
4199
case PRIV_NET_SETIFNAME :
4200
case PRIV_NET_SETIFMETRIC:
4201
case PRIV_NET_SETIFPHYS:
4202
case PRIV_NET_SETIFMAC:
4203
case PRIV_NET_SETLANPCP:
4204
case PRIV_NET_ADDMULTI:
4205
case PRIV_NET_DELMULTI:
4206
case PRIV_NET_HWIOCTL:
4207
case PRIV_NET_SETLLADDR:
4208
case PRIV_NET_ADDIFGROUP:
4209
case PRIV_NET_DELIFGROUP:
4210
case PRIV_NET_IFCREATE:
4211
case PRIV_NET_IFDESTROY:
4212
case PRIV_NET_ADDIFADDR:
4213
case PRIV_NET_DELIFADDR:
4214
case PRIV_NET_LAGG:
4215
case PRIV_NET_GIF:
4216
case PRIV_NET_SETIFVNET:
4217
case PRIV_NET_SETIFFIB:
4218
case PRIV_NET_OVPN:
4219
case PRIV_NET_ME:
4220
case PRIV_NET_WG:
4221
4222
/*
4223
* 802.11-related privileges.
4224
*/
4225
case PRIV_NET80211_VAP_GETKEY:
4226
case PRIV_NET80211_VAP_MANAGE:
4227
4228
#ifdef notyet
4229
/*
4230
* ATM privileges.
4231
*/
4232
case PRIV_NETATM_CFG:
4233
case PRIV_NETATM_ADD:
4234
case PRIV_NETATM_DEL:
4235
case PRIV_NETATM_SET:
4236
4237
/*
4238
* Bluetooth privileges.
4239
*/
4240
case PRIV_NETBLUETOOTH_RAW:
4241
#endif
4242
4243
/*
4244
* Netgraph and netgraph module privileges.
4245
*/
4246
case PRIV_NETGRAPH_CONTROL:
4247
#ifdef notyet
4248
case PRIV_NETGRAPH_TTY:
4249
#endif
4250
4251
/*
4252
* IPv4 and IPv6 privileges.
4253
*/
4254
case PRIV_NETINET_IPFW:
4255
case PRIV_NETINET_DIVERT:
4256
case PRIV_NETINET_PF:
4257
case PRIV_NETINET_DUMMYNET:
4258
case PRIV_NETINET_CARP:
4259
case PRIV_NETINET_MROUTE:
4260
case PRIV_NETINET_RAW:
4261
case PRIV_NETINET_ADDRCTRL6:
4262
case PRIV_NETINET_ND6:
4263
case PRIV_NETINET_SCOPE6:
4264
case PRIV_NETINET_ALIFETIME6:
4265
case PRIV_NETINET_IPSEC:
4266
case PRIV_NETINET_BINDANY:
4267
4268
#ifdef notyet
4269
/*
4270
* NCP privileges.
4271
*/
4272
case PRIV_NETNCP:
4273
4274
/*
4275
* SMB privileges.
4276
*/
4277
case PRIV_NETSMB:
4278
#endif
4279
4280
/*
4281
* No default: or deny here.
4282
* In case of no permit fall through to next switch().
4283
*/
4284
if (cred->cr_prison->pr_flags & PR_VNET)
4285
return (0);
4286
}
4287
#endif /* VIMAGE */
4288
4289
switch (priv) {
4290
/*
4291
* Allow ktrace privileges for root in jail.
4292
*/
4293
case PRIV_KTRACE:
4294
4295
/*
4296
* Allow jailed processes to configure audit identity and
4297
* submit audit records (login, etc). In the future we may
4298
* want to further refine the relationship between audit and
4299
* jail.
4300
*/
4301
case PRIV_AUDIT_GETAUDIT:
4302
case PRIV_AUDIT_SETAUDIT:
4303
if (cred->cr_prison->pr_allow & PR_ALLOW_SETAUDIT)
4304
return (0);
4305
else
4306
return (EPERM);
4307
#if 0
4308
case PRIV_AUDIT_SUBMIT:
4309
#endif
4310
4311
/*
4312
* Allow jailed processes to manipulate process UNIX
4313
* credentials in any way they see fit.
4314
*/
4315
case PRIV_CRED_SETCRED:
4316
case PRIV_CRED_SETUID:
4317
case PRIV_CRED_SETEUID:
4318
case PRIV_CRED_SETGID:
4319
case PRIV_CRED_SETEGID:
4320
case PRIV_CRED_SETGROUPS:
4321
case PRIV_CRED_SETREUID:
4322
case PRIV_CRED_SETREGID:
4323
case PRIV_CRED_SETRESUID:
4324
case PRIV_CRED_SETRESGID:
4325
4326
/*
4327
* Jail implements visibility constraints already, so allow
4328
* jailed root to override uid/gid-based constraints.
4329
*/
4330
case PRIV_SEEOTHERGIDS:
4331
case PRIV_SEEOTHERUIDS:
4332
case PRIV_SEEJAILPROC:
4333
4334
/*
4335
* Jail implements inter-process debugging limits already, so
4336
* allow jailed root various debugging privileges.
4337
*/
4338
case PRIV_DEBUG_DIFFCRED:
4339
case PRIV_DEBUG_SUGID:
4340
case PRIV_DEBUG_UNPRIV:
4341
case PRIV_DEBUG_DIFFJAIL:
4342
4343
/*
4344
* Allow jail to set various resource limits and login
4345
* properties, and for now, exceed process resource limits.
4346
*/
4347
case PRIV_PROC_LIMIT:
4348
case PRIV_PROC_SETLOGIN:
4349
case PRIV_PROC_SETRLIMIT:
4350
4351
/*
4352
* Debuggers should work in jails.
4353
*/
4354
case PRIV_PROC_MEM_WRITE:
4355
4356
/*
4357
* System V and POSIX IPC privileges are granted in jail.
4358
*/
4359
case PRIV_IPC_READ:
4360
case PRIV_IPC_WRITE:
4361
case PRIV_IPC_ADMIN:
4362
case PRIV_IPC_MSGSIZE:
4363
case PRIV_MQ_ADMIN:
4364
4365
/*
4366
* Jail operations within a jail work on child jails.
4367
*/
4368
case PRIV_JAIL_ATTACH:
4369
case PRIV_JAIL_SET:
4370
case PRIV_JAIL_REMOVE:
4371
4372
/*
4373
* Jail implements its own inter-process limits, so allow
4374
* root processes in jail to change scheduling on other
4375
* processes in the same jail. Likewise for signalling.
4376
*/
4377
case PRIV_SCHED_DIFFCRED:
4378
case PRIV_SCHED_CPUSET:
4379
case PRIV_SCHED_DIFFJAIL:
4380
case PRIV_SIGNAL_DIFFCRED:
4381
case PRIV_SIGNAL_SUGID:
4382
case PRIV_SIGNAL_DIFFJAIL:
4383
4384
/*
4385
* Allow jailed processes to write to sysctls marked as jail
4386
* writable.
4387
*/
4388
case PRIV_SYSCTL_WRITEJAIL:
4389
4390
/*
4391
* Allow root in jail to manage a variety of quota
4392
* properties. These should likely be conditional on a
4393
* configuration option.
4394
*/
4395
case PRIV_VFS_GETQUOTA:
4396
case PRIV_VFS_SETQUOTA:
4397
4398
/*
4399
* Since Jail relies on chroot() to implement file system
4400
* protections, grant many VFS privileges to root in jail.
4401
* Be careful to exclude mount-related and NFS-related
4402
* privileges.
4403
*/
4404
case PRIV_VFS_READ:
4405
case PRIV_VFS_WRITE:
4406
case PRIV_VFS_ADMIN:
4407
case PRIV_VFS_EXEC:
4408
case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
4409
case PRIV_VFS_CHFLAGS_DEV:
4410
case PRIV_VFS_CHOWN:
4411
case PRIV_VFS_CHROOT:
4412
case PRIV_VFS_RETAINSUGID:
4413
case PRIV_VFS_FCHROOT:
4414
case PRIV_VFS_LINK:
4415
case PRIV_VFS_SETGID:
4416
case PRIV_VFS_STAT:
4417
case PRIV_VFS_STICKYFILE:
4418
4419
/*
4420
* As in the non-jail case, non-root users are expected to be
4421
* able to read kernel/physical memory (provided /dev/[k]mem
4422
* exists in the jail and they have permission to access it).
4423
*/
4424
case PRIV_KMEM_READ:
4425
return (0);
4426
4427
/*
4428
* Depending on the global setting, allow privilege of
4429
* setting system flags.
4430
*/
4431
case PRIV_VFS_SYSFLAGS:
4432
if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
4433
return (0);
4434
else
4435
return (EPERM);
4436
4437
/*
4438
* Depending on the global setting, allow privilege of
4439
* mounting/unmounting file systems.
4440
*/
4441
case PRIV_VFS_MOUNT:
4442
case PRIV_VFS_UNMOUNT:
4443
case PRIV_VFS_MOUNT_NONUSER:
4444
case PRIV_VFS_MOUNT_OWNER:
4445
pr = cred->cr_prison;
4446
prison_lock(pr);
4447
if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
4448
error = 0;
4449
else
4450
error = EPERM;
4451
prison_unlock(pr);
4452
return (error);
4453
4454
/*
4455
* Jails should hold no disposition on the PRIV_VFS_READ_DIR
4456
* policy. priv_check_cred will not specifically allow it, and
4457
* we may want a MAC policy to allow it.
4458
*/
4459
case PRIV_VFS_READ_DIR:
4460
return (0);
4461
4462
/*
4463
* Conditionally allow privileged process in the jail to
4464
* manipulate filesystem extended attributes in the system
4465
* namespace.
4466
*/
4467
case PRIV_VFS_EXTATTR_SYSTEM:
4468
if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0)
4469
return (0);
4470
else
4471
return (EPERM);
4472
4473
/*
4474
* Conditionnaly allow locking (unlocking) physical pages
4475
* in memory.
4476
*/
4477
case PRIV_VM_MLOCK:
4478
case PRIV_VM_MUNLOCK:
4479
if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
4480
return (0);
4481
else
4482
return (EPERM);
4483
4484
/*
4485
* Conditionally allow jailed root to bind reserved ports.
4486
*/
4487
case PRIV_NETINET_RESERVEDPORT:
4488
if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
4489
return (0);
4490
else
4491
return (EPERM);
4492
4493
/*
4494
* Allow jailed root to reuse in-use ports.
4495
*/
4496
case PRIV_NETINET_REUSEPORT:
4497
return (0);
4498
4499
/*
4500
* Allow jailed root to set certain IPv4/6 (option) headers.
4501
*/
4502
case PRIV_NETINET_SETHDROPTS:
4503
return (0);
4504
4505
/*
4506
* Conditionally allow creating raw sockets in jail.
4507
*/
4508
case PRIV_NETINET_RAW:
4509
if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
4510
return (0);
4511
else
4512
return (EPERM);
4513
4514
/*
4515
* Since jail implements its own visibility limits on netstat
4516
* sysctls, allow getcred. This allows identd to work in
4517
* jail.
4518
*/
4519
case PRIV_NETINET_GETCRED:
4520
return (0);
4521
4522
/*
4523
* Allow jailed root to set loginclass.
4524
*/
4525
case PRIV_PROC_SETLOGINCLASS:
4526
return (0);
4527
4528
/*
4529
* Do not allow a process inside a jail to read the kernel
4530
* message buffer unless explicitly permitted.
4531
*/
4532
case PRIV_MSGBUF:
4533
if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
4534
return (0);
4535
return (EPERM);
4536
4537
/*
4538
* Conditionally allow privileged process in the jail adjust
4539
* machine time.
4540
*/
4541
case PRIV_ADJTIME:
4542
case PRIV_NTP_ADJTIME:
4543
if (cred->cr_prison->pr_allow &
4544
(PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) {
4545
return (0);
4546
}
4547
return (EPERM);
4548
4549
/*
4550
* Conditionally allow privileged process in the jail set
4551
* machine time.
4552
*/
4553
case PRIV_SETTIMEOFDAY:
4554
case PRIV_CLOCK_SETTIME:
4555
if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME)
4556
return (0);
4557
else
4558
return (EPERM);
4559
4560
/*
4561
* Conditionally allow privileged process in the jail to modify
4562
* the routing table.
4563
*/
4564
case PRIV_NET_ROUTE:
4565
if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING)
4566
return (0);
4567
else
4568
return (EPERM);
4569
4570
default:
4571
/*
4572
* In all remaining cases, deny the privilege request. This
4573
* includes almost all network privileges, many system
4574
* configuration privileges.
4575
*/
4576
return (EPERM);
4577
}
4578
}
4579
4580
/*
4581
* Return the part of pr2's name that is relative to pr1, or the whole name
4582
* if it does not directly follow.
4583
*/
4584
4585
char *
4586
prison_name(struct prison *pr1, struct prison *pr2)
4587
{
4588
char *name;
4589
4590
/* Jails see themselves as "0" (if they see themselves at all). */
4591
if (pr1 == pr2)
4592
return "0";
4593
name = pr2->pr_name;
4594
if (prison_ischild(pr1, pr2)) {
4595
/*
4596
* pr1 isn't locked (and allprison_lock may not be either)
4597
* so its length can't be counted on. But the number of dots
4598
* can be counted on - and counted.
4599
*/
4600
for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4601
name = strchr(name, '.') + 1;
4602
}
4603
return (name);
4604
}
4605
4606
/*
4607
* Return the part of pr2's path that is relative to pr1, or the whole path
4608
* if it does not directly follow.
4609
*/
4610
static char *
4611
prison_path(struct prison *pr1, struct prison *pr2)
4612
{
4613
char *path1, *path2;
4614
int len1;
4615
4616
path1 = pr1->pr_path;
4617
path2 = pr2->pr_path;
4618
if (!strcmp(path1, "/"))
4619
return (path2);
4620
len1 = strlen(path1);
4621
if (strncmp(path1, path2, len1))
4622
return (path2);
4623
if (path2[len1] == '\0')
4624
return "/";
4625
if (path2[len1] == '/')
4626
return (path2 + len1);
4627
return (path2);
4628
}
4629
4630
/*
4631
* Jail-related sysctls.
4632
*/
4633
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4634
"Jails");
4635
4636
#if defined(INET) || defined(INET6)
/*
 * sysctl_jail_list() helper: copy the address array of family af from pr
 * into the scratch buffer *out, which is later SYSCTL_OUT-ed by the caller.
 *
 * pr->pr_mtx must be owned on entry and is owned again on return.  It is
 * dropped around realloc() (called with M_WAITOK), so after every resize
 * the address list is looked up again from scratch.
 *
 * On return *len holds the number of addresses copied for this prison,
 * zero if it has none.  The caller reports *len as the prison's address
 * count, so it must never carry over a larger value left behind by a
 * previously visited prison; doing so would publish a wrong count together
 * with stale addresses still sitting in the scratch buffer.  On entry *len
 * must not exceed the number of elements *out can hold, which is true for
 * the initial NULL/0 pair and is preserved by every return from here.
 */
static void
prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
{
	const struct prison_ip *pip;
	const size_t size = pr_families[af].size;

	for (;;) {
		mtx_assert(&pr->pr_mtx, MA_OWNED);
		pip = pr->pr_addrs[af];
		if (pip == NULL) {
			/* No addresses in this family: report none. */
			*len = 0;
			return;
		}
		if (*len >= pip->ips)
			break;

		/* Buffer too small: grow it unlocked, then re-examine. */
		*len = pip->ips;
		mtx_unlock(&pr->pr_mtx);
		*out = realloc(*out, *len * size, M_TEMP, M_WAITOK);
		mtx_lock(&pr->pr_mtx);
	}

	bcopy(pip->pr_ip, *out, pip->ips * size);
	/* Hand back the real count, not the (possibly larger) old value. */
	*len = pip->ips;
}
#endif
4661
4662
static int
4663
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4664
{
4665
struct xprison *xp;
4666
struct prison *pr, *cpr;
4667
#ifdef INET
4668
struct in_addr *ip4 = NULL;
4669
int ip4s = 0;
4670
#endif
4671
#ifdef INET6
4672
struct in6_addr *ip6 = NULL;
4673
int ip6s = 0;
4674
#endif
4675
int descend, error;
4676
4677
xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4678
pr = req->td->td_ucred->cr_prison;
4679
error = 0;
4680
sx_slock(&allprison_lock);
4681
FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4682
mtx_lock(&cpr->pr_mtx);
4683
#ifdef INET
4684
prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
4685
#endif
4686
#ifdef INET6
4687
prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
4688
#endif
4689
bzero(xp, sizeof(*xp));
4690
xp->pr_version = XPRISON_VERSION;
4691
xp->pr_id = cpr->pr_id;
4692
xp->pr_state = cpr->pr_state;
4693
strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4694
strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4695
strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4696
#ifdef INET
4697
xp->pr_ip4s = ip4s;
4698
#endif
4699
#ifdef INET6
4700
xp->pr_ip6s = ip6s;
4701
#endif
4702
mtx_unlock(&cpr->pr_mtx);
4703
error = SYSCTL_OUT(req, xp, sizeof(*xp));
4704
if (error)
4705
break;
4706
#ifdef INET
4707
if (xp->pr_ip4s > 0) {
4708
error = SYSCTL_OUT(req, ip4,
4709
xp->pr_ip4s * sizeof(struct in_addr));
4710
if (error)
4711
break;
4712
}
4713
#endif
4714
#ifdef INET6
4715
if (xp->pr_ip6s > 0) {
4716
error = SYSCTL_OUT(req, ip6,
4717
xp->pr_ip6s * sizeof(struct in6_addr));
4718
if (error)
4719
break;
4720
}
4721
#endif
4722
}
4723
sx_sunlock(&allprison_lock);
4724
free(xp, M_TEMP);
4725
#ifdef INET
4726
free(ip4, M_TEMP);
4727
#endif
4728
#ifdef INET6
4729
free(ip6, M_TEMP);
4730
#endif
4731
return (error);
4732
}
4733
4734
SYSCTL_OID(_security_jail, OID_AUTO, list,
4735
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4736
sysctl_jail_list, "S", "List of active jails");
4737
4738
static int
4739
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4740
{
4741
int error, injail;
4742
4743
injail = jailed(req->td->td_ucred);
4744
error = SYSCTL_OUT(req, &injail, sizeof(injail));
4745
4746
return (error);
4747
}
4748
4749
SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4750
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4751
sysctl_jail_jailed, "I", "Process in jail?");
4752
4753
static int
4754
sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4755
{
4756
int error, havevnet;
4757
#ifdef VIMAGE
4758
struct ucred *cred = req->td->td_ucred;
4759
4760
havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison);
4761
#else
4762
havevnet = 0;
4763
#endif
4764
error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4765
4766
return (error);
4767
}
4768
4769
SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4770
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4771
sysctl_jail_vnet, "I", "Jail owns vnet?");
4772
4773
#if defined(INET) || defined(INET6)
/* Read/write export of the jail_max_af_ips variable. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4778
4779
/*
4780
* Default parameters for jail(2) compatibility. For historical reasons,
4781
* the sysctl names have varying similarity to the parameter names. Prisons
4782
* just see their own parameters, and can't change them.
4783
*/
4784
static int
4785
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4786
{
4787
int error, i;
4788
4789
/* Get the current flag value, and convert it to a boolean. */
4790
if (req->td->td_ucred->cr_prison == &prison0) {
4791
mtx_lock(&prison0.pr_mtx);
4792
i = (jail_default_allow & arg2) != 0;
4793
mtx_unlock(&prison0.pr_mtx);
4794
} else
4795
i = prison_allow(req->td->td_ucred, arg2);
4796
4797
if (arg1 != NULL)
4798
i = !i;
4799
error = sysctl_handle_int(oidp, &i, 0, req);
4800
if (error || !req->newptr)
4801
return (error);
4802
i = i ? arg2 : 0;
4803
if (arg1 != NULL)
4804
i ^= arg2;
4805
/*
4806
* The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4807
* for writing.
4808
*/
4809
mtx_lock(&prison0.pr_mtx);
4810
jail_default_allow = (jail_default_allow & ~arg2) | i;
4811
mtx_unlock(&prison0.pr_mtx);
4812
return (0);
4813
}
4814
4815
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
4816
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4817
NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
4818
"Processes in jail can set their hostnames (deprecated)");
4819
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
4820
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4821
(void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
4822
"Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
4823
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
4824
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4825
NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4826
"Processes in jail can use System V IPC primitives (deprecated)");
4827
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4828
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4829
NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4830
"Prison root can create raw sockets (deprecated)");
4831
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4832
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4833
NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4834
"Processes in jail can alter system file flags (deprecated)");
4835
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4836
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4837
NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4838
"Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4839
SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
4840
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4841
NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
4842
"Processes in jail can lock/unlock physical pages in memory");
4843
4844
static int
4845
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4846
{
4847
struct prison *pr;
4848
int level, error;
4849
4850
pr = req->td->td_ucred->cr_prison;
4851
level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4852
error = sysctl_handle_int(oidp, &level, 0, req);
4853
if (error || !req->newptr)
4854
return (error);
4855
*(int *)arg1 = level;
4856
return (0);
4857
}
4858
4859
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4860
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4861
&jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4862
sysctl_jail_default_level, "I",
4863
"Processes in jail cannot see all mounted file systems (deprecated)");
4864
4865
SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4866
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4867
&jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4868
sysctl_jail_default_level, "I",
4869
"Ruleset for the devfs filesystem in jail (deprecated)");
4870
4871
SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4872
"Limits and stats of child jails");
4873
4874
static int
4875
sysctl_jail_children(SYSCTL_HANDLER_ARGS)
4876
{
4877
struct prison *pr;
4878
int i;
4879
4880
pr = req->td->td_ucred->cr_prison;
4881
4882
switch (oidp->oid_kind & CTLTYPE) {
4883
case CTLTYPE_INT:
4884
i = *(int *)((char *)pr + arg2);
4885
return (SYSCTL_OUT(req, &i, sizeof(i)));
4886
}
4887
4888
return (0);
4889
}
4890
4891
SYSCTL_PROC(_security_jail_children, OID_AUTO, max,
4892
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4893
NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children,
4894
"I", "Maximum number of child jails");
4895
SYSCTL_PROC(_security_jail_children, OID_AUTO, cur,
4896
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4897
NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children,
4898
"I", "Current number of child jails");
4899
4900
/*
4901
* Nodes to describe jail parameters. Maximum length of string parameters
4902
* is returned in the string itself, and the other parameters exist merely
4903
* to make themselves and their types known.
4904
*/
4905
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4906
"Jail parameters");
4907
4908
int
4909
sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4910
{
4911
int i;
4912
long l;
4913
size_t s;
4914
char numbuf[12];
4915
4916
switch (oidp->oid_kind & CTLTYPE)
4917
{
4918
case CTLTYPE_LONG:
4919
case CTLTYPE_ULONG:
4920
l = 0;
4921
#ifdef SCTL_MASK32
4922
if (!(req->flags & SCTL_MASK32))
4923
#endif
4924
return (SYSCTL_OUT(req, &l, sizeof(l)));
4925
case CTLTYPE_INT:
4926
case CTLTYPE_UINT:
4927
i = 0;
4928
return (SYSCTL_OUT(req, &i, sizeof(i)));
4929
case CTLTYPE_STRING:
4930
snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4931
return
4932
(sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4933
case CTLTYPE_STRUCT:
4934
s = (size_t)arg2;
4935
return (SYSCTL_OUT(req, &s, sizeof(s)));
4936
}
4937
return (0);
4938
}
4939
4940
/*
4941
* CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4942
* jail creation time but cannot be changed in an existing jail.
4943
*/
4944
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4945
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4946
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4947
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4948
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4949
"I", "Jail secure level");
4950
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4951
"Jail value for kern.osreldate and uname -K");
4952
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4953
"Jail value for kern.osrelease and uname -r");
4954
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4955
"I", "Jail cannot see all mounted file systems");
4956
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4957
"I", "Ruleset for in-jail devfs mounts");
4958
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4959
"B", "Jail persistence");
4960
#ifdef VIMAGE
4961
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4962
"E,jailsys", "Virtual network stack");
4963
#endif
4964
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4965
"B", "Jail is in the process of shutting down");
4966
4967
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4968
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4969
"I", "Current number of child jails");
4970
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4971
"I", "Maximum number of child jails");
4972
4973
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4974
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4975
"Jail hostname");
4976
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4977
"Jail NIS domainname");
4978
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4979
"Jail host UUID");
4980
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4981
"LU", "Jail host ID");
4982
4983
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4984
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4985
4986
#ifdef INET
4987
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4988
"Jail IPv4 address virtualization");
4989
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4990
"S,in_addr,a", "Jail IPv4 addresses");
4991
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4992
"B", "Do (not) use IPv4 source address selection rather than the "
4993
"primary jail IPv4 address.");
4994
#endif
4995
#ifdef INET6
4996
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4997
"Jail IPv6 address virtualization");
4998
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4999
"S,in6_addr,a", "Jail IPv6 addresses");
5000
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
5001
"B", "Do (not) use IPv6 source address selection rather than the "
5002
"primary jail IPv6 address.");
5003
#endif
5004
5005
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
5006
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
5007
"B", "Jail may set hostname");
5008
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
5009
"B", "Jail may use SYSV IPC");
5010
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
5011
"B", "Jail may create raw sockets");
5012
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
5013
"B", "Jail may alter system file flags");
5014
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
5015
"B", "Jail may set file quotas");
5016
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
5017
"B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
5018
SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
5019
"B", "Jail may lock (unlock) physical pages in memory");
5020
SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
5021
"B", "Jail may bind sockets to reserved ports");
5022
SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
5023
"B", "Jail may read the kernel message buffer");
5024
SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
5025
"B", "Unprivileged processes may use process debugging facilities");
5026
SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering,
5027
CTLTYPE_INT | CTLFLAG_RW, "B",
5028
"Unprivileged parent jail processes may tamper with same-uid processes"
5029
" (signal/debug/cpuset)");
5030
SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
5031
"B", "Processes in jail with uid 0 have privilege");
5032
#ifdef VIMAGE
5033
SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
5034
"B", "Mountd/nfsd may run in the jail");
5035
#endif
5036
SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW,
5037
"B", "Jail may set system-level filesystem extended attributes");
5038
SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW,
5039
"B", "Jail may adjust system time");
5040
SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
5041
"B", "Jail may set system time");
5042
SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW,
5043
"B", "Jail may modify routing table");
5044
#ifdef AUDIT
5045
SYSCTL_JAIL_PARAM(_allow, setaudit, CTLTYPE_INT | CTLFLAG_RW,
5046
"B", "Jail may set and get audit session state");
5047
#endif
5048
5049
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
5050
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
5051
"B", "Jail may mount/unmount jail-friendly file systems in general");
5052
5053
/*
5054
* Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return
5055
* its associated bit in the pr_allow bitmask, or zero if the parameter was
5056
* not created.
5057
*/
5058
unsigned
5059
prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
5060
const char *descr)
5061
{
5062
struct bool_flags *bf;
5063
struct sysctl_oid *parent;
5064
char *allow_name, *allow_noname, *allowed;
5065
#ifndef NO_SYSCTL_DESCR
5066
char *descr_deprecated;
5067
#endif
5068
u_int allow_flag;
5069
5070
if (prefix
5071
? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
5072
< 0 ||
5073
asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
5074
< 0
5075
: asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
5076
asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
5077
free(allow_name, M_PRISON);
5078
return 0;
5079
}
5080
5081
/*
5082
* See if this parameter has already beed added, i.e. a module was
5083
* previously loaded/unloaded.
5084
*/
5085
mtx_lock(&prison0.pr_mtx);
5086
for (bf = pr_flag_allow;
5087
bf < pr_flag_allow + nitems(pr_flag_allow) &&
5088
atomic_load_int(&bf->flag) != 0;
5089
bf++) {
5090
if (strcmp(bf->name, allow_name) == 0) {
5091
allow_flag = bf->flag;
5092
goto no_add;
5093
}
5094
}
5095
5096
/*
5097
* Find a free bit in pr_allow_all, failing if there are none
5098
* (which shouldn't happen as long as we keep track of how many
5099
* potential dynamic flags exist).
5100
*/
5101
for (allow_flag = 1;; allow_flag <<= 1) {
5102
if (allow_flag == 0)
5103
goto no_add;
5104
if ((pr_allow_all & allow_flag) == 0)
5105
break;
5106
}
5107
5108
/* Note the parameter in the next open slot in pr_flag_allow. */
5109
for (bf = pr_flag_allow; ; bf++) {
5110
if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
5111
/* This should never happen, but is not fatal. */
5112
allow_flag = 0;
5113
goto no_add;
5114
}
5115
if (atomic_load_int(&bf->flag) == 0)
5116
break;
5117
}
5118
bf->name = allow_name;
5119
bf->noname = allow_noname;
5120
pr_allow_all |= allow_flag;
5121
/*
5122
* prison0 always has permission for the new parameter.
5123
* Other jails must have it granted to them.
5124
*/
5125
prison0.pr_allow |= allow_flag;
5126
/* The flag indicates a valid entry, so make sure it is set last. */
5127
atomic_store_rel_int(&bf->flag, allow_flag);
5128
mtx_unlock(&prison0.pr_mtx);
5129
5130
/*
5131
* Create sysctls for the parameter, and the back-compat global
5132
* permission.
5133
*/
5134
parent = prefix
5135
? SYSCTL_ADD_NODE(NULL,
5136
SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
5137
OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
5138
: &sysctl___security_jail_param_allow;
5139
(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
5140
name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
5141
NULL, 0, sysctl_jail_param, "B", descr);
5142
if ((prefix
5143
? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
5144
: asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
5145
#ifndef NO_SYSCTL_DESCR
5146
(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
5147
descr);
5148
#endif
5149
(void)SYSCTL_ADD_PROC(NULL,
5150
SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
5151
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
5152
sysctl_jail_default_allow, "I", descr_deprecated);
5153
#ifndef NO_SYSCTL_DESCR
5154
free(descr_deprecated, M_TEMP);
5155
#endif
5156
free(allowed, M_TEMP);
5157
}
5158
return allow_flag;
5159
5160
no_add:
5161
mtx_unlock(&prison0.pr_mtx);
5162
free(allow_name, M_PRISON);
5163
free(allow_noname, M_PRISON);
5164
return allow_flag;
5165
}
5166
5167
/*
5168
* The VFS system will register jail-aware filesystems here. They each get
5169
* a parameter allow.mount.xxxfs and a flag to check when a jailed user
5170
* attempts to mount.
5171
*/
5172
void
5173
prison_add_vfs(struct vfsconf *vfsp)
5174
{
5175
#ifdef NO_SYSCTL_DESCR
5176
5177
vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
5178
NULL, NULL);
5179
#else
5180
char *descr;
5181
5182
(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
5183
vfsp->vfc_name);
5184
vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
5185
NULL, descr);
5186
free(descr, M_TEMP);
5187
#endif
5188
}
5189
5190
#ifdef RACCT
5191
void
5192
prison_racct_foreach(void (*callback)(struct racct *racct,
5193
void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
5194
void *arg2, void *arg3)
5195
{
5196
struct prison_racct *prr;
5197
5198
ASSERT_RACCT_ENABLED();
5199
5200
sx_slock(&allprison_lock);
5201
if (pre != NULL)
5202
(pre)();
5203
LIST_FOREACH(prr, &allprison_racct, prr_next)
5204
(callback)(prr->prr_racct, arg2, arg3);
5205
if (post != NULL)
5206
(post)();
5207
sx_sunlock(&allprison_lock);
5208
}
5209
5210
static struct prison_racct *
5211
prison_racct_find_locked(const char *name)
5212
{
5213
struct prison_racct *prr;
5214
5215
ASSERT_RACCT_ENABLED();
5216
sx_assert(&allprison_lock, SA_XLOCKED);
5217
5218
if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
5219
return (NULL);
5220
5221
LIST_FOREACH(prr, &allprison_racct, prr_next) {
5222
if (strcmp(name, prr->prr_name) != 0)
5223
continue;
5224
5225
/* Found prison_racct with a matching name? */
5226
prison_racct_hold(prr);
5227
return (prr);
5228
}
5229
5230
/* Add new prison_racct. */
5231
prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
5232
racct_create(&prr->prr_racct);
5233
5234
strcpy(prr->prr_name, name);
5235
refcount_init(&prr->prr_refcount, 1);
5236
LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
5237
5238
return (prr);
5239
}
5240
5241
struct prison_racct *
5242
prison_racct_find(const char *name)
5243
{
5244
struct prison_racct *prr;
5245
5246
ASSERT_RACCT_ENABLED();
5247
5248
sx_xlock(&allprison_lock);
5249
prr = prison_racct_find_locked(name);
5250
sx_xunlock(&allprison_lock);
5251
return (prr);
5252
}
5253
5254
void
5255
prison_racct_hold(struct prison_racct *prr)
5256
{
5257
5258
ASSERT_RACCT_ENABLED();
5259
5260
refcount_acquire(&prr->prr_refcount);
5261
}
5262
5263
static void
5264
prison_racct_free_locked(struct prison_racct *prr)
5265
{
5266
5267
ASSERT_RACCT_ENABLED();
5268
sx_assert(&allprison_lock, SA_XLOCKED);
5269
5270
if (refcount_release(&prr->prr_refcount)) {
5271
racct_destroy(&prr->prr_racct);
5272
LIST_REMOVE(prr, prr_next);
5273
free(prr, M_PRISON_RACCT);
5274
}
5275
}
5276
5277
void
5278
prison_racct_free(struct prison_racct *prr)
5279
{
5280
5281
ASSERT_RACCT_ENABLED();
5282
sx_assert(&allprison_lock, SA_UNLOCKED);
5283
5284
if (refcount_release_if_not_last(&prr->prr_refcount))
5285
return;
5286
5287
sx_xlock(&allprison_lock);
5288
prison_racct_free_locked(prr);
5289
sx_xunlock(&allprison_lock);
5290
}
5291
5292
static void
5293
prison_racct_attach(struct prison *pr)
5294
{
5295
struct prison_racct *prr;
5296
5297
ASSERT_RACCT_ENABLED();
5298
sx_assert(&allprison_lock, SA_XLOCKED);
5299
5300
prr = prison_racct_find_locked(pr->pr_name);
5301
KASSERT(prr != NULL, ("cannot find prison_racct"));
5302
5303
pr->pr_prison_racct = prr;
5304
}
5305
5306
/*
5307
* Handle jail renaming. From the racct point of view, renaming means
5308
* moving from one prison_racct to another.
5309
*/
5310
static void
5311
prison_racct_modify(struct prison *pr)
5312
{
5313
#ifdef RCTL
5314
struct proc *p;
5315
struct ucred *cred;
5316
#endif
5317
struct prison_racct *oldprr;
5318
5319
ASSERT_RACCT_ENABLED();
5320
5321
sx_slock(&allproc_lock);
5322
sx_xlock(&allprison_lock);
5323
5324
if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
5325
sx_xunlock(&allprison_lock);
5326
sx_sunlock(&allproc_lock);
5327
return;
5328
}
5329
5330
oldprr = pr->pr_prison_racct;
5331
pr->pr_prison_racct = NULL;
5332
5333
prison_racct_attach(pr);
5334
5335
/*
5336
* Move resource utilisation records.
5337
*/
5338
racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
5339
5340
#ifdef RCTL
5341
/*
5342
* Force rctl to reattach rules to processes.
5343
*/
5344
FOREACH_PROC_IN_SYSTEM(p) {
5345
PROC_LOCK(p);
5346
cred = crhold(p->p_ucred);
5347
PROC_UNLOCK(p);
5348
rctl_proc_ucred_changed(p, cred);
5349
crfree(cred);
5350
}
5351
#endif
5352
5353
sx_sunlock(&allproc_lock);
5354
prison_racct_free_locked(oldprr);
5355
sx_xunlock(&allprison_lock);
5356
}
5357
5358
static void
5359
prison_racct_detach(struct prison *pr)
5360
{
5361
5362
ASSERT_RACCT_ENABLED();
5363
sx_assert(&allprison_lock, SA_UNLOCKED);
5364
5365
if (pr->pr_prison_racct == NULL)
5366
return;
5367
prison_racct_free(pr->pr_prison_racct);
5368
pr->pr_prison_racct = NULL;
5369
}
5370
#endif /* RACCT */
5371
5372
/*
5373
* Submit a knote for a prison, locking if necessary.
5374
*/
5375
static void
5376
prison_knote(struct prison *pr, long hint)
5377
{
5378
int locked;
5379
5380
locked = mtx_owned(&pr->pr_mtx);
5381
if (!locked)
5382
mtx_lock(&pr->pr_mtx);
5383
KNOTE_LOCKED(pr->pr_klist, hint);
5384
jaildesc_knote(pr, hint);
5385
if (!locked)
5386
mtx_unlock(&pr->pr_mtx);
5387
}
5388
5389
#ifdef DDB
5390
5391
static void
5392
db_show_prison(struct prison *pr)
5393
{
5394
struct bool_flags *bf;
5395
struct jailsys_flags *jsf;
5396
#if defined(INET) || defined(INET6)
5397
int ii;
5398
struct prison_ip *pip;
5399
#endif
5400
unsigned f;
5401
#ifdef INET
5402
char ip4buf[INET_ADDRSTRLEN];
5403
#endif
5404
#ifdef INET6
5405
char ip6buf[INET6_ADDRSTRLEN];
5406
#endif
5407
5408
db_printf("prison %p:\n", pr);
5409
db_printf(" jid = %d\n", pr->pr_id);
5410
db_printf(" name = %s\n", pr->pr_name);
5411
db_printf(" parent = %p\n", pr->pr_parent);
5412
db_printf(" ref = %d\n", pr->pr_ref);
5413
db_printf(" uref = %d\n", pr->pr_uref);
5414
db_printf(" state = %s\n",
5415
pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
5416
pr->pr_state == PRISON_STATE_DYING ? "dying" :
5417
"invalid");
5418
db_printf(" path = %s\n", pr->pr_path);
5419
db_printf(" cpuset = %d\n", pr->pr_cpuset
5420
? pr->pr_cpuset->cs_id : -1);
5421
#ifdef VIMAGE
5422
db_printf(" vnet = %p\n", pr->pr_vnet);
5423
#endif
5424
db_printf(" root = %p\n", pr->pr_root);
5425
db_printf(" securelevel = %d\n", pr->pr_securelevel);
5426
db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum);
5427
db_printf(" children.max = %d\n", pr->pr_childmax);
5428
db_printf(" children.cur = %d\n", pr->pr_childcount);
5429
db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
5430
db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
5431
db_printf(" flags = 0x%x", pr->pr_flags);
5432
for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
5433
if (pr->pr_flags & bf->flag)
5434
db_printf(" %s", bf->name);
5435
for (jsf = pr_flag_jailsys;
5436
jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
5437
jsf++) {
5438
f = pr->pr_flags & (jsf->disable | jsf->new);
5439
db_printf(" %-16s= %s\n", jsf->name,
5440
(f != 0 && f == jsf->disable) ? "disable"
5441
: (f == jsf->new) ? "new"
5442
: "inherit");
5443
}
5444
db_printf(" allow = 0x%x", pr->pr_allow);
5445
for (bf = pr_flag_allow;
5446
bf < pr_flag_allow + nitems(pr_flag_allow) &&
5447
atomic_load_int(&bf->flag) != 0;
5448
bf++)
5449
if (pr->pr_allow & bf->flag)
5450
db_printf(" %s", bf->name);
5451
db_printf("\n");
5452
db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
5453
db_printf(" host.hostname = %s\n", pr->pr_hostname);
5454
db_printf(" host.domainname = %s\n", pr->pr_domainname);
5455
db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
5456
db_printf(" host.hostid = %lu\n", pr->pr_hostid);
5457
#ifdef INET
5458
if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
5459
db_printf(" ip4s = %d\n", pip->ips);
5460
for (ii = 0; ii < pip->ips; ii++)
5461
db_printf(" %s %s\n",
5462
ii == 0 ? "ip4.addr =" : " ",
5463
inet_ntoa_r(
5464
*(const struct in_addr *)PR_IP(pip, PR_INET, ii),
5465
ip4buf));
5466
}
5467
#endif
5468
#ifdef INET6
5469
if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
5470
db_printf(" ip6s = %d\n", pip->ips);
5471
for (ii = 0; ii < pip->ips; ii++)
5472
db_printf(" %s %s\n",
5473
ii == 0 ? "ip6.addr =" : " ",
5474
ip6_sprintf(ip6buf,
5475
(const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
5476
}
5477
#endif
5478
}
5479
5480
DB_SHOW_COMMAND(prison, db_show_prison_command)
5481
{
5482
struct prison *pr;
5483
5484
if (!have_addr) {
5485
/*
5486
* Show all prisons in the list, and prison0 which is not
5487
* listed.
5488
*/
5489
db_show_prison(&prison0);
5490
if (!db_pager_quit) {
5491
TAILQ_FOREACH(pr, &allprison, pr_list) {
5492
db_show_prison(pr);
5493
if (db_pager_quit)
5494
break;
5495
}
5496
}
5497
return;
5498
}
5499
5500
if (addr == 0)
5501
pr = &prison0;
5502
else {
5503
/* Look for a prison with the ID and with references. */
5504
TAILQ_FOREACH(pr, &allprison, pr_list)
5505
if (pr->pr_id == addr && pr->pr_ref > 0)
5506
break;
5507
if (pr == NULL)
5508
/* Look again, without requiring a reference. */
5509
TAILQ_FOREACH(pr, &allprison, pr_list)
5510
if (pr->pr_id == addr)
5511
break;
5512
if (pr == NULL)
5513
/* Assume address points to a valid prison. */
5514
pr = (struct prison *)addr;
5515
}
5516
db_show_prison(pr);
5517
}
5518
5519
#endif /* DDB */
5520
5521