Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/kern/kern_descrip.c
105670 views
1
/*-
2
* SPDX-License-Identifier: BSD-3-Clause
3
*
4
* Copyright (c) 1982, 1986, 1989, 1991, 1993
5
* The Regents of the University of California. All rights reserved.
6
* (c) UNIX System Laboratories, Inc.
7
* All or some portions of this file are derived from material licensed
8
* to the University of California by American Telephone and Telegraph
9
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
10
* the permission of UNIX System Laboratories, Inc.
11
*
12
* Redistribution and use in source and binary forms, with or without
13
* modification, are permitted provided that the following conditions
14
* are met:
15
* 1. Redistributions of source code must retain the above copyright
16
* notice, this list of conditions and the following disclaimer.
17
* 2. Redistributions in binary form must reproduce the above copyright
18
* notice, this list of conditions and the following disclaimer in the
19
* documentation and/or other materials provided with the distribution.
20
* 3. Neither the name of the University nor the names of its contributors
21
* may be used to endorse or promote products derived from this software
22
* without specific prior written permission.
23
*
24
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34
* SUCH DAMAGE.
35
*/
36
37
#include "opt_capsicum.h"
38
#include "opt_ddb.h"
39
#include "opt_ktrace.h"
40
41
#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC
42
#include <sys/systm.h>
43
#include <sys/capsicum.h>
44
#include <sys/conf.h>
45
#include <sys/exterrvar.h>
46
#include <sys/fcntl.h>
47
#include <sys/file.h>
48
#include <sys/filedesc.h>
49
#include <sys/filio.h>
50
#include <sys/jail.h>
51
#include <sys/kernel.h>
52
#include <sys/limits.h>
53
#include <sys/lock.h>
54
#include <sys/malloc.h>
55
#include <sys/mount.h>
56
#include <sys/mutex.h>
57
#include <sys/namei.h>
58
#include <sys/selinfo.h>
59
#include <sys/poll.h>
60
#include <sys/priv.h>
61
#include <sys/proc.h>
62
#include <sys/protosw.h>
63
#include <sys/racct.h>
64
#include <sys/resourcevar.h>
65
#include <sys/sbuf.h>
66
#include <sys/signalvar.h>
67
#include <sys/kdb.h>
68
#include <sys/smr.h>
69
#include <sys/stat.h>
70
#include <sys/sx.h>
71
#include <sys/syscallsubr.h>
72
#include <sys/sysctl.h>
73
#include <sys/sysproto.h>
74
#include <sys/unistd.h>
75
#include <sys/user.h>
76
#include <sys/vnode.h>
77
#include <sys/ktrace.h>
78
79
#include <net/vnet.h>
80
81
#include <security/audit/audit.h>
82
83
#include <vm/uma.h>
84
#include <vm/vm.h>
85
86
#include <ddb/ddb.h>
87
88
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
89
static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
90
static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
91
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
92
"file desc to leader structures");
93
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
94
MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
95
96
MALLOC_DECLARE(M_FADVISE);
97
98
static __read_mostly uma_zone_t file_zone;
99
static __read_mostly uma_zone_t filedesc0_zone;
100
__read_mostly uma_zone_t pwd_zone;
101
VFS_SMR_DECLARE;
102
103
static int closefp(struct filedesc *fdp, int fd, struct file *fp,
104
struct thread *td, bool holdleaders, bool audit);
105
static void export_file_to_kinfo(struct file *fp, int fd,
106
cap_rights_t *rightsp, struct kinfo_file *kif,
107
struct filedesc *fdp, int flags);
108
static int fd_first_free(struct filedesc *fdp, int low, int size);
109
static void fdgrowtable(struct filedesc *fdp, int nfd);
110
static void fdgrowtable_exp(struct filedesc *fdp, int nfd);
111
static void fdunused(struct filedesc *fdp, int fd);
112
static void fdused(struct filedesc *fdp, int fd);
113
static int fget_unlocked_seq(struct thread *td, int fd,
114
const cap_rights_t *needrightsp, uint8_t *flagsp,
115
struct file **fpp, seqc_t *seqp);
116
static int getmaxfd(struct thread *td);
117
static u_long *filecaps_copy_prep(const struct filecaps *src);
118
static void filecaps_copy_finish(const struct filecaps *src,
119
struct filecaps *dst, u_long *ioctls);
120
static u_long *filecaps_free_prep(struct filecaps *fcaps);
121
static void filecaps_free_finish(u_long *ioctls);
122
123
static struct pwd *pwd_alloc(void);
124
125
/*
126
* Each process has:
127
*
128
* - An array of open file descriptors (fd_ofiles)
129
* - An array of file flags (fd_ofileflags)
130
* - A bitmap recording which descriptors are in use (fd_map)
131
*
132
* A process starts out with NDFILE descriptors. The value of NDFILE has
133
* been selected based on the historical limit of 20 open files, and an
134
* assumption that the majority of processes, especially short-lived
135
* processes like shells, will never need more.
136
*
137
* If this initial allocation is exhausted, a larger descriptor table and
138
* map are allocated dynamically, and the pointers in the process's struct
139
* filedesc are updated to point to those. This is repeated every time
140
* the process runs out of file descriptors (provided it hasn't hit its
141
* resource limit).
142
*
143
* Since threads may hold references to individual descriptor table
144
* entries, the tables are never freed. Instead, they are placed on a
145
* linked list and freed only when the struct filedesc is released.
146
*/
147
/* Number of descriptors in the table embedded in struct filedesc0. */
#define	NDFILE		20
/* Size in bytes of one word of the in-use bitmap. */
#define	NDSLOTSIZE	(sizeof(NDSLOTTYPE))
/* Number of descriptors tracked by one bitmap word. */
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
/* Index of the bitmap word holding the bit for descriptor x. */
#define	NDSLOT(x)	((x) / NDENTRIES)
/* Mask selecting descriptor x's bit within its bitmap word. */
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
/* Number of bitmap words needed to track x descriptors. */
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
153
154
/*
 * Iterate over every descriptor slot of fdp, from 0 through the value
 * returned by fdlastfile_single(), that has a non-NULL file pointer.
 * _iterator receives the descriptor number; _fde (or _fp) receives the
 * table entry (or the file pointer).
 *
 * Both macros expand to two declarations (_fdp and _lastfile) followed by
 * a for/if pair, so they introduce those names into the enclosing scope
 * and the statement that follows becomes the body of the inner if.
 */
#define	FILEDESC_FOREACH_FDE(fdp, _iterator, _fde)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL)

#define	FILEDESC_FOREACH_FP(fdp, _iterator, _fp)			\
	struct filedesc *_fdp = (fdp);					\
	int _lastfile = fdlastfile_single(_fdp);			\
	for (_iterator = 0; _iterator <= _lastfile; _iterator++)	\
		if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL)
165
166
/*
167
* SLIST entry used to keep track of ofiles which must be reclaimed when
168
* the process exits.
169
*/
170
struct freetable {
171
struct fdescenttbl *ft_table;
172
SLIST_ENTRY(freetable) ft_next;
173
};
174
175
/*
176
* Initial allocation: a filedesc structure + the head of SLIST used to
177
* keep track of old ofiles + enough space for NDFILE descriptors.
178
*/
179
180
struct fdescenttbl0 {
181
int fdt_nfiles;
182
struct filedescent fdt_ofiles[NDFILE];
183
};
184
185
struct filedesc0 {
186
struct filedesc fd_fd;
187
SLIST_HEAD(, freetable) fd_free;
188
struct fdescenttbl0 fd_dfiles;
189
NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
190
};
191
192
/*
193
* Descriptor management.
194
*/
195
static int __exclusive_cache_line openfiles; /* actual number of open files */
196
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
197
198
/*
199
* If low >= size, just return low. Otherwise find the first zero bit in the
200
* given bitmap, starting at low and not exceeding size - 1. Return size if
201
* not found.
202
*/
203
static int
204
fd_first_free(struct filedesc *fdp, int low, int size)
205
{
206
NDSLOTTYPE *map = fdp->fd_map;
207
NDSLOTTYPE mask;
208
int off, maxoff;
209
210
if (low >= size)
211
return (low);
212
213
off = NDSLOT(low);
214
if (low % NDENTRIES) {
215
mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
216
if ((mask &= ~map[off]) != 0UL)
217
return (off * NDENTRIES + ffsl(mask) - 1);
218
++off;
219
}
220
for (maxoff = NDSLOTS(size); off < maxoff; ++off)
221
if (map[off] != ~0UL)
222
return (off * NDENTRIES + ffsl(~map[off]) - 1);
223
return (size);
224
}
225
226
/*
227
* Find the last used fd.
228
*
229
* Call this variant if fdp can't be modified by anyone else (e.g, during exec).
230
* Otherwise use fdlastfile.
231
*/
232
int
233
fdlastfile_single(struct filedesc *fdp)
234
{
235
NDSLOTTYPE *map = fdp->fd_map;
236
int off, minoff;
237
238
off = NDSLOT(fdp->fd_nfiles - 1);
239
for (minoff = NDSLOT(0); off >= minoff; --off)
240
if (map[off] != 0)
241
return (off * NDENTRIES + flsl(map[off]) - 1);
242
return (-1);
243
}
244
245
/*
 * Same result as fdlastfile_single(), but first asserts that the
 * filedesc lock is held.
 */
int
fdlastfile(struct filedesc *fdp)
{
	int last;

	FILEDESC_LOCK_ASSERT(fdp);
	last = fdlastfile_single(fdp);
	return (last);
}
252
253
static int
254
fdisused(struct filedesc *fdp, int fd)
255
{
256
257
KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
258
("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
259
260
return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
261
}
262
263
/*
264
* Mark a file descriptor as used.
265
*/
266
static void
267
fdused_init(struct filedesc *fdp, int fd)
268
{
269
270
KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
271
272
fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
273
}
274
275
static void
276
fdused(struct filedesc *fdp, int fd)
277
{
278
279
FILEDESC_XLOCK_ASSERT(fdp);
280
281
fdused_init(fdp, fd);
282
if (fd == fdp->fd_freefile)
283
fdp->fd_freefile++;
284
}
285
286
/*
287
* Mark a file descriptor as unused.
288
*/
289
static void
290
fdunused(struct filedesc *fdp, int fd)
291
{
292
293
FILEDESC_XLOCK_ASSERT(fdp);
294
295
KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
296
KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
297
("fd=%d is still in use", fd));
298
299
fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
300
if (fd < fdp->fd_freefile)
301
fdp->fd_freefile = fd;
302
}
303
304
/*
305
* Free a file descriptor.
306
*
307
* Avoid some work if fdp is about to be destroyed.
308
*/
309
static inline void
310
fdefree_last(struct filedescent *fde)
311
{
312
313
filecaps_free(&fde->fde_caps);
314
}
315
316
static inline void
317
fdfree(struct filedesc *fdp, int fd)
318
{
319
struct filedescent *fde;
320
321
FILEDESC_XLOCK_ASSERT(fdp);
322
fde = &fdp->fd_ofiles[fd];
323
#ifdef CAPABILITIES
324
seqc_write_begin(&fde->fde_seqc);
325
#endif
326
fde->fde_file = NULL;
327
#ifdef CAPABILITIES
328
seqc_write_end(&fde->fde_seqc);
329
#endif
330
fdefree_last(fde);
331
fdunused(fdp, fd);
332
}
333
334
/*
335
* System calls on descriptors.
336
*/
337
#ifndef _SYS_SYSPROTO_H_
338
struct getdtablesize_args {
339
int dummy;
340
};
341
#endif
342
/* ARGSUSED */
343
int
344
sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
345
{
346
#ifdef RACCT
347
uint64_t lim;
348
#endif
349
350
td->td_retval[0] = getmaxfd(td);
351
#ifdef RACCT
352
PROC_LOCK(td->td_proc);
353
lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
354
PROC_UNLOCK(td->td_proc);
355
if (lim < td->td_retval[0])
356
td->td_retval[0] = lim;
357
#endif
358
return (0);
359
}
360
361
/*
362
* Duplicate a file descriptor to a particular value.
363
*
364
* Note: keep in mind that a potential race condition exists when closing
365
* descriptors from a shared descriptor table (via rfork).
366
*/
367
#ifndef _SYS_SYSPROTO_H_
368
struct dup2_args {
369
u_int from;
370
u_int to;
371
};
372
#endif
373
/* ARGSUSED */
374
int
375
sys_dup2(struct thread *td, struct dup2_args *uap)
376
{
377
378
return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
379
}
380
381
/*
382
* Duplicate a file descriptor.
383
*/
384
#ifndef _SYS_SYSPROTO_H_
385
struct dup_args {
386
u_int fd;
387
};
388
#endif
389
/* ARGSUSED */
390
int
391
sys_dup(struct thread *td, struct dup_args *uap)
392
{
393
394
return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
395
}
396
397
/*
 * The file control system call.  Unpacks the syscall arguments and hands
 * them to kern_fcntl_freebsd(), whose result is returned unchanged.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
	int error;

	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
	return (error);
}
414
415
int
416
kern_fcntl_freebsd(struct thread *td, int fd, int cmd, intptr_t arg)
417
{
418
struct flock fl;
419
struct __oflock ofl;
420
intptr_t arg1;
421
int error, newcmd;
422
423
error = 0;
424
newcmd = cmd;
425
switch (cmd) {
426
case F_OGETLK:
427
case F_OSETLK:
428
case F_OSETLKW:
429
/*
430
* Convert old flock structure to new.
431
*/
432
error = copyin((void *)arg, &ofl, sizeof(ofl));
433
fl.l_start = ofl.l_start;
434
fl.l_len = ofl.l_len;
435
fl.l_pid = ofl.l_pid;
436
fl.l_type = ofl.l_type;
437
fl.l_whence = ofl.l_whence;
438
fl.l_sysid = 0;
439
440
switch (cmd) {
441
case F_OGETLK:
442
newcmd = F_GETLK;
443
break;
444
case F_OSETLK:
445
newcmd = F_SETLK;
446
break;
447
case F_OSETLKW:
448
newcmd = F_SETLKW;
449
break;
450
}
451
arg1 = (intptr_t)&fl;
452
break;
453
case F_GETLK:
454
case F_SETLK:
455
case F_SETLKW:
456
case F_SETLK_REMOTE:
457
error = copyin((void *)arg, &fl, sizeof(fl));
458
arg1 = (intptr_t)&fl;
459
break;
460
default:
461
arg1 = arg;
462
break;
463
}
464
if (error)
465
return (error);
466
error = kern_fcntl(td, fd, newcmd, arg1);
467
if (error)
468
return (error);
469
if (cmd == F_OGETLK) {
470
ofl.l_start = fl.l_start;
471
ofl.l_len = fl.l_len;
472
ofl.l_pid = fl.l_pid;
473
ofl.l_type = fl.l_type;
474
ofl.l_whence = fl.l_whence;
475
error = copyout(&ofl, (void *)arg, sizeof(ofl));
476
} else if (cmd == F_GETLK) {
477
error = copyout(&fl, (void *)arg, sizeof(fl));
478
}
479
return (error);
480
}
481
482
struct flags_trans_elem {
483
u_int f;
484
u_int t;
485
};
486
487
static u_int
488
flags_trans(const struct flags_trans_elem *ftes, int nitems, u_int from_flags)
489
{
490
u_int res;
491
int i;
492
493
res = 0;
494
for (i = 0; i < nitems; i++) {
495
if ((from_flags & ftes[i].f) != 0)
496
res |= ftes[i].t;
497
}
498
return (res);
499
}
500
501
static uint8_t
502
fd_to_fde_flags(int fd_flags)
503
{
504
static const struct flags_trans_elem fd_to_fde_flags_s[] = {
505
{ .f = FD_CLOEXEC, .t = UF_EXCLOSE },
506
{ .f = FD_CLOFORK, .t = UF_FOCLOSE },
507
{ .f = FD_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
508
};
509
510
return (flags_trans(fd_to_fde_flags_s, nitems(fd_to_fde_flags_s),
511
fd_flags));
512
}
513
514
static int
515
fde_to_fd_flags(uint8_t fde_flags)
516
{
517
static const struct flags_trans_elem fde_to_fd_flags_s[] = {
518
{ .f = UF_EXCLOSE, .t = FD_CLOEXEC },
519
{ .f = UF_FOCLOSE, .t = FD_CLOFORK },
520
{ .f = UF_RESOLVE_BENEATH, .t = FD_RESOLVE_BENEATH },
521
};
522
523
return (flags_trans(fde_to_fd_flags_s, nitems(fde_to_fd_flags_s),
524
fde_flags));
525
}
526
527
static uint8_t
528
fddup_to_fde_flags(int fddup_flags)
529
{
530
static const struct flags_trans_elem fddup_to_fde_flags_s[] = {
531
{ .f = FDDUP_FLAG_CLOEXEC, .t = UF_EXCLOSE },
532
{ .f = FDDUP_FLAG_CLOFORK, .t = UF_FOCLOSE },
533
};
534
535
return (flags_trans(fddup_to_fde_flags_s, nitems(fddup_to_fde_flags_s),
536
fddup_flags));
537
}
538
539
static uint8_t
540
close_range_to_fde_flags(int close_range_flags)
541
{
542
static const struct flags_trans_elem close_range_to_fde_flags_s[] = {
543
{ .f = CLOSE_RANGE_CLOEXEC, .t = UF_EXCLOSE },
544
{ .f = CLOSE_RANGE_CLOFORK, .t = UF_FOCLOSE },
545
};
546
547
return (flags_trans(close_range_to_fde_flags_s,
548
nitems(close_range_to_fde_flags_s), close_range_flags));
549
}
550
551
static uint8_t
552
open_to_fde_flags(int open_flags, bool sticky_orb)
553
{
554
static const struct flags_trans_elem open_to_fde_flags_s[] = {
555
{ .f = O_CLOEXEC, .t = UF_EXCLOSE },
556
{ .f = O_CLOFORK, .t = UF_FOCLOSE },
557
{ .f = O_RESOLVE_BENEATH, .t = UF_RESOLVE_BENEATH },
558
};
559
#if defined(__clang__) && __clang_major__ >= 19
560
_Static_assert(open_to_fde_flags_s[nitems(open_to_fde_flags_s) - 1].f ==
561
O_RESOLVE_BENEATH, "O_RESOLVE_BENEATH must be last, for sticky_orb");
562
#endif
563
564
return (flags_trans(open_to_fde_flags_s, nitems(open_to_fde_flags_s) -
565
(sticky_orb ? 0 : 1), open_flags));
566
}
567
568
int
569
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
570
{
571
struct filedesc *fdp;
572
struct flock *flp;
573
struct file *fp, *fp2;
574
struct filedescent *fde;
575
struct proc *p;
576
struct vnode *vp;
577
struct mount *mp;
578
struct kinfo_file *kif;
579
int error, flg, kif_sz, seals, tmp, got_set, got_cleared;
580
uint64_t bsize;
581
off_t foffset;
582
int flags;
583
584
error = 0;
585
flg = F_POSIX;
586
p = td->td_proc;
587
fdp = p->p_fd;
588
589
AUDIT_ARG_FD(cmd);
590
AUDIT_ARG_CMD(cmd);
591
switch (cmd) {
592
case F_DUPFD:
593
tmp = arg;
594
error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
595
break;
596
597
case F_DUPFD_CLOEXEC:
598
tmp = arg;
599
error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
600
break;
601
602
case F_DUPFD_CLOFORK:
603
tmp = arg;
604
error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOFORK, fd, tmp);
605
break;
606
607
case F_DUP2FD:
608
tmp = arg;
609
error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
610
break;
611
612
case F_DUP2FD_CLOEXEC:
613
tmp = arg;
614
error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
615
break;
616
617
case F_GETFD:
618
error = EBADF;
619
FILEDESC_SLOCK(fdp);
620
fde = fdeget_noref(fdp, fd);
621
if (fde != NULL) {
622
td->td_retval[0] = fde_to_fd_flags(fde->fde_flags);
623
error = 0;
624
}
625
FILEDESC_SUNLOCK(fdp);
626
break;
627
628
case F_SETFD:
629
error = EBADF;
630
FILEDESC_XLOCK(fdp);
631
fde = fdeget_noref(fdp, fd);
632
if (fde != NULL) {
633
/*
634
* UF_RESOLVE_BENEATH is sticky and cannot be cleared.
635
*/
636
fde->fde_flags = (fde->fde_flags &
637
~(UF_EXCLOSE | UF_FOCLOSE)) | fd_to_fde_flags(arg);
638
error = 0;
639
}
640
FILEDESC_XUNLOCK(fdp);
641
break;
642
643
case F_GETFL:
644
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
645
if (error != 0)
646
break;
647
td->td_retval[0] = OFLAGS(fp->f_flag);
648
fdrop(fp, td);
649
break;
650
651
case F_SETFL:
652
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
653
if (error != 0)
654
break;
655
if (fp->f_ops == &path_fileops) {
656
fdrop(fp, td);
657
error = EBADF;
658
break;
659
}
660
fsetfl_lock(fp);
661
do {
662
tmp = flg = fp->f_flag;
663
tmp &= ~FCNTLFLAGS;
664
tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
665
} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
666
got_set = tmp & ~flg;
667
got_cleared = flg & ~tmp;
668
if (((got_set | got_cleared) & FNONBLOCK) != 0) {
669
tmp = fp->f_flag & FNONBLOCK;
670
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
671
if (error != 0)
672
goto revert_flags;
673
}
674
if (((got_set | got_cleared) & FASYNC) != 0) {
675
tmp = fp->f_flag & FASYNC;
676
error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
677
if (error != 0)
678
goto revert_nonblock;
679
}
680
fsetfl_unlock(fp);
681
fdrop(fp, td);
682
break;
683
revert_nonblock:
684
if (((got_set | got_cleared) & FNONBLOCK) != 0) {
685
tmp = ~fp->f_flag & FNONBLOCK;
686
(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
687
}
688
revert_flags:
689
do {
690
tmp = flg = fp->f_flag;
691
tmp &= ~FCNTLFLAGS;
692
tmp |= got_cleared;
693
tmp &= ~got_set;
694
} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
695
fsetfl_unlock(fp);
696
fdrop(fp, td);
697
break;
698
699
case F_GETOWN:
700
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
701
if (error != 0)
702
break;
703
error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
704
if (error == 0)
705
td->td_retval[0] = tmp;
706
fdrop(fp, td);
707
break;
708
709
case F_SETOWN:
710
error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
711
if (error != 0)
712
break;
713
tmp = arg;
714
error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
715
fdrop(fp, td);
716
break;
717
718
case F_SETLK_REMOTE:
719
error = priv_check(td, PRIV_NFS_LOCKD);
720
if (error != 0)
721
return (error);
722
flg = F_REMOTE;
723
goto do_setlk;
724
725
case F_SETLKW:
726
flg |= F_WAIT;
727
/* FALLTHROUGH F_SETLK */
728
729
case F_SETLK:
730
do_setlk:
731
flp = (struct flock *)arg;
732
if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
733
error = EINVAL;
734
break;
735
}
736
737
error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
738
if (error != 0)
739
break;
740
if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
741
error = EBADF;
742
fdrop(fp, td);
743
break;
744
}
745
746
if (flp->l_whence == SEEK_CUR) {
747
foffset = foffset_get(fp);
748
if (foffset < 0 ||
749
(flp->l_start > 0 &&
750
foffset > OFF_MAX - flp->l_start)) {
751
error = EOVERFLOW;
752
fdrop(fp, td);
753
break;
754
}
755
flp->l_start += foffset;
756
}
757
758
vp = fp->f_vnode;
759
switch (flp->l_type) {
760
case F_RDLCK:
761
if ((fp->f_flag & FREAD) == 0) {
762
error = EBADF;
763
break;
764
}
765
if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
766
PROC_LOCK(p->p_leader);
767
p->p_leader->p_flag |= P_ADVLOCK;
768
PROC_UNLOCK(p->p_leader);
769
}
770
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
771
flp, flg);
772
break;
773
case F_WRLCK:
774
if ((fp->f_flag & FWRITE) == 0) {
775
error = EBADF;
776
break;
777
}
778
if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
779
PROC_LOCK(p->p_leader);
780
p->p_leader->p_flag |= P_ADVLOCK;
781
PROC_UNLOCK(p->p_leader);
782
}
783
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
784
flp, flg);
785
break;
786
case F_UNLCK:
787
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
788
flp, flg);
789
break;
790
case F_UNLCKSYS:
791
if (flg != F_REMOTE) {
792
error = EINVAL;
793
break;
794
}
795
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
796
F_UNLCKSYS, flp, flg);
797
break;
798
default:
799
error = EINVAL;
800
break;
801
}
802
if (error != 0 || flp->l_type == F_UNLCK ||
803
flp->l_type == F_UNLCKSYS) {
804
fdrop(fp, td);
805
break;
806
}
807
808
/*
809
* Check for a race with close.
810
*
811
* The vnode is now advisory locked (or unlocked, but this case
812
* is not really important) as the caller requested.
813
* We had to drop the filedesc lock, so we need to recheck if
814
* the descriptor is still valid, because if it was closed
815
* in the meantime we need to remove advisory lock from the
816
* vnode - close on any descriptor leading to an advisory
817
* locked vnode, removes that lock.
818
* We will return 0 on purpose in that case, as the result of
819
* successful advisory lock might have been externally visible
820
* already. This is fine - effectively we pretend to the caller
821
* that the closing thread was a bit slower and that the
822
* advisory lock succeeded before the close.
823
*/
824
error = fget_unlocked(td, fd, &cap_no_rights, &fp2);
825
if (error != 0) {
826
fdrop(fp, td);
827
break;
828
}
829
if (fp != fp2) {
830
flp->l_whence = SEEK_SET;
831
flp->l_start = 0;
832
flp->l_len = 0;
833
flp->l_type = F_UNLCK;
834
(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
835
F_UNLCK, flp, F_POSIX);
836
}
837
fdrop(fp, td);
838
fdrop(fp2, td);
839
break;
840
841
case F_GETLK:
842
error = fget_unlocked(td, fd, &cap_flock_rights, &fp);
843
if (error != 0)
844
break;
845
if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
846
error = EBADF;
847
fdrop(fp, td);
848
break;
849
}
850
flp = (struct flock *)arg;
851
if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
852
flp->l_type != F_UNLCK) {
853
error = EINVAL;
854
fdrop(fp, td);
855
break;
856
}
857
if (flp->l_whence == SEEK_CUR) {
858
foffset = foffset_get(fp);
859
if ((flp->l_start > 0 &&
860
foffset > OFF_MAX - flp->l_start) ||
861
(flp->l_start < 0 &&
862
foffset < OFF_MIN - flp->l_start)) {
863
error = EOVERFLOW;
864
fdrop(fp, td);
865
break;
866
}
867
flp->l_start += foffset;
868
}
869
vp = fp->f_vnode;
870
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
871
F_POSIX);
872
fdrop(fp, td);
873
break;
874
875
case F_ADD_SEALS:
876
error = fget_unlocked(td, fd, &cap_no_rights, &fp);
877
if (error != 0)
878
break;
879
error = fo_add_seals(fp, arg);
880
fdrop(fp, td);
881
break;
882
883
case F_GET_SEALS:
884
error = fget_unlocked(td, fd, &cap_no_rights, &fp);
885
if (error != 0)
886
break;
887
if (fo_get_seals(fp, &seals) == 0)
888
td->td_retval[0] = seals;
889
else
890
error = EINVAL;
891
fdrop(fp, td);
892
break;
893
894
case F_RDAHEAD:
895
arg = arg ? 128 * 1024: 0;
896
/* FALLTHROUGH */
897
case F_READAHEAD:
898
error = fget_unlocked(td, fd, &cap_no_rights, &fp);
899
if (error != 0)
900
break;
901
if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
902
fdrop(fp, td);
903
error = EBADF;
904
break;
905
}
906
vp = fp->f_vnode;
907
if (vp->v_type != VREG) {
908
fdrop(fp, td);
909
error = ENOTTY;
910
break;
911
}
912
913
/*
914
* Exclusive lock synchronizes against f_seqcount reads and
915
* writes in sequential_heuristic().
916
*/
917
error = vn_lock(vp, LK_EXCLUSIVE);
918
if (error != 0) {
919
fdrop(fp, td);
920
break;
921
}
922
if (arg >= 0) {
923
bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
924
arg = MIN(arg, INT_MAX - bsize + 1);
925
fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
926
(arg + bsize - 1) / bsize);
927
atomic_set_int(&fp->f_flag, FRDAHEAD);
928
} else {
929
atomic_clear_int(&fp->f_flag, FRDAHEAD);
930
}
931
VOP_UNLOCK(vp);
932
fdrop(fp, td);
933
break;
934
935
case F_ISUNIONSTACK:
936
/*
937
* Check if the vnode is part of a union stack (either the
938
* "union" flag from mount(2) or unionfs).
939
*
940
* Prior to introduction of this op libc's readdir would call
941
* fstatfs(2), in effect unnecessarily copying kilobytes of
942
* data just to check fs name and a mount flag.
943
*
944
* Fixing the code to handle everything in the kernel instead
945
* is a non-trivial endeavor and has low priority, thus this
946
* horrible kludge facilitates the current behavior in a much
947
* cheaper manner until someone(tm) sorts this out.
948
*/
949
error = fget_unlocked(td, fd, &cap_no_rights, &fp);
950
if (error != 0)
951
break;
952
if (fp->f_type != DTYPE_VNODE) {
953
fdrop(fp, td);
954
error = EBADF;
955
break;
956
}
957
vp = fp->f_vnode;
958
/*
959
* Since we don't prevent dooming the vnode even non-null mp
960
* found can become immediately stale. This is tolerable since
961
* mount points are type-stable (providing safe memory access)
962
* and any vfs op on this vnode going forward will return an
963
* error (meaning return value in this case is meaningless).
964
*/
965
mp = atomic_load_ptr(&vp->v_mount);
966
if (__predict_false(mp == NULL)) {
967
fdrop(fp, td);
968
error = EBADF;
969
break;
970
}
971
td->td_retval[0] = 0;
972
if (mp->mnt_kern_flag & MNTK_UNIONFS ||
973
mp->mnt_flag & MNT_UNION)
974
td->td_retval[0] = 1;
975
fdrop(fp, td);
976
break;
977
978
case F_KINFO:
979
#ifdef CAPABILITY_MODE
980
if (CAP_TRACING(td))
981
ktrcapfail(CAPFAIL_SYSCALL, &cmd);
982
if (IN_CAPABILITY_MODE(td)) {
983
error = ECAPMODE;
984
break;
985
}
986
#endif
987
error = copyin((void *)arg, &kif_sz, sizeof(kif_sz));
988
if (error != 0)
989
break;
990
if (kif_sz != sizeof(*kif)) {
991
error = EINVAL;
992
break;
993
}
994
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO);
995
FILEDESC_SLOCK(fdp);
996
error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL);
997
if (error == 0 && fhold(fp)) {
998
export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0);
999
FILEDESC_SUNLOCK(fdp);
1000
fdrop(fp, td);
1001
if ((kif->kf_status & KF_ATTR_VALID) != 0) {
1002
kif->kf_structsize = sizeof(*kif);
1003
error = copyout(kif, (void *)arg, sizeof(*kif));
1004
} else {
1005
error = EBADF;
1006
}
1007
} else {
1008
FILEDESC_SUNLOCK(fdp);
1009
if (error == 0)
1010
error = EBADF;
1011
}
1012
free(kif, M_TEMP);
1013
break;
1014
1015
default:
1016
if ((cmd & ((1u << F_DUP3FD_SHIFT) - 1)) != F_DUP3FD)
1017
return (EXTERROR(EINVAL, "invalid fcntl cmd"));
1018
/* Handle F_DUP3FD */
1019
flags = (cmd >> F_DUP3FD_SHIFT);
1020
if ((flags & ~(FD_CLOEXEC | FD_CLOFORK)) != 0)
1021
return (EXTERROR(EINVAL, "invalid flags for F_DUP3FD"));
1022
tmp = arg;
1023
error = kern_dup(td, FDDUP_FIXED,
1024
((flags & FD_CLOEXEC) != 0 ? FDDUP_FLAG_CLOEXEC : 0) |
1025
((flags & FD_CLOFORK) != 0 ? FDDUP_FLAG_CLOFORK : 0),
1026
fd, tmp);
1027
break;
1028
}
1029
return (error);
1030
}
1031
1032
static int
1033
getmaxfd(struct thread *td)
1034
{
1035
1036
return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
1037
}
1038
1039
/*
1040
* Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
1041
*/
1042
int
1043
kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
1044
{
1045
struct filedesc *fdp;
1046
struct filedescent *oldfde, *newfde;
1047
struct proc *p;
1048
struct file *delfp, *oldfp;
1049
u_long *oioctls, *nioctls;
1050
int error, maxfd;
1051
1052
p = td->td_proc;
1053
fdp = p->p_fd;
1054
oioctls = NULL;
1055
1056
MPASS((flags & ~(FDDUP_FLAG_CLOEXEC | FDDUP_FLAG_CLOFORK)) == 0);
1057
MPASS(mode < FDDUP_LASTMODE);
1058
1059
AUDIT_ARG_FD(old);
1060
/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
1061
1062
/*
1063
* Verify we have a valid descriptor to dup from and possibly to
1064
* dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
1065
* return EINVAL when the new descriptor is out of bounds.
1066
*/
1067
if (old < 0)
1068
return (EBADF);
1069
if (new < 0)
1070
return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
1071
maxfd = getmaxfd(td);
1072
if (new >= maxfd)
1073
return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
1074
1075
error = EBADF;
1076
FILEDESC_XLOCK(fdp);
1077
if (fget_noref(fdp, old) == NULL)
1078
goto unlock;
1079
if (mode == FDDUP_FIXED && old == new) {
1080
td->td_retval[0] = new;
1081
fdp->fd_ofiles[new].fde_flags |= fddup_to_fde_flags(flags);
1082
error = 0;
1083
goto unlock;
1084
}
1085
1086
oldfde = &fdp->fd_ofiles[old];
1087
oldfp = oldfde->fde_file;
1088
if (!fhold(oldfp))
1089
goto unlock;
1090
1091
/*
1092
* If the caller specified a file descriptor, make sure the file
1093
* table is large enough to hold it, and grab it. Otherwise, just
1094
* allocate a new descriptor the usual way.
1095
*/
1096
switch (mode) {
1097
case FDDUP_NORMAL:
1098
case FDDUP_FCNTL:
1099
if ((error = fdalloc(td, new, &new)) != 0) {
1100
fdrop(oldfp, td);
1101
goto unlock;
1102
}
1103
break;
1104
case FDDUP_FIXED:
1105
if (new >= fdp->fd_nfiles) {
1106
/*
1107
* The resource limits are here instead of e.g.
1108
* fdalloc(), because the file descriptor table may be
1109
* shared between processes, so we can't really use
1110
* racct_add()/racct_sub(). Instead of counting the
1111
* number of actually allocated descriptors, just put
1112
* the limit on the size of the file descriptor table.
1113
*/
1114
#ifdef RACCT
1115
if (RACCT_ENABLED()) {
1116
error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
1117
if (error != 0) {
1118
error = EMFILE;
1119
fdrop(oldfp, td);
1120
goto unlock;
1121
}
1122
}
1123
#endif
1124
fdgrowtable_exp(fdp, new + 1);
1125
}
1126
if (!fdisused(fdp, new))
1127
fdused(fdp, new);
1128
break;
1129
default:
1130
KASSERT(0, ("%s unsupported mode %d", __func__, mode));
1131
}
1132
1133
KASSERT(old != new, ("new fd is same as old"));
1134
1135
/* Refetch oldfde because the table may have grown and old one freed. */
1136
oldfde = &fdp->fd_ofiles[old];
1137
KASSERT(oldfp == oldfde->fde_file,
1138
("fdt_ofiles shift from growth observed at fd %d",
1139
old));
1140
1141
newfde = &fdp->fd_ofiles[new];
1142
delfp = newfde->fde_file;
1143
1144
nioctls = filecaps_copy_prep(&oldfde->fde_caps);
1145
1146
/*
1147
* Duplicate the source descriptor.
1148
*/
1149
#ifdef CAPABILITIES
1150
seqc_write_begin(&newfde->fde_seqc);
1151
#endif
1152
oioctls = filecaps_free_prep(&newfde->fde_caps);
1153
fde_copy(oldfde, newfde);
1154
filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
1155
nioctls);
1156
newfde->fde_flags = (oldfde->fde_flags & ~(UF_EXCLOSE | UF_FOCLOSE)) |
1157
fddup_to_fde_flags(flags);
1158
#ifdef CAPABILITIES
1159
seqc_write_end(&newfde->fde_seqc);
1160
#endif
1161
td->td_retval[0] = new;
1162
1163
error = 0;
1164
1165
if (delfp != NULL) {
1166
(void) closefp(fdp, new, delfp, td, true, false);
1167
FILEDESC_UNLOCK_ASSERT(fdp);
1168
} else {
1169
unlock:
1170
FILEDESC_XUNLOCK(fdp);
1171
}
1172
1173
filecaps_free_finish(oioctls);
1174
return (error);
1175
}
1176
1177
static void
1178
sigiofree(struct sigio *sigio)
1179
{
1180
crfree(sigio->sio_ucred);
1181
free(sigio, M_SIGIO);
1182
}
1183
1184
static struct sigio *
1185
funsetown_locked(struct sigio *sigio)
1186
{
1187
struct proc *p;
1188
struct pgrp *pg;
1189
1190
SIGIO_ASSERT_LOCKED();
1191
1192
if (sigio == NULL)
1193
return (NULL);
1194
*sigio->sio_myref = NULL;
1195
if (sigio->sio_pgid < 0) {
1196
pg = sigio->sio_pgrp;
1197
PGRP_LOCK(pg);
1198
SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
1199
PGRP_UNLOCK(pg);
1200
} else {
1201
p = sigio->sio_proc;
1202
PROC_LOCK(p);
1203
SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
1204
PROC_UNLOCK(p);
1205
}
1206
return (sigio);
1207
}
1208
1209
/*
1210
* If sigio is on the list associated with a process or process group,
1211
* disable signalling from the device, remove sigio from the list and
1212
* free sigio.
1213
*/
1214
void
1215
funsetown(struct sigio **sigiop)
1216
{
1217
struct sigio *sigio;
1218
1219
/* Racy check, consumers must provide synchronization. */
1220
if (*sigiop == NULL)
1221
return;
1222
1223
SIGIO_LOCK();
1224
sigio = funsetown_locked(*sigiop);
1225
SIGIO_UNLOCK();
1226
if (sigio != NULL)
1227
sigiofree(sigio);
1228
}
1229
1230
/*
1231
* Free a list of sigio structures. The caller must ensure that new sigio
1232
* structures cannot be added after this point. For process groups this is
1233
* guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
1234
* as an interlock.
1235
*/
1236
void
1237
funsetownlst(struct sigiolst *sigiolst)
1238
{
1239
struct proc *p;
1240
struct pgrp *pg;
1241
struct sigio *sigio, *tmp;
1242
1243
/* Racy check. */
1244
sigio = SLIST_FIRST(sigiolst);
1245
if (sigio == NULL)
1246
return;
1247
1248
p = NULL;
1249
pg = NULL;
1250
1251
SIGIO_LOCK();
1252
sigio = SLIST_FIRST(sigiolst);
1253
if (sigio == NULL) {
1254
SIGIO_UNLOCK();
1255
return;
1256
}
1257
1258
/*
1259
* Every entry of the list should belong to a single proc or pgrp.
1260
*/
1261
if (sigio->sio_pgid < 0) {
1262
pg = sigio->sio_pgrp;
1263
sx_assert(&proctree_lock, SX_XLOCKED);
1264
PGRP_LOCK(pg);
1265
} else /* if (sigio->sio_pgid > 0) */ {
1266
p = sigio->sio_proc;
1267
PROC_LOCK(p);
1268
KASSERT((p->p_flag & P_WEXIT) != 0,
1269
("%s: process %p is not exiting", __func__, p));
1270
}
1271
1272
SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
1273
*sigio->sio_myref = NULL;
1274
if (pg != NULL) {
1275
KASSERT(sigio->sio_pgid < 0,
1276
("Proc sigio in pgrp sigio list"));
1277
KASSERT(sigio->sio_pgrp == pg,
1278
("Bogus pgrp in sigio list"));
1279
} else /* if (p != NULL) */ {
1280
KASSERT(sigio->sio_pgid > 0,
1281
("Pgrp sigio in proc sigio list"));
1282
KASSERT(sigio->sio_proc == p,
1283
("Bogus proc in sigio list"));
1284
}
1285
}
1286
1287
if (pg != NULL)
1288
PGRP_UNLOCK(pg);
1289
else
1290
PROC_UNLOCK(p);
1291
SIGIO_UNLOCK();
1292
1293
SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
1294
sigiofree(sigio);
1295
}
1296
1297
/*
1298
* This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
1299
*
1300
* After permission checking, add a sigio structure to the sigio list for
1301
* the process or process group.
1302
*/
1303
int
1304
fsetown(pid_t pgid, struct sigio **sigiop)
1305
{
1306
struct proc *proc;
1307
struct pgrp *pgrp;
1308
struct sigio *osigio, *sigio;
1309
int ret;
1310
1311
if (pgid == 0) {
1312
funsetown(sigiop);
1313
return (0);
1314
}
1315
1316
sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
1317
sigio->sio_pgid = pgid;
1318
sigio->sio_ucred = crhold(curthread->td_ucred);
1319
sigio->sio_myref = sigiop;
1320
1321
ret = 0;
1322
if (pgid > 0) {
1323
ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
1324
SIGIO_LOCK();
1325
osigio = funsetown_locked(*sigiop);
1326
if (ret == 0) {
1327
PROC_LOCK(proc);
1328
_PRELE(proc);
1329
if ((proc->p_flag & P_WEXIT) != 0) {
1330
ret = ESRCH;
1331
} else if (proc->p_session !=
1332
curthread->td_proc->p_session) {
1333
/*
1334
* Policy - Don't allow a process to FSETOWN a
1335
* process in another session.
1336
*
1337
* Remove this test to allow maximum flexibility
1338
* or restrict FSETOWN to the current process or
1339
* process group for maximum safety.
1340
*/
1341
ret = EPERM;
1342
} else {
1343
sigio->sio_proc = proc;
1344
SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
1345
sio_pgsigio);
1346
}
1347
PROC_UNLOCK(proc);
1348
}
1349
} else /* if (pgid < 0) */ {
1350
sx_slock(&proctree_lock);
1351
SIGIO_LOCK();
1352
osigio = funsetown_locked(*sigiop);
1353
pgrp = pgfind(-pgid);
1354
if (pgrp == NULL) {
1355
ret = ESRCH;
1356
} else {
1357
if (pgrp->pg_session != curthread->td_proc->p_session) {
1358
/*
1359
* Policy - Don't allow a process to FSETOWN a
1360
* process in another session.
1361
*
1362
* Remove this test to allow maximum flexibility
1363
* or restrict FSETOWN to the current process or
1364
* process group for maximum safety.
1365
*/
1366
ret = EPERM;
1367
} else {
1368
sigio->sio_pgrp = pgrp;
1369
SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
1370
sio_pgsigio);
1371
}
1372
PGRP_UNLOCK(pgrp);
1373
}
1374
sx_sunlock(&proctree_lock);
1375
}
1376
if (ret == 0)
1377
*sigiop = sigio;
1378
SIGIO_UNLOCK();
1379
if (osigio != NULL)
1380
sigiofree(osigio);
1381
return (ret);
1382
}
1383
1384
/*
1385
* This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1386
*/
1387
pid_t
1388
fgetown(struct sigio **sigiop)
1389
{
1390
pid_t pgid;
1391
1392
SIGIO_LOCK();
1393
pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1394
SIGIO_UNLOCK();
1395
return (pgid);
1396
}
1397
1398
static int
1399
closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1400
bool audit)
1401
{
1402
int error;
1403
1404
FILEDESC_XLOCK_ASSERT(fdp);
1405
1406
/*
1407
* We now hold the fp reference that used to be owned by the
1408
* descriptor array. We have to unlock the FILEDESC *AFTER*
1409
* knote_fdclose to prevent a race of the fd getting opened, a knote
1410
* added, and deleteing a knote for the new fd.
1411
*/
1412
if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
1413
knote_fdclose(td, fd);
1414
1415
if (fp->f_ops->fo_fdclose != NULL)
1416
fp->f_ops->fo_fdclose(fp, fd, td);
1417
FILEDESC_XUNLOCK(fdp);
1418
1419
#ifdef AUDIT
1420
if (AUDITING_TD(td) && audit)
1421
audit_sysclose(td, fd, fp);
1422
#endif
1423
error = closef(fp, td);
1424
1425
/*
1426
* All paths leading up to closefp() will have already removed or
1427
* replaced the fd in the filedesc table, so a restart would not
1428
* operate on the same file.
1429
*/
1430
if (error == ERESTART)
1431
error = EINTR;
1432
1433
return (error);
1434
}
1435
1436
static int
1437
closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1438
bool holdleaders, bool audit)
1439
{
1440
int error;
1441
1442
FILEDESC_XLOCK_ASSERT(fdp);
1443
1444
if (holdleaders) {
1445
if (td->td_proc->p_fdtol != NULL) {
1446
/*
1447
* Ask fdfree() to sleep to ensure that all relevant
1448
* process leaders can be traversed in closef().
1449
*/
1450
fdp->fd_holdleaderscount++;
1451
} else {
1452
holdleaders = false;
1453
}
1454
}
1455
1456
error = closefp_impl(fdp, fd, fp, td, audit);
1457
if (holdleaders) {
1458
FILEDESC_XLOCK(fdp);
1459
fdp->fd_holdleaderscount--;
1460
if (fdp->fd_holdleaderscount == 0 &&
1461
fdp->fd_holdleaderswakeup != 0) {
1462
fdp->fd_holdleaderswakeup = 0;
1463
wakeup(&fdp->fd_holdleaderscount);
1464
}
1465
FILEDESC_XUNLOCK(fdp);
1466
}
1467
return (error);
1468
}
1469
1470
static int
1471
closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1472
bool holdleaders, bool audit)
1473
{
1474
1475
FILEDESC_XLOCK_ASSERT(fdp);
1476
1477
if (__predict_false(td->td_proc->p_fdtol != NULL)) {
1478
return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
1479
} else {
1480
return (closefp_impl(fdp, fd, fp, td, audit));
1481
}
1482
}
1483
1484
/*
 * close(2) system call entry point: close the descriptor named by uap->fd.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int	fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{
	int fd;

	fd = uap->fd;
	return (kern_close(td, fd));
}
1499
1500
int
1501
kern_close(struct thread *td, int fd)
1502
{
1503
struct filedesc *fdp;
1504
struct file *fp;
1505
1506
fdp = td->td_proc->p_fd;
1507
1508
FILEDESC_XLOCK(fdp);
1509
if ((fp = fget_noref(fdp, fd)) == NULL) {
1510
FILEDESC_XUNLOCK(fdp);
1511
return (EBADF);
1512
}
1513
fdfree(fdp, fd);
1514
1515
/* closefp() drops the FILEDESC lock for us. */
1516
return (closefp(fdp, fd, fp, td, true, true));
1517
}
1518
1519
static int
1520
close_range_flags(struct thread *td, u_int lowfd, u_int highfd, int flags)
1521
{
1522
struct filedesc *fdp;
1523
struct fdescenttbl *fdt;
1524
struct filedescent *fde;
1525
int fd, fde_flags;
1526
1527
fde_flags = close_range_to_fde_flags(flags);
1528
fdp = td->td_proc->p_fd;
1529
FILEDESC_XLOCK(fdp);
1530
fdt = atomic_load_ptr(&fdp->fd_files);
1531
highfd = MIN(highfd, fdt->fdt_nfiles - 1);
1532
fd = lowfd;
1533
if (__predict_false(fd > highfd)) {
1534
goto out_locked;
1535
}
1536
for (; fd <= highfd; fd++) {
1537
fde = &fdt->fdt_ofiles[fd];
1538
if (fde->fde_file != NULL)
1539
fde->fde_flags |= fde_flags;
1540
}
1541
out_locked:
1542
FILEDESC_XUNLOCK(fdp);
1543
return (0);
1544
}
1545
1546
static int
1547
close_range_impl(struct thread *td, u_int lowfd, u_int highfd)
1548
{
1549
struct filedesc *fdp;
1550
const struct fdescenttbl *fdt;
1551
struct file *fp;
1552
int fd;
1553
1554
fdp = td->td_proc->p_fd;
1555
FILEDESC_XLOCK(fdp);
1556
fdt = atomic_load_ptr(&fdp->fd_files);
1557
highfd = MIN(highfd, fdt->fdt_nfiles - 1);
1558
fd = lowfd;
1559
if (__predict_false(fd > highfd)) {
1560
goto out_locked;
1561
}
1562
for (;;) {
1563
fp = fdt->fdt_ofiles[fd].fde_file;
1564
if (fp == NULL) {
1565
if (fd == highfd)
1566
goto out_locked;
1567
} else {
1568
fdfree(fdp, fd);
1569
(void) closefp(fdp, fd, fp, td, true, true);
1570
if (fd == highfd)
1571
goto out_unlocked;
1572
FILEDESC_XLOCK(fdp);
1573
fdt = atomic_load_ptr(&fdp->fd_files);
1574
}
1575
fd++;
1576
}
1577
out_locked:
1578
FILEDESC_XUNLOCK(fdp);
1579
out_unlocked:
1580
return (0);
1581
}
1582
1583
int
1584
kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
1585
{
1586
1587
/*
1588
* Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
1589
* open should not be a usage error. From a close_range() perspective,
1590
* close_range(3, ~0U, 0) in the same scenario should also likely not
1591
* be a usage error as all fd above 3 are in-fact already closed.
1592
*/
1593
if (highfd < lowfd) {
1594
return (EINVAL);
1595
}
1596
1597
if ((flags & (CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
1598
return (close_range_flags(td, lowfd, highfd, flags));
1599
1600
return (close_range_impl(td, lowfd, highfd));
1601
}
1602
1603
#ifndef _SYS_SYSPROTO_H_
1604
struct close_range_args {
1605
u_int lowfd;
1606
u_int highfd;
1607
int flags;
1608
};
1609
#endif
1610
int
1611
sys_close_range(struct thread *td, struct close_range_args *uap)
1612
{
1613
1614
AUDIT_ARG_FD(uap->lowfd);
1615
AUDIT_ARG_CMD(uap->highfd);
1616
AUDIT_ARG_FFLAGS(uap->flags);
1617
1618
if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_CLOFORK)) != 0)
1619
return (EINVAL);
1620
return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
1621
}
1622
1623
#ifdef COMPAT_FREEBSD12
/*
 * Compatibility closefrom(2): close every open descriptor numbered
 * uap->lowfd or above.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd12_closefrom_args {
	int	lowfd;
};
#endif
/* ARGSUSED */
int
freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
{
	u_int first;

	AUDIT_ARG_FD(uap->lowfd);

	/*
	 * Treat negative starting file descriptor values identical to
	 * closefrom(0) which closes all files.
	 */
	if (uap->lowfd < 0)
		first = 0;
	else
		first = uap->lowfd;
	return (kern_close_range(td, 0, first, ~0U));
}
#endif /* COMPAT_FREEBSD12 */
1648
1649
#if defined(COMPAT_43)
/*
 * Old-format fstat: fetch the status of uap->fd, convert it to a
 * struct ostat and copy it out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct ostat ost;
	struct stat st;
	int error;

	error = kern_fstat(td, uap->fd, &st);
	if (error != 0)
		return (error);
	cvtstat(&st, &ost);
	return (copyout(&ost, uap->sb, sizeof(ost)));
}
#endif /* COMPAT_43 */
1675
1676
#if defined(COMPAT_FREEBSD11)
/*
 * FreeBSD 11 compatible fstat: fetch the status of uap->fd, convert it to
 * a struct freebsd11_stat and, if the conversion succeeded, copy it out to
 * uap->sb.
 */
int
freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
{
	struct freebsd11_stat ost;
	struct stat st;
	int error;

	error = kern_fstat(td, uap->fd, &st);
	if (error == 0)
		error = freebsd11_cvtstat(&st, &ost);
	if (error == 0)
		error = copyout(&ost, uap->sb, sizeof(ost));
	return (error);
}
#endif /* COMPAT_FREEBSD11 */
1693
1694
/*
1695
* Return status information about a file descriptor.
1696
*/
1697
#ifndef _SYS_SYSPROTO_H_
1698
struct fstat_args {
1699
int fd;
1700
struct stat *sb;
1701
};
1702
#endif
1703
/* ARGSUSED */
1704
int
1705
sys_fstat(struct thread *td, struct fstat_args *uap)
1706
{
1707
struct stat ub;
1708
int error;
1709
1710
error = kern_fstat(td, uap->fd, &ub);
1711
if (error == 0)
1712
error = copyout(&ub, uap->sb, sizeof(ub));
1713
return (error);
1714
}
1715
1716
int
1717
kern_fstat(struct thread *td, int fd, struct stat *sbp)
1718
{
1719
struct file *fp;
1720
int error;
1721
1722
AUDIT_ARG_FD(fd);
1723
1724
error = fget(td, fd, &cap_fstat_rights, &fp);
1725
if (__predict_false(error != 0))
1726
return (error);
1727
1728
AUDIT_ARG_FILE(td->td_proc, fp);
1729
1730
sbp->st_filerev = 0;
1731
sbp->st_bsdflags = 0;
1732
error = fo_stat(fp, sbp, td->td_ucred);
1733
fdrop(fp, td);
1734
#ifdef __STAT_TIME_T_EXT
1735
sbp->st_atim_ext = 0;
1736
sbp->st_mtim_ext = 0;
1737
sbp->st_ctim_ext = 0;
1738
sbp->st_btim_ext = 0;
1739
#endif
1740
#ifdef KTRACE
1741
if (KTRPOINT(td, KTR_STRUCT))
1742
ktrstat_error(sbp, error);
1743
#endif
1744
return (error);
1745
}
1746
1747
#if defined(COMPAT_FREEBSD11)
/*
 * Return status information about a file descriptor in the FreeBSD 11
 * "nstat" format.
 *
 * Fetches the status of uap->fd, converts it to a struct nstat and copies
 * the result out to uap->sb.  Returns the first error encountered by
 * kern_fstat(), freebsd11_cvtnstat() or copyout(); nothing is copied out
 * unless the conversion succeeded.
 */
#ifndef _SYS_SYSPROTO_H_
struct freebsd11_nfstat_args {
	int	fd;
	struct	nstat *sb;
};
#endif
/* ARGSUSED */
int
freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
{
	struct nstat nub;
	struct stat ub;
	int error;

	error = kern_fstat(td, uap->fd, &ub);
	if (error != 0)
		return (error);
	error = freebsd11_cvtnstat(&ub, &nub);
	/*
	 * Copy out only after a successful conversion, as freebsd11_fstat()
	 * does.  The previous "!= 0" test skipped the copyout on success and
	 * overwrote the conversion error on failure.
	 */
	if (error == 0)
		error = copyout(&nub, uap->sb, sizeof(nub));
	return (error);
}
#endif /* COMPAT_FREEBSD11 */
1774
1775
/*
1776
* Return pathconf information about a file descriptor.
1777
*/
1778
#ifndef _SYS_SYSPROTO_H_
1779
struct fpathconf_args {
1780
int fd;
1781
int name;
1782
};
1783
#endif
1784
/* ARGSUSED */
1785
int
1786
sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1787
{
1788
long value;
1789
int error;
1790
1791
error = kern_fpathconf(td, uap->fd, uap->name, &value);
1792
if (error == 0)
1793
td->td_retval[0] = value;
1794
return (error);
1795
}
1796
1797
int
1798
kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
1799
{
1800
struct file *fp;
1801
struct vnode *vp;
1802
int error;
1803
1804
error = fget(td, fd, &cap_fpathconf_rights, &fp);
1805
if (error != 0)
1806
return (error);
1807
1808
if (name == _PC_ASYNC_IO) {
1809
*valuep = _POSIX_ASYNCHRONOUS_IO;
1810
goto out;
1811
}
1812
vp = fp->f_vnode;
1813
if (vp != NULL) {
1814
vn_lock(vp, LK_SHARED | LK_RETRY);
1815
error = VOP_PATHCONF(vp, name, valuep);
1816
VOP_UNLOCK(vp);
1817
} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1818
if (name != _PC_PIPE_BUF) {
1819
error = EINVAL;
1820
} else {
1821
*valuep = PIPE_BUF;
1822
error = 0;
1823
}
1824
} else {
1825
error = EOPNOTSUPP;
1826
}
1827
out:
1828
fdrop(fp, td);
1829
return (error);
1830
}
1831
1832
/*
1833
* Copy filecaps structure allocating memory for ioctls array if needed.
1834
*
1835
* The last parameter indicates whether the fdtable is locked. If it is not and
1836
* ioctls are encountered, copying fails and the caller must lock the table.
1837
*
1838
* Note that if the table was not locked, the caller has to check the relevant
1839
* sequence counter to determine whether the operation was successful.
1840
*/
1841
bool
1842
filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
1843
{
1844
size_t size;
1845
1846
if (src->fc_ioctls != NULL && !locked)
1847
return (false);
1848
memcpy(dst, src, sizeof(*src));
1849
if (src->fc_ioctls == NULL)
1850
return (true);
1851
1852
KASSERT(src->fc_nioctls > 0,
1853
("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1854
1855
size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1856
dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1857
memcpy(dst->fc_ioctls, src->fc_ioctls, size);
1858
return (true);
1859
}
1860
1861
static u_long *
1862
filecaps_copy_prep(const struct filecaps *src)
1863
{
1864
u_long *ioctls;
1865
size_t size;
1866
1867
if (__predict_true(src->fc_ioctls == NULL))
1868
return (NULL);
1869
1870
KASSERT(src->fc_nioctls > 0,
1871
("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1872
1873
size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1874
ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1875
return (ioctls);
1876
}
1877
1878
static void
1879
filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
1880
u_long *ioctls)
1881
{
1882
size_t size;
1883
1884
*dst = *src;
1885
if (__predict_true(src->fc_ioctls == NULL)) {
1886
MPASS(ioctls == NULL);
1887
return;
1888
}
1889
1890
size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1891
dst->fc_ioctls = ioctls;
1892
bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1893
}
1894
1895
/*
1896
* Move filecaps structure to the new place and clear the old place.
1897
*/
1898
void
1899
filecaps_move(struct filecaps *src, struct filecaps *dst)
1900
{
1901
1902
*dst = *src;
1903
bzero(src, sizeof(*src));
1904
}
1905
1906
/*
1907
* Fill the given filecaps structure with full rights.
1908
*/
1909
static void
1910
filecaps_fill(struct filecaps *fcaps)
1911
{
1912
1913
CAP_ALL(&fcaps->fc_rights);
1914
fcaps->fc_ioctls = NULL;
1915
fcaps->fc_nioctls = -1;
1916
fcaps->fc_fcntls = CAP_FCNTL_ALL;
1917
}
1918
1919
/*
1920
* Free memory allocated within filecaps structure.
1921
*/
1922
static void
1923
filecaps_free_ioctl(struct filecaps *fcaps)
1924
{
1925
1926
free(fcaps->fc_ioctls, M_FILECAPS);
1927
fcaps->fc_ioctls = NULL;
1928
}
1929
1930
void
1931
filecaps_free(struct filecaps *fcaps)
1932
{
1933
1934
filecaps_free_ioctl(fcaps);
1935
bzero(fcaps, sizeof(*fcaps));
1936
}
1937
1938
static u_long *
1939
filecaps_free_prep(struct filecaps *fcaps)
1940
{
1941
u_long *ioctls;
1942
1943
ioctls = fcaps->fc_ioctls;
1944
bzero(fcaps, sizeof(*fcaps));
1945
return (ioctls);
1946
}
1947
1948
static void
1949
filecaps_free_finish(u_long *ioctls)
1950
{
1951
1952
free(ioctls, M_FILECAPS);
1953
}
1954
1955
/*
1956
* Validate the given filecaps structure.
1957
*/
1958
static void
1959
filecaps_validate(const struct filecaps *fcaps, const char *func)
1960
{
1961
1962
KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1963
("%s: invalid rights", func));
1964
KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1965
("%s: invalid fcntls", func));
1966
KASSERT(fcaps->fc_fcntls == 0 ||
1967
cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1968
("%s: fcntls without CAP_FCNTL", func));
1969
/*
1970
* open calls without WANTIOCTLCAPS free caps but leave the counter
1971
*/
1972
#if 0
1973
KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1974
(fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1975
("%s: invalid ioctls", func));
1976
#endif
1977
KASSERT(fcaps->fc_nioctls == 0 ||
1978
cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1979
("%s: ioctls without CAP_IOCTL", func));
1980
}
1981
1982
static void
1983
fdgrowtable_exp(struct filedesc *fdp, int nfd)
1984
{
1985
int nfd1;
1986
1987
FILEDESC_XLOCK_ASSERT(fdp);
1988
1989
nfd1 = fdp->fd_nfiles * 2;
1990
if (nfd1 < nfd)
1991
nfd1 = nfd;
1992
fdgrowtable(fdp, nfd1);
1993
}
1994
1995
/*
1996
* Grow the file table to accommodate (at least) nfd descriptors.
1997
*/
1998
static void
1999
fdgrowtable(struct filedesc *fdp, int nfd)
2000
{
2001
struct filedesc0 *fdp0;
2002
struct freetable *ft;
2003
struct fdescenttbl *ntable;
2004
struct fdescenttbl *otable;
2005
int nnfiles, onfiles;
2006
NDSLOTTYPE *nmap, *omap;
2007
2008
KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
2009
2010
/* save old values */
2011
onfiles = fdp->fd_nfiles;
2012
otable = fdp->fd_files;
2013
omap = fdp->fd_map;
2014
2015
/* compute the size of the new table */
2016
nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
2017
if (nnfiles <= onfiles)
2018
/* the table is already large enough */
2019
return;
2020
2021
/*
2022
* Allocate a new table. We need enough space for the number of
2023
* entries, file entries themselves and the struct freetable we will use
2024
* when we decommission the table and place it on the freelist.
2025
* We place the struct freetable in the middle so we don't have
2026
* to worry about padding.
2027
*/
2028
ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
2029
nnfiles * sizeof(ntable->fdt_ofiles[0]) +
2030
sizeof(struct freetable),
2031
M_FILEDESC, M_ZERO | M_WAITOK);
2032
/* copy the old data */
2033
ntable->fdt_nfiles = nnfiles;
2034
memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
2035
onfiles * sizeof(ntable->fdt_ofiles[0]));
2036
2037
/*
2038
* Allocate a new map only if the old is not large enough. It will
2039
* grow at a slower rate than the table as it can map more
2040
* entries than the table can hold.
2041
*/
2042
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
2043
nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
2044
M_ZERO | M_WAITOK);
2045
/* copy over the old data and update the pointer */
2046
memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
2047
fdp->fd_map = nmap;
2048
}
2049
2050
/*
2051
* Make sure that ntable is correctly initialized before we replace
2052
* fd_files poiner. Otherwise fget_unlocked() may see inconsistent
2053
* data.
2054
*/
2055
atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
2056
2057
/*
2058
* Free the old file table when not shared by other threads or processes.
2059
* The old file table is considered to be shared when either are true:
2060
* - The process has more than one thread.
2061
* - The file descriptor table has been shared via fdshare().
2062
*
2063
* When shared, the old file table will be placed on a freelist
2064
* which will be processed when the struct filedesc is released.
2065
*
2066
* Note that if onfiles == NDFILE, we're dealing with the original
2067
* static allocation contained within (struct filedesc0 *)fdp,
2068
* which must not be freed.
2069
*/
2070
if (onfiles > NDFILE) {
2071
/*
2072
* Note we may be called here from fdinit while allocating a
2073
* table for a new process in which case ->p_fd points
2074
* elsewhere.
2075
*/
2076
if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
2077
free(otable, M_FILEDESC);
2078
} else {
2079
ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
2080
fdp0 = (struct filedesc0 *)fdp;
2081
ft->ft_table = otable;
2082
SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
2083
}
2084
}
2085
/*
2086
* The map does not have the same possibility of threads still
2087
* holding references to it. So always free it as long as it
2088
* does not reference the original static allocation.
2089
*/
2090
if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
2091
free(omap, M_FILEDESC);
2092
}
2093
2094
/*
2095
* Allocate a file descriptor for the process.
2096
*/
2097
int
2098
fdalloc(struct thread *td, int minfd, int *result)
2099
{
2100
struct proc *p = td->td_proc;
2101
struct filedesc *fdp = p->p_fd;
2102
int fd, maxfd, allocfd;
2103
#ifdef RACCT
2104
int error;
2105
#endif
2106
2107
FILEDESC_XLOCK_ASSERT(fdp);
2108
2109
if (fdp->fd_freefile > minfd)
2110
minfd = fdp->fd_freefile;
2111
2112
maxfd = getmaxfd(td);
2113
2114
/*
2115
* Search the bitmap for a free descriptor starting at minfd.
2116
* If none is found, grow the file table.
2117
*/
2118
fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
2119
if (__predict_false(fd >= maxfd))
2120
return (EMFILE);
2121
if (__predict_false(fd >= fdp->fd_nfiles)) {
2122
allocfd = min(fd * 2, maxfd);
2123
#ifdef RACCT
2124
if (RACCT_ENABLED()) {
2125
error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
2126
if (error != 0)
2127
return (EMFILE);
2128
}
2129
#endif
2130
/*
2131
* fd is already equal to first free descriptor >= minfd, so
2132
* we only need to grow the table and we are done.
2133
*/
2134
fdgrowtable_exp(fdp, allocfd);
2135
}
2136
2137
/*
2138
* Perform some sanity checks, then mark the file descriptor as
2139
* used and return it to the caller.
2140
*/
2141
KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
2142
("invalid descriptor %d", fd));
2143
KASSERT(!fdisused(fdp, fd),
2144
("fd_first_free() returned non-free descriptor"));
2145
KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
2146
("file descriptor isn't free"));
2147
fdused(fdp, fd);
2148
*result = fd;
2149
return (0);
2150
}
2151
2152
/*
2153
* Allocate n file descriptors for the process.
2154
*/
2155
int
2156
fdallocn(struct thread *td, int minfd, int *fds, int n)
2157
{
2158
struct proc *p = td->td_proc;
2159
struct filedesc *fdp = p->p_fd;
2160
int i;
2161
2162
FILEDESC_XLOCK_ASSERT(fdp);
2163
2164
for (i = 0; i < n; i++)
2165
if (fdalloc(td, 0, &fds[i]) != 0)
2166
break;
2167
2168
if (i < n) {
2169
for (i--; i >= 0; i--)
2170
fdunused(fdp, fds[i]);
2171
return (EMFILE);
2172
}
2173
2174
return (0);
2175
}
2176
2177
/*
2178
* Create a new open file structure and allocate a file descriptor for the
2179
* process that refers to it. We add one reference to the file for the
2180
* descriptor table and one reference for resultfp. This is to prevent us
2181
* being preempted and the entry in the descriptor table closed after we
2182
* release the FILEDESC lock.
2183
*/
2184
int
2185
falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
2186
struct filecaps *fcaps)
2187
{
2188
struct file *fp;
2189
int error, fd;
2190
2191
MPASS(resultfp != NULL);
2192
MPASS(resultfd != NULL);
2193
2194
error = _falloc_noinstall(td, &fp, 2);
2195
if (__predict_false(error != 0)) {
2196
return (error);
2197
}
2198
2199
error = finstall_refed(td, fp, &fd, flags, fcaps);
2200
if (__predict_false(error != 0)) {
2201
falloc_abort(td, fp);
2202
return (error);
2203
}
2204
2205
*resultfp = fp;
2206
*resultfd = fd;
2207
2208
return (0);
2209
}
2210
2211
/*
2212
* Create a new open file structure without allocating a file descriptor.
2213
*/
2214
int
2215
_falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
2216
{
2217
struct file *fp;
2218
int maxuserfiles = maxfiles - (maxfiles / 20);
2219
int openfiles_new;
2220
static struct timeval lastfail;
2221
static int curfail;
2222
2223
KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
2224
MPASS(n > 0);
2225
2226
openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
2227
if ((openfiles_new >= maxuserfiles &&
2228
priv_check(td, PRIV_MAXFILES) != 0) ||
2229
openfiles_new >= maxfiles) {
2230
atomic_subtract_int(&openfiles, 1);
2231
if (ppsratecheck(&lastfail, &curfail, 1)) {
2232
printf("kern.maxfiles limit exceeded by uid %i, (%s) "
2233
"please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
2234
}
2235
return (ENFILE);
2236
}
2237
fp = uma_zalloc(file_zone, M_WAITOK);
2238
bzero(fp, sizeof(*fp));
2239
refcount_init(&fp->f_count, n);
2240
fp->f_cred = crhold(td->td_ucred);
2241
fp->f_ops = &badfileops;
2242
*resultfp = fp;
2243
return (0);
2244
}
2245
2246
void
2247
falloc_abort(struct thread *td, struct file *fp)
2248
{
2249
2250
/*
2251
* For assertion purposes.
2252
*/
2253
refcount_init(&fp->f_count, 0);
2254
_fdrop(fp, td);
2255
}
2256
2257
/*
2258
* Install a file in a file descriptor table.
2259
*/
2260
void
2261
_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
2262
struct filecaps *fcaps)
2263
{
2264
struct filedescent *fde;
2265
2266
MPASS(fp != NULL);
2267
if (fcaps != NULL)
2268
filecaps_validate(fcaps, __func__);
2269
FILEDESC_XLOCK_ASSERT(fdp);
2270
2271
fde = &fdp->fd_ofiles[fd];
2272
#ifdef CAPABILITIES
2273
seqc_write_begin(&fde->fde_seqc);
2274
#endif
2275
fde->fde_file = fp;
2276
fde->fde_flags = open_to_fde_flags(flags, true);
2277
if (fcaps != NULL)
2278
filecaps_move(fcaps, &fde->fde_caps);
2279
else
2280
filecaps_fill(&fde->fde_caps);
2281
#ifdef CAPABILITIES
2282
seqc_write_end(&fde->fde_seqc);
2283
#endif
2284
}
2285
2286
int
2287
finstall_refed(struct thread *td, struct file *fp, int *fd, int flags,
2288
struct filecaps *fcaps)
2289
{
2290
struct filedesc *fdp = td->td_proc->p_fd;
2291
int error;
2292
2293
MPASS(fd != NULL);
2294
2295
FILEDESC_XLOCK(fdp);
2296
error = fdalloc(td, 0, fd);
2297
if (__predict_true(error == 0)) {
2298
_finstall(fdp, fp, *fd, flags, fcaps);
2299
}
2300
FILEDESC_XUNLOCK(fdp);
2301
return (error);
2302
}
2303
2304
int
2305
finstall(struct thread *td, struct file *fp, int *fd, int flags,
2306
struct filecaps *fcaps)
2307
{
2308
int error;
2309
2310
MPASS(fd != NULL);
2311
2312
if (!fhold(fp))
2313
return (EBADF);
2314
error = finstall_refed(td, fp, fd, flags, fcaps);
2315
if (__predict_false(error != 0)) {
2316
fdrop(fp, td);
2317
}
2318
return (error);
2319
}
2320
2321
/*
2322
* Build a new filedesc structure from another.
2323
*
2324
* If fdp is not NULL, return with it shared locked.
2325
*/
2326
struct filedesc *
2327
fdinit(void)
2328
{
2329
struct filedesc0 *newfdp0;
2330
struct filedesc *newfdp;
2331
2332
newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
2333
newfdp = &newfdp0->fd_fd;
2334
2335
/* Create the file descriptor table. */
2336
FILEDESC_LOCK_INIT(newfdp);
2337
refcount_init(&newfdp->fd_refcnt, 1);
2338
refcount_init(&newfdp->fd_holdcnt, 1);
2339
newfdp->fd_map = newfdp0->fd_dmap;
2340
newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
2341
newfdp->fd_files->fdt_nfiles = NDFILE;
2342
2343
return (newfdp);
2344
}
2345
2346
/*
2347
* Build a pwddesc structure from another.
2348
* Copy the current, root, and jail root vnode references.
2349
*
2350
* If pdp is not NULL and keeplock is true, return with it (exclusively) locked.
2351
*/
2352
struct pwddesc *
2353
pdinit(struct pwddesc *pdp, bool keeplock)
2354
{
2355
struct pwddesc *newpdp;
2356
struct pwd *newpwd;
2357
2358
newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO);
2359
2360
PWDDESC_LOCK_INIT(newpdp);
2361
refcount_init(&newpdp->pd_refcount, 1);
2362
newpdp->pd_cmask = CMASK;
2363
2364
if (pdp == NULL) {
2365
newpwd = pwd_alloc();
2366
smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
2367
return (newpdp);
2368
}
2369
2370
PWDDESC_XLOCK(pdp);
2371
newpwd = pwd_hold_pwddesc(pdp);
2372
smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
2373
if (!keeplock)
2374
PWDDESC_XUNLOCK(pdp);
2375
return (newpdp);
2376
}
2377
2378
/*
2379
* Hold either filedesc or pwddesc of the passed process.
2380
*
2381
* The process lock is used to synchronize against the target exiting and
2382
* freeing the data.
2383
*
2384
* Clearing can be ilustrated in 3 steps:
2385
* 1. set the pointer to NULL. Either routine can race against it, hence
2386
* atomic_load_ptr.
2387
* 2. observe the process lock as not taken. Until then fdhold/pdhold can
2388
* race to either still see the pointer or find NULL. It is still safe to
2389
* grab a reference as clearing is stalled.
2390
* 3. after the lock is observed as not taken, any fdhold/pdhold calls are
2391
* guaranteed to see NULL, making it safe to finish clearing
2392
*/
2393
static struct filedesc *
2394
fdhold(struct proc *p)
2395
{
2396
struct filedesc *fdp;
2397
2398
PROC_LOCK_ASSERT(p, MA_OWNED);
2399
fdp = atomic_load_ptr(&p->p_fd);
2400
if (fdp != NULL)
2401
refcount_acquire(&fdp->fd_holdcnt);
2402
return (fdp);
2403
}
2404
2405
static struct pwddesc *
2406
pdhold(struct proc *p)
2407
{
2408
struct pwddesc *pdp;
2409
2410
PROC_LOCK_ASSERT(p, MA_OWNED);
2411
pdp = atomic_load_ptr(&p->p_pd);
2412
if (pdp != NULL)
2413
refcount_acquire(&pdp->pd_refcount);
2414
return (pdp);
2415
}
2416
2417
static void
2418
fddrop(struct filedesc *fdp)
2419
{
2420
2421
if (refcount_load(&fdp->fd_holdcnt) > 1) {
2422
if (refcount_release(&fdp->fd_holdcnt) == 0)
2423
return;
2424
}
2425
2426
FILEDESC_LOCK_DESTROY(fdp);
2427
uma_zfree(filedesc0_zone, fdp);
2428
}
2429
2430
static void
2431
pddrop(struct pwddesc *pdp)
2432
{
2433
struct pwd *pwd;
2434
2435
if (refcount_release_if_not_last(&pdp->pd_refcount))
2436
return;
2437
2438
PWDDESC_XLOCK(pdp);
2439
if (refcount_release(&pdp->pd_refcount) == 0) {
2440
PWDDESC_XUNLOCK(pdp);
2441
return;
2442
}
2443
pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
2444
pwd_set(pdp, NULL);
2445
PWDDESC_XUNLOCK(pdp);
2446
pwd_drop(pwd);
2447
2448
PWDDESC_LOCK_DESTROY(pdp);
2449
free(pdp, M_PWDDESC);
2450
}
2451
2452
/*
2453
* Share a filedesc structure.
2454
*/
2455
struct filedesc *
2456
fdshare(struct filedesc *fdp)
2457
{
2458
2459
refcount_acquire(&fdp->fd_refcnt);
2460
return (fdp);
2461
}
2462
2463
/*
2464
* Share a pwddesc structure.
2465
*/
2466
struct pwddesc *
2467
pdshare(struct pwddesc *pdp)
2468
{
2469
refcount_acquire(&pdp->pd_refcount);
2470
return (pdp);
2471
}
2472
2473
/*
2474
* Unshare a filedesc structure, if necessary by making a copy
2475
*/
2476
void
2477
fdunshare(struct thread *td)
2478
{
2479
struct filedesc *tmp;
2480
struct proc *p = td->td_proc;
2481
2482
if (refcount_load(&p->p_fd->fd_refcnt) == 1)
2483
return;
2484
2485
tmp = fdcopy(p->p_fd, p);
2486
fdescfree(td);
2487
p->p_fd = tmp;
2488
}
2489
2490
/*
2491
* Unshare a pwddesc structure.
2492
*/
2493
void
2494
pdunshare(struct thread *td)
2495
{
2496
struct pwddesc *pdp;
2497
struct proc *p;
2498
2499
p = td->td_proc;
2500
/* Not shared. */
2501
if (refcount_load(&p->p_pd->pd_refcount) == 1)
2502
return;
2503
2504
pdp = pdcopy(p->p_pd);
2505
pdescfree(td);
2506
p->p_pd = pdp;
2507
}
2508
2509
/*
2510
* Copy a filedesc structure. A NULL pointer in returns a NULL reference,
2511
* this is to ease callers, not catch errors.
2512
*/
2513
struct filedesc *
2514
fdcopy(struct filedesc *fdp, struct proc *p1)
2515
{
2516
struct filedesc *newfdp;
2517
struct filedescent *nfde, *ofde;
2518
struct file *fp;
2519
int i, lastfile;
2520
bool fork_pass;
2521
2522
MPASS(fdp != NULL);
2523
2524
fork_pass = false;
2525
newfdp = fdinit();
2526
FILEDESC_SLOCK(fdp);
2527
for (;;) {
2528
lastfile = fdlastfile(fdp);
2529
if (lastfile < newfdp->fd_nfiles)
2530
break;
2531
FILEDESC_SUNLOCK(fdp);
2532
fdgrowtable(newfdp, lastfile + 1);
2533
FILEDESC_SLOCK(fdp);
2534
}
2535
2536
/*
2537
* Copy all passable descriptors (i.e. not kqueue), and
2538
* prepare to handle copyable but not passable descriptors
2539
* (kqueues).
2540
*
2541
* The pass to handle copying is performed after all passable
2542
* files are installed into the new file descriptor's table,
2543
* since kqueues need all referenced file descriptors already
2544
* valid, including other kqueues. For the same reason the
2545
* copying is done in two passes by itself, first installing
2546
* not fully initialized ('empty') copyable files into the new
2547
* fd table, and then giving the subsystems a second chance to
2548
* really fill the copied file backing structure with the
2549
* content.
2550
*/
2551
newfdp->fd_freefile = fdp->fd_freefile;
2552
FILEDESC_FOREACH_FDE(fdp, i, ofde) {
2553
const struct fileops *ops;
2554
2555
ops = ofde->fde_file->f_ops;
2556
fp = NULL;
2557
if ((ops->fo_flags & DFLAG_FORK) != 0 &&
2558
(ofde->fde_flags & UF_FOCLOSE) == 0) {
2559
if (ops->fo_fork(newfdp, ofde->fde_file, &fp, p1,
2560
curthread) != 0)
2561
continue;
2562
fork_pass = true;
2563
} else if ((ops->fo_flags & DFLAG_PASSABLE) == 0 ||
2564
(ofde->fde_flags & UF_FOCLOSE) != 0 ||
2565
!fhold(ofde->fde_file)) {
2566
if (newfdp->fd_freefile == fdp->fd_freefile)
2567
newfdp->fd_freefile = i;
2568
continue;
2569
}
2570
nfde = &newfdp->fd_ofiles[i];
2571
*nfde = *ofde;
2572
if (fp != NULL)
2573
nfde->fde_file = fp;
2574
filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
2575
fdused_init(newfdp, i);
2576
}
2577
MPASS(newfdp->fd_freefile != -1);
2578
FILEDESC_SUNLOCK(fdp);
2579
2580
/*
2581
* Now handle copying kqueues, since all fds, including
2582
* kqueues, are in place.
2583
*/
2584
if (__predict_false(fork_pass)) {
2585
FILEDESC_FOREACH_FDE(newfdp, i, nfde) {
2586
const struct fileops *ops;
2587
2588
ops = nfde->fde_file->f_ops;
2589
if ((ops->fo_flags & DFLAG_FORK) == 0 ||
2590
nfde->fde_file == NULL)
2591
continue;
2592
ops->fo_fork(newfdp, NULL, &nfde->fde_file, p1,
2593
curthread);
2594
}
2595
}
2596
return (newfdp);
2597
}
2598
2599
/*
2600
* Copy a pwddesc structure.
2601
*/
2602
struct pwddesc *
2603
pdcopy(struct pwddesc *pdp)
2604
{
2605
struct pwddesc *newpdp;
2606
2607
MPASS(pdp != NULL);
2608
2609
newpdp = pdinit(pdp, true);
2610
newpdp->pd_cmask = pdp->pd_cmask;
2611
PWDDESC_XUNLOCK(pdp);
2612
return (newpdp);
2613
}
2614
2615
/*
2616
* Clear POSIX style locks. This is only used when fdp looses a reference (i.e.
2617
* one of processes using it exits) and the table used to be shared.
2618
*/
2619
static void
2620
fdclearlocks(struct thread *td)
2621
{
2622
struct filedesc *fdp;
2623
struct filedesc_to_leader *fdtol;
2624
struct flock lf;
2625
struct file *fp;
2626
struct proc *p;
2627
struct vnode *vp;
2628
int i;
2629
2630
p = td->td_proc;
2631
fdp = p->p_fd;
2632
fdtol = p->p_fdtol;
2633
MPASS(fdtol != NULL);
2634
2635
FILEDESC_XLOCK(fdp);
2636
KASSERT(fdtol->fdl_refcount > 0,
2637
("filedesc_to_refcount botch: fdl_refcount=%d",
2638
fdtol->fdl_refcount));
2639
if (fdtol->fdl_refcount == 1 &&
2640
(p->p_leader->p_flag & P_ADVLOCK) != 0) {
2641
FILEDESC_FOREACH_FP(fdp, i, fp) {
2642
if (fp->f_type != DTYPE_VNODE ||
2643
!fhold(fp))
2644
continue;
2645
FILEDESC_XUNLOCK(fdp);
2646
lf.l_whence = SEEK_SET;
2647
lf.l_start = 0;
2648
lf.l_len = 0;
2649
lf.l_type = F_UNLCK;
2650
vp = fp->f_vnode;
2651
(void) VOP_ADVLOCK(vp,
2652
(caddr_t)p->p_leader, F_UNLCK,
2653
&lf, F_POSIX);
2654
FILEDESC_XLOCK(fdp);
2655
fdrop(fp, td);
2656
}
2657
}
2658
retry:
2659
if (fdtol->fdl_refcount == 1) {
2660
if (fdp->fd_holdleaderscount > 0 &&
2661
(p->p_leader->p_flag & P_ADVLOCK) != 0) {
2662
/*
2663
* close() or kern_dup() has cleared a reference
2664
* in a shared file descriptor table.
2665
*/
2666
fdp->fd_holdleaderswakeup = 1;
2667
sx_sleep(&fdp->fd_holdleaderscount,
2668
FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
2669
goto retry;
2670
}
2671
if (fdtol->fdl_holdcount > 0) {
2672
/*
2673
* Ensure that fdtol->fdl_leader remains
2674
* valid in closef().
2675
*/
2676
fdtol->fdl_wakeup = 1;
2677
sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
2678
"fdlhold", 0);
2679
goto retry;
2680
}
2681
}
2682
fdtol->fdl_refcount--;
2683
if (fdtol->fdl_refcount == 0 &&
2684
fdtol->fdl_holdcount == 0) {
2685
fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
2686
fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
2687
} else
2688
fdtol = NULL;
2689
p->p_fdtol = NULL;
2690
FILEDESC_XUNLOCK(fdp);
2691
if (fdtol != NULL)
2692
free(fdtol, M_FILEDESC_TO_LEADER);
2693
}
2694
2695
/*
2696
* Release a filedesc structure.
2697
*/
2698
static void
2699
fdescfree_fds(struct thread *td, struct filedesc *fdp)
2700
{
2701
struct filedesc0 *fdp0;
2702
struct freetable *ft, *tft;
2703
struct filedescent *fde;
2704
struct file *fp;
2705
int i;
2706
2707
KASSERT(refcount_load(&fdp->fd_refcnt) == 0,
2708
("%s: fd table %p carries references", __func__, fdp));
2709
2710
/*
2711
* Serialize with threads iterating over the table, if any.
2712
*/
2713
if (refcount_load(&fdp->fd_holdcnt) > 1) {
2714
FILEDESC_XLOCK(fdp);
2715
FILEDESC_XUNLOCK(fdp);
2716
}
2717
2718
FILEDESC_FOREACH_FDE(fdp, i, fde) {
2719
fp = fde->fde_file;
2720
fdefree_last(fde);
2721
(void) closef(fp, td);
2722
}
2723
2724
if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2725
free(fdp->fd_map, M_FILEDESC);
2726
if (fdp->fd_nfiles > NDFILE)
2727
free(fdp->fd_files, M_FILEDESC);
2728
2729
fdp0 = (struct filedesc0 *)fdp;
2730
SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
2731
free(ft->ft_table, M_FILEDESC);
2732
2733
fddrop(fdp);
2734
}
2735
2736
void
2737
fdescfree(struct thread *td)
2738
{
2739
struct proc *p;
2740
struct filedesc *fdp;
2741
2742
p = td->td_proc;
2743
fdp = p->p_fd;
2744
MPASS(fdp != NULL);
2745
2746
#ifdef RACCT
2747
if (RACCT_ENABLED())
2748
racct_set_unlocked(p, RACCT_NOFILE, 0);
2749
#endif
2750
2751
if (p->p_fdtol != NULL)
2752
fdclearlocks(td);
2753
2754
/*
2755
* Check fdhold for an explanation.
2756
*/
2757
atomic_store_ptr(&p->p_fd, NULL);
2758
atomic_thread_fence_seq_cst();
2759
PROC_WAIT_UNLOCKED(p);
2760
2761
if (refcount_release(&fdp->fd_refcnt) == 0)
2762
return;
2763
2764
fdescfree_fds(td, fdp);
2765
}
2766
2767
void
2768
pdescfree(struct thread *td)
2769
{
2770
struct proc *p;
2771
struct pwddesc *pdp;
2772
2773
p = td->td_proc;
2774
pdp = p->p_pd;
2775
MPASS(pdp != NULL);
2776
2777
/*
2778
* Check pdhold for an explanation.
2779
*/
2780
atomic_store_ptr(&p->p_pd, NULL);
2781
atomic_thread_fence_seq_cst();
2782
PROC_WAIT_UNLOCKED(p);
2783
2784
pddrop(pdp);
2785
}
2786
2787
/*
2788
* For setugid programs, we don't want to people to use that setugidness
2789
* to generate error messages which write to a file which otherwise would
2790
* otherwise be off-limits to the process. We check for filesystems where
2791
* the vnode can change out from under us after execve (like [lin]procfs).
2792
*
2793
* Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
2794
* sufficient. We also don't check for setugidness since we know we are.
2795
*/
2796
static bool
2797
is_unsafe(struct file *fp)
2798
{
2799
struct vnode *vp;
2800
2801
if (fp->f_type != DTYPE_VNODE)
2802
return (false);
2803
2804
vp = fp->f_vnode;
2805
return ((vp->v_vflag & VV_PROCDEP) != 0);
2806
}
2807
2808
/*
2809
* Make this setguid thing safe, if at all possible.
2810
*/
2811
void
2812
fdsetugidsafety(struct thread *td)
2813
{
2814
struct filedesc *fdp;
2815
struct file *fp;
2816
int i;
2817
2818
fdp = td->td_proc->p_fd;
2819
KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
2820
("the fdtable should not be shared"));
2821
MPASS(fdp->fd_nfiles >= 3);
2822
for (i = 0; i <= 2; i++) {
2823
fp = fdp->fd_ofiles[i].fde_file;
2824
if (fp != NULL && is_unsafe(fp)) {
2825
FILEDESC_XLOCK(fdp);
2826
knote_fdclose(td, i);
2827
/*
2828
* NULL-out descriptor prior to close to avoid
2829
* a race while close blocks.
2830
*/
2831
fdfree(fdp, i);
2832
FILEDESC_XUNLOCK(fdp);
2833
(void) closef(fp, td);
2834
}
2835
}
2836
}
2837
2838
/*
2839
* If a specific file object occupies a specific file descriptor, close the
2840
* file descriptor entry and drop a reference on the file object. This is a
2841
* convenience function to handle a subsequent error in a function that calls
2842
* falloc() that handles the race that another thread might have closed the
2843
* file descriptor out from under the thread creating the file object.
2844
*/
2845
void
2846
fdclose(struct thread *td, struct file *fp, int idx)
2847
{
2848
struct filedesc *fdp = td->td_proc->p_fd;
2849
2850
FILEDESC_XLOCK(fdp);
2851
if (fdp->fd_ofiles[idx].fde_file == fp) {
2852
fdfree(fdp, idx);
2853
FILEDESC_XUNLOCK(fdp);
2854
fdrop(fp, td);
2855
} else
2856
FILEDESC_XUNLOCK(fdp);
2857
}
2858
2859
/*
2860
* Close any files on exec?
2861
*/
2862
void
2863
fdcloseexec(struct thread *td)
2864
{
2865
struct filedesc *fdp;
2866
struct filedescent *fde;
2867
struct file *fp;
2868
int i;
2869
2870
fdp = td->td_proc->p_fd;
2871
KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
2872
("the fdtable should not be shared"));
2873
FILEDESC_FOREACH_FDE(fdp, i, fde) {
2874
fp = fde->fde_file;
2875
if (fp->f_type == DTYPE_MQUEUE ||
2876
(fde->fde_flags & UF_EXCLOSE)) {
2877
FILEDESC_XLOCK(fdp);
2878
fdfree(fdp, i);
2879
(void) closefp(fdp, i, fp, td, false, false);
2880
FILEDESC_UNLOCK_ASSERT(fdp);
2881
} else if (fde->fde_flags & UF_FOCLOSE) {
2882
/*
2883
* https://austingroupbugs.net/view.php?id=1851
2884
* FD_CLOFORK should not be preserved across exec
2885
*/
2886
fde->fde_flags &= ~UF_FOCLOSE;
2887
}
2888
}
2889
}
2890
2891
/*
2892
* It is unsafe for set[ug]id processes to be started with file
2893
* descriptors 0..2 closed, as these descriptors are given implicit
2894
* significance in the Standard C library. fdcheckstd() will create a
2895
* descriptor referencing /dev/null for each of stdin, stdout, and
2896
* stderr that is not already open.
2897
*/
2898
int
2899
fdcheckstd(struct thread *td)
2900
{
2901
struct filedesc *fdp;
2902
register_t save;
2903
int i, error, devnull;
2904
2905
fdp = td->td_proc->p_fd;
2906
KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
2907
("the fdtable should not be shared"));
2908
MPASS(fdp->fd_nfiles >= 3);
2909
devnull = -1;
2910
for (i = 0; i <= 2; i++) {
2911
if (fdp->fd_ofiles[i].fde_file != NULL)
2912
continue;
2913
2914
save = td->td_retval[0];
2915
if (devnull != -1) {
2916
error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
2917
} else {
2918
error = kern_openat(td, AT_FDCWD, "/dev/null",
2919
UIO_SYSSPACE, O_RDWR, 0);
2920
if (error == 0) {
2921
devnull = td->td_retval[0];
2922
KASSERT(devnull == i, ("we didn't get our fd"));
2923
}
2924
}
2925
td->td_retval[0] = save;
2926
if (error != 0)
2927
return (error);
2928
}
2929
return (0);
2930
}
2931
2932
/*
2933
* Internal form of close. Decrement reference count on file structure.
2934
* Note: td may be NULL when closing a file that was being passed in a
2935
* message.
2936
*/
2937
int
2938
closef(struct file *fp, struct thread *td)
2939
{
2940
struct vnode *vp;
2941
struct flock lf;
2942
struct filedesc_to_leader *fdtol;
2943
struct filedesc *fdp;
2944
2945
MPASS(td != NULL);
2946
2947
/*
2948
* POSIX record locking dictates that any close releases ALL
2949
* locks owned by this process. This is handled by setting
2950
* a flag in the unlock to free ONLY locks obeying POSIX
2951
* semantics, and not to free BSD-style file locks.
2952
* If the descriptor was in a message, POSIX-style locks
2953
* aren't passed with the descriptor, and the thread pointer
2954
* will be NULL. Callers should be careful only to pass a
2955
* NULL thread pointer when there really is no owning
2956
* context that might have locks, or the locks will be
2957
* leaked.
2958
*/
2959
if (fp->f_type == DTYPE_VNODE) {
2960
vp = fp->f_vnode;
2961
if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2962
lf.l_whence = SEEK_SET;
2963
lf.l_start = 0;
2964
lf.l_len = 0;
2965
lf.l_type = F_UNLCK;
2966
(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2967
F_UNLCK, &lf, F_POSIX);
2968
}
2969
fdtol = td->td_proc->p_fdtol;
2970
if (fdtol != NULL) {
2971
/*
2972
* Handle special case where file descriptor table is
2973
* shared between multiple process leaders.
2974
*/
2975
fdp = td->td_proc->p_fd;
2976
FILEDESC_XLOCK(fdp);
2977
for (fdtol = fdtol->fdl_next;
2978
fdtol != td->td_proc->p_fdtol;
2979
fdtol = fdtol->fdl_next) {
2980
if ((fdtol->fdl_leader->p_flag &
2981
P_ADVLOCK) == 0)
2982
continue;
2983
fdtol->fdl_holdcount++;
2984
FILEDESC_XUNLOCK(fdp);
2985
lf.l_whence = SEEK_SET;
2986
lf.l_start = 0;
2987
lf.l_len = 0;
2988
lf.l_type = F_UNLCK;
2989
vp = fp->f_vnode;
2990
(void) VOP_ADVLOCK(vp,
2991
(caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2992
F_POSIX);
2993
FILEDESC_XLOCK(fdp);
2994
fdtol->fdl_holdcount--;
2995
if (fdtol->fdl_holdcount == 0 &&
2996
fdtol->fdl_wakeup != 0) {
2997
fdtol->fdl_wakeup = 0;
2998
wakeup(fdtol);
2999
}
3000
}
3001
FILEDESC_XUNLOCK(fdp);
3002
}
3003
}
3004
return (fdrop_close(fp, td));
3005
}
3006
3007
/*
3008
* Hack for file descriptor passing code.
3009
*/
3010
void
3011
closef_nothread(struct file *fp)
3012
{
3013
3014
fdrop(fp, NULL);
3015
}
3016
3017
/*
3018
* Initialize the file pointer with the specified properties.
3019
*
3020
* The ops are set with release semantics to be certain that the flags, type,
3021
* and data are visible when ops is. This is to prevent ops methods from being
3022
* called with bad data.
3023
*/
3024
void
3025
finit(struct file *fp, u_int flag, short type, void *data,
3026
const struct fileops *ops)
3027
{
3028
fp->f_data = data;
3029
fp->f_flag = flag;
3030
fp->f_type = type;
3031
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
3032
}
3033
3034
void
3035
finit_vnode(struct file *fp, u_int flag, void *data, const struct fileops *ops)
3036
{
3037
fp->f_seqcount[UIO_READ] = 1;
3038
fp->f_seqcount[UIO_WRITE] = 1;
3039
finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
3040
data, ops);
3041
}
3042
3043
int
3044
fget_cap_noref(struct filedesc *fdp, int fd, const cap_rights_t *needrightsp,
3045
struct file **fpp, struct filecaps *havecapsp)
3046
{
3047
struct filedescent *fde;
3048
int error;
3049
3050
FILEDESC_LOCK_ASSERT(fdp);
3051
3052
*fpp = NULL;
3053
fde = fdeget_noref(fdp, fd);
3054
if (fde == NULL) {
3055
error = EBADF;
3056
goto out;
3057
}
3058
3059
#ifdef CAPABILITIES
3060
error = cap_check(cap_rights_fde_inline(fde), needrightsp);
3061
if (error != 0)
3062
goto out;
3063
#endif
3064
3065
if (havecapsp != NULL)
3066
filecaps_copy(&fde->fde_caps, havecapsp, true);
3067
3068
*fpp = fde->fde_file;
3069
3070
error = 0;
3071
out:
3072
return (error);
3073
}
3074
3075
#ifdef CAPABILITIES
3076
int
3077
fget_cap(struct thread *td, int fd, const cap_rights_t *needrightsp,
3078
uint8_t *flagsp, struct file **fpp, struct filecaps *havecapsp)
3079
{
3080
struct filedesc *fdp = td->td_proc->p_fd;
3081
int error;
3082
struct file *fp;
3083
seqc_t seq;
3084
3085
*fpp = NULL;
3086
for (;;) {
3087
error = fget_unlocked_seq(td, fd, needrightsp, flagsp, &fp,
3088
&seq);
3089
if (error != 0)
3090
return (error);
3091
3092
if (havecapsp != NULL) {
3093
if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
3094
havecapsp, false)) {
3095
fdrop(fp, td);
3096
goto get_locked;
3097
}
3098
}
3099
3100
if (!fd_modified(fdp, fd, seq))
3101
break;
3102
fdrop(fp, td);
3103
}
3104
3105
*fpp = fp;
3106
return (0);
3107
3108
get_locked:
3109
FILEDESC_SLOCK(fdp);
3110
error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp);
3111
if (error == 0 && !fhold(*fpp))
3112
error = EBADF;
3113
FILEDESC_SUNLOCK(fdp);
3114
return (error);
3115
}
3116
#else
3117
int
3118
fget_cap(struct thread *td, int fd, const cap_rights_t *needrightsp,
3119
uint8_t *flagsp, struct file **fpp, struct filecaps *havecapsp)
3120
{
3121
int error;
3122
error = fget_unlocked_flags(td, fd, needrightsp, flagsp, fpp);
3123
if (havecapsp != NULL && error == 0)
3124
filecaps_fill(havecapsp);
3125
3126
return (error);
3127
}
3128
#endif
3129
3130
int
3131
fget_remote(struct thread *td, struct proc *p, int fd, struct file **fpp)
3132
{
3133
struct filedesc *fdp;
3134
struct file *fp;
3135
int error;
3136
3137
if (p == td->td_proc) /* curproc */
3138
return (fget_unlocked(td, fd, &cap_no_rights, fpp));
3139
3140
PROC_LOCK(p);
3141
fdp = fdhold(p);
3142
PROC_UNLOCK(p);
3143
if (fdp == NULL)
3144
return (ENOENT);
3145
FILEDESC_SLOCK(fdp);
3146
if (refcount_load(&fdp->fd_refcnt) != 0) {
3147
fp = fget_noref(fdp, fd);
3148
if (fp != NULL && fhold(fp)) {
3149
*fpp = fp;
3150
error = 0;
3151
} else {
3152
error = EBADF;
3153
}
3154
} else {
3155
error = ENOENT;
3156
}
3157
FILEDESC_SUNLOCK(fdp);
3158
fddrop(fdp);
3159
return (error);
3160
}
3161
3162
int
3163
fget_remote_foreach(struct thread *td, struct proc *p,
3164
int (*fn)(struct proc *, int, struct file *, void *), void *arg)
3165
{
3166
struct filedesc *fdp;
3167
struct fdescenttbl *fdt;
3168
struct file *fp;
3169
int error, error1, fd, highfd;
3170
3171
error = 0;
3172
PROC_LOCK(p);
3173
fdp = fdhold(p);
3174
PROC_UNLOCK(p);
3175
if (fdp == NULL)
3176
return (ENOENT);
3177
3178
FILEDESC_SLOCK(fdp);
3179
if (refcount_load(&fdp->fd_refcnt) != 0) {
3180
fdt = atomic_load_ptr(&fdp->fd_files);
3181
highfd = fdt->fdt_nfiles - 1;
3182
FILEDESC_SUNLOCK(fdp);
3183
} else {
3184
error = ENOENT;
3185
FILEDESC_SUNLOCK(fdp);
3186
goto out;
3187
}
3188
3189
for (fd = 0; fd <= highfd; fd++) {
3190
error1 = fget_remote(td, p, fd, &fp);
3191
if (error1 != 0)
3192
continue;
3193
error = fn(p, fd, fp, arg);
3194
fdrop(fp, td);
3195
if (error != 0)
3196
break;
3197
}
3198
out:
3199
fddrop(fdp);
3200
return (error);
3201
}
3202
3203
#ifdef CAPABILITIES
3204
int
3205
fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, int *flagsp)
3206
{
3207
const struct filedescent *fde;
3208
const struct fdescenttbl *fdt;
3209
struct filedesc *fdp;
3210
struct file *fp;
3211
struct vnode *vp;
3212
const cap_rights_t *haverights;
3213
cap_rights_t rights;
3214
seqc_t seq;
3215
int fd, flags;
3216
3217
VFS_SMR_ASSERT_ENTERED();
3218
3219
fd = ndp->ni_dirfd;
3220
rights = *ndp->ni_rightsneeded;
3221
cap_rights_set_one(&rights, CAP_LOOKUP);
3222
3223
fdp = curproc->p_fd;
3224
fdt = fdp->fd_files;
3225
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
3226
return (EBADF);
3227
seq = seqc_read_notmodify(fd_seqc(fdt, fd));
3228
fde = &fdt->fdt_ofiles[fd];
3229
haverights = cap_rights_fde_inline(fde);
3230
fp = fde->fde_file;
3231
if (__predict_false(fp == NULL))
3232
return (EAGAIN);
3233
if (__predict_false(cap_check_inline_transient(haverights, &rights)))
3234
return (EAGAIN);
3235
flags = fp->f_flag & FSEARCH;
3236
flags |= (fde->fde_flags & UF_RESOLVE_BENEATH) != 0 ?
3237
O_RESOLVE_BENEATH : 0;
3238
vp = fp->f_vnode;
3239
if (__predict_false(vp == NULL)) {
3240
return (EAGAIN);
3241
}
3242
if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
3243
return (EAGAIN);
3244
}
3245
/*
3246
* Use an acquire barrier to force re-reading of fdt so it is
3247
* refreshed for verification.
3248
*/
3249
atomic_thread_fence_acq();
3250
fdt = fdp->fd_files;
3251
if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)))
3252
return (EAGAIN);
3253
/*
3254
* If file descriptor doesn't have all rights,
3255
* all lookups relative to it must also be
3256
* strictly relative.
3257
*
3258
* Not yet supported by fast path.
3259
*/
3260
CAP_ALL(&rights);
3261
if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
3262
ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
3263
ndp->ni_filecaps.fc_nioctls != -1) {
3264
#ifdef notyet
3265
ndp->ni_lcf |= NI_LCF_STRICTREL;
3266
#else
3267
return (EAGAIN);
3268
#endif
3269
}
3270
*vpp = vp;
3271
*flagsp = flags;
3272
return (0);
3273
}
3274
#else
3275
int
3276
fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, int *flagsp)
3277
{
3278
const struct filedescent *fde;
3279
const struct fdescenttbl *fdt;
3280
struct filedesc *fdp;
3281
struct file *fp;
3282
struct vnode *vp;
3283
int fd, flags;
3284
3285
VFS_SMR_ASSERT_ENTERED();
3286
3287
fd = ndp->ni_dirfd;
3288
fdp = curproc->p_fd;
3289
fdt = fdp->fd_files;
3290
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
3291
return (EBADF);
3292
fde = &fdt->fdt_ofiles[fd];
3293
fp = fde->fde_file;
3294
if (__predict_false(fp == NULL))
3295
return (EAGAIN);
3296
flags = fp->f_flag & FSEARCH;
3297
flags |= (fde->fde_flags & UF_RESOLVE_BENEATH) != 0 ?
3298
O_RESOLVE_BENEATH : 0;
3299
vp = fp->f_vnode;
3300
if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
3301
return (EAGAIN);
3302
}
3303
/*
3304
* Use an acquire barrier to force re-reading of fdt so it is
3305
* refreshed for verification.
3306
*/
3307
atomic_thread_fence_acq();
3308
fdt = fdp->fd_files;
3309
if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
3310
return (EAGAIN);
3311
filecaps_fill(&ndp->ni_filecaps);
3312
*vpp = vp;
3313
*flagsp = flags;
3314
return (0);
3315
}
3316
#endif
3317
3318
int
3319
fgetvp_lookup(struct nameidata *ndp, struct vnode **vpp)
3320
{
3321
struct thread *td;
3322
struct file *fp;
3323
struct vnode *vp;
3324
struct componentname *cnp;
3325
cap_rights_t rights;
3326
int error;
3327
uint8_t flags;
3328
3329
td = curthread;
3330
rights = *ndp->ni_rightsneeded;
3331
cap_rights_set_one(&rights, CAP_LOOKUP);
3332
cnp = &ndp->ni_cnd;
3333
3334
error = fget_cap(td, ndp->ni_dirfd, &rights, &flags, &fp,
3335
&ndp->ni_filecaps);
3336
if (__predict_false(error != 0))
3337
return (error);
3338
if (__predict_false(fp->f_ops == &badfileops)) {
3339
error = EBADF;
3340
goto out_free;
3341
}
3342
vp = fp->f_vnode;
3343
if (__predict_false(vp == NULL)) {
3344
error = ENOTDIR;
3345
goto out_free;
3346
}
3347
vrefact(vp);
3348
/*
3349
* XXX does not check for VDIR, handled by namei_setup
3350
*/
3351
if ((fp->f_flag & FSEARCH) != 0)
3352
cnp->cn_flags |= NOEXECCHECK;
3353
if ((flags & UF_RESOLVE_BENEATH) != 0) {
3354
cnp->cn_flags |= RBENEATH;
3355
ndp->ni_resflags |= NIRES_BENEATH;
3356
}
3357
fdrop(fp, td);
3358
3359
#ifdef CAPABILITIES
3360
/*
3361
* If file descriptor doesn't have all rights,
3362
* all lookups relative to it must also be
3363
* strictly relative.
3364
*/
3365
CAP_ALL(&rights);
3366
if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
3367
ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
3368
ndp->ni_filecaps.fc_nioctls != -1) {
3369
ndp->ni_lcf |= NI_LCF_STRICTREL;
3370
ndp->ni_resflags |= NIRES_STRICTREL;
3371
}
3372
#endif
3373
3374
/*
3375
* TODO: avoid copying ioctl caps if it can be helped to begin with
3376
*/
3377
if ((cnp->cn_flags & WANTIOCTLCAPS) == 0)
3378
filecaps_free_ioctl(&ndp->ni_filecaps);
3379
3380
*vpp = vp;
3381
return (0);
3382
3383
out_free:
3384
filecaps_free(&ndp->ni_filecaps);
3385
fdrop(fp, td);
3386
return (error);
3387
}
3388
3389
/*
3390
* Fetch the descriptor locklessly.
3391
*
3392
* We avoid fdrop() races by never raising a refcount above 0. To accomplish
3393
* this we have to use a cmpset loop rather than an atomic_add. The descriptor
3394
* must be re-verified once we acquire a reference to be certain that the
3395
* identity is still correct and we did not lose a race due to preemption.
3396
*
3397
* Force a reload of fdt when looping. Another thread could reallocate
3398
* the table before this fd was closed, so it is possible that there is
3399
* a stale fp pointer in cached version.
3400
*/
3401
#ifdef CAPABILITIES
3402
static int
3403
fget_unlocked_seq(struct thread *td, int fd, const cap_rights_t *needrightsp,
3404
uint8_t *flagsp, struct file **fpp, seqc_t *seqp)
3405
{
3406
struct filedesc *fdp;
3407
const struct filedescent *fde;
3408
const struct fdescenttbl *fdt;
3409
struct file *fp;
3410
seqc_t seq;
3411
cap_rights_t haverights;
3412
int error;
3413
uint8_t flags;
3414
3415
fdp = td->td_proc->p_fd;
3416
fdt = fdp->fd_files;
3417
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
3418
return (EBADF);
3419
3420
for (;;) {
3421
seq = seqc_read_notmodify(fd_seqc(fdt, fd));
3422
fde = &fdt->fdt_ofiles[fd];
3423
haverights = *cap_rights_fde_inline(fde);
3424
fp = fde->fde_file;
3425
flags = fde->fde_flags;
3426
if (__predict_false(fp == NULL)) {
3427
if (seqc_consistent(fd_seqc(fdt, fd), seq))
3428
return (EBADF);
3429
fdt = atomic_load_ptr(&fdp->fd_files);
3430
continue;
3431
}
3432
error = cap_check_inline(&haverights, needrightsp);
3433
if (__predict_false(error != 0)) {
3434
if (seqc_consistent(fd_seqc(fdt, fd), seq))
3435
return (error);
3436
fdt = atomic_load_ptr(&fdp->fd_files);
3437
continue;
3438
}
3439
if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
3440
fdt = atomic_load_ptr(&fdp->fd_files);
3441
continue;
3442
}
3443
/*
3444
* Use an acquire barrier to force re-reading of fdt so it is
3445
* refreshed for verification.
3446
*/
3447
atomic_thread_fence_acq();
3448
fdt = fdp->fd_files;
3449
if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))
3450
break;
3451
fdrop(fp, td);
3452
}
3453
*fpp = fp;
3454
if (flagsp != NULL)
3455
*flagsp = flags;
3456
if (seqp != NULL)
3457
*seqp = seq;
3458
return (0);
3459
}
3460
#else
3461
static int
3462
fget_unlocked_seq(struct thread *td, int fd, const cap_rights_t *needrightsp,
3463
uint8_t *flagsp, struct file **fpp, seqc_t *seqp __unused)
3464
{
3465
struct filedesc *fdp;
3466
const struct fdescenttbl *fdt;
3467
struct file *fp;
3468
uint8_t flags;
3469
3470
fdp = td->td_proc->p_fd;
3471
fdt = fdp->fd_files;
3472
if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
3473
return (EBADF);
3474
3475
for (;;) {
3476
fp = fdt->fdt_ofiles[fd].fde_file;
3477
flags = fdt->fdt_ofiles[fd].fde_flags;
3478
if (__predict_false(fp == NULL))
3479
return (EBADF);
3480
if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
3481
fdt = atomic_load_ptr(&fdp->fd_files);
3482
continue;
3483
}
3484
/*
3485
* Use an acquire barrier to force re-reading of fdt so it is
3486
* refreshed for verification.
3487
*/
3488
atomic_thread_fence_acq();
3489
fdt = fdp->fd_files;
3490
if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file))
3491
break;
3492
fdrop(fp, td);
3493
}
3494
if (flagsp != NULL)
3495
*flagsp = flags;
3496
*fpp = fp;
3497
return (0);
3498
}
3499
#endif
3500
3501
/*
3502
* See the comments in fget_unlocked_seq for an explanation of how this works.
3503
*
3504
* This is a simplified variant which bails out to the aforementioned routine
3505
* if anything goes wrong. In practice this only happens when userspace is
3506
* racing with itself.
3507
*/
3508
int
3509
fget_unlocked_flags(struct thread *td, int fd, const cap_rights_t *needrightsp,
3510
uint8_t *flagsp, struct file **fpp)
3511
{
3512
struct filedesc *fdp;
3513
#ifdef CAPABILITIES
3514
const struct filedescent *fde;
3515
#endif
3516
const struct fdescenttbl *fdt;
3517
struct file *fp;
3518
#ifdef CAPABILITIES
3519
seqc_t seq;
3520
const cap_rights_t *haverights;
3521
#endif
3522
uint8_t flags;
3523
3524
fdp = td->td_proc->p_fd;
3525
fdt = fdp->fd_files;
3526
if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) {
3527
*fpp = NULL;
3528
return (EBADF);
3529
}
3530
#ifdef CAPABILITIES
3531
seq = seqc_read_notmodify(fd_seqc(fdt, fd));
3532
fde = &fdt->fdt_ofiles[fd];
3533
haverights = cap_rights_fde_inline(fde);
3534
fp = fde->fde_file;
3535
flags = fde->fde_flags;
3536
#else
3537
fp = fdt->fdt_ofiles[fd].fde_file;
3538
flags = fdt->fdt_ofiles[fd].fde_flags;
3539
#endif
3540
if (__predict_false(fp == NULL))
3541
goto out_fallback;
3542
#ifdef CAPABILITIES
3543
if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
3544
goto out_fallback;
3545
#endif
3546
if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
3547
goto out_fallback;
3548
3549
/*
3550
* Use an acquire barrier to force re-reading of fdt so it is
3551
* refreshed for verification.
3552
*/
3553
atomic_thread_fence_acq();
3554
fdt = fdp->fd_files;
3555
#ifdef CAPABILITIES
3556
if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)))
3557
#else
3558
if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
3559
#endif
3560
goto out_fdrop;
3561
*fpp = fp;
3562
if (flagsp != NULL)
3563
*flagsp = flags;
3564
return (0);
3565
out_fdrop:
3566
fdrop(fp, td);
3567
out_fallback:
3568
*fpp = NULL;
3569
return (fget_unlocked_seq(td, fd, needrightsp, flagsp, fpp, NULL));
3570
}
3571
3572
int
3573
fget_unlocked(struct thread *td, int fd, const cap_rights_t *needrightsp,
3574
struct file **fpp)
3575
{
3576
return (fget_unlocked_flags(td, fd, needrightsp, NULL, fpp));
3577
}
3578
3579
/*
3580
* Translate fd -> file when the caller guarantees the file descriptor table
3581
* can't be changed by others.
3582
*
3583
* Note this does not mean the file object itself is only visible to the caller,
3584
 * merely that it won't disappear without having to be referenced.
3585
*
3586
* Must be paired with fput_only_user.
3587
*/
3588
#ifdef CAPABILITIES
3589
int
3590
fget_only_user(struct filedesc *fdp, int fd, const cap_rights_t *needrightsp,
3591
struct file **fpp)
3592
{
3593
const struct filedescent *fde;
3594
const struct fdescenttbl *fdt;
3595
const cap_rights_t *haverights;
3596
struct file *fp;
3597
int error;
3598
3599
MPASS(FILEDESC_IS_ONLY_USER(fdp));
3600
3601
*fpp = NULL;
3602
if (__predict_false(fd >= fdp->fd_nfiles))
3603
return (EBADF);
3604
3605
fdt = fdp->fd_files;
3606
fde = &fdt->fdt_ofiles[fd];
3607
fp = fde->fde_file;
3608
if (__predict_false(fp == NULL))
3609
return (EBADF);
3610
MPASS(refcount_load(&fp->f_count) > 0);
3611
haverights = cap_rights_fde_inline(fde);
3612
error = cap_check_inline(haverights, needrightsp);
3613
if (__predict_false(error != 0))
3614
return (error);
3615
*fpp = fp;
3616
return (0);
3617
}
3618
#else
3619
int
3620
fget_only_user(struct filedesc *fdp, int fd, const cap_rights_t *needrightsp,
3621
struct file **fpp)
3622
{
3623
struct file *fp;
3624
3625
MPASS(FILEDESC_IS_ONLY_USER(fdp));
3626
3627
*fpp = NULL;
3628
if (__predict_false(fd >= fdp->fd_nfiles))
3629
return (EBADF);
3630
3631
fp = fdp->fd_ofiles[fd].fde_file;
3632
if (__predict_false(fp == NULL))
3633
return (EBADF);
3634
3635
MPASS(refcount_load(&fp->f_count) > 0);
3636
*fpp = fp;
3637
return (0);
3638
}
3639
#endif
3640
3641
/*
3642
* Extract the file pointer associated with the specified descriptor for the
3643
* current user process.
3644
*
3645
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
3646
* returned.
3647
*
3648
* File's rights will be checked against the capability rights mask.
3649
*
3650
* If an error occurred the non-zero error is returned and *fpp is set to
3651
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
3652
* responsible for fdrop().
3653
*/
3654
static __inline int
3655
_fget(struct thread *td, int fd, struct file **fpp, int flags,
3656
const cap_rights_t *needrightsp)
3657
{
3658
struct file *fp;
3659
int error;
3660
3661
*fpp = NULL;
3662
error = fget_unlocked(td, fd, needrightsp, &fp);
3663
if (__predict_false(error != 0))
3664
return (error);
3665
if (__predict_false(fp->f_ops == &badfileops)) {
3666
fdrop(fp, td);
3667
return (EBADF);
3668
}
3669
3670
/*
3671
* FREAD and FWRITE failure return EBADF as per POSIX.
3672
*/
3673
error = 0;
3674
switch (flags) {
3675
case FREAD:
3676
case FWRITE:
3677
if ((fp->f_flag & flags) == 0)
3678
error = EBADF;
3679
break;
3680
case FEXEC:
3681
if (fp->f_ops != &path_fileops &&
3682
((fp->f_flag & (FREAD | FEXEC)) == 0 ||
3683
(fp->f_flag & FWRITE) != 0))
3684
error = EBADF;
3685
break;
3686
case 0:
3687
break;
3688
default:
3689
KASSERT(0, ("wrong flags"));
3690
}
3691
3692
if (error != 0) {
3693
fdrop(fp, td);
3694
return (error);
3695
}
3696
3697
*fpp = fp;
3698
return (0);
3699
}
3700
3701
int
3702
fget(struct thread *td, int fd, const cap_rights_t *rightsp, struct file **fpp)
3703
{
3704
3705
return (_fget(td, fd, fpp, 0, rightsp));
3706
}
3707
3708
int
3709
fget_mmap(struct thread *td, int fd, const cap_rights_t *rightsp,
3710
vm_prot_t *maxprotp, struct file **fpp)
3711
{
3712
int error;
3713
#ifndef CAPABILITIES
3714
error = _fget(td, fd, fpp, 0, rightsp);
3715
if (maxprotp != NULL)
3716
*maxprotp = VM_PROT_ALL;
3717
return (error);
3718
#else
3719
cap_rights_t fdrights;
3720
struct filedesc *fdp;
3721
struct file *fp;
3722
seqc_t seq;
3723
3724
*fpp = NULL;
3725
fdp = td->td_proc->p_fd;
3726
MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
3727
for (;;) {
3728
error = fget_unlocked_seq(td, fd, rightsp, NULL, &fp, &seq);
3729
if (__predict_false(error != 0))
3730
return (error);
3731
if (__predict_false(fp->f_ops == &badfileops)) {
3732
fdrop(fp, td);
3733
return (EBADF);
3734
}
3735
if (maxprotp != NULL)
3736
fdrights = *cap_rights(fdp, fd);
3737
if (!fd_modified(fdp, fd, seq))
3738
break;
3739
fdrop(fp, td);
3740
}
3741
3742
/*
3743
* If requested, convert capability rights to access flags.
3744
*/
3745
if (maxprotp != NULL)
3746
*maxprotp = cap_rights_to_vmprot(&fdrights);
3747
*fpp = fp;
3748
return (0);
3749
#endif
3750
}
3751
3752
int
3753
fget_read(struct thread *td, int fd, const cap_rights_t *rightsp,
3754
struct file **fpp)
3755
{
3756
3757
return (_fget(td, fd, fpp, FREAD, rightsp));
3758
}
3759
3760
int
3761
fget_write(struct thread *td, int fd, const cap_rights_t *rightsp,
3762
struct file **fpp)
3763
{
3764
3765
return (_fget(td, fd, fpp, FWRITE, rightsp));
3766
}
3767
3768
int
3769
fget_fcntl(struct thread *td, int fd, const cap_rights_t *rightsp,
3770
int needfcntl, struct file **fpp)
3771
{
3772
#ifndef CAPABILITIES
3773
return (fget_unlocked(td, fd, rightsp, fpp));
3774
#else
3775
struct filedesc *fdp = td->td_proc->p_fd;
3776
struct file *fp;
3777
int error;
3778
seqc_t seq;
3779
3780
*fpp = NULL;
3781
MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
3782
for (;;) {
3783
error = fget_unlocked_seq(td, fd, rightsp, NULL, &fp, &seq);
3784
if (error != 0)
3785
return (error);
3786
error = cap_fcntl_check(fdp, fd, needfcntl);
3787
if (!fd_modified(fdp, fd, seq))
3788
break;
3789
fdrop(fp, td);
3790
}
3791
if (error != 0) {
3792
fdrop(fp, td);
3793
return (error);
3794
}
3795
*fpp = fp;
3796
return (0);
3797
#endif
3798
}
3799
3800
/*
3801
* Like fget() but loads the underlying vnode, or returns an error if the
3802
* descriptor does not represent a vnode. Note that pipes use vnodes but
3803
* never have VM objects. The returned vnode will be vref()'d.
3804
*
3805
* XXX: what about the unused flags ?
3806
*/
3807
static __inline int
3808
_fgetvp(struct thread *td, int fd, int flags, const cap_rights_t *needrightsp,
3809
struct vnode **vpp)
3810
{
3811
struct file *fp;
3812
int error;
3813
3814
*vpp = NULL;
3815
error = _fget(td, fd, &fp, flags, needrightsp);
3816
if (error != 0)
3817
return (error);
3818
if (fp->f_vnode == NULL) {
3819
error = EINVAL;
3820
} else {
3821
*vpp = fp->f_vnode;
3822
vrefact(*vpp);
3823
}
3824
fdrop(fp, td);
3825
3826
return (error);
3827
}
3828
3829
int
3830
fgetvp(struct thread *td, int fd, const cap_rights_t *rightsp,
3831
struct vnode **vpp)
3832
{
3833
3834
return (_fgetvp(td, fd, 0, rightsp, vpp));
3835
}
3836
3837
int
3838
fgetvp_rights(struct thread *td, int fd, const cap_rights_t *needrightsp,
3839
struct filecaps *havecaps, struct vnode **vpp)
3840
{
3841
struct filecaps caps;
3842
struct file *fp;
3843
int error;
3844
3845
error = fget_cap(td, fd, needrightsp, NULL, &fp, &caps);
3846
if (error != 0)
3847
return (error);
3848
if (fp->f_ops == &badfileops) {
3849
error = EBADF;
3850
goto out;
3851
}
3852
if (fp->f_vnode == NULL) {
3853
error = EINVAL;
3854
goto out;
3855
}
3856
3857
*havecaps = caps;
3858
*vpp = fp->f_vnode;
3859
vrefact(*vpp);
3860
fdrop(fp, td);
3861
3862
return (0);
3863
out:
3864
filecaps_free(&caps);
3865
fdrop(fp, td);
3866
return (error);
3867
}
3868
3869
int
3870
fgetvp_read(struct thread *td, int fd, const cap_rights_t *rightsp,
3871
struct vnode **vpp)
3872
{
3873
3874
return (_fgetvp(td, fd, FREAD, rightsp, vpp));
3875
}
3876
3877
int
3878
fgetvp_exec(struct thread *td, int fd, const cap_rights_t *rightsp,
3879
struct vnode **vpp)
3880
{
3881
3882
return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
3883
}
3884
3885
#ifdef notyet
/*
 * Fetch the vnode behind fd via _fgetvp(), requiring FWRITE on the file.
 * Only compiled when 'notyet' is defined.
 */
int
fgetvp_write(struct thread *td, int fd, const cap_rights_t *rightsp,
    struct vnode **vpp)
{
	int error;

	error = _fgetvp(td, fd, FWRITE, rightsp, vpp);
	return (error);
}
#endif
3894
3895
/*
3896
* Handle the last reference to a file being closed.
3897
*
3898
* Without the noinline attribute clang keeps inlining the func thorough this
3899
* file when fdrop is used.
3900
*/
3901
int __noinline
3902
_fdrop(struct file *fp, struct thread *td)
3903
{
3904
int error;
3905
3906
KASSERT(refcount_load(&fp->f_count) == 0,
3907
("fdrop: fp %p count %d", fp, refcount_load(&fp->f_count)));
3908
3909
error = fo_close(fp, td);
3910
atomic_subtract_int(&openfiles, 1);
3911
crfree(fp->f_cred);
3912
free(fp->f_advice, M_FADVISE);
3913
uma_zfree(file_zone, fp);
3914
3915
return (error);
3916
}
3917
3918
/*
3919
* Apply an advisory lock on a file descriptor.
3920
*
3921
* Just attempt to get a record lock of the requested type on the entire file
3922
* (l_whence = SEEK_SET, l_start = 0, l_len = 0).
3923
*/
3924
#ifndef _SYS_SYSPROTO_H_
3925
struct flock_args {
3926
int fd;
3927
int how;
3928
};
3929
#endif
3930
/* ARGSUSED */
3931
int
3932
sys_flock(struct thread *td, struct flock_args *uap)
3933
{
3934
struct file *fp;
3935
struct vnode *vp;
3936
struct flock lf;
3937
int error;
3938
3939
error = fget(td, uap->fd, &cap_flock_rights, &fp);
3940
if (error != 0)
3941
return (error);
3942
error = EOPNOTSUPP;
3943
if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
3944
goto done;
3945
}
3946
if (fp->f_ops == &path_fileops) {
3947
goto done;
3948
}
3949
3950
error = 0;
3951
vp = fp->f_vnode;
3952
lf.l_whence = SEEK_SET;
3953
lf.l_start = 0;
3954
lf.l_len = 0;
3955
if (uap->how & LOCK_UN) {
3956
lf.l_type = F_UNLCK;
3957
atomic_clear_int(&fp->f_flag, FHASLOCK);
3958
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
3959
goto done;
3960
}
3961
if (uap->how & LOCK_EX)
3962
lf.l_type = F_WRLCK;
3963
else if (uap->how & LOCK_SH)
3964
lf.l_type = F_RDLCK;
3965
else {
3966
error = EBADF;
3967
goto done;
3968
}
3969
atomic_set_int(&fp->f_flag, FHASLOCK);
3970
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
3971
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
3972
done:
3973
fdrop(fp, td);
3974
return (error);
3975
}
3976
/*
3977
* Duplicate the specified descriptor to a free descriptor.
3978
*/
3979
int
3980
dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
3981
int openerror, int *indxp)
3982
{
3983
struct filedescent *newfde, *oldfde;
3984
struct file *fp;
3985
u_long *ioctls;
3986
int error, indx;
3987
3988
KASSERT(openerror == ENODEV || openerror == ENXIO,
3989
("unexpected error %d in %s", openerror, __func__));
3990
3991
/*
3992
* If the to-be-dup'd fd number is greater than the allowed number
3993
* of file descriptors, or the fd to be dup'd has already been
3994
* closed, then reject.
3995
*/
3996
FILEDESC_XLOCK(fdp);
3997
if ((fp = fget_noref(fdp, dfd)) == NULL) {
3998
FILEDESC_XUNLOCK(fdp);
3999
return (EBADF);
4000
}
4001
4002
error = fdalloc(td, 0, &indx);
4003
if (error != 0) {
4004
FILEDESC_XUNLOCK(fdp);
4005
return (error);
4006
}
4007
4008
/*
4009
* There are two cases of interest here.
4010
*
4011
* For ENODEV simply dup (dfd) to file descriptor (indx) and return.
4012
*
4013
* For ENXIO steal away the file structure from (dfd) and store it in
4014
* (indx). (dfd) is effectively closed by this operation.
4015
*/
4016
switch (openerror) {
4017
case ENODEV:
4018
/*
4019
* Check that the mode the file is being opened for is a
4020
* subset of the mode of the existing descriptor.
4021
*/
4022
if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
4023
fdunused(fdp, indx);
4024
FILEDESC_XUNLOCK(fdp);
4025
return (EACCES);
4026
}
4027
if (!fhold(fp)) {
4028
fdunused(fdp, indx);
4029
FILEDESC_XUNLOCK(fdp);
4030
return (EBADF);
4031
}
4032
newfde = &fdp->fd_ofiles[indx];
4033
oldfde = &fdp->fd_ofiles[dfd];
4034
ioctls = filecaps_copy_prep(&oldfde->fde_caps);
4035
#ifdef CAPABILITIES
4036
seqc_write_begin(&newfde->fde_seqc);
4037
#endif
4038
fde_copy(oldfde, newfde);
4039
filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
4040
ioctls);
4041
#ifdef CAPABILITIES
4042
seqc_write_end(&newfde->fde_seqc);
4043
#endif
4044
break;
4045
case ENXIO:
4046
/*
4047
* Steal away the file pointer from dfd and stuff it into indx.
4048
*/
4049
newfde = &fdp->fd_ofiles[indx];
4050
oldfde = &fdp->fd_ofiles[dfd];
4051
#ifdef CAPABILITIES
4052
seqc_write_begin(&oldfde->fde_seqc);
4053
seqc_write_begin(&newfde->fde_seqc);
4054
#endif
4055
fde_copy(oldfde, newfde);
4056
oldfde->fde_file = NULL;
4057
fdunused(fdp, dfd);
4058
#ifdef CAPABILITIES
4059
seqc_write_end(&newfde->fde_seqc);
4060
seqc_write_end(&oldfde->fde_seqc);
4061
#endif
4062
break;
4063
}
4064
FILEDESC_XUNLOCK(fdp);
4065
*indxp = indx;
4066
return (0);
4067
}
4068
4069
/*
4070
* This sysctl determines if we will allow a process to chroot(2) if it
4071
* has a directory open:
4072
* 0: disallowed for all processes.
4073
* 1: allowed for processes that were not already chroot(2)'ed.
4074
* 2: allowed for all processes.
4075
*/
4076
4077
static int chroot_allow_open_directories = 1;
4078
4079
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
4080
&chroot_allow_open_directories, 0,
4081
"Allow a process to chroot(2) if it has a directory open");
4082
4083
/*
4084
* Helper function for raised chroot(2) security function: Refuse if
4085
* any filedescriptors are open directories.
4086
*/
4087
static int
4088
chroot_refuse_vdir_fds(struct filedesc *fdp)
4089
{
4090
struct vnode *vp;
4091
struct file *fp;
4092
int i;
4093
4094
FILEDESC_LOCK_ASSERT(fdp);
4095
4096
FILEDESC_FOREACH_FP(fdp, i, fp) {
4097
if (fp->f_type == DTYPE_VNODE) {
4098
vp = fp->f_vnode;
4099
if (vp->v_type == VDIR)
4100
return (EPERM);
4101
}
4102
}
4103
return (0);
4104
}
4105
4106
static void
4107
pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
4108
{
4109
4110
if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
4111
vrefact(oldpwd->pwd_cdir);
4112
newpwd->pwd_cdir = oldpwd->pwd_cdir;
4113
}
4114
4115
if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
4116
vrefact(oldpwd->pwd_rdir);
4117
newpwd->pwd_rdir = oldpwd->pwd_rdir;
4118
}
4119
4120
if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
4121
vrefact(oldpwd->pwd_jdir);
4122
newpwd->pwd_jdir = oldpwd->pwd_jdir;
4123
}
4124
4125
if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) {
4126
vrefact(oldpwd->pwd_adir);
4127
newpwd->pwd_adir = oldpwd->pwd_adir;
4128
}
4129
}
4130
4131
struct pwd *
4132
pwd_hold_pwddesc(struct pwddesc *pdp)
4133
{
4134
struct pwd *pwd;
4135
4136
PWDDESC_ASSERT_XLOCKED(pdp);
4137
pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4138
if (pwd != NULL)
4139
refcount_acquire(&pwd->pwd_refcount);
4140
return (pwd);
4141
}
4142
4143
bool
4144
pwd_hold_smr(struct pwd *pwd)
4145
{
4146
4147
MPASS(pwd != NULL);
4148
if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
4149
return (true);
4150
}
4151
return (false);
4152
}
4153
4154
struct pwd *
4155
pwd_hold(struct thread *td)
4156
{
4157
struct pwddesc *pdp;
4158
struct pwd *pwd;
4159
4160
pdp = td->td_proc->p_pd;
4161
4162
vfs_smr_enter();
4163
pwd = vfs_smr_entered_load(&pdp->pd_pwd);
4164
if (pwd_hold_smr(pwd)) {
4165
vfs_smr_exit();
4166
return (pwd);
4167
}
4168
vfs_smr_exit();
4169
PWDDESC_XLOCK(pdp);
4170
pwd = pwd_hold_pwddesc(pdp);
4171
MPASS(pwd != NULL);
4172
PWDDESC_XUNLOCK(pdp);
4173
return (pwd);
4174
}
4175
4176
struct pwd *
4177
pwd_hold_proc(struct proc *p)
4178
{
4179
struct pwddesc *pdp;
4180
struct pwd *pwd;
4181
4182
PROC_ASSERT_HELD(p);
4183
PROC_LOCK(p);
4184
pdp = pdhold(p);
4185
MPASS(pdp != NULL);
4186
PROC_UNLOCK(p);
4187
4188
PWDDESC_XLOCK(pdp);
4189
pwd = pwd_hold_pwddesc(pdp);
4190
MPASS(pwd != NULL);
4191
PWDDESC_XUNLOCK(pdp);
4192
pddrop(pdp);
4193
return (pwd);
4194
}
4195
4196
static struct pwd *
4197
pwd_alloc(void)
4198
{
4199
struct pwd *pwd;
4200
4201
pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
4202
bzero(pwd, sizeof(*pwd));
4203
refcount_init(&pwd->pwd_refcount, 1);
4204
return (pwd);
4205
}
4206
4207
void
4208
pwd_drop(struct pwd *pwd)
4209
{
4210
4211
if (!refcount_release(&pwd->pwd_refcount))
4212
return;
4213
4214
if (pwd->pwd_cdir != NULL)
4215
vrele(pwd->pwd_cdir);
4216
if (pwd->pwd_rdir != NULL)
4217
vrele(pwd->pwd_rdir);
4218
if (pwd->pwd_jdir != NULL)
4219
vrele(pwd->pwd_jdir);
4220
if (pwd->pwd_adir != NULL)
4221
vrele(pwd->pwd_adir);
4222
uma_zfree_smr(pwd_zone, pwd);
4223
}
4224
4225
/*
4226
* The caller is responsible for invoking priv_check() and
4227
* mac_vnode_check_chroot() to authorize this operation.
4228
*/
4229
int
4230
pwd_chroot(struct thread *td, struct vnode *vp)
4231
{
4232
struct pwddesc *pdp;
4233
struct filedesc *fdp;
4234
struct pwd *newpwd, *oldpwd;
4235
int error;
4236
4237
fdp = td->td_proc->p_fd;
4238
pdp = td->td_proc->p_pd;
4239
newpwd = pwd_alloc();
4240
FILEDESC_SLOCK(fdp);
4241
PWDDESC_XLOCK(pdp);
4242
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4243
if (chroot_allow_open_directories == 0 ||
4244
(chroot_allow_open_directories == 1 &&
4245
oldpwd->pwd_rdir != rootvnode)) {
4246
error = chroot_refuse_vdir_fds(fdp);
4247
FILEDESC_SUNLOCK(fdp);
4248
if (error != 0) {
4249
PWDDESC_XUNLOCK(pdp);
4250
pwd_drop(newpwd);
4251
return (error);
4252
}
4253
} else {
4254
FILEDESC_SUNLOCK(fdp);
4255
}
4256
4257
vrefact(vp);
4258
newpwd->pwd_rdir = vp;
4259
vrefact(vp);
4260
newpwd->pwd_adir = vp;
4261
if (oldpwd->pwd_jdir == NULL) {
4262
vrefact(vp);
4263
newpwd->pwd_jdir = vp;
4264
}
4265
pwd_fill(oldpwd, newpwd);
4266
pwd_set(pdp, newpwd);
4267
PWDDESC_XUNLOCK(pdp);
4268
pwd_drop(oldpwd);
4269
return (0);
4270
}
4271
4272
void
4273
pwd_chdir(struct thread *td, struct vnode *vp)
4274
{
4275
struct pwddesc *pdp;
4276
struct pwd *newpwd, *oldpwd;
4277
4278
VNPASS(vp->v_usecount > 0, vp);
4279
4280
newpwd = pwd_alloc();
4281
pdp = td->td_proc->p_pd;
4282
PWDDESC_XLOCK(pdp);
4283
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4284
newpwd->pwd_cdir = vp;
4285
pwd_fill(oldpwd, newpwd);
4286
pwd_set(pdp, newpwd);
4287
PWDDESC_XUNLOCK(pdp);
4288
pwd_drop(oldpwd);
4289
}
4290
4291
/*
4292
* Process is transitioning to/from a non-native ABI.
4293
*/
4294
void
4295
pwd_altroot(struct thread *td, struct vnode *altroot_vp)
4296
{
4297
struct pwddesc *pdp;
4298
struct pwd *newpwd, *oldpwd;
4299
4300
newpwd = pwd_alloc();
4301
pdp = td->td_proc->p_pd;
4302
PWDDESC_XLOCK(pdp);
4303
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4304
if (altroot_vp != NULL) {
4305
/*
4306
* Native process to a non-native ABI.
4307
*/
4308
4309
vrefact(altroot_vp);
4310
newpwd->pwd_adir = altroot_vp;
4311
} else {
4312
/*
4313
* Non-native process to the native ABI.
4314
*/
4315
4316
vrefact(oldpwd->pwd_rdir);
4317
newpwd->pwd_adir = oldpwd->pwd_rdir;
4318
}
4319
pwd_fill(oldpwd, newpwd);
4320
pwd_set(pdp, newpwd);
4321
PWDDESC_XUNLOCK(pdp);
4322
pwd_drop(oldpwd);
4323
}
4324
4325
/*
4326
* jail_attach(2) changes both root and working directories.
4327
*/
4328
int
4329
pwd_chroot_chdir(struct thread *td, struct vnode *vp)
4330
{
4331
struct pwddesc *pdp;
4332
struct filedesc *fdp;
4333
struct pwd *newpwd, *oldpwd;
4334
int error;
4335
4336
fdp = td->td_proc->p_fd;
4337
pdp = td->td_proc->p_pd;
4338
newpwd = pwd_alloc();
4339
FILEDESC_SLOCK(fdp);
4340
PWDDESC_XLOCK(pdp);
4341
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4342
error = chroot_refuse_vdir_fds(fdp);
4343
FILEDESC_SUNLOCK(fdp);
4344
if (error != 0) {
4345
PWDDESC_XUNLOCK(pdp);
4346
pwd_drop(newpwd);
4347
return (error);
4348
}
4349
4350
vrefact(vp);
4351
newpwd->pwd_rdir = vp;
4352
vrefact(vp);
4353
newpwd->pwd_cdir = vp;
4354
if (oldpwd->pwd_jdir == NULL) {
4355
vrefact(vp);
4356
newpwd->pwd_jdir = vp;
4357
}
4358
vrefact(vp);
4359
newpwd->pwd_adir = vp;
4360
pwd_fill(oldpwd, newpwd);
4361
pwd_set(pdp, newpwd);
4362
PWDDESC_XUNLOCK(pdp);
4363
pwd_drop(oldpwd);
4364
return (0);
4365
}
4366
4367
void
4368
pwd_ensure_dirs(void)
4369
{
4370
struct pwddesc *pdp;
4371
struct pwd *oldpwd, *newpwd;
4372
4373
pdp = curproc->p_pd;
4374
PWDDESC_XLOCK(pdp);
4375
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4376
if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL &&
4377
oldpwd->pwd_adir != NULL) {
4378
PWDDESC_XUNLOCK(pdp);
4379
return;
4380
}
4381
PWDDESC_XUNLOCK(pdp);
4382
4383
newpwd = pwd_alloc();
4384
PWDDESC_XLOCK(pdp);
4385
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4386
pwd_fill(oldpwd, newpwd);
4387
if (newpwd->pwd_cdir == NULL) {
4388
vrefact(rootvnode);
4389
newpwd->pwd_cdir = rootvnode;
4390
}
4391
if (newpwd->pwd_rdir == NULL) {
4392
vrefact(rootvnode);
4393
newpwd->pwd_rdir = rootvnode;
4394
}
4395
if (newpwd->pwd_adir == NULL) {
4396
vrefact(rootvnode);
4397
newpwd->pwd_adir = rootvnode;
4398
}
4399
pwd_set(pdp, newpwd);
4400
PWDDESC_XUNLOCK(pdp);
4401
pwd_drop(oldpwd);
4402
}
4403
4404
void
4405
pwd_set_rootvnode(void)
4406
{
4407
struct pwddesc *pdp;
4408
struct pwd *oldpwd, *newpwd;
4409
4410
pdp = curproc->p_pd;
4411
4412
newpwd = pwd_alloc();
4413
PWDDESC_XLOCK(pdp);
4414
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4415
vrefact(rootvnode);
4416
newpwd->pwd_cdir = rootvnode;
4417
vrefact(rootvnode);
4418
newpwd->pwd_rdir = rootvnode;
4419
vrefact(rootvnode);
4420
newpwd->pwd_adir = rootvnode;
4421
pwd_fill(oldpwd, newpwd);
4422
pwd_set(pdp, newpwd);
4423
PWDDESC_XUNLOCK(pdp);
4424
pwd_drop(oldpwd);
4425
}
4426
4427
/*
4428
* Scan all active processes and prisons to see if any of them have a current
4429
* or root directory of `olddp'. If so, replace them with the new mount point.
4430
*/
4431
void
4432
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
4433
{
4434
struct pwddesc *pdp;
4435
struct pwd *newpwd, *oldpwd;
4436
struct prison *pr;
4437
struct proc *p;
4438
int nrele;
4439
4440
if (vrefcnt(olddp) == 1)
4441
return;
4442
nrele = 0;
4443
newpwd = pwd_alloc();
4444
sx_slock(&allproc_lock);
4445
FOREACH_PROC_IN_SYSTEM(p) {
4446
PROC_LOCK(p);
4447
pdp = pdhold(p);
4448
PROC_UNLOCK(p);
4449
if (pdp == NULL)
4450
continue;
4451
PWDDESC_XLOCK(pdp);
4452
oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
4453
if (oldpwd == NULL ||
4454
(oldpwd->pwd_cdir != olddp &&
4455
oldpwd->pwd_rdir != olddp &&
4456
oldpwd->pwd_jdir != olddp &&
4457
oldpwd->pwd_adir != olddp)) {
4458
PWDDESC_XUNLOCK(pdp);
4459
pddrop(pdp);
4460
continue;
4461
}
4462
if (oldpwd->pwd_cdir == olddp) {
4463
vrefact(newdp);
4464
newpwd->pwd_cdir = newdp;
4465
}
4466
if (oldpwd->pwd_rdir == olddp) {
4467
vrefact(newdp);
4468
newpwd->pwd_rdir = newdp;
4469
}
4470
if (oldpwd->pwd_jdir == olddp) {
4471
vrefact(newdp);
4472
newpwd->pwd_jdir = newdp;
4473
}
4474
if (oldpwd->pwd_adir == olddp) {
4475
vrefact(newdp);
4476
newpwd->pwd_adir = newdp;
4477
}
4478
pwd_fill(oldpwd, newpwd);
4479
pwd_set(pdp, newpwd);
4480
PWDDESC_XUNLOCK(pdp);
4481
pwd_drop(oldpwd);
4482
pddrop(pdp);
4483
newpwd = pwd_alloc();
4484
}
4485
sx_sunlock(&allproc_lock);
4486
pwd_drop(newpwd);
4487
if (rootvnode == olddp) {
4488
vrefact(newdp);
4489
rootvnode = newdp;
4490
nrele++;
4491
}
4492
mtx_lock(&prison0.pr_mtx);
4493
if (prison0.pr_root == olddp) {
4494
vrefact(newdp);
4495
prison0.pr_root = newdp;
4496
nrele++;
4497
}
4498
mtx_unlock(&prison0.pr_mtx);
4499
sx_slock(&allprison_lock);
4500
TAILQ_FOREACH(pr, &allprison, pr_list) {
4501
mtx_lock(&pr->pr_mtx);
4502
if (pr->pr_root == olddp) {
4503
vrefact(newdp);
4504
pr->pr_root = newdp;
4505
nrele++;
4506
}
4507
mtx_unlock(&pr->pr_mtx);
4508
}
4509
sx_sunlock(&allprison_lock);
4510
while (nrele--)
4511
vrele(olddp);
4512
}
4513
4514
int
4515
descrip_check_write_mp(struct filedesc *fdp, struct mount *mp)
4516
{
4517
struct file *fp;
4518
struct vnode *vp;
4519
int error, i;
4520
4521
error = 0;
4522
FILEDESC_SLOCK(fdp);
4523
FILEDESC_FOREACH_FP(fdp, i, fp) {
4524
if (fp->f_type != DTYPE_VNODE ||
4525
(atomic_load_int(&fp->f_flag) & FWRITE) == 0)
4526
continue;
4527
vp = fp->f_vnode;
4528
if (vp->v_mount == mp) {
4529
error = EDEADLK;
4530
break;
4531
}
4532
}
4533
FILEDESC_SUNLOCK(fdp);
4534
return (error);
4535
}
4536
4537
struct filedesc_to_leader *
4538
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp,
4539
struct proc *leader)
4540
{
4541
struct filedesc_to_leader *fdtol;
4542
4543
fdtol = malloc(sizeof(struct filedesc_to_leader),
4544
M_FILEDESC_TO_LEADER, M_WAITOK);
4545
fdtol->fdl_refcount = 1;
4546
fdtol->fdl_holdcount = 0;
4547
fdtol->fdl_wakeup = 0;
4548
fdtol->fdl_leader = leader;
4549
if (old != NULL) {
4550
FILEDESC_XLOCK(fdp);
4551
fdtol->fdl_next = old->fdl_next;
4552
fdtol->fdl_prev = old;
4553
old->fdl_next = fdtol;
4554
fdtol->fdl_next->fdl_prev = fdtol;
4555
FILEDESC_XUNLOCK(fdp);
4556
} else {
4557
fdtol->fdl_next = fdtol;
4558
fdtol->fdl_prev = fdtol;
4559
}
4560
return (fdtol);
4561
}
4562
4563
struct filedesc_to_leader *
4564
filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp)
4565
{
4566
FILEDESC_XLOCK(fdp);
4567
fdtol->fdl_refcount++;
4568
FILEDESC_XUNLOCK(fdp);
4569
return (fdtol);
4570
}
4571
4572
static int
4573
filedesc_nfiles(struct filedesc *fdp)
4574
{
4575
NDSLOTTYPE *map;
4576
int count, off, minoff;
4577
4578
if (fdp == NULL)
4579
return (0);
4580
count = 0;
4581
FILEDESC_SLOCK(fdp);
4582
map = fdp->fd_map;
4583
off = NDSLOT(fdp->fd_nfiles - 1);
4584
for (minoff = NDSLOT(0); off >= minoff; --off)
4585
count += bitcountl(map[off]);
4586
FILEDESC_SUNLOCK(fdp);
4587
return (count);
4588
}
4589
4590
int
4591
proc_nfiles(struct proc *p)
4592
{
4593
struct filedesc *fdp;
4594
int res;
4595
4596
PROC_LOCK(p);
4597
fdp = fdhold(p);
4598
PROC_UNLOCK(p);
4599
res = filedesc_nfiles(fdp);
4600
fddrop(fdp);
4601
return (res);
4602
}
4603
4604
static int
4605
sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
4606
{
4607
u_int namelen;
4608
int count;
4609
4610
namelen = arg2;
4611
if (namelen != 1)
4612
return (EINVAL);
4613
4614
if (*(int *)arg1 != 0)
4615
return (EINVAL);
4616
4617
count = filedesc_nfiles(curproc->p_fd);
4618
return (SYSCTL_OUT(req, &count, sizeof(count)));
4619
}
4620
4621
static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
4622
CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
4623
"Number of open file descriptors");
4624
4625
/*
4626
* Get file structures globally.
4627
*/
4628
static int
4629
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
4630
{
4631
struct xfile xf;
4632
struct filedesc *fdp;
4633
struct file *fp;
4634
struct proc *p;
4635
int error, n;
4636
4637
error = sysctl_wire_old_buffer(req, 0);
4638
if (error != 0)
4639
return (error);
4640
if (req->oldptr == NULL) {
4641
n = 0;
4642
sx_slock(&allproc_lock);
4643
FOREACH_PROC_IN_SYSTEM(p) {
4644
PROC_LOCK(p);
4645
if (p->p_state == PRS_NEW) {
4646
PROC_UNLOCK(p);
4647
continue;
4648
}
4649
fdp = fdhold(p);
4650
PROC_UNLOCK(p);
4651
if (fdp == NULL)
4652
continue;
4653
/* overestimates sparse tables. */
4654
n += fdp->fd_nfiles;
4655
fddrop(fdp);
4656
}
4657
sx_sunlock(&allproc_lock);
4658
return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
4659
}
4660
error = 0;
4661
bzero(&xf, sizeof(xf));
4662
xf.xf_size = sizeof(xf);
4663
sx_slock(&allproc_lock);
4664
FOREACH_PROC_IN_SYSTEM(p) {
4665
PROC_LOCK(p);
4666
if (p->p_state == PRS_NEW) {
4667
PROC_UNLOCK(p);
4668
continue;
4669
}
4670
if (p_cansee(req->td, p) != 0) {
4671
PROC_UNLOCK(p);
4672
continue;
4673
}
4674
xf.xf_pid = p->p_pid;
4675
xf.xf_uid = p->p_ucred->cr_uid;
4676
fdp = fdhold(p);
4677
PROC_UNLOCK(p);
4678
if (fdp == NULL)
4679
continue;
4680
FILEDESC_SLOCK(fdp);
4681
if (refcount_load(&fdp->fd_refcnt) == 0)
4682
goto nextproc;
4683
FILEDESC_FOREACH_FP(fdp, n, fp) {
4684
xf.xf_fd = n;
4685
xf.xf_file = (uintptr_t)fp;
4686
xf.xf_data = (uintptr_t)fp->f_data;
4687
xf.xf_vnode = (uintptr_t)fp->f_vnode;
4688
xf.xf_type = (uintptr_t)fp->f_type;
4689
xf.xf_count = refcount_load(&fp->f_count);
4690
xf.xf_msgcount = 0;
4691
xf.xf_offset = foffset_get(fp);
4692
xf.xf_flag = fp->f_flag;
4693
error = SYSCTL_OUT(req, &xf, sizeof(xf));
4694
4695
/*
4696
* There is no need to re-check the fdtable refcount
4697
* here since the filedesc lock is not dropped in the
4698
* loop body.
4699
*/
4700
if (error != 0)
4701
break;
4702
}
4703
nextproc:
4704
FILEDESC_SUNLOCK(fdp);
4705
fddrop(fdp);
4706
if (error)
4707
break;
4708
}
4709
sx_sunlock(&allproc_lock);
4710
return (error);
4711
}
4712
4713
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
4714
0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
4715
4716
/*
 * When KINFO_FILE_SIZE is defined, fail the build if struct kinfo_file
 * does not have exactly that size.
 */
#if defined(KINFO_FILE_SIZE)
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
4719
4720
static int
4721
xlate_fflags(int fflags)
4722
{
4723
static const struct {
4724
int fflag;
4725
int kf_fflag;
4726
} fflags_table[] = {
4727
{ FAPPEND, KF_FLAG_APPEND },
4728
{ FASYNC, KF_FLAG_ASYNC },
4729
{ FFSYNC, KF_FLAG_FSYNC },
4730
{ FHASLOCK, KF_FLAG_HASLOCK },
4731
{ FNONBLOCK, KF_FLAG_NONBLOCK },
4732
{ FREAD, KF_FLAG_READ },
4733
{ FWRITE, KF_FLAG_WRITE },
4734
{ O_CREAT, KF_FLAG_CREAT },
4735
{ O_DIRECT, KF_FLAG_DIRECT },
4736
{ O_EXCL, KF_FLAG_EXCL },
4737
{ O_EXEC, KF_FLAG_EXEC },
4738
{ O_EXLOCK, KF_FLAG_EXLOCK },
4739
{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
4740
{ O_SHLOCK, KF_FLAG_SHLOCK },
4741
{ O_TRUNC, KF_FLAG_TRUNC }
4742
};
4743
unsigned int i;
4744
int kflags;
4745
4746
kflags = 0;
4747
for (i = 0; i < nitems(fflags_table); i++)
4748
if (fflags & fflags_table[i].fflag)
4749
kflags |= fflags_table[i].kf_fflag;
4750
return (kflags);
4751
}
4752
4753
/* Trim unused data from kf_path by truncating the structure size. */
4754
void
4755
pack_kinfo(struct kinfo_file *kif)
4756
{
4757
4758
kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
4759
strlen(kif->kf_path) + 1;
4760
kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
4761
}
4762
4763
static void
4764
export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
4765
struct kinfo_file *kif, struct filedesc *fdp, int flags)
4766
{
4767
int error;
4768
4769
bzero(kif, sizeof(*kif));
4770
4771
/* Set a default type to allow for empty fill_kinfo() methods. */
4772
kif->kf_type = KF_TYPE_UNKNOWN;
4773
kif->kf_flags = xlate_fflags(fp->f_flag);
4774
if (rightsp != NULL)
4775
kif->kf_cap_rights = *rightsp;
4776
else
4777
cap_rights_init_zero(&kif->kf_cap_rights);
4778
kif->kf_fd = fd;
4779
kif->kf_ref_count = refcount_load(&fp->f_count);
4780
kif->kf_offset = foffset_get(fp);
4781
4782
/*
4783
* This may drop the filedesc lock, so the 'fp' cannot be
4784
* accessed after this call.
4785
*/
4786
error = fo_fill_kinfo(fp, kif, fdp);
4787
if (error == 0)
4788
kif->kf_status |= KF_ATTR_VALID;
4789
if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
4790
pack_kinfo(kif);
4791
else
4792
kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
4793
}
4794
4795
static void
4796
export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
4797
struct kinfo_file *kif, int flags)
4798
{
4799
int error;
4800
4801
bzero(kif, sizeof(*kif));
4802
4803
kif->kf_type = KF_TYPE_VNODE;
4804
error = vn_fill_kinfo_vnode(vp, kif);
4805
if (error == 0)
4806
kif->kf_status |= KF_ATTR_VALID;
4807
kif->kf_flags = xlate_fflags(fflags);
4808
cap_rights_init_zero(&kif->kf_cap_rights);
4809
kif->kf_fd = fd;
4810
kif->kf_ref_count = -1;
4811
kif->kf_offset = -1;
4812
if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
4813
pack_kinfo(kif);
4814
else
4815
kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
4816
vrele(vp);
4817
}
4818
4819
struct export_fd_buf {
4820
struct filedesc *fdp;
4821
struct pwddesc *pdp;
4822
struct sbuf *sb;
4823
ssize_t remainder;
4824
struct kinfo_file kif;
4825
int flags;
4826
};
4827
4828
static int
4829
export_kinfo_to_sb(struct export_fd_buf *efbuf)
4830
{
4831
struct kinfo_file *kif;
4832
4833
kif = &efbuf->kif;
4834
if (efbuf->remainder != -1) {
4835
if (efbuf->remainder < kif->kf_structsize)
4836
return (ENOMEM);
4837
efbuf->remainder -= kif->kf_structsize;
4838
}
4839
if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0)
4840
return (sbuf_error(efbuf->sb));
4841
return (0);
4842
}
4843
4844
static int
4845
export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
4846
struct export_fd_buf *efbuf)
4847
{
4848
int error;
4849
4850
if (efbuf->remainder == 0)
4851
return (ENOMEM);
4852
export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
4853
efbuf->flags);
4854
FILEDESC_SUNLOCK(efbuf->fdp);
4855
error = export_kinfo_to_sb(efbuf);
4856
FILEDESC_SLOCK(efbuf->fdp);
4857
return (error);
4858
}
4859
4860
static int
4861
export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
4862
struct export_fd_buf *efbuf)
4863
{
4864
int error;
4865
4866
if (efbuf->remainder == 0)
4867
return (ENOMEM);
4868
if (efbuf->pdp != NULL)
4869
PWDDESC_XUNLOCK(efbuf->pdp);
4870
export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
4871
error = export_kinfo_to_sb(efbuf);
4872
if (efbuf->pdp != NULL)
4873
PWDDESC_XLOCK(efbuf->pdp);
4874
return (error);
4875
}
4876
4877
/*
4878
* Store a process file descriptor information to sbuf.
4879
*
4880
* Takes a locked proc as argument, and returns with the proc unlocked.
4881
*/
4882
int
4883
kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
4884
int flags)
4885
{
4886
struct file *fp;
4887
struct filedesc *fdp;
4888
struct pwddesc *pdp;
4889
struct export_fd_buf *efbuf;
4890
struct vnode *cttyvp, *textvp, *tracevp;
4891
struct pwd *pwd;
4892
int error, i;
4893
cap_rights_t rights;
4894
4895
PROC_LOCK_ASSERT(p, MA_OWNED);
4896
4897
/* ktrace vnode */
4898
tracevp = ktr_get_tracevp(p, true);
4899
/* text vnode */
4900
textvp = p->p_textvp;
4901
if (textvp != NULL)
4902
vrefact(textvp);
4903
/* Controlling tty. */
4904
cttyvp = NULL;
4905
if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
4906
cttyvp = p->p_pgrp->pg_session->s_ttyvp;
4907
if (cttyvp != NULL)
4908
vrefact(cttyvp);
4909
}
4910
fdp = fdhold(p);
4911
pdp = pdhold(p);
4912
PROC_UNLOCK(p);
4913
4914
efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
4915
efbuf->fdp = NULL;
4916
efbuf->pdp = NULL;
4917
efbuf->sb = sb;
4918
efbuf->remainder = maxlen;
4919
efbuf->flags = flags;
4920
4921
error = 0;
4922
if (tracevp != NULL)
4923
error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE,
4924
FREAD | FWRITE, efbuf);
4925
if (error == 0 && textvp != NULL)
4926
error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD,
4927
efbuf);
4928
if (error == 0 && cttyvp != NULL)
4929
error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY,
4930
FREAD | FWRITE, efbuf);
4931
if (error != 0 || pdp == NULL || fdp == NULL)
4932
goto fail;
4933
efbuf->fdp = fdp;
4934
efbuf->pdp = pdp;
4935
PWDDESC_XLOCK(pdp);
4936
pwd = pwd_hold_pwddesc(pdp);
4937
if (pwd != NULL) {
4938
/* working directory */
4939
if (pwd->pwd_cdir != NULL) {
4940
vrefact(pwd->pwd_cdir);
4941
error = export_vnode_to_sb(pwd->pwd_cdir,
4942
KF_FD_TYPE_CWD, FREAD, efbuf);
4943
}
4944
/* root directory */
4945
if (error == 0 && pwd->pwd_rdir != NULL) {
4946
vrefact(pwd->pwd_rdir);
4947
error = export_vnode_to_sb(pwd->pwd_rdir,
4948
KF_FD_TYPE_ROOT, FREAD, efbuf);
4949
}
4950
/* jail directory */
4951
if (error == 0 && pwd->pwd_jdir != NULL) {
4952
vrefact(pwd->pwd_jdir);
4953
error = export_vnode_to_sb(pwd->pwd_jdir,
4954
KF_FD_TYPE_JAIL, FREAD, efbuf);
4955
}
4956
}
4957
PWDDESC_XUNLOCK(pdp);
4958
if (error != 0)
4959
goto fail;
4960
if (pwd != NULL)
4961
pwd_drop(pwd);
4962
FILEDESC_SLOCK(fdp);
4963
if (refcount_load(&fdp->fd_refcnt) == 0)
4964
goto skip;
4965
FILEDESC_FOREACH_FP(fdp, i, fp) {
4966
#ifdef CAPABILITIES
4967
rights = *cap_rights(fdp, i);
4968
#else /* !CAPABILITIES */
4969
rights = cap_no_rights;
4970
#endif
4971
/*
4972
* Create sysctl entry. It is OK to drop the filedesc
4973
* lock inside of export_file_to_sb() as we will
4974
* re-validate and re-evaluate its properties when the
4975
* loop continues.
4976
*/
4977
error = export_file_to_sb(fp, i, &rights, efbuf);
4978
if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
4979
break;
4980
}
4981
skip:
4982
FILEDESC_SUNLOCK(fdp);
4983
fail:
4984
if (fdp != NULL)
4985
fddrop(fdp);
4986
if (pdp != NULL)
4987
pddrop(pdp);
4988
free(efbuf, M_TEMP);
4989
return (error);
4990
}
4991
4992
#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5)
4993
4994
/*
4995
* Get per-process file descriptors for use by procstat(1), et al.
4996
*/
4997
static int
4998
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
4999
{
5000
struct sbuf sb;
5001
struct proc *p;
5002
ssize_t maxlen;
5003
u_int namelen;
5004
int error, error2, *name;
5005
5006
namelen = arg2;
5007
if (namelen != 1)
5008
return (EINVAL);
5009
5010
name = (int *)arg1;
5011
5012
sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
5013
sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
5014
error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
5015
if (error != 0) {
5016
sbuf_delete(&sb);
5017
return (error);
5018
}
5019
maxlen = req->oldptr != NULL ? req->oldlen : -1;
5020
error = kern_proc_filedesc_out(p, &sb, maxlen,
5021
KERN_FILEDESC_PACK_KINFO);
5022
error2 = sbuf_finish(&sb);
5023
sbuf_delete(&sb);
5024
return (error != 0 ? error : error2);
5025
}
5026
5027
#ifdef COMPAT_FREEBSD7
5028
/* When KINFO_OFILE_SIZE is defined, the legacy record must be that size. */
#if defined(KINFO_OFILE_SIZE)
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif
5031
5032
static void
5033
kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
5034
{
5035
5036
okif->kf_structsize = sizeof(*okif);
5037
okif->kf_type = kif->kf_type;
5038
okif->kf_fd = kif->kf_fd;
5039
okif->kf_ref_count = kif->kf_ref_count;
5040
okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
5041
KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
5042
KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
5043
okif->kf_offset = kif->kf_offset;
5044
if (kif->kf_type == KF_TYPE_VNODE)
5045
okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
5046
else
5047
okif->kf_vnode_type = KF_VTYPE_VNON;
5048
strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
5049
if (kif->kf_type == KF_TYPE_SOCKET) {
5050
okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
5051
okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
5052
okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
5053
okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
5054
okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
5055
} else {
5056
okif->kf_sa_local.ss_family = AF_UNSPEC;
5057
okif->kf_sa_peer.ss_family = AF_UNSPEC;
5058
}
5059
}
5060
5061
static int
5062
export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
5063
struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
5064
{
5065
int error;
5066
5067
vrefact(vp);
5068
PWDDESC_XUNLOCK(pdp);
5069
export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
5070
kinfo_to_okinfo(kif, okif);
5071
error = SYSCTL_OUT(req, okif, sizeof(*okif));
5072
PWDDESC_XLOCK(pdp);
5073
return (error);
5074
}
5075
5076
/*
5077
* Get per-process file descriptors for use by procstat(1), et al.
5078
*/
5079
static int
5080
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
5081
{
5082
struct kinfo_ofile *okif;
5083
struct kinfo_file *kif;
5084
struct filedesc *fdp;
5085
struct pwddesc *pdp;
5086
struct pwd *pwd;
5087
u_int namelen;
5088
int error, i, *name;
5089
struct file *fp;
5090
struct proc *p;
5091
5092
namelen = arg2;
5093
if (namelen != 1)
5094
return (EINVAL);
5095
5096
name = (int *)arg1;
5097
error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
5098
if (error != 0)
5099
return (error);
5100
fdp = fdhold(p);
5101
if (fdp != NULL)
5102
pdp = pdhold(p);
5103
PROC_UNLOCK(p);
5104
if (fdp == NULL || pdp == NULL) {
5105
if (fdp != NULL)
5106
fddrop(fdp);
5107
return (ENOENT);
5108
}
5109
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
5110
okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
5111
PWDDESC_XLOCK(pdp);
5112
pwd = pwd_hold_pwddesc(pdp);
5113
if (pwd != NULL) {
5114
if (pwd->pwd_cdir != NULL)
5115
export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
5116
okif, pdp, req);
5117
if (pwd->pwd_rdir != NULL)
5118
export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
5119
okif, pdp, req);
5120
if (pwd->pwd_jdir != NULL)
5121
export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
5122
okif, pdp, req);
5123
}
5124
PWDDESC_XUNLOCK(pdp);
5125
if (pwd != NULL)
5126
pwd_drop(pwd);
5127
FILEDESC_SLOCK(fdp);
5128
if (refcount_load(&fdp->fd_refcnt) == 0)
5129
goto skip;
5130
FILEDESC_FOREACH_FP(fdp, i, fp) {
5131
export_file_to_kinfo(fp, i, NULL, kif, fdp,
5132
KERN_FILEDESC_PACK_KINFO);
5133
FILEDESC_SUNLOCK(fdp);
5134
kinfo_to_okinfo(kif, okif);
5135
error = SYSCTL_OUT(req, okif, sizeof(*okif));
5136
FILEDESC_SLOCK(fdp);
5137
if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
5138
break;
5139
}
5140
skip:
5141
FILEDESC_SUNLOCK(fdp);
5142
fddrop(fdp);
5143
pddrop(pdp);
5144
free(kif, M_TEMP);
5145
free(okif, M_TEMP);
5146
return (0);
5147
}
5148
5149
static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
5150
CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
5151
"Process ofiledesc entries");
5152
#endif /* COMPAT_FREEBSD7 */
5153
5154
int
5155
vntype_to_kinfo(int vtype)
5156
{
5157
struct {
5158
int vtype;
5159
int kf_vtype;
5160
} vtypes_table[] = {
5161
{ VBAD, KF_VTYPE_VBAD },
5162
{ VBLK, KF_VTYPE_VBLK },
5163
{ VCHR, KF_VTYPE_VCHR },
5164
{ VDIR, KF_VTYPE_VDIR },
5165
{ VFIFO, KF_VTYPE_VFIFO },
5166
{ VLNK, KF_VTYPE_VLNK },
5167
{ VNON, KF_VTYPE_VNON },
5168
{ VREG, KF_VTYPE_VREG },
5169
{ VSOCK, KF_VTYPE_VSOCK }
5170
};
5171
unsigned int i;
5172
5173
/*
5174
* Perform vtype translation.
5175
*/
5176
for (i = 0; i < nitems(vtypes_table); i++)
5177
if (vtypes_table[i].vtype == vtype)
5178
return (vtypes_table[i].kf_vtype);
5179
5180
return (KF_VTYPE_UNKNOWN);
5181
}
5182
5183
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
5184
CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
5185
"Process filedesc entries");
5186
5187
/*
5188
* Store a process current working directory information to sbuf.
5189
*
5190
* Takes a locked proc as argument, and returns with the proc unlocked.
5191
*/
5192
int
5193
kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
5194
{
5195
struct pwddesc *pdp;
5196
struct pwd *pwd;
5197
struct export_fd_buf *efbuf;
5198
struct vnode *cdir;
5199
int error;
5200
5201
PROC_LOCK_ASSERT(p, MA_OWNED);
5202
5203
pdp = pdhold(p);
5204
PROC_UNLOCK(p);
5205
if (pdp == NULL)
5206
return (EINVAL);
5207
5208
efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
5209
efbuf->fdp = NULL;
5210
efbuf->pdp = pdp;
5211
efbuf->sb = sb;
5212
efbuf->remainder = maxlen;
5213
efbuf->flags = 0;
5214
5215
PWDDESC_XLOCK(pdp);
5216
pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
5217
cdir = pwd->pwd_cdir;
5218
if (cdir == NULL) {
5219
error = EINVAL;
5220
} else {
5221
vrefact(cdir);
5222
error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
5223
}
5224
PWDDESC_XUNLOCK(pdp);
5225
pddrop(pdp);
5226
free(efbuf, M_TEMP);
5227
return (error);
5228
}
5229
5230
/*
5231
* Get per-process current working directory.
5232
*/
5233
static int
5234
sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
5235
{
5236
struct sbuf sb;
5237
struct proc *p;
5238
ssize_t maxlen;
5239
u_int namelen;
5240
int error, error2, *name;
5241
5242
namelen = arg2;
5243
if (namelen != 1)
5244
return (EINVAL);
5245
5246
name = (int *)arg1;
5247
5248
sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
5249
sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
5250
error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
5251
if (error != 0) {
5252
sbuf_delete(&sb);
5253
return (error);
5254
}
5255
maxlen = req->oldptr != NULL ? req->oldlen : -1;
5256
error = kern_proc_cwd_out(p, &sb, maxlen);
5257
error2 = sbuf_finish(&sb);
5258
sbuf_delete(&sb);
5259
return (error != 0 ? error : error2);
5260
}
5261
5262
/* Read-only, MP-safe node under _kern_proc served by the cwd handler. */
static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd,
    CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_cwd,
    "Process current working directory");
5264
5265
#ifdef DDB
5266
/*
5267
* For the purposes of debugging, generate a human-readable string for the
5268
* file type.
5269
*/
5270
static const char *
5271
file_type_to_name(short type)
5272
{
5273
5274
switch (type) {
5275
case 0:
5276
return ("zero");
5277
case DTYPE_VNODE:
5278
return ("vnode");
5279
case DTYPE_SOCKET:
5280
return ("socket");
5281
case DTYPE_PIPE:
5282
return ("pipe");
5283
case DTYPE_FIFO:
5284
return ("fifo");
5285
case DTYPE_KQUEUE:
5286
return ("kqueue");
5287
case DTYPE_CRYPTO:
5288
return ("crypto");
5289
case DTYPE_MQUEUE:
5290
return ("mqueue");
5291
case DTYPE_SHM:
5292
return ("shm");
5293
case DTYPE_SEM:
5294
return ("ksem");
5295
case DTYPE_PTS:
5296
return ("pts");
5297
case DTYPE_DEV:
5298
return ("dev");
5299
case DTYPE_PROCDESC:
5300
return ("proc");
5301
case DTYPE_EVENTFD:
5302
return ("eventfd");
5303
case DTYPE_TIMERFD:
5304
return ("timerfd");
5305
case DTYPE_JAILDESC:
5306
return ("jail");
5307
default:
5308
return ("unkn");
5309
}
5310
}
5311
5312
/*
5313
* For the purposes of debugging, identify a process (if any, perhaps one of
5314
* many) that references the passed file in its file descriptor array. Return
5315
* NULL if none.
5316
*/
5317
static struct proc *
5318
file_to_first_proc(struct file *fp)
5319
{
5320
struct filedesc *fdp;
5321
struct proc *p;
5322
int n;
5323
5324
FOREACH_PROC_IN_SYSTEM(p) {
5325
if (p->p_state == PRS_NEW)
5326
continue;
5327
fdp = p->p_fd;
5328
if (fdp == NULL)
5329
continue;
5330
for (n = 0; n < fdp->fd_nfiles; n++) {
5331
if (fp == fdp->fd_ofiles[n].fde_file)
5332
return (p);
5333
}
5334
}
5335
return (NULL);
5336
}
5337
5338
static void
5339
db_print_file(struct file *fp, int header)
5340
{
5341
#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
5342
struct proc *p;
5343
5344
if (header)
5345
db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
5346
XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
5347
"GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
5348
"FCmd");
5349
p = file_to_first_proc(fp);
5350
db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
5351
fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
5352
fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode,
5353
p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
5354
5355
#undef XPTRWIDTH
5356
}
5357
5358
DB_SHOW_COMMAND(file, db_show_file)
5359
{
5360
struct file *fp;
5361
5362
if (!have_addr) {
5363
db_printf("usage: show file <addr>\n");
5364
return;
5365
}
5366
fp = (struct file *)addr;
5367
db_print_file(fp, 1);
5368
}
5369
5370
DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE)
5371
{
5372
struct filedesc *fdp;
5373
struct file *fp;
5374
struct proc *p;
5375
int header;
5376
int n;
5377
5378
header = 1;
5379
FOREACH_PROC_IN_SYSTEM(p) {
5380
if (p->p_state == PRS_NEW)
5381
continue;
5382
if ((fdp = p->p_fd) == NULL)
5383
continue;
5384
for (n = 0; n < fdp->fd_nfiles; ++n) {
5385
if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
5386
continue;
5387
db_print_file(fp, header);
5388
header = 0;
5389
}
5390
}
5391
}
5392
#endif
5393
5394
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc,
5395
CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
5396
&maxfilesperproc, 0, "Maximum files allowed open per process");
5397
5398
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RWTUN | CTLFLAG_NOFETCH,
5399
&maxfiles, 0, "Maximum number of files");
5400
5401
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
5402
&openfiles, 0, "System-wide number of open files");
5403
5404
/* ARGSUSED*/
5405
static void
5406
filelistinit(void *dummy)
5407
{
5408
5409
file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
5410
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
5411
filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
5412
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
5413
pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
5414
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
5415
/*
5416
* XXXMJG this is a temporary hack due to boot ordering issues against
5417
* the vnode zone.
5418
*/
5419
vfs_smr = uma_zone_get_smr(pwd_zone);
5420
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
5421
}
5422
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
5423
5424
/*-------------------------------------------------------------------*/
5425
5426
/*
 * Read/write handler of badfileops (used for both fo_read and fo_write):
 * ignores its arguments and always fails with EBADF.
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	return (EBADF);
}
5433
5434
/*
 * Truncate handler of badfileops: ignores its arguments and always fails
 * with EINVAL.
 */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
5441
5442
static int
5443
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
5444
struct thread *td)
5445
{
5446
5447
return (EBADF);
5448
}
5449
5450
/*
 * Poll handler of badfileops: ignores its arguments and reports no events
 * (returns 0).
 */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (0);
}
5457
5458
/*
 * Kqueue filter handler of badfileops: ignores its arguments and always
 * fails with EBADF.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
	return (EBADF);
}
5464
5465
/*
 * Stat handler of badfileops: ignores its arguments, leaves *sb untouched
 * and always fails with EBADF.
 */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	return (EBADF);
}
5471
5472
/*
 * Close handler of badfileops: nothing to release, so it ignores its
 * arguments and returns 0.
 */
static int
badfo_close(struct file *fp, struct thread *td)
{
	return (0);
}
5478
5479
/*
 * Chmod handler of badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	return (EBADF);
}
5486
5487
/*
 * Chown handler of badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	return (EBADF);
}
5494
5495
/*
 * Sendfile handler of badfileops: ignores its arguments, does not touch
 * *sent and always fails with EBADF.
 */
static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{
	return (EBADF);
}
5503
5504
/*
 * Kinfo handler of badfileops: adds nothing to *kif and reports success
 * (returns 0).
 */
static int
badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	return (0);
}
5510
5511
const struct fileops badfileops = {
5512
.fo_read = badfo_readwrite,
5513
.fo_write = badfo_readwrite,
5514
.fo_truncate = badfo_truncate,
5515
.fo_ioctl = badfo_ioctl,
5516
.fo_poll = badfo_poll,
5517
.fo_kqfilter = badfo_kqfilter,
5518
.fo_stat = badfo_stat,
5519
.fo_close = badfo_close,
5520
.fo_chmod = badfo_chmod,
5521
.fo_chown = badfo_chown,
5522
.fo_sendfile = badfo_sendfile,
5523
.fo_fill_kinfo = badfo_fill_kinfo,
5524
};
5525
5526
/*
 * Poll handler of path_fileops: ignores its arguments and always reports
 * POLLNVAL.
 */
static int
path_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (POLLNVAL);
}
5532
5533
static int
5534
path_close(struct file *fp, struct thread *td)
5535
{
5536
MPASS(fp->f_type == DTYPE_VNODE);
5537
fp->f_ops = &badfileops;
5538
vrele(fp->f_vnode);
5539
return (0);
5540
}
5541
5542
const struct fileops path_fileops = {
5543
.fo_read = badfo_readwrite,
5544
.fo_write = badfo_readwrite,
5545
.fo_truncate = badfo_truncate,
5546
.fo_ioctl = badfo_ioctl,
5547
.fo_poll = path_poll,
5548
.fo_kqfilter = vn_kqfilter_opath,
5549
.fo_stat = vn_statfile,
5550
.fo_close = path_close,
5551
.fo_chmod = badfo_chmod,
5552
.fo_chown = badfo_chown,
5553
.fo_sendfile = badfo_sendfile,
5554
.fo_fill_kinfo = vn_fill_kinfo,
5555
.fo_cmp = vn_cmp,
5556
.fo_flags = DFLAG_PASSABLE,
5557
};
5558
5559
/*
 * Generic read/write stub: ignores its arguments and always fails with
 * EOPNOTSUPP.
 */
int
invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	return (EOPNOTSUPP);
}
5566
5567
/*
 * Generic truncate stub: ignores its arguments and always fails with
 * EINVAL.
 */
int
invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
5574
5575
int
5576
invfo_ioctl(struct file *fp, u_long com, void *data,
5577
struct ucred *active_cred, struct thread *td)
5578
{
5579
5580
return (ENOTTY);
5581
}
5582
5583
/*
 * Generic poll stub: hands the requested events to poll_no_poll() and
 * returns its result; the other arguments are ignored.
 */
int
invfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (poll_no_poll(events));
}
5590
5591
/*
 * Generic kqueue filter stub: ignores its arguments and always fails with
 * EINVAL.
 */
int
invfo_kqfilter(struct file *fp, struct knote *kn)
{
	return (EINVAL);
}
5597
5598
/*
 * Generic chmod stub: ignores its arguments and always fails with EINVAL.
 */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
5605
5606
/*
 * Generic chown stub: ignores its arguments and always fails with EINVAL.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
5613
5614
/*
 * Generic sendfile stub: ignores its arguments, does not touch *sent and
 * always fails with EINVAL.
 */
int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    struct thread *td)
{
	return (EINVAL);
}
5622
5623
/*-------------------------------------------------------------------*/
5624
5625
/*
5626
* File Descriptor pseudo-device driver (/dev/fd/).
5627
*
5628
* Opening minor device N dup()s the file (if any) connected to file
5629
* descriptor N belonging to the calling process. Note that this driver
5630
* consists of only the ``open()'' routine, because all subsequent
5631
* references to this file will be direct to the other driver.
5632
*
5633
* XXX: we could give this one a cloning event handler if necessary.
5634
*/
5635
5636
/* ARGSUSED */
5637
static int
5638
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
5639
{
5640
5641
/*
5642
* XXX Kludge: set curthread->td_dupfd to contain the value of the
5643
* the file descriptor being sought for duplication. The error
5644
* return ensures that the vnode for this device will be released
5645
* by vn_open. Open will detect this special error and take the
5646
* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
5647
* will simply report the error.
5648
*/
5649
td->td_dupfd = dev2unit(dev);
5650
return (ENODEV);
5651
}
5652
5653
static struct cdevsw fildesc_cdevsw = {
5654
.d_version = D_VERSION,
5655
.d_open = fdopen,
5656
.d_name = "FD",
5657
};
5658
5659
static void
5660
fildesc_drvinit(void *unused)
5661
{
5662
struct cdev *dev;
5663
5664
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
5665
UID_ROOT, GID_WHEEL, 0666, "fd/0");
5666
make_dev_alias(dev, "stdin");
5667
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
5668
UID_ROOT, GID_WHEEL, 0666, "fd/1");
5669
make_dev_alias(dev, "stdout");
5670
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
5671
UID_ROOT, GID_WHEEL, 0666, "fd/2");
5672
make_dev_alias(dev, "stderr");
5673
}
5674
5675
SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
5676
5677