GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/kern/kern_fork.c
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_ktrace.h"
#include "opt_kstack_pages.h"

#define	EXTERR_CATEGORY	EXTERR_CAT_FORK
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/bitstring.h>
#include <sys/eventhandler.h>
#include <sys/exterrvar.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/ptrace.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/unistd.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_fork_func_t	dtrace_fasttrap_fork;
#endif

SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int");

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int	dummy;
};
#endif

/* ARGSUSED */
int
sys_fork(struct thread *td, struct fork_args *uap)
{
	struct fork_req fr;
	int error, pid;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC;
	fr.fr_pidp = &pid;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
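/*
 * Illustrative userland sketch (not part of this file): the caller's view
 * of the interface sys_fork() implements.  fork(2) returns the child's PID
 * in the parent and 0 in the child, matching the td_retval handling above.
 *
 *	#include <sys/types.h>
 *	#include <sys/wait.h>
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		pid_t pid;
 *		int status;
 *
 *		pid = fork();
 *		if (pid == -1)
 *			err(1, "fork");
 *		if (pid == 0)
 *			_exit(42);			// child
 *		if (waitpid(pid, &status, 0) == -1)	// parent
 *			err(1, "waitpid");
 *		return (WEXITSTATUS(status) == 42 ? 0 : 1);
 *	}
 */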

/* ARGSUSED */
int
sys_pdfork(struct thread *td, struct pdfork_args *uap)
{
	struct fork_req fr;
	int error, fd, pid;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC | RFPROCDESC;
	fr.fr_pidp = &pid;
	fr.fr_pd_fd = &fd;
	fr.fr_pd_flags = uap->flags;
	AUDIT_ARG_FFLAGS(uap->flags);
	/*
	 * It is necessary to return fd by reference because 0 is a valid file
	 * descriptor number, and the child needs to be able to distinguish
	 * itself from the parent using the return value.
	 */
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
		error = copyout(&fd, uap->fdp, sizeof(fd));
	}
	return (error);
}
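/*
 * Illustrative userland sketch: pdfork(2) pairs the RFFDG | RFPROC |
 * RFPROCDESC request above with a process descriptor instead of
 * SIGCHLD-based reaping; the parent can poll the descriptor for child
 * exit.  Assumes only the documented <sys/procdesc.h> and poll(2)
 * interfaces.
 *
 *	#include <sys/procdesc.h>
 *	#include <err.h>
 *	#include <poll.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct pollfd pfd;
 *		pid_t pid;
 *		int fd;
 *
 *		pid = pdfork(&fd, 0);
 *		if (pid == -1)
 *			err(1, "pdfork");
 *		if (pid == 0)
 *			_exit(0);		// child
 *		pfd.fd = fd;			// parent: wait via descriptor
 *		pfd.events = POLLHUP;
 *		if (poll(&pfd, 1, -1) == -1)
 *			err(1, "poll");
 *		close(fd);	// last close kills the child if still alive
 *		return (0);
 *	}
 */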

/* ARGSUSED */
int
sys_vfork(struct thread *td, struct vfork_args *uap)
{
	struct fork_req fr;
	int error, pid;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
	fr.fr_pidp = &pid;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
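/*
 * Illustrative userland sketch: vfork(2) is fork with RFPPWAIT | RFMEM, so
 * the child borrows the parent's address space and the parent is held (see
 * ast_vfork() below) until the child calls execve(2) or _exit(2).  The
 * child therefore must do essentially nothing else.
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		pid_t pid;
 *
 *		pid = vfork();
 *		if (pid == -1)
 *			err(1, "vfork");
 *		if (pid == 0) {
 *			execl("/bin/date", "date", (char *)NULL);
 *			_exit(127);	// exec failed; do not return
 *		}
 *		return (0);	// parent resumes only after exec/_exit above
 *	}
 */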

int
sys_rfork(struct thread *td, struct rfork_args *uap)
{
	struct fork_req fr;
	int error, pid;

	/* Don't allow kernel-only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
		return (EXTERROR(EINVAL, "Kernel-only flags %#jx", uap->flags));
	/* RFSPAWN must not appear with others. */
	if ((uap->flags & RFSPAWN) != 0 && uap->flags != RFSPAWN)
		return (EXTERROR(EINVAL, "RFSPAWN must be the only flag %#jx",
		    uap->flags));

	AUDIT_ARG_FFLAGS(uap->flags);
	bzero(&fr, sizeof(fr));
	if ((uap->flags & RFSPAWN) != 0) {
		fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
		fr.fr_flags2 = FR2_DROPSIG_CAUGHT;
	} else {
		fr.fr_flags = uap->flags;
	}
	fr.fr_pidp = &pid;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
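/*
 * Illustrative userland sketch: rfork(2) exposes the RF* flags directly.
 * Address-space sharing via RFMEM normally needs rfork_thread(3)'s
 * separate stack, so this sketch uses RFCFDG, which hands the child a
 * clean descriptor table instead of a copy.
 *
 *	#include <sys/wait.h>
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		pid_t pid;
 *
 *		pid = rfork(RFPROC | RFCFDG);
 *		if (pid == -1)
 *			err(1, "rfork");
 *		if (pid == 0) {
 *			// stdin/stdout/stderr are closed here
 *			_exit(0);
 *		}
 *		if (waitpid(pid, NULL, 0) == -1)
 *			err(1, "waitpid");
 *		return (0);
 *	}
 */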

int
sys_pdrfork(struct thread *td, struct pdrfork_args *uap)
{
	struct fork_req fr;
	int error, fd, pid;

	bzero(&fr, sizeof(fr));
	fd = -1;

	AUDIT_ARG_FFLAGS(uap->pdflags);
	AUDIT_ARG_CMD(uap->rfflags);

	if ((uap->rfflags & (RFSTOPPED | RFHIGHPID)) != 0)
		return (EXTERROR(EINVAL,
		    "Kernel-only flags %#jx", uap->rfflags));

	/* RFSPAWN must not appear with others. */
	if ((uap->rfflags & RFSPAWN) != 0) {
		if (uap->rfflags != RFSPAWN)
			return (EXTERROR(EINVAL,
			    "RFSPAWN must be the only flag %#jx",
			    uap->rfflags));
		fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPROCDESC;
		fr.fr_flags2 = FR2_DROPSIG_CAUGHT;
	} else {
		fr.fr_flags = uap->rfflags;
	}

	fr.fr_pidp = &pid;
	fr.fr_pd_fd = &fd;
	fr.fr_pd_flags = uap->pdflags;
	error = fork1(td, &fr);
	if (error == 0) {
		td->td_retval[0] = pid;
		td->td_retval[1] = 0;
		if ((fr.fr_flags & (RFPROC | RFPROCDESC)) ==
		    (RFPROC | RFPROCDESC) || uap->rfflags == RFSPAWN)
			error = copyout(&fd, uap->fdp, sizeof(fd));
	}
	return (error);
}

int __exclusive_cache_line nprocs = 1;		/* process 0 */
int lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
    "Last used PID");

/*
 * Random component to lastpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste randomness entropy, and don't let it be impossibly large.  Using a
 * modulus that is too big causes a LOT more process table scans and slows
 * down fork processing as the pidchecked caching is defeated.
 */
static int randompid = 0;

static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
	int error, pid;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error != 0)
		return (error);
	sx_xlock(&allproc_lock);
	pid = randompid;
	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error == 0 && req->newptr != NULL) {
		if (pid == 0)
			randompid = 0;
		else if (pid == 1)
			/* generate a random PID modulus between 100 and 1123 */
			randompid = 100 + arc4random() % 1024;
		else if (pid < 0 || pid > pid_max - 100)
			/* out of range */
			randompid = pid_max - 100;
		else if (pid < 100)
			/* Make it reasonable */
			randompid = 100;
		else
			randompid = pid;
	}
	sx_xunlock(&allproc_lock);
	return (error);
}
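/*
 * Illustrative userland sketch: the handler above backs the
 * kern.randompid sysctl.  Writing 1 asks the kernel to pick the modulus
 * itself (100 + arc4random() % 1024); the old buffer receives the
 * previous setting.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int newval = 1, oldval;
 *		size_t oldlen = sizeof(oldval);
 *
 *		if (sysctlbyname("kern.randompid", &oldval, &oldlen,
 *		    &newval, sizeof(newval)) == -1)
 *			err(1, "sysctlbyname");	// setting requires root
 *		printf("previous kern.randompid modulus: %d\n", oldval);
 *		return (0);
 *	}
 */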

SYSCTL_PROC(_kern, OID_AUTO, randompid,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_kern_randompid, "I",
    "Random PID modulus. Special values: 0: disable, 1: choose random value");

extern bitstr_t proc_id_pidmap;
extern bitstr_t proc_id_grpidmap;
extern bitstr_t proc_id_sessidmap;
extern bitstr_t proc_id_reapmap;

/*
 * Find an unused process ID.
 *
 * If RFHIGHPID is set (used during system boot), do not allocate
 * low-numbered pids.
 */
static int
fork_findpid(int flags)
{
	pid_t result;
	int trypid, random;

	/*
	 * Avoid calling arc4random with procid_lock held.
	 */
	random = 0;
	if (__predict_false(randompid))
		random = arc4random() % randompid;

	mtx_lock(&procid_lock);

	trypid = lastpid + 1;
	if (flags & RFHIGHPID) {
		if (trypid < 10)
			trypid = 10;
	} else {
		trypid += random;
	}
retry:
	if (trypid >= pid_max)
		trypid = 2;

	bit_ffc_at(&proc_id_pidmap, trypid, pid_max, &result);
	if (result == -1) {
		KASSERT(trypid != 2, ("unexpectedly ran out of IDs"));
		trypid = 2;
		goto retry;
	}
	if (bit_test(&proc_id_grpidmap, result) ||
	    bit_test(&proc_id_sessidmap, result) ||
	    bit_test(&proc_id_reapmap, result)) {
		trypid = result + 1;
		goto retry;
	}

	/*
	 * RFHIGHPID does not mess with the lastpid counter during boot.
	 */
	if ((flags & RFHIGHPID) == 0)
		lastpid = result;

	bit_set(&proc_id_pidmap, result);
	mtx_unlock(&procid_lock);

	return (result);
}
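/*
 * Illustrative userland sketch of the bitstring(3) primitives that
 * fork_findpid() relies on: bit_ffc_at() finds the first clear bit at or
 * after an offset (-1 if none), bit_test() checks a bit, and bit_set()
 * claims it.  The userland <bitstring.h> exposes the same API as the
 * kernel's <sys/bitstring.h>.
 *
 *	#include <bitstring.h>
 *	#include <stdio.h>
 *
 *	#define	MAXID	128
 *
 *	int
 *	main(void)
 *	{
 *		bitstr_t bit_decl(idmap, MAXID);
 *		int id;
 *
 *		bit_nclear(idmap, 0, MAXID - 1);	// all IDs free
 *		bit_set(idmap, 2);			// IDs 2..4 in use
 *		bit_set(idmap, 3);
 *		bit_set(idmap, 4);
 *		bit_ffc_at(idmap, 2, MAXID, &id);	// first free ID >= 2
 *		printf("allocated id %d\n", id);	// prints 5
 *		bit_set(idmap, id);
 *		return (0);
 *	}
 */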

static int
fork_norfproc(struct thread *td, int flags)
{
	struct proc *p1;
	int error;

	KASSERT((flags & RFPROC) == 0,
	    ("fork_norfproc called with RFPROC set"));
	p1 = td->td_proc;

	/*
	 * Quiesce other threads if necessary.  If RFMEM is not specified we
	 * must ensure that other threads do not concurrently create a second
	 * process sharing the vmspace, see vmspace_unshare().
	 */
	if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
	    ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) {
		PROC_LOCK(p1);
		if (thread_single(p1, SINGLE_BOUNDARY)) {
			PROC_UNLOCK(p1);
			return (ERESTART);
		}
		PROC_UNLOCK(p1);
	}

	error = vm_forkproc(td, NULL, NULL, NULL, flags);
	if (error != 0)
		goto fail;

	/*
	 * Close all file descriptors.
	 */
	if ((flags & RFCFDG) != 0) {
		struct filedesc *fdtmp;
		struct pwddesc *pdtmp;

		pdtmp = pdinit(td->td_proc->p_pd, false);
		fdtmp = fdinit();
		pdescfree(td);
		fdescfree(td);
		p1->p_fd = fdtmp;
		p1->p_pd = pdtmp;
	}

	/*
	 * Unshare file descriptors (from parent).
	 */
	if ((flags & RFFDG) != 0) {
		fdunshare(td);
		pdunshare(td);
	}

fail:
	if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
	    ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) {
		PROC_LOCK(p1);
		thread_single_end(p1, SINGLE_BOUNDARY);
		PROC_UNLOCK(p1);
	}
	return (error);
}
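/*
 * Illustrative userland sketch: calling rfork(2) without RFPROC lands in
 * fork_norfproc() above and mutates the current process instead of
 * creating a new one, e.g. unsharing the descriptor table:
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		// After this call our fd table is a private copy; a sibling
 *		// we previously shared it with keeps its own view.
 *		if (rfork(RFFDG) == -1)
 *			err(1, "rfork");
 *		return (0);
 *	}
 */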

static void
do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, struct file *fp_procdesc)
{
	struct proc *p1, *pptr;
	struct filedesc *fd;
	struct filedesc_to_leader *fdtol;
	struct pwddesc *pd;
	struct sigacts *newsigacts;

	p1 = td->td_proc;

	PROC_LOCK(p1);
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    __rangeof(struct proc, p_startcopy, p_endcopy));
	pargs_hold(p2->p_args);
	PROC_UNLOCK(p1);

	bzero(&p2->p_startzero,
	    __rangeof(struct proc, p_startzero, p_endzero));

	/* Tell the prison that we exist. */
	prison_proc_hold(p2->p_ucred->cr_prison);

	p2->p_state = PRS_NEW;		/* protect against others */
	p2->p_pid = fork_findpid(fr->fr_flags);
	AUDIT_ARG_PID(p2->p_pid);
	TSFORK(p2->p_pid, p1->p_pid);

	sx_xlock(&allproc_lock);
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	allproc_gen++;
	prison_proc_link(p2->p_ucred->cr_prison, p2);
	sx_xunlock(&allproc_lock);

	sx_xlock(PIDHASHLOCK(p2->p_pid));
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
	sx_xunlock(PIDHASHLOCK(p2->p_pid));

	tidhash_add(td2);

	/*
	 * Malloc things while we don't hold any locks.
	 */
	if (fr->fr_flags & RFSIGSHARE)
		newsigacts = NULL;
	else
		newsigacts = sigacts_alloc();

	/*
	 * Copy filedesc.
	 */
	if (fr->fr_flags & RFCFDG) {
		pd = pdinit(p1->p_pd, false);
		fd = fdinit();
		fdtol = NULL;
	} else if (fr->fr_flags & RFFDG) {
		if (fr->fr_flags2 & FR2_SHARE_PATHS)
			pd = pdshare(p1->p_pd);
		else
			pd = pdcopy(p1->p_pd);
		fd = fdcopy(p1->p_fd, p2);
		fdtol = NULL;
	} else {
		if (fr->fr_flags2 & FR2_SHARE_PATHS)
			pd = pdcopy(p1->p_pd);
		else
			pd = pdshare(p1->p_pd);
		fd = fdshare(p1->p_fd);
		if (p1->p_fdtol == NULL)
			p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
			    p1->p_leader);
		if ((fr->fr_flags & RFTHREAD) != 0) {
			/*
			 * Shared file descriptor table, and shared
			 * process leaders.
			 */
			fdtol = filedesc_to_leader_share(p1->p_fdtol, p1->p_fd);
		} else {
			/*
			 * Shared file descriptor table, and different
			 * process leaders.
			 */
			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
			    p1->p_fd, p2);
		}
	}
	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */

	PROC_LOCK(p2);
	PROC_LOCK(p1);

	bzero(&td2->td_startzero,
	    __rangeof(struct thread, td_startzero, td_endzero));

	bcopy(&td->td_startcopy, &td2->td_startcopy,
	    __rangeof(struct thread, td_startcopy, td_endcopy));

	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
	td2->td_sigstk = td->td_sigstk;
	td2->td_flags = TDF_INMEM;
	td2->td_lend_user_pri = PRI_MAX;

#ifdef VIMAGE
	td2->td_vnet = NULL;
	td2->td_vnet_lpush = NULL;
#endif

	/*
	 * Allow the scheduler to initialize the child.
	 */
	thread_lock(td);
	sched_fork(td, td2);
	/*
	 * Request AST to check for TDP_RFPPWAIT.  Do it here
	 * to avoid calling thread_lock() again.
	 */
	if ((fr->fr_flags & RFPPWAIT) != 0)
		ast_sched_locked(td, TDA_VFORK);
	thread_unlock(td);

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 */
	p2->p_flag = P_INMEM;
	p2->p_flag2 = p1->p_flag2 & (P2_ASLR_DISABLE | P2_ASLR_ENABLE |
	    P2_ASLR_IGNSTART | P2_NOTRACE | P2_NOTRACE_EXEC |
	    P2_PROTMAX_ENABLE | P2_PROTMAX_DISABLE | P2_TRAPCAP |
	    P2_STKGAP_DISABLE | P2_STKGAP_DISABLE_EXEC | P2_NO_NEW_PRIVS |
	    P2_WXORX_DISABLE | P2_WXORX_ENABLE_EXEC | P2_LOGSIGEXIT_CTL |
	    P2_LOGSIGEXIT_ENABLE);
	p2->p_swtick = ticks;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);

	if (fr->fr_flags & RFSIGSHARE) {
		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
	} else {
		sigacts_copy(newsigacts, p1->p_sigacts);
		p2->p_sigacts = newsigacts;
		if ((fr->fr_flags2 & (FR2_DROPSIG_CAUGHT | FR2_KPROC)) != 0) {
			mtx_lock(&p2->p_sigacts->ps_mtx);
			if ((fr->fr_flags2 & FR2_DROPSIG_CAUGHT) != 0)
				sig_drop_caught(p2);
			if ((fr->fr_flags2 & FR2_KPROC) != 0)
				p2->p_sigacts->ps_flag |= PS_NOCLDWAIT;
			mtx_unlock(&p2->p_sigacts->ps_mtx);
		}
	}

	if (fr->fr_flags & RFTSIGZMB)
		p2->p_sigparent = RFTSIGNUM(fr->fr_flags);
	else if (fr->fr_flags & RFLINUXTHPN)
		p2->p_sigparent = SIGUSR1;
	else
		p2->p_sigparent = SIGCHLD;

	if ((fr->fr_flags2 & FR2_KPROC) != 0) {
		p2->p_flag |= P_SYSTEM | P_KPROC;
		td2->td_pflags |= TDP_KTHREAD;
	}

	p2->p_textvp = p1->p_textvp;
	p2->p_textdvp = p1->p_textdvp;
	p2->p_fd = fd;
	p2->p_fdtol = fdtol;
	p2->p_pd = pd;

	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
		p2->p_flag |= P_PROTECTED;
		p2->p_flag2 |= P2_INHERIT_PROTECTED;
	}

	/*
	 * p_limit is copy-on-write.  Bump its refcount.
	 */
	lim_fork(p1, p2);

	thread_cow_get_proc(td2, p2);

	pstats_fork(p1->p_stats, p2->p_stats);

	PROC_UNLOCK(p1);
	PROC_UNLOCK(p2);

	/*
	 * Bump references to the text vnode and directory, and copy
	 * the hardlink name.
	 */
	if (p2->p_textvp != NULL)
		vrefact(p2->p_textvp);
	if (p2->p_textdvp != NULL)
		vrefact(p2->p_textdvp);
	p2->p_binname = p1->p_binname == NULL ? NULL :
	    strdup(p1->p_binname, M_PARGS);

	/*
	 * Set up linkage for kernel based threading.
	 */
	if ((fr->fr_flags & RFTHREAD) != 0) {
		mtx_lock(&ppeers_lock);
		p2->p_peers = p1->p_peers;
		p1->p_peers = p2;
		p2->p_leader = p1->p_leader;
		mtx_unlock(&ppeers_lock);
		PROC_LOCK(p1->p_leader);
		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(p1->p_leader);
			/*
			 * The task leader is exiting, so process p1 is
			 * going to be killed shortly.  Since p1 obviously
			 * isn't dead yet, we know that the leader is either
			 * sending SIGKILL's to all the processes in this
			 * task or is sleeping waiting for all the peers to
			 * exit.  We let p1 complete the fork, but we need
			 * to go ahead and kill the new process p2 since
			 * the task leader may not get a chance to send
			 * SIGKILL to it.  We leave it on the list so that
			 * the task leader will wait for this new process
			 * to commit suicide.
			 */
			PROC_LOCK(p2);
			kern_psignal(p2, SIGKILL);
			PROC_UNLOCK(p2);
		} else
			PROC_UNLOCK(p1->p_leader);
	} else {
		p2->p_peers = NULL;
		p2->p_leader = p2;
	}

	sx_xlock(&proctree_lock);
	PGRP_LOCK(p1->p_pgrp);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & P_SUGID;
	td2->td_pflags |= td->td_pflags & (TDP_ALTSTACK | TDP_SIGFASTBLOCK);
	td2->td_pflags2 |= td->td_pflags2 & TDP2_UEXTERR;
	if (p1->p_flag & P_CONTROLT) {
		SESS_LOCK(p1->p_session);
		if (p1->p_session->s_ttyvp != NULL)
			p2->p_flag |= P_CONTROLT;
		SESS_UNLOCK(p1->p_session);
	}
	if (fr->fr_flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	p2->p_pgrp = p1->p_pgrp;
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	PGRP_UNLOCK(p1->p_pgrp);
	LIST_INIT(&p2->p_children);
	LIST_INIT(&p2->p_orphans);

	callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	_PHOLD(p1);
	PROC_UNLOCK(p1);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if ((fr->fr_flags & RFNOWAIT) != 0) {
		pptr = p1->p_reaper;
		p2->p_reaper = pptr;
	} else {
		p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ?
		    p1 : p1->p_reaper;
		pptr = p1;
	}
	p2->p_pptr = pptr;
	p2->p_oppid = pptr->p_pid;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_reaplist);
	LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
	if (p2->p_reaper == p1 && p1 != initproc) {
		p2->p_reapsubtree = p2->p_pid;
		proc_id_set_cond(PROC_ID_REAP, p2->p_pid);
	}
	sx_xunlock(&proctree_lock);

	/* Inform accounting that we have forked. */
	p2->p_acflag = AFORK;
	PROC_UNLOCK(p2);

#ifdef KTRACE
	ktrprocfork(p1, p2);
#endif

	/*
	 * Finish creating the child process.  It will return via a different
	 * execution path later.  (i.e., directly into user mode)
	 */
	vm_forkproc(td, p2, td2, vm2, fr->fr_flags);

	if (fr->fr_flags == (RFFDG | RFPROC)) {
		VM_CNT_INC(v_forks);
		VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
		VM_CNT_INC(v_vforks);
		VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (p1 == &proc0) {
		VM_CNT_INC(v_kthreads);
		VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else {
		VM_CNT_INC(v_rforks);
		VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	}

	/*
	 * Associate the process descriptor with the process before anything
	 * can happen that might cause that process to need the descriptor.
	 * However, don't do this until after fork(2) can no longer fail.
	 */
	if (fr->fr_flags & RFPROCDESC)
		procdesc_new(p2, fr->fr_pd_flags);

	/*
	 * Both processes are set up, now check if any loadable modules want
	 * to adjust anything.
	 */
	EVENTHANDLER_DIRECT_INVOKE(process_fork, p1, p2, fr->fr_flags);

	/*
	 * Set the child start time and mark the process as being complete.
	 */
	PROC_LOCK(p2);
	PROC_LOCK(p1);
	microuptime(&p2->p_stats->p_start);
	PROC_SLOCK(p2);
	p2->p_state = PRS_NORMAL;
	PROC_SUNLOCK(p2);

#ifdef KDTRACE_HOOKS
	/*
	 * Tell the DTrace fasttrap provider about the new process so that any
	 * tracepoints inherited from the parent can be removed.  We have to do
	 * this only after p_state is PRS_NORMAL since the fasttrap module will
	 * use pfind() later on.
	 */
	if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork)
		dtrace_fasttrap_fork(p1, p2);
#endif
	if (fr->fr_flags & RFPPWAIT) {
		td->td_pflags |= TDP_RFPPWAIT;
		td->td_rfppwait_p = p2;
		td->td_dbgflags |= TDB_VFORK;
	}
	PROC_UNLOCK(p2);

	/*
	 * Tell any interested parties about the new process.
	 */
	knote_fork(p1->p_klist, p2->p_pid);

	/*
	 * Now can be swapped.
	 */
	_PRELE(p1);
	PROC_UNLOCK(p1);
	SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags);

	if (fr->fr_flags & RFPROCDESC) {
		procdesc_finit(p2->p_procdesc, fp_procdesc);
		fdrop(fp_procdesc, td);
	}

	/*
	 * Speculative check for PTRACE_FORK.  PTRACE_FORK is not
	 * synced with forks in progress so it is OK if we miss it
	 * if being set atm.
	 */
	if ((p1->p_ptevents & PTRACE_FORK) != 0) {
		sx_xlock(&proctree_lock);
		PROC_LOCK(p2);

		/*
		 * p1->p_ptevents & p1->p_pptr are protected by both
		 * process and proctree locks for modifications,
		 * so owning proctree_lock allows the race-free read.
		 */
		if ((p1->p_ptevents & PTRACE_FORK) != 0) {
			/*
			 * Arrange for debugger to receive the fork event.
			 *
			 * We can report PL_FLAG_FORKED regardless of
			 * P_FOLLOWFORK settings, but it does not make sense
			 * for a runaway child.
			 */
			td->td_dbgflags |= TDB_FORK;
			td->td_dbg_forked = p2->p_pid;
			td2->td_dbgflags |= TDB_STOPATFORK;
			proc_set_traced(p2, true);
			CTR2(KTR_PTRACE,
			    "do_fork: attaching to new child pid %d: oppid %d",
			    p2->p_pid, p2->p_oppid);
			proc_reparent(p2, p1->p_pptr, false);
		}
		PROC_UNLOCK(p2);
		sx_xunlock(&proctree_lock);
	}

	racct_proc_fork_done(p2);

	if ((fr->fr_flags & RFSTOPPED) == 0) {
		if (fr->fr_pidp != NULL)
			*fr->fr_pidp = p2->p_pid;
		/*
		 * If RFSTOPPED not requested, make child runnable and
		 * add to run queue.
		 */
		thread_lock(td2);
		TD_SET_CAN_RUN(td2);
		sched_add(td2, SRQ_BORING);
	} else {
		*fr->fr_procp = p2;
	}
}
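/*
 * Illustrative kernel-side sketch (hypothetical module, not part of this
 * file): how a loadable module can observe the process_fork event invoked
 * from do_fork() above.  The handler shape follows the forklist_fn typedef
 * in <sys/eventhandler.h>.
 *
 *	#include <sys/param.h>
 *	#include <sys/systm.h>
 *	#include <sys/eventhandler.h>
 *	#include <sys/proc.h>
 *
 *	static eventhandler_tag example_fork_tag;
 *
 *	static void
 *	example_fork_hook(void *arg, struct proc *p1, struct proc *p2,
 *	    int flags)
 *	{
 *		printf("fork: %d -> %d (flags %#x)\n", p1->p_pid,
 *		    p2->p_pid, flags);
 *	}
 *
 *	// At module load:
 *	//	example_fork_tag = EVENTHANDLER_REGISTER(process_fork,
 *	//	    example_fork_hook, NULL, EVENTHANDLER_PRI_ANY);
 *	// At module unload:
 *	//	EVENTHANDLER_DEREGISTER(process_fork, example_fork_tag);
 */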

static void
ast_vfork(struct thread *td, int tda __unused)
{
	struct proc *p, *p2;

	MPASS(td->td_pflags & TDP_RFPPWAIT);

	p = td->td_proc;
	/*
	 * Preserve the synchronization semantics of vfork.  If we are
	 * waiting for the child to exec or exit, fork set P_PPWAIT on
	 * the child, and here we sleep on our proc (in case of exit).
	 *
	 * Do it after the ptracestop() above is finished, so that we do
	 * not block our debugger until the child execs or exits to
	 * finish the vfork wait.
	 */
	td->td_pflags &= ~TDP_RFPPWAIT;
	p2 = td->td_rfppwait_p;
again:
	PROC_LOCK(p2);
	while (p2->p_flag & P_PPWAIT) {
		PROC_LOCK(p);
		if (thread_suspend_check_needed()) {
			PROC_UNLOCK(p2);
			thread_suspend_check(0);
			PROC_UNLOCK(p);
			goto again;
		} else {
			PROC_UNLOCK(p);
		}
		cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz);
	}
	PROC_UNLOCK(p2);

	if (td->td_dbgflags & TDB_VFORK) {
		PROC_LOCK(p);
		if (p->p_ptevents & PTRACE_VFORK)
			ptracestop(td, SIGTRAP, NULL);
		td->td_dbgflags &= ~TDB_VFORK;
		PROC_UNLOCK(p);
	}
}

int
fork1(struct thread *td, struct fork_req *fr)
{
	struct proc *p1, *newproc;
	struct thread *td2;
	struct vmspace *vm2;
	struct ucred *cred;
	struct file *fp_procdesc;
	struct pgrp *pg;
	vm_ooffset_t mem_charged;
	int error, nprocs_new;
	static int curfail;
	static struct timeval lastfail;
	int flags, pages;
	bool killsx_locked, singlethreaded;

	flags = fr->fr_flags;
	pages = fr->fr_pages;

	if ((flags & RFSTOPPED) != 0)
		MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL);
	else
		MPASS(fr->fr_procp == NULL);

	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
		return (EXTERROR(EINVAL,
		    "Undefined or unimplemented flags %#jx", flags));

	if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
		return (EXTERROR(EINVAL,
		    "Signal value requires RFTSIGZMB", flags));

	if ((flags & (RFFDG | RFCFDG)) == (RFFDG | RFCFDG))
		return (EXTERROR(EINVAL, "Cannot copy and clear"));

	if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
		return (EXTERROR(EINVAL, "Invalid signal", RFTSIGNUM(flags)));

	if ((flags & RFPROCDESC) != 0) {
		if ((flags & RFPROC) == 0)
			return (EXTERROR(EINVAL,
			    "Cannot get a process descriptor without creating a process"));

		if (fr->fr_pd_fd == NULL)
			return (EXTERROR(EINVAL,
			    "Must provide a place to put a procdesc if creating one"));

		if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0)
			return (EXTERROR(EINVAL,
			    "Invalid pdflags at fork %#jx", fr->fr_pd_flags));
	}

	p1 = td->td_proc;

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {
		if (fr->fr_procp != NULL)
			*fr->fr_procp = NULL;
		else if (fr->fr_pidp != NULL)
			*fr->fr_pidp = 0;
		return (fork_norfproc(td, flags));
	}

	fp_procdesc = NULL;
	newproc = NULL;
	vm2 = NULL;
	killsx_locked = false;
	singlethreaded = false;

	/*
	 * Increment the nprocs resource before allocations occur.
	 * Although process entries are dynamically created, we still
	 * keep a global limit on the maximum number we will
	 * create.  There are hard-limits as to the number of processes
	 * that can run, established by the KVA and memory usage for
	 * the process data.
	 *
	 * Don't allow a nonprivileged user to use the last ten
	 * processes; don't let root exceed the limit.
	 */
	nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
	if (nprocs_new >= maxproc - 10) {
		if (priv_check_cred(td->td_ucred, PRIV_MAXPROC) != 0 ||
		    nprocs_new >= maxproc) {
			error = EAGAIN;
			sx_xlock(&allproc_lock);
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				printf("maxproc limit exceeded by uid %u "
				    "(pid %d); see tuning(7) and "
				    "login.conf(5)\n",
				    td->td_ucred->cr_ruid, p1->p_pid);
			}
			sx_xunlock(&allproc_lock);
			goto fail2;
		}
	}

	/*
	 * If we are possibly multi-threaded, and there is a process
	 * sending a signal to our group right now, ensure that our
	 * other threads cannot be chosen for the signal queueing.
	 * Otherwise, this might delay signal action, and make the new
	 * child escape the signaling.
	 */
	pg = p1->p_pgrp;
	if (p1->p_numthreads > 1) {
		if (sx_try_slock(&pg->pg_killsx) != 0) {
			killsx_locked = true;
		} else {
			PROC_LOCK(p1);
			if (thread_single(p1, SINGLE_BOUNDARY)) {
				PROC_UNLOCK(p1);
				error = ERESTART;
				goto fail2;
			}
			PROC_UNLOCK(p1);
			singlethreaded = true;
		}
	}

	/*
	 * Atomically check for signals and block processes from sending
	 * a signal to our process group until the child is visible.
	 */
	if (!killsx_locked && sx_slock_sig(&pg->pg_killsx) != 0) {
		error = ERESTART;
		goto fail2;
	}
	if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0)) {
		/*
		 * Either the process was moved to another process
		 * group, or there is a pending signal.  sx_slock_sig()
		 * does not check for signals if not sleeping for the
		 * lock.
		 */
		sx_sunlock(&pg->pg_killsx);
		killsx_locked = false;
		error = ERESTART;
		goto fail2;
	} else {
		killsx_locked = true;
	}

	/*
	 * If required, create a process descriptor in the parent first; we
	 * will abandon it if something goes wrong.  We don't finit() until
	 * later.
	 */
	if (flags & RFPROCDESC) {
		error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd,
		    fr->fr_pd_flags, fr->fr_pd_fcaps);
		if (error != 0)
			goto fail2;
		AUDIT_ARG_FD(*fr->fr_pd_fd);
	}

	mem_charged = 0;
	if (pages == 0)
		pages = kstack_pages;
	/* Allocate new proc. */
	newproc = uma_zalloc(proc_zone, M_WAITOK);
	td2 = FIRST_THREAD_IN_PROC(newproc);
	if (td2 == NULL) {
		td2 = thread_alloc(pages);
		if (td2 == NULL) {
			error = ENOMEM;
			goto fail2;
		}
		proc_linkup(newproc, td2);
	} else {
		error = thread_recycle(td2, pages);
		if (error != 0)
			goto fail2;
	}

	if ((flags & RFMEM) == 0) {
		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
		if (vm2 == NULL) {
			error = ENOMEM;
			goto fail2;
		}
		if (!swap_reserve(mem_charged)) {
			/*
			 * The swap reservation failed.  The accounting
			 * from the entries of the copied vm2 will be
			 * subtracted in vmspace_free(), so force the
			 * reservation there.
			 */
			swap_reserve_force(mem_charged);
			error = ENOMEM;
			goto fail2;
		}
	} else
		vm2 = NULL;

	/*
	 * XXX: This is ugly; when we copy resource usage, we need to bump
	 * per-cred resource counters.
	 */
	newproc->p_ucred = crcowget(td->td_ucred);

	/*
	 * Initialize resource accounting for the child process.
	 */
	error = racct_proc_fork(p1, newproc);
	if (error != 0) {
		error = EAGAIN;
		goto fail1;
	}

#ifdef MAC
	mac_proc_init(newproc);
#endif

	/*
	 * Increment the count of procs running with this uid.  Don't allow
	 * a nonprivileged user to exceed their current limit.
	 */
	cred = td->td_ucred;
	if (!chgproccnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC))) {
		if (priv_check_cred(cred, PRIV_PROC_LIMIT) != 0)
			goto fail0;
		chgproccnt(cred->cr_ruidinfo, 1, 0);
	}

	newproc->p_klist = knlist_alloc(&newproc->p_mtx);

	do_fork(td, fr, newproc, td2, vm2, fp_procdesc);
	error = 0;
	goto cleanup;
fail0:
	error = EAGAIN;
#ifdef MAC
	mac_proc_destroy(newproc);
#endif
	racct_proc_exit(newproc);
fail1:
	proc_unset_cred(newproc, false);
fail2:
	if (vm2 != NULL)
		vmspace_free(vm2);
	uma_zfree(proc_zone, newproc);
	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
		fdclose(td, fp_procdesc, *fr->fr_pd_fd);
		fdrop(fp_procdesc, td);
	}
	atomic_add_int(&nprocs, -1);
cleanup:
	if (killsx_locked)
		sx_sunlock(&pg->pg_killsx);
	if (singlethreaded) {
		PROC_LOCK(p1);
		thread_single_end(p1, SINGLE_BOUNDARY);
		PROC_UNLOCK(p1);
	}
	if (error != 0)
		pause("fork", hz / 2);
	return (error);
}
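/*
 * Illustrative in-kernel sketch (hypothetical caller, not part of this
 * file): requesting a stopped, kernel-owned process through fork1(), in
 * the spirit of what kproc_create(9) does internally.  Note the MPASS()
 * contract above: RFSTOPPED callers supply fr_procp rather than fr_pidp.
 *
 *	struct fork_req fr;
 *	struct proc *newp;
 *	int error;
 *
 *	bzero(&fr, sizeof(fr));
 *	fr.fr_flags = RFMEM | RFFDG | RFPROC | RFSTOPPED;
 *	fr.fr_flags2 = FR2_KPROC;	// marks the child P_SYSTEM | P_KPROC
 *	fr.fr_procp = &newp;
 *	error = fork1(curthread, &fr);
 *	if (error == 0) {
 *		// newp is PRS_NORMAL but not on a run queue; the caller
 *		// sets up its thread and sched_add()s it when ready.
 *	}
 */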

/*
 * Handle the return of a child process from fork1().  This function
 * is called from the MD fork_trampoline() entry point.
 */
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
    struct trapframe *frame)
{
	struct proc *p;
	struct thread *td;
	struct thread *dtd;

	kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED);

	td = curthread;
	p = td->td_proc;
	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));

	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
	    td, td_get_sched(td), p->p_pid, td->td_name);

	sched_fork_exit(td);

	/*
	 * Processes normally resume in mi_switch() after being
	 * cpu_switch()'ed to, but when children start up they arrive here
	 * instead, so we must do much the same things as mi_switch() would.
	 */
	if ((dtd = PCPU_GET(deadthread))) {
		PCPU_SET(deadthread, NULL);
		thread_stash(dtd);
	}
	thread_unlock(td);

	/*
	 * cpu_fork_kthread_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 * initproc has its own fork handler, but it does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/*
	 * Check if a kernel thread misbehaved and returned from its main
	 * function.
	 */
	if (p->p_flag & P_KPROC) {
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    td->td_name, p->p_pid);
		kthread_exit();
	}
	mtx_assert(&Giant, MA_NOTOWNED);

	/*
	 * Now going to return to userland.
	 */

	if (p->p_sysent->sv_schedtail != NULL)
		(p->p_sysent->sv_schedtail)(td);

	userret(td, frame);
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  This function is passed in to fork_exit()
 * as the first parameter and is called when returning to a new
 * userland process.
 */
void
fork_return(struct thread *td, struct trapframe *frame)
{
	struct proc *p;

	p = td->td_proc;
	if (td->td_dbgflags & TDB_STOPATFORK) {
		PROC_LOCK(p);
		if ((p->p_flag & P_TRACED) != 0) {
			/*
			 * Inform the debugger if one is still present.
			 */
			td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP;
			ptracestop(td, SIGSTOP, NULL);
			td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
		} else {
			/*
			 * ... otherwise clear the request.
			 */
			td->td_dbgflags &= ~TDB_STOPATFORK;
		}
		PROC_UNLOCK(p);
	} else if (p->p_flag & P_TRACED) {
		/*
		 * This is the start of a new thread in a traced
		 * process.  Report a system call exit event.
		 */
		PROC_LOCK(p);
		td->td_dbgflags |= TDB_SCX;
		if ((p->p_ptevents & PTRACE_SCX) != 0 ||
		    (td->td_dbgflags & TDB_BORN) != 0)
			ptracestop(td, SIGTRAP, NULL);
		td->td_dbgflags &= ~(TDB_SCX | TDB_BORN);
		PROC_UNLOCK(p);
	}

	/*
	 * If the prison was killed mid-fork, die along with it.
	 */
	if (!prison_isalive(td->td_ucred->cr_prison))
		exit1(td, 0, SIGKILL);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
		ktrsysret(td->td_sa.code, 0, 0);
#endif
}

static void
fork_init(void *arg __unused)
{
	ast_register(TDA_VFORK, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_RFPPWAIT,
	    ast_vfork);
}
SYSINIT(fork, SI_SUB_INTRINSIC, SI_ORDER_ANY, fork_init, NULL);