GitHub Repository: awilliam/linux-vfio
Path: blob/master/kernel/exit.c
1
/*
2
* linux/kernel/exit.c
3
*
4
* Copyright (C) 1991, 1992 Linus Torvalds
5
*/
6
7
#include <linux/mm.h>
8
#include <linux/slab.h>
9
#include <linux/interrupt.h>
10
#include <linux/module.h>
11
#include <linux/capability.h>
12
#include <linux/completion.h>
13
#include <linux/personality.h>
14
#include <linux/tty.h>
15
#include <linux/iocontext.h>
16
#include <linux/key.h>
17
#include <linux/security.h>
18
#include <linux/cpu.h>
19
#include <linux/acct.h>
20
#include <linux/tsacct_kern.h>
21
#include <linux/file.h>
22
#include <linux/fdtable.h>
23
#include <linux/binfmts.h>
24
#include <linux/nsproxy.h>
25
#include <linux/pid_namespace.h>
26
#include <linux/ptrace.h>
27
#include <linux/profile.h>
28
#include <linux/mount.h>
29
#include <linux/proc_fs.h>
30
#include <linux/kthread.h>
31
#include <linux/mempolicy.h>
32
#include <linux/taskstats_kern.h>
33
#include <linux/delayacct.h>
34
#include <linux/freezer.h>
35
#include <linux/cgroup.h>
36
#include <linux/syscalls.h>
37
#include <linux/signal.h>
38
#include <linux/posix-timers.h>
39
#include <linux/cn_proc.h>
40
#include <linux/mutex.h>
41
#include <linux/futex.h>
42
#include <linux/pipe_fs_i.h>
43
#include <linux/audit.h> /* for audit_free() */
44
#include <linux/resource.h>
45
#include <linux/blkdev.h>
46
#include <linux/task_io_accounting_ops.h>
47
#include <linux/tracehook.h>
48
#include <linux/fs_struct.h>
49
#include <linux/init_task.h>
50
#include <linux/perf_event.h>
51
#include <trace/events/sched.h>
52
#include <linux/hw_breakpoint.h>
53
#include <linux/oom.h>
54
55
#include <asm/uaccess.h>
56
#include <asm/unistd.h>
57
#include <asm/pgtable.h>
58
#include <asm/mmu_context.h>
59
60
static void exit_mm(struct task_struct * tsk);
61
62
static void __unhash_process(struct task_struct *p, bool group_dead)
63
{
64
nr_threads--;
65
detach_pid(p, PIDTYPE_PID);
66
if (group_dead) {
67
detach_pid(p, PIDTYPE_PGID);
68
detach_pid(p, PIDTYPE_SID);
69
70
list_del_rcu(&p->tasks);
71
list_del_init(&p->sibling);
72
__this_cpu_dec(process_counts);
73
}
74
list_del_rcu(&p->thread_group);
75
}
76
77
/*
78
* This function expects the tasklist_lock write-locked.
79
*/
80
static void __exit_signal(struct task_struct *tsk)
81
{
82
struct signal_struct *sig = tsk->signal;
83
bool group_dead = thread_group_leader(tsk);
84
struct sighand_struct *sighand;
85
struct tty_struct *uninitialized_var(tty);
86
87
sighand = rcu_dereference_check(tsk->sighand,
88
rcu_read_lock_held() ||
89
lockdep_tasklist_lock_is_held());
90
spin_lock(&sighand->siglock);
91
92
posix_cpu_timers_exit(tsk);
93
if (group_dead) {
94
posix_cpu_timers_exit_group(tsk);
95
tty = sig->tty;
96
sig->tty = NULL;
97
} else {
98
/*
99
* This can only happen if the caller is de_thread().
100
* FIXME: this is a temporary hack; we should teach
101
* posix-cpu-timers to handle this case correctly.
102
*/
103
if (unlikely(has_group_leader_pid(tsk)))
104
posix_cpu_timers_exit_group(tsk);
105
106
/*
107
* If there is any task waiting for the group exit
108
* then notify it:
109
*/
110
if (sig->notify_count > 0 && !--sig->notify_count)
111
wake_up_process(sig->group_exit_task);
112
113
if (tsk == sig->curr_target)
114
sig->curr_target = next_thread(tsk);
115
/*
116
* Accumulate here the counters for all threads but the
117
* group leader as they die, so they can be added into
118
* the process-wide totals when those are taken.
119
* The group leader stays around as a zombie as long
120
* as there are other threads. When it gets reaped,
121
* the exit.c code will add its counts into these totals.
122
* We won't ever get here for the group leader, since it
123
* will have been the last reference on the signal_struct.
124
*/
125
sig->utime = cputime_add(sig->utime, tsk->utime);
126
sig->stime = cputime_add(sig->stime, tsk->stime);
127
sig->gtime = cputime_add(sig->gtime, tsk->gtime);
128
sig->min_flt += tsk->min_flt;
129
sig->maj_flt += tsk->maj_flt;
130
sig->nvcsw += tsk->nvcsw;
131
sig->nivcsw += tsk->nivcsw;
132
sig->inblock += task_io_get_inblock(tsk);
133
sig->oublock += task_io_get_oublock(tsk);
134
task_io_accounting_add(&sig->ioac, &tsk->ioac);
135
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
136
}
137
138
sig->nr_threads--;
139
__unhash_process(tsk, group_dead);
140
141
/*
142
* Do this under ->siglock, we can race with another thread
143
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
144
*/
145
flush_sigqueue(&tsk->pending);
146
tsk->sighand = NULL;
147
spin_unlock(&sighand->siglock);
148
149
__cleanup_sighand(sighand);
150
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
151
if (group_dead) {
152
flush_sigqueue(&sig->shared_pending);
153
tty_kref_put(tty);
154
}
155
}
156
157
static void delayed_put_task_struct(struct rcu_head *rhp)
158
{
159
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160
161
perf_event_delayed_put(tsk);
162
trace_sched_process_free(tsk);
163
put_task_struct(tsk);
164
}
165
166
167
void release_task(struct task_struct * p)
168
{
169
struct task_struct *leader;
170
int zap_leader;
171
repeat:
172
tracehook_prepare_release_task(p);
173
/* don't need to get the RCU readlock here - the process is dead and
174
* can't be modifying its own credentials. But shut RCU-lockdep up */
175
rcu_read_lock();
176
atomic_dec(&__task_cred(p)->user->processes);
177
rcu_read_unlock();
178
179
proc_flush_task(p);
180
181
write_lock_irq(&tasklist_lock);
182
tracehook_finish_release_task(p);
183
__exit_signal(p);
184
185
/*
186
* If we are the last non-leader member of the thread
187
* group, and the leader is zombie, then notify the
188
* group leader's parent process (if it wants notification).
189
*/
190
zap_leader = 0;
191
leader = p->group_leader;
192
if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
193
BUG_ON(task_detached(leader));
194
do_notify_parent(leader, leader->exit_signal);
195
/*
196
* If we were the last child thread and the leader has
197
* exited already, and the leader's parent ignores SIGCHLD,
198
* then we are the one who should release the leader.
199
*
200
* do_notify_parent() will have marked it self-reaping in
201
* that case.
202
*/
203
zap_leader = task_detached(leader);
204
205
/*
206
* This maintains the invariant that release_task()
207
* only runs on a task in EXIT_DEAD, just for sanity.
208
*/
209
if (zap_leader)
210
leader->exit_state = EXIT_DEAD;
211
}
212
213
write_unlock_irq(&tasklist_lock);
214
release_thread(p);
215
call_rcu(&p->rcu, delayed_put_task_struct);
216
217
p = leader;
218
if (unlikely(zap_leader))
219
goto repeat;
220
}
221
222
/*
223
* This checks not only the pgrp, but falls back on the pid if no
224
* satisfactory pgrp is found. I dunno - gdb doesn't work correctly
225
* without this...
226
*
227
* The caller must hold rcu lock or the tasklist lock.
228
*/
229
struct pid *session_of_pgrp(struct pid *pgrp)
230
{
231
struct task_struct *p;
232
struct pid *sid = NULL;
233
234
p = pid_task(pgrp, PIDTYPE_PGID);
235
if (p == NULL)
236
p = pid_task(pgrp, PIDTYPE_PID);
237
if (p != NULL)
238
sid = task_session(p);
239
240
return sid;
241
}
242
243
/*
244
* Determine if a process group is "orphaned", according to the POSIX
245
* definition in 2.2.2.52. Orphaned process groups are not to be affected
246
* by terminal-generated stop signals. Newly orphaned process groups are
247
* to receive a SIGHUP and a SIGCONT.
248
*
249
* "I ask you, have you ever known what it is to be an orphan?"
250
*/
251
static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
252
{
253
struct task_struct *p;
254
255
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
256
if ((p == ignored_task) ||
257
(p->exit_state && thread_group_empty(p)) ||
258
is_global_init(p->real_parent))
259
continue;
260
261
if (task_pgrp(p->real_parent) != pgrp &&
262
task_session(p->real_parent) == task_session(p))
263
return 0;
264
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
265
266
return 1;
267
}
268
269
int is_current_pgrp_orphaned(void)
270
{
271
int retval;
272
273
read_lock(&tasklist_lock);
274
retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
275
read_unlock(&tasklist_lock);
276
277
return retval;
278
}
279
280
static int has_stopped_jobs(struct pid *pgrp)
281
{
282
int retval = 0;
283
struct task_struct *p;
284
285
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
286
if (!task_is_stopped(p))
287
continue;
288
retval = 1;
289
break;
290
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
291
return retval;
292
}
293
294
/*
295
* Check to see if any process groups have become orphaned as
296
* a result of our exiting, and if they have any stopped jobs,
297
* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
298
*/
299
static void
300
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
301
{
302
struct pid *pgrp = task_pgrp(tsk);
303
struct task_struct *ignored_task = tsk;
304
305
if (!parent)
306
/* exit: our father is in a different pgrp than
307
* we are and we were the only connection outside.
308
*/
309
parent = tsk->real_parent;
310
else
311
/* reparent: our child is in a different pgrp than
312
* we are, and it was the only connection outside.
313
*/
314
ignored_task = NULL;
315
316
if (task_pgrp(parent) != pgrp &&
317
task_session(parent) == task_session(tsk) &&
318
will_become_orphaned_pgrp(pgrp, ignored_task) &&
319
has_stopped_jobs(pgrp)) {
320
__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
321
__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
322
}
323
}
324
325
/**
326
* reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327
*
328
* If a kernel thread is launched as a result of a system call, or if
329
* it ever exits, it should generally reparent itself to kthreadd so it
330
* isn't in the way of other processes and is correctly cleaned up on exit.
331
*
332
* The various task state such as scheduling policy and priority may have
333
* been inherited from a user process, so we reset them to sane values here.
334
*
335
* NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336
*/
337
static void reparent_to_kthreadd(void)
338
{
339
write_lock_irq(&tasklist_lock);
340
341
ptrace_unlink(current);
342
/* Reparent to kthreadd */
343
current->real_parent = current->parent = kthreadd_task;
344
list_move_tail(&current->sibling, &current->real_parent->children);
345
346
/* Set the exit signal to SIGCHLD so we signal init on exit */
347
current->exit_signal = SIGCHLD;
348
349
if (task_nice(current) < 0)
350
set_user_nice(current, 0);
351
/* cpus_allowed? */
352
/* rt_priority? */
353
/* signals? */
354
memcpy(current->signal->rlim, init_task.signal->rlim,
355
sizeof(current->signal->rlim));
356
357
atomic_inc(&init_cred.usage);
358
commit_creds(&init_cred);
359
write_unlock_irq(&tasklist_lock);
360
}
361
362
void __set_special_pids(struct pid *pid)
363
{
364
struct task_struct *curr = current->group_leader;
365
366
if (task_session(curr) != pid)
367
change_pid(curr, PIDTYPE_SID, pid);
368
369
if (task_pgrp(curr) != pid)
370
change_pid(curr, PIDTYPE_PGID, pid);
371
}
372
373
static void set_special_pids(struct pid *pid)
374
{
375
write_lock_irq(&tasklist_lock);
376
__set_special_pids(pid);
377
write_unlock_irq(&tasklist_lock);
378
}
379
380
/*
381
* Let kernel threads use this to say that they allow a certain signal.
382
* Must not be used if kthread was cloned with CLONE_SIGHAND.
383
*/
384
int allow_signal(int sig)
385
{
386
if (!valid_signal(sig) || sig < 1)
387
return -EINVAL;
388
389
spin_lock_irq(&current->sighand->siglock);
390
/* This is only needed for daemonize()'ed kthreads */
391
sigdelset(&current->blocked, sig);
392
/*
393
* Kernel threads handle their own signals. Let the signal code
394
* know it'll be handled, so that they don't get converted to
395
* SIGKILL or just silently dropped.
396
*/
397
current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
398
recalc_sigpending();
399
spin_unlock_irq(&current->sighand->siglock);
400
return 0;
401
}
402
403
EXPORT_SYMBOL(allow_signal);
404
405
int disallow_signal(int sig)
406
{
407
if (!valid_signal(sig) || sig < 1)
408
return -EINVAL;
409
410
spin_lock_irq(&current->sighand->siglock);
411
current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
412
recalc_sigpending();
413
spin_unlock_irq(&current->sighand->siglock);
414
return 0;
415
}
416
417
EXPORT_SYMBOL(disallow_signal);
418
419
/*
420
* Put all the gunge required to become a kernel thread without
421
* attached user resources in one place where it belongs.
422
*/
423
424
void daemonize(const char *name, ...)
425
{
426
va_list args;
427
sigset_t blocked;
428
429
va_start(args, name);
430
vsnprintf(current->comm, sizeof(current->comm), name, args);
431
va_end(args);
432
433
/*
434
* If we were started as result of loading a module, close all of the
435
* user space pages. We don't need them, and if we didn't close them
436
* they would be locked into memory.
437
*/
438
exit_mm(current);
439
/*
440
* We don't want to have TIF_FREEZE set if the system-wide hibernation
441
* or suspend transition begins right now.
442
*/
443
current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444
445
if (current->nsproxy != &init_nsproxy) {
446
get_nsproxy(&init_nsproxy);
447
switch_task_namespaces(current, &init_nsproxy);
448
}
449
set_special_pids(&init_struct_pid);
450
proc_clear_tty(current);
451
452
/* Block and flush all signals */
453
sigfillset(&blocked);
454
sigprocmask(SIG_BLOCK, &blocked, NULL);
455
flush_signals(current);
456
457
/* Become as one with the init task */
458
459
daemonize_fs_struct();
460
exit_files(current);
461
current->files = init_task.files;
462
atomic_inc(&current->files->count);
463
464
reparent_to_kthreadd();
465
}
466
467
EXPORT_SYMBOL(daemonize);
468
469
static void close_files(struct files_struct * files)
470
{
471
int i, j;
472
struct fdtable *fdt;
473
474
j = 0;
475
476
/*
477
* It is safe to dereference the fd table without RCU or
478
* ->file_lock because this is the last reference to the
479
* files structure. But use RCU to shut RCU-lockdep up.
480
*/
481
rcu_read_lock();
482
fdt = files_fdtable(files);
483
rcu_read_unlock();
484
for (;;) {
485
unsigned long set;
486
i = j * __NFDBITS;
487
if (i >= fdt->max_fds)
488
break;
489
set = fdt->open_fds->fds_bits[j++];
490
while (set) {
491
if (set & 1) {
492
struct file * file = xchg(&fdt->fd[i], NULL);
493
if (file) {
494
filp_close(file, files);
495
cond_resched();
496
}
497
}
498
i++;
499
set >>= 1;
500
}
501
}
502
}
503
504
struct files_struct *get_files_struct(struct task_struct *task)
505
{
506
struct files_struct *files;
507
508
task_lock(task);
509
files = task->files;
510
if (files)
511
atomic_inc(&files->count);
512
task_unlock(task);
513
514
return files;
515
}
516
517
void put_files_struct(struct files_struct *files)
518
{
519
struct fdtable *fdt;
520
521
if (atomic_dec_and_test(&files->count)) {
522
close_files(files);
523
/*
524
* Free the fd and fdset arrays if we expanded them.
525
* If the fdtable was embedded, pass files for freeing
526
* at the end of the RCU grace period. Otherwise,
527
* you can free files immediately.
528
*/
529
rcu_read_lock();
530
fdt = files_fdtable(files);
531
if (fdt != &files->fdtab)
532
kmem_cache_free(files_cachep, files);
533
free_fdtable(fdt);
534
rcu_read_unlock();
535
}
536
}
537
538
void reset_files_struct(struct files_struct *files)
539
{
540
struct task_struct *tsk = current;
541
struct files_struct *old;
542
543
old = tsk->files;
544
task_lock(tsk);
545
tsk->files = files;
546
task_unlock(tsk);
547
put_files_struct(old);
548
}
549
550
void exit_files(struct task_struct *tsk)
551
{
552
struct files_struct * files = tsk->files;
553
554
if (files) {
555
task_lock(tsk);
556
tsk->files = NULL;
557
task_unlock(tsk);
558
put_files_struct(files);
559
}
560
}
561
562
#ifdef CONFIG_MM_OWNER
563
/*
564
* A task is exiting. If it owned this mm, find a new owner for the mm.
565
*/
566
void mm_update_next_owner(struct mm_struct *mm)
567
{
568
struct task_struct *c, *g, *p = current;
569
570
retry:
571
/*
572
* If the exiting or execing task is not the owner, it's
573
* someone else's problem.
574
*/
575
if (mm->owner != p)
576
return;
577
/*
578
* The current owner is exiting/execing and there are no other
579
* candidates. Do not leave the mm pointing to a possibly
580
* freed task structure.
581
*/
582
if (atomic_read(&mm->mm_users) <= 1) {
583
mm->owner = NULL;
584
return;
585
}
586
587
read_lock(&tasklist_lock);
588
/*
589
* Search in the children
590
*/
591
list_for_each_entry(c, &p->children, sibling) {
592
if (c->mm == mm)
593
goto assign_new_owner;
594
}
595
596
/*
597
* Search in the siblings
598
*/
599
list_for_each_entry(c, &p->real_parent->children, sibling) {
600
if (c->mm == mm)
601
goto assign_new_owner;
602
}
603
604
/*
605
* Search through everything else. We should not get
606
* here often
607
*/
608
do_each_thread(g, c) {
609
if (c->mm == mm)
610
goto assign_new_owner;
611
} while_each_thread(g, c);
612
613
read_unlock(&tasklist_lock);
614
/*
615
* We found no owner, yet mm_users > 1: this implies that we are
616
* most likely racing with swapoff (try_to_unuse()) or /proc or
617
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
618
*/
619
mm->owner = NULL;
620
return;
621
622
assign_new_owner:
623
BUG_ON(c == p);
624
get_task_struct(c);
625
/*
626
* The task_lock protects c->mm from changing.
627
* We always want mm->owner->mm == mm
628
*/
629
task_lock(c);
630
/*
631
* Delay read_unlock() till we have the task_lock()
632
* to ensure that c does not slip away underneath us
633
*/
634
read_unlock(&tasklist_lock);
635
if (c->mm != mm) {
636
task_unlock(c);
637
put_task_struct(c);
638
goto retry;
639
}
640
mm->owner = c;
641
task_unlock(c);
642
put_task_struct(c);
643
}
644
#endif /* CONFIG_MM_OWNER */
645
646
/*
647
* Turn us into a lazy TLB process if we
648
* aren't already..
649
*/
650
static void exit_mm(struct task_struct * tsk)
651
{
652
struct mm_struct *mm = tsk->mm;
653
struct core_state *core_state;
654
655
mm_release(tsk, mm);
656
if (!mm)
657
return;
658
/*
659
* Serialize with any possible pending coredump.
660
* We must hold mmap_sem around checking core_state
661
* and clearing tsk->mm. The core-inducing thread
662
* will increment ->nr_threads for each thread in the
663
* group with ->mm != NULL.
664
*/
665
down_read(&mm->mmap_sem);
666
core_state = mm->core_state;
667
if (core_state) {
668
struct core_thread self;
669
up_read(&mm->mmap_sem);
670
671
self.task = tsk;
672
self.next = xchg(&core_state->dumper.next, &self);
673
/*
674
* Implies mb(), the result of xchg() must be visible
675
* to core_state->dumper.
676
*/
677
if (atomic_dec_and_test(&core_state->nr_threads))
678
complete(&core_state->startup);
679
680
for (;;) {
681
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
682
if (!self.task) /* see coredump_finish() */
683
break;
684
schedule();
685
}
686
__set_task_state(tsk, TASK_RUNNING);
687
down_read(&mm->mmap_sem);
688
}
689
atomic_inc(&mm->mm_count);
690
BUG_ON(mm != tsk->active_mm);
691
/* more a memory barrier than a real lock */
692
task_lock(tsk);
693
tsk->mm = NULL;
694
up_read(&mm->mmap_sem);
695
enter_lazy_tlb(mm, current);
696
/* We don't want this task to be frozen prematurely */
697
clear_freeze_flag(tsk);
698
if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
699
atomic_dec(&mm->oom_disable_count);
700
task_unlock(tsk);
701
mm_update_next_owner(mm);
702
mmput(mm);
703
}
704
705
/*
706
* When we die, we re-parent all our children.
707
* Try to give them to another thread in our thread
708
* group, and if no such member exists, give it to
709
* the child reaper process (ie "init") in our pid
710
* space.
711
*/
712
static struct task_struct *find_new_reaper(struct task_struct *father)
713
__releases(&tasklist_lock)
714
__acquires(&tasklist_lock)
715
{
716
struct pid_namespace *pid_ns = task_active_pid_ns(father);
717
struct task_struct *thread;
718
719
thread = father;
720
while_each_thread(father, thread) {
721
if (thread->flags & PF_EXITING)
722
continue;
723
if (unlikely(pid_ns->child_reaper == father))
724
pid_ns->child_reaper = thread;
725
return thread;
726
}
727
728
if (unlikely(pid_ns->child_reaper == father)) {
729
write_unlock_irq(&tasklist_lock);
730
if (unlikely(pid_ns == &init_pid_ns))
731
panic("Attempted to kill init!");
732
733
zap_pid_ns_processes(pid_ns);
734
write_lock_irq(&tasklist_lock);
735
/*
736
* We cannot clear ->child_reaper or leave it alone.
737
* There may be stealth EXIT_DEAD tasks on ->children,
738
* forget_original_parent() must move them somewhere.
739
*/
740
pid_ns->child_reaper = init_pid_ns.child_reaper;
741
}
742
743
return pid_ns->child_reaper;
744
}
745
746
/*
747
* Any that need to be release_task'd are put on the @dead list.
748
*/
749
static void reparent_leader(struct task_struct *father, struct task_struct *p,
750
struct list_head *dead)
751
{
752
list_move_tail(&p->sibling, &p->real_parent->children);
753
754
if (task_detached(p))
755
return;
756
/*
757
* If this is a threaded reparent there is no need to
758
* notify anyone anything has happened.
759
*/
760
if (same_thread_group(p->real_parent, father))
761
return;
762
763
/* We don't want people slaying init. */
764
p->exit_signal = SIGCHLD;
765
766
/* If it has exited notify the new parent about this child's death. */
767
if (!task_ptrace(p) &&
768
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
769
do_notify_parent(p, p->exit_signal);
770
if (task_detached(p)) {
771
p->exit_state = EXIT_DEAD;
772
list_move_tail(&p->sibling, dead);
773
}
774
}
775
776
kill_orphaned_pgrp(p, father);
777
}
778
779
static void forget_original_parent(struct task_struct *father)
780
{
781
struct task_struct *p, *n, *reaper;
782
LIST_HEAD(dead_children);
783
784
write_lock_irq(&tasklist_lock);
785
/*
786
* Note that exit_ptrace() and find_new_reaper() might
787
* drop tasklist_lock and reacquire it.
788
*/
789
exit_ptrace(father);
790
reaper = find_new_reaper(father);
791
792
list_for_each_entry_safe(p, n, &father->children, sibling) {
793
struct task_struct *t = p;
794
do {
795
t->real_parent = reaper;
796
if (t->parent == father) {
797
BUG_ON(task_ptrace(t));
798
t->parent = t->real_parent;
799
}
800
if (t->pdeath_signal)
801
group_send_sig_info(t->pdeath_signal,
802
SEND_SIG_NOINFO, t);
803
} while_each_thread(p, t);
804
reparent_leader(father, p, &dead_children);
805
}
806
write_unlock_irq(&tasklist_lock);
807
808
BUG_ON(!list_empty(&father->children));
809
810
list_for_each_entry_safe(p, n, &dead_children, sibling) {
811
list_del_init(&p->sibling);
812
release_task(p);
813
}
814
}
815
816
/*
817
* Send signals to all our closest relatives so that they know
818
* to properly mourn us..
819
*/
820
static void exit_notify(struct task_struct *tsk, int group_dead)
821
{
822
int signal;
823
void *cookie;
824
825
/*
826
* This does two things:
827
*
828
* A. Make init inherit all the child processes
829
* B. Check to see if any process groups have become orphaned
830
* as a result of our exiting, and if they have any stopped
831
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
832
*/
833
forget_original_parent(tsk);
834
exit_task_namespaces(tsk);
835
836
write_lock_irq(&tasklist_lock);
837
if (group_dead)
838
kill_orphaned_pgrp(tsk->group_leader, NULL);
839
840
/* Let father know we died
841
*
842
* Thread signals are configurable, but you aren't going to use
843
* that to send signals to arbitrary processes.
844
* That stops right now.
845
*
846
* If the parent exec id doesn't match the exec id we saved
847
* when we started then we know the parent has changed security
848
* domain.
849
*
850
* If our self_exec id doesn't match our parent_exec_id then
851
* we have changed execution domain as these two values started
852
* the same after a fork.
853
*/
854
if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
855
(tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
856
tsk->self_exec_id != tsk->parent_exec_id))
857
tsk->exit_signal = SIGCHLD;
858
859
signal = tracehook_notify_death(tsk, &cookie, group_dead);
860
if (signal >= 0)
861
signal = do_notify_parent(tsk, signal);
862
863
tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
864
865
/* mt-exec, de_thread() is waiting for group leader */
866
if (unlikely(tsk->signal->notify_count < 0))
867
wake_up_process(tsk->signal->group_exit_task);
868
write_unlock_irq(&tasklist_lock);
869
870
tracehook_report_death(tsk, signal, cookie, group_dead);
871
872
/* If the process is dead, release it - nobody will wait for it */
873
if (signal == DEATH_REAP)
874
release_task(tsk);
875
}
876
877
#ifdef CONFIG_DEBUG_STACK_USAGE
878
static void check_stack_usage(void)
879
{
880
static DEFINE_SPINLOCK(low_water_lock);
881
static int lowest_to_date = THREAD_SIZE;
882
unsigned long free;
883
884
free = stack_not_used(current);
885
886
if (free >= lowest_to_date)
887
return;
888
889
spin_lock(&low_water_lock);
890
if (free < lowest_to_date) {
891
printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
892
"left\n",
893
current->comm, free);
894
lowest_to_date = free;
895
}
896
spin_unlock(&low_water_lock);
897
}
898
#else
899
static inline void check_stack_usage(void) {}
900
#endif
901
902
NORET_TYPE void do_exit(long code)
903
{
904
struct task_struct *tsk = current;
905
int group_dead;
906
907
profile_task_exit(tsk);
908
909
WARN_ON(atomic_read(&tsk->fs_excl));
910
WARN_ON(blk_needs_flush_plug(tsk));
911
912
if (unlikely(in_interrupt()))
913
panic("Aiee, killing interrupt handler!");
914
if (unlikely(!tsk->pid))
915
panic("Attempted to kill the idle task!");
916
917
/*
918
* If do_exit is called because this process oopsed, it's possible
919
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
920
* continuing. Amongst other possible reasons, this is to prevent
921
* mm_release()->clear_child_tid() from writing to a user-controlled
922
* kernel address.
923
*/
924
set_fs(USER_DS);
925
926
tracehook_report_exit(&code);
927
928
validate_creds_for_do_exit(tsk);
929
930
/*
931
* We're taking recursive faults here in do_exit. Safest is to just
932
* leave this task alone and wait for reboot.
933
*/
934
if (unlikely(tsk->flags & PF_EXITING)) {
935
printk(KERN_ALERT
936
"Fixing recursive fault but reboot is needed!\n");
937
/*
938
* We can do this unlocked here. The futex code uses
939
* this flag just to verify whether the pi state
940
* cleanup has been done or not. In the worst case it
941
* loops once more. We pretend that the cleanup was
942
* done as there is no way to return. Either the
943
* OWNER_DIED bit is set by now or we push the blocked
944
* task into the wait-forever nirvana as well.
945
*/
946
tsk->flags |= PF_EXITPIDONE;
947
set_current_state(TASK_UNINTERRUPTIBLE);
948
schedule();
949
}
950
951
exit_irq_thread();
952
953
exit_signals(tsk); /* sets PF_EXITING */
954
/*
955
* tsk->flags are checked in the futex code to protect against
956
* an exiting task cleaning up the robust pi futexes.
957
*/
958
smp_mb();
959
raw_spin_unlock_wait(&tsk->pi_lock);
960
961
if (unlikely(in_atomic()))
962
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
963
current->comm, task_pid_nr(current),
964
preempt_count());
965
966
acct_update_integrals(tsk);
967
/* sync mm's RSS info before statistics gathering */
968
if (tsk->mm)
969
sync_mm_rss(tsk, tsk->mm);
970
group_dead = atomic_dec_and_test(&tsk->signal->live);
971
if (group_dead) {
972
hrtimer_cancel(&tsk->signal->real_timer);
973
exit_itimers(tsk->signal);
974
if (tsk->mm)
975
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
976
}
977
acct_collect(code, group_dead);
978
if (group_dead)
979
tty_audit_exit();
980
if (unlikely(tsk->audit_context))
981
audit_free(tsk);
982
983
tsk->exit_code = code;
984
taskstats_exit(tsk, group_dead);
985
986
exit_mm(tsk);
987
988
if (group_dead)
989
acct_process();
990
trace_sched_process_exit(tsk);
991
992
exit_sem(tsk);
993
exit_files(tsk);
994
exit_fs(tsk);
995
check_stack_usage();
996
exit_thread();
997
998
/*
999
* Flush inherited counters to the parent - before the parent
1000
* gets woken up by child-exit notifications.
1001
*
1002
* Because of cgroup mode, this must be called before cgroup_exit().
1003
*/
1004
perf_event_exit_task(tsk);
1005
1006
cgroup_exit(tsk, 1);
1007
1008
if (group_dead)
1009
disassociate_ctty(1);
1010
1011
module_put(task_thread_info(tsk)->exec_domain->module);
1012
1013
proc_exit_connector(tsk);
1014
1015
/*
1016
* FIXME: do that only when needed, using sched_exit tracepoint
1017
*/
1018
ptrace_put_breakpoints(tsk);
1019
1020
exit_notify(tsk, group_dead);
1021
#ifdef CONFIG_NUMA
1022
task_lock(tsk);
1023
mpol_put(tsk->mempolicy);
1024
tsk->mempolicy = NULL;
1025
task_unlock(tsk);
1026
#endif
1027
#ifdef CONFIG_FUTEX
1028
if (unlikely(current->pi_state_cache))
1029
kfree(current->pi_state_cache);
1030
#endif
1031
/*
1032
* Make sure we are holding no locks:
1033
*/
1034
debug_check_no_locks_held(tsk);
1035
/*
1036
* We can do this unlocked here. The futex code uses this flag
1037
* just to verify whether the pi state cleanup has been done
1038
* or not. In the worst case it loops once more.
1039
*/
1040
tsk->flags |= PF_EXITPIDONE;
1041
1042
if (tsk->io_context)
1043
exit_io_context(tsk);
1044
1045
if (tsk->splice_pipe)
1046
__free_pipe_info(tsk->splice_pipe);
1047
1048
validate_creds_for_do_exit(tsk);
1049
1050
preempt_disable();
1051
exit_rcu();
1052
/* causes final put_task_struct in finish_task_switch(). */
1053
tsk->state = TASK_DEAD;
1054
schedule();
1055
BUG();
1056
/* Avoid "noreturn function does return". */
1057
for (;;)
1058
cpu_relax(); /* For when BUG is null */
1059
}
1060
1061
EXPORT_SYMBOL_GPL(do_exit);
1062
1063
NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1064
{
1065
if (comp)
1066
complete(comp);
1067
1068
do_exit(code);
1069
}
1070
1071
EXPORT_SYMBOL(complete_and_exit);
1072
1073
SYSCALL_DEFINE1(exit, int, error_code)
1074
{
1075
do_exit((error_code&0xff)<<8);
1076
}
1077
1078
/*
1079
* Take down every thread in the group. This is called by fatal signals
1080
* as well as by sys_exit_group (below).
1081
*/
1082
NORET_TYPE void
1083
do_group_exit(int exit_code)
1084
{
1085
struct signal_struct *sig = current->signal;
1086
1087
BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1088
1089
if (signal_group_exit(sig))
1090
exit_code = sig->group_exit_code;
1091
else if (!thread_group_empty(current)) {
1092
struct sighand_struct *const sighand = current->sighand;
1093
spin_lock_irq(&sighand->siglock);
1094
if (signal_group_exit(sig))
1095
/* Another thread got here before we took the lock. */
1096
exit_code = sig->group_exit_code;
1097
else {
1098
sig->group_exit_code = exit_code;
1099
sig->flags = SIGNAL_GROUP_EXIT;
1100
zap_other_threads(current);
1101
}
1102
spin_unlock_irq(&sighand->siglock);
1103
}
1104
1105
do_exit(exit_code);
1106
/* NOTREACHED */
1107
}
1108
1109
/*
1110
* this kills every thread in the thread group. Note that any externally
1111
* wait4()-ing process will get the correct exit code - even if this
1112
* thread is not the thread group leader.
1113
*/
1114
SYSCALL_DEFINE1(exit_group, int, error_code)
1115
{
1116
do_group_exit((error_code & 0xff) << 8);
1117
/* NOTREACHED */
1118
return 0;
1119
}
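/*
 * Illustrative aside (assumes the standard <sys/wait.h> macros, nothing
 * defined in this file): the (error_code & 0xff) << 8 packing used by
 * sys_exit()/sys_exit_group() above is what a waiting parent later sees
 * in its wstatus word, so WEXITSTATUS(status) recovers the low byte that
 * was passed to exit()/exit_group().
 */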
1120
1121
struct wait_opts {
1122
enum pid_type wo_type;
1123
int wo_flags;
1124
struct pid *wo_pid;
1125
1126
struct siginfo __user *wo_info;
1127
int __user *wo_stat;
1128
struct rusage __user *wo_rusage;
1129
1130
wait_queue_t child_wait;
1131
int notask_error;
1132
};
1133
1134
static inline
1135
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1136
{
1137
if (type != PIDTYPE_PID)
1138
task = task->group_leader;
1139
return task->pids[type].pid;
1140
}
1141
1142
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1143
{
1144
return wo->wo_type == PIDTYPE_MAX ||
1145
task_pid_type(p, wo->wo_type) == wo->wo_pid;
1146
}
1147
1148
static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1149
{
1150
if (!eligible_pid(wo, p))
1151
return 0;
1152
/* Wait for all children (clone and not) if __WALL is set;
1153
* otherwise, wait for clone children *only* if __WCLONE is
1154
* set; otherwise, wait for non-clone children *only*. (Note:
1155
* A "clone" child here is one that reports to its parent
1156
* using a signal other than SIGCHLD.) */
1157
if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1158
&& !(wo->wo_flags & __WALL))
1159
return 0;
1160
1161
return 1;
1162
}
1163
1164
static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1165
pid_t pid, uid_t uid, int why, int status)
1166
{
1167
struct siginfo __user *infop;
1168
int retval = wo->wo_rusage
1169
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1170
1171
put_task_struct(p);
1172
infop = wo->wo_info;
1173
if (infop) {
1174
if (!retval)
1175
retval = put_user(SIGCHLD, &infop->si_signo);
1176
if (!retval)
1177
retval = put_user(0, &infop->si_errno);
1178
if (!retval)
1179
retval = put_user((short)why, &infop->si_code);
1180
if (!retval)
1181
retval = put_user(pid, &infop->si_pid);
1182
if (!retval)
1183
retval = put_user(uid, &infop->si_uid);
1184
if (!retval)
1185
retval = put_user(status, &infop->si_status);
1186
}
1187
if (!retval)
1188
retval = pid;
1189
return retval;
1190
}
1191
1192
/*
1193
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1194
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1195
* the lock and this task is uninteresting. If we return nonzero, we have
1196
* released the lock and the system call should return.
1197
*/
1198
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1199
{
1200
unsigned long state;
1201
int retval, status, traced;
1202
pid_t pid = task_pid_vnr(p);
1203
uid_t uid = __task_cred(p)->uid;
1204
struct siginfo __user *infop;
1205
1206
if (!likely(wo->wo_flags & WEXITED))
1207
return 0;
1208
1209
if (unlikely(wo->wo_flags & WNOWAIT)) {
1210
int exit_code = p->exit_code;
1211
int why;
1212
1213
get_task_struct(p);
1214
read_unlock(&tasklist_lock);
1215
if ((exit_code & 0x7f) == 0) {
1216
why = CLD_EXITED;
1217
status = exit_code >> 8;
1218
} else {
1219
why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1220
status = exit_code & 0x7f;
1221
}
1222
return wait_noreap_copyout(wo, p, pid, uid, why, status);
1223
}
1224
1225
/*
1226
* Try to move the task's state to DEAD
1227
* only one thread is allowed to do this:
1228
*/
1229
state = xchg(&p->exit_state, EXIT_DEAD);
1230
if (state != EXIT_ZOMBIE) {
1231
BUG_ON(state != EXIT_DEAD);
1232
return 0;
1233
}
1234
1235
traced = ptrace_reparented(p);
1236
/*
1237
* It can be ptraced but not reparented, check
1238
* !task_detached() to filter out sub-threads.
1239
*/
1240
if (likely(!traced) && likely(!task_detached(p))) {
1241
struct signal_struct *psig;
1242
struct signal_struct *sig;
1243
unsigned long maxrss;
1244
cputime_t tgutime, tgstime;
1245
1246
/*
1247
* The resource counters for the group leader are in its
1248
* own task_struct. Those for dead threads in the group
1249
* are in its signal_struct, as are those for the child
1250
* processes it has previously reaped. All these
1251
* accumulate in the parent's signal_struct c* fields.
1252
*
1253
* We don't bother to take a lock here to protect these
1254
* p->signal fields, because they are only touched by
1255
* __exit_signal, which runs with tasklist_lock
1256
* write-locked anyway, and so is excluded here. We do
1257
* need to protect the access to parent->signal fields,
1258
* as other threads in the parent group can be right
1259
* here reaping other children at the same time.
1260
*
1261
* We use thread_group_times() to get times for the thread
1262
* group, which consolidates times for all threads in the
1263
* group including the group leader.
1264
*/
1265
thread_group_times(p, &tgutime, &tgstime);
1266
spin_lock_irq(&p->real_parent->sighand->siglock);
1267
psig = p->real_parent->signal;
1268
sig = p->signal;
1269
psig->cutime =
1270
cputime_add(psig->cutime,
1271
cputime_add(tgutime,
1272
sig->cutime));
1273
psig->cstime =
1274
cputime_add(psig->cstime,
1275
cputime_add(tgstime,
1276
sig->cstime));
1277
psig->cgtime =
1278
cputime_add(psig->cgtime,
1279
cputime_add(p->gtime,
1280
cputime_add(sig->gtime,
1281
sig->cgtime)));
1282
psig->cmin_flt +=
1283
p->min_flt + sig->min_flt + sig->cmin_flt;
1284
psig->cmaj_flt +=
1285
p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1286
psig->cnvcsw +=
1287
p->nvcsw + sig->nvcsw + sig->cnvcsw;
1288
psig->cnivcsw +=
1289
p->nivcsw + sig->nivcsw + sig->cnivcsw;
1290
psig->cinblock +=
1291
task_io_get_inblock(p) +
1292
sig->inblock + sig->cinblock;
1293
psig->coublock +=
1294
task_io_get_oublock(p) +
1295
sig->oublock + sig->coublock;
1296
maxrss = max(sig->maxrss, sig->cmaxrss);
1297
if (psig->cmaxrss < maxrss)
1298
psig->cmaxrss = maxrss;
1299
task_io_accounting_add(&psig->ioac, &p->ioac);
1300
task_io_accounting_add(&psig->ioac, &sig->ioac);
1301
spin_unlock_irq(&p->real_parent->sighand->siglock);
1302
}
1303
1304
/*
1305
* Now we are sure this task is interesting, and no other
1306
* thread can reap it because we set its state to EXIT_DEAD.
1307
*/
1308
read_unlock(&tasklist_lock);
1309
1310
retval = wo->wo_rusage
1311
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1312
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1313
? p->signal->group_exit_code : p->exit_code;
1314
if (!retval && wo->wo_stat)
1315
retval = put_user(status, wo->wo_stat);
1316
1317
infop = wo->wo_info;
1318
if (!retval && infop)
1319
retval = put_user(SIGCHLD, &infop->si_signo);
1320
if (!retval && infop)
1321
retval = put_user(0, &infop->si_errno);
1322
if (!retval && infop) {
1323
int why;
1324
1325
if ((status & 0x7f) == 0) {
1326
why = CLD_EXITED;
1327
status >>= 8;
1328
} else {
1329
why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1330
status &= 0x7f;
1331
}
1332
retval = put_user((short)why, &infop->si_code);
1333
if (!retval)
1334
retval = put_user(status, &infop->si_status);
1335
}
1336
if (!retval && infop)
1337
retval = put_user(pid, &infop->si_pid);
1338
if (!retval && infop)
1339
retval = put_user(uid, &infop->si_uid);
1340
if (!retval)
1341
retval = pid;
1342
1343
if (traced) {
1344
write_lock_irq(&tasklist_lock);
1345
/* We dropped tasklist, ptracer could die and untrace */
1346
ptrace_unlink(p);
1347
/*
1348
* If this is not a detached task, notify the parent.
1349
* If it's still not detached after that, don't release
1350
* it now.
1351
*/
1352
if (!task_detached(p)) {
1353
do_notify_parent(p, p->exit_signal);
1354
if (!task_detached(p)) {
1355
p->exit_state = EXIT_ZOMBIE;
1356
p = NULL;
1357
}
1358
}
1359
write_unlock_irq(&tasklist_lock);
1360
}
1361
if (p != NULL)
1362
release_task(p);
1363
1364
return retval;
1365
}
1366
1367
static int *task_stopped_code(struct task_struct *p, bool ptrace)
1368
{
1369
if (ptrace) {
1370
if (task_is_stopped_or_traced(p))
1371
return &p->exit_code;
1372
} else {
1373
if (p->signal->flags & SIGNAL_STOP_STOPPED)
1374
return &p->signal->group_exit_code;
1375
}
1376
return NULL;
1377
}
1378
1379
/**
1380
* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1381
* @wo: wait options
1382
* @ptrace: is the wait for ptrace
1383
* @p: task to wait for
1384
*
1385
* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1386
*
1387
* CONTEXT:
1388
* read_lock(&tasklist_lock), which is released if return value is
1389
* non-zero. Also, grabs and releases @p->sighand->siglock.
1390
*
1391
* RETURNS:
1392
* 0 if wait condition didn't exist and search for other wait conditions
1393
* should continue. Non-zero return, -errno on failure and @p's pid on
1394
* success, implies that tasklist_lock is released and wait condition
1395
* search should terminate.
1396
*/
1397
static int wait_task_stopped(struct wait_opts *wo,
1398
int ptrace, struct task_struct *p)
1399
{
1400
struct siginfo __user *infop;
1401
int retval, exit_code, *p_code, why;
1402
uid_t uid = 0; /* unneeded, required by compiler */
1403
pid_t pid;
1404
1405
/*
1406
* Traditionally we see ptrace'd stopped tasks regardless of options.
1407
*/
1408
if (!ptrace && !(wo->wo_flags & WUNTRACED))
1409
return 0;
1410
1411
if (!task_stopped_code(p, ptrace))
1412
return 0;
1413
1414
exit_code = 0;
1415
spin_lock_irq(&p->sighand->siglock);
1416
1417
p_code = task_stopped_code(p, ptrace);
1418
if (unlikely(!p_code))
1419
goto unlock_sig;
1420
1421
exit_code = *p_code;
1422
if (!exit_code)
1423
goto unlock_sig;
1424
1425
if (!unlikely(wo->wo_flags & WNOWAIT))
1426
*p_code = 0;
1427
1428
uid = task_uid(p);
1429
unlock_sig:
1430
spin_unlock_irq(&p->sighand->siglock);
1431
if (!exit_code)
1432
return 0;
1433
1434
/*
1435
* Now we are pretty sure this task is interesting.
1436
* Make sure it doesn't get reaped out from under us while we
1437
* give up the lock and then examine it below. We don't want to
1438
* keep holding onto the tasklist_lock while we call getrusage and
1439
* possibly take page faults for user memory.
1440
*/
1441
get_task_struct(p);
1442
pid = task_pid_vnr(p);
1443
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1444
read_unlock(&tasklist_lock);
1445
1446
if (unlikely(wo->wo_flags & WNOWAIT))
1447
return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1448
1449
retval = wo->wo_rusage
1450
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1451
if (!retval && wo->wo_stat)
1452
retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1453
1454
infop = wo->wo_info;
1455
if (!retval && infop)
1456
retval = put_user(SIGCHLD, &infop->si_signo);
1457
if (!retval && infop)
1458
retval = put_user(0, &infop->si_errno);
1459
if (!retval && infop)
1460
retval = put_user((short)why, &infop->si_code);
1461
if (!retval && infop)
1462
retval = put_user(exit_code, &infop->si_status);
1463
if (!retval && infop)
1464
retval = put_user(pid, &infop->si_pid);
1465
if (!retval && infop)
1466
retval = put_user(uid, &infop->si_uid);
1467
if (!retval)
1468
retval = pid;
1469
put_task_struct(p);
1470
1471
BUG_ON(!retval);
1472
return retval;
1473
}
1474
1475
/*
1476
* Handle do_wait work for one task in a live, non-stopped state.
1477
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1478
* the lock and this task is uninteresting. If we return nonzero, we have
1479
* released the lock and the system call should return.
1480
*/
1481
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1482
{
1483
int retval;
1484
pid_t pid;
1485
uid_t uid;
1486
1487
if (!unlikely(wo->wo_flags & WCONTINUED))
1488
return 0;
1489
1490
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1491
return 0;
1492
1493
spin_lock_irq(&p->sighand->siglock);
1494
/* Re-check with the lock held. */
1495
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1496
spin_unlock_irq(&p->sighand->siglock);
1497
return 0;
1498
}
1499
if (!unlikely(wo->wo_flags & WNOWAIT))
1500
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1501
uid = task_uid(p);
1502
spin_unlock_irq(&p->sighand->siglock);
1503
1504
pid = task_pid_vnr(p);
1505
get_task_struct(p);
1506
read_unlock(&tasklist_lock);
1507
1508
if (!wo->wo_info) {
1509
retval = wo->wo_rusage
1510
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1511
put_task_struct(p);
1512
if (!retval && wo->wo_stat)
1513
retval = put_user(0xffff, wo->wo_stat);
1514
if (!retval)
1515
retval = pid;
1516
} else {
1517
retval = wait_noreap_copyout(wo, p, pid, uid,
1518
CLD_CONTINUED, SIGCONT);
1519
BUG_ON(retval == 0);
1520
}
1521
1522
return retval;
1523
}
1524
1525
/*
1526
* Consider @p for a wait by @parent.
1527
*
1528
* -ECHILD should be in ->notask_error before the first call.
1529
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
1530
* Returns zero if the search for a child should continue;
1531
* then ->notask_error is 0 if @p is an eligible child,
1532
* or another error from security_task_wait(), or still -ECHILD.
1533
*/
1534
static int wait_consider_task(struct wait_opts *wo, int ptrace,
1535
struct task_struct *p)
1536
{
1537
int ret = eligible_child(wo, p);
1538
if (!ret)
1539
return ret;
1540
1541
ret = security_task_wait(p);
1542
if (unlikely(ret < 0)) {
1543
/*
1544
* If we have not yet seen any eligible child,
1545
* then let this error code replace -ECHILD.
1546
* A permission error will give the user a clue
1547
* to look for security policy problems, rather
1548
* than for mysterious wait bugs.
1549
*/
1550
if (wo->notask_error)
1551
wo->notask_error = ret;
1552
return 0;
1553
}
1554
1555
/* dead body doesn't have much to contribute */
1556
if (p->exit_state == EXIT_DEAD)
1557
return 0;
1558
1559
/* slay zombie? */
1560
if (p->exit_state == EXIT_ZOMBIE) {
1561
/*
1562
* A zombie ptracee is only visible to its ptracer.
1563
* Notification and reaping will be cascaded to the real
1564
* parent when the ptracer detaches.
1565
*/
1566
if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1567
/* it will become visible, clear notask_error */
1568
wo->notask_error = 0;
1569
return 0;
1570
}
1571
1572
/* we don't reap group leaders with subthreads */
1573
if (!delay_group_leader(p))
1574
return wait_task_zombie(wo, p);
1575
1576
/*
1577
* Allow access to stopped/continued state via zombie by
1578
* falling through. Clearing of notask_error is complex.
1579
*
1580
* When !@ptrace:
1581
*
1582
* If WEXITED is set, notask_error should naturally be
1583
* cleared. If not, a subset of WSTOPPED|WCONTINUED is set,
1584
* so, if there are live subthreads, there are events to
1585
* wait for. If all subthreads are dead, it's still safe
1586
* to clear - this function will be called again in a finite
1587
* amount of time once all the subthreads are released and
1588
* will then return without clearing.
1589
*
1590
* When @ptrace:
1591
*
1592
* Stopped state is per-task and thus can't change once the
1593
* target task dies. Only continued and exited can happen.
1594
* Clear notask_error if WCONTINUED | WEXITED.
1595
*/
1596
if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1597
wo->notask_error = 0;
1598
} else {
1599
/*
1600
* If @p is ptraced by a task in its real parent's group,
1601
* hide group stop/continued state when looking at @p as
1602
* the real parent; otherwise, a single stop can be
1603
* reported twice as group and ptrace stops.
1604
*
1605
* If a ptracer wants to distinguish the two events for its
1606
* own children, it should create a separate process which
1607
* takes the role of real parent.
1608
*/
1609
if (likely(!ptrace) && task_ptrace(p) &&
1610
same_thread_group(p->parent, p->real_parent))
1611
return 0;
1612
1613
/*
1614
* @p is alive and it's going to stop, continue or exit, so
1615
* there always is something to wait for.
1616
*/
1617
wo->notask_error = 0;
1618
}
1619
1620
/*
1621
* Wait for stopped. Depending on @ptrace, different stopped state
1622
* is used and the two don't interact with each other.
1623
*/
1624
ret = wait_task_stopped(wo, ptrace, p);
1625
if (ret)
1626
return ret;
1627
1628
/*
1629
* Wait for continued. There's only one continued state and the
1630
* ptracer can consume it, which can confuse the real parent. Don't
1631
* use WCONTINUED from ptracer. You don't need or want it.
1632
*/
1633
return wait_task_continued(wo, p);
1634
}
1635
1636
/*
1637
* Do the work of do_wait() for one thread in the group, @tsk.
1638
*
1639
* -ECHILD should be in ->notask_error before the first call.
1640
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
1641
* Returns zero if the search for a child should continue; then
1642
* ->notask_error is 0 if there were any eligible children,
1643
* or another error from security_task_wait(), or still -ECHILD.
1644
*/
1645
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1646
{
1647
struct task_struct *p;
1648
1649
list_for_each_entry(p, &tsk->children, sibling) {
1650
int ret = wait_consider_task(wo, 0, p);
1651
if (ret)
1652
return ret;
1653
}
1654
1655
return 0;
1656
}
1657
1658
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1659
{
1660
struct task_struct *p;
1661
1662
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1663
int ret = wait_consider_task(wo, 1, p);
1664
if (ret)
1665
return ret;
1666
}
1667
1668
return 0;
1669
}
1670
1671
static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1672
int sync, void *key)
1673
{
1674
struct wait_opts *wo = container_of(wait, struct wait_opts,
1675
child_wait);
1676
struct task_struct *p = key;
1677
1678
if (!eligible_pid(wo, p))
1679
return 0;
1680
1681
if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1682
return 0;
1683
1684
return default_wake_function(wait, mode, sync, key);
1685
}
1686
1687
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1688
{
1689
__wake_up_sync_key(&parent->signal->wait_chldexit,
1690
TASK_INTERRUPTIBLE, 1, p);
1691
}
1692
1693
static long do_wait(struct wait_opts *wo)
1694
{
1695
struct task_struct *tsk;
1696
int retval;
1697
1698
trace_sched_process_wait(wo->wo_pid);
1699
1700
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1701
wo->child_wait.private = current;
1702
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1703
repeat:
1704
/*
1705
* If there is nothing that can match our criteria, just get out.
1706
* We will clear ->notask_error to zero if we see any child that
1707
* might later match our criteria, even if we are not able to reap
1708
* it yet.
1709
*/
1710
wo->notask_error = -ECHILD;
1711
if ((wo->wo_type < PIDTYPE_MAX) &&
1712
(!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1713
goto notask;
1714
1715
set_current_state(TASK_INTERRUPTIBLE);
1716
read_lock(&tasklist_lock);
1717
tsk = current;
1718
do {
1719
retval = do_wait_thread(wo, tsk);
1720
if (retval)
1721
goto end;
1722
1723
retval = ptrace_do_wait(wo, tsk);
1724
if (retval)
1725
goto end;
1726
1727
if (wo->wo_flags & __WNOTHREAD)
1728
break;
1729
} while_each_thread(current, tsk);
1730
read_unlock(&tasklist_lock);
1731
1732
notask:
1733
retval = wo->notask_error;
1734
if (!retval && !(wo->wo_flags & WNOHANG)) {
1735
retval = -ERESTARTSYS;
1736
if (!signal_pending(current)) {
1737
schedule();
1738
goto repeat;
1739
}
1740
}
1741
end:
1742
__set_current_state(TASK_RUNNING);
1743
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1744
return retval;
1745
}
1746
1747
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1748
infop, int, options, struct rusage __user *, ru)
1749
{
1750
struct wait_opts wo;
1751
struct pid *pid = NULL;
1752
enum pid_type type;
1753
long ret;
1754
1755
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1756
return -EINVAL;
1757
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1758
return -EINVAL;
1759
1760
switch (which) {
1761
case P_ALL:
1762
type = PIDTYPE_MAX;
1763
break;
1764
case P_PID:
1765
type = PIDTYPE_PID;
1766
if (upid <= 0)
1767
return -EINVAL;
1768
break;
1769
case P_PGID:
1770
type = PIDTYPE_PGID;
1771
if (upid <= 0)
1772
return -EINVAL;
1773
break;
1774
default:
1775
return -EINVAL;
1776
}
1777
1778
if (type < PIDTYPE_MAX)
1779
pid = find_get_pid(upid);
1780
1781
wo.wo_type = type;
1782
wo.wo_pid = pid;
1783
wo.wo_flags = options;
1784
wo.wo_info = infop;
1785
wo.wo_stat = NULL;
1786
wo.wo_rusage = ru;
1787
ret = do_wait(&wo);
1788
1789
if (ret > 0) {
1790
ret = 0;
1791
} else if (infop) {
1792
/*
1793
* For a WNOHANG return, clear out all the fields
1794
* we would set so the user can easily tell the
1795
* difference.
1796
*/
1797
if (!ret)
1798
ret = put_user(0, &infop->si_signo);
1799
if (!ret)
1800
ret = put_user(0, &infop->si_errno);
1801
if (!ret)
1802
ret = put_user(0, &infop->si_code);
1803
if (!ret)
1804
ret = put_user(0, &infop->si_pid);
1805
if (!ret)
1806
ret = put_user(0, &infop->si_uid);
1807
if (!ret)
1808
ret = put_user(0, &infop->si_status);
1809
}
1810
1811
put_pid(pid);
1812
1813
/* avoid REGPARM breakage on x86: */
1814
asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1815
return ret;
1816
}
1817
1818
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1819
int, options, struct rusage __user *, ru)
1820
{
1821
struct wait_opts wo;
1822
struct pid *pid = NULL;
1823
enum pid_type type;
1824
long ret;
1825
1826
if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1827
__WNOTHREAD|__WCLONE|__WALL))
1828
return -EINVAL;
1829
1830
if (upid == -1)
1831
type = PIDTYPE_MAX;
1832
else if (upid < 0) {
1833
type = PIDTYPE_PGID;
1834
pid = find_get_pid(-upid);
1835
} else if (upid == 0) {
1836
type = PIDTYPE_PGID;
1837
pid = get_task_pid(current, PIDTYPE_PGID);
1838
} else /* upid > 0 */ {
1839
type = PIDTYPE_PID;
1840
pid = find_get_pid(upid);
1841
}
1842
1843
wo.wo_type = type;
1844
wo.wo_pid = pid;
1845
wo.wo_flags = options | WEXITED;
1846
wo.wo_info = NULL;
1847
wo.wo_stat = stat_addr;
1848
wo.wo_rusage = ru;
1849
ret = do_wait(&wo);
1850
put_pid(pid);
1851
1852
/* avoid REGPARM breakage on x86: */
1853
asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1854
return ret;
1855
}
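/*
 * A minimal userspace sketch (assumes glibc wrappers and <sys/wait.h>;
 * none of this is part of kernel/exit.c): it shows how the status word
 * filled in by wait_task_zombie() above is consumed by a parent. glibc's
 * waitpid() is typically implemented on top of this wait4 syscall.
 *
 *	#include <stdio.h>
 *	#include <sys/wait.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int status;
 *		pid_t child = fork();
 *
 *		if (child == 0)
 *			_exit(7);	// stored as (7 & 0xff) << 8 in ->exit_code
 *		waitpid(child, &status, 0);	// enters sys_wait4() -> do_wait()
 *		if (WIFEXITED(status))		// low 7 bits zero => normal exit
 *			printf("child exited with %d\n", WEXITSTATUS(status));	// prints 7
 *		return 0;
 *	}
 */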
1856
1857
#ifdef __ARCH_WANT_SYS_WAITPID
1858
1859
/*
1860
* sys_waitpid() remains for compatibility. waitpid() should be
1861
* implemented by calling sys_wait4() from libc.a.
1862
*/
1863
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1864
{
1865
return sys_wait4(pid, stat_addr, options, NULL);
1866
}
1867
1868
#endif
1869
1870