Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/kernel/fork.c
26245 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* linux/kernel/fork.c
4
*
5
* Copyright (C) 1991, 1992 Linus Torvalds
6
*/
7
8
/*
9
* 'fork.c' contains the help-routines for the 'fork' system call
10
* (see also entry.S and others).
11
* Fork is rather simple, once you get the hang of it, but the memory
12
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
13
*/
14
15
#include <linux/anon_inodes.h>
16
#include <linux/slab.h>
17
#include <linux/sched/autogroup.h>
18
#include <linux/sched/mm.h>
19
#include <linux/sched/user.h>
20
#include <linux/sched/numa_balancing.h>
21
#include <linux/sched/stat.h>
22
#include <linux/sched/task.h>
23
#include <linux/sched/task_stack.h>
24
#include <linux/sched/cputime.h>
25
#include <linux/sched/ext.h>
26
#include <linux/seq_file.h>
27
#include <linux/rtmutex.h>
28
#include <linux/init.h>
29
#include <linux/unistd.h>
30
#include <linux/module.h>
31
#include <linux/vmalloc.h>
32
#include <linux/completion.h>
33
#include <linux/personality.h>
34
#include <linux/mempolicy.h>
35
#include <linux/sem.h>
36
#include <linux/file.h>
37
#include <linux/fdtable.h>
38
#include <linux/iocontext.h>
39
#include <linux/key.h>
40
#include <linux/kmsan.h>
41
#include <linux/binfmts.h>
42
#include <linux/mman.h>
43
#include <linux/mmu_notifier.h>
44
#include <linux/fs.h>
45
#include <linux/mm.h>
46
#include <linux/mm_inline.h>
47
#include <linux/memblock.h>
48
#include <linux/nsproxy.h>
49
#include <linux/capability.h>
50
#include <linux/cpu.h>
51
#include <linux/cgroup.h>
52
#include <linux/security.h>
53
#include <linux/hugetlb.h>
54
#include <linux/seccomp.h>
55
#include <linux/swap.h>
56
#include <linux/syscalls.h>
57
#include <linux/syscall_user_dispatch.h>
58
#include <linux/jiffies.h>
59
#include <linux/futex.h>
60
#include <linux/compat.h>
61
#include <linux/kthread.h>
62
#include <linux/task_io_accounting_ops.h>
63
#include <linux/rcupdate.h>
64
#include <linux/ptrace.h>
65
#include <linux/mount.h>
66
#include <linux/audit.h>
67
#include <linux/memcontrol.h>
68
#include <linux/ftrace.h>
69
#include <linux/proc_fs.h>
70
#include <linux/profile.h>
71
#include <linux/rmap.h>
72
#include <linux/ksm.h>
73
#include <linux/acct.h>
74
#include <linux/userfaultfd_k.h>
75
#include <linux/tsacct_kern.h>
76
#include <linux/cn_proc.h>
77
#include <linux/freezer.h>
78
#include <linux/delayacct.h>
79
#include <linux/taskstats_kern.h>
80
#include <linux/tty.h>
81
#include <linux/fs_struct.h>
82
#include <linux/magic.h>
83
#include <linux/perf_event.h>
84
#include <linux/posix-timers.h>
85
#include <linux/user-return-notifier.h>
86
#include <linux/oom.h>
87
#include <linux/khugepaged.h>
88
#include <linux/signalfd.h>
89
#include <linux/uprobes.h>
90
#include <linux/aio.h>
91
#include <linux/compiler.h>
92
#include <linux/sysctl.h>
93
#include <linux/kcov.h>
94
#include <linux/livepatch.h>
95
#include <linux/thread_info.h>
96
#include <linux/kstack_erase.h>
97
#include <linux/kasan.h>
98
#include <linux/scs.h>
99
#include <linux/io_uring.h>
100
#include <linux/bpf.h>
101
#include <linux/stackprotector.h>
102
#include <linux/user_events.h>
103
#include <linux/iommu.h>
104
#include <linux/rseq.h>
105
#include <uapi/linux/pidfd.h>
106
#include <linux/pidfs.h>
107
#include <linux/tick.h>
108
#include <linux/unwind_deferred.h>
109
110
#include <asm/pgalloc.h>
111
#include <linux/uaccess.h>
112
#include <asm/mmu_context.h>
113
#include <asm/cacheflush.h>
114
#include <asm/tlbflush.h>
115
116
/* For dup_mmap(). */
117
#include "../mm/internal.h"
118
119
#include <trace/events/sched.h>
120
121
#define CREATE_TRACE_POINTS
122
#include <trace/events/task.h>
123
124
#include <kunit/visibility.h>
125
126
/*
127
* Minimum number of threads to boot the kernel
128
*/
129
#define MIN_THREADS 20
130
131
/*
132
* Maximum number of threads
133
*/
134
#define MAX_THREADS FUTEX_TID_MASK
135
136
/*
137
* Protected counters by write_lock_irq(&tasklist_lock)
138
*/
139
unsigned long total_forks; /* Handle normal Linux uptimes. */
140
int nr_threads; /* The idle threads do not count.. */
141
142
static int max_threads; /* tunable limit on nr_threads */
143
144
#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
145
146
static const char * const resident_page_types[] = {
147
NAMED_ARRAY_INDEX(MM_FILEPAGES),
148
NAMED_ARRAY_INDEX(MM_ANONPAGES),
149
NAMED_ARRAY_INDEX(MM_SWAPENTS),
150
NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
151
};
152
153
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
154
155
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
156
157
#ifdef CONFIG_PROVE_RCU
158
int lockdep_tasklist_lock_is_held(void)
159
{
160
return lockdep_is_held(&tasklist_lock);
161
}
162
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
163
#endif /* #ifdef CONFIG_PROVE_RCU */
164
165
int nr_processes(void)
166
{
167
int cpu;
168
int total = 0;
169
170
for_each_possible_cpu(cpu)
171
total += per_cpu(process_counts, cpu);
172
173
return total;
174
}
175
176
void __weak arch_release_task_struct(struct task_struct *tsk)
177
{
178
}
179
180
static struct kmem_cache *task_struct_cachep;
181
182
static inline struct task_struct *alloc_task_struct_node(int node)
183
{
184
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
185
}
186
187
static inline void free_task_struct(struct task_struct *tsk)
188
{
189
kmem_cache_free(task_struct_cachep, tsk);
190
}
191
192
#ifdef CONFIG_VMAP_STACK
193
/*
194
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
195
* flush. Try to minimize the number of calls by caching stacks.
196
*/
197
#define NR_CACHED_STACKS 2
198
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
199
/*
200
* Allocated stacks are cached and later reused by new threads, so memcg
201
* accounting is performed by the code assigning/releasing stacks to tasks.
202
* We need a zeroed memory without __GFP_ACCOUNT.
203
*/
204
#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
205
206
struct vm_stack {
207
struct rcu_head rcu;
208
struct vm_struct *stack_vm_area;
209
};
210
211
static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
212
{
213
unsigned int i;
214
215
for (i = 0; i < NR_CACHED_STACKS; i++) {
216
struct vm_struct *tmp = NULL;
217
218
if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
219
return true;
220
}
221
return false;
222
}
223
224
static void thread_stack_free_rcu(struct rcu_head *rh)
225
{
226
struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
227
struct vm_struct *vm_area = vm_stack->stack_vm_area;
228
229
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
230
return;
231
232
vfree(vm_area->addr);
233
}
234
235
static void thread_stack_delayed_free(struct task_struct *tsk)
236
{
237
struct vm_stack *vm_stack = tsk->stack;
238
239
vm_stack->stack_vm_area = tsk->stack_vm_area;
240
call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
241
}
242
243
static int free_vm_stack_cache(unsigned int cpu)
244
{
245
struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
246
int i;
247
248
for (i = 0; i < NR_CACHED_STACKS; i++) {
249
struct vm_struct *vm_area = cached_vm_stack_areas[i];
250
251
if (!vm_area)
252
continue;
253
254
vfree(vm_area->addr);
255
cached_vm_stack_areas[i] = NULL;
256
}
257
258
return 0;
259
}
260
261
static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
262
{
263
int i;
264
int ret;
265
int nr_charged = 0;
266
267
BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
268
269
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
270
ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
271
if (ret)
272
goto err;
273
nr_charged++;
274
}
275
return 0;
276
err:
277
for (i = 0; i < nr_charged; i++)
278
memcg_kmem_uncharge_page(vm_area->pages[i], 0);
279
return ret;
280
}
281
282
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
283
{
284
struct vm_struct *vm_area;
285
void *stack;
286
int i;
287
288
for (i = 0; i < NR_CACHED_STACKS; i++) {
289
vm_area = this_cpu_xchg(cached_stacks[i], NULL);
290
if (!vm_area)
291
continue;
292
293
/* Reset stack metadata. */
294
kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
295
296
stack = kasan_reset_tag(vm_area->addr);
297
298
/* Clear stale pointers from reused stack. */
299
memset(stack, 0, THREAD_SIZE);
300
301
if (memcg_charge_kernel_stack(vm_area)) {
302
vfree(vm_area->addr);
303
return -ENOMEM;
304
}
305
306
tsk->stack_vm_area = vm_area;
307
tsk->stack = stack;
308
return 0;
309
}
310
311
stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
312
GFP_VMAP_STACK,
313
node, __builtin_return_address(0));
314
if (!stack)
315
return -ENOMEM;
316
317
vm_area = find_vm_area(stack);
318
if (memcg_charge_kernel_stack(vm_area)) {
319
vfree(stack);
320
return -ENOMEM;
321
}
322
/*
323
* We can't call find_vm_area() in interrupt context, and
324
* free_thread_stack() can be called in interrupt context,
325
* so cache the vm_struct.
326
*/
327
tsk->stack_vm_area = vm_area;
328
stack = kasan_reset_tag(stack);
329
tsk->stack = stack;
330
return 0;
331
}
332
333
static void free_thread_stack(struct task_struct *tsk)
334
{
335
if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
336
thread_stack_delayed_free(tsk);
337
338
tsk->stack = NULL;
339
tsk->stack_vm_area = NULL;
340
}
341
342
#else /* !CONFIG_VMAP_STACK */
343
344
/*
345
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
346
* kmemcache based allocator.
347
*/
348
#if THREAD_SIZE >= PAGE_SIZE
349
350
static void thread_stack_free_rcu(struct rcu_head *rh)
351
{
352
__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
353
}
354
355
static void thread_stack_delayed_free(struct task_struct *tsk)
356
{
357
struct rcu_head *rh = tsk->stack;
358
359
call_rcu(rh, thread_stack_free_rcu);
360
}
361
362
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
363
{
364
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
365
THREAD_SIZE_ORDER);
366
367
if (likely(page)) {
368
tsk->stack = kasan_reset_tag(page_address(page));
369
return 0;
370
}
371
return -ENOMEM;
372
}
373
374
static void free_thread_stack(struct task_struct *tsk)
375
{
376
thread_stack_delayed_free(tsk);
377
tsk->stack = NULL;
378
}
379
380
#else /* !(THREAD_SIZE >= PAGE_SIZE) */
381
382
static struct kmem_cache *thread_stack_cache;
383
384
static void thread_stack_free_rcu(struct rcu_head *rh)
385
{
386
kmem_cache_free(thread_stack_cache, rh);
387
}
388
389
static void thread_stack_delayed_free(struct task_struct *tsk)
390
{
391
struct rcu_head *rh = tsk->stack;
392
393
call_rcu(rh, thread_stack_free_rcu);
394
}
395
396
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
397
{
398
unsigned long *stack;
399
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
400
stack = kasan_reset_tag(stack);
401
tsk->stack = stack;
402
return stack ? 0 : -ENOMEM;
403
}
404
405
static void free_thread_stack(struct task_struct *tsk)
406
{
407
thread_stack_delayed_free(tsk);
408
tsk->stack = NULL;
409
}
410
411
void thread_stack_cache_init(void)
412
{
413
thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
414
THREAD_SIZE, THREAD_SIZE, 0, 0,
415
THREAD_SIZE, NULL);
416
BUG_ON(thread_stack_cache == NULL);
417
}
418
419
#endif /* THREAD_SIZE >= PAGE_SIZE */
420
#endif /* CONFIG_VMAP_STACK */
421
422
/* SLAB cache for signal_struct structures (tsk->signal) */
423
static struct kmem_cache *signal_cachep;
424
425
/* SLAB cache for sighand_struct structures (tsk->sighand) */
426
struct kmem_cache *sighand_cachep;
427
428
/* SLAB cache for files_struct structures (tsk->files) */
429
struct kmem_cache *files_cachep;
430
431
/* SLAB cache for fs_struct structures (tsk->fs) */
432
struct kmem_cache *fs_cachep;
433
434
/* SLAB cache for mm_struct structures (tsk->mm) */
435
static struct kmem_cache *mm_cachep;
436
437
static void account_kernel_stack(struct task_struct *tsk, int account)
438
{
439
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
440
struct vm_struct *vm_area = task_stack_vm_area(tsk);
441
int i;
442
443
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
444
mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
445
account * (PAGE_SIZE / 1024));
446
} else {
447
void *stack = task_stack_page(tsk);
448
449
/* All stack pages are in the same node. */
450
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
451
account * (THREAD_SIZE / 1024));
452
}
453
}
454
455
void exit_task_stack_account(struct task_struct *tsk)
456
{
457
account_kernel_stack(tsk, -1);
458
459
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
460
struct vm_struct *vm_area;
461
int i;
462
463
vm_area = task_stack_vm_area(tsk);
464
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
465
memcg_kmem_uncharge_page(vm_area->pages[i], 0);
466
}
467
}
468
469
static void release_task_stack(struct task_struct *tsk)
470
{
471
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
472
return; /* Better to leak the stack than to free prematurely */
473
474
free_thread_stack(tsk);
475
}
476
477
#ifdef CONFIG_THREAD_INFO_IN_TASK
478
void put_task_stack(struct task_struct *tsk)
479
{
480
if (refcount_dec_and_test(&tsk->stack_refcount))
481
release_task_stack(tsk);
482
}
483
#endif
484
485
void free_task(struct task_struct *tsk)
486
{
487
#ifdef CONFIG_SECCOMP
488
WARN_ON_ONCE(tsk->seccomp.filter);
489
#endif
490
release_user_cpus_ptr(tsk);
491
scs_release(tsk);
492
493
#ifndef CONFIG_THREAD_INFO_IN_TASK
494
/*
495
* The task is finally done with both the stack and thread_info,
496
* so free both.
497
*/
498
release_task_stack(tsk);
499
#else
500
/*
501
* If the task had a separate stack allocation, it should be gone
502
* by now.
503
*/
504
WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
505
#endif
506
rt_mutex_debug_task_free(tsk);
507
ftrace_graph_exit_task(tsk);
508
arch_release_task_struct(tsk);
509
if (tsk->flags & PF_KTHREAD)
510
free_kthread_struct(tsk);
511
bpf_task_storage_free(tsk);
512
free_task_struct(tsk);
513
}
514
EXPORT_SYMBOL(free_task);
515
516
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
517
{
518
struct file *exe_file;
519
520
exe_file = get_mm_exe_file(oldmm);
521
RCU_INIT_POINTER(mm->exe_file, exe_file);
522
/*
523
* We depend on the oldmm having properly denied write access to the
524
* exe_file already.
525
*/
526
if (exe_file && exe_file_deny_write_access(exe_file))
527
pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
528
}
529
530
#ifdef CONFIG_MMU
531
static inline int mm_alloc_pgd(struct mm_struct *mm)
532
{
533
mm->pgd = pgd_alloc(mm);
534
if (unlikely(!mm->pgd))
535
return -ENOMEM;
536
return 0;
537
}
538
539
static inline void mm_free_pgd(struct mm_struct *mm)
540
{
541
pgd_free(mm, mm->pgd);
542
}
543
#else
544
#define mm_alloc_pgd(mm) (0)
545
#define mm_free_pgd(mm)
546
#endif /* CONFIG_MMU */
547
548
#ifdef CONFIG_MM_ID
549
static DEFINE_IDA(mm_ida);
550
551
static inline int mm_alloc_id(struct mm_struct *mm)
552
{
553
int ret;
554
555
ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
556
if (ret < 0)
557
return ret;
558
mm->mm_id = ret;
559
return 0;
560
}
561
562
static inline void mm_free_id(struct mm_struct *mm)
563
{
564
const mm_id_t id = mm->mm_id;
565
566
mm->mm_id = MM_ID_DUMMY;
567
if (id == MM_ID_DUMMY)
568
return;
569
if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
570
return;
571
ida_free(&mm_ida, id);
572
}
573
#else /* !CONFIG_MM_ID */
574
static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
575
static inline void mm_free_id(struct mm_struct *mm) {}
576
#endif /* CONFIG_MM_ID */
577
578
static void check_mm(struct mm_struct *mm)
579
{
580
int i;
581
582
BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
583
"Please make sure 'struct resident_page_types[]' is updated as well");
584
585
for (i = 0; i < NR_MM_COUNTERS; i++) {
586
long x = percpu_counter_sum(&mm->rss_stat[i]);
587
588
if (unlikely(x)) {
589
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
590
mm, resident_page_types[i], x,
591
current->comm,
592
task_pid_nr(current));
593
}
594
}
595
596
if (mm_pgtables_bytes(mm))
597
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
598
mm_pgtables_bytes(mm));
599
600
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
601
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
602
#endif
603
}
604
605
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
606
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
607
608
static void do_check_lazy_tlb(void *arg)
609
{
610
struct mm_struct *mm = arg;
611
612
WARN_ON_ONCE(current->active_mm == mm);
613
}
614
615
static void do_shoot_lazy_tlb(void *arg)
616
{
617
struct mm_struct *mm = arg;
618
619
if (current->active_mm == mm) {
620
WARN_ON_ONCE(current->mm);
621
current->active_mm = &init_mm;
622
switch_mm(mm, &init_mm, current);
623
}
624
}
625
626
static void cleanup_lazy_tlbs(struct mm_struct *mm)
627
{
628
if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
629
/*
630
* In this case, lazy tlb mms are refounted and would not reach
631
* __mmdrop until all CPUs have switched away and mmdrop()ed.
632
*/
633
return;
634
}
635
636
/*
637
* Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
638
* requires lazy mm users to switch to another mm when the refcount
639
* drops to zero, before the mm is freed. This requires IPIs here to
640
* switch kernel threads to init_mm.
641
*
642
* archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
643
* switch with the final userspace teardown TLB flush which leaves the
644
* mm lazy on this CPU but no others, reducing the need for additional
645
* IPIs here. There are cases where a final IPI is still required here,
646
* such as the final mmdrop being performed on a different CPU than the
647
* one exiting, or kernel threads using the mm when userspace exits.
648
*
649
* IPI overheads have not found to be expensive, but they could be
650
* reduced in a number of possible ways, for example (roughly
651
* increasing order of complexity):
652
* - The last lazy reference created by exit_mm() could instead switch
653
* to init_mm, however it's probable this will run on the same CPU
654
* immediately afterwards, so this may not reduce IPIs much.
655
* - A batch of mms requiring IPIs could be gathered and freed at once.
656
* - CPUs store active_mm where it can be remotely checked without a
657
* lock, to filter out false-positives in the cpumask.
658
* - After mm_users or mm_count reaches zero, switching away from the
659
* mm could clear mm_cpumask to reduce some IPIs, perhaps together
660
* with some batching or delaying of the final IPIs.
661
* - A delayed freeing and RCU-like quiescing sequence based on mm
662
* switching to avoid IPIs completely.
663
*/
664
on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
665
if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
666
on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
667
}
668
669
/*
670
* Called when the last reference to the mm
671
* is dropped: either by a lazy thread or by
672
* mmput. Free the page directory and the mm.
673
*/
674
void __mmdrop(struct mm_struct *mm)
675
{
676
BUG_ON(mm == &init_mm);
677
WARN_ON_ONCE(mm == current->mm);
678
679
/* Ensure no CPUs are using this as their lazy tlb mm */
680
cleanup_lazy_tlbs(mm);
681
682
WARN_ON_ONCE(mm == current->active_mm);
683
mm_free_pgd(mm);
684
mm_free_id(mm);
685
destroy_context(mm);
686
mmu_notifier_subscriptions_destroy(mm);
687
check_mm(mm);
688
put_user_ns(mm->user_ns);
689
mm_pasid_drop(mm);
690
mm_destroy_cid(mm);
691
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
692
futex_hash_free(mm);
693
694
free_mm(mm);
695
}
696
EXPORT_SYMBOL_GPL(__mmdrop);
697
698
static void mmdrop_async_fn(struct work_struct *work)
699
{
700
struct mm_struct *mm;
701
702
mm = container_of(work, struct mm_struct, async_put_work);
703
__mmdrop(mm);
704
}
705
706
static void mmdrop_async(struct mm_struct *mm)
707
{
708
if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
709
INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
710
schedule_work(&mm->async_put_work);
711
}
712
}
713
714
static inline void free_signal_struct(struct signal_struct *sig)
715
{
716
taskstats_tgid_free(sig);
717
sched_autogroup_exit(sig);
718
/*
719
* __mmdrop is not safe to call from softirq context on x86 due to
720
* pgd_dtor so postpone it to the async context
721
*/
722
if (sig->oom_mm)
723
mmdrop_async(sig->oom_mm);
724
kmem_cache_free(signal_cachep, sig);
725
}
726
727
static inline void put_signal_struct(struct signal_struct *sig)
728
{
729
if (refcount_dec_and_test(&sig->sigcnt))
730
free_signal_struct(sig);
731
}
732
733
void __put_task_struct(struct task_struct *tsk)
734
{
735
WARN_ON(!tsk->exit_state);
736
WARN_ON(refcount_read(&tsk->usage));
737
WARN_ON(tsk == current);
738
739
unwind_task_free(tsk);
740
sched_ext_free(tsk);
741
io_uring_free(tsk);
742
cgroup_free(tsk);
743
task_numa_free(tsk, true);
744
security_task_free(tsk);
745
exit_creds(tsk);
746
delayacct_tsk_free(tsk);
747
put_signal_struct(tsk->signal);
748
sched_core_free(tsk);
749
free_task(tsk);
750
}
751
EXPORT_SYMBOL_GPL(__put_task_struct);
752
753
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
754
{
755
struct task_struct *task = container_of(rhp, struct task_struct, rcu);
756
757
__put_task_struct(task);
758
}
759
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
760
761
void __init __weak arch_task_cache_init(void) { }
762
763
/*
764
* set_max_threads
765
*/
766
static void __init set_max_threads(unsigned int max_threads_suggested)
767
{
768
u64 threads;
769
unsigned long nr_pages = memblock_estimated_nr_free_pages();
770
771
/*
772
* The number of threads shall be limited such that the thread
773
* structures may only consume a small part of the available memory.
774
*/
775
if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
776
threads = MAX_THREADS;
777
else
778
threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
779
(u64) THREAD_SIZE * 8UL);
780
781
if (threads > max_threads_suggested)
782
threads = max_threads_suggested;
783
784
max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
785
}
786
787
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
788
/* Initialized by the architecture: */
789
int arch_task_struct_size __read_mostly;
790
#endif
791
792
static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
793
{
794
/* Fetch thread_struct whitelist for the architecture. */
795
arch_thread_struct_whitelist(offset, size);
796
797
/*
798
* Handle zero-sized whitelist or empty thread_struct, otherwise
799
* adjust offset to position of thread_struct in task_struct.
800
*/
801
if (unlikely(*size == 0))
802
*offset = 0;
803
else
804
*offset += offsetof(struct task_struct, thread);
805
}
806
807
void __init fork_init(void)
808
{
809
int i;
810
#ifndef ARCH_MIN_TASKALIGN
811
#define ARCH_MIN_TASKALIGN 0
812
#endif
813
int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
814
unsigned long useroffset, usersize;
815
816
/* create a slab on which task_structs can be allocated */
817
task_struct_whitelist(&useroffset, &usersize);
818
task_struct_cachep = kmem_cache_create_usercopy("task_struct",
819
arch_task_struct_size, align,
820
SLAB_PANIC|SLAB_ACCOUNT,
821
useroffset, usersize, NULL);
822
823
/* do the arch specific task caches init */
824
arch_task_cache_init();
825
826
set_max_threads(MAX_THREADS);
827
828
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
829
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
830
init_task.signal->rlim[RLIMIT_SIGPENDING] =
831
init_task.signal->rlim[RLIMIT_NPROC];
832
833
for (i = 0; i < UCOUNT_COUNTS; i++)
834
init_user_ns.ucount_max[i] = max_threads/2;
835
836
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
837
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
838
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
839
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
840
841
#ifdef CONFIG_VMAP_STACK
842
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
843
NULL, free_vm_stack_cache);
844
#endif
845
846
scs_init();
847
848
lockdep_init_task(&init_task);
849
uprobes_init();
850
}
851
852
int __weak arch_dup_task_struct(struct task_struct *dst,
853
struct task_struct *src)
854
{
855
*dst = *src;
856
return 0;
857
}
858
859
void set_task_stack_end_magic(struct task_struct *tsk)
860
{
861
unsigned long *stackend;
862
863
stackend = end_of_stack(tsk);
864
*stackend = STACK_END_MAGIC; /* for overflow detection */
865
}
866
867
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
868
{
869
struct task_struct *tsk;
870
int err;
871
872
if (node == NUMA_NO_NODE)
873
node = tsk_fork_get_node(orig);
874
tsk = alloc_task_struct_node(node);
875
if (!tsk)
876
return NULL;
877
878
err = arch_dup_task_struct(tsk, orig);
879
if (err)
880
goto free_tsk;
881
882
err = alloc_thread_stack_node(tsk, node);
883
if (err)
884
goto free_tsk;
885
886
#ifdef CONFIG_THREAD_INFO_IN_TASK
887
refcount_set(&tsk->stack_refcount, 1);
888
#endif
889
account_kernel_stack(tsk, 1);
890
891
err = scs_prepare(tsk, node);
892
if (err)
893
goto free_stack;
894
895
#ifdef CONFIG_SECCOMP
896
/*
897
* We must handle setting up seccomp filters once we're under
898
* the sighand lock in case orig has changed between now and
899
* then. Until then, filter must be NULL to avoid messing up
900
* the usage counts on the error path calling free_task.
901
*/
902
tsk->seccomp.filter = NULL;
903
#endif
904
905
setup_thread_stack(tsk, orig);
906
clear_user_return_notifier(tsk);
907
clear_tsk_need_resched(tsk);
908
set_task_stack_end_magic(tsk);
909
clear_syscall_work_syscall_user_dispatch(tsk);
910
911
#ifdef CONFIG_STACKPROTECTOR
912
tsk->stack_canary = get_random_canary();
913
#endif
914
if (orig->cpus_ptr == &orig->cpus_mask)
915
tsk->cpus_ptr = &tsk->cpus_mask;
916
dup_user_cpus_ptr(tsk, orig, node);
917
918
/*
919
* One for the user space visible state that goes away when reaped.
920
* One for the scheduler.
921
*/
922
refcount_set(&tsk->rcu_users, 2);
923
/* One for the rcu users */
924
refcount_set(&tsk->usage, 1);
925
#ifdef CONFIG_BLK_DEV_IO_TRACE
926
tsk->btrace_seq = 0;
927
#endif
928
tsk->splice_pipe = NULL;
929
tsk->task_frag.page = NULL;
930
tsk->wake_q.next = NULL;
931
tsk->worker_private = NULL;
932
933
kcov_task_init(tsk);
934
kmsan_task_create(tsk);
935
kmap_local_fork(tsk);
936
937
#ifdef CONFIG_FAULT_INJECTION
938
tsk->fail_nth = 0;
939
#endif
940
941
#ifdef CONFIG_BLK_CGROUP
942
tsk->throttle_disk = NULL;
943
tsk->use_memdelay = 0;
944
#endif
945
946
#ifdef CONFIG_ARCH_HAS_CPU_PASID
947
tsk->pasid_activated = 0;
948
#endif
949
950
#ifdef CONFIG_MEMCG
951
tsk->active_memcg = NULL;
952
#endif
953
954
#ifdef CONFIG_X86_BUS_LOCK_DETECT
955
tsk->reported_split_lock = 0;
956
#endif
957
958
#ifdef CONFIG_SCHED_MM_CID
959
tsk->mm_cid = -1;
960
tsk->last_mm_cid = -1;
961
tsk->mm_cid_active = 0;
962
tsk->migrate_from_cpu = -1;
963
#endif
964
return tsk;
965
966
free_stack:
967
exit_task_stack_account(tsk);
968
free_thread_stack(tsk);
969
free_tsk:
970
free_task_struct(tsk);
971
return NULL;
972
}
973
974
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
975
976
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
977
978
static int __init coredump_filter_setup(char *s)
979
{
980
default_dump_filter =
981
(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
982
MMF_DUMP_FILTER_MASK;
983
return 1;
984
}
985
986
__setup("coredump_filter=", coredump_filter_setup);
987
988
#include <linux/init_task.h>
989
990
static void mm_init_aio(struct mm_struct *mm)
991
{
992
#ifdef CONFIG_AIO
993
spin_lock_init(&mm->ioctx_lock);
994
mm->ioctx_table = NULL;
995
#endif
996
}
997
998
static __always_inline void mm_clear_owner(struct mm_struct *mm,
999
struct task_struct *p)
1000
{
1001
#ifdef CONFIG_MEMCG
1002
if (mm->owner == p)
1003
WRITE_ONCE(mm->owner, NULL);
1004
#endif
1005
}
1006
1007
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1008
{
1009
#ifdef CONFIG_MEMCG
1010
mm->owner = p;
1011
#endif
1012
}
1013
1014
static void mm_init_uprobes_state(struct mm_struct *mm)
1015
{
1016
#ifdef CONFIG_UPROBES
1017
mm->uprobes_state.xol_area = NULL;
1018
#endif
1019
}
1020
1021
static void mmap_init_lock(struct mm_struct *mm)
1022
{
1023
init_rwsem(&mm->mmap_lock);
1024
mm_lock_seqcount_init(mm);
1025
#ifdef CONFIG_PER_VMA_LOCK
1026
rcuwait_init(&mm->vma_writer_wait);
1027
#endif
1028
}
1029
1030
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1031
struct user_namespace *user_ns)
1032
{
1033
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
1034
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
1035
atomic_set(&mm->mm_users, 1);
1036
atomic_set(&mm->mm_count, 1);
1037
seqcount_init(&mm->write_protect_seq);
1038
mmap_init_lock(mm);
1039
INIT_LIST_HEAD(&mm->mmlist);
1040
mm_pgtables_bytes_init(mm);
1041
mm->map_count = 0;
1042
mm->locked_vm = 0;
1043
atomic64_set(&mm->pinned_vm, 0);
1044
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1045
spin_lock_init(&mm->page_table_lock);
1046
spin_lock_init(&mm->arg_lock);
1047
mm_init_cpumask(mm);
1048
mm_init_aio(mm);
1049
mm_init_owner(mm, p);
1050
mm_pasid_init(mm);
1051
RCU_INIT_POINTER(mm->exe_file, NULL);
1052
mmu_notifier_subscriptions_init(mm);
1053
init_tlb_flush_pending(mm);
1054
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
1055
mm->pmd_huge_pte = NULL;
1056
#endif
1057
mm_init_uprobes_state(mm);
1058
hugetlb_count_init(mm);
1059
1060
if (current->mm) {
1061
mm->flags = mmf_init_flags(current->mm->flags);
1062
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1063
} else {
1064
mm->flags = default_dump_filter;
1065
mm->def_flags = 0;
1066
}
1067
1068
if (futex_mm_init(mm))
1069
goto fail_mm_init;
1070
1071
if (mm_alloc_pgd(mm))
1072
goto fail_nopgd;
1073
1074
if (mm_alloc_id(mm))
1075
goto fail_noid;
1076
1077
if (init_new_context(p, mm))
1078
goto fail_nocontext;
1079
1080
if (mm_alloc_cid(mm, p))
1081
goto fail_cid;
1082
1083
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
1084
NR_MM_COUNTERS))
1085
goto fail_pcpu;
1086
1087
mm->user_ns = get_user_ns(user_ns);
1088
lru_gen_init_mm(mm);
1089
return mm;
1090
1091
fail_pcpu:
1092
mm_destroy_cid(mm);
1093
fail_cid:
1094
destroy_context(mm);
1095
fail_nocontext:
1096
mm_free_id(mm);
1097
fail_noid:
1098
mm_free_pgd(mm);
1099
fail_nopgd:
1100
futex_hash_free(mm);
1101
fail_mm_init:
1102
free_mm(mm);
1103
return NULL;
1104
}
1105
1106
/*
1107
* Allocate and initialize an mm_struct.
1108
*/
1109
struct mm_struct *mm_alloc(void)
1110
{
1111
struct mm_struct *mm;
1112
1113
mm = allocate_mm();
1114
if (!mm)
1115
return NULL;
1116
1117
memset(mm, 0, sizeof(*mm));
1118
return mm_init(mm, current, current_user_ns());
1119
}
1120
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
1121
1122
static inline void __mmput(struct mm_struct *mm)
1123
{
1124
VM_BUG_ON(atomic_read(&mm->mm_users));
1125
1126
uprobe_clear_state(mm);
1127
exit_aio(mm);
1128
ksm_exit(mm);
1129
khugepaged_exit(mm); /* must run before exit_mmap */
1130
exit_mmap(mm);
1131
mm_put_huge_zero_folio(mm);
1132
set_mm_exe_file(mm, NULL);
1133
if (!list_empty(&mm->mmlist)) {
1134
spin_lock(&mmlist_lock);
1135
list_del(&mm->mmlist);
1136
spin_unlock(&mmlist_lock);
1137
}
1138
if (mm->binfmt)
1139
module_put(mm->binfmt->module);
1140
lru_gen_del_mm(mm);
1141
mmdrop(mm);
1142
}
1143
1144
/*
1145
* Decrement the use count and release all resources for an mm.
1146
*/
1147
void mmput(struct mm_struct *mm)
1148
{
1149
might_sleep();
1150
1151
if (atomic_dec_and_test(&mm->mm_users))
1152
__mmput(mm);
1153
}
1154
EXPORT_SYMBOL_GPL(mmput);
1155
1156
#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
1157
static void mmput_async_fn(struct work_struct *work)
1158
{
1159
struct mm_struct *mm = container_of(work, struct mm_struct,
1160
async_put_work);
1161
1162
__mmput(mm);
1163
}
1164
1165
void mmput_async(struct mm_struct *mm)
1166
{
1167
if (atomic_dec_and_test(&mm->mm_users)) {
1168
INIT_WORK(&mm->async_put_work, mmput_async_fn);
1169
schedule_work(&mm->async_put_work);
1170
}
1171
}
1172
EXPORT_SYMBOL_GPL(mmput_async);
1173
#endif
1174
1175
/**
1176
* set_mm_exe_file - change a reference to the mm's executable file
1177
* @mm: The mm to change.
1178
* @new_exe_file: The new file to use.
1179
*
1180
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1181
*
1182
* Main users are mmput() and sys_execve(). Callers prevent concurrent
1183
* invocations: in mmput() nobody alive left, in execve it happens before
1184
* the new mm is made visible to anyone.
1185
*
1186
* Can only fail if new_exe_file != NULL.
1187
*/
1188
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1189
{
1190
struct file *old_exe_file;
1191
1192
/*
1193
* It is safe to dereference the exe_file without RCU as
1194
* this function is only called if nobody else can access
1195
* this mm -- see comment above for justification.
1196
*/
1197
old_exe_file = rcu_dereference_raw(mm->exe_file);
1198
1199
if (new_exe_file) {
1200
/*
1201
* We expect the caller (i.e., sys_execve) to already denied
1202
* write access, so this is unlikely to fail.
1203
*/
1204
if (unlikely(exe_file_deny_write_access(new_exe_file)))
1205
return -EACCES;
1206
get_file(new_exe_file);
1207
}
1208
rcu_assign_pointer(mm->exe_file, new_exe_file);
1209
if (old_exe_file) {
1210
exe_file_allow_write_access(old_exe_file);
1211
fput(old_exe_file);
1212
}
1213
return 0;
1214
}
1215
1216
/**
1217
* replace_mm_exe_file - replace a reference to the mm's executable file
1218
* @mm: The mm to change.
1219
* @new_exe_file: The new file to use.
1220
*
1221
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1222
*
1223
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
1224
*/
1225
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1226
{
1227
struct vm_area_struct *vma;
1228
struct file *old_exe_file;
1229
int ret = 0;
1230
1231
/* Forbid mm->exe_file change if old file still mapped. */
1232
old_exe_file = get_mm_exe_file(mm);
1233
if (old_exe_file) {
1234
VMA_ITERATOR(vmi, mm, 0);
1235
mmap_read_lock(mm);
1236
for_each_vma(vmi, vma) {
1237
if (!vma->vm_file)
1238
continue;
1239
if (path_equal(&vma->vm_file->f_path,
1240
&old_exe_file->f_path)) {
1241
ret = -EBUSY;
1242
break;
1243
}
1244
}
1245
mmap_read_unlock(mm);
1246
fput(old_exe_file);
1247
if (ret)
1248
return ret;
1249
}
1250
1251
ret = exe_file_deny_write_access(new_exe_file);
1252
if (ret)
1253
return -EACCES;
1254
get_file(new_exe_file);
1255
1256
/* set the new file */
1257
mmap_write_lock(mm);
1258
old_exe_file = rcu_dereference_raw(mm->exe_file);
1259
rcu_assign_pointer(mm->exe_file, new_exe_file);
1260
mmap_write_unlock(mm);
1261
1262
if (old_exe_file) {
1263
exe_file_allow_write_access(old_exe_file);
1264
fput(old_exe_file);
1265
}
1266
return 0;
1267
}
1268
1269
/**
1270
* get_mm_exe_file - acquire a reference to the mm's executable file
1271
* @mm: The mm of interest.
1272
*
1273
* Returns %NULL if mm has no associated executable file.
1274
* User must release file via fput().
1275
*/
1276
struct file *get_mm_exe_file(struct mm_struct *mm)
1277
{
1278
struct file *exe_file;
1279
1280
rcu_read_lock();
1281
exe_file = get_file_rcu(&mm->exe_file);
1282
rcu_read_unlock();
1283
return exe_file;
1284
}
1285
1286
/**
1287
* get_task_exe_file - acquire a reference to the task's executable file
1288
* @task: The task.
1289
*
1290
* Returns %NULL if task's mm (if any) has no associated executable file or
1291
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
1292
* User must release file via fput().
1293
*/
1294
struct file *get_task_exe_file(struct task_struct *task)
1295
{
1296
struct file *exe_file = NULL;
1297
struct mm_struct *mm;
1298
1299
if (task->flags & PF_KTHREAD)
1300
return NULL;
1301
1302
task_lock(task);
1303
mm = task->mm;
1304
if (mm)
1305
exe_file = get_mm_exe_file(mm);
1306
task_unlock(task);
1307
return exe_file;
1308
}
1309
1310
/**
1311
* get_task_mm - acquire a reference to the task's mm
1312
* @task: The task.
1313
*
1314
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
1315
* this kernel workthread has transiently adopted a user mm with use_mm,
1316
* to do its AIO) is not set and if so returns a reference to it, after
1317
* bumping up the use count. User must release the mm via mmput()
1318
* after use. Typically used by /proc and ptrace.
1319
*/
1320
struct mm_struct *get_task_mm(struct task_struct *task)
1321
{
1322
struct mm_struct *mm;
1323
1324
if (task->flags & PF_KTHREAD)
1325
return NULL;
1326
1327
task_lock(task);
1328
mm = task->mm;
1329
if (mm)
1330
mmget(mm);
1331
task_unlock(task);
1332
return mm;
1333
}
1334
EXPORT_SYMBOL_GPL(get_task_mm);
1335
1336
static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
1337
{
1338
if (mm == current->mm)
1339
return true;
1340
if (ptrace_may_access(task, mode))
1341
return true;
1342
if ((mode & PTRACE_MODE_READ) && perfmon_capable())
1343
return true;
1344
return false;
1345
}
1346
1347
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
1348
{
1349
struct mm_struct *mm;
1350
int err;
1351
1352
err = down_read_killable(&task->signal->exec_update_lock);
1353
if (err)
1354
return ERR_PTR(err);
1355
1356
mm = get_task_mm(task);
1357
if (!mm) {
1358
mm = ERR_PTR(-ESRCH);
1359
} else if (!may_access_mm(mm, task, mode)) {
1360
mmput(mm);
1361
mm = ERR_PTR(-EACCES);
1362
}
1363
up_read(&task->signal->exec_update_lock);
1364
1365
return mm;
1366
}
1367
1368
static void complete_vfork_done(struct task_struct *tsk)
1369
{
1370
struct completion *vfork;
1371
1372
task_lock(tsk);
1373
vfork = tsk->vfork_done;
1374
if (likely(vfork)) {
1375
tsk->vfork_done = NULL;
1376
complete(vfork);
1377
}
1378
task_unlock(tsk);
1379
}
1380
1381
static int wait_for_vfork_done(struct task_struct *child,
1382
struct completion *vfork)
1383
{
1384
unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
1385
int killed;
1386
1387
cgroup_enter_frozen();
1388
killed = wait_for_completion_state(vfork, state);
1389
cgroup_leave_frozen(false);
1390
1391
if (killed) {
1392
task_lock(child);
1393
child->vfork_done = NULL;
1394
task_unlock(child);
1395
}
1396
1397
put_task_struct(child);
1398
return killed;
1399
}
1400
1401
/* Please note the differences between mmput and mm_release.
1402
* mmput is called whenever we stop holding onto a mm_struct,
1403
* error success whatever.
1404
*
1405
* mm_release is called after a mm_struct has been removed
1406
* from the current process.
1407
*
1408
* This difference is important for error handling, when we
1409
* only half set up a mm_struct for a new process and need to restore
1410
* the old one. Because we mmput the new mm_struct before
1411
* restoring the old one. . .
1412
* Eric Biederman 10 January 1998
1413
*/
1414
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1415
{
1416
uprobe_free_utask(tsk);
1417
1418
/* Get rid of any cached register state */
1419
deactivate_mm(tsk, mm);
1420
1421
/*
1422
* Signal userspace if we're not exiting with a core dump
1423
* because we want to leave the value intact for debugging
1424
* purposes.
1425
*/
1426
if (tsk->clear_child_tid) {
1427
if (atomic_read(&mm->mm_users) > 1) {
1428
/*
1429
* We don't check the error code - if userspace has
1430
* not set up a proper pointer then tough luck.
1431
*/
1432
put_user(0, tsk->clear_child_tid);
1433
do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1434
1, NULL, NULL, 0, 0);
1435
}
1436
tsk->clear_child_tid = NULL;
1437
}
1438
1439
/*
1440
* All done, finally we can wake up parent and return this mm to him.
1441
* Also kthread_stop() uses this completion for synchronization.
1442
*/
1443
if (tsk->vfork_done)
1444
complete_vfork_done(tsk);
1445
}
1446
1447
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1448
{
1449
futex_exit_release(tsk);
1450
mm_release(tsk, mm);
1451
}
1452
1453
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1454
{
1455
futex_exec_release(tsk);
1456
mm_release(tsk, mm);
1457
}
1458
1459
/**
1460
* dup_mm() - duplicates an existing mm structure
1461
* @tsk: the task_struct with which the new mm will be associated.
1462
* @oldmm: the mm to duplicate.
1463
*
1464
* Allocates a new mm structure and duplicates the provided @oldmm structure
1465
* content into it.
1466
*
1467
* Return: the duplicated mm or NULL on failure.
1468
*/
1469
static struct mm_struct *dup_mm(struct task_struct *tsk,
1470
struct mm_struct *oldmm)
1471
{
1472
struct mm_struct *mm;
1473
int err;
1474
1475
mm = allocate_mm();
1476
if (!mm)
1477
goto fail_nomem;
1478
1479
memcpy(mm, oldmm, sizeof(*mm));
1480
1481
if (!mm_init(mm, tsk, mm->user_ns))
1482
goto fail_nomem;
1483
1484
uprobe_start_dup_mmap();
1485
err = dup_mmap(mm, oldmm);
1486
if (err)
1487
goto free_pt;
1488
uprobe_end_dup_mmap();
1489
1490
mm->hiwater_rss = get_mm_rss(mm);
1491
mm->hiwater_vm = mm->total_vm;
1492
1493
if (mm->binfmt && !try_module_get(mm->binfmt->module))
1494
goto free_pt;
1495
1496
return mm;
1497
1498
free_pt:
1499
/* don't put binfmt in mmput, we haven't got module yet */
1500
mm->binfmt = NULL;
1501
mm_init_owner(mm, NULL);
1502
mmput(mm);
1503
if (err)
1504
uprobe_end_dup_mmap();
1505
1506
fail_nomem:
1507
return NULL;
1508
}
1509
1510
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1511
{
1512
struct mm_struct *mm, *oldmm;
1513
1514
tsk->min_flt = tsk->maj_flt = 0;
1515
tsk->nvcsw = tsk->nivcsw = 0;
1516
#ifdef CONFIG_DETECT_HUNG_TASK
1517
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1518
tsk->last_switch_time = 0;
1519
#endif
1520
1521
tsk->mm = NULL;
1522
tsk->active_mm = NULL;
1523
1524
/*
1525
* Are we cloning a kernel thread?
1526
*
1527
* We need to steal a active VM for that..
1528
*/
1529
oldmm = current->mm;
1530
if (!oldmm)
1531
return 0;
1532
1533
if (clone_flags & CLONE_VM) {
1534
mmget(oldmm);
1535
mm = oldmm;
1536
} else {
1537
mm = dup_mm(tsk, current->mm);
1538
if (!mm)
1539
return -ENOMEM;
1540
}
1541
1542
tsk->mm = mm;
1543
tsk->active_mm = mm;
1544
sched_mm_cid_fork(tsk);
1545
return 0;
1546
}
1547
1548
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
1549
{
1550
struct fs_struct *fs = current->fs;
1551
if (clone_flags & CLONE_FS) {
1552
/* tsk->fs is already what we want */
1553
read_seqlock_excl(&fs->seq);
1554
/* "users" and "in_exec" locked for check_unsafe_exec() */
1555
if (fs->in_exec) {
1556
read_sequnlock_excl(&fs->seq);
1557
return -EAGAIN;
1558
}
1559
fs->users++;
1560
read_sequnlock_excl(&fs->seq);
1561
return 0;
1562
}
1563
tsk->fs = copy_fs_struct(fs);
1564
if (!tsk->fs)
1565
return -ENOMEM;
1566
return 0;
1567
}
1568
1569
static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
1570
int no_files)
1571
{
1572
struct files_struct *oldf, *newf;
1573
1574
/*
1575
* A background process may not have any files ...
1576
*/
1577
oldf = current->files;
1578
if (!oldf)
1579
return 0;
1580
1581
if (no_files) {
1582
tsk->files = NULL;
1583
return 0;
1584
}
1585
1586
if (clone_flags & CLONE_FILES) {
1587
atomic_inc(&oldf->count);
1588
return 0;
1589
}
1590
1591
newf = dup_fd(oldf, NULL);
1592
if (IS_ERR(newf))
1593
return PTR_ERR(newf);
1594
1595
tsk->files = newf;
1596
return 0;
1597
}
1598
1599
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1600
{
1601
struct sighand_struct *sig;
1602
1603
if (clone_flags & CLONE_SIGHAND) {
1604
refcount_inc(&current->sighand->count);
1605
return 0;
1606
}
1607
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1608
RCU_INIT_POINTER(tsk->sighand, sig);
1609
if (!sig)
1610
return -ENOMEM;
1611
1612
refcount_set(&sig->count, 1);
1613
spin_lock_irq(&current->sighand->siglock);
1614
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1615
spin_unlock_irq(&current->sighand->siglock);
1616
1617
/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
1618
if (clone_flags & CLONE_CLEAR_SIGHAND)
1619
flush_signal_handlers(tsk, 0);
1620
1621
return 0;
1622
}
1623
1624
void __cleanup_sighand(struct sighand_struct *sighand)
1625
{
1626
if (refcount_dec_and_test(&sighand->count)) {
1627
signalfd_cleanup(sighand);
1628
/*
1629
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1630
* without an RCU grace period, see __lock_task_sighand().
1631
*/
1632
kmem_cache_free(sighand_cachep, sighand);
1633
}
1634
}
1635
1636
/*
1637
* Initialize POSIX timer handling for a thread group.
1638
*/
1639
static void posix_cpu_timers_init_group(struct signal_struct *sig)
1640
{
1641
struct posix_cputimers *pct = &sig->posix_cputimers;
1642
unsigned long cpu_limit;
1643
1644
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1645
posix_cputimers_group_init(pct, cpu_limit);
1646
}
1647
1648
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1649
{
1650
struct signal_struct *sig;
1651
1652
if (clone_flags & CLONE_THREAD)
1653
return 0;
1654
1655
sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1656
tsk->signal = sig;
1657
if (!sig)
1658
return -ENOMEM;
1659
1660
sig->nr_threads = 1;
1661
sig->quick_threads = 1;
1662
atomic_set(&sig->live, 1);
1663
refcount_set(&sig->sigcnt, 1);
1664
1665
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1666
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1667
tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1668
1669
init_waitqueue_head(&sig->wait_chldexit);
1670
sig->curr_target = tsk;
1671
init_sigpending(&sig->shared_pending);
1672
INIT_HLIST_HEAD(&sig->multiprocess);
1673
seqlock_init(&sig->stats_lock);
1674
prev_cputime_init(&sig->prev_cputime);
1675
1676
#ifdef CONFIG_POSIX_TIMERS
1677
INIT_HLIST_HEAD(&sig->posix_timers);
1678
INIT_HLIST_HEAD(&sig->ignored_posix_timers);
1679
hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1680
#endif
1681
1682
task_lock(current->group_leader);
1683
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1684
task_unlock(current->group_leader);
1685
1686
posix_cpu_timers_init_group(sig);
1687
1688
tty_audit_fork(sig);
1689
sched_autogroup_fork(sig);
1690
1691
sig->oom_score_adj = current->signal->oom_score_adj;
1692
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1693
1694
mutex_init(&sig->cred_guard_mutex);
1695
init_rwsem(&sig->exec_update_lock);
1696
1697
return 0;
1698
}
1699
1700
static void copy_seccomp(struct task_struct *p)
1701
{
1702
#ifdef CONFIG_SECCOMP
1703
/*
1704
* Must be called with sighand->lock held, which is common to
1705
* all threads in the group. Holding cred_guard_mutex is not
1706
* needed because this new task is not yet running and cannot
1707
* be racing exec.
1708
*/
1709
assert_spin_locked(&current->sighand->siglock);
1710
1711
/* Ref-count the new filter user, and assign it. */
1712
get_seccomp_filter(current);
1713
p->seccomp = current->seccomp;
1714
1715
/*
1716
* Explicitly enable no_new_privs here in case it got set
1717
* between the task_struct being duplicated and holding the
1718
* sighand lock. The seccomp state and nnp must be in sync.
1719
*/
1720
if (task_no_new_privs(current))
1721
task_set_no_new_privs(p);
1722
1723
/*
1724
* If the parent gained a seccomp mode after copying thread
1725
* flags and between before we held the sighand lock, we have
1726
* to manually enable the seccomp thread flag here.
1727
*/
1728
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1729
set_task_syscall_work(p, SECCOMP);
1730
#endif
1731
}
1732
1733
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1734
{
1735
current->clear_child_tid = tidptr;
1736
1737
return task_pid_vnr(current);
1738
}
1739
1740
static void rt_mutex_init_task(struct task_struct *p)
1741
{
1742
raw_spin_lock_init(&p->pi_lock);
1743
#ifdef CONFIG_RT_MUTEXES
1744
p->pi_waiters = RB_ROOT_CACHED;
1745
p->pi_top_task = NULL;
1746
p->pi_blocked_on = NULL;
1747
#endif
1748
}
1749
1750
static inline void init_task_pid_links(struct task_struct *task)
1751
{
1752
enum pid_type type;
1753
1754
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
1755
INIT_HLIST_NODE(&task->pid_links[type]);
1756
}
1757
1758
static inline void
1759
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1760
{
1761
if (type == PIDTYPE_PID)
1762
task->thread_pid = pid;
1763
else
1764
task->signal->pids[type] = pid;
1765
}
1766
1767
static inline void rcu_copy_process(struct task_struct *p)
1768
{
1769
#ifdef CONFIG_PREEMPT_RCU
1770
p->rcu_read_lock_nesting = 0;
1771
p->rcu_read_unlock_special.s = 0;
1772
p->rcu_blocked_node = NULL;
1773
INIT_LIST_HEAD(&p->rcu_node_entry);
1774
#endif /* #ifdef CONFIG_PREEMPT_RCU */
1775
#ifdef CONFIG_TASKS_RCU
1776
p->rcu_tasks_holdout = false;
1777
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
1778
p->rcu_tasks_idle_cpu = -1;
1779
INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
1780
#endif /* #ifdef CONFIG_TASKS_RCU */
1781
#ifdef CONFIG_TASKS_TRACE_RCU
1782
p->trc_reader_nesting = 0;
1783
p->trc_reader_special.s = 0;
1784
INIT_LIST_HEAD(&p->trc_holdout_list);
1785
INIT_LIST_HEAD(&p->trc_blkd_node);
1786
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
1787
}
1788
1789
/**
1790
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
1791
* @pid: the struct pid for which to create a pidfd
1792
* @flags: flags of the new @pidfd
1793
* @ret_file: return the new pidfs file
1794
*
1795
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
1796
* caller's file descriptor table. The pidfd is reserved but not installed yet.
1797
*
1798
* The helper verifies that @pid is still in use, without PIDFD_THREAD the
1799
* task identified by @pid must be a thread-group leader.
1800
*
1801
* If this function returns successfully the caller is responsible to either
1802
* call fd_install() passing the returned pidfd and pidfd file as arguments in
1803
* order to install the pidfd into its file descriptor table or they must use
1804
* put_unused_fd() and fput() on the returned pidfd and pidfd file
1805
* respectively.
1806
*
1807
* This function is useful when a pidfd must already be reserved but there
1808
* might still be points of failure afterwards and the caller wants to ensure
1809
* that no pidfd is leaked into its file descriptor table.
1810
*
1811
* Return: On success, a reserved pidfd is returned from the function and a new
1812
* pidfd file is returned in the last argument to the function. On
1813
* error, a negative error code is returned from the function and the
1814
* last argument remains unchanged.
1815
*/
1816
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
1817
{
1818
struct file *pidfs_file;
1819
1820
/*
1821
* PIDFD_STALE is only allowed to be passed if the caller knows
1822
* that @pid is already registered in pidfs and thus
1823
* PIDFD_INFO_EXIT information is guaranteed to be available.
1824
*/
1825
if (!(flags & PIDFD_STALE)) {
1826
/*
1827
* While holding the pidfd waitqueue lock removing the
1828
* task linkage for the thread-group leader pid
1829
* (PIDTYPE_TGID) isn't possible. Thus, if there's still
1830
* task linkage for PIDTYPE_PID not having thread-group
1831
* leader linkage for the pid means it wasn't a
1832
* thread-group leader in the first place.
1833
*/
1834
guard(spinlock_irq)(&pid->wait_pidfd.lock);
1835
1836
/* Task has already been reaped. */
1837
if (!pid_has_task(pid, PIDTYPE_PID))
1838
return -ESRCH;
1839
/*
1840
* If this struct pid isn't used as a thread-group
1841
* leader but the caller requested to create a
1842
* thread-group leader pidfd then report ENOENT.
1843
*/
1844
if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
1845
return -ENOENT;
1846
}
1847
1848
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
1849
if (pidfd < 0)
1850
return pidfd;
1851
1852
pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
1853
if (IS_ERR(pidfs_file))
1854
return PTR_ERR(pidfs_file);
1855
1856
*ret_file = pidfs_file;
1857
return take_fd(pidfd);
1858
}
1859
1860
static void __delayed_free_task(struct rcu_head *rhp)
1861
{
1862
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1863
1864
free_task(tsk);
1865
}
1866
1867
static __always_inline void delayed_free_task(struct task_struct *tsk)
1868
{
1869
if (IS_ENABLED(CONFIG_MEMCG))
1870
call_rcu(&tsk->rcu, __delayed_free_task);
1871
else
1872
free_task(tsk);
1873
}
1874
1875
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
1876
{
1877
/* Skip if kernel thread */
1878
if (!tsk->mm)
1879
return;
1880
1881
/* Skip if spawning a thread or using vfork */
1882
if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
1883
return;
1884
1885
/* We need to synchronize with __set_oom_adj */
1886
mutex_lock(&oom_adj_mutex);
1887
set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
1888
/* Update the values in case they were changed after copy_signal */
1889
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
1890
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
1891
mutex_unlock(&oom_adj_mutex);
1892
}
1893
1894
#ifdef CONFIG_RV
1895
static void rv_task_fork(struct task_struct *p)
1896
{
1897
memset(&p->rv, 0, sizeof(p->rv));
1898
}
1899
#else
1900
#define rv_task_fork(p) do {} while (0)
1901
#endif
1902
1903
static bool need_futex_hash_allocate_default(u64 clone_flags)
1904
{
1905
if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
1906
return false;
1907
return true;
1908
}
1909
1910
/*
1911
* This creates a new process as a copy of the old one,
1912
* but does not actually start it yet.
1913
*
1914
* It copies the registers, and all the appropriate
1915
* parts of the process environment (as per the clone
1916
* flags). The actual kick-off is left to the caller.
1917
*/
1918
__latent_entropy struct task_struct *copy_process(
1919
struct pid *pid,
1920
int trace,
1921
int node,
1922
struct kernel_clone_args *args)
1923
{
1924
int pidfd = -1, retval;
1925
struct task_struct *p;
1926
struct multiprocess_signals delayed;
1927
struct file *pidfile = NULL;
1928
const u64 clone_flags = args->flags;
1929
struct nsproxy *nsp = current->nsproxy;
1930
1931
/*
1932
* Don't allow sharing the root directory with processes in a different
1933
* namespace
1934
*/
1935
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1936
return ERR_PTR(-EINVAL);
1937
1938
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1939
return ERR_PTR(-EINVAL);
1940
1941
/*
1942
* Thread groups must share signals as well, and detached threads
1943
* can only be started up within the thread group.
1944
*/
1945
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1946
return ERR_PTR(-EINVAL);
1947
1948
/*
1949
* Shared signal handlers imply shared VM. By way of the above,
1950
* thread groups also imply shared VM. Blocking this case allows
1951
* for various simplifications in other code.
1952
*/
1953
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
1954
return ERR_PTR(-EINVAL);
1955
1956
/*
1957
* Siblings of global init remain as zombies on exit since they are
1958
* not reaped by their parent (swapper). To solve this and to avoid
1959
* multi-rooted process trees, prevent global and container-inits
1960
* from creating siblings.
1961
*/
1962
if ((clone_flags & CLONE_PARENT) &&
1963
current->signal->flags & SIGNAL_UNKILLABLE)
1964
return ERR_PTR(-EINVAL);
1965
1966
/*
1967
* If the new process will be in a different pid or user namespace
1968
* do not allow it to share a thread group with the forking task.
1969
*/
1970
if (clone_flags & CLONE_THREAD) {
1971
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1972
(task_active_pid_ns(current) != nsp->pid_ns_for_children))
1973
return ERR_PTR(-EINVAL);
1974
}
1975
1976
if (clone_flags & CLONE_PIDFD) {
1977
/*
1978
* - CLONE_DETACHED is blocked so that we can potentially
1979
* reuse it later for CLONE_PIDFD.
1980
*/
1981
if (clone_flags & CLONE_DETACHED)
1982
return ERR_PTR(-EINVAL);
1983
}
1984
1985
/*
1986
* Force any signals received before this point to be delivered
1987
* before the fork happens. Collect up signals sent to multiple
1988
* processes that happen during the fork and delay them so that
1989
* they appear to happen after the fork.
1990
*/
1991
sigemptyset(&delayed.signal);
1992
INIT_HLIST_NODE(&delayed.node);
1993
1994
spin_lock_irq(&current->sighand->siglock);
1995
if (!(clone_flags & CLONE_THREAD))
1996
hlist_add_head(&delayed.node, &current->signal->multiprocess);
1997
recalc_sigpending();
1998
spin_unlock_irq(&current->sighand->siglock);
1999
retval = -ERESTARTNOINTR;
2000
if (task_sigpending(current))
2001
goto fork_out;
2002
2003
retval = -ENOMEM;
2004
p = dup_task_struct(current, node);
2005
if (!p)
2006
goto fork_out;
2007
p->flags &= ~PF_KTHREAD;
2008
if (args->kthread)
2009
p->flags |= PF_KTHREAD;
2010
if (args->user_worker) {
2011
/*
2012
* Mark us as a user worker, and block any signal that isn't
2013
* fatal or STOP
2014
*/
2015
p->flags |= PF_USER_WORKER;
2016
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2017
}
2018
if (args->io_thread)
2019
p->flags |= PF_IO_WORKER;
2020
2021
if (args->name)
2022
strscpy_pad(p->comm, args->name, sizeof(p->comm));
2023
2024
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
2025
/*
2026
* Clear TID on mm_release()?
2027
*/
2028
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
2029
2030
ftrace_graph_init_task(p);
2031
2032
rt_mutex_init_task(p);
2033
2034
lockdep_assert_irqs_enabled();
2035
#ifdef CONFIG_PROVE_LOCKING
2036
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
2037
#endif
2038
retval = copy_creds(p, clone_flags);
2039
if (retval < 0)
2040
goto bad_fork_free;
2041
2042
retval = -EAGAIN;
2043
if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
2044
if (p->real_cred->user != INIT_USER &&
2045
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
2046
goto bad_fork_cleanup_count;
2047
}
2048
current->flags &= ~PF_NPROC_EXCEEDED;
2049
2050
/*
2051
* If multiple threads are within copy_process(), then this check
2052
* triggers too late. This doesn't hurt; the check is only there
2053
* to stop root fork bombs.
2054
*/
2055
retval = -EAGAIN;
2056
if (data_race(nr_threads >= max_threads))
2057
goto bad_fork_cleanup_count;
2058
2059
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
2060
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
2061
p->flags |= PF_FORKNOEXEC;
2062
INIT_LIST_HEAD(&p->children);
2063
INIT_LIST_HEAD(&p->sibling);
2064
rcu_copy_process(p);
2065
p->vfork_done = NULL;
2066
spin_lock_init(&p->alloc_lock);
2067
2068
init_sigpending(&p->pending);
2069
2070
p->utime = p->stime = p->gtime = 0;
2071
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2072
p->utimescaled = p->stimescaled = 0;
2073
#endif
2074
prev_cputime_init(&p->prev_cputime);
2075
2076
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2077
seqcount_init(&p->vtime.seqcount);
2078
p->vtime.starttime = 0;
2079
p->vtime.state = VTIME_INACTIVE;
2080
#endif
2081
2082
#ifdef CONFIG_IO_URING
2083
p->io_uring = NULL;
2084
#endif
2085
2086
p->default_timer_slack_ns = current->timer_slack_ns;
2087
2088
#ifdef CONFIG_PSI
2089
p->psi_flags = 0;
2090
#endif
2091
2092
task_io_accounting_init(&p->ioac);
2093
acct_clear_integrals(p);
2094
2095
posix_cputimers_init(&p->posix_cputimers);
2096
tick_dep_init_task(p);
2097
2098
p->io_context = NULL;
2099
audit_set_context(p, NULL);
2100
cgroup_fork(p);
2101
if (args->kthread) {
2102
if (!set_kthread_struct(p))
2103
goto bad_fork_cleanup_delayacct;
2104
}
2105
#ifdef CONFIG_NUMA
2106
p->mempolicy = mpol_dup(p->mempolicy);
2107
if (IS_ERR(p->mempolicy)) {
2108
retval = PTR_ERR(p->mempolicy);
2109
p->mempolicy = NULL;
2110
goto bad_fork_cleanup_delayacct;
2111
}
2112
#endif
2113
#ifdef CONFIG_CPUSETS
2114
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2115
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
2116
#endif
2117
#ifdef CONFIG_TRACE_IRQFLAGS
2118
memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2119
p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2120
p->irqtrace.softirq_enable_ip = _THIS_IP_;
2121
p->softirqs_enabled = 1;
2122
p->softirq_context = 0;
2123
#endif
2124
2125
p->pagefault_disabled = 0;
2126
2127
#ifdef CONFIG_LOCKDEP
2128
lockdep_init_task(p);
2129
#endif
2130
2131
p->blocked_on = NULL; /* not blocked yet */
2132
2133
#ifdef CONFIG_BCACHE
2134
p->sequential_io = 0;
2135
p->sequential_io_avg = 0;
2136
#endif
2137
#ifdef CONFIG_BPF_SYSCALL
2138
RCU_INIT_POINTER(p->bpf_storage, NULL);
2139
p->bpf_ctx = NULL;
2140
#endif
2141
2142
unwind_task_init(p);
2143
2144
/* Perform scheduler related setup. Assign this task to a CPU. */
2145
retval = sched_fork(clone_flags, p);
2146
if (retval)
2147
goto bad_fork_cleanup_policy;
2148
2149
retval = perf_event_init_task(p, clone_flags);
2150
if (retval)
2151
goto bad_fork_sched_cancel_fork;
2152
retval = audit_alloc(p);
2153
if (retval)
2154
goto bad_fork_cleanup_perf;
2155
/* copy all the process information */
2156
shm_init_task(p);
2157
retval = security_task_alloc(p, clone_flags);
2158
if (retval)
2159
goto bad_fork_cleanup_audit;
2160
retval = copy_semundo(clone_flags, p);
2161
if (retval)
2162
goto bad_fork_cleanup_security;
2163
retval = copy_files(clone_flags, p, args->no_files);
2164
if (retval)
2165
goto bad_fork_cleanup_semundo;
2166
retval = copy_fs(clone_flags, p);
2167
if (retval)
2168
goto bad_fork_cleanup_files;
2169
retval = copy_sighand(clone_flags, p);
2170
if (retval)
2171
goto bad_fork_cleanup_fs;
2172
retval = copy_signal(clone_flags, p);
2173
if (retval)
2174
goto bad_fork_cleanup_sighand;
2175
retval = copy_mm(clone_flags, p);
2176
if (retval)
2177
goto bad_fork_cleanup_signal;
2178
retval = copy_namespaces(clone_flags, p);
2179
if (retval)
2180
goto bad_fork_cleanup_mm;
2181
retval = copy_io(clone_flags, p);
2182
if (retval)
2183
goto bad_fork_cleanup_namespaces;
2184
retval = copy_thread(p, args);
2185
if (retval)
2186
goto bad_fork_cleanup_io;
2187
2188
stackleak_task_init(p);
2189
2190
if (pid != &init_struct_pid) {
2191
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2192
args->set_tid_size);
2193
if (IS_ERR(pid)) {
2194
retval = PTR_ERR(pid);
2195
goto bad_fork_cleanup_thread;
2196
}
2197
}
2198
2199
/*
2200
* This has to happen after we've potentially unshared the file
2201
* descriptor table (so that the pidfd doesn't leak into the child
2202
* if the fd table isn't shared).
2203
*/
2204
if (clone_flags & CLONE_PIDFD) {
2205
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
2206
2207
/*
2208
* Note that no task has been attached to @pid yet; indicate
2209
* that via CLONE_PIDFD.
2210
*/
2211
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
2212
if (retval < 0)
2213
goto bad_fork_free_pid;
2214
pidfd = retval;
2215
2216
retval = put_user(pidfd, args->pidfd);
2217
if (retval)
2218
goto bad_fork_put_pidfd;
2219
}
2220
2221
#ifdef CONFIG_BLOCK
2222
p->plug = NULL;
2223
#endif
2224
futex_init_task(p);
2225
2226
/*
2227
* sigaltstack should be cleared when sharing the same VM
2228
*/
2229
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2230
sas_ss_reset(p);
2231
2232
/*
2233
* Syscall tracing and stepping should be turned off in the
2234
* child regardless of CLONE_PTRACE.
2235
*/
2236
user_disable_single_step(p);
2237
clear_task_syscall_work(p, SYSCALL_TRACE);
2238
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
2239
clear_task_syscall_work(p, SYSCALL_EMU);
2240
#endif
2241
clear_tsk_latency_tracing(p);
2242
2243
/* ok, now we should be set up... */
2244
p->pid = pid_nr(pid);
2245
if (clone_flags & CLONE_THREAD) {
2246
p->group_leader = current->group_leader;
2247
p->tgid = current->tgid;
2248
} else {
2249
p->group_leader = p;
2250
p->tgid = p->pid;
2251
}
2252
2253
p->nr_dirtied = 0;
2254
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
2255
p->dirty_paused_when = 0;
2256
2257
p->pdeath_signal = 0;
2258
p->task_works = NULL;
2259
clear_posix_cputimers_work(p);
2260
2261
#ifdef CONFIG_KRETPROBES
2262
p->kretprobe_instances.first = NULL;
2263
#endif
2264
#ifdef CONFIG_RETHOOK
2265
p->rethooks.first = NULL;
2266
#endif
2267
2268
/*
2269
* Ensure that the cgroup subsystem policies allow the new process to be
2270
* forked. It should be noted that the new process's css_set can be changed
2271
* between here and cgroup_post_fork() if an organisation operation is in
2272
* progress.
2273
*/
2274
retval = cgroup_can_fork(p, args);
2275
if (retval)
2276
goto bad_fork_put_pidfd;
2277
2278
/*
2279
* Now that the cgroups are pinned, re-clone the parent cgroup and put
2280
* the new task on the correct runqueue. All this *before* the task
2281
* becomes visible.
2282
*
2283
* This isn't part of ->can_fork() because while the re-cloning is
2284
* cgroup specific, it unconditionally needs to place the task on a
2285
* runqueue.
2286
*/
2287
retval = sched_cgroup_fork(p, args);
2288
if (retval)
2289
goto bad_fork_cancel_cgroup;
2290
2291
/*
2292
* Allocate a default futex hash for the user process once the first
2293
* thread spawns.
2294
*/
2295
if (need_futex_hash_allocate_default(clone_flags)) {
2296
retval = futex_hash_allocate_default();
2297
if (retval)
2298
goto bad_fork_core_free;
2299
/*
2300
* If we fail beyond this point we don't free the allocated
2301
* futex hash map. We assume that another thread will be created
2302
* and will make use of it. The hash map will be freed once the main
2303
* thread terminates.
2304
*/
2305
}
2306
/*
2307
* From this point on we must avoid any synchronous user-space
2308
* communication until we take the tasklist-lock. In particular, we do
2309
* not want user-space to be able to predict the process start-time by
2310
* stalling fork(2) after we recorded the start_time but before it is
2311
* visible to the system.
2312
*/
2313
2314
p->start_time = ktime_get_ns();
2315
p->start_boottime = ktime_get_boottime_ns();
2316
2317
/*
2318
* Make it visible to the rest of the system, but don't wake it up yet.
2319
* Need tasklist lock for parent etc. handling!
2320
*/
2321
write_lock_irq(&tasklist_lock);
2322
2323
/* CLONE_PARENT re-uses the old parent */
2324
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
2325
p->real_parent = current->real_parent;
2326
p->parent_exec_id = current->parent_exec_id;
2327
if (clone_flags & CLONE_THREAD)
2328
p->exit_signal = -1;
2329
else
2330
p->exit_signal = current->group_leader->exit_signal;
2331
} else {
2332
p->real_parent = current;
2333
p->parent_exec_id = current->self_exec_id;
2334
p->exit_signal = args->exit_signal;
2335
}
2336
2337
klp_copy_process(p);
2338
2339
sched_core_fork(p);
2340
2341
spin_lock(&current->sighand->siglock);
2342
2343
rv_task_fork(p);
2344
2345
rseq_fork(p, clone_flags);
2346
2347
/* Don't start children in a dying pid namespace */
2348
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2349
retval = -ENOMEM;
2350
goto bad_fork_core_free;
2351
}
2352
2353
/* Let kill terminate clone/fork in the middle */
2354
if (fatal_signal_pending(current)) {
2355
retval = -EINTR;
2356
goto bad_fork_core_free;
2357
}
2358
2359
/* No more failure paths after this point. */
2360
2361
/*
2362
* Copy seccomp details explicitly here, in case they were changed
2363
* before holding sighand lock.
2364
*/
2365
copy_seccomp(p);
2366
2367
init_task_pid_links(p);
2368
if (likely(p->pid)) {
2369
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2370
2371
init_task_pid(p, PIDTYPE_PID, pid);
2372
if (thread_group_leader(p)) {
2373
init_task_pid(p, PIDTYPE_TGID, pid);
2374
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2375
init_task_pid(p, PIDTYPE_SID, task_session(current));
2376
2377
if (is_child_reaper(pid)) {
2378
ns_of_pid(pid)->child_reaper = p;
2379
p->signal->flags |= SIGNAL_UNKILLABLE;
2380
}
2381
p->signal->shared_pending.signal = delayed.signal;
2382
p->signal->tty = tty_kref_get(current->signal->tty);
2383
/*
2384
* Inherit the has_child_subreaper flag under the same
2385
* tasklist_lock hold that adds the child to the process tree,
2386
* for the propagate_has_child_subreaper optimization.
2387
*/
2388
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
2389
p->real_parent->signal->is_child_subreaper;
2390
list_add_tail(&p->sibling, &p->real_parent->children);
2391
list_add_tail_rcu(&p->tasks, &init_task.tasks);
2392
attach_pid(p, PIDTYPE_TGID);
2393
attach_pid(p, PIDTYPE_PGID);
2394
attach_pid(p, PIDTYPE_SID);
2395
__this_cpu_inc(process_counts);
2396
} else {
2397
current->signal->nr_threads++;
2398
current->signal->quick_threads++;
2399
atomic_inc(&current->signal->live);
2400
refcount_inc(&current->signal->sigcnt);
2401
task_join_group_stop(p);
2402
list_add_tail_rcu(&p->thread_node,
2403
&p->signal->thread_head);
2404
}
2405
attach_pid(p, PIDTYPE_PID);
2406
nr_threads++;
2407
}
2408
total_forks++;
2409
hlist_del_init(&delayed.node);
2410
spin_unlock(&current->sighand->siglock);
2411
syscall_tracepoint_update(p);
2412
write_unlock_irq(&tasklist_lock);
2413
2414
if (pidfile)
2415
fd_install(pidfd, pidfile);
2416
2417
proc_fork_connector(p);
2418
sched_post_fork(p);
2419
cgroup_post_fork(p, args);
2420
perf_event_fork(p);
2421
2422
trace_task_newtask(p, clone_flags);
2423
uprobe_copy_process(p, clone_flags);
2424
user_events_fork(p, clone_flags);
2425
2426
copy_oom_score_adj(clone_flags, p);
2427
2428
return p;
2429
2430
bad_fork_core_free:
2431
sched_core_free(p);
2432
spin_unlock(&current->sighand->siglock);
2433
write_unlock_irq(&tasklist_lock);
2434
bad_fork_cancel_cgroup:
2435
cgroup_cancel_fork(p, args);
2436
bad_fork_put_pidfd:
2437
if (clone_flags & CLONE_PIDFD) {
2438
fput(pidfile);
2439
put_unused_fd(pidfd);
2440
}
2441
bad_fork_free_pid:
2442
if (pid != &init_struct_pid)
2443
free_pid(pid);
2444
bad_fork_cleanup_thread:
2445
exit_thread(p);
2446
bad_fork_cleanup_io:
2447
if (p->io_context)
2448
exit_io_context(p);
2449
bad_fork_cleanup_namespaces:
2450
exit_task_namespaces(p);
2451
bad_fork_cleanup_mm:
2452
if (p->mm) {
2453
mm_clear_owner(p->mm, p);
2454
mmput(p->mm);
2455
}
2456
bad_fork_cleanup_signal:
2457
if (!(clone_flags & CLONE_THREAD))
2458
free_signal_struct(p->signal);
2459
bad_fork_cleanup_sighand:
2460
__cleanup_sighand(p->sighand);
2461
bad_fork_cleanup_fs:
2462
exit_fs(p); /* blocking */
2463
bad_fork_cleanup_files:
2464
exit_files(p); /* blocking */
2465
bad_fork_cleanup_semundo:
2466
exit_sem(p);
2467
bad_fork_cleanup_security:
2468
security_task_free(p);
2469
bad_fork_cleanup_audit:
2470
audit_free(p);
2471
bad_fork_cleanup_perf:
2472
perf_event_free_task(p);
2473
bad_fork_sched_cancel_fork:
2474
sched_cancel_fork(p);
2475
bad_fork_cleanup_policy:
2476
lockdep_free_task(p);
2477
#ifdef CONFIG_NUMA
2478
mpol_put(p->mempolicy);
2479
#endif
2480
bad_fork_cleanup_delayacct:
2481
delayacct_tsk_free(p);
2482
bad_fork_cleanup_count:
2483
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
2484
exit_creds(p);
2485
bad_fork_free:
2486
WRITE_ONCE(p->__state, TASK_DEAD);
2487
exit_task_stack_account(p);
2488
put_task_stack(p);
2489
delayed_free_task(p);
2490
fork_out:
2491
spin_lock_irq(&current->sighand->siglock);
2492
hlist_del_init(&delayed.node);
2493
spin_unlock_irq(&current->sighand->siglock);
2494
return ERR_PTR(retval);
2495
}
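/*
 * Note on the error paths above: the bad_fork_* labels undo the copy_*()
 * and allocation steps in the reverse order in which they were performed,
 * so each label only has to clean up state that was already set up when the
 * failure happened. A minimal sketch of the pattern (names illustrative,
 * not a new API):
 *
 *	retval = copy_something(clone_flags, p);
 *	if (retval)
 *		goto bad_fork_cleanup_previous_step;
 */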
2496
2497
static inline void init_idle_pids(struct task_struct *idle)
2498
{
2499
enum pid_type type;
2500
2501
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2502
INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
2503
init_task_pid(idle, type, &init_struct_pid);
2504
}
2505
}
2506
2507
static int idle_dummy(void *dummy)
2508
{
2509
/* This function is never called */
2510
return 0;
2511
}
2512
2513
struct task_struct * __init fork_idle(int cpu)
2514
{
2515
struct task_struct *task;
2516
struct kernel_clone_args args = {
2517
.flags = CLONE_VM,
2518
.fn = &idle_dummy,
2519
.fn_arg = NULL,
2520
.kthread = 1,
2521
.idle = 1,
2522
};
2523
2524
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2525
if (!IS_ERR(task)) {
2526
init_idle_pids(task);
2527
init_idle(task, cpu);
2528
}
2529
2530
return task;
2531
}
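/*
 * fork_idle() is only used at boot/CPU bringup to create each CPU's idle
 * ("swapper") task. Passing &init_struct_pid makes copy_process() skip
 * alloc_pid(), so idle tasks never consume a real PID; init_idle_pids()
 * and init_idle() then finish turning the copy into a proper idle thread.
 */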
2532
2533
/*
2534
* This is like kernel_clone(), but shaved down and tailored to just
2535
* creating io_uring workers. It returns a created task, or an error pointer.
2536
* The returned task is inactive, and the caller must fire it up through
2537
* wake_up_new_task(p). All signals are blocked in the created task.
2538
*/
2539
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2540
{
2541
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2542
CLONE_IO;
2543
struct kernel_clone_args args = {
2544
.flags = ((lower_32_bits(flags) | CLONE_VM |
2545
CLONE_UNTRACED) & ~CSIGNAL),
2546
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
2547
.fn = fn,
2548
.fn_arg = arg,
2549
.io_thread = 1,
2550
.user_worker = 1,
2551
};
2552
2553
return copy_process(NULL, 0, node, &args);
2554
}
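/*
 * Illustrative sketch (not taken from io_uring itself) of how a caller is
 * expected to use create_io_thread(): the returned task is inactive, so the
 * creator wakes it explicitly once its own bookkeeping is done.
 * worker_fn/worker_data are placeholders for the caller's thread function
 * and argument:
 *
 *	struct task_struct *tsk;
 *
 *	tsk = create_io_thread(worker_fn, worker_data, NUMA_NO_NODE);
 *	if (IS_ERR(tsk))
 *		return PTR_ERR(tsk);
 *	wake_up_new_task(tsk);
 */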
2555
2556
/*
2557
* Ok, this is the main fork-routine.
2558
*
2559
* It copies the process, and if successful kick-starts
2560
* it and waits for it to finish using the VM if required.
2561
*
2562
* args->exit_signal is expected to be checked for sanity by the caller.
2563
*/
2564
pid_t kernel_clone(struct kernel_clone_args *args)
2565
{
2566
u64 clone_flags = args->flags;
2567
struct completion vfork;
2568
struct pid *pid;
2569
struct task_struct *p;
2570
int trace = 0;
2571
pid_t nr;
2572
2573
/*
2574
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2575
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
2576
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
2577
* field in struct clone_args and it still doesn't make sense to have
2578
* them both point at the same memory location. Performing this check
2579
* here has the advantage that we don't need to have a separate helper
2580
* to check for legacy clone().
2581
*/
2582
if ((clone_flags & CLONE_PIDFD) &&
2583
(clone_flags & CLONE_PARENT_SETTID) &&
2584
(args->pidfd == args->parent_tid))
2585
return -EINVAL;
2586
2587
/*
2588
* Determine whether and which event to report to ptracer. When
2589
* called from kernel_thread or CLONE_UNTRACED is explicitly
2590
* requested, no event is reported; otherwise, report if the event
2591
* for the type of forking is enabled.
2592
*/
2593
if (!(clone_flags & CLONE_UNTRACED)) {
2594
if (clone_flags & CLONE_VFORK)
2595
trace = PTRACE_EVENT_VFORK;
2596
else if (args->exit_signal != SIGCHLD)
2597
trace = PTRACE_EVENT_CLONE;
2598
else
2599
trace = PTRACE_EVENT_FORK;
2600
2601
if (likely(!ptrace_event_enabled(current, trace)))
2602
trace = 0;
2603
}
2604
2605
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2606
add_latent_entropy();
2607
2608
if (IS_ERR(p))
2609
return PTR_ERR(p);
2610
2611
/*
2612
* Do this prior to waking up the new thread - the thread pointer
2613
* might become invalid after that point, if the thread exits quickly.
2614
*/
2615
trace_sched_process_fork(current, p);
2616
2617
pid = get_task_pid(p, PIDTYPE_PID);
2618
nr = pid_vnr(pid);
2619
2620
if (clone_flags & CLONE_PARENT_SETTID)
2621
put_user(nr, args->parent_tid);
2622
2623
if (clone_flags & CLONE_VFORK) {
2624
p->vfork_done = &vfork;
2625
init_completion(&vfork);
2626
get_task_struct(p);
2627
}
2628
2629
if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
2630
/* lock the task to synchronize with memcg migration */
2631
task_lock(p);
2632
lru_gen_add_mm(p->mm);
2633
task_unlock(p);
2634
}
2635
2636
wake_up_new_task(p);
2637
2638
/* forking complete and child started to run, tell ptracer */
2639
if (unlikely(trace))
2640
ptrace_event_pid(trace, pid);
2641
2642
if (clone_flags & CLONE_VFORK) {
2643
if (!wait_for_vfork_done(p, &vfork))
2644
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2645
}
2646
2647
put_pid(pid);
2648
return nr;
2649
}
2650
2651
/*
2652
* Create a kernel thread.
2653
*/
2654
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
2655
unsigned long flags)
2656
{
2657
struct kernel_clone_args args = {
2658
.flags = ((lower_32_bits(flags) | CLONE_VM |
2659
CLONE_UNTRACED) & ~CSIGNAL),
2660
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
2661
.fn = fn,
2662
.fn_arg = arg,
2663
.name = name,
2664
.kthread = 1,
2665
};
2666
2667
return kernel_clone(&args);
2668
}
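/*
 * Hedged usage sketch: a raw kernel_thread() call looks like the line below,
 * with my_thread_fn being a placeholder. Most in-kernel users should prefer
 * the kthread_create()/kthread_run() helpers, which are built on top of this.
 *
 *	pid = kernel_thread(my_thread_fn, NULL, "my_thread", CLONE_FS | CLONE_FILES);
 */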
2669
2670
/*
2671
* Create a user mode thread.
2672
*/
2673
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
2674
{
2675
struct kernel_clone_args args = {
2676
.flags = ((lower_32_bits(flags) | CLONE_VM |
2677
CLONE_UNTRACED) & ~CSIGNAL),
2678
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
2679
.fn = fn,
2680
.fn_arg = arg,
2681
};
2682
2683
return kernel_clone(&args);
2684
}
2685
2686
#ifdef __ARCH_WANT_SYS_FORK
2687
SYSCALL_DEFINE0(fork)
2688
{
2689
#ifdef CONFIG_MMU
2690
struct kernel_clone_args args = {
2691
.exit_signal = SIGCHLD,
2692
};
2693
2694
return kernel_clone(&args);
2695
#else
2696
/* cannot be supported in nommu mode */
2697
return -EINVAL;
2698
#endif
2699
}
2700
#endif
2701
2702
#ifdef __ARCH_WANT_SYS_VFORK
2703
SYSCALL_DEFINE0(vfork)
2704
{
2705
struct kernel_clone_args args = {
2706
.flags = CLONE_VFORK | CLONE_VM,
2707
.exit_signal = SIGCHLD,
2708
};
2709
2710
return kernel_clone(&args);
2711
}
2712
#endif
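/*
 * For reference, the classic userspace pattern this implements: vfork()
 * suspends the parent (via the vfork_done completion handled in
 * kernel_clone()) until the child calls execve() or _exit(). Illustrative
 * userspace sketch:
 *
 *	pid_t pid = vfork();
 *	if (pid == 0) {
 *		execl("/bin/true", "true", (char *)NULL);
 *		_exit(127);
 *	}
 */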
2713
2714
#ifdef __ARCH_WANT_SYS_CLONE
2715
#ifdef CONFIG_CLONE_BACKWARDS
2716
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2717
int __user *, parent_tidptr,
2718
unsigned long, tls,
2719
int __user *, child_tidptr)
2720
#elif defined(CONFIG_CLONE_BACKWARDS2)
2721
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2722
int __user *, parent_tidptr,
2723
int __user *, child_tidptr,
2724
unsigned long, tls)
2725
#elif defined(CONFIG_CLONE_BACKWARDS3)
2726
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2727
int, stack_size,
2728
int __user *, parent_tidptr,
2729
int __user *, child_tidptr,
2730
unsigned long, tls)
2731
#else
2732
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2733
int __user *, parent_tidptr,
2734
int __user *, child_tidptr,
2735
unsigned long, tls)
2736
#endif
2737
{
2738
struct kernel_clone_args args = {
2739
.flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
2740
.pidfd = parent_tidptr,
2741
.child_tid = child_tidptr,
2742
.parent_tid = parent_tidptr,
2743
.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
2744
.stack = newsp,
2745
.tls = tls,
2746
};
2747
2748
return kernel_clone(&args);
2749
}
2750
#endif
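/*
 * The CONFIG_CLONE_BACKWARDS* variants above only shuffle the argument order
 * for historical per-architecture ABIs; they all build the same
 * kernel_clone_args. A hedged sketch of the glibc clone() wrapper that ends
 * up here (child_fn and STACK_SIZE are placeholders):
 *
 *	char *stack = malloc(STACK_SIZE);
 *	pid = clone(child_fn, stack + STACK_SIZE,
 *		    CLONE_VM | CLONE_FS | SIGCHLD, NULL);
 */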
2751
2752
static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2753
struct clone_args __user *uargs,
2754
size_t usize)
2755
{
2756
int err;
2757
struct clone_args args;
2758
pid_t *kset_tid = kargs->set_tid;
2759
2760
BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
2761
CLONE_ARGS_SIZE_VER0);
2762
BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
2763
CLONE_ARGS_SIZE_VER1);
2764
BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
2765
CLONE_ARGS_SIZE_VER2);
2766
BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
2767
2768
if (unlikely(usize > PAGE_SIZE))
2769
return -E2BIG;
2770
if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2771
return -EINVAL;
2772
2773
err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
2774
if (err)
2775
return err;
2776
2777
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2778
return -EINVAL;
2779
2780
if (unlikely(!args.set_tid && args.set_tid_size > 0))
2781
return -EINVAL;
2782
2783
if (unlikely(args.set_tid && args.set_tid_size == 0))
2784
return -EINVAL;
2785
2786
/*
2787
* Verify that the higher 32 bits of exit_signal are unset and that
2788
* it is a valid signal.
2789
*/
2790
if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2791
!valid_signal(args.exit_signal)))
2792
return -EINVAL;
2793
2794
if ((args.flags & CLONE_INTO_CGROUP) &&
2795
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
2796
return -EINVAL;
2797
2798
*kargs = (struct kernel_clone_args){
2799
.flags = args.flags,
2800
.pidfd = u64_to_user_ptr(args.pidfd),
2801
.child_tid = u64_to_user_ptr(args.child_tid),
2802
.parent_tid = u64_to_user_ptr(args.parent_tid),
2803
.exit_signal = args.exit_signal,
2804
.stack = args.stack,
2805
.stack_size = args.stack_size,
2806
.tls = args.tls,
2807
.set_tid_size = args.set_tid_size,
2808
.cgroup = args.cgroup,
2809
};
2810
2811
if (args.set_tid &&
2812
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2813
(kargs->set_tid_size * sizeof(pid_t))))
2814
return -EFAULT;
2815
2816
kargs->set_tid = kset_tid;
2817
2818
return 0;
2819
}
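/*
 * struct clone_args is versioned by size: older userspace can keep passing a
 * CLONE_ARGS_SIZE_VER0-sized struct and copy_struct_from_user() zero-fills
 * the newer fields, while callers newer than the kernel are rejected with
 * -E2BIG unless their extra bytes are all zero. Hedged userspace sketch:
 *
 *	struct clone_args args = { .exit_signal = SIGCHLD };
 *
 *	pid = syscall(SYS_clone3, &args, CLONE_ARGS_SIZE_VER0);
 */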
2820
2821
/**
2822
* clone3_stack_valid - check and prepare stack
2823
* @kargs: kernel clone args
2824
*
2825
* Verify that the stack arguments userspace gave us are sane.
2826
* In addition, set the stack direction for userspace since it's easy for us to
2827
* determine.
2828
*/
2829
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2830
{
2831
if (kargs->stack == 0) {
2832
if (kargs->stack_size > 0)
2833
return false;
2834
} else {
2835
if (kargs->stack_size == 0)
2836
return false;
2837
2838
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2839
return false;
2840
2841
#if !defined(CONFIG_STACK_GROWSUP)
2842
kargs->stack += kargs->stack_size;
2843
#endif
2844
}
2845
2846
return true;
2847
}
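/*
 * In other words: clone3() callers pass the lowest address of the stack in
 * ->stack and its size in ->stack_size, and the kernel picks the correct end
 * for the architecture (the top, unless the stack grows up). Hedged
 * userspace sketch (stack_base and STACK_SIZE are placeholders):
 *
 *	args.stack      = (__u64)(uintptr_t)stack_base;
 *	args.stack_size = STACK_SIZE;
 */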
2848
2849
static bool clone3_args_valid(struct kernel_clone_args *kargs)
2850
{
2851
/* Verify that no unknown flags are passed along. */
2852
if (kargs->flags &
2853
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2854
return false;
2855
2856
/*
2857
* - make the CLONE_DETACHED bit reusable for clone3
2858
* - make the CSIGNAL bits reusable for clone3
2859
*/
2860
if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
2861
return false;
2862
2863
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2864
(CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2865
return false;
2866
2867
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2868
kargs->exit_signal)
2869
return false;
2870
2871
if (!clone3_stack_valid(kargs))
2872
return false;
2873
2874
return true;
2875
}
2876
2877
/**
2878
* sys_clone3 - create a new process with specific properties
2879
* @uargs: argument structure
2880
* @size: size of @uargs
2881
*
2882
* clone3() is the extensible successor to clone()/clone2().
2883
* It takes a struct as argument that is versioned by its size.
2884
*
2885
* Return: On success, a positive PID for the child process.
2886
* On error, a negative errno number.
2887
*/
2888
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2889
{
2890
int err;
2891
2892
struct kernel_clone_args kargs;
2893
pid_t set_tid[MAX_PID_NS_LEVEL];
2894
2895
#ifdef __ARCH_BROKEN_SYS_CLONE3
2896
#warning clone3() entry point is missing, please fix
2897
return -ENOSYS;
2898
#endif
2899
2900
kargs.set_tid = set_tid;
2901
2902
err = copy_clone_args_from_user(&kargs, uargs, size);
2903
if (err)
2904
return err;
2905
2906
if (!clone3_args_valid(&kargs))
2907
return -EINVAL;
2908
2909
return kernel_clone(&kargs);
2910
}
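/*
 * Hedged userspace sketch of a clone3() call; glibc currently provides no
 * wrapper, so it is invoked through syscall(2). Requesting CLONE_PIDFD makes
 * the kernel write a pidfd for the child into *pidfd:
 *
 *	int pidfd = -1;
 *	struct clone_args args = {
 *		.flags       = CLONE_PIDFD,
 *		.pidfd       = (__u64)(uintptr_t)&pidfd,
 *		.exit_signal = SIGCHLD,
 *	};
 *	pid_t pid = syscall(SYS_clone3, &args, sizeof(args));
 */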
2911
2912
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2913
{
2914
struct task_struct *leader, *parent, *child;
2915
int res;
2916
2917
read_lock(&tasklist_lock);
2918
leader = top = top->group_leader;
2919
down:
2920
for_each_thread(leader, parent) {
2921
list_for_each_entry(child, &parent->children, sibling) {
2922
res = visitor(child, data);
2923
if (res) {
2924
if (res < 0)
2925
goto out;
2926
leader = child;
2927
goto down;
2928
}
2929
up:
2930
;
2931
}
2932
}
2933
2934
if (leader != top) {
2935
child = leader;
2936
parent = child->real_parent;
2937
leader = parent->group_leader;
2938
goto up;
2939
}
2940
out:
2941
read_unlock(&tasklist_lock);
2942
}
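/*
 * walk_process_tree() does a depth-first walk of every descendant of @top
 * under tasklist_lock. The visitor's return value steers the walk: negative
 * aborts it, positive descends into that child's own children, and zero
 * moves on to the next sibling. Illustrative visitor (not an existing
 * helper) that counts descendants while descending everywhere:
 *
 *	static int count_tasks(struct task_struct *p, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 1;
 *	}
 */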
2943
2944
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2945
#define ARCH_MIN_MMSTRUCT_ALIGN 0
2946
#endif
2947
2948
static void sighand_ctor(void *data)
2949
{
2950
struct sighand_struct *sighand = data;
2951
2952
spin_lock_init(&sighand->siglock);
2953
init_waitqueue_head(&sighand->signalfd_wqh);
2954
}
2955
2956
void __init mm_cache_init(void)
2957
{
2958
unsigned int mm_size;
2959
2960
/*
2961
* The mm_cpumask is located at the end of mm_struct, and is
2962
* dynamically sized based on the maximum CPU number this system
2963
* can have, taking hotplug into account (nr_cpu_ids).
2964
*/
2965
mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
2966
2967
mm_cachep = kmem_cache_create_usercopy("mm_struct",
2968
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
2969
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2970
offsetof(struct mm_struct, saved_auxv),
2971
sizeof_field(struct mm_struct, saved_auxv),
2972
NULL);
2973
}
2974
2975
void __init proc_caches_init(void)
2976
{
2977
sighand_cachep = kmem_cache_create("sighand_cache",
2978
sizeof(struct sighand_struct), 0,
2979
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2980
SLAB_ACCOUNT, sighand_ctor);
2981
signal_cachep = kmem_cache_create("signal_cache",
2982
sizeof(struct signal_struct), 0,
2983
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2984
NULL);
2985
files_cachep = kmem_cache_create("files_cache",
2986
sizeof(struct files_struct), 0,
2987
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2988
NULL);
2989
fs_cachep = kmem_cache_create("fs_cache",
2990
sizeof(struct fs_struct), 0,
2991
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2992
NULL);
2993
mmap_init();
2994
nsproxy_cache_init();
2995
}
2996
2997
/*
2998
* Check constraints on flags passed to the unshare system call.
2999
*/
3000
static int check_unshare_flags(unsigned long unshare_flags)
3001
{
3002
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
3003
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
3004
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
3005
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
3006
CLONE_NEWTIME))
3007
return -EINVAL;
3008
/*
3009
* Not implemented, but pretend it works if there is nothing
3010
* to unshare. Note that unsharing the address space or the
3011
* signal handlers also requires unsharing the signal queues (aka
3012
* CLONE_THREAD).
3013
*/
3014
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
3015
if (!thread_group_empty(current))
3016
return -EINVAL;
3017
}
3018
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
3019
if (refcount_read(&current->sighand->count) > 1)
3020
return -EINVAL;
3021
}
3022
if (unshare_flags & CLONE_VM) {
3023
if (!current_is_single_threaded())
3024
return -EINVAL;
3025
}
3026
3027
return 0;
3028
}
3029
3030
/*
3031
* Unshare the filesystem structure if it is being shared
3032
*/
3033
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
3034
{
3035
struct fs_struct *fs = current->fs;
3036
3037
if (!(unshare_flags & CLONE_FS) || !fs)
3038
return 0;
3039
3040
/* don't need lock here; in the worst case we'll do a useless copy */
3041
if (fs->users == 1)
3042
return 0;
3043
3044
*new_fsp = copy_fs_struct(fs);
3045
if (!*new_fsp)
3046
return -ENOMEM;
3047
3048
return 0;
3049
}
3050
3051
/*
3052
* Unshare file descriptor table if it is being shared
3053
*/
3054
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
3055
{
3056
struct files_struct *fd = current->files;
3057
3058
if ((unshare_flags & CLONE_FILES) &&
3059
(fd && atomic_read(&fd->count) > 1)) {
3060
fd = dup_fd(fd, NULL);
3061
if (IS_ERR(fd))
3062
return PTR_ERR(fd);
3063
*new_fdp = fd;
3064
}
3065
3066
return 0;
3067
}
3068
3069
/*
3070
* unshare allows a process to 'unshare' part of the process
3071
* context which was originally shared using clone. copy_*
3072
* functions used by kernel_clone() cannot be used here directly
3073
* because they modify an inactive task_struct that is being
3074
* constructed. Here we are modifying the current, active
3075
* task_struct.
3076
*/
3077
int ksys_unshare(unsigned long unshare_flags)
3078
{
3079
struct fs_struct *fs, *new_fs = NULL;
3080
struct files_struct *new_fd = NULL;
3081
struct cred *new_cred = NULL;
3082
struct nsproxy *new_nsproxy = NULL;
3083
int do_sysvsem = 0;
3084
int err;
3085
3086
/*
3087
* If unsharing a user namespace, we must also unshare the thread group
3088
* and the filesystem root and working directories.
3089
*/
3090
if (unshare_flags & CLONE_NEWUSER)
3091
unshare_flags |= CLONE_THREAD | CLONE_FS;
3092
/*
3093
* If unsharing the VM, we must also unshare the signal handlers.
3094
*/
3095
if (unshare_flags & CLONE_VM)
3096
unshare_flags |= CLONE_SIGHAND;
3097
/*
3098
* If unsharing signal handlers, we must also unshare the signal queues.
3099
*/
3100
if (unshare_flags & CLONE_SIGHAND)
3101
unshare_flags |= CLONE_THREAD;
3102
/*
3103
* If unsharing the mount namespace, we must also unshare filesystem information.
3104
*/
3105
if (unshare_flags & CLONE_NEWNS)
3106
unshare_flags |= CLONE_FS;
3107
3108
err = check_unshare_flags(unshare_flags);
3109
if (err)
3110
goto bad_unshare_out;
3111
/*
3112
* CLONE_NEWIPC must also detach from the undolist: after switching
3113
* to a new ipc namespace, the semaphore arrays from the old
3114
* namespace are unreachable.
3115
*/
3116
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
3117
do_sysvsem = 1;
3118
err = unshare_fs(unshare_flags, &new_fs);
3119
if (err)
3120
goto bad_unshare_out;
3121
err = unshare_fd(unshare_flags, &new_fd);
3122
if (err)
3123
goto bad_unshare_cleanup_fs;
3124
err = unshare_userns(unshare_flags, &new_cred);
3125
if (err)
3126
goto bad_unshare_cleanup_fd;
3127
err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
3128
new_cred, new_fs);
3129
if (err)
3130
goto bad_unshare_cleanup_cred;
3131
3132
if (new_cred) {
3133
err = set_cred_ucounts(new_cred);
3134
if (err)
3135
goto bad_unshare_cleanup_cred;
3136
}
3137
3138
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
3139
if (do_sysvsem) {
3140
/*
3141
* CLONE_SYSVSEM is equivalent to sys_exit().
3142
*/
3143
exit_sem(current);
3144
}
3145
if (unshare_flags & CLONE_NEWIPC) {
3146
/* Orphan segments in old ns (see sem above). */
3147
exit_shm(current);
3148
shm_init_task(current);
3149
}
3150
3151
if (new_nsproxy)
3152
switch_task_namespaces(current, new_nsproxy);
3153
3154
task_lock(current);
3155
3156
if (new_fs) {
3157
fs = current->fs;
3158
read_seqlock_excl(&fs->seq);
3159
current->fs = new_fs;
3160
if (--fs->users)
3161
new_fs = NULL;
3162
else
3163
new_fs = fs;
3164
read_sequnlock_excl(&fs->seq);
3165
}
3166
3167
if (new_fd)
3168
swap(current->files, new_fd);
3169
3170
task_unlock(current);
3171
3172
if (new_cred) {
3173
/* Install the new user namespace */
3174
commit_creds(new_cred);
3175
new_cred = NULL;
3176
}
3177
}
3178
3179
perf_event_namespaces(current);
3180
3181
bad_unshare_cleanup_cred:
3182
if (new_cred)
3183
put_cred(new_cred);
3184
bad_unshare_cleanup_fd:
3185
if (new_fd)
3186
put_files_struct(new_fd);
3187
3188
bad_unshare_cleanup_fs:
3189
if (new_fs)
3190
free_fs_struct(new_fs);
3191
3192
bad_unshare_out:
3193
return err;
3194
}
3195
3196
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
3197
{
3198
return ksys_unshare(unshare_flags);
3199
}
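/*
 * Hedged userspace sketch: unshare(2) applies the CLONE_* semantics above to
 * the calling task itself. For example, giving the caller a private mount
 * and UTS namespace (both require the appropriate capabilities):
 *
 *	if (unshare(CLONE_NEWNS | CLONE_NEWUTS) == 0)
 *		sethostname("sandbox", strlen("sandbox"));
 */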
3200
3201
/*
3202
* Helper to unshare the files of the current task.
3203
* We don't want to expose copy_files internals to
3204
* the exec layer of the kernel.
3205
*/
3206
3207
int unshare_files(void)
3208
{
3209
struct task_struct *task = current;
3210
struct files_struct *old, *copy = NULL;
3211
int error;
3212
3213
error = unshare_fd(CLONE_FILES, &copy);
3214
if (error || !copy)
3215
return error;
3216
3217
old = task->files;
3218
task_lock(task);
3219
task->files = copy;
3220
task_unlock(task);
3221
put_files_struct(old);
3222
return 0;
3223
}
3224
3225
static int sysctl_max_threads(const struct ctl_table *table, int write,
3226
void *buffer, size_t *lenp, loff_t *ppos)
3227
{
3228
struct ctl_table t;
3229
int ret;
3230
int threads = max_threads;
3231
int min = 1;
3232
int max = MAX_THREADS;
3233
3234
t = *table;
3235
t.data = &threads;
3236
t.extra1 = &min;
3237
t.extra2 = &max;
3238
3239
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3240
if (ret || !write)
3241
return ret;
3242
3243
max_threads = threads;
3244
3245
return 0;
3246
}
3247
3248
static const struct ctl_table fork_sysctl_table[] = {
3249
{
3250
.procname = "threads-max",
3251
.data = NULL,
3252
.maxlen = sizeof(int),
3253
.mode = 0644,
3254
.proc_handler = sysctl_max_threads,
3255
},
3256
};
3257
3258
static int __init init_fork_sysctl(void)
3259
{
3260
register_sysctl_init("kernel", fork_sysctl_table);
3261
return 0;
3262
}
3263
3264
subsys_initcall(init_fork_sysctl);
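/*
 * This registers /proc/sys/kernel/threads-max. Writes go through
 * sysctl_max_threads() above, which rejects values outside [1, MAX_THREADS]
 * before updating max_threads; reads report the current limit.
 */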
3265
3266