GitHub Repository: torvalds/linux
Path: blob/master/kernel/fork.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* linux/kernel/fork.c
4
*
5
* Copyright (C) 1991, 1992 Linus Torvalds
6
*/
7
8
/*
9
* 'fork.c' contains the help-routines for the 'fork' system call
10
* (see also entry.S and others).
11
* Fork is rather simple, once you get the hang of it, but the memory
12
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
13
*/
14
15
#include <linux/anon_inodes.h>
16
#include <linux/slab.h>
17
#include <linux/sched/autogroup.h>
18
#include <linux/sched/mm.h>
19
#include <linux/sched/user.h>
20
#include <linux/sched/numa_balancing.h>
21
#include <linux/sched/stat.h>
22
#include <linux/sched/task.h>
23
#include <linux/sched/task_stack.h>
24
#include <linux/sched/cputime.h>
25
#include <linux/sched/ext.h>
26
#include <linux/seq_file.h>
27
#include <linux/rtmutex.h>
28
#include <linux/init.h>
29
#include <linux/unistd.h>
30
#include <linux/module.h>
31
#include <linux/vmalloc.h>
32
#include <linux/completion.h>
33
#include <linux/personality.h>
34
#include <linux/mempolicy.h>
35
#include <linux/sem.h>
36
#include <linux/file.h>
37
#include <linux/fdtable.h>
38
#include <linux/iocontext.h>
39
#include <linux/key.h>
40
#include <linux/kmsan.h>
41
#include <linux/binfmts.h>
42
#include <linux/mman.h>
43
#include <linux/mmu_notifier.h>
44
#include <linux/fs.h>
45
#include <linux/mm.h>
46
#include <linux/mm_inline.h>
47
#include <linux/memblock.h>
48
#include <linux/nsproxy.h>
49
#include <linux/capability.h>
50
#include <linux/cpu.h>
51
#include <linux/cgroup.h>
52
#include <linux/security.h>
53
#include <linux/hugetlb.h>
54
#include <linux/seccomp.h>
55
#include <linux/swap.h>
56
#include <linux/syscalls.h>
57
#include <linux/syscall_user_dispatch.h>
58
#include <linux/jiffies.h>
59
#include <linux/futex.h>
60
#include <linux/compat.h>
61
#include <linux/kthread.h>
62
#include <linux/task_io_accounting_ops.h>
63
#include <linux/rcupdate.h>
64
#include <linux/ptrace.h>
65
#include <linux/mount.h>
66
#include <linux/audit.h>
67
#include <linux/memcontrol.h>
68
#include <linux/ftrace.h>
69
#include <linux/proc_fs.h>
70
#include <linux/profile.h>
71
#include <linux/rmap.h>
72
#include <linux/ksm.h>
73
#include <linux/acct.h>
74
#include <linux/userfaultfd_k.h>
75
#include <linux/tsacct_kern.h>
76
#include <linux/cn_proc.h>
77
#include <linux/freezer.h>
78
#include <linux/delayacct.h>
79
#include <linux/taskstats_kern.h>
80
#include <linux/tty.h>
81
#include <linux/fs_struct.h>
82
#include <linux/magic.h>
83
#include <linux/perf_event.h>
84
#include <linux/posix-timers.h>
85
#include <linux/user-return-notifier.h>
86
#include <linux/oom.h>
87
#include <linux/khugepaged.h>
88
#include <linux/signalfd.h>
89
#include <linux/uprobes.h>
90
#include <linux/aio.h>
91
#include <linux/compiler.h>
92
#include <linux/sysctl.h>
93
#include <linux/kcov.h>
94
#include <linux/livepatch.h>
95
#include <linux/thread_info.h>
96
#include <linux/kstack_erase.h>
97
#include <linux/kasan.h>
98
#include <linux/scs.h>
99
#include <linux/io_uring.h>
100
#include <linux/bpf.h>
101
#include <linux/stackprotector.h>
102
#include <linux/user_events.h>
103
#include <linux/iommu.h>
104
#include <linux/rseq.h>
105
#include <uapi/linux/pidfd.h>
106
#include <linux/pidfs.h>
107
#include <linux/tick.h>
108
#include <linux/unwind_deferred.h>
109
#include <linux/pgalloc.h>
110
#include <linux/uaccess.h>
111
112
#include <asm/mmu_context.h>
113
#include <asm/cacheflush.h>
114
#include <asm/tlbflush.h>
115
116
/* For dup_mmap(). */
117
#include "../mm/internal.h"
118
119
#include <trace/events/sched.h>
120
121
#define CREATE_TRACE_POINTS
122
#include <trace/events/task.h>
123
124
#include <kunit/visibility.h>
125
126
/*
127
* Minimum number of threads to boot the kernel
128
*/
129
#define MIN_THREADS 20
130
131
/*
132
* Maximum number of threads
133
*/
134
#define MAX_THREADS FUTEX_TID_MASK
135
136
/*
137
* Protected counters by write_lock_irq(&tasklist_lock)
138
*/
139
unsigned long total_forks; /* Handle normal Linux uptimes. */
140
int nr_threads; /* The idle threads do not count.. */
141
142
static int max_threads; /* tunable limit on nr_threads */
143
144
#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
145
146
static const char * const resident_page_types[] = {
147
NAMED_ARRAY_INDEX(MM_FILEPAGES),
148
NAMED_ARRAY_INDEX(MM_ANONPAGES),
149
NAMED_ARRAY_INDEX(MM_SWAPENTS),
150
NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
151
};
152
153
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
154
155
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
156
157
#ifdef CONFIG_PROVE_RCU
158
int lockdep_tasklist_lock_is_held(void)
159
{
160
return lockdep_is_held(&tasklist_lock);
161
}
162
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
163
#endif /* #ifdef CONFIG_PROVE_RCU */
164
165
int nr_processes(void)
166
{
167
int cpu;
168
int total = 0;
169
170
for_each_possible_cpu(cpu)
171
total += per_cpu(process_counts, cpu);
172
173
return total;
174
}
175
176
void __weak arch_release_task_struct(struct task_struct *tsk)
177
{
178
}
179
180
static struct kmem_cache *task_struct_cachep;
181
182
static inline struct task_struct *alloc_task_struct_node(int node)
183
{
184
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
185
}
186
187
static inline void free_task_struct(struct task_struct *tsk)
188
{
189
kmem_cache_free(task_struct_cachep, tsk);
190
}
191
192
#ifdef CONFIG_VMAP_STACK
193
/*
194
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
195
* flush. Try to minimize the number of calls by caching stacks.
196
*/
197
#define NR_CACHED_STACKS 2
198
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
199
/*
200
* Allocated stacks are cached and later reused by new threads, so memcg
201
* accounting is performed by the code assigning/releasing stacks to tasks.
202
* We need zeroed memory without __GFP_ACCOUNT.
203
*/
204
#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
205
206
struct vm_stack {
207
struct rcu_head rcu;
208
struct vm_struct *stack_vm_area;
209
};
210
211
static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
212
{
213
struct vm_struct *vm_area;
214
unsigned int i;
215
216
/*
217
* If the node has memory, we are guaranteed the stacks are backed by local pages.
218
* Otherwise the pages are arbitrary.
219
*
220
* Note that depending on cpuset it is possible we will get migrated to a different
221
* node immediately after allocating here, so this does *not* guarantee locality for
222
* arbitrary callers.
223
*/
224
scoped_guard(preempt) {
225
if (node != NUMA_NO_NODE && numa_node_id() != node)
226
return NULL;
227
228
for (i = 0; i < NR_CACHED_STACKS; i++) {
229
vm_area = this_cpu_xchg(cached_stacks[i], NULL);
230
if (vm_area)
231
return vm_area;
232
}
233
}
234
235
return NULL;
236
}
237
238
static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
239
{
240
unsigned int i;
241
int nid;
242
243
/*
244
* Don't cache stacks if any of the pages don't match the local domain, unless
245
* there is no local memory to begin with.
246
*
247
* Note that lack of local memory does not automatically mean it makes no difference
248
* performance-wise which other domain backs the stack. In this case we are merely
249
* trying to avoid constantly going to vmalloc.
250
*/
251
scoped_guard(preempt) {
252
nid = numa_node_id();
253
if (node_state(nid, N_MEMORY)) {
254
for (i = 0; i < vm_area->nr_pages; i++) {
255
struct page *page = vm_area->pages[i];
256
if (page_to_nid(page) != nid)
257
return false;
258
}
259
}
260
261
for (i = 0; i < NR_CACHED_STACKS; i++) {
262
struct vm_struct *tmp = NULL;
263
264
if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
265
return true;
266
}
267
}
268
return false;
269
}
270
271
static void thread_stack_free_rcu(struct rcu_head *rh)
272
{
273
struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
274
struct vm_struct *vm_area = vm_stack->stack_vm_area;
275
276
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
277
return;
278
279
vfree(vm_area->addr);
280
}
281
282
static void thread_stack_delayed_free(struct task_struct *tsk)
283
{
284
struct vm_stack *vm_stack = tsk->stack;
285
286
vm_stack->stack_vm_area = tsk->stack_vm_area;
287
call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
288
}
289
290
static int free_vm_stack_cache(unsigned int cpu)
291
{
292
struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
293
int i;
294
295
for (i = 0; i < NR_CACHED_STACKS; i++) {
296
struct vm_struct *vm_area = cached_vm_stack_areas[i];
297
298
if (!vm_area)
299
continue;
300
301
vfree(vm_area->addr);
302
cached_vm_stack_areas[i] = NULL;
303
}
304
305
return 0;
306
}
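/*
* free_vm_stack_cache() above is registered in fork_init() as the
* teardown callback of the "fork:vm_stack_cache" CPU hotplug state, so
* the stacks cached by a CPU that goes offline are freed rather than
* leaked.
*/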
307
308
static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
309
{
310
int i;
311
int ret;
312
int nr_charged = 0;
313
314
BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
315
316
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
317
ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
318
if (ret)
319
goto err;
320
nr_charged++;
321
}
322
return 0;
323
err:
324
for (i = 0; i < nr_charged; i++)
325
memcg_kmem_uncharge_page(vm_area->pages[i], 0);
326
return ret;
327
}
328
329
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
330
{
331
struct vm_struct *vm_area;
332
void *stack;
333
334
vm_area = alloc_thread_stack_node_from_cache(tsk, node);
335
if (vm_area) {
336
if (memcg_charge_kernel_stack(vm_area)) {
337
vfree(vm_area->addr);
338
return -ENOMEM;
339
}
340
341
/* Reset stack metadata. */
342
kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
343
344
stack = kasan_reset_tag(vm_area->addr);
345
346
/* Clear stale pointers from reused stack. */
347
memset(stack, 0, THREAD_SIZE);
348
349
tsk->stack_vm_area = vm_area;
350
tsk->stack = stack;
351
return 0;
352
}
353
354
stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
355
GFP_VMAP_STACK,
356
node, __builtin_return_address(0));
357
if (!stack)
358
return -ENOMEM;
359
360
vm_area = find_vm_area(stack);
361
if (memcg_charge_kernel_stack(vm_area)) {
362
vfree(stack);
363
return -ENOMEM;
364
}
365
/*
366
* We can't call find_vm_area() in interrupt context, and
367
* free_thread_stack() can be called in interrupt context,
368
* so cache the vm_struct.
369
*/
370
tsk->stack_vm_area = vm_area;
371
stack = kasan_reset_tag(stack);
372
tsk->stack = stack;
373
return 0;
374
}
375
376
static void free_thread_stack(struct task_struct *tsk)
377
{
378
if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
379
thread_stack_delayed_free(tsk);
380
381
tsk->stack = NULL;
382
tsk->stack_vm_area = NULL;
383
}
384
385
#else /* !CONFIG_VMAP_STACK */
386
387
/*
388
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
389
* kmemcache based allocator.
390
*/
391
#if THREAD_SIZE >= PAGE_SIZE
392
393
static void thread_stack_free_rcu(struct rcu_head *rh)
394
{
395
__free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
396
}
397
398
static void thread_stack_delayed_free(struct task_struct *tsk)
399
{
400
struct rcu_head *rh = tsk->stack;
401
402
call_rcu(rh, thread_stack_free_rcu);
403
}
404
405
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
406
{
407
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
408
THREAD_SIZE_ORDER);
409
410
if (likely(page)) {
411
tsk->stack = kasan_reset_tag(page_address(page));
412
return 0;
413
}
414
return -ENOMEM;
415
}
416
417
static void free_thread_stack(struct task_struct *tsk)
418
{
419
thread_stack_delayed_free(tsk);
420
tsk->stack = NULL;
421
}
422
423
#else /* !(THREAD_SIZE >= PAGE_SIZE) */
424
425
static struct kmem_cache *thread_stack_cache;
426
427
static void thread_stack_free_rcu(struct rcu_head *rh)
428
{
429
kmem_cache_free(thread_stack_cache, rh);
430
}
431
432
static void thread_stack_delayed_free(struct task_struct *tsk)
433
{
434
struct rcu_head *rh = tsk->stack;
435
436
call_rcu(rh, thread_stack_free_rcu);
437
}
438
439
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
440
{
441
unsigned long *stack;
442
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
443
stack = kasan_reset_tag(stack);
444
tsk->stack = stack;
445
return stack ? 0 : -ENOMEM;
446
}
447
448
static void free_thread_stack(struct task_struct *tsk)
449
{
450
thread_stack_delayed_free(tsk);
451
tsk->stack = NULL;
452
}
453
454
void thread_stack_cache_init(void)
455
{
456
thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
457
THREAD_SIZE, THREAD_SIZE, 0, 0,
458
THREAD_SIZE, NULL);
459
BUG_ON(thread_stack_cache == NULL);
460
}
461
462
#endif /* THREAD_SIZE >= PAGE_SIZE */
463
#endif /* CONFIG_VMAP_STACK */
464
465
/* SLAB cache for signal_struct structures (tsk->signal) */
466
static struct kmem_cache *signal_cachep;
467
468
/* SLAB cache for sighand_struct structures (tsk->sighand) */
469
struct kmem_cache *sighand_cachep;
470
471
/* SLAB cache for files_struct structures (tsk->files) */
472
struct kmem_cache *files_cachep;
473
474
/* SLAB cache for fs_struct structures (tsk->fs) */
475
struct kmem_cache *fs_cachep;
476
477
/* SLAB cache for mm_struct structures (tsk->mm) */
478
static struct kmem_cache *mm_cachep;
479
480
static void account_kernel_stack(struct task_struct *tsk, int account)
481
{
482
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
483
struct vm_struct *vm_area = task_stack_vm_area(tsk);
484
int i;
485
486
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
487
mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
488
account * (PAGE_SIZE / 1024));
489
} else {
490
void *stack = task_stack_page(tsk);
491
492
/* All stack pages are in the same node. */
493
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
494
account * (THREAD_SIZE / 1024));
495
}
496
}
497
498
void exit_task_stack_account(struct task_struct *tsk)
499
{
500
account_kernel_stack(tsk, -1);
501
502
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
503
struct vm_struct *vm_area;
504
int i;
505
506
vm_area = task_stack_vm_area(tsk);
507
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
508
memcg_kmem_uncharge_page(vm_area->pages[i], 0);
509
}
510
}
511
512
static void release_task_stack(struct task_struct *tsk)
513
{
514
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
515
return; /* Better to leak the stack than to free prematurely */
516
517
free_thread_stack(tsk);
518
}
519
520
#ifdef CONFIG_THREAD_INFO_IN_TASK
521
void put_task_stack(struct task_struct *tsk)
522
{
523
if (refcount_dec_and_test(&tsk->stack_refcount))
524
release_task_stack(tsk);
525
}
526
#endif
527
528
void free_task(struct task_struct *tsk)
529
{
530
#ifdef CONFIG_SECCOMP
531
WARN_ON_ONCE(tsk->seccomp.filter);
532
#endif
533
release_user_cpus_ptr(tsk);
534
scs_release(tsk);
535
536
#ifndef CONFIG_THREAD_INFO_IN_TASK
537
/*
538
* The task is finally done with both the stack and thread_info,
539
* so free both.
540
*/
541
release_task_stack(tsk);
542
#else
543
/*
544
* If the task had a separate stack allocation, it should be gone
545
* by now.
546
*/
547
WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
548
#endif
549
rt_mutex_debug_task_free(tsk);
550
ftrace_graph_exit_task(tsk);
551
arch_release_task_struct(tsk);
552
if (tsk->flags & PF_KTHREAD)
553
free_kthread_struct(tsk);
554
bpf_task_storage_free(tsk);
555
free_task_struct(tsk);
556
}
557
EXPORT_SYMBOL(free_task);
558
559
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
560
{
561
struct file *exe_file;
562
563
exe_file = get_mm_exe_file(oldmm);
564
RCU_INIT_POINTER(mm->exe_file, exe_file);
565
/*
566
* We depend on the oldmm having properly denied write access to the
567
* exe_file already.
568
*/
569
if (exe_file && exe_file_deny_write_access(exe_file))
570
pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
571
}
572
573
#ifdef CONFIG_MMU
574
static inline int mm_alloc_pgd(struct mm_struct *mm)
575
{
576
mm->pgd = pgd_alloc(mm);
577
if (unlikely(!mm->pgd))
578
return -ENOMEM;
579
return 0;
580
}
581
582
static inline void mm_free_pgd(struct mm_struct *mm)
583
{
584
pgd_free(mm, mm->pgd);
585
}
586
#else
587
#define mm_alloc_pgd(mm) (0)
588
#define mm_free_pgd(mm)
589
#endif /* CONFIG_MMU */
590
591
#ifdef CONFIG_MM_ID
592
static DEFINE_IDA(mm_ida);
593
594
static inline int mm_alloc_id(struct mm_struct *mm)
595
{
596
int ret;
597
598
ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
599
if (ret < 0)
600
return ret;
601
mm->mm_id = ret;
602
return 0;
603
}
604
605
static inline void mm_free_id(struct mm_struct *mm)
606
{
607
const mm_id_t id = mm->mm_id;
608
609
mm->mm_id = MM_ID_DUMMY;
610
if (id == MM_ID_DUMMY)
611
return;
612
if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
613
return;
614
ida_free(&mm_ida, id);
615
}
616
#else /* !CONFIG_MM_ID */
617
static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
618
static inline void mm_free_id(struct mm_struct *mm) {}
619
#endif /* CONFIG_MM_ID */
620
621
static void check_mm(struct mm_struct *mm)
622
{
623
int i;
624
625
BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
626
"Please make sure 'struct resident_page_types[]' is updated as well");
627
628
for (i = 0; i < NR_MM_COUNTERS; i++) {
629
long x = percpu_counter_sum(&mm->rss_stat[i]);
630
631
if (unlikely(x)) {
632
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
633
mm, resident_page_types[i], x,
634
current->comm,
635
task_pid_nr(current));
636
}
637
}
638
639
if (mm_pgtables_bytes(mm))
640
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
641
mm_pgtables_bytes(mm));
642
643
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
644
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
645
#endif
646
}
647
648
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
649
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
650
651
static void do_check_lazy_tlb(void *arg)
652
{
653
struct mm_struct *mm = arg;
654
655
WARN_ON_ONCE(current->active_mm == mm);
656
}
657
658
static void do_shoot_lazy_tlb(void *arg)
659
{
660
struct mm_struct *mm = arg;
661
662
if (current->active_mm == mm) {
663
WARN_ON_ONCE(current->mm);
664
current->active_mm = &init_mm;
665
switch_mm(mm, &init_mm, current);
666
}
667
}
668
669
static void cleanup_lazy_tlbs(struct mm_struct *mm)
670
{
671
if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
672
/*
673
* In this case, lazy tlb mms are refcounted and would not reach
674
* __mmdrop until all CPUs have switched away and mmdrop()ed.
675
*/
676
return;
677
}
678
679
/*
680
* Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
681
* requires lazy mm users to switch to another mm when the refcount
682
* drops to zero, before the mm is freed. This requires IPIs here to
683
* switch kernel threads to init_mm.
684
*
685
* archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
686
* switch with the final userspace teardown TLB flush which leaves the
687
* mm lazy on this CPU but no others, reducing the need for additional
688
* IPIs here. There are cases where a final IPI is still required here,
689
* such as the final mmdrop being performed on a different CPU than the
690
* one exiting, or kernel threads using the mm when userspace exits.
691
*
692
* IPI overheads have not been found to be expensive, but they could be
693
* reduced in a number of possible ways, for example (roughly
694
* increasing order of complexity):
695
* - The last lazy reference created by exit_mm() could instead switch
696
* to init_mm, however it's probable this will run on the same CPU
697
* immediately afterwards, so this may not reduce IPIs much.
698
* - A batch of mms requiring IPIs could be gathered and freed at once.
699
* - CPUs store active_mm where it can be remotely checked without a
700
* lock, to filter out false-positives in the cpumask.
701
* - After mm_users or mm_count reaches zero, switching away from the
702
* mm could clear mm_cpumask to reduce some IPIs, perhaps together
703
* with some batching or delaying of the final IPIs.
704
* - A delayed freeing and RCU-like quiescing sequence based on mm
705
* switching to avoid IPIs completely.
706
*/
707
on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
708
if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
709
on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
710
}
711
712
/*
713
* Called when the last reference to the mm
714
* is dropped: either by a lazy thread or by
715
* mmput. Free the page directory and the mm.
716
*/
717
void __mmdrop(struct mm_struct *mm)
718
{
719
BUG_ON(mm == &init_mm);
720
WARN_ON_ONCE(mm == current->mm);
721
722
/* Ensure no CPUs are using this as their lazy tlb mm */
723
cleanup_lazy_tlbs(mm);
724
725
WARN_ON_ONCE(mm == current->active_mm);
726
mm_free_pgd(mm);
727
mm_free_id(mm);
728
destroy_context(mm);
729
mmu_notifier_subscriptions_destroy(mm);
730
check_mm(mm);
731
put_user_ns(mm->user_ns);
732
mm_pasid_drop(mm);
733
mm_destroy_cid(mm);
734
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
735
736
free_mm(mm);
737
}
738
EXPORT_SYMBOL_GPL(__mmdrop);
739
740
static void mmdrop_async_fn(struct work_struct *work)
741
{
742
struct mm_struct *mm;
743
744
mm = container_of(work, struct mm_struct, async_put_work);
745
__mmdrop(mm);
746
}
747
748
static void mmdrop_async(struct mm_struct *mm)
749
{
750
if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
751
INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
752
schedule_work(&mm->async_put_work);
753
}
754
}
755
756
static inline void free_signal_struct(struct signal_struct *sig)
757
{
758
taskstats_tgid_free(sig);
759
sched_autogroup_exit(sig);
760
/*
761
* __mmdrop is not safe to call from softirq context on x86 due to
762
* pgd_dtor so postpone it to the async context
763
*/
764
if (sig->oom_mm)
765
mmdrop_async(sig->oom_mm);
766
kmem_cache_free(signal_cachep, sig);
767
}
768
769
static inline void put_signal_struct(struct signal_struct *sig)
770
{
771
if (refcount_dec_and_test(&sig->sigcnt))
772
free_signal_struct(sig);
773
}
774
775
void __put_task_struct(struct task_struct *tsk)
776
{
777
WARN_ON(!tsk->exit_state);
778
WARN_ON(refcount_read(&tsk->usage));
779
WARN_ON(tsk == current);
780
781
unwind_task_free(tsk);
782
io_uring_free(tsk);
783
cgroup_task_free(tsk);
784
task_numa_free(tsk, true);
785
security_task_free(tsk);
786
exit_creds(tsk);
787
delayacct_tsk_free(tsk);
788
put_signal_struct(tsk->signal);
789
sched_core_free(tsk);
790
free_task(tsk);
791
}
792
EXPORT_SYMBOL_GPL(__put_task_struct);
793
794
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
795
{
796
struct task_struct *task = container_of(rhp, struct task_struct, rcu);
797
798
__put_task_struct(task);
799
}
800
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
801
802
void __init __weak arch_task_cache_init(void) { }
803
804
/*
805
* set_max_threads
806
*/
807
static void __init set_max_threads(unsigned int max_threads_suggested)
808
{
809
u64 threads;
810
unsigned long nr_pages = memblock_estimated_nr_free_pages();
811
812
/*
813
* The number of threads shall be limited such that the thread
814
* structures may only consume a small part of the available memory.
815
*/
816
if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
817
threads = MAX_THREADS;
818
else
819
threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
820
(u64) THREAD_SIZE * 8UL);
821
822
if (threads > max_threads_suggested)
823
threads = max_threads_suggested;
824
825
max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
826
}
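/*
* Worked example of the sizing above (illustrative numbers only,
* assuming 4 KiB pages and 16 KiB thread stacks): with nr_pages = 2^20
* (4 GiB of memory),
*
*	threads = (2^20 * 4096) / (16384 * 8) = 32768
*
* i.e. 32768 stacks of 16 KiB each, or 512 MiB -- at most 1/8 of the
* available memory -- before clamping to [MIN_THREADS, MAX_THREADS] and
* capping at max_threads_suggested.
*/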
827
828
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
829
/* Initialized by the architecture: */
830
int arch_task_struct_size __read_mostly;
831
#endif
832
833
static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
834
{
835
/* Fetch thread_struct whitelist for the architecture. */
836
arch_thread_struct_whitelist(offset, size);
837
838
/*
839
* Handle zero-sized whitelist or empty thread_struct, otherwise
840
* adjust offset to position of thread_struct in task_struct.
841
*/
842
if (unlikely(*size == 0))
843
*offset = 0;
844
else
845
*offset += offsetof(struct task_struct, thread);
846
}
847
848
void __init fork_init(void)
849
{
850
int i;
851
#ifndef ARCH_MIN_TASKALIGN
852
#define ARCH_MIN_TASKALIGN 0
853
#endif
854
int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
855
unsigned long useroffset, usersize;
856
857
/* create a slab on which task_structs can be allocated */
858
task_struct_whitelist(&useroffset, &usersize);
859
task_struct_cachep = kmem_cache_create_usercopy("task_struct",
860
arch_task_struct_size, align,
861
SLAB_PANIC|SLAB_ACCOUNT,
862
useroffset, usersize, NULL);
863
864
/* do the arch specific task caches init */
865
arch_task_cache_init();
866
867
set_max_threads(MAX_THREADS);
868
869
init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
870
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
871
init_task.signal->rlim[RLIMIT_SIGPENDING] =
872
init_task.signal->rlim[RLIMIT_NPROC];
873
874
for (i = 0; i < UCOUNT_COUNTS; i++)
875
init_user_ns.ucount_max[i] = max_threads/2;
876
877
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
878
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
879
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
880
set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
881
882
#ifdef CONFIG_VMAP_STACK
883
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
884
NULL, free_vm_stack_cache);
885
#endif
886
887
scs_init();
888
889
lockdep_init_task(&init_task);
890
uprobes_init();
891
}
892
893
int __weak arch_dup_task_struct(struct task_struct *dst,
894
struct task_struct *src)
895
{
896
*dst = *src;
897
return 0;
898
}
899
900
void set_task_stack_end_magic(struct task_struct *tsk)
901
{
902
unsigned long *stackend;
903
904
stackend = end_of_stack(tsk);
905
*stackend = STACK_END_MAGIC; /* for overflow detection */
906
}
907
908
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
909
{
910
struct task_struct *tsk;
911
int err;
912
913
if (node == NUMA_NO_NODE)
914
node = tsk_fork_get_node(orig);
915
tsk = alloc_task_struct_node(node);
916
if (!tsk)
917
return NULL;
918
919
err = arch_dup_task_struct(tsk, orig);
920
if (err)
921
goto free_tsk;
922
923
err = alloc_thread_stack_node(tsk, node);
924
if (err)
925
goto free_tsk;
926
927
#ifdef CONFIG_THREAD_INFO_IN_TASK
928
refcount_set(&tsk->stack_refcount, 1);
929
#endif
930
account_kernel_stack(tsk, 1);
931
932
err = scs_prepare(tsk, node);
933
if (err)
934
goto free_stack;
935
936
#ifdef CONFIG_SECCOMP
937
/*
938
* We must handle setting up seccomp filters once we're under
939
* the sighand lock in case orig has changed between now and
940
* then. Until then, filter must be NULL to avoid messing up
941
* the usage counts on the error path calling free_task.
942
*/
943
tsk->seccomp.filter = NULL;
944
#endif
945
946
setup_thread_stack(tsk, orig);
947
clear_user_return_notifier(tsk);
948
clear_tsk_need_resched(tsk);
949
set_task_stack_end_magic(tsk);
950
clear_syscall_work_syscall_user_dispatch(tsk);
951
952
#ifdef CONFIG_STACKPROTECTOR
953
tsk->stack_canary = get_random_canary();
954
#endif
955
if (orig->cpus_ptr == &orig->cpus_mask)
956
tsk->cpus_ptr = &tsk->cpus_mask;
957
dup_user_cpus_ptr(tsk, orig, node);
958
959
/*
960
* One for the user space visible state that goes away when reaped.
961
* One for the scheduler.
962
*/
963
refcount_set(&tsk->rcu_users, 2);
964
/* One for the rcu users */
965
refcount_set(&tsk->usage, 1);
966
#ifdef CONFIG_BLK_DEV_IO_TRACE
967
tsk->btrace_seq = 0;
968
#endif
969
tsk->splice_pipe = NULL;
970
tsk->task_frag.page = NULL;
971
tsk->wake_q.next = NULL;
972
tsk->worker_private = NULL;
973
974
kcov_task_init(tsk);
975
kmsan_task_create(tsk);
976
kmap_local_fork(tsk);
977
978
#ifdef CONFIG_FAULT_INJECTION
979
tsk->fail_nth = 0;
980
#endif
981
982
#ifdef CONFIG_BLK_CGROUP
983
tsk->throttle_disk = NULL;
984
tsk->use_memdelay = 0;
985
#endif
986
987
#ifdef CONFIG_ARCH_HAS_CPU_PASID
988
tsk->pasid_activated = 0;
989
#endif
990
991
#ifdef CONFIG_MEMCG
992
tsk->active_memcg = NULL;
993
#endif
994
995
#ifdef CONFIG_X86_BUS_LOCK_DETECT
996
tsk->reported_split_lock = 0;
997
#endif
998
999
#ifdef CONFIG_SCHED_MM_CID
1000
tsk->mm_cid.cid = MM_CID_UNSET;
1001
tsk->mm_cid.active = 0;
1002
#endif
1003
return tsk;
1004
1005
free_stack:
1006
exit_task_stack_account(tsk);
1007
free_thread_stack(tsk);
1008
free_tsk:
1009
free_task_struct(tsk);
1010
return NULL;
1011
}
1012
1013
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
1014
1015
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
1016
1017
static int __init coredump_filter_setup(char *s)
1018
{
1019
default_dump_filter =
1020
(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
1021
MMF_DUMP_FILTER_MASK;
1022
return 1;
1023
}
1024
1025
__setup("coredump_filter=", coredump_filter_setup);
1026
1027
#include <linux/init_task.h>
1028
1029
static void mm_init_aio(struct mm_struct *mm)
1030
{
1031
#ifdef CONFIG_AIO
1032
spin_lock_init(&mm->ioctx_lock);
1033
mm->ioctx_table = NULL;
1034
#endif
1035
}
1036
1037
static __always_inline void mm_clear_owner(struct mm_struct *mm,
1038
struct task_struct *p)
1039
{
1040
#ifdef CONFIG_MEMCG
1041
if (mm->owner == p)
1042
WRITE_ONCE(mm->owner, NULL);
1043
#endif
1044
}
1045
1046
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1047
{
1048
#ifdef CONFIG_MEMCG
1049
mm->owner = p;
1050
#endif
1051
}
1052
1053
static void mm_init_uprobes_state(struct mm_struct *mm)
1054
{
1055
#ifdef CONFIG_UPROBES
1056
mm->uprobes_state.xol_area = NULL;
1057
arch_uprobe_init_state(mm);
1058
#endif
1059
}
1060
1061
static void mmap_init_lock(struct mm_struct *mm)
1062
{
1063
init_rwsem(&mm->mmap_lock);
1064
mm_lock_seqcount_init(mm);
1065
#ifdef CONFIG_PER_VMA_LOCK
1066
rcuwait_init(&mm->vma_writer_wait);
1067
#endif
1068
}
1069
1070
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1071
struct user_namespace *user_ns)
1072
{
1073
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
1074
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
1075
atomic_set(&mm->mm_users, 1);
1076
atomic_set(&mm->mm_count, 1);
1077
seqcount_init(&mm->write_protect_seq);
1078
mmap_init_lock(mm);
1079
INIT_LIST_HEAD(&mm->mmlist);
1080
mm_pgtables_bytes_init(mm);
1081
mm->map_count = 0;
1082
mm->locked_vm = 0;
1083
atomic64_set(&mm->pinned_vm, 0);
1084
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
1085
spin_lock_init(&mm->page_table_lock);
1086
spin_lock_init(&mm->arg_lock);
1087
mm_init_cpumask(mm);
1088
mm_init_aio(mm);
1089
mm_init_owner(mm, p);
1090
mm_pasid_init(mm);
1091
RCU_INIT_POINTER(mm->exe_file, NULL);
1092
mmu_notifier_subscriptions_init(mm);
1093
init_tlb_flush_pending(mm);
1094
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
1095
mm->pmd_huge_pte = NULL;
1096
#endif
1097
mm_init_uprobes_state(mm);
1098
hugetlb_count_init(mm);
1099
1100
mm_flags_clear_all(mm);
1101
if (current->mm) {
1102
unsigned long flags = __mm_flags_get_word(current->mm);
1103
1104
__mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
1105
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
1106
} else {
1107
__mm_flags_overwrite_word(mm, default_dump_filter);
1108
mm->def_flags = 0;
1109
}
1110
1111
if (futex_mm_init(mm))
1112
goto fail_mm_init;
1113
1114
if (mm_alloc_pgd(mm))
1115
goto fail_nopgd;
1116
1117
if (mm_alloc_id(mm))
1118
goto fail_noid;
1119
1120
if (init_new_context(p, mm))
1121
goto fail_nocontext;
1122
1123
if (mm_alloc_cid(mm, p))
1124
goto fail_cid;
1125
1126
if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
1127
NR_MM_COUNTERS))
1128
goto fail_pcpu;
1129
1130
mm->user_ns = get_user_ns(user_ns);
1131
lru_gen_init_mm(mm);
1132
return mm;
1133
1134
fail_pcpu:
1135
mm_destroy_cid(mm);
1136
fail_cid:
1137
destroy_context(mm);
1138
fail_nocontext:
1139
mm_free_id(mm);
1140
fail_noid:
1141
mm_free_pgd(mm);
1142
fail_nopgd:
1143
futex_hash_free(mm);
1144
fail_mm_init:
1145
free_mm(mm);
1146
return NULL;
1147
}
1148
1149
/*
1150
* Allocate and initialize an mm_struct.
1151
*/
1152
struct mm_struct *mm_alloc(void)
1153
{
1154
struct mm_struct *mm;
1155
1156
mm = allocate_mm();
1157
if (!mm)
1158
return NULL;
1159
1160
memset(mm, 0, sizeof(*mm));
1161
return mm_init(mm, current, current_user_ns());
1162
}
1163
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
1164
1165
static inline void __mmput(struct mm_struct *mm)
1166
{
1167
VM_BUG_ON(atomic_read(&mm->mm_users));
1168
1169
uprobe_clear_state(mm);
1170
exit_aio(mm);
1171
ksm_exit(mm);
1172
khugepaged_exit(mm); /* must run before exit_mmap */
1173
exit_mmap(mm);
1174
mm_put_huge_zero_folio(mm);
1175
set_mm_exe_file(mm, NULL);
1176
if (!list_empty(&mm->mmlist)) {
1177
spin_lock(&mmlist_lock);
1178
list_del(&mm->mmlist);
1179
spin_unlock(&mmlist_lock);
1180
}
1181
if (mm->binfmt)
1182
module_put(mm->binfmt->module);
1183
lru_gen_del_mm(mm);
1184
futex_hash_free(mm);
1185
mmdrop(mm);
1186
}
1187
1188
/*
1189
* Decrement the use count and release all resources for an mm.
1190
*/
1191
void mmput(struct mm_struct *mm)
1192
{
1193
might_sleep();
1194
1195
if (atomic_dec_and_test(&mm->mm_users))
1196
__mmput(mm);
1197
}
1198
EXPORT_SYMBOL_GPL(mmput);
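/*
* Note on the two reference counts used above (descriptive summary, not
* a new interface): mmget()/mmput() operate on mm->mm_users, the "real
* user" count whose final drop runs __mmput() and tears down the address
* space, while mmgrab()/mmdrop() operate on mm->mm_count and only keep
* the struct mm_struct itself alive until __mmdrop() frees it.
*/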
1199
1200
#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
1201
static void mmput_async_fn(struct work_struct *work)
1202
{
1203
struct mm_struct *mm = container_of(work, struct mm_struct,
1204
async_put_work);
1205
1206
__mmput(mm);
1207
}
1208
1209
void mmput_async(struct mm_struct *mm)
1210
{
1211
if (atomic_dec_and_test(&mm->mm_users)) {
1212
INIT_WORK(&mm->async_put_work, mmput_async_fn);
1213
schedule_work(&mm->async_put_work);
1214
}
1215
}
1216
EXPORT_SYMBOL_GPL(mmput_async);
1217
#endif
1218
1219
/**
1220
* set_mm_exe_file - change a reference to the mm's executable file
1221
* @mm: The mm to change.
1222
* @new_exe_file: The new file to use.
1223
*
1224
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1225
*
1226
* Main users are mmput() and sys_execve(). Callers prevent concurrent
1227
* invocations: in mmput() nobody alive left, in execve it happens before
1228
* the new mm is made visible to anyone.
1229
*
1230
* Can only fail if new_exe_file != NULL.
1231
*/
1232
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1233
{
1234
struct file *old_exe_file;
1235
1236
/*
1237
* It is safe to dereference the exe_file without RCU as
1238
* this function is only called if nobody else can access
1239
* this mm -- see comment above for justification.
1240
*/
1241
old_exe_file = rcu_dereference_raw(mm->exe_file);
1242
1243
if (new_exe_file) {
1244
/*
1245
* We expect the caller (i.e., sys_execve) to have already denied
* write access, so this is unlikely to fail.
1247
*/
1248
if (unlikely(exe_file_deny_write_access(new_exe_file)))
1249
return -EACCES;
1250
get_file(new_exe_file);
1251
}
1252
rcu_assign_pointer(mm->exe_file, new_exe_file);
1253
if (old_exe_file) {
1254
exe_file_allow_write_access(old_exe_file);
1255
fput(old_exe_file);
1256
}
1257
return 0;
1258
}
1259
1260
/**
1261
* replace_mm_exe_file - replace a reference to the mm's executable file
1262
* @mm: The mm to change.
1263
* @new_exe_file: The new file to use.
1264
*
1265
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
1266
*
1267
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
1268
*/
1269
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1270
{
1271
struct vm_area_struct *vma;
1272
struct file *old_exe_file;
1273
int ret = 0;
1274
1275
/* Forbid mm->exe_file change if old file still mapped. */
1276
old_exe_file = get_mm_exe_file(mm);
1277
if (old_exe_file) {
1278
VMA_ITERATOR(vmi, mm, 0);
1279
mmap_read_lock(mm);
1280
for_each_vma(vmi, vma) {
1281
if (!vma->vm_file)
1282
continue;
1283
if (path_equal(&vma->vm_file->f_path,
1284
&old_exe_file->f_path)) {
1285
ret = -EBUSY;
1286
break;
1287
}
1288
}
1289
mmap_read_unlock(mm);
1290
fput(old_exe_file);
1291
if (ret)
1292
return ret;
1293
}
1294
1295
ret = exe_file_deny_write_access(new_exe_file);
1296
if (ret)
1297
return -EACCES;
1298
get_file(new_exe_file);
1299
1300
/* set the new file */
1301
mmap_write_lock(mm);
1302
old_exe_file = rcu_dereference_raw(mm->exe_file);
1303
rcu_assign_pointer(mm->exe_file, new_exe_file);
1304
mmap_write_unlock(mm);
1305
1306
if (old_exe_file) {
1307
exe_file_allow_write_access(old_exe_file);
1308
fput(old_exe_file);
1309
}
1310
return 0;
1311
}
1312
1313
/**
1314
* get_mm_exe_file - acquire a reference to the mm's executable file
1315
* @mm: The mm of interest.
1316
*
1317
* Returns %NULL if mm has no associated executable file.
1318
* User must release file via fput().
1319
*/
1320
struct file *get_mm_exe_file(struct mm_struct *mm)
1321
{
1322
struct file *exe_file;
1323
1324
rcu_read_lock();
1325
exe_file = get_file_rcu(&mm->exe_file);
1326
rcu_read_unlock();
1327
return exe_file;
1328
}
1329
1330
/**
1331
* get_task_exe_file - acquire a reference to the task's executable file
1332
* @task: The task.
1333
*
1334
* Returns %NULL if task's mm (if any) has no associated executable file or
1335
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
1336
* User must release file via fput().
1337
*/
1338
struct file *get_task_exe_file(struct task_struct *task)
1339
{
1340
struct file *exe_file = NULL;
1341
struct mm_struct *mm;
1342
1343
if (task->flags & PF_KTHREAD)
1344
return NULL;
1345
1346
task_lock(task);
1347
mm = task->mm;
1348
if (mm)
1349
exe_file = get_mm_exe_file(mm);
1350
task_unlock(task);
1351
return exe_file;
1352
}
1353
1354
/**
1355
* get_task_mm - acquire a reference to the task's mm
1356
* @task: The task.
1357
*
1358
* Returns %NULL if the task has no mm or if PF_KTHREAD is set (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
* e.g. to do its AIO); otherwise returns a reference to the mm, after
* bumping up the use count. User must release the mm via mmput()
* after use. Typically used by /proc and ptrace.
1363
*/
1364
struct mm_struct *get_task_mm(struct task_struct *task)
1365
{
1366
struct mm_struct *mm;
1367
1368
if (task->flags & PF_KTHREAD)
1369
return NULL;
1370
1371
task_lock(task);
1372
mm = task->mm;
1373
if (mm)
1374
mmget(mm);
1375
task_unlock(task);
1376
return mm;
1377
}
1378
EXPORT_SYMBOL_GPL(get_task_mm);
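/*
* Typical caller-side pattern (illustrative sketch, not part of the
* original file): every successful get_task_mm() must be paired with an
* mmput(), e.g.:
*
*	struct mm_struct *mm = get_task_mm(task);
*
*	if (mm) {
*		... inspect or operate on mm ...
*		mmput(mm);
*	}
*/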
1379
1380
static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
1381
{
1382
if (mm == current->mm)
1383
return true;
1384
if (ptrace_may_access(task, mode))
1385
return true;
1386
if ((mode & PTRACE_MODE_READ) && perfmon_capable())
1387
return true;
1388
return false;
1389
}
1390
1391
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
1392
{
1393
struct mm_struct *mm;
1394
int err;
1395
1396
err = down_read_killable(&task->signal->exec_update_lock);
1397
if (err)
1398
return ERR_PTR(err);
1399
1400
mm = get_task_mm(task);
1401
if (!mm) {
1402
mm = ERR_PTR(-ESRCH);
1403
} else if (!may_access_mm(mm, task, mode)) {
1404
mmput(mm);
1405
mm = ERR_PTR(-EACCES);
1406
}
1407
up_read(&task->signal->exec_update_lock);
1408
1409
return mm;
1410
}
1411
1412
static void complete_vfork_done(struct task_struct *tsk)
1413
{
1414
struct completion *vfork;
1415
1416
task_lock(tsk);
1417
vfork = tsk->vfork_done;
1418
if (likely(vfork)) {
1419
tsk->vfork_done = NULL;
1420
complete(vfork);
1421
}
1422
task_unlock(tsk);
1423
}
1424
1425
static int wait_for_vfork_done(struct task_struct *child,
1426
struct completion *vfork)
1427
{
1428
unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
1429
int killed;
1430
1431
cgroup_enter_frozen();
1432
killed = wait_for_completion_state(vfork, state);
1433
cgroup_leave_frozen(false);
1434
1435
if (killed) {
1436
task_lock(child);
1437
child->vfork_done = NULL;
1438
task_unlock(child);
1439
}
1440
1441
put_task_struct(child);
1442
return killed;
1443
}
1444
1445
/* Please note the differences between mmput and mm_release.
1446
* mmput is called whenever we stop holding onto a mm_struct,
1447
* error success whatever.
1448
*
1449
* mm_release is called after a mm_struct has been removed
1450
* from the current process.
1451
*
1452
* This difference is important for error handling, when we
1453
* only half set up a mm_struct for a new process and need to restore
1454
* the old one. Because we mmput the new mm_struct before
1455
* restoring the old one. . .
1456
* Eric Biederman 10 January 1998
1457
*/
1458
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1459
{
1460
uprobe_free_utask(tsk);
1461
1462
/* Get rid of any cached register state */
1463
deactivate_mm(tsk, mm);
1464
1465
/*
1466
* Signal userspace if we're not exiting with a core dump
1467
* because we want to leave the value intact for debugging
1468
* purposes.
1469
*/
1470
if (tsk->clear_child_tid) {
1471
if (atomic_read(&mm->mm_users) > 1) {
1472
/*
1473
* We don't check the error code - if userspace has
1474
* not set up a proper pointer then tough luck.
1475
*/
1476
put_user(0, tsk->clear_child_tid);
1477
do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1478
1, NULL, NULL, 0, 0);
1479
}
1480
tsk->clear_child_tid = NULL;
1481
}
1482
1483
/*
1484
* All done, finally we can wake up parent and return this mm to him.
1485
* Also kthread_stop() uses this completion for synchronization.
1486
*/
1487
if (tsk->vfork_done)
1488
complete_vfork_done(tsk);
1489
}
1490
1491
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1492
{
1493
futex_exit_release(tsk);
1494
mm_release(tsk, mm);
1495
}
1496
1497
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
1498
{
1499
futex_exec_release(tsk);
1500
mm_release(tsk, mm);
1501
}
1502
1503
/**
1504
* dup_mm() - duplicates an existing mm structure
1505
* @tsk: the task_struct with which the new mm will be associated.
1506
* @oldmm: the mm to duplicate.
1507
*
1508
* Allocates a new mm structure and duplicates the provided @oldmm structure
1509
* content into it.
1510
*
1511
* Return: the duplicated mm or NULL on failure.
1512
*/
1513
static struct mm_struct *dup_mm(struct task_struct *tsk,
1514
struct mm_struct *oldmm)
1515
{
1516
struct mm_struct *mm;
1517
int err;
1518
1519
mm = allocate_mm();
1520
if (!mm)
1521
goto fail_nomem;
1522
1523
memcpy(mm, oldmm, sizeof(*mm));
1524
1525
if (!mm_init(mm, tsk, mm->user_ns))
1526
goto fail_nomem;
1527
1528
uprobe_start_dup_mmap();
1529
err = dup_mmap(mm, oldmm);
1530
if (err)
1531
goto free_pt;
1532
uprobe_end_dup_mmap();
1533
1534
mm->hiwater_rss = get_mm_rss(mm);
1535
mm->hiwater_vm = mm->total_vm;
1536
1537
if (mm->binfmt && !try_module_get(mm->binfmt->module))
1538
goto free_pt;
1539
1540
return mm;
1541
1542
free_pt:
1543
/* don't put binfmt in mmput, we haven't got module yet */
1544
mm->binfmt = NULL;
1545
mm_init_owner(mm, NULL);
1546
mmput(mm);
1547
if (err)
1548
uprobe_end_dup_mmap();
1549
1550
fail_nomem:
1551
return NULL;
1552
}
1553
1554
static int copy_mm(u64 clone_flags, struct task_struct *tsk)
1555
{
1556
struct mm_struct *mm, *oldmm;
1557
1558
tsk->min_flt = tsk->maj_flt = 0;
1559
tsk->nvcsw = tsk->nivcsw = 0;
1560
#ifdef CONFIG_DETECT_HUNG_TASK
1561
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
1562
tsk->last_switch_time = 0;
1563
#endif
1564
1565
tsk->mm = NULL;
1566
tsk->active_mm = NULL;
1567
1568
/*
1569
* Are we cloning a kernel thread?
1570
*
1571
* We need to steal an active VM for that.
1572
*/
1573
oldmm = current->mm;
1574
if (!oldmm)
1575
return 0;
1576
1577
if (clone_flags & CLONE_VM) {
1578
mmget(oldmm);
1579
mm = oldmm;
1580
} else {
1581
mm = dup_mm(tsk, current->mm);
1582
if (!mm)
1583
return -ENOMEM;
1584
}
1585
1586
tsk->mm = mm;
1587
tsk->active_mm = mm;
1588
sched_mm_cid_fork(tsk);
1589
return 0;
1590
}
1591
1592
static int copy_fs(u64 clone_flags, struct task_struct *tsk)
1593
{
1594
struct fs_struct *fs = current->fs;
1595
if (clone_flags & CLONE_FS) {
1596
/* tsk->fs is already what we want */
1597
read_seqlock_excl(&fs->seq);
1598
/* "users" and "in_exec" locked for check_unsafe_exec() */
1599
if (fs->in_exec) {
1600
read_sequnlock_excl(&fs->seq);
1601
return -EAGAIN;
1602
}
1603
fs->users++;
1604
read_sequnlock_excl(&fs->seq);
1605
return 0;
1606
}
1607
tsk->fs = copy_fs_struct(fs);
1608
if (!tsk->fs)
1609
return -ENOMEM;
1610
return 0;
1611
}
1612
1613
static int copy_files(u64 clone_flags, struct task_struct *tsk,
1614
int no_files)
1615
{
1616
struct files_struct *oldf, *newf;
1617
1618
/*
1619
* A background process may not have any files ...
1620
*/
1621
oldf = current->files;
1622
if (!oldf)
1623
return 0;
1624
1625
if (no_files) {
1626
tsk->files = NULL;
1627
return 0;
1628
}
1629
1630
if (clone_flags & CLONE_FILES) {
1631
atomic_inc(&oldf->count);
1632
return 0;
1633
}
1634
1635
newf = dup_fd(oldf, NULL);
1636
if (IS_ERR(newf))
1637
return PTR_ERR(newf);
1638
1639
tsk->files = newf;
1640
return 0;
1641
}
1642
1643
static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
1644
{
1645
struct sighand_struct *sig;
1646
1647
if (clone_flags & CLONE_SIGHAND) {
1648
refcount_inc(&current->sighand->count);
1649
return 0;
1650
}
1651
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1652
RCU_INIT_POINTER(tsk->sighand, sig);
1653
if (!sig)
1654
return -ENOMEM;
1655
1656
refcount_set(&sig->count, 1);
1657
spin_lock_irq(&current->sighand->siglock);
1658
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1659
spin_unlock_irq(&current->sighand->siglock);
1660
1661
/* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */
1662
if (clone_flags & CLONE_CLEAR_SIGHAND)
1663
flush_signal_handlers(tsk, 0);
1664
1665
return 0;
1666
}
1667
1668
void __cleanup_sighand(struct sighand_struct *sighand)
1669
{
1670
if (refcount_dec_and_test(&sighand->count)) {
1671
signalfd_cleanup(sighand);
1672
/*
1673
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1674
* without an RCU grace period, see __lock_task_sighand().
1675
*/
1676
kmem_cache_free(sighand_cachep, sighand);
1677
}
1678
}
1679
1680
/*
1681
* Initialize POSIX timer handling for a thread group.
1682
*/
1683
static void posix_cpu_timers_init_group(struct signal_struct *sig)
1684
{
1685
struct posix_cputimers *pct = &sig->posix_cputimers;
1686
unsigned long cpu_limit;
1687
1688
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1689
posix_cputimers_group_init(pct, cpu_limit);
1690
}
1691
1692
static int copy_signal(u64 clone_flags, struct task_struct *tsk)
1693
{
1694
struct signal_struct *sig;
1695
1696
if (clone_flags & CLONE_THREAD)
1697
return 0;
1698
1699
sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
1700
tsk->signal = sig;
1701
if (!sig)
1702
return -ENOMEM;
1703
1704
sig->nr_threads = 1;
1705
sig->quick_threads = 1;
1706
atomic_set(&sig->live, 1);
1707
refcount_set(&sig->sigcnt, 1);
1708
1709
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1710
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
1711
tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
1712
1713
init_waitqueue_head(&sig->wait_chldexit);
1714
sig->curr_target = tsk;
1715
init_sigpending(&sig->shared_pending);
1716
INIT_HLIST_HEAD(&sig->multiprocess);
1717
seqlock_init(&sig->stats_lock);
1718
prev_cputime_init(&sig->prev_cputime);
1719
1720
#ifdef CONFIG_POSIX_TIMERS
1721
INIT_HLIST_HEAD(&sig->posix_timers);
1722
INIT_HLIST_HEAD(&sig->ignored_posix_timers);
1723
hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1724
#endif
1725
1726
task_lock(current->group_leader);
1727
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
1728
task_unlock(current->group_leader);
1729
1730
posix_cpu_timers_init_group(sig);
1731
1732
tty_audit_fork(sig);
1733
sched_autogroup_fork(sig);
1734
1735
#ifdef CONFIG_CGROUPS
1736
init_rwsem(&sig->cgroup_threadgroup_rwsem);
1737
#endif
1738
1739
sig->oom_score_adj = current->signal->oom_score_adj;
1740
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1741
1742
mutex_init(&sig->cred_guard_mutex);
1743
init_rwsem(&sig->exec_update_lock);
1744
1745
return 0;
1746
}
1747
1748
static void copy_seccomp(struct task_struct *p)
1749
{
1750
#ifdef CONFIG_SECCOMP
1751
/*
1752
* Must be called with sighand->lock held, which is common to
1753
* all threads in the group. Holding cred_guard_mutex is not
1754
* needed because this new task is not yet running and cannot
1755
* be racing exec.
1756
*/
1757
assert_spin_locked(&current->sighand->siglock);
1758
1759
/* Ref-count the new filter user, and assign it. */
1760
get_seccomp_filter(current);
1761
p->seccomp = current->seccomp;
1762
1763
/*
1764
* Explicitly enable no_new_privs here in case it got set
1765
* between the task_struct being duplicated and holding the
1766
* sighand lock. The seccomp state and nnp must be in sync.
1767
*/
1768
if (task_no_new_privs(current))
1769
task_set_no_new_privs(p);
1770
1771
/*
1772
* If the parent gained a seccomp mode after copying thread
1773
* flags and before we took the sighand lock, we have
1774
* to manually enable the seccomp thread flag here.
1775
*/
1776
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
1777
set_task_syscall_work(p, SECCOMP);
1778
#endif
1779
}
1780
1781
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
1782
{
1783
current->clear_child_tid = tidptr;
1784
1785
return task_pid_vnr(current);
1786
}
1787
1788
static void rt_mutex_init_task(struct task_struct *p)
1789
{
1790
raw_spin_lock_init(&p->pi_lock);
1791
#ifdef CONFIG_RT_MUTEXES
1792
p->pi_waiters = RB_ROOT_CACHED;
1793
p->pi_top_task = NULL;
1794
p->pi_blocked_on = NULL;
1795
#endif
1796
}
1797
1798
static inline void init_task_pid_links(struct task_struct *task)
1799
{
1800
enum pid_type type;
1801
1802
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
1803
INIT_HLIST_NODE(&task->pid_links[type]);
1804
}
1805
1806
static inline void
1807
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1808
{
1809
if (type == PIDTYPE_PID)
1810
task->thread_pid = pid;
1811
else
1812
task->signal->pids[type] = pid;
1813
}
1814
1815
static inline void rcu_copy_process(struct task_struct *p)
1816
{
1817
#ifdef CONFIG_PREEMPT_RCU
1818
p->rcu_read_lock_nesting = 0;
1819
p->rcu_read_unlock_special.s = 0;
1820
p->rcu_blocked_node = NULL;
1821
INIT_LIST_HEAD(&p->rcu_node_entry);
1822
#endif /* #ifdef CONFIG_PREEMPT_RCU */
1823
#ifdef CONFIG_TASKS_RCU
1824
p->rcu_tasks_holdout = false;
1825
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
1826
p->rcu_tasks_idle_cpu = -1;
1827
INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
1828
#endif /* #ifdef CONFIG_TASKS_RCU */
1829
#ifdef CONFIG_TASKS_TRACE_RCU
1830
p->trc_reader_nesting = 0;
1831
p->trc_reader_special.s = 0;
1832
INIT_LIST_HEAD(&p->trc_holdout_list);
1833
INIT_LIST_HEAD(&p->trc_blkd_node);
1834
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
1835
}
1836
1837
/**
1838
* pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
1839
* @pid: the struct pid for which to create a pidfd
1840
* @flags: flags of the new @pidfd
1841
* @ret_file: return the new pidfs file
1842
*
1843
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
1844
* caller's file descriptor table. The pidfd is reserved but not installed yet.
1845
*
1846
* The helper verifies that @pid is still in use, without PIDFD_THREAD the
1847
* task identified by @pid must be a thread-group leader.
1848
*
1849
* If this function returns successfully the caller is responsible to either
1850
* call fd_install() passing the returned pidfd and pidfd file as arguments in
1851
* order to install the pidfd into its file descriptor table or they must use
1852
* put_unused_fd() and fput() on the returned pidfd and pidfd file
1853
* respectively.
1854
*
1855
* This function is useful when a pidfd must already be reserved but there
1856
* might still be points of failure afterwards and the caller wants to ensure
1857
* that no pidfd is leaked into its file descriptor table.
1858
*
1859
* Return: On success, a reserved pidfd is returned from the function and a new
1860
* pidfd file is returned in the last argument to the function. On
1861
* error, a negative error code is returned from the function and the
1862
* last argument remains unchanged.
1863
*/
1864
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
1865
{
1866
struct file *pidfs_file;
1867
1868
/*
1869
* PIDFD_STALE is only allowed to be passed if the caller knows
1870
* that @pid is already registered in pidfs and thus
1871
* PIDFD_INFO_EXIT information is guaranteed to be available.
1872
*/
1873
if (!(flags & PIDFD_STALE)) {
1874
/*
1875
* While holding the pidfd waitqueue lock removing the
1876
* task linkage for the thread-group leader pid
1877
* (PIDTYPE_TGID) isn't possible. Thus, if there's still
1878
* task linkage for PIDTYPE_PID not having thread-group
1879
* leader linkage for the pid means it wasn't a
1880
* thread-group leader in the first place.
1881
*/
1882
guard(spinlock_irq)(&pid->wait_pidfd.lock);
1883
1884
/* Task has already been reaped. */
1885
if (!pid_has_task(pid, PIDTYPE_PID))
1886
return -ESRCH;
1887
/*
1888
* If this struct pid isn't used as a thread-group
1889
* leader but the caller requested to create a
1890
* thread-group leader pidfd then report ENOENT.
1891
*/
1892
if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
1893
return -ENOENT;
1894
}
1895
1896
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
1897
if (pidfd < 0)
1898
return pidfd;
1899
1900
pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
1901
if (IS_ERR(pidfs_file))
1902
return PTR_ERR(pidfs_file);
1903
1904
*ret_file = pidfs_file;
1905
return take_fd(pidfd);
1906
}
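/*
* Caller-side sketch of the contract documented above (illustrative
* only, error handling condensed): the reserved pidfd must either be
* published with fd_install() or released again:
*
*	struct file *pidfd_file;
*	int pidfd = pidfd_prepare(pid, 0, &pidfd_file);
*
*	if (pidfd < 0)
*		return pidfd;
*	if (do_remaining_setup() < 0) {		(hypothetical later step)
*		put_unused_fd(pidfd);
*		fput(pidfd_file);
*		return -EINVAL;
*	}
*	fd_install(pidfd, pidfd_file);		(pidfd becomes visible)
*/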
1907
1908
static void __delayed_free_task(struct rcu_head *rhp)
1909
{
1910
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1911
1912
free_task(tsk);
1913
}
1914
1915
static __always_inline void delayed_free_task(struct task_struct *tsk)
1916
{
1917
if (IS_ENABLED(CONFIG_MEMCG))
1918
call_rcu(&tsk->rcu, __delayed_free_task);
1919
else
1920
free_task(tsk);
1921
}
1922
1923
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
1924
{
1925
/* Skip if kernel thread */
1926
if (!tsk->mm)
1927
return;
1928
1929
/* Skip if spawning a thread or using vfork */
1930
if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
1931
return;
1932
1933
/* We need to synchronize with __set_oom_adj */
1934
mutex_lock(&oom_adj_mutex);
1935
mm_flags_set(MMF_MULTIPROCESS, tsk->mm);
1936
/* Update the values in case they were changed after copy_signal */
1937
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
1938
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
1939
mutex_unlock(&oom_adj_mutex);
1940
}
1941
1942
#ifdef CONFIG_RV
1943
static void rv_task_fork(struct task_struct *p)
1944
{
1945
memset(&p->rv, 0, sizeof(p->rv));
1946
}
1947
#else
1948
#define rv_task_fork(p) do {} while (0)
1949
#endif
1950
1951
static bool need_futex_hash_allocate_default(u64 clone_flags)
1952
{
1953
if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
1954
return false;
1955
return true;
1956
}
1957
1958
/*
1959
* This creates a new process as a copy of the old one,
1960
* but does not actually start it yet.
1961
*
1962
* It copies the registers, and all the appropriate
1963
* parts of the process environment (as per the clone
1964
* flags). The actual kick-off is left to the caller.
1965
*/
1966
__latent_entropy struct task_struct *copy_process(
1967
struct pid *pid,
1968
int trace,
1969
int node,
1970
struct kernel_clone_args *args)
1971
{
1972
int pidfd = -1, retval;
1973
struct task_struct *p;
1974
struct multiprocess_signals delayed;
1975
struct file *pidfile = NULL;
1976
const u64 clone_flags = args->flags;
1977
struct nsproxy *nsp = current->nsproxy;
1978
1979
/*
1980
* Don't allow sharing the root directory with processes in a different
1981
* namespace
1982
*/
1983
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1984
return ERR_PTR(-EINVAL);
1985
1986
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1987
return ERR_PTR(-EINVAL);
1988
1989
/*
1990
* Thread groups must share signals as well, and detached threads
1991
* can only be started up within the thread group.
1992
*/
1993
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
1994
return ERR_PTR(-EINVAL);
1995
1996
/*
1997
* Shared signal handlers imply shared VM. By way of the above,
1998
* thread groups also imply shared VM. Blocking this case allows
1999
* for various simplifications in other code.
2000
*/
2001
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
2002
return ERR_PTR(-EINVAL);
2003
2004
/*
2005
* Siblings of global init remain as zombies on exit since they are
2006
* not reaped by their parent (swapper). To solve this and to avoid
2007
* multi-rooted process trees, prevent global and container-inits
2008
* from creating siblings.
2009
*/
2010
if ((clone_flags & CLONE_PARENT) &&
2011
current->signal->flags & SIGNAL_UNKILLABLE)
2012
return ERR_PTR(-EINVAL);
2013
2014
/*
2015
* If the new process will be in a different pid or user namespace
2016
* do not allow it to share a thread group with the forking task.
2017
*/
2018
if (clone_flags & CLONE_THREAD) {
2019
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
2020
(task_active_pid_ns(current) != nsp->pid_ns_for_children))
2021
return ERR_PTR(-EINVAL);
2022
}
2023
2024
if (clone_flags & CLONE_PIDFD) {
2025
/*
2026
* - CLONE_DETACHED is blocked so that we can potentially
2027
* reuse it later for CLONE_PIDFD.
2028
*/
2029
if (clone_flags & CLONE_DETACHED)
2030
return ERR_PTR(-EINVAL);
2031
}
2032
2033
/*
2034
* Force any signals received before this point to be delivered
2035
* before the fork happens. Collect up signals sent to multiple
2036
* processes that happen during the fork and delay them so that
2037
* they appear to happen after the fork.
2038
*/
2039
sigemptyset(&delayed.signal);
2040
INIT_HLIST_NODE(&delayed.node);
2041
2042
spin_lock_irq(&current->sighand->siglock);
2043
if (!(clone_flags & CLONE_THREAD))
2044
hlist_add_head(&delayed.node, &current->signal->multiprocess);
2045
recalc_sigpending();
2046
spin_unlock_irq(&current->sighand->siglock);
2047
retval = -ERESTARTNOINTR;
2048
if (task_sigpending(current))
2049
goto fork_out;
2050
2051
retval = -ENOMEM;
2052
p = dup_task_struct(current, node);
2053
if (!p)
2054
goto fork_out;
2055
p->flags &= ~PF_KTHREAD;
2056
if (args->kthread)
2057
p->flags |= PF_KTHREAD;
2058
if (args->user_worker) {
2059
/*
2060
* Mark us as a user worker, and block any signal that isn't
2061
* fatal or STOP
2062
*/
2063
p->flags |= PF_USER_WORKER;
2064
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
2065
}
2066
if (args->io_thread)
2067
p->flags |= PF_IO_WORKER;
2068
2069
if (args->name)
2070
strscpy_pad(p->comm, args->name, sizeof(p->comm));
2071
2072
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
2073
/*
2074
* Clear TID on mm_release()?
2075
*/
2076
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
2077
2078
ftrace_graph_init_task(p);
2079
2080
rt_mutex_init_task(p);
2081
2082
lockdep_assert_irqs_enabled();
2083
#ifdef CONFIG_PROVE_LOCKING
2084
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
2085
#endif
2086
retval = copy_creds(p, clone_flags);
2087
if (retval < 0)
2088
goto bad_fork_free;
2089
2090
retval = -EAGAIN;
2091
if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
2092
if (p->real_cred->user != INIT_USER &&
2093
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
2094
goto bad_fork_cleanup_count;
2095
}
2096
current->flags &= ~PF_NPROC_EXCEEDED;
2097
2098
/*
2099
* If multiple threads are within copy_process(), then this check
2100
* triggers too late. This doesn't hurt; the check is only there
2101
* to stop root fork bombs.
2102
*/
2103
retval = -EAGAIN;
2104
if (data_race(nr_threads >= max_threads))
2105
goto bad_fork_cleanup_count;
2106
2107
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
2108
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
2109
p->flags |= PF_FORKNOEXEC;
2110
INIT_LIST_HEAD(&p->children);
2111
INIT_LIST_HEAD(&p->sibling);
2112
rcu_copy_process(p);
2113
p->vfork_done = NULL;
2114
spin_lock_init(&p->alloc_lock);
2115
2116
init_sigpending(&p->pending);
2117
2118
p->utime = p->stime = p->gtime = 0;
2119
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
2120
p->utimescaled = p->stimescaled = 0;
2121
#endif
2122
prev_cputime_init(&p->prev_cputime);
2123
2124
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
2125
seqcount_init(&p->vtime.seqcount);
2126
p->vtime.starttime = 0;
2127
p->vtime.state = VTIME_INACTIVE;
2128
#endif
2129
2130
#ifdef CONFIG_IO_URING
2131
p->io_uring = NULL;
2132
#endif
2133
2134
p->default_timer_slack_ns = current->timer_slack_ns;
2135
2136
#ifdef CONFIG_PSI
2137
p->psi_flags = 0;
2138
#endif
2139
2140
task_io_accounting_init(&p->ioac);
2141
acct_clear_integrals(p);
2142
2143
posix_cputimers_init(&p->posix_cputimers);
2144
tick_dep_init_task(p);
2145
2146
p->io_context = NULL;
2147
audit_set_context(p, NULL);
2148
cgroup_fork(p);
2149
if (args->kthread) {
2150
if (!set_kthread_struct(p))
2151
goto bad_fork_cleanup_delayacct;
2152
}
2153
#ifdef CONFIG_NUMA
2154
p->mempolicy = mpol_dup(p->mempolicy);
2155
if (IS_ERR(p->mempolicy)) {
2156
retval = PTR_ERR(p->mempolicy);
2157
p->mempolicy = NULL;
2158
goto bad_fork_cleanup_delayacct;
2159
}
2160
#endif
2161
#ifdef CONFIG_CPUSETS
2162
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
2163
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
2164
#endif
2165
#ifdef CONFIG_TRACE_IRQFLAGS
2166
memset(&p->irqtrace, 0, sizeof(p->irqtrace));
2167
p->irqtrace.hardirq_disable_ip = _THIS_IP_;
2168
p->irqtrace.softirq_enable_ip = _THIS_IP_;
2169
p->softirqs_enabled = 1;
2170
p->softirq_context = 0;
2171
#endif
2172
2173
p->pagefault_disabled = 0;
2174
2175
lockdep_init_task(p);
2176
2177
p->blocked_on = NULL; /* not blocked yet */
2178
2179
#ifdef CONFIG_BCACHE
2180
p->sequential_io = 0;
2181
p->sequential_io_avg = 0;
2182
#endif
2183
#ifdef CONFIG_BPF_SYSCALL
2184
RCU_INIT_POINTER(p->bpf_storage, NULL);
2185
p->bpf_ctx = NULL;
2186
#endif
2187
2188
unwind_task_init(p);
2189
2190
/* Perform scheduler related setup. Assign this task to a CPU. */
2191
retval = sched_fork(clone_flags, p);
2192
if (retval)
2193
goto bad_fork_cleanup_policy;
2194
2195
retval = perf_event_init_task(p, clone_flags);
2196
if (retval)
2197
goto bad_fork_sched_cancel_fork;
2198
retval = audit_alloc(p);
2199
if (retval)
2200
goto bad_fork_cleanup_perf;
2201
/* copy all the process information */
2202
shm_init_task(p);
2203
retval = security_task_alloc(p, clone_flags);
2204
if (retval)
2205
goto bad_fork_cleanup_audit;
2206
retval = copy_semundo(clone_flags, p);
2207
if (retval)
2208
goto bad_fork_cleanup_security;
2209
retval = copy_files(clone_flags, p, args->no_files);
2210
if (retval)
2211
goto bad_fork_cleanup_semundo;
2212
retval = copy_fs(clone_flags, p);
2213
if (retval)
2214
goto bad_fork_cleanup_files;
2215
retval = copy_sighand(clone_flags, p);
2216
if (retval)
2217
goto bad_fork_cleanup_fs;
2218
retval = copy_signal(clone_flags, p);
2219
if (retval)
2220
goto bad_fork_cleanup_sighand;
2221
retval = copy_mm(clone_flags, p);
2222
if (retval)
2223
goto bad_fork_cleanup_signal;
2224
retval = copy_namespaces(clone_flags, p);
2225
if (retval)
2226
goto bad_fork_cleanup_mm;
2227
retval = copy_io(clone_flags, p);
2228
if (retval)
2229
goto bad_fork_cleanup_namespaces;
2230
retval = copy_thread(p, args);
2231
if (retval)
2232
goto bad_fork_cleanup_io;
2233
2234
stackleak_task_init(p);
2235
2236
if (pid != &init_struct_pid) {
2237
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
2238
args->set_tid_size);
2239
if (IS_ERR(pid)) {
2240
retval = PTR_ERR(pid);
2241
goto bad_fork_cleanup_thread;
2242
}
2243
}
2244
2245
/*
2246
* This has to happen after we've potentially unshared the file
2247
* descriptor table (so that the pidfd doesn't leak into the child
2248
* if the fd table isn't shared).
2249
*/
2250
if (clone_flags & CLONE_PIDFD) {
2251
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
2252
2253
/*
2254
* Note that no task has been attached to @pid yet; indicate
2255
* that via CLONE_PIDFD.
2256
*/
2257
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
2258
if (retval < 0)
2259
goto bad_fork_free_pid;
2260
pidfd = retval;
2261
2262
retval = put_user(pidfd, args->pidfd);
2263
if (retval)
2264
goto bad_fork_put_pidfd;
2265
}
2266
2267
#ifdef CONFIG_BLOCK
2268
p->plug = NULL;
2269
#endif
2270
futex_init_task(p);
2271
2272
/*
2273
* sigaltstack should be cleared when sharing the same VM
2274
*/
2275
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
2276
sas_ss_reset(p);
2277
2278
/*
2279
* Syscall tracing and stepping should be turned off in the
2280
* child regardless of CLONE_PTRACE.
2281
*/
2282
user_disable_single_step(p);
2283
clear_task_syscall_work(p, SYSCALL_TRACE);
2284
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
2285
clear_task_syscall_work(p, SYSCALL_EMU);
2286
#endif
2287
clear_tsk_latency_tracing(p);
2288
2289
/* ok, now we should be set up.. */
2290
p->pid = pid_nr(pid);
2291
if (clone_flags & CLONE_THREAD) {
2292
p->group_leader = current->group_leader;
2293
p->tgid = current->tgid;
2294
} else {
2295
p->group_leader = p;
2296
p->tgid = p->pid;
2297
}
2298
2299
p->nr_dirtied = 0;
2300
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
2301
p->dirty_paused_when = 0;
2302
2303
p->pdeath_signal = 0;
2304
p->task_works = NULL;
2305
clear_posix_cputimers_work(p);
2306
2307
#ifdef CONFIG_KRETPROBES
2308
p->kretprobe_instances.first = NULL;
2309
#endif
2310
#ifdef CONFIG_RETHOOK
2311
p->rethooks.first = NULL;
2312
#endif
2313
2314
/*
2315
* Ensure that the cgroup subsystem policies allow the new process to be
2316
* forked. It should be noted that the new process's css_set can be changed
2317
* between here and cgroup_post_fork() if an organisation operation is in
2318
* progress.
2319
*/
2320
retval = cgroup_can_fork(p, args);
2321
if (retval)
2322
goto bad_fork_put_pidfd;
2323
2324
/*
2325
* Now that the cgroups are pinned, re-clone the parent cgroup and put
2326
* the new task on the correct runqueue. All this *before* the task
2327
* becomes visible.
2328
*
2329
* This isn't part of ->can_fork() because while the re-cloning is
2330
* cgroup specific, it unconditionally needs to place the task on a
2331
* runqueue.
2332
*/
2333
retval = sched_cgroup_fork(p, args);
2334
if (retval)
2335
goto bad_fork_cancel_cgroup;
2336
2337
/*
2338
* Allocate a default futex hash for the user process once the first
2339
* thread spawns.
2340
*/
2341
if (need_futex_hash_allocate_default(clone_flags)) {
2342
retval = futex_hash_allocate_default();
2343
if (retval)
2344
goto bad_fork_cancel_cgroup;
2345
/*
2346
* If we fail beyond this point we don't free the allocated
2347
* futex hash map. We assume that another thread will be created
2348
* and make use of it. The hash map will be freed once the main
2349
* thread terminates.
2350
*/
2351
}
2352
/*
2353
* From this point on we must avoid any synchronous user-space
2354
* communication until we take the tasklist-lock. In particular, we do
2355
* not want user-space to be able to predict the process start-time by
2356
* stalling fork(2) after we recorded the start_time but before it is
2357
* visible to the system.
2358
*/
2359
2360
p->start_time = ktime_get_ns();
2361
p->start_boottime = ktime_get_boottime_ns();
2362
2363
/*
2364
* Make it visible to the rest of the system, but don't wake it up yet.
2365
* Need tasklist lock for parent etc handling!
2366
*/
2367
write_lock_irq(&tasklist_lock);
2368
2369
/* CLONE_PARENT re-uses the old parent */
2370
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
2371
p->real_parent = current->real_parent;
2372
p->parent_exec_id = current->parent_exec_id;
2373
if (clone_flags & CLONE_THREAD)
2374
p->exit_signal = -1;
2375
else
2376
p->exit_signal = current->group_leader->exit_signal;
2377
} else {
2378
p->real_parent = current;
2379
p->parent_exec_id = current->self_exec_id;
2380
p->exit_signal = args->exit_signal;
2381
}
2382
2383
klp_copy_process(p);
2384
2385
sched_core_fork(p);
2386
2387
spin_lock(&current->sighand->siglock);
2388
2389
rv_task_fork(p);
2390
2391
rseq_fork(p, clone_flags);
2392
2393
/* Don't start children in a dying pid namespace */
2394
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
2395
retval = -ENOMEM;
2396
goto bad_fork_core_free;
2397
}
2398
2399
/* Let kill terminate clone/fork in the middle */
2400
if (fatal_signal_pending(current)) {
2401
retval = -EINTR;
2402
goto bad_fork_core_free;
2403
}
2404
2405
/* No more failure paths after this point. */
2406
2407
/*
2408
* Copy seccomp details explicitly here, in case they were changed
2409
* before holding sighand lock.
2410
*/
2411
copy_seccomp(p);
2412
2413
init_task_pid_links(p);
2414
if (likely(p->pid)) {
2415
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
2416
2417
init_task_pid(p, PIDTYPE_PID, pid);
2418
if (thread_group_leader(p)) {
2419
init_task_pid(p, PIDTYPE_TGID, pid);
2420
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
2421
init_task_pid(p, PIDTYPE_SID, task_session(current));
2422
2423
if (is_child_reaper(pid)) {
2424
ns_of_pid(pid)->child_reaper = p;
2425
p->signal->flags |= SIGNAL_UNKILLABLE;
2426
}
2427
p->signal->shared_pending.signal = delayed.signal;
2428
p->signal->tty = tty_kref_get(current->signal->tty);
2429
/*
2430
* Inherit the has_child_subreaper flag under the same
2431
* tasklist_lock as adding the child to the process tree,
2432
* for the propagate_has_child_subreaper optimization.
2433
*/
2434
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
2435
p->real_parent->signal->is_child_subreaper;
2436
list_add_tail(&p->sibling, &p->real_parent->children);
2437
list_add_tail_rcu(&p->tasks, &init_task.tasks);
2438
attach_pid(p, PIDTYPE_TGID);
2439
attach_pid(p, PIDTYPE_PGID);
2440
attach_pid(p, PIDTYPE_SID);
2441
__this_cpu_inc(process_counts);
2442
} else {
2443
current->signal->nr_threads++;
2444
current->signal->quick_threads++;
2445
atomic_inc(&current->signal->live);
2446
refcount_inc(&current->signal->sigcnt);
2447
task_join_group_stop(p);
2448
list_add_tail_rcu(&p->thread_node,
2449
&p->signal->thread_head);
2450
}
2451
attach_pid(p, PIDTYPE_PID);
2452
nr_threads++;
2453
}
2454
total_forks++;
2455
hlist_del_init(&delayed.node);
2456
spin_unlock(&current->sighand->siglock);
2457
syscall_tracepoint_update(p);
2458
write_unlock_irq(&tasklist_lock);
2459
2460
if (pidfile)
2461
fd_install(pidfd, pidfile);
2462
2463
proc_fork_connector(p);
2464
sched_post_fork(p);
2465
cgroup_post_fork(p, args);
2466
perf_event_fork(p);
2467
2468
trace_task_newtask(p, clone_flags);
2469
uprobe_copy_process(p, clone_flags);
2470
user_events_fork(p, clone_flags);
2471
2472
copy_oom_score_adj(clone_flags, p);
2473
2474
return p;
2475
2476
bad_fork_core_free:
2477
sched_core_free(p);
2478
spin_unlock(&current->sighand->siglock);
2479
write_unlock_irq(&tasklist_lock);
2480
bad_fork_cancel_cgroup:
2481
cgroup_cancel_fork(p, args);
2482
bad_fork_put_pidfd:
2483
if (clone_flags & CLONE_PIDFD) {
2484
fput(pidfile);
2485
put_unused_fd(pidfd);
2486
}
2487
bad_fork_free_pid:
2488
if (pid != &init_struct_pid)
2489
free_pid(pid);
2490
bad_fork_cleanup_thread:
2491
exit_thread(p);
2492
bad_fork_cleanup_io:
2493
if (p->io_context)
2494
exit_io_context(p);
2495
bad_fork_cleanup_namespaces:
2496
exit_nsproxy_namespaces(p);
2497
bad_fork_cleanup_mm:
2498
if (p->mm) {
2499
sched_mm_cid_exit(p);
2500
mm_clear_owner(p->mm, p);
2501
mmput(p->mm);
2502
}
2503
bad_fork_cleanup_signal:
2504
if (!(clone_flags & CLONE_THREAD))
2505
free_signal_struct(p->signal);
2506
bad_fork_cleanup_sighand:
2507
__cleanup_sighand(p->sighand);
2508
bad_fork_cleanup_fs:
2509
exit_fs(p); /* blocking */
2510
bad_fork_cleanup_files:
2511
exit_files(p); /* blocking */
2512
bad_fork_cleanup_semundo:
2513
exit_sem(p);
2514
bad_fork_cleanup_security:
2515
security_task_free(p);
2516
bad_fork_cleanup_audit:
2517
audit_free(p);
2518
bad_fork_cleanup_perf:
2519
perf_event_free_task(p);
2520
bad_fork_sched_cancel_fork:
2521
sched_cancel_fork(p);
2522
bad_fork_cleanup_policy:
2523
lockdep_free_task(p);
2524
#ifdef CONFIG_NUMA
2525
mpol_put(p->mempolicy);
2526
#endif
2527
bad_fork_cleanup_delayacct:
2528
delayacct_tsk_free(p);
2529
bad_fork_cleanup_count:
2530
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
2531
exit_cred_namespaces(p);
2532
exit_creds(p);
2533
bad_fork_free:
2534
WRITE_ONCE(p->__state, TASK_DEAD);
2535
exit_task_stack_account(p);
2536
put_task_stack(p);
2537
delayed_free_task(p);
2538
fork_out:
2539
spin_lock_irq(&current->sighand->siglock);
2540
hlist_del_init(&delayed.node);
2541
spin_unlock_irq(&current->sighand->siglock);
2542
return ERR_PTR(retval);
2543
}
2544
2545
static inline void init_idle_pids(struct task_struct *idle)
2546
{
2547
enum pid_type type;
2548
2549
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
2550
INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
2551
init_task_pid(idle, type, &init_struct_pid);
2552
}
2553
}
2554
2555
static int idle_dummy(void *dummy)
2556
{
2557
/* This function is never called */
2558
return 0;
2559
}
2560
2561
struct task_struct * __init fork_idle(int cpu)
2562
{
2563
struct task_struct *task;
2564
struct kernel_clone_args args = {
2565
.flags = CLONE_VM,
2566
.fn = &idle_dummy,
2567
.fn_arg = NULL,
2568
.kthread = 1,
2569
.idle = 1,
2570
};
2571
2572
task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2573
if (!IS_ERR(task)) {
2574
init_idle_pids(task);
2575
init_idle(task, cpu);
2576
}
2577
2578
return task;
2579
}
2580
2581
/*
2582
* This is like kernel_clone(), but shaved down and tailored to just
2583
* creating io_uring workers. It returns a created task, or an error pointer.
2584
* The returned task is inactive, and the caller must fire it up through
2585
* wake_up_new_task(p). All signals are blocked in the created task.
2586
*/
2587
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
2588
{
2589
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
2590
CLONE_IO|CLONE_VM|CLONE_UNTRACED;
2591
struct kernel_clone_args args = {
2592
.flags = flags,
2593
.fn = fn,
2594
.fn_arg = arg,
2595
.io_thread = 1,
2596
.user_worker = 1,
2597
};
2598
2599
return copy_process(NULL, 0, node, &args);
2600
}
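/*
* Illustrative sketch (not compiled here) of the calling pattern described
* above: create_io_thread() hands back an inactive task and the caller is
* responsible for starting it. my_worker_fn and my_worker_data are
* placeholder names, not symbols defined in this file.
*
*	static int my_worker_fn(void *data)
*	{
*		// runs with every signal except SIGKILL/SIGSTOP blocked
*		return 0;
*	}
*
*	struct task_struct *tsk;
*
*	tsk = create_io_thread(my_worker_fn, my_worker_data, NUMA_NO_NODE);
*	if (IS_ERR(tsk))
*		return PTR_ERR(tsk);
*	wake_up_new_task(tsk);	// task was created inactive; start it
*/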
2601
2602
/*
2603
* Ok, this is the main fork-routine.
2604
*
2605
* It copies the process, and if successful kick-starts
2606
* it and waits for it to finish using the VM if required.
2607
*
2608
* args->exit_signal is expected to be checked for sanity by the caller.
2609
*/
2610
pid_t kernel_clone(struct kernel_clone_args *args)
2611
{
2612
u64 clone_flags = args->flags;
2613
struct completion vfork;
2614
struct pid *pid;
2615
struct task_struct *p;
2616
int trace = 0;
2617
pid_t nr;
2618
2619
/*
2620
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
2621
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
2622
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
2623
* field in struct clone_args and it still doesn't make sense to have
2624
* them both point at the same memory location. Performing this check
2625
* here has the advantage that we don't need to have a separate helper
2626
* to check for legacy clone().
2627
*/
2628
if ((clone_flags & CLONE_PIDFD) &&
2629
(clone_flags & CLONE_PARENT_SETTID) &&
2630
(args->pidfd == args->parent_tid))
2631
return -EINVAL;
2632
2633
/*
2634
* Determine whether and which event to report to ptracer. When
2635
* called from kernel_thread or CLONE_UNTRACED is explicitly
2636
* requested, no event is reported; otherwise, report if the event
2637
* for the type of forking is enabled.
2638
*/
2639
if (!(clone_flags & CLONE_UNTRACED)) {
2640
if (clone_flags & CLONE_VFORK)
2641
trace = PTRACE_EVENT_VFORK;
2642
else if (args->exit_signal != SIGCHLD)
2643
trace = PTRACE_EVENT_CLONE;
2644
else
2645
trace = PTRACE_EVENT_FORK;
2646
2647
if (likely(!ptrace_event_enabled(current, trace)))
2648
trace = 0;
2649
}
2650
2651
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2652
add_latent_entropy();
2653
2654
if (IS_ERR(p))
2655
return PTR_ERR(p);
2656
2657
/*
2658
* Do this prior to waking up the new thread - the thread pointer
2659
* might get invalid after that point, if the thread exits quickly.
2660
*/
2661
trace_sched_process_fork(current, p);
2662
2663
pid = get_task_pid(p, PIDTYPE_PID);
2664
nr = pid_vnr(pid);
2665
2666
if (clone_flags & CLONE_PARENT_SETTID)
2667
put_user(nr, args->parent_tid);
2668
2669
if (clone_flags & CLONE_VFORK) {
2670
p->vfork_done = &vfork;
2671
init_completion(&vfork);
2672
get_task_struct(p);
2673
}
2674
2675
if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
2676
/* lock the task to synchronize with memcg migration */
2677
task_lock(p);
2678
lru_gen_add_mm(p->mm);
2679
task_unlock(p);
2680
}
2681
2682
wake_up_new_task(p);
2683
2684
/* forking complete and child started to run, tell ptracer */
2685
if (unlikely(trace))
2686
ptrace_event_pid(trace, pid);
2687
2688
if (clone_flags & CLONE_VFORK) {
2689
if (!wait_for_vfork_done(p, &vfork))
2690
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
2691
}
2692
2693
put_pid(pid);
2694
return nr;
2695
}
2696
2697
/*
2698
* Create a kernel thread.
2699
*/
2700
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
2701
unsigned long flags)
2702
{
2703
struct kernel_clone_args args = {
2704
.flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2705
.exit_signal = (flags & CSIGNAL),
2706
.fn = fn,
2707
.fn_arg = arg,
2708
.name = name,
2709
.kthread = 1,
2710
};
2711
2712
return kernel_clone(&args);
2713
}
2714
2715
/*
2716
* Create a user mode thread.
2717
*/
2718
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
2719
{
2720
struct kernel_clone_args args = {
2721
.flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2722
.exit_signal = (flags & CSIGNAL),
2723
.fn = fn,
2724
.fn_arg = arg,
2725
};
2726
2727
return kernel_clone(&args);
2728
}
2729
2730
#ifdef __ARCH_WANT_SYS_FORK
2731
SYSCALL_DEFINE0(fork)
2732
{
2733
#ifdef CONFIG_MMU
2734
struct kernel_clone_args args = {
2735
.exit_signal = SIGCHLD,
2736
};
2737
2738
return kernel_clone(&args);
2739
#else
2740
/* cannot be supported in nommu mode */
2741
return -EINVAL;
2742
#endif
2743
}
2744
#endif
2745
2746
#ifdef __ARCH_WANT_SYS_VFORK
2747
SYSCALL_DEFINE0(vfork)
2748
{
2749
struct kernel_clone_args args = {
2750
.flags = CLONE_VFORK | CLONE_VM,
2751
.exit_signal = SIGCHLD,
2752
};
2753
2754
return kernel_clone(&args);
2755
}
2756
#endif
2757
2758
#ifdef __ARCH_WANT_SYS_CLONE
2759
#ifdef CONFIG_CLONE_BACKWARDS
2760
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2761
int __user *, parent_tidptr,
2762
unsigned long, tls,
2763
int __user *, child_tidptr)
2764
#elif defined(CONFIG_CLONE_BACKWARDS2)
2765
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2766
int __user *, parent_tidptr,
2767
int __user *, child_tidptr,
2768
unsigned long, tls)
2769
#elif defined(CONFIG_CLONE_BACKWARDS3)
2770
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2771
int, stack_size,
2772
int __user *, parent_tidptr,
2773
int __user *, child_tidptr,
2774
unsigned long, tls)
2775
#else
2776
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2777
int __user *, parent_tidptr,
2778
int __user *, child_tidptr,
2779
unsigned long, tls)
2780
#endif
2781
{
2782
struct kernel_clone_args args = {
2783
.flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
2784
.pidfd = parent_tidptr,
2785
.child_tid = child_tidptr,
2786
.parent_tid = parent_tidptr,
2787
.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
2788
.stack = newsp,
2789
.tls = tls,
2790
};
2791
2792
return kernel_clone(&args);
2793
}
2794
#endif
2795
2796
static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2797
struct clone_args __user *uargs,
2798
size_t usize)
2799
{
2800
int err;
2801
struct clone_args args;
2802
pid_t *kset_tid = kargs->set_tid;
2803
2804
BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
2805
CLONE_ARGS_SIZE_VER0);
2806
BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
2807
CLONE_ARGS_SIZE_VER1);
2808
BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
2809
CLONE_ARGS_SIZE_VER2);
2810
BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
2811
2812
if (unlikely(usize > PAGE_SIZE))
2813
return -E2BIG;
2814
if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
2815
return -EINVAL;
2816
2817
err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
2818
if (err)
2819
return err;
2820
2821
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
2822
return -EINVAL;
2823
2824
if (unlikely(!args.set_tid && args.set_tid_size > 0))
2825
return -EINVAL;
2826
2827
if (unlikely(args.set_tid && args.set_tid_size == 0))
2828
return -EINVAL;
2829
2830
/*
2831
* Verify that the higher 32 bits of exit_signal are unset and that
2832
* it is a valid signal
2833
*/
2834
if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2835
!valid_signal(args.exit_signal)))
2836
return -EINVAL;
2837
2838
if ((args.flags & CLONE_INTO_CGROUP) &&
2839
(args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
2840
return -EINVAL;
2841
2842
*kargs = (struct kernel_clone_args){
2843
.flags = args.flags,
2844
.pidfd = u64_to_user_ptr(args.pidfd),
2845
.child_tid = u64_to_user_ptr(args.child_tid),
2846
.parent_tid = u64_to_user_ptr(args.parent_tid),
2847
.exit_signal = args.exit_signal,
2848
.stack = args.stack,
2849
.stack_size = args.stack_size,
2850
.tls = args.tls,
2851
.set_tid_size = args.set_tid_size,
2852
.cgroup = args.cgroup,
2853
};
2854
2855
if (args.set_tid &&
2856
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
2857
(kargs->set_tid_size * sizeof(pid_t))))
2858
return -EFAULT;
2859
2860
kargs->set_tid = kset_tid;
2861
2862
return 0;
2863
}
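/*
* Illustrative note on the size handling above: struct clone_args is
* versioned by size, so an older caller may pass only CLONE_ARGS_SIZE_VER0
* bytes and copy_struct_from_user() zero-fills the fields it did not
* supply; a larger struct is accepted only if the bytes beyond our view
* are zero. A minimal userspace sketch of a VER0-sized call (not compiled
* here):
*
*	struct clone_args args = {
*		.exit_signal = SIGCHLD,
*	};
*	pid_t pid = syscall(SYS_clone3, &args, CLONE_ARGS_SIZE_VER0);
*/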
2864
2865
/**
2866
* clone3_stack_valid - check and prepare stack
2867
* @kargs: kernel clone args
2868
*
2869
* Verify that the stack arguments userspace gave us are sane.
2870
* In addition, set the stack direction for userspace since it's easy for us to
2871
* determine.
2872
*/
2873
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
2874
{
2875
if (kargs->stack == 0) {
2876
if (kargs->stack_size > 0)
2877
return false;
2878
} else {
2879
if (kargs->stack_size == 0)
2880
return false;
2881
2882
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
2883
return false;
2884
2885
#if !defined(CONFIG_STACK_GROWSUP)
2886
kargs->stack += kargs->stack_size;
2887
#endif
2888
}
2889
2890
return true;
2891
}
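/*
* Illustrative userspace sketch (not compiled here) of how the stack
* fields checked above are meant to be filled in: pass the lowest address
* of the mapping in ->stack and its size in ->stack_size; on
* architectures where the stack grows down, the kernel points the child
* at stack + stack_size. STACK_SIZE is a placeholder constant.
*
*	void *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
*			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
*	struct clone_args args = {
*		.flags		= CLONE_VM,
*		.exit_signal	= SIGCHLD,
*		.stack		= (__u64)(uintptr_t)stack,
*		.stack_size	= STACK_SIZE,
*	};
*/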
2892
2893
static bool clone3_args_valid(struct kernel_clone_args *kargs)
2894
{
2895
/* Verify that no unknown flags are passed along. */
2896
if (kargs->flags &
2897
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
2898
return false;
2899
2900
/*
2901
* - make the CLONE_DETACHED bit reusable for clone3
2902
* - make the CSIGNAL bits reusable for clone3
2903
*/
2904
if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
2905
return false;
2906
2907
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
2908
(CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
2909
return false;
2910
2911
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2912
kargs->exit_signal)
2913
return false;
2914
2915
if (!clone3_stack_valid(kargs))
2916
return false;
2917
2918
return true;
2919
}
2920
2921
/**
2922
* sys_clone3 - create a new process with specific properties
2923
* @uargs: argument structure
2924
* @size: size of @uargs
2925
*
2926
* clone3() is the extensible successor to clone()/clone2().
2927
* It takes a struct as argument that is versioned by its size.
2928
*
2929
* Return: On success, a positive PID for the child process.
2930
* On error, a negative errno number.
2931
*/
2932
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2933
{
2934
int err;
2935
2936
struct kernel_clone_args kargs;
2937
pid_t set_tid[MAX_PID_NS_LEVEL];
2938
2939
#ifdef __ARCH_BROKEN_SYS_CLONE3
2940
#warning clone3() entry point is missing, please fix
2941
return -ENOSYS;
2942
#endif
2943
2944
kargs.set_tid = set_tid;
2945
2946
err = copy_clone_args_from_user(&kargs, uargs, size);
2947
if (err)
2948
return err;
2949
2950
if (!clone3_args_valid(&kargs))
2951
return -EINVAL;
2952
2953
return kernel_clone(&kargs);
2954
}
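/*
* Illustrative userspace sketch (not compiled here): clone3() has no glibc
* wrapper, so it is invoked through syscall(2). This example additionally
* requests a pidfd for the child via CLONE_PIDFD.
*
*	#include <linux/sched.h>	// struct clone_args
*	#include <sys/syscall.h>
*	#include <unistd.h>
*
*	int pidfd = -1;
*	struct clone_args args = {
*		.flags		= CLONE_PIDFD,
*		.pidfd		= (__u64)(uintptr_t)&pidfd,
*		.exit_signal	= SIGCHLD,
*	};
*	pid_t pid = syscall(SYS_clone3, &args, sizeof(args));
*	if (pid == 0) {
*		// child
*	} else if (pid > 0) {
*		// parent: pidfd now refers to the child
*	}
*/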
2955
2956
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
2957
{
2958
struct task_struct *leader, *parent, *child;
2959
int res;
2960
2961
read_lock(&tasklist_lock);
2962
leader = top = top->group_leader;
2963
down:
2964
for_each_thread(leader, parent) {
2965
list_for_each_entry(child, &parent->children, sibling) {
2966
res = visitor(child, data);
2967
if (res) {
2968
if (res < 0)
2969
goto out;
2970
leader = child;
2971
goto down;
2972
}
2973
up:
2974
;
2975
}
2976
}
2977
2978
if (leader != top) {
2979
child = leader;
2980
parent = child->real_parent;
2981
leader = parent->group_leader;
2982
goto up;
2983
}
2984
out:
2985
read_unlock(&tasklist_lock);
2986
}
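/*
* Illustrative sketch (not compiled here) of a walk_process_tree()
* visitor. The return value steers the walk: a negative value aborts it,
* a positive value descends into the child that was just visited, and
* zero moves on to the next sibling. count_visitor and nr are placeholder
* names.
*
*	static int count_visitor(struct task_struct *p, void *data)
*	{
*		(*(unsigned int *)data)++;
*		return 1;	// keep descending
*	}
*
*	unsigned int nr = 0;
*	walk_process_tree(current, count_visitor, &nr);
*/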
2987
2988
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
2989
#define ARCH_MIN_MMSTRUCT_ALIGN 0
2990
#endif
2991
2992
static void sighand_ctor(void *data)
2993
{
2994
struct sighand_struct *sighand = data;
2995
2996
spin_lock_init(&sighand->siglock);
2997
init_waitqueue_head(&sighand->signalfd_wqh);
2998
}
2999
3000
void __init mm_cache_init(void)
3001
{
3002
unsigned int mm_size;
3003
3004
/*
3005
* The mm_cpumask is located at the end of mm_struct, and is
3006
* dynamically sized based on the maximum CPU number this system
3007
* can have, taking hotplug into account (nr_cpu_ids).
3008
*/
3009
mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
3010
3011
mm_cachep = kmem_cache_create_usercopy("mm_struct",
3012
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
3013
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3014
offsetof(struct mm_struct, saved_auxv),
3015
sizeof_field(struct mm_struct, saved_auxv),
3016
NULL);
3017
}
3018
3019
void __init proc_caches_init(void)
3020
{
3021
sighand_cachep = kmem_cache_create("sighand_cache",
3022
sizeof(struct sighand_struct), 0,
3023
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
3024
SLAB_ACCOUNT, sighand_ctor);
3025
signal_cachep = kmem_cache_create("signal_cache",
3026
sizeof(struct signal_struct), 0,
3027
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3028
NULL);
3029
files_cachep = kmem_cache_create("files_cache",
3030
sizeof(struct files_struct), 0,
3031
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3032
NULL);
3033
fs_cachep = kmem_cache_create("fs_cache",
3034
sizeof(struct fs_struct), 0,
3035
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
3036
NULL);
3037
mmap_init();
3038
nsproxy_cache_init();
3039
}
3040
3041
/*
3042
* Check constraints on flags passed to the unshare system call.
3043
*/
3044
static int check_unshare_flags(unsigned long unshare_flags)
3045
{
3046
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
3047
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
3048
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
3049
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
3050
CLONE_NEWTIME))
3051
return -EINVAL;
3052
/*
3053
* Not implemented, but pretend it works if there is nothing
3054
* to unshare. Note that unsharing the address space or the
3055
* signal handlers also needs to unshare the signal queues (aka
3056
* CLONE_THREAD).
3057
*/
3058
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
3059
if (!thread_group_empty(current))
3060
return -EINVAL;
3061
}
3062
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
3063
if (refcount_read(&current->sighand->count) > 1)
3064
return -EINVAL;
3065
}
3066
if (unshare_flags & CLONE_VM) {
3067
if (!current_is_single_threaded())
3068
return -EINVAL;
3069
}
3070
3071
return 0;
3072
}
3073
3074
/*
3075
* Unshare the filesystem structure if it is being shared
3076
*/
3077
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
3078
{
3079
struct fs_struct *fs = current->fs;
3080
3081
if (!(unshare_flags & CLONE_FS) || !fs)
3082
return 0;
3083
3084
/* don't need a lock here; in the worst case we'll do a useless copy */
3085
if (fs->users == 1)
3086
return 0;
3087
3088
*new_fsp = copy_fs_struct(fs);
3089
if (!*new_fsp)
3090
return -ENOMEM;
3091
3092
return 0;
3093
}
3094
3095
/*
3096
* Unshare file descriptor table if it is being shared
3097
*/
3098
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
3099
{
3100
struct files_struct *fd = current->files;
3101
3102
if ((unshare_flags & CLONE_FILES) &&
3103
(fd && atomic_read(&fd->count) > 1)) {
3104
fd = dup_fd(fd, NULL);
3105
if (IS_ERR(fd))
3106
return PTR_ERR(fd);
3107
*new_fdp = fd;
3108
}
3109
3110
return 0;
3111
}
3112
3113
/*
3114
* unshare allows a process to 'unshare' part of the process
3115
* context which was originally shared using clone. copy_*
3116
* functions used by kernel_clone() cannot be used here directly
3117
* because they modify an inactive task_struct that is being
3118
* constructed. Here we are modifying the current, active,
3119
* task_struct.
3120
*/
3121
int ksys_unshare(unsigned long unshare_flags)
3122
{
3123
struct fs_struct *fs, *new_fs = NULL;
3124
struct files_struct *new_fd = NULL;
3125
struct cred *new_cred = NULL;
3126
struct nsproxy *new_nsproxy = NULL;
3127
int do_sysvsem = 0;
3128
int err;
3129
3130
/*
3131
* If unsharing a user namespace, must also unshare the thread group
3132
* and unshare the filesystem root and working directories.
3133
*/
3134
if (unshare_flags & CLONE_NEWUSER)
3135
unshare_flags |= CLONE_THREAD | CLONE_FS;
3136
/*
3137
* If unsharing vm, must also unshare signal handlers.
3138
*/
3139
if (unshare_flags & CLONE_VM)
3140
unshare_flags |= CLONE_SIGHAND;
3141
/*
3142
* If unsharing signal handlers, must also unshare the signal queues.
3143
*/
3144
if (unshare_flags & CLONE_SIGHAND)
3145
unshare_flags |= CLONE_THREAD;
3146
/*
3147
* If unsharing namespace, must also unshare filesystem information.
3148
*/
3149
if (unshare_flags & CLONE_NEWNS)
3150
unshare_flags |= CLONE_FS;
3151
3152
err = check_unshare_flags(unshare_flags);
3153
if (err)
3154
goto bad_unshare_out;
3155
/*
3156
* CLONE_NEWIPC must also detach from the undolist: after switching
3157
* to a new ipc namespace, the semaphore arrays from the old
3158
* namespace are unreachable.
3159
*/
3160
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
3161
do_sysvsem = 1;
3162
err = unshare_fs(unshare_flags, &new_fs);
3163
if (err)
3164
goto bad_unshare_out;
3165
err = unshare_fd(unshare_flags, &new_fd);
3166
if (err)
3167
goto bad_unshare_cleanup_fs;
3168
err = unshare_userns(unshare_flags, &new_cred);
3169
if (err)
3170
goto bad_unshare_cleanup_fd;
3171
err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
3172
new_cred, new_fs);
3173
if (err)
3174
goto bad_unshare_cleanup_cred;
3175
3176
if (new_cred) {
3177
err = set_cred_ucounts(new_cred);
3178
if (err)
3179
goto bad_unshare_cleanup_cred;
3180
}
3181
3182
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
3183
if (do_sysvsem) {
3184
/*
3185
* CLONE_SYSVSEM is equivalent to sys_exit().
3186
*/
3187
exit_sem(current);
3188
}
3189
if (unshare_flags & CLONE_NEWIPC) {
3190
/* Orphan segments in old ns (see sem above). */
3191
exit_shm(current);
3192
shm_init_task(current);
3193
}
3194
3195
if (new_nsproxy)
3196
switch_task_namespaces(current, new_nsproxy);
3197
3198
task_lock(current);
3199
3200
if (new_fs) {
3201
fs = current->fs;
3202
read_seqlock_excl(&fs->seq);
3203
current->fs = new_fs;
3204
if (--fs->users)
3205
new_fs = NULL;
3206
else
3207
new_fs = fs;
3208
read_sequnlock_excl(&fs->seq);
3209
}
3210
3211
if (new_fd)
3212
swap(current->files, new_fd);
3213
3214
task_unlock(current);
3215
3216
if (new_cred) {
3217
/* Install the new user namespace */
3218
commit_creds(new_cred);
3219
new_cred = NULL;
3220
}
3221
}
3222
3223
perf_event_namespaces(current);
3224
3225
bad_unshare_cleanup_cred:
3226
if (new_cred)
3227
put_cred(new_cred);
3228
bad_unshare_cleanup_fd:
3229
if (new_fd)
3230
put_files_struct(new_fd);
3231
3232
bad_unshare_cleanup_fs:
3233
if (new_fs)
3234
free_fs_struct(new_fs);
3235
3236
bad_unshare_out:
3237
return err;
3238
}
3239
3240
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
3241
{
3242
return ksys_unshare(unshare_flags);
3243
}
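/*
* Illustrative userspace sketch (not compiled here): moving the calling
* process into a private mount namespace with unshare(2). Per the flag
* implications in ksys_unshare() above, CLONE_NEWNS also unshares the
* fs_struct; the call needs CAP_SYS_ADMIN in the caller's user namespace.
*
*	#define _GNU_SOURCE
*	#include <sched.h>
*
*	if (unshare(CLONE_NEWNS) < 0)
*		perror("unshare");
*/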
3244
3245
/*
3246
* Helper to unshare the files of the current task.
3247
* We don't want to expose copy_files internals to
3248
* the exec layer of the kernel.
3249
*/
3250
3251
int unshare_files(void)
3252
{
3253
struct task_struct *task = current;
3254
struct files_struct *old, *copy = NULL;
3255
int error;
3256
3257
error = unshare_fd(CLONE_FILES, &copy);
3258
if (error || !copy)
3259
return error;
3260
3261
old = task->files;
3262
task_lock(task);
3263
task->files = copy;
3264
task_unlock(task);
3265
put_files_struct(old);
3266
return 0;
3267
}
3268
3269
static int sysctl_max_threads(const struct ctl_table *table, int write,
3270
void *buffer, size_t *lenp, loff_t *ppos)
3271
{
3272
struct ctl_table t;
3273
int ret;
3274
int threads = max_threads;
3275
int min = 1;
3276
int max = MAX_THREADS;
3277
3278
t = *table;
3279
t.data = &threads;
3280
t.extra1 = &min;
3281
t.extra2 = &max;
3282
3283
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3284
if (ret || !write)
3285
return ret;
3286
3287
max_threads = threads;
3288
3289
return 0;
3290
}
3291
3292
static const struct ctl_table fork_sysctl_table[] = {
3293
{
3294
.procname = "threads-max",
3295
.data = NULL,
3296
.maxlen = sizeof(int),
3297
.mode = 0644,
3298
.proc_handler = sysctl_max_threads,
3299
},
3300
};
3301
3302
static int __init init_fork_sysctl(void)
3303
{
3304
register_sysctl_init("kernel", fork_sysctl_table);
3305
return 0;
3306
}
3307
3308
subsys_initcall(init_fork_sysctl);
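/*
* Illustrative userspace sketch (not compiled here): the table above
* exposes the thread limit as /proc/sys/kernel/threads-max, and
* sysctl_max_threads() only accepts written values in [1, MAX_THREADS].
*
*	#include <stdio.h>
*
*	FILE *f = fopen("/proc/sys/kernel/threads-max", "r");
*	int threads_max;
*	if (f && fscanf(f, "%d", &threads_max) == 1)
*		printf("threads-max = %d\n", threads_max);
*	if (f)
*		fclose(f);
*/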
3309
3310