GitHub Repository: awilliam/linux-vfio
Path: blob/master/kernel/cgroup.c
/*
 * Generic process-grouping system.
 *
 * Based originally on the cpuset system, extracted by Paul Menage
 * Copyright (C) 2006 Google, Inc
 *
 * Notifications support
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Copyright notices from the original cpuset code:
 * --------------------------------------------------
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * ---------------------------------------------------
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */

#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);

/*
 * Generate an array of cgroup subsystem pointers. At boot time, this is
 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
 * registered after that. The mutable section of this array is protected by
 * cgroup_mutex.
 */
#define SUBSYS(_x) &_x ## _subsys,
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};

#define MAX_CGROUP_ROOT_NAMELEN 64

/*
 * A cgroupfs_root represents the root of a cgroup hierarchy,
 * and may be associated with a superblock to form an active
 * hierarchy
 */
struct cgroupfs_root {
        struct super_block *sb;

        /*
         * The bitmask of subsystems intended to be attached to this
         * hierarchy
         */
        unsigned long subsys_bits;

        /* Unique id for this hierarchy. */
        int hierarchy_id;

        /* The bitmask of subsystems currently attached to this hierarchy */
        unsigned long actual_subsys_bits;

        /* A list running through the attached subsystems */
        struct list_head subsys_list;

        /* The root cgroup for this hierarchy */
        struct cgroup top_cgroup;

        /* Tracks how many cgroups are currently defined in hierarchy.*/
        int number_of_cgroups;

        /* A list running through the active hierarchies */
        struct list_head root_list;

        /* Hierarchy-specific flags */
        unsigned long flags;

        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];

        /* The name for this hierarchy - may be empty */
        char name[MAX_CGROUP_ROOT_NAMELEN];
};

/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup, and all tasks are part of that cgroup.
 */
static struct cgroupfs_root rootnode;

/*
 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
 * cgroup_subsys->use_id != 0.
 */
#define CSS_ID_MAX (65535)
struct css_id {
        /*
         * The css to which this ID points. This pointer is set to a valid
         * value after the cgroup is populated. If the cgroup is removed,
         * this will be NULL. This pointer is expected to be RCU-safe
         * because destroy() is called after synchronize_rcu(). But for safe
         * use, css_is_removed() or css_tryget() should be used to avoid
         * races.
         */
        struct cgroup_subsys_state __rcu *css;
        /*
         * ID of this css.
         */
        unsigned short id;
        /*
         * Depth in the hierarchy this ID belongs to.
         */
        unsigned short depth;
        /*
         * ID is freed by RCU. (and lookup routine is RCU safe.)
         */
        struct rcu_head rcu_head;
        /*
         * Hierarchy this CSS ID belongs to.
         */
        unsigned short stack[0]; /* Array of Length (depth+1) */
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct cgroup_event {
        /*
         * Cgroup to which the event belongs.
         */
        struct cgroup *cgrp;
        /*
         * Control file with which the event is associated.
         */
        struct cftype *cft;
        /*
         * eventfd to signal userspace about the event.
         */
        struct eventfd_ctx *eventfd;
        /*
         * Each of these is stored in a list by the cgroup.
         */
        struct list_head list;
        /*
         * All fields below are needed to unregister the event when
         * userspace closes the eventfd.
         */
        poll_table pt;
        wait_queue_head_t *wqh;
        wait_queue_t wait;
        struct work_struct remove;
};

/* The list of hierarchy roots */

static LIST_HEAD(roots);
static int root_count;

static DEFINE_IDA(hierarchy_ida);
static int next_hierarchy_id;
static DEFINE_SPINLOCK(hierarchy_id_lock);

/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)

/* This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;

#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
        return lockdep_is_held(&cgroup_mutex);
}
#else /* #ifdef CONFIG_PROVE_LOCKING */
int cgroup_lock_is_held(void)
{
        return mutex_is_locked(&cgroup_mutex);
}
#endif /* #else #ifdef CONFIG_PROVE_LOCKING */

EXPORT_SYMBOL_GPL(cgroup_lock_is_held);

/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
        return test_bit(CGRP_REMOVED, &cgrp->flags);
}

/* bits in struct cgroupfs_root flags field */
enum {
        ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};

static int cgroup_is_releasable(const struct cgroup *cgrp)
{
        const int bits =
                (1 << CGRP_RELEASABLE) |
                (1 << CGRP_NOTIFY_ON_RELEASE);
        return (cgrp->flags & bits) == bits;
}

static int notify_on_release(const struct cgroup *cgrp)
{
        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

static int clone_children(const struct cgroup *cgrp)
{
        return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
}

/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
 */
#define for_each_subsys(_root, _ss) \
        list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
        list_for_each_entry(_root, &roots, root_list)
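
/*
 * Illustrative usage sketch (added for clarity, not part of the original
 * file): both macros expect cgroup_mutex to be held so the lists cannot
 * change under the iteration.
 *
 *        struct cgroupfs_root *root;
 *        struct cgroup_subsys *ss;
 *
 *        for_each_active_root(root)
 *                for_each_subsys(root, ss)
 *                        printk(KERN_DEBUG "hierarchy %d: %s\n",
 *                               root->hierarchy_id, ss->name);
 */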

/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);

/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
        /*
         * List running through cg_cgroup_links associated with a
         * cgroup, anchored on cgroup->css_sets
         */
        struct list_head cgrp_link_list;
        struct cgroup *cgrp;
        /*
         * List running through cg_cgroup_links pointing at a
         * single css_set object, anchored on css_set->cg_links
         */
        struct list_head cg_link_list;
        struct css_set *cg;
};

/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */

static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

static int cgroup_init_idr(struct cgroup_subsys *ss,
                           struct cgroup_subsys_state *css);

/* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set. Nests outside task->alloc_lock
 * due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];

static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
{
        int i;
        int index;
        unsigned long tmp = 0UL;

        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
                tmp += (unsigned long)css[i];
        tmp = (tmp >> 16) ^ tmp;

        index = hash_long(tmp, CSS_SET_HASH_BITS);

        return &css_set_table[index];
}
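
/*
 * Illustrative note (an inference about intent, not original commentary):
 * folding with (tmp >> 16) ^ tmp mixes high-order pointer bits into the
 * low bits before hash_long(), so css pointer sums that differ only in
 * their upper bits still spread across the CSS_SET_TABLE_SIZE (128)
 * buckets of css_set_table.
 */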

/* We don't maintain the lists running through each css_set to its
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;

static void __put_css_set(struct css_set *cg, int taskexit)
{
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;
        /*
         * Ensure that the refcount doesn't hit zero while any readers
         * can see it. Similar to atomic_dec_and_lock(), but for an
         * rwlock
         */
        if (atomic_add_unless(&cg->refcount, -1, 1))
                return;
        write_lock(&css_set_lock);
        if (!atomic_dec_and_test(&cg->refcount)) {
                write_unlock(&css_set_lock);
                return;
        }

        /* This css_set is dead. unlink it and release cgroup refcounts */
        hlist_del(&cg->hlist);
        css_set_count--;

        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
                                 cg_link_list) {
                struct cgroup *cgrp = link->cgrp;
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }

                kfree(link);
        }

        write_unlock(&css_set_lock);
        kfree_rcu(cg, rcu_head);
}
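
/*
 * Explanatory note (added): atomic_add_unless(&cg->refcount, -1, 1)
 * decrements the count only when it is not 1, so the common "drop a
 * non-final reference" case never takes css_set_lock; only a thread
 * that might be dropping the last reference falls through to the
 * locked re-check above.
 */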

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cg)
{
        atomic_inc(&cg->refcount);
}

static inline void put_css_set(struct css_set *cg)
{
        __put_css_set(cg, 0);
}

static inline void put_css_set_taskexit(struct css_set *cg)
{
        __put_css_set(cg, 1);
}

/*
 * compare_css_sets - helper function for find_existing_css_set().
 * @cg: candidate css_set being tested
 * @old_cg: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cg" matches "old_cg" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cg,
                             struct css_set *old_cg,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
{
        struct list_head *l1, *l2;

        if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
                /* Not all subsystems matched */
                return false;
        }

        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies with no subsystems. We
         * could get by with just this check alone (and skip the
         * memcmp above) but on most setups the memcmp check will
         * avoid the need for this more expensive check on almost all
         * candidates.
         */

        l1 = &cg->cg_links;
        l2 = &old_cg->cg_links;
        while (1) {
                struct cg_cgroup_link *cgl1, *cgl2;
                struct cgroup *cg1, *cg2;

                l1 = l1->next;
                l2 = l2->next;
                /* See if we reached the end - both lists are equal length. */
                if (l1 == &cg->cg_links) {
                        BUG_ON(l2 != &old_cg->cg_links);
                        break;
                } else {
                        BUG_ON(l2 == &old_cg->cg_links);
                }
                /* Locate the cgroups associated with these links. */
                cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
                cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
                cg1 = cgl1->cgrp;
                cg2 = cgl2->cgrp;
                /* Hierarchies should be linked in the same order. */
                BUG_ON(cg1->root != cg2->root);

                /*
                 * If this hierarchy is the hierarchy of the cgroup
                 * that's changing, then we need to check that this
                 * css_set points to the new cgroup; if it's any other
                 * hierarchy, then this css_set should point to the
                 * same cgroup as the old css_set.
                 */
                if (cg1->root == new_cgrp->root) {
                        if (cg1 != new_cgrp)
                                return false;
                } else {
                        if (cg1 != cg2)
                                return false;
                }
        }
        return true;
}

/*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
 * css_set is suitable.
 *
 * oldcg: the cgroup group that we're using before the cgroup
 * transition
 *
 * cgrp: the cgroup that we're moving into
 *
 * template: location in which to build the desired set of subsystem
 * state objects for the new cgroup group
 */
static struct css_set *find_existing_css_set(
        struct css_set *oldcg,
        struct cgroup *cgrp,
        struct cgroup_subsys_state *template[])
{
        int i;
        struct cgroupfs_root *root = cgrp->root;
        struct hlist_head *hhead;
        struct hlist_node *node;
        struct css_set *cg;

        /*
         * Build the set of subsystem state objects that we want to see in the
         * new css_set. While subsystems can change globally, the entries here
         * won't change, so no need for locking.
         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                if (root->subsys_bits & (1UL << i)) {
                        /* Subsystem is in this hierarchy. So we want
                         * the subsystem state from the new
                         * cgroup */
                        template[i] = cgrp->subsys[i];
                } else {
                        /* Subsystem is not in this hierarchy, so we
                         * don't want to change the subsystem state */
                        template[i] = oldcg->subsys[i];
                }
        }

        hhead = css_set_hash(template);
        hlist_for_each_entry(cg, node, hhead, hlist) {
                if (!compare_css_sets(cg, oldcg, cgrp, template))
                        continue;

                /* This css_set matches what we need */
                return cg;
        }

        /* No existing cgroup group matched */
        return NULL;
}

static void free_cg_links(struct list_head *tmp)
{
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;

        list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
}

/*
 * allocate_cg_links() allocates "count" cg_cgroup_link structures
 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
 * success or a negative error
 */
static int allocate_cg_links(int count, struct list_head *tmp)
{
        struct cg_cgroup_link *link;
        int i;
        INIT_LIST_HEAD(tmp);
        for (i = 0; i < count; i++) {
                link = kmalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
                        free_cg_links(tmp);
                        return -ENOMEM;
                }
                list_add(&link->cgrp_link_list, tmp);
        }
        return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
 * @cg: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_cg_links,
                         struct css_set *cg, struct cgroup *cgrp)
{
        struct cg_cgroup_link *link;

        BUG_ON(list_empty(tmp_cg_links));
        link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
                                cgrp_link_list);
        link->cg = cg;
        link->cgrp = cgrp;
        atomic_inc(&cgrp->count);
        list_move(&link->cgrp_link_list, &cgrp->css_sets);
        /*
         * Always add links to the tail of the list so that the list
         * is sorted by order of hierarchy creation
         */
        list_add_tail(&link->cg_link_list, &cg->cg_links);
}

/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held
 */
static struct css_set *find_css_set(
        struct css_set *oldcg, struct cgroup *cgrp)
{
        struct css_set *res;
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

        struct list_head tmp_cg_links;

        struct hlist_head *hhead;
        struct cg_cgroup_link *link;

        /* First see if we already have a cgroup group that matches
         * the desired set */
        read_lock(&css_set_lock);
        res = find_existing_css_set(oldcg, cgrp, template);
        if (res)
                get_css_set(res);
        read_unlock(&css_set_lock);

        if (res)
                return res;

        res = kmalloc(sizeof(*res), GFP_KERNEL);
        if (!res)
                return NULL;

        /* Allocate all the cg_cgroup_link objects that we'll need */
        if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
                kfree(res);
                return NULL;
        }

        atomic_set(&res->refcount, 1);
        INIT_LIST_HEAD(&res->cg_links);
        INIT_LIST_HEAD(&res->tasks);
        INIT_HLIST_NODE(&res->hlist);

        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
        memcpy(res->subsys, template, sizeof(res->subsys));

        write_lock(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
        list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
                struct cgroup *c = link->cgrp;
                if (c->root == cgrp->root)
                        c = cgrp;
                link_css_set(&tmp_cg_links, res, c);
        }

        BUG_ON(!list_empty(&tmp_cg_links));

        css_set_count++;

        /* Add this cgroup group to the hash table */
        hhead = css_set_hash(res->subsys);
        hlist_add_head(&res->hlist, hhead);

        write_unlock(&css_set_lock);

        return res;
}
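
/*
 * Illustrative note (added): find_css_set() allocates root_count link
 * objects up front because the new css_set needs one cg_cgroup_link per
 * active hierarchy; link_css_set() then consumes them one at a time,
 * which is why the BUG_ON after the loop expects the temporary list to
 * be empty.
 */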

/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroupfs_root *root)
{
        struct css_set *css;
        struct cgroup *res = NULL;

        BUG_ON(!mutex_is_locked(&cgroup_mutex));
        read_lock(&css_set_lock);
        /*
         * No need to lock the task - since we hold cgroup_mutex the
         * task can't change groups, so the only thing that can happen
         * is that it exits and its css is set back to init_css_set.
         */
        css = task->cgroups;
        if (css == &init_css_set) {
                res = &root->top_cgroup;
        } else {
                struct cg_cgroup_link *link;
                list_for_each_entry(link, &css->cg_links, cg_link_list) {
                        struct cgroup *c = link->cgrp;
                        if (c->root == root) {
                                res = c;
                                break;
                        }
                }
        }
        read_unlock(&css_set_lock);
        BUG_ON(!res);
        return res;
}

/*
 * There is one global cgroup mutex. We also require taking
 * task_lock() when dereferencing a task's cgroup subsys pointers.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing. However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again. Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count). So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex. These are the two most performance
 * critical pieces of code here. The exception occurs on cgroup_exit(),
 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
 * is taken, and if the cgroup count is zero, a usermode call is made
 * to the release agent with the name of the cgroup (path relative to
 * the root of the cgroup file system) as the argument.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty. Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, top_cgroup
 * always has either children cgroups and/or using tasks. So we don't
 * need a special hack to ensure that top_cgroup cannot be deleted.
 *
 * The task_lock() exception
 *
 * The need for this exception arises from the action of
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
 * another. It does so using cgroup_mutex, however there are
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex. Therefore except as noted below, when dereferencing or, as
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S. One more locking exception. RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */
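
/*
 * Illustrative sketch of the task_lock() rule above (added; it mirrors
 * what cgroup_task_migrate() below actually does, and is not a new API):
 *
 *        task_lock(tsk);
 *        cg = tsk->cgroups;        // stable while task_lock is held
 *        get_css_set(cg);          // pin it before dropping the lock
 *        task_unlock(tsk);
 */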

/**
 * cgroup_lock - lock out any changes to cgroup structures
 *
 */
void cgroup_lock(void)
{
        mutex_lock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_lock);

/**
 * cgroup_unlock - release lock on cgroup changes
 *
 * Undo the lock taken in a previous cgroup_lock() call.
 */
void cgroup_unlock(void)
{
        mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unlock);

/*
 * A couple of forward declarations required, due to cyclic reference loop:
 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
 * -> cgroup_mkdir.
 */

static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;

static struct backing_dev_info cgroup_backing_dev_info = {
        .name = "cgroup",
        .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static int alloc_css_id(struct cgroup_subsys *ss,
                        struct cgroup *parent, struct cgroup *child);

static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
        struct inode *inode = new_inode(sb);

        if (inode) {
                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
        }
        return inode;
}

/*
 * Call subsys's pre_destroy handler.
 * This is called before css refcnt check.
 */
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
        struct cgroup_subsys *ss;
        int ret = 0;

        for_each_subsys(cgrp->root, ss)
                if (ss->pre_destroy) {
                        ret = ss->pre_destroy(ss, cgrp);
                        if (ret)
                                break;
                }

        return ret;
}

static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
        /* is dentry a directory ? if so, kfree() associated cgroup */
        if (S_ISDIR(inode->i_mode)) {
                struct cgroup *cgrp = dentry->d_fsdata;
                struct cgroup_subsys *ss;
                BUG_ON(!(cgroup_is_removed(cgrp)));
                /* It's possible for external users to be holding css
                 * reference counts on a cgroup; css_put() needs to
                 * be able to access the cgroup after decrementing
                 * the reference count in order to know if it needs to
                 * queue the cgroup to be handled by the release
                 * agent */
                synchronize_rcu();

                mutex_lock(&cgroup_mutex);
                /*
                 * Release the subsystem state objects.
                 */
                for_each_subsys(cgrp->root, ss)
                        ss->destroy(ss, cgrp);

                cgrp->root->number_of_cgroups--;
                mutex_unlock(&cgroup_mutex);

                /*
                 * Drop the active superblock reference that we took when we
                 * created the cgroup
                 */
                deactivate_super(cgrp->root->sb);

                /*
                 * if we're getting rid of the cgroup, refcount should ensure
                 * that there are no pidlists left.
                 */
                BUG_ON(!list_empty(&cgrp->pidlists));

                kfree_rcu(cgrp, rcu_head);
        }
        iput(inode);
}

static int cgroup_delete(const struct dentry *d)
{
        return 1;
}

static void remove_dir(struct dentry *d)
{
        struct dentry *parent = dget(d->d_parent);

        d_delete(d);
        simple_rmdir(parent->d_inode, d);
        dput(parent);
}

static void cgroup_clear_directory(struct dentry *dentry)
{
        struct list_head *node;

        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
        spin_lock(&dentry->d_lock);
        node = dentry->d_subdirs.next;
        while (node != &dentry->d_subdirs) {
                struct dentry *d = list_entry(node, struct dentry, d_u.d_child);

                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                list_del_init(node);
                if (d->d_inode) {
                        /* This should never be called on a cgroup
                         * directory with child cgroups */
                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
                        dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        spin_unlock(&dentry->d_lock);
                        d_delete(d);
                        simple_unlink(dentry->d_inode, d);
                        dput(d);
                        spin_lock(&dentry->d_lock);
                } else
                        spin_unlock(&d->d_lock);
                node = dentry->d_subdirs.next;
        }
        spin_unlock(&dentry->d_lock);
}

/*
 * NOTE : the dentry must have been dget()'ed
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
        struct dentry *parent;

        cgroup_clear_directory(dentry);

        parent = dentry->d_parent;
        spin_lock(&parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        list_del_init(&dentry->d_u.d_child);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&parent->d_lock);
        remove_dir(dentry);
}

/*
 * A queue for waiters to do rmdir() on a cgroup. A task will sleep when
 * cgroup->count == 0 && list_empty(&cgroup->children) && a subsys has some
 * reference to css->refcnt. In general, this refcnt is expected to go down
 * to zero, soon.
 *
 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
 */
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);

static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
        if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
                wake_up_all(&cgroup_rmdir_waitq);
}

void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
{
        css_get(css);
}

void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
{
        cgroup_wakeup_rmdir_waiter(css->cgroup);
        css_put(css);
}

/*
 * Call with cgroup_mutex held. Drops reference counts on modules, including
 * any duplicate ones that parse_cgroupfs_options took. If this function
 * returns an error, no reference counts are touched.
 */
static int rebind_subsystems(struct cgroupfs_root *root,
                             unsigned long final_bits)
{
        unsigned long added_bits, removed_bits;
        struct cgroup *cgrp = &root->top_cgroup;
        int i;

        BUG_ON(!mutex_is_locked(&cgroup_mutex));

        removed_bits = root->actual_subsys_bits & ~final_bits;
        added_bits = final_bits & ~root->actual_subsys_bits;
        /* Check that any added subsystems are currently free */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                unsigned long bit = 1UL << i;
                struct cgroup_subsys *ss = subsys[i];
                if (!(bit & added_bits))
                        continue;
                /*
                 * Nobody should tell us to do a subsys that doesn't exist:
                 * parse_cgroupfs_options should catch that case and refcounts
                 * ensure that subsystems won't disappear once selected.
                 */
                BUG_ON(ss == NULL);
                if (ss->root != &rootnode) {
                        /* Subsystem isn't free */
                        return -EBUSY;
                }
        }

        /* Currently we don't handle adding/removing subsystems when
         * any child cgroups exist. This is theoretically supportable
         * but involves complex error handling, so it's being left until
         * later */
        if (root->number_of_cgroups > 1)
                return -EBUSY;

        /* Process each subsystem */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                unsigned long bit = 1UL << i;
                if (bit & added_bits) {
                        /* We're binding this subsystem to this hierarchy */
                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
                        mutex_lock(&ss->hierarchy_mutex);
                        cgrp->subsys[i] = dummytop->subsys[i];
                        cgrp->subsys[i]->cgroup = cgrp;
                        list_move(&ss->sibling, &root->subsys_list);
                        ss->root = root;
                        if (ss->bind)
                                ss->bind(ss, cgrp);
                        mutex_unlock(&ss->hierarchy_mutex);
                        /* refcount was already taken, and we're keeping it */
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
                        mutex_lock(&ss->hierarchy_mutex);
                        if (ss->bind)
                                ss->bind(ss, dummytop);
                        dummytop->subsys[i]->cgroup = dummytop;
                        cgrp->subsys[i] = NULL;
                        subsys[i]->root = &rootnode;
                        list_move(&ss->sibling, &rootnode.subsys_list);
                        mutex_unlock(&ss->hierarchy_mutex);
                        /* subsystem is now free - drop reference on module */
                        module_put(ss->module);
                } else if (bit & final_bits) {
                        /* Subsystem state should already exist */
                        BUG_ON(ss == NULL);
                        BUG_ON(!cgrp->subsys[i]);
                        /*
                         * a refcount was taken, but we already had one, so
                         * drop the extra reference.
                         */
                        module_put(ss->module);
#ifdef CONFIG_MODULE_UNLOAD
                        BUG_ON(ss->module && !module_refcount(ss->module));
#endif
                } else {
                        /* Subsystem state shouldn't exist */
                        BUG_ON(cgrp->subsys[i]);
                }
        }
        root->subsys_bits = root->actual_subsys_bits = final_bits;
        synchronize_rcu();

        return 0;
}

static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
        struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
        struct cgroup_subsys *ss;

        mutex_lock(&cgroup_mutex);
        for_each_subsys(root, ss)
                seq_printf(seq, ",%s", ss->name);
        if (test_bit(ROOT_NOPREFIX, &root->flags))
                seq_puts(seq, ",noprefix");
        if (strlen(root->release_agent_path))
                seq_printf(seq, ",release_agent=%s", root->release_agent_path);
        if (clone_children(&root->top_cgroup))
                seq_puts(seq, ",clone_children");
        if (strlen(root->name))
                seq_printf(seq, ",name=%s", root->name);
        mutex_unlock(&cgroup_mutex);
        return 0;
}

struct cgroup_sb_opts {
        unsigned long subsys_bits;
        unsigned long flags;
        char *release_agent;
        bool clone_children;
        char *name;
        /* User explicitly requested empty subsystem */
        bool none;

        struct cgroupfs_root *new_root;

};

/*
 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
 * with cgroup_mutex held to protect the subsys[] array. This function takes
 * refcounts on subsystems to be used, unless it returns error, in which case
 * no refcounts are taken.
 */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
        char *token, *o = data;
        bool all_ss = false, one_ss = false;
        unsigned long mask = (unsigned long)-1;
        int i;
        bool module_pin_failed = false;

        BUG_ON(!mutex_is_locked(&cgroup_mutex));

#ifdef CONFIG_CPUSETS
        mask = ~(1UL << cpuset_subsys_id);
#endif

        memset(opts, 0, sizeof(*opts));

        while ((token = strsep(&o, ",")) != NULL) {
                if (!*token)
                        return -EINVAL;
                if (!strcmp(token, "none")) {
                        /* Explicitly have no subsystems */
                        opts->none = true;
                        continue;
                }
                if (!strcmp(token, "all")) {
                        /* Mutually exclusive option 'all' + subsystem name */
                        if (one_ss)
                                return -EINVAL;
                        all_ss = true;
                        continue;
                }
                if (!strcmp(token, "noprefix")) {
                        set_bit(ROOT_NOPREFIX, &opts->flags);
                        continue;
                }
                if (!strcmp(token, "clone_children")) {
                        opts->clone_children = true;
                        continue;
                }
                if (!strncmp(token, "release_agent=", 14)) {
                        /* Specifying two release agents is forbidden */
                        if (opts->release_agent)
                                return -EINVAL;
                        opts->release_agent =
                                kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
                        if (!opts->release_agent)
                                return -ENOMEM;
                        continue;
                }
                if (!strncmp(token, "name=", 5)) {
                        const char *name = token + 5;
                        /* Can't specify an empty name */
                        if (!strlen(name))
                                return -EINVAL;
                        /* Must match [\w.-]+ */
                        for (i = 0; i < strlen(name); i++) {
                                char c = name[i];
                                if (isalnum(c))
                                        continue;
                                if ((c == '.') || (c == '-') || (c == '_'))
                                        continue;
                                return -EINVAL;
                        }
                        /* Specifying two names is forbidden */
                        if (opts->name)
                                return -EINVAL;
                        opts->name = kstrndup(name,
                                              MAX_CGROUP_ROOT_NAMELEN - 1,
                                              GFP_KERNEL);
                        if (!opts->name)
                                return -ENOMEM;

                        continue;
                }

                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss == NULL)
                                continue;
                        if (strcmp(token, ss->name))
                                continue;
                        if (ss->disabled)
                                continue;

                        /* Mutually exclusive option 'all' + subsystem name */
                        if (all_ss)
                                return -EINVAL;
                        set_bit(i, &opts->subsys_bits);
                        one_ss = true;

                        break;
                }
                if (i == CGROUP_SUBSYS_COUNT)
                        return -ENOENT;
        }

        /*
         * If the 'all' option was specified select all the subsystems,
         * otherwise if 'all', 'none' and a subsystem name option were
         * not specified, let's default to 'all'
         */
        if (all_ss || (!all_ss && !one_ss && !opts->none)) {
                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss == NULL)
                                continue;
                        if (ss->disabled)
                                continue;
                        set_bit(i, &opts->subsys_bits);
                }
        }

        /* Consistency checks */

        /*
         * Option noprefix was introduced just for backward compatibility
         * with the old cpuset, so we allow noprefix only if mounting just
         * the cpuset subsystem.
         */
        if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
            (opts->subsys_bits & mask))
                return -EINVAL;


        /* Can't specify "none" and some subsystems */
        if (opts->subsys_bits && opts->none)
                return -EINVAL;

        /*
         * We either have to specify by name or by subsystems. (So all
         * empty hierarchies must have a name).
         */
        if (!opts->subsys_bits && !opts->name)
                return -EINVAL;

        /*
         * Grab references on all the modules we'll need, so the subsystems
         * don't dance around before rebind_subsystems attaches them. This may
         * take duplicate reference counts on a subsystem that's already used,
         * but rebind_subsystems handles this case.
         */
        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
                unsigned long bit = 1UL << i;

                if (!(bit & opts->subsys_bits))
                        continue;
                if (!try_module_get(subsys[i]->module)) {
                        module_pin_failed = true;
                        break;
                }
        }
        if (module_pin_failed) {
                /*
                 * oops, one of the modules was going away. this means that we
                 * raced with a module_delete call, and to the user this is
                 * essentially a "subsystem doesn't exist" case.
                 */
                for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
                        /* drop refcounts only on the ones we took */
                        unsigned long bit = 1UL << i;

                        if (!(bit & opts->subsys_bits))
                                continue;
                        module_put(subsys[i]->module);
                }
                return -ENOENT;
        }

        return 0;
}
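
/*
 * Illustrative option strings this parser accepts (added; the agent path
 * and hierarchy name are hypothetical examples, not mandated by the code):
 *
 *        "cpuset,noprefix"        - cpuset hierarchy with legacy unprefixed files
 *        "all,clone_children"     - every registered, enabled subsystem
 *        "none,name=mytree"       - a named hierarchy with no subsystems
 *        "cpu,release_agent=/sbin/cgroup_release"
 */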

static void drop_parsed_module_refcounts(unsigned long subsys_bits)
{
        int i;
        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
                unsigned long bit = 1UL << i;

                if (!(bit & subsys_bits))
                        continue;
                module_put(subsys[i]->module);
        }
}

static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
        int ret = 0;
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
        struct cgroup_sb_opts opts;

        mutex_lock(&cgrp->dentry->d_inode->i_mutex);
        mutex_lock(&cgroup_mutex);

        /* See what subsystems are wanted */
        ret = parse_cgroupfs_options(data, &opts);
        if (ret)
                goto out_unlock;

        /* Don't allow flags or name to change at remount */
        if (opts.flags != root->flags ||
            (opts.name && strcmp(opts.name, root->name))) {
                ret = -EINVAL;
                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
        }

        ret = rebind_subsystems(root, opts.subsys_bits);
        if (ret) {
                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
        }

        /* (re)populate subsystem files */
        cgroup_populate_dir(cgrp);

        if (opts.release_agent)
                strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
        kfree(opts.release_agent);
        kfree(opts.name);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
        return ret;
}

static const struct super_operations cgroup_ops = {
        .statfs = simple_statfs,
        .drop_inode = generic_delete_inode,
        .show_options = cgroup_show_options,
        .remount_fs = cgroup_remount,
};

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
        INIT_LIST_HEAD(&cgrp->sibling);
        INIT_LIST_HEAD(&cgrp->children);
        INIT_LIST_HEAD(&cgrp->css_sets);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
        INIT_LIST_HEAD(&cgrp->event_list);
        spin_lock_init(&cgrp->event_list_lock);
}

static void init_cgroup_root(struct cgroupfs_root *root)
{
        struct cgroup *cgrp = &root->top_cgroup;
        INIT_LIST_HEAD(&root->subsys_list);
        INIT_LIST_HEAD(&root->root_list);
        root->number_of_cgroups = 1;
        cgrp->root = root;
        cgrp->top_cgroup = cgrp;
        init_cgroup_housekeeping(cgrp);
}

static bool init_root_id(struct cgroupfs_root *root)
{
        int ret = 0;

        do {
                if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
                        return false;
                spin_lock(&hierarchy_id_lock);
                /* Try to allocate the next unused ID */
                ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
                                        &root->hierarchy_id);
                if (ret == -ENOSPC)
                        /* Try again starting from 0 */
                        ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
                if (!ret) {
                        next_hierarchy_id = root->hierarchy_id + 1;
                } else if (ret != -EAGAIN) {
                        /* Can only get here if the 31-bit IDR is full ... */
                        BUG_ON(ret);
                }
                spin_unlock(&hierarchy_id_lock);
        } while (ret);
        return true;
}

static int cgroup_test_super(struct super_block *sb, void *data)
{
        struct cgroup_sb_opts *opts = data;
        struct cgroupfs_root *root = sb->s_fs_info;

        /* If we asked for a name then it must match */
        if (opts->name && strcmp(opts->name, root->name))
                return 0;

        /*
         * If we asked for subsystems (or explicitly for no
         * subsystems) then they must match
         */
        if ((opts->subsys_bits || opts->none)
            && (opts->subsys_bits != root->subsys_bits))
                return 0;

        return 1;
}

static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
        struct cgroupfs_root *root;

        if (!opts->subsys_bits && !opts->none)
                return NULL;

        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        if (!init_root_id(root)) {
                kfree(root);
                return ERR_PTR(-ENOMEM);
        }
        init_cgroup_root(root);

        root->subsys_bits = opts->subsys_bits;
        root->flags = opts->flags;
        if (opts->release_agent)
                strcpy(root->release_agent_path, opts->release_agent);
        if (opts->name)
                strcpy(root->name, opts->name);
        if (opts->clone_children)
                set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
        return root;
}

static void cgroup_drop_root(struct cgroupfs_root *root)
{
        if (!root)
                return;

        BUG_ON(!root->hierarchy_id);
        spin_lock(&hierarchy_id_lock);
        ida_remove(&hierarchy_ida, root->hierarchy_id);
        spin_unlock(&hierarchy_id_lock);
        kfree(root);
}

static int cgroup_set_super(struct super_block *sb, void *data)
{
        int ret;
        struct cgroup_sb_opts *opts = data;

        /* If we don't have a new root, we can't set up a new sb */
        if (!opts->new_root)
                return -EINVAL;

        BUG_ON(!opts->subsys_bits && !opts->none);

        ret = set_anon_super(sb, NULL);
        if (ret)
                return ret;

        sb->s_fs_info = opts->new_root;
        opts->new_root->sb = sb;

        sb->s_blocksize = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
        sb->s_magic = CGROUP_SUPER_MAGIC;
        sb->s_op = &cgroup_ops;

        return 0;
}

static int cgroup_get_rootdir(struct super_block *sb)
{
        static const struct dentry_operations cgroup_dops = {
                .d_iput = cgroup_diput,
                .d_delete = cgroup_delete,
        };

        struct inode *inode =
                cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
        struct dentry *dentry;

        if (!inode)
                return -ENOMEM;

        inode->i_fop = &simple_dir_operations;
        inode->i_op = &cgroup_dir_inode_operations;
        /* directories start off with i_nlink == 2 (for "." entry) */
        inc_nlink(inode);
        dentry = d_alloc_root(inode);
        if (!dentry) {
                iput(inode);
                return -ENOMEM;
        }
        sb->s_root = dentry;
        /* for everything else we want ->d_op set */
        sb->s_d_op = &cgroup_dops;
        return 0;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                                   int flags, const char *unused_dev_name,
                                   void *data)
{
        struct cgroup_sb_opts opts;
        struct cgroupfs_root *root;
        int ret = 0;
        struct super_block *sb;
        struct cgroupfs_root *new_root;

        /* First find the desired set of subsystems */
        mutex_lock(&cgroup_mutex);
        ret = parse_cgroupfs_options(data, &opts);
        mutex_unlock(&cgroup_mutex);
        if (ret)
                goto out_err;

        /*
         * Allocate a new cgroup root. We may not need it if we're
         * reusing an existing hierarchy.
         */
        new_root = cgroup_root_from_opts(&opts);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                goto drop_modules;
        }
        opts.new_root = new_root;

        /* Locate an existing or new sb for this hierarchy */
        sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                cgroup_drop_root(opts.new_root);
                goto drop_modules;
        }

        root = sb->s_fs_info;
        BUG_ON(!root);
        if (root == opts.new_root) {
                /* We used the new root structure, so this is a new hierarchy */
                struct list_head tmp_cg_links;
                struct cgroup *root_cgrp = &root->top_cgroup;
                struct inode *inode;
                struct cgroupfs_root *existing_root;
                int i;

                BUG_ON(sb->s_root != NULL);

                ret = cgroup_get_rootdir(sb);
                if (ret)
                        goto drop_new_super;
                inode = sb->s_root->d_inode;

                mutex_lock(&inode->i_mutex);
                mutex_lock(&cgroup_mutex);

                if (strlen(root->name)) {
                        /* Check for name clashes with existing mounts */
                        for_each_active_root(existing_root) {
                                if (!strcmp(existing_root->name, root->name)) {
                                        ret = -EBUSY;
                                        mutex_unlock(&cgroup_mutex);
                                        mutex_unlock(&inode->i_mutex);
                                        goto drop_new_super;
                                }
                        }
                }

                /*
                 * We're accessing css_set_count without locking
                 * css_set_lock here, but that's OK - it can only be
                 * increased by someone holding cgroup_lock, and
                 * that's us. The worst that can happen is that we
                 * have some link structures left over
                 */
                ret = allocate_cg_links(css_set_count, &tmp_cg_links);
                if (ret) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&inode->i_mutex);
                        goto drop_new_super;
                }

                ret = rebind_subsystems(root, root->subsys_bits);
                if (ret == -EBUSY) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&inode->i_mutex);
                        free_cg_links(&tmp_cg_links);
                        goto drop_new_super;
                }
                /*
                 * There must be no failure case after here, since rebinding
                 * takes care of subsystems' refcounts, which are explicitly
                 * dropped in the failure exit path.
                 */

                /* EBUSY should be the only error here */
                BUG_ON(ret);

                list_add(&root->root_list, &roots);
                root_count++;

                sb->s_root->d_fsdata = root_cgrp;
                root->top_cgroup.dentry = sb->s_root;

                /* Link the top cgroup in this hierarchy into all
                 * the css_set objects */
                write_lock(&css_set_lock);
                for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
                        struct hlist_head *hhead = &css_set_table[i];
                        struct hlist_node *node;
                        struct css_set *cg;

                        hlist_for_each_entry(cg, node, hhead, hlist)
                                link_css_set(&tmp_cg_links, cg, root_cgrp);
                }
                write_unlock(&css_set_lock);

                free_cg_links(&tmp_cg_links);

                BUG_ON(!list_empty(&root_cgrp->sibling));
                BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);

                cgroup_populate_dir(root_cgrp);
                mutex_unlock(&cgroup_mutex);
                mutex_unlock(&inode->i_mutex);
        } else {
                /*
                 * We re-used an existing hierarchy - the new root (if
                 * any) is not needed
                 */
                cgroup_drop_root(opts.new_root);
                /* no subsys rebinding, so refcounts don't change */
                drop_parsed_module_refcounts(opts.subsys_bits);
        }

        kfree(opts.release_agent);
        kfree(opts.name);
        return dget(sb->s_root);

 drop_new_super:
        deactivate_locked_super(sb);
 drop_modules:
        drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
        kfree(opts.release_agent);
        kfree(opts.name);
        return ERR_PTR(ret);
}

static void cgroup_kill_sb(struct super_block *sb) {
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
        int ret;
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;

        BUG_ON(!root);

        BUG_ON(root->number_of_cgroups != 1);
        BUG_ON(!list_empty(&cgrp->children));
        BUG_ON(!list_empty(&cgrp->sibling));

        mutex_lock(&cgroup_mutex);

        /* Rebind all subsystems back to the default hierarchy */
        ret = rebind_subsystems(root, 0);
        /* Shouldn't be able to fail ... */
        BUG_ON(ret);

        /*
         * Release all the links from css_sets to this hierarchy's
         * root cgroup
         */
        write_lock(&css_set_lock);

        list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
                                 cgrp_link_list) {
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
        write_unlock(&css_set_lock);

        if (!list_empty(&root->root_list)) {
                list_del(&root->root_list);
                root_count--;
        }

        mutex_unlock(&cgroup_mutex);

        kill_litter_super(sb);
        cgroup_drop_root(root);
}

static struct file_system_type cgroup_fs_type = {
        .name = "cgroup",
        .mount = cgroup_mount,
        .kill_sb = cgroup_kill_sb,
};

static struct kobject *cgroup_kobj;

static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
        return dentry->d_fsdata;
}

static inline struct cftype *__d_cft(struct dentry *dentry)
{
        return dentry->d_fsdata;
}

/**
 * cgroup_path - generate the path of a cgroup
 * @cgrp: the cgroup in question
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Called with cgroup_mutex held or else with an RCU-protected cgroup
 * reference. Writes path of cgroup into buf. Returns 0 on success,
 * -errno on error.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
        char *start;
        struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
                                                      rcu_read_lock_held() ||
                                                      cgroup_lock_is_held());

        if (!dentry || cgrp == dummytop) {
                /*
                 * Inactive subsystems have no dentry for their root
                 * cgroup
                 */
                strcpy(buf, "/");
                return 0;
        }

        start = buf + buflen;

        *--start = '\0';
        for (;;) {
                int len = dentry->d_name.len;

                if ((start -= len) < buf)
                        return -ENAMETOOLONG;
                memcpy(start, dentry->d_name.name, len);
                cgrp = cgrp->parent;
                if (!cgrp)
                        break;

                dentry = rcu_dereference_check(cgrp->dentry,
                                               rcu_read_lock_held() ||
                                               cgroup_lock_is_held());
                if (!cgrp->parent)
                        continue;
                if (--start < buf)
                        return -ENAMETOOLONG;
                *start = '/';
        }
        memmove(buf, start, buf + buflen - start);
        return 0;
}
EXPORT_SYMBOL_GPL(cgroup_path);
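
/*
 * Illustrative caller sketch (added, hypothetical; any real caller must
 * satisfy the locking rule stated in the kernel-doc above):
 *
 *        char buf[PATH_MAX];
 *
 *        cgroup_lock();
 *        if (!cgroup_path(cgrp, buf, sizeof(buf)))
 *                printk(KERN_INFO "cgroup: %s\n", buf);
 *        cgroup_unlock();
 */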

/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 *
 * 'guarantee' is set if the caller promises that a new css_set for the task
 * will already exist. If not set, this function might sleep, and can fail with
 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
 */
static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
                               struct task_struct *tsk, bool guarantee)
{
        struct css_set *oldcg;
        struct css_set *newcg;

        /*
         * get old css_set. we need to take task_lock and refcount it, because
         * an exiting task can change its css_set to init_css_set and drop its
         * old one without taking cgroup_mutex.
         */
        task_lock(tsk);
        oldcg = tsk->cgroups;
        get_css_set(oldcg);
        task_unlock(tsk);

        /* locate or allocate a new css_set for this task. */
        if (guarantee) {
                /* we know the css_set we want already exists. */
                struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
                read_lock(&css_set_lock);
                newcg = find_existing_css_set(oldcg, cgrp, template);
                BUG_ON(!newcg);
                get_css_set(newcg);
                read_unlock(&css_set_lock);
        } else {
                might_sleep();
                /* find_css_set will give us newcg already referenced. */
                newcg = find_css_set(oldcg, cgrp);
                if (!newcg) {
                        put_css_set(oldcg);
                        return -ENOMEM;
                }
        }
        put_css_set(oldcg);

        /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
        task_lock(tsk);
        if (tsk->flags & PF_EXITING) {
                task_unlock(tsk);
                put_css_set(newcg);
                return -ESRCH;
        }
        rcu_assign_pointer(tsk->cgroups, newcg);
        task_unlock(tsk);

        /* Update the css_set linked lists if we're using them */
        write_lock(&css_set_lock);
        if (!list_empty(&tsk->cg_list))
                list_move(&tsk->cg_list, &newcg->tasks);
        write_unlock(&css_set_lock);

        /*
         * We just gained a reference on oldcg by taking it from the task. As
         * trading it for newcg is protected by cgroup_mutex, we're safe to drop
         * it here; it will be freed under RCU.
         */
        put_css_set(oldcg);

        set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
        return 0;
}

/**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
 * @cgrp: the cgroup the task is attaching to
 * @tsk: the task to be attached
 *
 * Call holding cgroup_mutex. May take task_lock of
 * the task 'tsk' during call.
 */
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        int retval;
        struct cgroup_subsys *ss, *failed_ss = NULL;
        struct cgroup *oldcgrp;
        struct cgroupfs_root *root = cgrp->root;

        /* Nothing to do if the task is already in that cgroup */
        oldcgrp = task_cgroup_from_root(tsk, root);
        if (cgrp == oldcgrp)
                return 0;

        for_each_subsys(root, ss) {
                if (ss->can_attach) {
                        retval = ss->can_attach(ss, cgrp, tsk);
                        if (retval) {
                                /*
                                 * Remember on which subsystem the can_attach()
                                 * failed, so that we only call cancel_attach()
                                 * against the subsystems whose can_attach()
                                 * succeeded. (See below)
                                 */
                                failed_ss = ss;
                                goto out;
                        }
                }
                if (ss->can_attach_task) {
                        retval = ss->can_attach_task(cgrp, tsk);
                        if (retval) {
                                failed_ss = ss;
                                goto out;
                        }
                }
        }

        retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
        if (retval)
                goto out;

        for_each_subsys(root, ss) {
                if (ss->pre_attach)
                        ss->pre_attach(cgrp);
                if (ss->attach_task)
                        ss->attach_task(cgrp, tsk);
                if (ss->attach)
                        ss->attach(ss, cgrp, oldcgrp, tsk);
        }

        synchronize_rcu();

        /*
         * wake up rmdir() waiter. the rmdir should fail since the cgroup
         * is no longer empty.
         */
        cgroup_wakeup_rmdir_waiter(cgrp);
out:
        if (retval) {
                for_each_subsys(root, ss) {
                        if (ss == failed_ss)
                                /*
                                 * This subsystem was the one that failed the
                                 * can_attach() check earlier, so we don't need
                                 * to call cancel_attach() against it or any
                                 * remaining subsystems.
                                 */
                                break;
                        if (ss->cancel_attach)
                                ss->cancel_attach(ss, cgrp, tsk);
                }
        }
        return retval;
}
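
/*
 * Explanatory note on the error path above (added): failed_ss records the
 * first subsystem whose can_attach()/can_attach_task() rejected the move,
 * so the rollback loop calls cancel_attach() only for the subsystems that
 * had already agreed, stopping when it reaches the one that failed.
 */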

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
        struct cgroupfs_root *root;
        int retval = 0;

        cgroup_lock();
        for_each_active_root(root) {
                struct cgroup *from_cg = task_cgroup_from_root(from, root);

                retval = cgroup_attach_task(from_cg, tsk);
                if (retval)
                        break;
        }
        cgroup_unlock();

        return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

/*
 * cgroup_attach_proc works in two stages, the first of which prefetches all
 * new css_sets needed (to make sure we have enough memory before committing
 * to the move) and stores them in a list of entries of the following type.
 * TODO: possible optimization: use css_set->rcu_head for chaining instead
 */
struct cg_list_entry {
        struct css_set *cg;
        struct list_head links;
};

static bool css_set_check_fetched(struct cgroup *cgrp,
                                  struct task_struct *tsk, struct css_set *cg,
                                  struct list_head *newcg_list)
{
        struct css_set *newcg;
        struct cg_list_entry *cg_entry;
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

        read_lock(&css_set_lock);
        newcg = find_existing_css_set(cg, cgrp, template);
        if (newcg)
                get_css_set(newcg);
        read_unlock(&css_set_lock);

        /* doesn't exist at all? */
        if (!newcg)
                return false;
        /* see if it's already in the list */
        list_for_each_entry(cg_entry, newcg_list, links) {
                if (cg_entry->cg == newcg) {
                        put_css_set(newcg);
                        return true;
                }
        }

        /* not found */
        put_css_set(newcg);
        return false;
}

/*
 * Find the new css_set and store it in the list in preparation for moving the
 * given task to the given cgroup. Returns 0 or -ENOMEM.
 */
static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
                            struct list_head *newcg_list)
{
        struct css_set *newcg;
        struct cg_list_entry *cg_entry;

        /* ensure a new css_set will exist for this thread */
        newcg = find_css_set(cg, cgrp);
        if (!newcg)
                return -ENOMEM;
        /* add it to the list */
        cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
        if (!cg_entry) {
                put_css_set(newcg);
                return -ENOMEM;
        }
        cg_entry->cg = newcg;
        list_add(&cg_entry->links, newcg_list);
        return 0;
}

/**
 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @leader: the threadgroup leader task_struct of the group to be attached
 *
 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
 * take task_lock of each thread in leader's threadgroup individually in turn.
 */
int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
        int retval, i, group_size;
        struct cgroup_subsys *ss, *failed_ss = NULL;
        bool cancel_failed_ss = false;
        /* guaranteed to be initialized later, but the compiler needs this */
        struct cgroup *oldcgrp = NULL;
        struct css_set *oldcg;
        struct cgroupfs_root *root = cgrp->root;
        /* threadgroup list cursor and array */
        struct task_struct *tsk;
        struct flex_array *group;
        /*
         * we need to make sure we have css_sets for all the tasks we're
         * going to move -before- we actually start moving them, so that in
         * case we get an ENOMEM we can bail out before making any changes.
         */
        struct list_head newcg_list;
        struct cg_list_entry *cg_entry, *temp_nobe;

        /*
         * step 0: in order to do expensive, possibly blocking operations for
         * every thread, we cannot iterate the thread group list, since it needs
         * rcu or tasklist locked. instead, build an array of all threads in the
         * group - threadgroup_fork_lock prevents new threads from appearing,
         * and if threads exit, this will just be an over-estimate.
         */
        group_size = get_nr_threads(leader);
        /* flex_array supports very large thread-groups better than kmalloc. */
        group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018
GFP_KERNEL);
2019
if (!group)
2020
return -ENOMEM;
2021
/* pre-allocate to guarantee space while iterating in rcu read-side. */
2022
retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023
if (retval)
2024
goto out_free_group_list;
2025
2026
/* prevent changes to the threadgroup list while we take a snapshot. */
2027
rcu_read_lock();
2028
if (!thread_group_leader(leader)) {
2029
/*
2030
* a race with de_thread from another thread's exec() may strip
2031
* us of our leadership, making while_each_thread unsafe to use
2032
* on this task. if this happens, there is no choice but to
2033
* throw this task away and try again (from cgroup_procs_write);
2034
* this is "double-double-toil-and-trouble-check locking".
2035
*/
2036
rcu_read_unlock();
2037
retval = -EAGAIN;
2038
goto out_free_group_list;
2039
}
2040
/* take a reference on each task in the group to go in the array. */
2041
tsk = leader;
2042
i = 0;
2043
do {
2044
/* as per above, nr_threads may decrease, but not increase. */
2045
BUG_ON(i >= group_size);
2046
get_task_struct(tsk);
2047
/*
2048
* saying GFP_ATOMIC has no effect here because we did prealloc
2049
* earlier, but it's good form to communicate our expectations.
2050
*/
2051
retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052
BUG_ON(retval != 0);
2053
i++;
2054
} while_each_thread(leader, tsk);
2055
/* remember the number of threads in the array for later. */
2056
group_size = i;
2057
rcu_read_unlock();
2058
2059
/*
2060
* step 1: check that we can legitimately attach to the cgroup.
2061
*/
2062
for_each_subsys(root, ss) {
2063
if (ss->can_attach) {
2064
retval = ss->can_attach(ss, cgrp, leader);
2065
if (retval) {
2066
failed_ss = ss;
2067
goto out_cancel_attach;
2068
}
2069
}
2070
/* a callback to be run on every thread in the threadgroup. */
2071
if (ss->can_attach_task) {
2072
/* run on each task in the threadgroup. */
2073
for (i = 0; i < group_size; i++) {
2074
tsk = flex_array_get_ptr(group, i);
2075
retval = ss->can_attach_task(cgrp, tsk);
2076
if (retval) {
2077
failed_ss = ss;
2078
cancel_failed_ss = true;
2079
goto out_cancel_attach;
2080
}
2081
}
2082
}
2083
}
2084
2085
/*
2086
* step 2: make sure css_sets exist for all threads to be migrated.
2087
* we use find_css_set, which allocates a new one if necessary.
2088
*/
2089
INIT_LIST_HEAD(&newcg_list);
2090
for (i = 0; i < group_size; i++) {
2091
tsk = flex_array_get_ptr(group, i);
2092
/* nothing to do if this task is already in the cgroup */
2093
oldcgrp = task_cgroup_from_root(tsk, root);
2094
if (cgrp == oldcgrp)
2095
continue;
2096
/* get old css_set pointer */
2097
task_lock(tsk);
2098
if (tsk->flags & PF_EXITING) {
2099
/* ignore this task if it's going away */
2100
task_unlock(tsk);
2101
continue;
2102
}
2103
oldcg = tsk->cgroups;
2104
get_css_set(oldcg);
2105
task_unlock(tsk);
2106
/* see if the new css_set for this task is already in the list */
2107
if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108
/* was already there, nothing to do. */
2109
put_css_set(oldcg);
2110
} else {
2111
/* we don't already have it. get new one. */
2112
retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113
put_css_set(oldcg);
2114
if (retval)
2115
goto out_list_teardown;
2116
}
2117
}
2118
2119
/*
2120
* step 3: now that we're guaranteed success wrt the css_sets, proceed
2121
* to move all tasks to the new cgroup, calling ss->attach_task for each
2122
* one along the way. there are no failure cases after here, so this is
2123
* the commit point.
2124
*/
2125
for_each_subsys(root, ss) {
2126
if (ss->pre_attach)
2127
ss->pre_attach(cgrp);
2128
}
2129
for (i = 0; i < group_size; i++) {
2130
tsk = flex_array_get_ptr(group, i);
2131
/* leave current thread as it is if it's already there */
2132
oldcgrp = task_cgroup_from_root(tsk, root);
2133
if (cgrp == oldcgrp)
2134
continue;
2135
/* attach each task to each subsystem */
2136
for_each_subsys(root, ss) {
2137
if (ss->attach_task)
2138
ss->attach_task(cgrp, tsk);
2139
}
2140
/* if the thread is PF_EXITING, it can just get skipped. */
2141
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142
BUG_ON(retval != 0 && retval != -ESRCH);
2143
}
2144
/* nothing is sensitive to fork() after this point. */
2145
2146
/*
2147
* step 4: do expensive, non-thread-specific subsystem callbacks.
2148
* TODO: if ever a subsystem needs to know the oldcgrp for each task
2149
* being moved, this call will need to be reworked to communicate that.
2150
*/
2151
for_each_subsys(root, ss) {
2152
if (ss->attach)
2153
ss->attach(ss, cgrp, oldcgrp, leader);
2154
}
2155
2156
/*
2157
* step 5: success! and cleanup
2158
*/
2159
synchronize_rcu();
2160
cgroup_wakeup_rmdir_waiter(cgrp);
2161
retval = 0;
2162
out_list_teardown:
2163
/* clean up the list of prefetched css_sets. */
2164
list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165
list_del(&cg_entry->links);
2166
put_css_set(cg_entry->cg);
2167
kfree(cg_entry);
2168
}
2169
out_cancel_attach:
2170
/* same deal as in cgroup_attach_task */
2171
if (retval) {
2172
for_each_subsys(root, ss) {
2173
if (ss == failed_ss) {
2174
if (cancel_failed_ss && ss->cancel_attach)
2175
ss->cancel_attach(ss, cgrp, leader);
2176
break;
2177
}
2178
if (ss->cancel_attach)
2179
ss->cancel_attach(ss, cgrp, leader);
2180
}
2181
}
2182
/* clean up the array of referenced threads in the group. */
2183
for (i = 0; i < group_size; i++) {
2184
tsk = flex_array_get_ptr(group, i);
2185
put_task_struct(tsk);
2186
}
2187
out_free_group_list:
2188
flex_array_free(group);
2189
return retval;
2190
}
2191
2192
/*
2193
* Find the task_struct of the task to attach by vpid and pass it along to the
2194
* function to attach either it or all tasks in its threadgroup. Will take
2195
* cgroup_mutex; may take task_lock of task.
2196
*/
2197
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2198
{
2199
struct task_struct *tsk;
2200
const struct cred *cred = current_cred(), *tcred;
2201
int ret;
2202
2203
if (!cgroup_lock_live_group(cgrp))
2204
return -ENODEV;
2205
2206
if (pid) {
2207
rcu_read_lock();
2208
tsk = find_task_by_vpid(pid);
2209
if (!tsk) {
2210
rcu_read_unlock();
2211
cgroup_unlock();
2212
return -ESRCH;
2213
}
2214
if (threadgroup) {
2215
/*
2216
* RCU protects this access, since tsk was found in the
2217
* tid map. a race with de_thread may cause group_leader
2218
* to stop being the leader, but cgroup_attach_proc will
2219
* detect it later.
2220
*/
2221
tsk = tsk->group_leader;
2222
} else if (tsk->flags & PF_EXITING) {
2223
/* optimization for the single-task-only case */
2224
rcu_read_unlock();
2225
cgroup_unlock();
2226
return -ESRCH;
2227
}
2228
2229
/*
2230
* even if we're attaching all tasks in the thread group, we
2231
* only need to check permissions on one of them.
2232
*/
2233
tcred = __task_cred(tsk);
2234
if (cred->euid &&
2235
cred->euid != tcred->uid &&
2236
cred->euid != tcred->suid) {
2237
rcu_read_unlock();
2238
cgroup_unlock();
2239
return -EACCES;
2240
}
2241
get_task_struct(tsk);
2242
rcu_read_unlock();
2243
} else {
2244
if (threadgroup)
2245
tsk = current->group_leader;
2246
else
2247
tsk = current;
2248
get_task_struct(tsk);
2249
}
2250
2251
if (threadgroup) {
2252
threadgroup_fork_write_lock(tsk);
2253
ret = cgroup_attach_proc(cgrp, tsk);
2254
threadgroup_fork_write_unlock(tsk);
2255
} else {
2256
ret = cgroup_attach_task(cgrp, tsk);
2257
}
2258
put_task_struct(tsk);
2259
cgroup_unlock();
2260
return ret;
2261
}
2262
2263
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2264
{
2265
return attach_task_by_pid(cgrp, pid, false);
2266
}
2267
2268
static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269
{
2270
int ret;
2271
do {
2272
/*
2273
* attach_proc fails with -EAGAIN if threadgroup leadership
2274
* changes in the middle of the operation, in which case we need
2275
* to find the task_struct for the new leader and start over.
2276
*/
2277
ret = attach_task_by_pid(cgrp, tgid, true);
2278
} while (ret == -EAGAIN);
2279
return ret;
2280
}
2281
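/*
 * From userspace the -EAGAIN retry above is invisible: moving a whole
 * thread group is a single write of the leader's TGID to the cgroup's
 * "cgroup.procs" file. A minimal sketch; the mount point and group
 * name below are assumptions:
 */
#if 0
#include <stdio.h>

static int example_move_threadgroup(pid_t tgid)
{
	FILE *f = fopen("/cgroup/mygroup/cgroup.procs", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", tgid);	/* writing 0 moves the caller's group */
	return fclose(f);
}
#endif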
2282
/**
2283
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2284
* @cgrp: the cgroup to be checked for liveness
2285
*
2286
* On success, returns true; the lock should be later released with
2287
* cgroup_unlock(). On failure returns false with no lock held.
2288
*/
2289
bool cgroup_lock_live_group(struct cgroup *cgrp)
2290
{
2291
mutex_lock(&cgroup_mutex);
2292
if (cgroup_is_removed(cgrp)) {
2293
mutex_unlock(&cgroup_mutex);
2294
return false;
2295
}
2296
return true;
2297
}
2298
EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2299
2300
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301
const char *buffer)
2302
{
2303
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2304
if (strlen(buffer) >= PATH_MAX)
2305
return -EINVAL;
2306
if (!cgroup_lock_live_group(cgrp))
2307
return -ENODEV;
2308
strcpy(cgrp->root->release_agent_path, buffer);
2309
cgroup_unlock();
2310
return 0;
2311
}
2312
2313
static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2314
struct seq_file *seq)
2315
{
2316
if (!cgroup_lock_live_group(cgrp))
2317
return -ENODEV;
2318
seq_puts(seq, cgrp->root->release_agent_path);
2319
seq_putc(seq, '\n');
2320
cgroup_unlock();
2321
return 0;
2322
}
2323
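/*
 * Usage note: writing an absolute path (at most PATH_MAX - 1 bytes),
 * e.g. "/sbin/example_agent" (a made-up name), into this file sets the
 * program the kernel spawns when a notify_on_release cgroup becomes
 * empty; reading the file returns the current path followed by a
 * newline.
 */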
2324
/* A buffer size big enough for numbers or short strings */
2325
#define CGROUP_LOCAL_BUFFER_SIZE 64
2326
2327
static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2328
struct file *file,
2329
const char __user *userbuf,
2330
size_t nbytes, loff_t *unused_ppos)
2331
{
2332
char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2333
int retval = 0;
2334
char *end;
2335
2336
if (!nbytes)
2337
return -EINVAL;
2338
if (nbytes >= sizeof(buffer))
2339
return -E2BIG;
2340
if (copy_from_user(buffer, userbuf, nbytes))
2341
return -EFAULT;
2342
2343
buffer[nbytes] = 0; /* nul-terminate */
2344
if (cft->write_u64) {
2345
u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2346
if (*end)
2347
return -EINVAL;
2348
retval = cft->write_u64(cgrp, cft, val);
2349
} else {
2350
s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2351
if (*end)
2352
return -EINVAL;
2353
retval = cft->write_s64(cgrp, cft, val);
2354
}
2355
if (!retval)
2356
retval = nbytes;
2357
return retval;
2358
}
2359
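/*
 * Example of the parsing above: writing "  42\n" to a write_u64 file
 * is stripped to "42" and results in cft->write_u64(cgrp, cft, 42),
 * while "42x" leaves *end pointing at 'x' and fails with -EINVAL.
 */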
2360
static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2361
struct file *file,
2362
const char __user *userbuf,
2363
size_t nbytes, loff_t *unused_ppos)
2364
{
2365
char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2366
int retval = 0;
2367
size_t max_bytes = cft->max_write_len;
2368
char *buffer = local_buffer;
2369
2370
if (!max_bytes)
2371
max_bytes = sizeof(local_buffer) - 1;
2372
if (nbytes >= max_bytes)
2373
return -E2BIG;
2374
/* Allocate a dynamic buffer if we need one */
2375
if (nbytes >= sizeof(local_buffer)) {
2376
buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2377
if (buffer == NULL)
2378
return -ENOMEM;
2379
}
2380
if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2381
retval = -EFAULT;
2382
goto out;
2383
}
2384
2385
buffer[nbytes] = 0; /* nul-terminate */
2386
retval = cft->write_string(cgrp, cft, strstrip(buffer));
2387
if (!retval)
2388
retval = nbytes;
2389
out:
2390
if (buffer != local_buffer)
2391
kfree(buffer);
2392
return retval;
2393
}
2394
2395
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2396
size_t nbytes, loff_t *ppos)
2397
{
2398
struct cftype *cft = __d_cft(file->f_dentry);
2399
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2400
2401
if (cgroup_is_removed(cgrp))
2402
return -ENODEV;
2403
if (cft->write)
2404
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
2405
if (cft->write_u64 || cft->write_s64)
2406
return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
2407
if (cft->write_string)
2408
return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
2409
if (cft->trigger) {
2410
int ret = cft->trigger(cgrp, (unsigned int)cft->private);
2411
return ret ? ret : nbytes;
2412
}
2413
return -EINVAL;
2414
}
2415
2416
static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
2417
struct file *file,
2418
char __user *buf, size_t nbytes,
2419
loff_t *ppos)
2420
{
2421
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2422
u64 val = cft->read_u64(cgrp, cft);
2423
int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2424
2425
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2426
}
2427
2428
static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
2429
struct file *file,
2430
char __user *buf, size_t nbytes,
2431
loff_t *ppos)
2432
{
2433
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2434
s64 val = cft->read_s64(cgrp, cft);
2435
int len = sprintf(tmp, "%lld\n", (long long) val);
2436
2437
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2438
}
2439
2440
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2441
size_t nbytes, loff_t *ppos)
2442
{
2443
struct cftype *cft = __d_cft(file->f_dentry);
2444
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2445
2446
if (cgroup_is_removed(cgrp))
2447
return -ENODEV;
2448
2449
if (cft->read)
2450
return cft->read(cgrp, cft, file, buf, nbytes, ppos);
2451
if (cft->read_u64)
2452
return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
2453
if (cft->read_s64)
2454
return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
2455
return -EINVAL;
2456
}
2457
2458
/*
2459
* seqfile ops/methods for returning structured data. Currently just
2460
* supports string->u64 maps, but can be extended in future.
2461
*/
2462
2463
struct cgroup_seqfile_state {
2464
struct cftype *cft;
2465
struct cgroup *cgroup;
2466
};
2467
2468
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2469
{
2470
struct seq_file *sf = cb->state;
2471
return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2472
}
2473
2474
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2475
{
2476
struct cgroup_seqfile_state *state = m->private;
2477
struct cftype *cft = state->cft;
2478
if (cft->read_map) {
2479
struct cgroup_map_cb cb = {
2480
.fill = cgroup_map_add,
2481
.state = m,
2482
};
2483
return cft->read_map(state->cgroup, cft, &cb);
2484
}
2485
return cft->read_seq_string(state->cgroup, cft, m);
2486
}
2487
2488
static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2489
{
2490
struct seq_file *seq = file->private_data;
2491
kfree(seq->private);
2492
return single_release(inode, file);
2493
}
2494
2495
static const struct file_operations cgroup_seqfile_operations = {
2496
.read = seq_read,
2497
.write = cgroup_file_write,
2498
.llseek = seq_lseek,
2499
.release = cgroup_seqfile_release,
2500
};
2501
2502
static int cgroup_file_open(struct inode *inode, struct file *file)
2503
{
2504
int err;
2505
struct cftype *cft;
2506
2507
err = generic_file_open(inode, file);
2508
if (err)
2509
return err;
2510
cft = __d_cft(file->f_dentry);
2511
2512
if (cft->read_map || cft->read_seq_string) {
2513
struct cgroup_seqfile_state *state =
2514
kzalloc(sizeof(*state), GFP_USER);
2515
if (!state)
2516
return -ENOMEM;
2517
state->cft = cft;
2518
state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2519
file->f_op = &cgroup_seqfile_operations;
2520
err = single_open(file, cgroup_seqfile_show, state);
2521
if (err < 0)
2522
kfree(state);
2523
} else if (cft->open)
2524
err = cft->open(inode, file);
2525
else
2526
err = 0;
2527
2528
return err;
2529
}
2530
2531
static int cgroup_file_release(struct inode *inode, struct file *file)
2532
{
2533
struct cftype *cft = __d_cft(file->f_dentry);
2534
if (cft->release)
2535
return cft->release(inode, file);
2536
return 0;
2537
}
2538
2539
/*
2540
* cgroup_rename - Only allow simple rename of directories in place.
2541
*/
2542
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2543
struct inode *new_dir, struct dentry *new_dentry)
2544
{
2545
if (!S_ISDIR(old_dentry->d_inode->i_mode))
2546
return -ENOTDIR;
2547
if (new_dentry->d_inode)
2548
return -EEXIST;
2549
if (old_dir != new_dir)
2550
return -EIO;
2551
return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552
}
2553
2554
static const struct file_operations cgroup_file_operations = {
2555
.read = cgroup_file_read,
2556
.write = cgroup_file_write,
2557
.llseek = generic_file_llseek,
2558
.open = cgroup_file_open,
2559
.release = cgroup_file_release,
2560
};
2561
2562
static const struct inode_operations cgroup_dir_inode_operations = {
2563
.lookup = cgroup_lookup,
2564
.mkdir = cgroup_mkdir,
2565
.rmdir = cgroup_rmdir,
2566
.rename = cgroup_rename,
2567
};
2568
2569
static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2570
{
2571
if (dentry->d_name.len > NAME_MAX)
2572
return ERR_PTR(-ENAMETOOLONG);
2573
d_add(dentry, NULL);
2574
return NULL;
2575
}
2576
2577
/*
2578
* Check if a file is a control file
2579
*/
2580
static inline struct cftype *__file_cft(struct file *file)
2581
{
2582
if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2583
return ERR_PTR(-EINVAL);
2584
return __d_cft(file->f_dentry);
2585
}
2586
2587
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2588
struct super_block *sb)
2589
{
2590
struct inode *inode;
2591
2592
if (!dentry)
2593
return -ENOENT;
2594
if (dentry->d_inode)
2595
return -EEXIST;
2596
2597
inode = cgroup_new_inode(mode, sb);
2598
if (!inode)
2599
return -ENOMEM;
2600
2601
if (S_ISDIR(mode)) {
2602
inode->i_op = &cgroup_dir_inode_operations;
2603
inode->i_fop = &simple_dir_operations;
2604
2605
/* start off with i_nlink == 2 (for "." entry) */
2606
inc_nlink(inode);
2607
2608
/* start with the directory inode held, so that we can
2609
* populate it without racing with another mkdir */
2610
mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2611
} else if (S_ISREG(mode)) {
2612
inode->i_size = 0;
2613
inode->i_fop = &cgroup_file_operations;
2614
}
2615
d_instantiate(dentry, inode);
2616
dget(dentry); /* Extra count - pin the dentry in core */
2617
return 0;
2618
}
2619
2620
/*
2621
* cgroup_create_dir - create a directory for an object.
2622
* @cgrp: the cgroup we create the directory for. It must have a valid
2623
* ->parent field. And we are going to fill its ->dentry field.
2624
* @dentry: dentry of the new cgroup
2625
* @mode: mode to set on new directory.
2626
*/
2627
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2628
mode_t mode)
2629
{
2630
struct dentry *parent;
2631
int error = 0;
2632
2633
parent = cgrp->parent->dentry;
2634
error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2635
if (!error) {
2636
dentry->d_fsdata = cgrp;
2637
inc_nlink(parent->d_inode);
2638
rcu_assign_pointer(cgrp->dentry, dentry);
2639
dget(dentry);
2640
}
2641
dput(dentry);
2642
2643
return error;
2644
}
2645
2646
/**
2647
* cgroup_file_mode - deduce file mode of a control file
2648
* @cft: the control file in question
2649
*
2650
* returns cft->mode if ->mode is not 0
2651
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2652
* returns S_IRUGO if it has only a read handler
2653
* returns S_IWUSR if it has only a write handler
2654
*/
2655
static mode_t cgroup_file_mode(const struct cftype *cft)
2656
{
2657
mode_t mode = 0;
2658
2659
if (cft->mode)
2660
return cft->mode;
2661
2662
if (cft->read || cft->read_u64 || cft->read_s64 ||
2663
cft->read_map || cft->read_seq_string)
2664
mode |= S_IRUGO;
2665
2666
if (cft->write || cft->write_u64 || cft->write_s64 ||
2667
cft->write_string || cft->trigger)
2668
mode |= S_IWUSR;
2669
2670
return mode;
2671
}
2672
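/*
 * For example, the "notify_on_release" file defined later in this file
 * has read_u64 and write_u64 handlers and no explicit ->mode, so its
 * mode is deduced as S_IRUGO | S_IWUSR (0644); a file with only a read
 * handler would get S_IRUGO (0444).
 */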
2673
int cgroup_add_file(struct cgroup *cgrp,
2674
struct cgroup_subsys *subsys,
2675
const struct cftype *cft)
2676
{
2677
struct dentry *dir = cgrp->dentry;
2678
struct dentry *dentry;
2679
int error;
2680
mode_t mode;
2681
2682
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2684
strcpy(name, subsys->name);
2685
strcat(name, ".");
2686
}
2687
strcat(name, cft->name);
2688
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2689
dentry = lookup_one_len(name, dir, strlen(name));
2690
if (!IS_ERR(dentry)) {
2691
mode = cgroup_file_mode(cft);
2692
error = cgroup_create_file(dentry, mode | S_IFREG,
2693
cgrp->root->sb);
2694
if (!error)
2695
dentry->d_fsdata = (void *)cft;
2696
dput(dentry);
2697
} else
2698
error = PTR_ERR(dentry);
2699
return error;
2700
}
2701
EXPORT_SYMBOL_GPL(cgroup_add_file);
2702
2703
int cgroup_add_files(struct cgroup *cgrp,
2704
struct cgroup_subsys *subsys,
2705
const struct cftype cft[],
2706
int count)
2707
{
2708
int i, err;
2709
for (i = 0; i < count; i++) {
2710
err = cgroup_add_file(cgrp, subsys, &cft[i]);
2711
if (err)
2712
return err;
2713
}
2714
return 0;
2715
}
2716
EXPORT_SYMBOL_GPL(cgroup_add_files);
2717
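/*
 * Illustrative sketch of a subsystem exporting a control file through
 * this interface; the subsystem, file name, helper and field are all
 * made up:
 */
#if 0
static u64 example_limit_read(struct cgroup *cgrp, struct cftype *cft)
{
	return example_css(cgrp)->limit;	/* hypothetical accessor */
}

static struct cftype example_files[] = {
	{
		.name = "limit",
		.read_u64 = example_limit_read,
	},
};

/* called back from cgroup_populate_dir() for each cgroup directory */
static int example_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* creates "example.limit" unless the root was mounted noprefix */
	return cgroup_add_files(cgrp, ss, example_files,
				ARRAY_SIZE(example_files));
}
#endif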
2718
/**
2719
* cgroup_task_count - count the number of tasks in a cgroup.
2720
* @cgrp: the cgroup in question
2721
*
2722
* Return the number of tasks in the cgroup.
2723
*/
2724
int cgroup_task_count(const struct cgroup *cgrp)
2725
{
2726
int count = 0;
2727
struct cg_cgroup_link *link;
2728
2729
read_lock(&css_set_lock);
2730
list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
2731
count += atomic_read(&link->cg->refcount);
2732
}
2733
read_unlock(&css_set_lock);
2734
return count;
2735
}
2736
2737
/*
2738
* Advance a list_head iterator. The iterator should be positioned at
2739
* the start of a css_set
2740
*/
2741
static void cgroup_advance_iter(struct cgroup *cgrp,
2742
struct cgroup_iter *it)
2743
{
2744
struct list_head *l = it->cg_link;
2745
struct cg_cgroup_link *link;
2746
struct css_set *cg;
2747
2748
/* Advance to the next non-empty css_set */
2749
do {
2750
l = l->next;
2751
if (l == &cgrp->css_sets) {
2752
it->cg_link = NULL;
2753
return;
2754
}
2755
link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
2756
cg = link->cg;
2757
} while (list_empty(&cg->tasks));
2758
it->cg_link = l;
2759
it->task = cg->tasks.next;
2760
}
2761
2762
/*
2763
* To reduce the fork() overhead for systems that are not actually
2764
* using their cgroups capability, we don't maintain the lists running
2765
* through each css_set to its tasks until we see the list actually
2766
* used - in other words after the first call to cgroup_iter_start().
2767
*
2768
* The tasklist_lock is not held here, as do_each_thread() and
2769
* while_each_thread() are protected by RCU.
2770
*/
2771
static void cgroup_enable_task_cg_lists(void)
2772
{
2773
struct task_struct *p, *g;
2774
write_lock(&css_set_lock);
2775
use_task_css_set_links = 1;
2776
do_each_thread(g, p) {
2777
task_lock(p);
2778
/*
2779
* We should check if the process is exiting, otherwise
2780
* it will race with cgroup_exit() in that the list
2781
* entry won't be deleted though the process has exited.
2782
*/
2783
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2784
list_add(&p->cg_list, &p->cgroups->tasks);
2785
task_unlock(p);
2786
} while_each_thread(g, p);
2787
write_unlock(&css_set_lock);
2788
}
2789
2790
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2791
{
2792
/*
2793
* The first time anyone tries to iterate across a cgroup,
2794
* we need to enable the list linking each css_set to its
2795
* tasks, and fix up all existing tasks.
2796
*/
2797
if (!use_task_css_set_links)
2798
cgroup_enable_task_cg_lists();
2799
2800
read_lock(&css_set_lock);
2801
it->cg_link = &cgrp->css_sets;
2802
cgroup_advance_iter(cgrp, it);
2803
}
2804
2805
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2806
struct cgroup_iter *it)
2807
{
2808
struct task_struct *res;
2809
struct list_head *l = it->task;
2810
struct cg_cgroup_link *link;
2811
2812
/* If the iterator's cg_link is NULL, we have no tasks */
2813
if (!it->cg_link)
2814
return NULL;
2815
res = list_entry(l, struct task_struct, cg_list);
2816
/* Advance iterator to find next entry */
2817
l = l->next;
2818
link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
2819
if (l == &link->cg->tasks) {
2820
/* We reached the end of this task list - move on to
2821
* the next cg_cgroup_link */
2822
cgroup_advance_iter(cgrp, it);
2823
} else {
2824
it->task = l;
2825
}
2826
return res;
2827
}
2828
2829
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2830
{
2831
read_unlock(&css_set_lock);
2832
}
2833
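/*
 * Canonical use of the iterator above, as a sketch (compare
 * cgroupstats_build() later in this file). css_set_lock is read-held
 * from cgroup_iter_start() to cgroup_iter_end(), so the loop body must
 * not sleep.
 */
#if 0
static int example_count_running(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;
	int nr_running = 0;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		if (tsk->state == TASK_RUNNING)
			nr_running++;
	}
	cgroup_iter_end(cgrp, &it);
	return nr_running;
}
#endif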
2834
static inline int started_after_time(struct task_struct *t1,
2835
struct timespec *time,
2836
struct task_struct *t2)
2837
{
2838
int start_diff = timespec_compare(&t1->start_time, time);
2839
if (start_diff > 0) {
2840
return 1;
2841
} else if (start_diff < 0) {
2842
return 0;
2843
} else {
2844
/*
2845
* Arbitrarily, if two processes started at the same
2846
* time, we'll say that the lower pointer value
2847
* started first. Note that t2 may have exited by now
2848
* so this may not be a valid pointer any longer, but
2849
* that's fine - it still serves to distinguish
2850
* between two tasks started (effectively) simultaneously.
2851
*/
2852
return t1 > t2;
2853
}
2854
}
2855
2856
/*
2857
* This function is a callback from heap_insert() and is used to order
2858
* the heap.
2859
* In this case we order the heap in descending task start time.
2860
*/
2861
static inline int started_after(void *p1, void *p2)
2862
{
2863
struct task_struct *t1 = p1;
2864
struct task_struct *t2 = p2;
2865
return started_after_time(t1, &t2->start_time, t2);
2866
}
2867
2868
/**
2869
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
2870
* @scan: struct cgroup_scanner containing arguments for the scan
2871
*
2872
* Arguments include pointers to callback functions test_task() and
2873
* process_task().
2874
* Iterate through all the tasks in a cgroup, calling test_task() for each,
2875
* and if it returns true, call process_task() for it also.
2876
* The test_task pointer may be NULL, meaning always true (select all tasks).
2877
* Effectively duplicates cgroup_iter_{start,next,end}()
2878
* but does not lock css_set_lock for the call to process_task().
2879
* The struct cgroup_scanner may be embedded in any structure of the caller's
2880
* creation.
2881
* It is guaranteed that process_task() will act on every task that
2882
* is a member of the cgroup for the duration of this call. This
2883
* function may or may not call process_task() for tasks that exit
2884
* or move to a different cgroup during the call, or are forked or
2885
* move into the cgroup during the call.
2886
*
2887
* Note that test_task() may be called with locks held, and may in some
2888
* situations be called multiple times for the same task, so it should
2889
* be cheap.
2890
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
2891
* pre-allocated and will be used for heap operations (and its "gt" member will
2892
* be overwritten), else a temporary heap will be used (allocation of which
2893
* may cause this function to fail).
2894
*/
2895
int cgroup_scan_tasks(struct cgroup_scanner *scan)
2896
{
2897
int retval, i;
2898
struct cgroup_iter it;
2899
struct task_struct *p, *dropped;
2900
/* Never dereference latest_task, since it's not refcounted */
2901
struct task_struct *latest_task = NULL;
2902
struct ptr_heap tmp_heap;
2903
struct ptr_heap *heap;
2904
struct timespec latest_time = { 0, 0 };
2905
2906
if (scan->heap) {
2907
/* The caller supplied our heap and pre-allocated its memory */
2908
heap = scan->heap;
2909
heap->gt = &started_after;
2910
} else {
2911
/* We need to allocate our own heap memory */
2912
heap = &tmp_heap;
2913
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
2914
if (retval)
2915
/* cannot allocate the heap */
2916
return retval;
2917
}
2918
2919
again:
2920
/*
2921
* Scan tasks in the cgroup, using the scanner's "test_task" callback
2922
* to determine which are of interest, and using the scanner's
2923
* "process_task" callback to process any of them that need an update.
2924
* Since we don't want to hold any locks during the task updates,
2925
* gather tasks to be processed in a heap structure.
2926
* The heap is sorted by descending task start time.
2927
* If the statically-sized heap fills up, we overflow tasks that
2928
* started later, and in future iterations only consider tasks that
2929
* started after the latest task in the previous pass. This
2930
* guarantees forward progress and that we don't miss any tasks.
2931
*/
2932
heap->size = 0;
2933
cgroup_iter_start(scan->cg, &it);
2934
while ((p = cgroup_iter_next(scan->cg, &it))) {
2935
/*
2936
* Only affect tasks that qualify per the caller's callback,
2937
* if the caller provided one
2938
*/
2939
if (scan->test_task && !scan->test_task(p, scan))
2940
continue;
2941
/*
2942
* Only process tasks that started after the last task
2943
* we processed
2944
*/
2945
if (!started_after_time(p, &latest_time, latest_task))
2946
continue;
2947
dropped = heap_insert(heap, p);
2948
if (dropped == NULL) {
2949
/*
2950
* The new task was inserted; the heap wasn't
2951
* previously full
2952
*/
2953
get_task_struct(p);
2954
} else if (dropped != p) {
2955
/*
2956
* The new task was inserted, and pushed out a
2957
* different task
2958
*/
2959
get_task_struct(p);
2960
put_task_struct(dropped);
2961
}
2962
/*
2963
* Else the new task was newer than anything already in
2964
* the heap and wasn't inserted
2965
*/
2966
}
2967
cgroup_iter_end(scan->cg, &it);
2968
2969
if (heap->size) {
2970
for (i = 0; i < heap->size; i++) {
2971
struct task_struct *q = heap->ptrs[i];
2972
if (i == 0) {
2973
latest_time = q->start_time;
2974
latest_task = q;
2975
}
2976
/* Process the task per the caller's callback */
2977
scan->process_task(q, scan);
2978
put_task_struct(q);
2979
}
2980
/*
2981
* If we had to process any tasks at all, scan again
2982
* in case some of them were in the middle of forking
2983
* children that didn't get processed.
2984
* Not the most efficient way to do it, but it avoids
2985
* having to take callback_mutex in the fork path
2986
*/
2987
goto again;
2988
}
2989
if (heap == &tmp_heap)
2990
heap_free(&tmp_heap);
2991
return 0;
2992
}
2993
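/*
 * Illustrative caller of cgroup_scan_tasks(); the callbacks here are
 * made up (cpuset uses this pattern for real). test_task() may run
 * under css_set_lock and should be cheap; process_task() runs without
 * it and may do heavier work.
 */
#if 0
static int example_test(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_KTHREAD);	/* skip kernel threads */
}

static void example_process(struct task_struct *p,
			    struct cgroup_scanner *scan)
{
	/* update per-task state here; no css_set_lock held */
}

static int example_scan(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg = cgrp,
		.test_task = example_test,
		.process_task = example_process,
		.heap = NULL,	/* let cgroup_scan_tasks() allocate one */
	};

	return cgroup_scan_tasks(&scan);
}
#endif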
2994
/*
2995
* Stuff for reading the 'tasks'/'procs' files.
2996
*
2997
* Reading this file can return large amounts of data if a cgroup has
2998
* *lots* of attached tasks. So it may need several calls to read(),
2999
* but we cannot guarantee that the information we produce is correct
3000
* unless we produce it entirely atomically.
3001
*
3002
*/
3003
3004
/*
3005
* The following two functions "fix" the issue where there are more pids
3006
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3007
* TODO: replace with a kernel-wide solution to this problem
3008
*/
3009
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3010
static void *pidlist_allocate(int count)
3011
{
3012
if (PIDLIST_TOO_LARGE(count))
3013
return vmalloc(count * sizeof(pid_t));
3014
else
3015
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3016
}
3017
static void pidlist_free(void *p)
3018
{
3019
if (is_vmalloc_addr(p))
3020
vfree(p);
3021
else
3022
kfree(p);
3023
}
3024
static void *pidlist_resize(void *p, int newcount)
3025
{
3026
void *newlist;
3027
/* note: if new alloc fails, old p will still be valid either way */
3028
if (is_vmalloc_addr(p)) {
3029
newlist = vmalloc(newcount * sizeof(pid_t));
3030
if (!newlist)
3031
return NULL;
3032
memcpy(newlist, p, newcount * sizeof(pid_t));
3033
vfree(p);
3034
} else {
3035
newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3036
}
3037
return newlist;
3038
}
3039
3040
/*
3041
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3042
* If the new stripped list is sufficiently smaller and there's enough memory
3043
* to allocate a new buffer, will let go of the unneeded memory. Returns the
3044
* number of unique elements.
3045
*/
3046
/* is the size difference enough that we should re-allocate the array? */
3047
#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3048
static int pidlist_uniq(pid_t **p, int length)
3049
{
3050
int src, dest = 1;
3051
pid_t *list = *p;
3052
pid_t *newlist;
3053
3054
/*
3055
* we presume the 0th element is unique, so src starts at 1. trivial
3056
* edge cases first; no work needs to be done for either
3057
*/
3058
if (length == 0 || length == 1)
3059
return length;
3060
/* src and dest walk down the list; dest counts unique elements */
3061
for (src = 1; src < length; src++) {
3062
/* find next unique element */
3063
while (list[src] == list[src-1]) {
3064
src++;
3065
if (src == length)
3066
goto after;
3067
}
3068
/* dest always points to where the next unique element goes */
3069
list[dest] = list[src];
3070
dest++;
3071
}
3072
after:
3073
/*
3074
* if the length difference is large enough, we want to allocate a
3075
* smaller buffer to save memory. if this fails due to out of memory,
3076
* we'll just stay with what we've got.
3077
*/
3078
if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3079
newlist = pidlist_resize(list, dest);
3080
if (newlist)
3081
*p = newlist;
3082
}
3083
return dest;
3084
}
3085
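/*
 * Worked example: given the sorted input {3, 3, 5, 7, 7, 7} with
 * length == 6, the loop above compacts the array to {3, 5, 7, ...}
 * and returns dest == 3. The stale tail entries stay allocated unless
 * the list is large enough for the shrinking reallocation to trigger.
 */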
3086
static int cmppid(const void *a, const void *b)
3087
{
3088
return *(pid_t *)a - *(pid_t *)b;
3089
}
3090
3091
/*
3092
* find the appropriate pidlist for our purpose (given procs vs tasks)
3093
* returns with the lock on that pidlist already held, and takes care
3094
* of the use count, or returns NULL with no locks held if we're out of
3095
* memory.
3096
*/
3097
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3098
enum cgroup_filetype type)
3099
{
3100
struct cgroup_pidlist *l;
3101
/* don't need task_nsproxy() if we're looking at ourselves */
3102
struct pid_namespace *ns = current->nsproxy->pid_ns;
3103
3104
/*
3105
* We can't drop the pidlist_mutex before taking the l->mutex in case
3106
* the last ref-holder is trying to remove l from the list at the same
3107
* time. Holding the pidlist_mutex precludes somebody taking whichever
3108
* list we find out from under us - compare cgroup_release_pid_array().
3109
*/
3110
mutex_lock(&cgrp->pidlist_mutex);
3111
list_for_each_entry(l, &cgrp->pidlists, links) {
3112
if (l->key.type == type && l->key.ns == ns) {
3113
/* make sure l doesn't vanish out from under us */
3114
down_write(&l->mutex);
3115
mutex_unlock(&cgrp->pidlist_mutex);
3116
return l;
3117
}
3118
}
3119
/* entry not found; create a new one */
3120
l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3121
if (!l) {
3122
mutex_unlock(&cgrp->pidlist_mutex);
3123
return l;
3124
}
3125
init_rwsem(&l->mutex);
3126
down_write(&l->mutex);
3127
l->key.type = type;
3128
l->key.ns = get_pid_ns(ns);
3129
l->use_count = 0; /* don't increment here */
3130
l->list = NULL;
3131
l->owner = cgrp;
3132
list_add(&l->links, &cgrp->pidlists);
3133
mutex_unlock(&cgrp->pidlist_mutex);
3134
return l;
3135
}
3136
3137
/*
3138
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
3139
*/
3140
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3141
struct cgroup_pidlist **lp)
3142
{
3143
pid_t *array;
3144
int length;
3145
int pid, n = 0; /* used for populating the array */
3146
struct cgroup_iter it;
3147
struct task_struct *tsk;
3148
struct cgroup_pidlist *l;
3149
3150
/*
3151
* If cgroup gets more users after we read count, we won't have
3152
* enough space - tough. This race is indistinguishable to the
3153
* caller from the case that the additional cgroup users didn't
3154
* show up until sometime later on.
3155
*/
3156
length = cgroup_task_count(cgrp);
3157
array = pidlist_allocate(length);
3158
if (!array)
3159
return -ENOMEM;
3160
/* now, populate the array */
3161
cgroup_iter_start(cgrp, &it);
3162
while ((tsk = cgroup_iter_next(cgrp, &it))) {
3163
if (unlikely(n == length))
3164
break;
3165
/* get tgid or pid for procs or tasks file respectively */
3166
if (type == CGROUP_FILE_PROCS)
3167
pid = task_tgid_vnr(tsk);
3168
else
3169
pid = task_pid_vnr(tsk);
3170
if (pid > 0) /* make sure to only use valid results */
3171
array[n++] = pid;
3172
}
3173
cgroup_iter_end(cgrp, &it);
3174
length = n;
3175
/* now sort & (if procs) strip out duplicates */
3176
sort(array, length, sizeof(pid_t), cmppid, NULL);
3177
if (type == CGROUP_FILE_PROCS)
3178
length = pidlist_uniq(&array, length);
3179
l = cgroup_pidlist_find(cgrp, type);
3180
if (!l) {
3181
pidlist_free(array);
3182
return -ENOMEM;
3183
}
3184
/* store array, freeing old if necessary - lock already held */
3185
pidlist_free(l->list);
3186
l->list = array;
3187
l->length = length;
3188
l->use_count++;
3189
up_write(&l->mutex);
3190
*lp = l;
3191
return 0;
3192
}
3193
3194
/**
3195
* cgroupstats_build - build and fill cgroupstats
3196
* @stats: cgroupstats to fill information into
3197
* @dentry: A dentry entry belonging to the cgroup for which stats have
3198
* been requested.
3199
*
3200
* Build and fill cgroupstats so that taskstats can export it to user
3201
* space.
3202
*/
3203
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3204
{
3205
int ret = -EINVAL;
3206
struct cgroup *cgrp;
3207
struct cgroup_iter it;
3208
struct task_struct *tsk;
3209
3210
/*
3211
* Validate dentry by checking the superblock operations,
3212
* and make sure it's a directory.
3213
*/
3214
if (dentry->d_sb->s_op != &cgroup_ops ||
3215
!S_ISDIR(dentry->d_inode->i_mode))
3216
goto err;
3217
3218
ret = 0;
3219
cgrp = dentry->d_fsdata;
3220
3221
cgroup_iter_start(cgrp, &it);
3222
while ((tsk = cgroup_iter_next(cgrp, &it))) {
3223
switch (tsk->state) {
3224
case TASK_RUNNING:
3225
stats->nr_running++;
3226
break;
3227
case TASK_INTERRUPTIBLE:
3228
stats->nr_sleeping++;
3229
break;
3230
case TASK_UNINTERRUPTIBLE:
3231
stats->nr_uninterruptible++;
3232
break;
3233
case TASK_STOPPED:
3234
stats->nr_stopped++;
3235
break;
3236
default:
3237
if (delayacct_is_task_waiting_on_io(tsk))
3238
stats->nr_io_wait++;
3239
break;
3240
}
3241
}
3242
cgroup_iter_end(cgrp, &it);
3243
3244
err:
3245
return ret;
3246
}
3247
3248
3249
/*
3250
* seq_file methods for the tasks/procs files. The seq_file position is the
3251
* next pid to display; the seq_file iterator is a pointer to the pid
3252
* in the pidlist's ->list array.
3253
*/
3254
3255
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3256
{
3257
/*
3258
* Initially we receive a position value that corresponds to
3259
* one more than the last pid shown (or 0 on the first call or
3260
* after a seek to the start). Use a binary-search to find the
3261
* next pid to display, if any
3262
*/
3263
struct cgroup_pidlist *l = s->private;
3264
int index = 0, pid = *pos;
3265
int *iter;
3266
3267
down_read(&l->mutex);
3268
if (pid) {
3269
int end = l->length;
3270
3271
while (index < end) {
3272
int mid = (index + end) / 2;
3273
if (l->list[mid] == pid) {
3274
index = mid;
3275
break;
3276
} else if (l->list[mid] <= pid)
3277
index = mid + 1;
3278
else
3279
end = mid;
3280
}
3281
}
3282
/* If we're off the end of the array, we're done */
3283
if (index >= l->length)
3284
return NULL;
3285
/* Update the abstract position to be the actual pid that we found */
3286
iter = l->list + index;
3287
*pos = *iter;
3288
return iter;
3289
}
3290
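/*
 * Worked example of the search above: with l->list == {2, 5, 9} and a
 * saved position of *pos == 6 (one past the last pid shown, which has
 * since disappeared from the list), the binary search converges on
 * index 2, so iteration resumes at pid 9 and *pos is updated to 9.
 */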
3291
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3292
{
3293
struct cgroup_pidlist *l = s->private;
3294
up_read(&l->mutex);
3295
}
3296
3297
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3298
{
3299
struct cgroup_pidlist *l = s->private;
3300
pid_t *p = v;
3301
pid_t *end = l->list + l->length;
3302
/*
3303
* Advance to the next pid in the array. If this goes off the
3304
* end, we're done
3305
*/
3306
p++;
3307
if (p >= end) {
3308
return NULL;
3309
} else {
3310
*pos = *p;
3311
return p;
3312
}
3313
}
3314
3315
static int cgroup_pidlist_show(struct seq_file *s, void *v)
3316
{
3317
return seq_printf(s, "%d\n", *(int *)v);
3318
}
3319
3320
/*
3321
* seq_operations functions for iterating on pidlists through seq_file -
3322
* independent of whether it's tasks or procs
3323
*/
3324
static const struct seq_operations cgroup_pidlist_seq_operations = {
3325
.start = cgroup_pidlist_start,
3326
.stop = cgroup_pidlist_stop,
3327
.next = cgroup_pidlist_next,
3328
.show = cgroup_pidlist_show,
3329
};
3330
3331
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3332
{
3333
/*
3334
* the case where we're the last user of this particular pidlist will
3335
* have us remove it from the cgroup's list, which entails taking the
3336
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3337
* pidlist_mutex, we have to take pidlist_mutex first.
3338
*/
3339
mutex_lock(&l->owner->pidlist_mutex);
3340
down_write(&l->mutex);
3341
BUG_ON(!l->use_count);
3342
if (!--l->use_count) {
3343
/* we're the last user if refcount is 0; remove and free */
3344
list_del(&l->links);
3345
mutex_unlock(&l->owner->pidlist_mutex);
3346
pidlist_free(l->list);
3347
put_pid_ns(l->key.ns);
3348
up_write(&l->mutex);
3349
kfree(l);
3350
return;
3351
}
3352
mutex_unlock(&l->owner->pidlist_mutex);
3353
up_write(&l->mutex);
3354
}
3355
3356
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3357
{
3358
struct cgroup_pidlist *l;
3359
if (!(file->f_mode & FMODE_READ))
3360
return 0;
3361
/*
3362
* the seq_file will only be initialized if the file was opened for
3363
* reading, so only then does file->private_data point at one.
3364
*/
3365
l = ((struct seq_file *)file->private_data)->private;
3366
cgroup_release_pid_array(l);
3367
return seq_release(inode, file);
3368
}
3369
3370
static const struct file_operations cgroup_pidlist_operations = {
3371
.read = seq_read,
3372
.llseek = seq_lseek,
3373
.write = cgroup_file_write,
3374
.release = cgroup_pidlist_release,
3375
};
3376
3377
/*
3378
* The following functions handle opens on a file that displays a pidlist
3379
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3380
* in the cgroup.
3381
*/
3382
/* helper function for the two below it */
3383
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3384
{
3385
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3386
struct cgroup_pidlist *l;
3387
int retval;
3388
3389
/* Nothing to do for write-only files */
3390
if (!(file->f_mode & FMODE_READ))
3391
return 0;
3392
3393
/* have the array populated */
3394
retval = pidlist_array_load(cgrp, type, &l);
3395
if (retval)
3396
return retval;
3397
/* configure file information */
3398
file->f_op = &cgroup_pidlist_operations;
3399
3400
retval = seq_open(file, &cgroup_pidlist_seq_operations);
3401
if (retval) {
3402
cgroup_release_pid_array(l);
3403
return retval;
3404
}
3405
((struct seq_file *)file->private_data)->private = l;
3406
return 0;
3407
}
3408
static int cgroup_tasks_open(struct inode *unused, struct file *file)
3409
{
3410
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3411
}
3412
static int cgroup_procs_open(struct inode *unused, struct file *file)
3413
{
3414
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3415
}
3416
3417
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
3418
struct cftype *cft)
3419
{
3420
return notify_on_release(cgrp);
3421
}
3422
3423
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3424
struct cftype *cft,
3425
u64 val)
3426
{
3427
clear_bit(CGRP_RELEASABLE, &cgrp->flags);
3428
if (val)
3429
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3430
else
3431
clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3432
return 0;
3433
}
3434
3435
/*
3436
* Unregister event and free resources.
3437
*
3438
* Gets called from workqueue.
3439
*/
3440
static void cgroup_event_remove(struct work_struct *work)
3441
{
3442
struct cgroup_event *event = container_of(work, struct cgroup_event,
3443
remove);
3444
struct cgroup *cgrp = event->cgrp;
3445
3446
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3447
3448
eventfd_ctx_put(event->eventfd);
3449
kfree(event);
3450
dput(cgrp->dentry);
3451
}
3452
3453
/*
3454
* Gets called on POLLHUP on eventfd when user closes it.
3455
*
3456
* Called with wqh->lock held and interrupts disabled.
3457
*/
3458
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3459
int sync, void *key)
3460
{
3461
struct cgroup_event *event = container_of(wait,
3462
struct cgroup_event, wait);
3463
struct cgroup *cgrp = event->cgrp;
3464
unsigned long flags = (unsigned long)key;
3465
3466
if (flags & POLLHUP) {
3467
__remove_wait_queue(event->wqh, &event->wait);
3468
spin_lock(&cgrp->event_list_lock);
3469
list_del(&event->list);
3470
spin_unlock(&cgrp->event_list_lock);
3471
/*
3472
* We are in atomic context, but cgroup_event_remove() may
3473
* sleep, so we have to defer it to a workqueue.
3474
*/
3475
schedule_work(&event->remove);
3476
}
3477
3478
return 0;
3479
}
3480
3481
static void cgroup_event_ptable_queue_proc(struct file *file,
3482
wait_queue_head_t *wqh, poll_table *pt)
3483
{
3484
struct cgroup_event *event = container_of(pt,
3485
struct cgroup_event, pt);
3486
3487
event->wqh = wqh;
3488
add_wait_queue(wqh, &event->wait);
3489
}
3490
3491
/*
3492
* Parse input and register new cgroup event handler.
3493
*
3494
* Input must be in format '<event_fd> <control_fd> <args>'.
3495
* Interpretation of args is defined by control file implementation.
3496
*/
3497
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3498
const char *buffer)
3499
{
3500
struct cgroup_event *event = NULL;
3501
unsigned int efd, cfd;
3502
struct file *efile = NULL;
3503
struct file *cfile = NULL;
3504
char *endp;
3505
int ret;
3506
3507
efd = simple_strtoul(buffer, &endp, 10);
3508
if (*endp != ' ')
3509
return -EINVAL;
3510
buffer = endp + 1;
3511
3512
cfd = simple_strtoul(buffer, &endp, 10);
3513
if ((*endp != ' ') && (*endp != '\0'))
3514
return -EINVAL;
3515
buffer = endp + 1;
3516
3517
event = kzalloc(sizeof(*event), GFP_KERNEL);
3518
if (!event)
3519
return -ENOMEM;
3520
event->cgrp = cgrp;
3521
INIT_LIST_HEAD(&event->list);
3522
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3523
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3524
INIT_WORK(&event->remove, cgroup_event_remove);
3525
3526
efile = eventfd_fget(efd);
3527
if (IS_ERR(efile)) {
3528
ret = PTR_ERR(efile);
3529
goto fail;
3530
}
3531
3532
event->eventfd = eventfd_ctx_fileget(efile);
3533
if (IS_ERR(event->eventfd)) {
3534
ret = PTR_ERR(event->eventfd);
3535
goto fail;
3536
}
3537
3538
cfile = fget(cfd);
3539
if (!cfile) {
3540
ret = -EBADF;
3541
goto fail;
3542
}
3543
3544
/* the process needs read permission on the control file */
3545
ret = file_permission(cfile, MAY_READ);
3546
if (ret < 0)
3547
goto fail;
3548
3549
event->cft = __file_cft(cfile);
3550
if (IS_ERR(event->cft)) {
3551
ret = PTR_ERR(event->cft);
3552
goto fail;
3553
}
3554
3555
if (!event->cft->register_event || !event->cft->unregister_event) {
3556
ret = -EINVAL;
3557
goto fail;
3558
}
3559
3560
ret = event->cft->register_event(cgrp, event->cft,
3561
event->eventfd, buffer);
3562
if (ret)
3563
goto fail;
3564
3565
if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3566
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3567
ret = 0;
3568
goto fail;
3569
}
3570
3571
/*
3572
* Events should be removed after rmdir of cgroup directory, but before
3573
* destroying subsystem state objects. Let's take a reference to the cgroup
3574
* directory dentry to do that.
3575
*/
3576
dget(cgrp->dentry);
3577
3578
spin_lock(&cgrp->event_list_lock);
3579
list_add(&event->list, &cgrp->event_list);
3580
spin_unlock(&cgrp->event_list_lock);
3581
3582
fput(cfile);
3583
fput(efile);
3584
3585
return 0;
3586
3587
fail:
3588
if (cfile)
3589
fput(cfile);
3590
3591
if (event && event->eventfd && !IS_ERR(event->eventfd))
3592
eventfd_ctx_put(event->eventfd);
3593
3594
if (!IS_ERR_OR_NULL(efile))
3595
fput(efile);
3596
3597
kfree(event);
3598
3599
return ret;
3600
}
3601
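/*
 * Userspace sketch of the "<event_fd> <control_fd> <args>" protocol
 * parsed above. The memcg usage file is one example of a control file
 * implementing register_event(); the mount point and threshold below
 * are assumptions:
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int example_wait_threshold(void)
{
	char buf[64];
	uint64_t ticks;
	int efd = eventfd(0, 0);
	int cfd = open("/cgroup/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ctl = open("/cgroup/mygroup/cgroup.event_control", O_WRONLY);

	if (efd < 0 || cfd < 0 || ctl < 0)
		return -1;
	snprintf(buf, sizeof(buf), "%d %d 1048576", efd, cfd);
	if (write(ctl, buf, strlen(buf)) < 0)
		return -1;
	read(efd, &ticks, sizeof(ticks));	/* blocks until threshold fires */
	return 0;
}
#endif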
3602
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3603
struct cftype *cft)
3604
{
3605
return clone_children(cgrp);
3606
}
3607
3608
static int cgroup_clone_children_write(struct cgroup *cgrp,
3609
struct cftype *cft,
3610
u64 val)
3611
{
3612
if (val)
3613
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3614
else
3615
clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3616
return 0;
3617
}
3618
3619
/*
3620
* for the common functions, 'private' gives the type of file
3621
*/
3622
/* for hysterical raisins, we can't put this on the older files */
3623
#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3624
static struct cftype files[] = {
3625
{
3626
.name = "tasks",
3627
.open = cgroup_tasks_open,
3628
.write_u64 = cgroup_tasks_write,
3629
.release = cgroup_pidlist_release,
3630
.mode = S_IRUGO | S_IWUSR,
3631
},
3632
{
3633
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
3634
.open = cgroup_procs_open,
3635
.write_u64 = cgroup_procs_write,
3636
.release = cgroup_pidlist_release,
3637
.mode = S_IRUGO | S_IWUSR,
3638
},
3639
{
3640
.name = "notify_on_release",
3641
.read_u64 = cgroup_read_notify_on_release,
3642
.write_u64 = cgroup_write_notify_on_release,
3643
},
3644
{
3645
.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3646
.write_string = cgroup_write_event_control,
3647
.mode = S_IWUGO,
3648
},
3649
{
3650
.name = "cgroup.clone_children",
3651
.read_u64 = cgroup_clone_children_read,
3652
.write_u64 = cgroup_clone_children_write,
3653
},
3654
};
3655
3656
static struct cftype cft_release_agent = {
3657
.name = "release_agent",
3658
.read_seq_string = cgroup_release_agent_show,
3659
.write_string = cgroup_release_agent_write,
3660
.max_write_len = PATH_MAX,
3661
};
3662
3663
static int cgroup_populate_dir(struct cgroup *cgrp)
3664
{
3665
int err;
3666
struct cgroup_subsys *ss;
3667
3668
/* First clear out any existing files */
3669
cgroup_clear_directory(cgrp->dentry);
3670
3671
err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3672
if (err < 0)
3673
return err;
3674
3675
if (cgrp == cgrp->top_cgroup) {
3676
if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3677
return err;
3678
}
3679
3680
for_each_subsys(cgrp->root, ss) {
3681
if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3682
return err;
3683
}
3684
/* This cgroup is ready now */
3685
for_each_subsys(cgrp->root, ss) {
3686
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3687
/*
3688
* Update id->css pointer and make this css visible from
3689
* CSS ID functions. This pointer will be dereferenced
3690
* from RCU-read-side without locks.
3691
*/
3692
if (css->id)
3693
rcu_assign_pointer(css->id->css, css);
3694
}
3695
3696
return 0;
3697
}
3698
3699
static void init_cgroup_css(struct cgroup_subsys_state *css,
3700
struct cgroup_subsys *ss,
3701
struct cgroup *cgrp)
3702
{
3703
css->cgroup = cgrp;
3704
atomic_set(&css->refcnt, 1);
3705
css->flags = 0;
3706
css->id = NULL;
3707
if (cgrp == dummytop)
3708
set_bit(CSS_ROOT, &css->flags);
3709
BUG_ON(cgrp->subsys[ss->subsys_id]);
3710
cgrp->subsys[ss->subsys_id] = css;
3711
}
3712
3713
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
3714
{
3715
/* We need to take each hierarchy_mutex in a consistent order */
3716
int i;
3717
3718
/*
3719
* No worry about a race with rebind_subsystems that might mess up the
3720
* locking order, since both parties are under cgroup_mutex.
3721
*/
3722
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3723
struct cgroup_subsys *ss = subsys[i];
3724
if (ss == NULL)
3725
continue;
3726
if (ss->root == root)
3727
mutex_lock(&ss->hierarchy_mutex);
3728
}
3729
}
3730
3731
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3732
{
3733
int i;
3734
3735
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3736
struct cgroup_subsys *ss = subsys[i];
3737
if (ss == NULL)
3738
continue;
3739
if (ss->root == root)
3740
mutex_unlock(&ss->hierarchy_mutex);
3741
}
3742
}
3743
/*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
 * @dentry: dentry of the new cgroup
 * @mode: mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 */
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			  mode_t mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	/* Grab a reference on the superblock so the hierarchy doesn't
	 * get deleted on unmount if there are child cgroups.  This
	 * can be done outside cgroup_mutex, since the sb can't
	 * disappear while someone has an open control file on the
	 * fs */
	atomic_inc(&sb->s_active);

	mutex_lock(&cgroup_mutex);

	init_cgroup_housekeeping(cgrp);

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (clone_children(parent))
		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);

	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css = ss->create(ss, cgrp);

		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			goto err_destroy;
		}
		init_cgroup_css(css, ss, cgrp);
		if (ss->use_id) {
			err = alloc_css_id(ss, parent, cgrp);
			if (err)
				goto err_destroy;
		}
		/* At error, ->destroy() callback has to free assigned ID. */
		if (clone_children(parent) && ss->post_clone)
			ss->post_clone(ss, cgrp);
	}

	cgroup_lock_hierarchy(root);
	list_add(&cgrp->sibling, &cgrp->parent->children);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups++;

	err = cgroup_create_dir(cgrp, dentry, mode);
	if (err < 0)
		goto err_remove;

	/* The cgroup directory was pre-locked for us */
	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

	err = cgroup_populate_dir(cgrp);
	/* If err < 0, we have a half-filled directory - oh well ;) */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	return 0;

 err_remove:

	cgroup_lock_hierarchy(root);
	list_del(&cgrp->sibling);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups--;

 err_destroy:

	for_each_subsys(root, ss) {
		if (cgrp->subsys[ss->subsys_id])
			ss->destroy(ss, cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);

	kfree(cgrp);
	return err;
}

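/*
 * Note on the unwind order in cgroup_create() above: err_remove undoes
 * the sibling-list linkage and the cgroup count, then falls through to
 * err_destroy, which calls ->destroy() for every subsystem state that
 * was successfully created, and finally drops the superblock reference
 * and frees the cgroup.
 */
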
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct cgroup *c_parent = dentry->d_parent->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}

static int cgroup_has_css_refs(struct cgroup *cgrp)
{
	/* Check the reference count on each subsystem. Since we
	 * already established that there are no tasks in the
	 * cgroup, if the css refcount is also 1, then there should
	 * be no outstanding references, so the subsystem is safe to
	 * destroy. We scan across all subsystems rather than using
	 * the per-hierarchy linked list of mounted subsystems since
	 * we can be called via check_for_release() with no
	 * synchronization other than RCU, and the subsystem linked
	 * list isn't RCU-safe */
	int i;
	/*
	 * We won't need to lock the subsys array, because the subsystems
	 * we're concerned about aren't going anywhere since our cgroup root
	 * has a reference on them.
	 */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		struct cgroup_subsys_state *css;
		/* Skip subsystems not present or not in this hierarchy */
		if (ss == NULL || ss->root != cgrp->root)
			continue;
		css = cgrp->subsys[ss->subsys_id];
		/* When called from check_for_release() it's possible
		 * that by this point the cgroup has been removed
		 * and the css deleted. But a false-positive doesn't
		 * matter, since it can only happen if the cgroup
		 * has been deleted and hence no longer needs the
		 * release agent to be called anyway. */
		if (css && (atomic_read(&css->refcnt) > 1))
			return 1;
	}
	return 0;
}

/*
 * Atomically mark all (or else none) of the cgroup's CSS objects as
 * CSS_REMOVED. Return true on success, or false if the cgroup has
 * busy subsystems. Call with cgroup_mutex held
 */

static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	unsigned long flags;
	bool failed = false;
	local_irq_save(flags);
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		int refcnt;
		while (1) {
			/* We can only remove a CSS with a refcnt==1 */
			refcnt = atomic_read(&css->refcnt);
			if (refcnt > 1) {
				failed = true;
				goto done;
			}
			BUG_ON(!refcnt);
			/*
			 * Drop the refcnt to 0 while we check other
			 * subsystems. This will cause any racing
			 * css_tryget() to spin until we set the
			 * CSS_REMOVED bits or abort
			 */
			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
				break;
			cpu_relax();
		}
	}
 done:
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		if (failed) {
			/*
			 * Restore old refcnt if we previously managed
			 * to clear it from 1 to 0
			 */
			if (!atomic_read(&css->refcnt))
				atomic_set(&css->refcnt, 1);
		} else {
			/* Commit the fact that the CSS is removed */
			set_bit(CSS_REMOVED, &css->flags);
		}
	}
	local_irq_restore(flags);
	return !failed;
}

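/*
 * The protocol above, in short: dropping each refcnt from 1 to 0 with
 * cmpxchg "freezes" the css while the remaining subsystems are checked.
 * A concurrent css_tryget() that sees refcnt == 0 spins until either
 * CSS_REMOVED is set (tryget then fails) or the refcnt is restored to 1
 * (tryget then succeeds), which is what gives the removal check its
 * all-or-nothing semantics.
 */
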
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cgrp = dentry->d_fsdata;
	struct dentry *d;
	struct cgroup *parent;
	DEFINE_WAIT(wait);
	struct cgroup_event *event, *tmp;
	int ret;

	/* the vfs holds i_mutex on both the parent dir and the dentry already */
again:
	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	if (!list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	mutex_unlock(&cgroup_mutex);

	/*
	 * In general, a subsystem holds no css->refcnt after pre_destroy().
	 * But in racy cases a subsystem may take a css->refcnt again after
	 * pre_destroy(), which makes rmdir return -EBUSY; this can happen
	 * often enough to be a nuisance. To avoid that, we use a waitqueue
	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR synchronizes rmdir with the
	 * subsystems' reference count handling; see the css_get/put,
	 * css_tryget() and cgroup_wakeup_rmdir_waiter() implementations.
	 */
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	/*
	 * Call the pre_destroy handlers of each subsys, notifying the
	 * subsystems that an rmdir() request has come in.
	 */
	ret = cgroup_call_pre_destroy(cgrp);
	if (ret) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		return ret;
	}

	mutex_lock(&cgroup_mutex);
	parent = cgrp->parent;
	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
	if (!cgroup_clear_css_refs(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		/*
		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
		 * prepare_to_wait(), we need to check this flag.
		 */
		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
			schedule();
		finish_wait(&cgroup_rmdir_waitq, &wait);
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		if (signal_pending(current))
			return -EINTR;
		goto again;
	}
	/* No css_tryget() can succeed after this point. */
	finish_wait(&cgroup_rmdir_waitq, &wait);
	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
	spin_unlock(&release_list_lock);

	cgroup_lock_hierarchy(cgrp->root);
	/* delete this cgroup from parent->children */
	list_del_init(&cgrp->sibling);
	cgroup_unlock_hierarchy(cgrp->root);

	d = dget(cgrp->dentry);

	cgroup_d_remove_dir(d);
	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory, to avoid a race between userspace and
	 * kernelspace.
	 */
	spin_lock(&cgrp->event_list_lock);
	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		list_del(&event->list);
		remove_wait_queue(event->wqh, &event->wait);
		eventfd_signal(event->eventfd, 1);
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

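/*
 * A condensed sketch of the CGRP_WAIT_ON_RMDIR handshake implemented
 * above: rmdir sets the flag, runs pre_destroy(), and then tries
 * cgroup_clear_css_refs(). If a racing css_get() keeps a refcount
 * raised, rmdir sleeps on cgroup_rmdir_waitq; the subsystem's final
 * css_put() calls cgroup_wakeup_rmdir_waiter(), and rmdir retries from
 * the "again" label.
 */
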
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;
	css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

	need_forkexit_callback |= ss->fork || ss->exit;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	mutex_init(&ss->hierarchy_mutex);
	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
	ss->active = 1;

	/* this function shouldn't be used with modular subsystems, since they
	 * need to register a subsys_id, among other things */
	BUG_ON(ss->module);
}

/**
 * cgroup_load_subsys: load and register a modular subsystem at runtime
 * @ss: the subsystem to load
 *
 * This function should be called in a modular subsystem's initcall. If the
 * subsystem is built as a module, it will be assigned a new subsys_id and set
 * up for use. If the subsystem is built-in anyway, work is delegated to the
 * simpler cgroup_init_subsys.
 */
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
	int i;
	struct cgroup_subsys_state *css;

	/* check name and function validity */
	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
	    ss->create == NULL || ss->destroy == NULL)
		return -EINVAL;

	/*
	 * we don't support callbacks in modular subsystems. this check is
	 * before the ss->module check for consistency; a subsystem that could
	 * be a module should still have no callbacks even if the user isn't
	 * compiling it as one.
	 */
	if (ss->fork || ss->exit)
		return -EINVAL;

	/*
	 * an optionally modular subsystem is built-in: we want to do nothing,
	 * since cgroup_init_subsys will have already taken care of it.
	 */
	if (ss->module == NULL) {
		/* a few sanity checks */
		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
		BUG_ON(subsys[ss->subsys_id] != ss);
		return 0;
	}

	/*
	 * need to register a subsys id before anything else - for example,
	 * init_cgroup_css needs it.
	 */
	mutex_lock(&cgroup_mutex);
	/* find the first empty slot in the array */
	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
		if (subsys[i] == NULL)
			break;
	}
	if (i == CGROUP_SUBSYS_COUNT) {
		/* maximum number of subsystems already registered! */
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	/* assign ourselves the subsys_id */
	ss->subsys_id = i;
	subsys[i] = ss;

	/*
	 * no ss->create seems to need anything important in the ss struct, so
	 * this can happen first (i.e. before the rootnode attachment).
	 */
	css = ss->create(ss, dummytop);
	if (IS_ERR(css)) {
		/* failure case - need to deassign the subsys[] slot. */
		subsys[i] = NULL;
		mutex_unlock(&cgroup_mutex);
		return PTR_ERR(css);
	}

	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;

	/* our new subsystem will be attached to the dummy hierarchy. */
	init_cgroup_css(css, ss, dummytop);
	/* init_idr must be after init_cgroup_css because it sets css->id. */
	if (ss->use_id) {
		int ret = cgroup_init_idr(ss, css);
		if (ret) {
			dummytop->subsys[ss->subsys_id] = NULL;
			ss->destroy(ss, dummytop);
			subsys[i] = NULL;
			mutex_unlock(&cgroup_mutex);
			return ret;
		}
	}

	/*
	 * Now we need to entangle the css into the existing css_sets. unlike
	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
	 * will need a new pointer to it; done by iterating the css_set_table.
	 * furthermore, modifying the existing css_sets will corrupt the hash
	 * table state, so each changed css_set will need its hash recomputed.
	 * this is all done under the css_set_lock.
	 */
	write_lock(&css_set_lock);
	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
		struct css_set *cg;
		struct hlist_node *node, *tmp;
		struct hlist_head *bucket = &css_set_table[i], *new_bucket;

		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
			/* skip entries that we already rehashed */
			if (cg->subsys[ss->subsys_id])
				continue;
			/* remove existing entry */
			hlist_del(&cg->hlist);
			/* set new value */
			cg->subsys[ss->subsys_id] = css;
			/* recompute hash and restore entry */
			new_bucket = css_set_hash(cg->subsys);
			hlist_add_head(&cg->hlist, new_bucket);
		}
	}
	write_unlock(&css_set_lock);

	mutex_init(&ss->hierarchy_mutex);
	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
	ss->active = 1;

	/* success! */
	mutex_unlock(&cgroup_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);

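/*
 * Example (illustrative sketch only; the "foo" names are hypothetical):
 * a modular subsystem registers itself from its module init/exit hooks.
 * Note that fork/exit callbacks must be NULL, per the checks above.
 *
 *	struct cgroup_subsys foo_subsys = {
 *		.name = "foo",
 *		.create = foo_create,
 *		.destroy = foo_destroy,
 *		.module = THIS_MODULE,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return cgroup_load_subsys(&foo_subsys);
 *	}
 *	module_init(foo_init);
 *
 *	static void __exit foo_exit(void)
 *	{
 *		cgroup_unload_subsys(&foo_subsys);
 *	}
 *	module_exit(foo_exit);
 */
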
/**
 * cgroup_unload_subsys: unload a modular subsystem
 * @ss: the subsystem to unload
 *
 * This function should be called in a modular subsystem's exitcall. When this
 * function is invoked, the refcount on the subsystem's module will be 0, so
 * the subsystem will not be attached to any hierarchy.
 */
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
	struct cg_cgroup_link *link;
	struct hlist_head *hhead;

	BUG_ON(ss->module == NULL);

	/*
	 * we shouldn't be called if the subsystem is in use, and the use of
	 * try_module_get in parse_cgroupfs_options should ensure that it
	 * doesn't start being used while we're killing it off.
	 */
	BUG_ON(ss->root != &rootnode);

	mutex_lock(&cgroup_mutex);
	/* deassign the subsys_id */
	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
	subsys[ss->subsys_id] = NULL;

	/* remove subsystem from rootnode's list of subsystems */
	list_del_init(&ss->sibling);

	/*
	 * disentangle the css from all css_sets attached to the dummytop. as
	 * in loading, we need to pay our respects to the hashtable gods.
	 */
	write_lock(&css_set_lock);
	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
		struct css_set *cg = link->cg;

		hlist_del(&cg->hlist);
		BUG_ON(!cg->subsys[ss->subsys_id]);
		cg->subsys[ss->subsys_id] = NULL;
		hhead = css_set_hash(cg->subsys);
		hlist_add_head(&cg->hlist, hhead);
	}
	write_unlock(&css_set_lock);

	/*
	 * remove the subsystem's css from the dummytop and free it - we need
	 * to destroy before clearing the pointer because ss->destroy needs
	 * the cgrp->subsys pointer to find its state. note that this also
	 * takes care of freeing the css_id.
	 */
	ss->destroy(ss, dummytop);
	dummytop->subsys[ss->subsys_id] = NULL;

	mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	int i;
	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&rootnode);
	root_count = 1;
	init_task.cgroups = &init_css_set;

	init_css_set_link.cg = &init_css_set;
	init_css_set_link.cgrp = dummytop;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&css_set_table[i]);

	/* at bootup time, we don't worry about modular subsystems */
	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->create);
		BUG_ON(!ss->destroy);
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	int err;
	int i;
	struct hlist_head *hhead;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	/* at bootup time, we don't worry about modular subsystems */
	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (!ss->early_init)
			cgroup_init_subsys(ss);
		if (ss->use_id)
			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
	}

	/* Add init_css_set to the hash table */
	hhead = css_set_hash(init_css_set.subsys);
	hlist_add_head(&init_css_set.hlist, hhead);
	BUG_ON(!init_root_id(&rootnode));

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
	if (!cgroup_kobj) {
		err = -ENOMEM;
		goto out;
	}

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		kobject_put(cgroup_kobj);
		goto out;
	}

	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

out:
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);

	return err;
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *    doesn't really matter if tsk->cgroup changes after we read it,
 *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
 *    cgroup to top_cgroup.
 */

/* TODO: Use a proper seq_file iterator */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);

	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int count = 0;

		seq_printf(m, "%d:", root->hierarchy_id);
		for_each_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');
		cgrp = task_cgroup_from_root(tsk, root);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}

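/*
 * Example (sketch): with the seq_printf() calls above, one line of
 * /proc/<pid>/cgroup looks like (hierarchy id : subsystems : path; the
 * values shown are hypothetical):
 *
 *	3:cpu,cpuacct:/daemons
 */
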
static int cgroup_open(struct inode *inode, struct file *file)
{
	struct pid *pid = PROC_I(inode)->pid;
	return single_open(file, proc_cgroup_show, pid);
}

const struct file_operations proc_cgroup_operations = {
	.open		= cgroup_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss == NULL)
			continue;
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->name, ss->root->hierarchy_id,
			   ss->root->number_of_cgroups, !ss->disabled);
	}
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

static const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

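/*
 * Example (sketch): /proc/cgroups, as produced above, is a tab-separated
 * table (the rows shown are hypothetical):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset	1	4	1
 *	cpu	2	12	1
 */
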
/**
 * cgroup_fork - attach newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the newly forked child process.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct(). However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced css_set to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	child->cgroups = current->cgroups;
	get_css_set(child->cgroups);
	task_unlock(current);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_fork_callbacks - run fork callbacks
 * @child: the new task
 *
 * Called on a new task very soon before adding it to the
 * tasklist. No need to take any locks since no-one can
 * be operating on this task.
 */
void cgroup_fork_callbacks(struct task_struct *child)
{
	if (need_forkexit_callback) {
		int i;
		/*
		 * forkexit callbacks are only supported for builtin
		 * subsystems, and the builtin section of the subsys array is
		 * immutable, so we don't need to lock the subsys array here.
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->fork)
				ss->fork(ss, child);
		}
	}
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary.
 * Has to be after the task is visible on the task list in case we race
 * with the first call to cgroup_iter_start() - to guarantee that the
 * new task ends up on its list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &child->cgroups->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: run exit callbacks?
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
 *
 * We call cgroup_exit() while the task is still competent to
 * handle notify_on_release(), then leave the task attached to the
 * root cgroup in each hierarchy for the remainder of its exit.
 *
 * To do this properly, we would increment the reference count on
 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
 * code we would add a second cgroup function call, to drop that
 * reference.  This would just create an unnecessary hot spot on
 * the top_cgroup reference count, to no avail.
 *
 * Normally, holding a reference to a cgroup without bumping its
 * count is unsafe.  The cgroup could go away, or someone could
 * attach us to a different cgroup, decrementing the count on
 * the first cgroup that we never incremented.  But in this case,
 * top_cgroup isn't going away, and either the task has PF_EXITING set,
 * which wards off any cgroup_attach_task() attempts, or the task is a
 * failed fork, never visible to cgroup_attach_task.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	struct css_set *cg;
	int i;

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cg = tsk->cgroups;
	tsk->cgroups = &init_css_set;

	if (run_callbacks && need_forkexit_callback) {
		/*
		 * modular subsystems can't use callbacks, so no need to lock
		 * the subsys array
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->exit) {
				struct cgroup *old_cgrp =
					rcu_dereference_raw(cg->subsys[i])->cgroup;
				struct cgroup *cgrp = task_cgroup(tsk, i);
				ss->exit(ss, cgrp, old_cgrp, tsk);
			}
		}
	}
	task_unlock(tsk);

	if (cg)
		put_css_set_taskexit(cg);
}

/**
 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
 * @cgrp: the cgroup in question
 * @task: the task in question
 *
 * See if @cgrp is a descendant of @task's cgroup in the appropriate
 * hierarchy.
 *
 * If we are sending in dummytop, then presumably we are creating
 * the top cgroup in the subsystem.
 *
 * Called only by the ns (nsproxy) cgroup.
 */
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
	int ret;
	struct cgroup *target;

	if (cgrp == dummytop)
		return 1;

	target = task_cgroup_from_root(task, cgrp->root);
	while (cgrp != target && cgrp != cgrp->top_cgroup)
		cgrp = cgrp->parent;
	ret = (cgrp == target);
	return ret;
}

static void check_for_release(struct cgroup *cgrp)
{
	/* All of these checks rely on RCU to keep the cgroup
	 * structure alive */
	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
		/* Control Group is currently removable. If it's not
		 * already queued for a userspace notification, queue
		 * it now */
		int need_schedule_work = 0;
		spin_lock(&release_list_lock);
		if (!cgroup_is_removed(cgrp) &&
		    list_empty(&cgrp->release_list)) {
			list_add(&cgrp->release_list, &release_list);
			need_schedule_work = 1;
		}
		spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}

/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css, int count)
{
	struct cgroup *cgrp = css->cgroup;
	int val;
	rcu_read_lock();
	val = atomic_sub_return(count, &css->refcnt);
	if (val == 1) {
		if (notify_on_release(cgrp)) {
			set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		cgroup_wakeup_rmdir_waiter(cgrp);
	}
	rcu_read_unlock();
	WARN_ON_ONCE(val < 1);
}
EXPORT_SYMBOL_GPL(__css_put);

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
		list_del_init(&cgrp->release_list);
		spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}

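/*
 * Example (sketch, userspace side; the mount point and agent path are
 * hypothetical): with a hierarchy mounted at /cgroup,
 *
 *	echo /sbin/cgroup-release-agent > /cgroup/release_agent
 *	echo 1 > /cgroup/mygroup/notify_on_release
 *
 * arranges for the agent to be run with "/mygroup" (the path relative
 * to the hierarchy root) as its argument once mygroup becomes unused,
 * exactly as argv[] is built above.
 */
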
static int __init cgroup_disable(char *str)
{
	int i;
	char *token;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;
		/*
		 * cgroup_disable, being at boot time, can't know about module
		 * subsystems, so we don't worry about them.
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];

			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);

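/*
 * Example (sketch): one or more built-in subsystems can be turned off
 * from the kernel command line, e.g.
 *
 *	cgroup_disable=memory,cpuset
 *
 * (the names here are illustrative; the parser above accepts any
 * comma-separated list of built-in subsystem names).
 */
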
/*
 * Functions for CSS ID.
 */

/*
 * To get an ID other than 0, this should be called when !cgroup_is_removed().
 */
unsigned short css_id(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;

	/*
	 * css_id() can return a correct value when someone holds a refcnt
	 * on this css or when running under rcu_read_lock(). Once css->id
	 * is allocated, it is unchanged until freed.
	 */
	cssid = rcu_dereference_check(css->id,
			rcu_read_lock_held() || atomic_read(&css->refcnt));

	if (cssid)
		return cssid->id;
	return 0;
}
EXPORT_SYMBOL_GPL(css_id);

unsigned short css_depth(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;

	cssid = rcu_dereference_check(css->id,
			rcu_read_lock_held() || atomic_read(&css->refcnt));

	if (cssid)
		return cssid->depth;
	return 0;
}
EXPORT_SYMBOL_GPL(css_depth);

/**
 * css_is_ancestor - test whether "root" css is an ancestor of "child"
 * @child: the css to be tested.
 * @root: the css supposed to be an ancestor of the child.
 *
 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
 * But, considering usual usage, the csses should be valid objects after the
 * test. Assuming that the caller will do some action on the child if this
 * returns true, the caller must take a reference on "child".
 * If "child" is a valid object and this returns true, "root" is valid, too.
 */

bool css_is_ancestor(struct cgroup_subsys_state *child,
		    const struct cgroup_subsys_state *root)
{
	struct css_id *child_id;
	struct css_id *root_id;
	bool ret = true;

	rcu_read_lock();
	child_id = rcu_dereference(child->id);
	root_id = rcu_dereference(root->id);
	if (!child_id
	    || !root_id
	    || (child_id->depth < root_id->depth)
	    || (child_id->stack[root_id->depth] != root_id->id))
		ret = false;
	rcu_read_unlock();
	return ret;
}

void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
	struct css_id *id = css->id;
	/* When this is called before css_id initialization, id can be NULL */
	if (!id)
		return;

	BUG_ON(!ss->use_id);

	rcu_assign_pointer(id->css, NULL);
	rcu_assign_pointer(css->id, NULL);
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	spin_unlock(&ss->id_lock);
	kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);

/*
 * This is called by init or create(), so calls to this function are
 * always serialized (by cgroup_mutex at create()).
 */

static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
	struct css_id *newid;
	int myid, error, size;

	BUG_ON(!ss->use_id);

	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
	newid = kzalloc(size, GFP_KERNEL);
	if (!newid)
		return ERR_PTR(-ENOMEM);
	/* get id */
	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
		error = -ENOMEM;
		goto err_out;
	}
	spin_lock(&ss->id_lock);
	/* Don't use 0; allocate an ID in the range 1-65535. */
	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
	spin_unlock(&ss->id_lock);

	/* Returns an error when there is no free space for a new ID. */
	if (error) {
		error = -ENOSPC;
		goto err_out;
	}
	if (myid > CSS_ID_MAX)
		goto remove_idr;

	newid->id = myid;
	newid->depth = depth;
	return newid;
remove_idr:
	error = -ENOSPC;
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, myid);
	spin_unlock(&ss->id_lock);
err_out:
	kfree(newid);
	return ERR_PTR(error);
}

static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
					    struct cgroup_subsys_state *rootcss)
{
	struct css_id *newid;

	spin_lock_init(&ss->id_lock);
	idr_init(&ss->idr);

	newid = get_new_cssid(ss, 0);
	if (IS_ERR(newid))
		return PTR_ERR(newid);

	newid->stack[0] = newid->id;
	newid->css = rootcss;
	rootcss->id = newid;
	return 0;
}

static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
			struct cgroup *child)
{
	int subsys_id, i, depth = 0;
	struct cgroup_subsys_state *parent_css, *child_css;
	struct css_id *child_id, *parent_id;

	subsys_id = ss->subsys_id;
	parent_css = parent->subsys[subsys_id];
	child_css = child->subsys[subsys_id];
	parent_id = parent_css->id;
	depth = parent_id->depth + 1;

	child_id = get_new_cssid(ss, depth);
	if (IS_ERR(child_id))
		return PTR_ERR(child_id);

	for (i = 0; i < depth; i++)
		child_id->stack[i] = parent_id->stack[i];
	child_id->stack[depth] = child_id->id;
	/*
	 * child_id->css pointer will be set after this cgroup is available
	 * see cgroup_populate_dir()
	 */
	rcu_assign_pointer(child_css->id, child_id);

	return 0;
}

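/*
 * Example (sketch, hypothetical id values): stack[] records the id of
 * every ancestor followed by the css's own id. A css at depth 2 whose
 * root and parent were assigned ids 1 and 5, and whose own id is 9,
 * ends up with:
 *
 *	depth = 2
 *	stack[0] = 1	(root)
 *	stack[1] = 5	(parent)
 *	stack[2] = 9	(self)
 *
 * which is why css_is_ancestor() only needs to compare
 * stack[root->depth] with the root's id.
 */
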
/**
 * css_lookup - lookup css by id
 * @ss: cgroup subsys to be looked into.
 * @id: the id
 *
 * Returns a pointer to the cgroup_subsys_state if there is a valid one
 * with the given id, NULL if not. Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
{
	struct css_id *cssid = NULL;

	BUG_ON(!ss->use_id);
	cssid = idr_find(&ss->idr, id);

	if (unlikely(!cssid))
		return NULL;

	return rcu_dereference(cssid->css);
}
EXPORT_SYMBOL_GPL(css_lookup);

/**
 * css_get_next - lookup next cgroup under specified hierarchy.
 * @ss: pointer to subsystem
 * @id: current position of iteration.
 * @root: pointer to a css; search the tree under this css.
 * @foundid: position of found object.
 *
 * Search for the next css under the hierarchy rooted at @root. Calling
 * under rcu_read_lock() is necessary. Returns NULL when the end of the
 * tree is reached.
 */
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
	     struct cgroup_subsys_state *root, int *foundid)
{
	struct cgroup_subsys_state *ret = NULL;
	struct css_id *tmp;
	int tmpid;
	int rootid = css_id(root);
	int depth = css_depth(root);

	if (!rootid)
		return NULL;

	BUG_ON(!ss->use_id);
	/* fill start point for scan */
	tmpid = id;
	while (1) {
		/*
		 * scan the next entry from the idr tree; tmpid is updated
		 * by idr_get_next().
		 */
		spin_lock(&ss->id_lock);
		tmp = idr_get_next(&ss->idr, &tmpid);
		spin_unlock(&ss->id_lock);

		if (!tmp)
			break;
		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
			ret = rcu_dereference(tmp->css);
			if (ret) {
				*foundid = tmpid;
				break;
			}
		}
		/* continue to scan from the next id */
		tmpid = tmpid + 1;
	}
	return ret;
}

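/*
 * Example (sketch): a typical caller iterates over every css under a
 * root by restarting the scan just past the last position found:
 *
 *	int id = 1, found;
 *	struct cgroup_subsys_state *css;
 *
 *	rcu_read_lock();
 *	while ((css = css_get_next(ss, id, root, &found)) != NULL) {
 *		... use css ...
 *		id = found + 1;
 *	}
 *	rcu_read_unlock();
 */
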
/*
 * get corresponding css from file open on cgroupfs directory
 */
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
{
	struct cgroup *cgrp;
	struct inode *inode;
	struct cgroup_subsys_state *css;

	inode = f->f_dentry->d_inode;
	/* check in cgroup filesystem dir */
	if (inode->i_op != &cgroup_dir_inode_operations)
		return ERR_PTR(-EBADF);

	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
		return ERR_PTR(-EINVAL);

	/* get cgroup */
	cgrp = __d_cgrp(f->f_dentry);
	css = cgrp->subsys[id];
	return css ? css : ERR_PTR(-ENOENT);
}

#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
						struct cgroup *cont)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	kfree(cont->subsys[debug_subsys_id]);
}

static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
{
	return atomic_read(&cont->count);
}

static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
{
	return cgroup_task_count(cont);
}

static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup *cont,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&current->cgroups->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct cgroup *cont,
					 struct cftype *cft,
					 struct seq_file *seq)
{
	struct cg_cgroup_link *link;
	struct css_set *cg;

	read_lock(&css_set_lock);
	rcu_read_lock();
	cg = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
		struct cgroup *c = link->cgrp;
		const char *name;

		if (c->dentry)
			name = c->dentry->d_name.name;
		else
			name = "?";
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name);
	}
	rcu_read_unlock();
	read_unlock(&css_set_lock);
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct cgroup *cont,
				 struct cftype *cft,
				 struct seq_file *seq)
{
	struct cg_cgroup_link *link;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
		struct css_set *cg = link->cg;
		struct task_struct *task;
		int count = 0;
		seq_printf(seq, "css_set %p\n", cg);
		list_for_each_entry(task, &cg->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
				seq_puts(seq, "  ...\n");
				break;
			} else {
				seq_printf(seq, "  task %d\n",
					   task_pid_vnr(task));
			}
		}
	}
	read_unlock(&css_set_lock);
	return 0;
}

static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}

static struct cftype debug_files[] = {
	{
		.name = "cgroup_refcount",
		.read_u64 = cgroup_refcount_read,
	},
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.read_seq_string = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.read_seq_string = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},
};

static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, debug_files,
				ARRAY_SIZE(debug_files));
}

struct cgroup_subsys debug_subsys = {
	.name = "debug",
	.create = debug_create,
	.destroy = debug_destroy,
	.populate = debug_populate,
	.subsys_id = debug_subsys_id,
};
#endif /* CONFIG_CGROUP_DEBUG */
