GitHub Repository: awilliam/linux-vfio
Path: blob/master/kernel/cgroup.c
/*
 * Generic process-grouping system.
 *
 * Based originally on the cpuset system, extracted by Paul Menage
 * Copyright (C) 2006 Google, Inc
 *
 * Notifications support
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Copyright notices from the original cpuset code:
 * --------------------------------------------------
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * ---------------------------------------------------
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */

#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);

/*
 * Generate an array of cgroup subsystem pointers. At boot time, this is
 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
 * registered after that. The mutable section of this array is protected by
 * cgroup_mutex.
 */
#define SUBSYS(_x) &_x ## _subsys,
static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};

#define MAX_CGROUP_ROOT_NAMELEN 64

/*
 * A cgroupfs_root represents the root of a cgroup hierarchy,
 * and may be associated with a superblock to form an active
 * hierarchy
 */
struct cgroupfs_root {
        struct super_block *sb;

        /*
         * The bitmask of subsystems intended to be attached to this
         * hierarchy
         */
        unsigned long subsys_bits;

        /* Unique id for this hierarchy. */
        int hierarchy_id;

        /* The bitmask of subsystems currently attached to this hierarchy */
        unsigned long actual_subsys_bits;

        /* A list running through the attached subsystems */
        struct list_head subsys_list;

        /* The root cgroup for this hierarchy */
        struct cgroup top_cgroup;

        /* Tracks how many cgroups are currently defined in hierarchy.*/
        int number_of_cgroups;

        /* A list running through the active hierarchies */
        struct list_head root_list;

        /* Hierarchy-specific flags */
        unsigned long flags;

        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];

        /* The name for this hierarchy - may be empty */
        char name[MAX_CGROUP_ROOT_NAMELEN];
};

/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup, and all tasks are part of that cgroup.
 */
static struct cgroupfs_root rootnode;

/*
 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
 * cgroup_subsys->use_id != 0.
 */
#define CSS_ID_MAX (65535)
struct css_id {
        /*
         * The css to which this ID points. This pointer is set to a valid
         * value after the cgroup is populated. If the cgroup is removed,
         * this will be NULL. This pointer is expected to be RCU-safe
         * because destroy() is called after synchronize_rcu(). But for safe
         * use, css_is_removed() or css_tryget() should be used to avoid
         * races.
         */
        struct cgroup_subsys_state __rcu *css;
        /*
         * ID of this css.
         */
        unsigned short id;
        /*
         * Depth in the hierarchy this ID belongs to.
         */
        unsigned short depth;
        /*
         * ID is freed by RCU. (and lookup routine is RCU safe.)
         */
        struct rcu_head rcu_head;
        /*
         * Hierarchy this CSS ID belongs to.
         */
        unsigned short stack[0]; /* Array of Length (depth+1) */
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct cgroup_event {
        /*
         * Cgroup to which the event belongs.
         */
        struct cgroup *cgrp;
        /*
         * Control file with which the event is associated.
         */
        struct cftype *cft;
        /*
         * eventfd to signal userspace about the event.
         */
        struct eventfd_ctx *eventfd;
        /*
         * Each of these is stored in a list by the cgroup.
         */
        struct list_head list;
        /*
         * All fields below are needed to unregister the event when
         * userspace closes the eventfd.
         */
        poll_table pt;
        wait_queue_head_t *wqh;
        wait_queue_t wait;
        struct work_struct remove;
};

/* The list of hierarchy roots */

static LIST_HEAD(roots);
static int root_count;

static DEFINE_IDA(hierarchy_ida);
static int next_hierarchy_id;
static DEFINE_SPINLOCK(hierarchy_id_lock);

/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)

/* This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;

#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
        return lockdep_is_held(&cgroup_mutex);
}
#else /* #ifdef CONFIG_PROVE_LOCKING */
int cgroup_lock_is_held(void)
{
        return mutex_is_locked(&cgroup_mutex);
}
#endif /* #else #ifdef CONFIG_PROVE_LOCKING */

EXPORT_SYMBOL_GPL(cgroup_lock_is_held);

/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
        return test_bit(CGRP_REMOVED, &cgrp->flags);
}

/* bits in struct cgroupfs_root flags field */
enum {
        ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};

static int cgroup_is_releasable(const struct cgroup *cgrp)
{
        const int bits =
                (1 << CGRP_RELEASABLE) |
                (1 << CGRP_NOTIFY_ON_RELEASE);
        return (cgrp->flags & bits) == bits;
}

static int notify_on_release(const struct cgroup *cgrp)
{
        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

static int clone_children(const struct cgroup *cgrp)
{
        return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
}

/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
 */
#define for_each_subsys(_root, _ss) \
        list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* for_each_active_root() allows you to iterate across the active hierarchies */
#define for_each_active_root(_root) \
        list_for_each_entry(_root, &roots, root_list)
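
/*
 * Illustrative usage sketch (added for clarity, not part of the original
 * file): both macros expect cgroup_mutex to be held so the lists cannot
 * change under the iteration.
 *
 *        struct cgroupfs_root *root;
 *        struct cgroup_subsys *ss;
 *
 *        for_each_active_root(root)
 *                for_each_subsys(root, ss)
 *                        printk(KERN_DEBUG "hierarchy %d: %s\n",
 *                               root->hierarchy_id, ss->name);
 */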

/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);

/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
        /*
         * List running through cg_cgroup_links associated with a
         * cgroup, anchored on cgroup->css_sets
         */
        struct list_head cgrp_link_list;
        struct cgroup *cgrp;
        /*
         * List running through cg_cgroup_links pointing at a
         * single css_set object, anchored on css_set->cg_links
         */
        struct list_head cg_link_list;
        struct css_set *cg;
};

/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */

static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

static int cgroup_init_idr(struct cgroup_subsys *ss,
                           struct cgroup_subsys_state *css);

/* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set. Nests outside task->alloc_lock
 * due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];

static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
{
        int i;
        int index;
        unsigned long tmp = 0UL;

        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
                tmp += (unsigned long)css[i];
        tmp = (tmp >> 16) ^ tmp;

        index = hash_long(tmp, CSS_SET_HASH_BITS);

        return &css_set_table[index];
}
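
/*
 * Illustrative note (an inference about intent, not original commentary):
 * folding with (tmp >> 16) ^ tmp mixes high-order pointer bits into the
 * low bits before hash_long(), so css pointer sums that differ only in
 * their upper bits still spread across the CSS_SET_TABLE_SIZE (128)
 * buckets of css_set_table.
 */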

/* We don't maintain the lists running through each css_set to its
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;

static void __put_css_set(struct css_set *cg, int taskexit)
{
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;
        /*
         * Ensure that the refcount doesn't hit zero while any readers
         * can see it. Similar to atomic_dec_and_lock(), but for an
         * rwlock
         */
        if (atomic_add_unless(&cg->refcount, -1, 1))
                return;
        write_lock(&css_set_lock);
        if (!atomic_dec_and_test(&cg->refcount)) {
                write_unlock(&css_set_lock);
                return;
        }

        /* This css_set is dead. unlink it and release cgroup refcounts */
        hlist_del(&cg->hlist);
        css_set_count--;

        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
                                 cg_link_list) {
                struct cgroup *cgrp = link->cgrp;
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }

                kfree(link);
        }

        write_unlock(&css_set_lock);
        kfree_rcu(cg, rcu_head);
}
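
/*
 * Explanatory note (added): atomic_add_unless(&cg->refcount, -1, 1)
 * decrements the count only when it is not 1, so the common "drop a
 * non-final reference" case never takes css_set_lock; only a thread
 * that might be dropping the last reference falls through to the
 * locked re-check above.
 */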

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cg)
{
        atomic_inc(&cg->refcount);
}

static inline void put_css_set(struct css_set *cg)
{
        __put_css_set(cg, 0);
}

static inline void put_css_set_taskexit(struct css_set *cg)
{
        __put_css_set(cg, 1);
}

/*
 * compare_css_sets - helper function for find_existing_css_set().
 * @cg: candidate css_set being tested
 * @old_cg: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cg" matches "old_cg" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cg,
                             struct css_set *old_cg,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
{
        struct list_head *l1, *l2;

        if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
                /* Not all subsystems matched */
                return false;
        }

        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies with no subsystems. We
         * could get by with just this check alone (and skip the
         * memcmp above) but on most setups the memcmp check will
         * avoid the need for this more expensive check on almost all
         * candidates.
         */

        l1 = &cg->cg_links;
        l2 = &old_cg->cg_links;
        while (1) {
                struct cg_cgroup_link *cgl1, *cgl2;
                struct cgroup *cg1, *cg2;

                l1 = l1->next;
                l2 = l2->next;
                /* See if we reached the end - both lists are equal length. */
                if (l1 == &cg->cg_links) {
                        BUG_ON(l2 != &old_cg->cg_links);
                        break;
                } else {
                        BUG_ON(l2 == &old_cg->cg_links);
                }
                /* Locate the cgroups associated with these links. */
                cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
                cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
                cg1 = cgl1->cgrp;
                cg2 = cgl2->cgrp;
                /* Hierarchies should be linked in the same order. */
                BUG_ON(cg1->root != cg2->root);

                /*
                 * If this hierarchy is the hierarchy of the cgroup
                 * that's changing, then we need to check that this
                 * css_set points to the new cgroup; if it's any other
                 * hierarchy, then this css_set should point to the
                 * same cgroup as the old css_set.
                 */
                if (cg1->root == new_cgrp->root) {
                        if (cg1 != new_cgrp)
                                return false;
                } else {
                        if (cg1 != cg2)
                                return false;
                }
        }
        return true;
}

/*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
 * css_set is suitable.
 *
 * oldcg: the cgroup group that we're using before the cgroup
 * transition
 *
 * cgrp: the cgroup that we're moving into
 *
 * template: location in which to build the desired set of subsystem
 * state objects for the new cgroup group
 */
static struct css_set *find_existing_css_set(
        struct css_set *oldcg,
        struct cgroup *cgrp,
        struct cgroup_subsys_state *template[])
{
        int i;
        struct cgroupfs_root *root = cgrp->root;
        struct hlist_head *hhead;
        struct hlist_node *node;
        struct css_set *cg;

        /*
         * Build the set of subsystem state objects that we want to see in the
         * new css_set. While subsystems can change globally, the entries here
         * won't change, so no need for locking.
         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                if (root->subsys_bits & (1UL << i)) {
                        /* Subsystem is in this hierarchy. So we want
                         * the subsystem state from the new
                         * cgroup */
                        template[i] = cgrp->subsys[i];
                } else {
                        /* Subsystem is not in this hierarchy, so we
                         * don't want to change the subsystem state */
                        template[i] = oldcg->subsys[i];
                }
        }

        hhead = css_set_hash(template);
        hlist_for_each_entry(cg, node, hhead, hlist) {
                if (!compare_css_sets(cg, oldcg, cgrp, template))
                        continue;

                /* This css_set matches what we need */
                return cg;
        }

        /* No existing cgroup group matched */
        return NULL;
}

static void free_cg_links(struct list_head *tmp)
{
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;

        list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
}

/*
 * allocate_cg_links() allocates "count" cg_cgroup_link structures
 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
 * success or a negative error
 */
static int allocate_cg_links(int count, struct list_head *tmp)
{
        struct cg_cgroup_link *link;
        int i;
        INIT_LIST_HEAD(tmp);
        for (i = 0; i < count; i++) {
                link = kmalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
                        free_cg_links(tmp);
                        return -ENOMEM;
                }
                list_add(&link->cgrp_link_list, tmp);
        }
        return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
 * @cg: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_cg_links,
                         struct css_set *cg, struct cgroup *cgrp)
{
        struct cg_cgroup_link *link;

        BUG_ON(list_empty(tmp_cg_links));
        link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
                                cgrp_link_list);
        link->cg = cg;
        link->cgrp = cgrp;
        atomic_inc(&cgrp->count);
        list_move(&link->cgrp_link_list, &cgrp->css_sets);
        /*
         * Always add links to the tail of the list so that the list
         * is sorted by order of hierarchy creation
         */
        list_add_tail(&link->cg_link_list, &cg->cg_links);
}

/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held
 */
static struct css_set *find_css_set(
        struct css_set *oldcg, struct cgroup *cgrp)
{
        struct css_set *res;
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

        struct list_head tmp_cg_links;

        struct hlist_head *hhead;
        struct cg_cgroup_link *link;

        /* First see if we already have a cgroup group that matches
         * the desired set */
        read_lock(&css_set_lock);
        res = find_existing_css_set(oldcg, cgrp, template);
        if (res)
                get_css_set(res);
        read_unlock(&css_set_lock);

        if (res)
                return res;

        res = kmalloc(sizeof(*res), GFP_KERNEL);
        if (!res)
                return NULL;

        /* Allocate all the cg_cgroup_link objects that we'll need */
        if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
                kfree(res);
                return NULL;
        }

        atomic_set(&res->refcount, 1);
        INIT_LIST_HEAD(&res->cg_links);
        INIT_LIST_HEAD(&res->tasks);
        INIT_HLIST_NODE(&res->hlist);

        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
        memcpy(res->subsys, template, sizeof(res->subsys));

        write_lock(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
        list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
                struct cgroup *c = link->cgrp;
                if (c->root == cgrp->root)
                        c = cgrp;
                link_css_set(&tmp_cg_links, res, c);
        }

        BUG_ON(!list_empty(&tmp_cg_links));

        css_set_count++;

        /* Add this cgroup group to the hash table */
        hhead = css_set_hash(res->subsys);
        hlist_add_head(&res->hlist, hhead);

        write_unlock(&css_set_lock);

        return res;
}
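
/*
 * Illustrative note (added): find_css_set() allocates root_count link
 * objects up front because the new css_set needs one cg_cgroup_link per
 * active hierarchy; link_css_set() then consumes them one at a time,
 * which is why the BUG_ON after the loop expects the temporary list to
 * be empty.
 */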

/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroupfs_root *root)
{
        struct css_set *css;
        struct cgroup *res = NULL;

        BUG_ON(!mutex_is_locked(&cgroup_mutex));
        read_lock(&css_set_lock);
        /*
         * No need to lock the task - since we hold cgroup_mutex the
         * task can't change groups, so the only thing that can happen
         * is that it exits and its css is set back to init_css_set.
         */
        css = task->cgroups;
        if (css == &init_css_set) {
                res = &root->top_cgroup;
        } else {
                struct cg_cgroup_link *link;
                list_for_each_entry(link, &css->cg_links, cg_link_list) {
                        struct cgroup *c = link->cgrp;
                        if (c->root == root) {
                                res = c;
                                break;
                        }
                }
        }
        read_unlock(&css_set_lock);
        BUG_ON(!res);
        return res;
}

/*
 * There is one global cgroup mutex. We also require taking
 * task_lock() when dereferencing a task's cgroup subsys pointers.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing. However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again. Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count). So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex. These are the two most performance
 * critical pieces of code here. The exception occurs on cgroup_exit(),
 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
 * is taken, and if the cgroup count is zero, a usermode call is made
 * to the release agent with the name of the cgroup (path relative to
 * the root of the cgroup file system) as the argument.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty. Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, top_cgroup
 * always has either children cgroups and/or using tasks. So we don't
 * need a special hack to ensure that top_cgroup cannot be deleted.
 *
 * The task_lock() exception
 *
 * The need for this exception arises from the action of
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
 * another. It does so using cgroup_mutex, however there are
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex. Therefore except as noted below, when dereferencing or, as
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S. One more locking exception. RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */
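
/*
 * Illustrative sketch of the task_lock() rule above (added; it mirrors
 * what cgroup_task_migrate() below actually does, and is not a new API):
 *
 *        task_lock(tsk);
 *        cg = tsk->cgroups;        // stable while task_lock is held
 *        get_css_set(cg);          // pin it before dropping the lock
 *        task_unlock(tsk);
 */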

/**
 * cgroup_lock - lock out any changes to cgroup structures
 *
 */
void cgroup_lock(void)
{
        mutex_lock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_lock);

/**
 * cgroup_unlock - release lock on cgroup changes
 *
 * Undo the lock taken in a previous cgroup_lock() call.
 */
void cgroup_unlock(void)
{
        mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unlock);

/*
 * A couple of forward declarations required, due to cyclic reference loop:
 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
 * -> cgroup_mkdir.
 */

static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;

static struct backing_dev_info cgroup_backing_dev_info = {
        .name = "cgroup",
        .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static int alloc_css_id(struct cgroup_subsys *ss,
                        struct cgroup *parent, struct cgroup *child);

static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
        struct inode *inode = new_inode(sb);

        if (inode) {
                inode->i_ino = get_next_ino();
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
        }
        return inode;
}

/*
 * Call subsys's pre_destroy handler.
 * This is called before css refcnt check.
 */
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
        struct cgroup_subsys *ss;
        int ret = 0;

        for_each_subsys(cgrp->root, ss)
                if (ss->pre_destroy) {
                        ret = ss->pre_destroy(ss, cgrp);
                        if (ret)
                                break;
                }

        return ret;
}

static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
        /* is dentry a directory ? if so, kfree() associated cgroup */
        if (S_ISDIR(inode->i_mode)) {
                struct cgroup *cgrp = dentry->d_fsdata;
                struct cgroup_subsys *ss;
                BUG_ON(!(cgroup_is_removed(cgrp)));
                /* It's possible for external users to be holding css
                 * reference counts on a cgroup; css_put() needs to
                 * be able to access the cgroup after decrementing
                 * the reference count in order to know if it needs to
                 * queue the cgroup to be handled by the release
                 * agent */
                synchronize_rcu();

                mutex_lock(&cgroup_mutex);
                /*
                 * Release the subsystem state objects.
                 */
                for_each_subsys(cgrp->root, ss)
                        ss->destroy(ss, cgrp);

                cgrp->root->number_of_cgroups--;
                mutex_unlock(&cgroup_mutex);

                /*
                 * Drop the active superblock reference that we took when we
                 * created the cgroup
                 */
                deactivate_super(cgrp->root->sb);

                /*
                 * if we're getting rid of the cgroup, refcount should ensure
                 * that there are no pidlists left.
                 */
                BUG_ON(!list_empty(&cgrp->pidlists));

                kfree_rcu(cgrp, rcu_head);
        }
        iput(inode);
}

static int cgroup_delete(const struct dentry *d)
{
        return 1;
}

static void remove_dir(struct dentry *d)
{
        struct dentry *parent = dget(d->d_parent);

        d_delete(d);
        simple_rmdir(parent->d_inode, d);
        dput(parent);
}

static void cgroup_clear_directory(struct dentry *dentry)
{
        struct list_head *node;

        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
        spin_lock(&dentry->d_lock);
        node = dentry->d_subdirs.next;
        while (node != &dentry->d_subdirs) {
                struct dentry *d = list_entry(node, struct dentry, d_u.d_child);

                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                list_del_init(node);
                if (d->d_inode) {
                        /* This should never be called on a cgroup
                         * directory with child cgroups */
                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
                        dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        spin_unlock(&dentry->d_lock);
                        d_delete(d);
                        simple_unlink(dentry->d_inode, d);
                        dput(d);
                        spin_lock(&dentry->d_lock);
                } else
                        spin_unlock(&d->d_lock);
                node = dentry->d_subdirs.next;
        }
        spin_unlock(&dentry->d_lock);
}

/*
 * NOTE : the dentry must have been dget()'ed
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
        struct dentry *parent;

        cgroup_clear_directory(dentry);

        parent = dentry->d_parent;
        spin_lock(&parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        list_del_init(&dentry->d_u.d_child);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&parent->d_lock);
        remove_dir(dentry);
}

/*
 * A queue for waiters to do rmdir() on a cgroup. A task will sleep when
 * cgroup->count == 0 && list_empty(&cgroup->children) && a subsys has some
 * reference to css->refcnt. In general, this refcnt is expected to go down
 * to zero, soon.
 *
 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
 */
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);

static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
        if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
                wake_up_all(&cgroup_rmdir_waitq);
}

void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
{
        css_get(css);
}

void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
{
        cgroup_wakeup_rmdir_waiter(css->cgroup);
        css_put(css);
}

/*
 * Call with cgroup_mutex held. Drops reference counts on modules, including
 * any duplicate ones that parse_cgroupfs_options took. If this function
 * returns an error, no reference counts are touched.
 */
static int rebind_subsystems(struct cgroupfs_root *root,
                             unsigned long final_bits)
{
        unsigned long added_bits, removed_bits;
        struct cgroup *cgrp = &root->top_cgroup;
        int i;

        BUG_ON(!mutex_is_locked(&cgroup_mutex));

        removed_bits = root->actual_subsys_bits & ~final_bits;
        added_bits = final_bits & ~root->actual_subsys_bits;
        /* Check that any added subsystems are currently free */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                unsigned long bit = 1UL << i;
                struct cgroup_subsys *ss = subsys[i];
                if (!(bit & added_bits))
                        continue;
                /*
                 * Nobody should tell us to do a subsys that doesn't exist:
                 * parse_cgroupfs_options should catch that case and refcounts
                 * ensure that subsystems won't disappear once selected.
                 */
                BUG_ON(ss == NULL);
                if (ss->root != &rootnode) {
                        /* Subsystem isn't free */
                        return -EBUSY;
                }
        }

        /* Currently we don't handle adding/removing subsystems when
         * any child cgroups exist. This is theoretically supportable
         * but involves complex error handling, so it's being left until
         * later */
        if (root->number_of_cgroups > 1)
                return -EBUSY;

        /* Process each subsystem */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                unsigned long bit = 1UL << i;
                if (bit & added_bits) {
                        /* We're binding this subsystem to this hierarchy */
                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
                        mutex_lock(&ss->hierarchy_mutex);
                        cgrp->subsys[i] = dummytop->subsys[i];
                        cgrp->subsys[i]->cgroup = cgrp;
                        list_move(&ss->sibling, &root->subsys_list);
                        ss->root = root;
                        if (ss->bind)
                                ss->bind(ss, cgrp);
                        mutex_unlock(&ss->hierarchy_mutex);
                        /* refcount was already taken, and we're keeping it */
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
                        mutex_lock(&ss->hierarchy_mutex);
                        if (ss->bind)
                                ss->bind(ss, dummytop);
                        dummytop->subsys[i]->cgroup = dummytop;
                        cgrp->subsys[i] = NULL;
                        subsys[i]->root = &rootnode;
                        list_move(&ss->sibling, &rootnode.subsys_list);
                        mutex_unlock(&ss->hierarchy_mutex);
                        /* subsystem is now free - drop reference on module */
                        module_put(ss->module);
                } else if (bit & final_bits) {
                        /* Subsystem state should already exist */
                        BUG_ON(ss == NULL);
                        BUG_ON(!cgrp->subsys[i]);
                        /*
                         * a refcount was taken, but we already had one, so
                         * drop the extra reference.
                         */
                        module_put(ss->module);
#ifdef CONFIG_MODULE_UNLOAD
                        BUG_ON(ss->module && !module_refcount(ss->module));
#endif
                } else {
                        /* Subsystem state shouldn't exist */
                        BUG_ON(cgrp->subsys[i]);
                }
        }
        root->subsys_bits = root->actual_subsys_bits = final_bits;
        synchronize_rcu();

        return 0;
}

static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
        struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
        struct cgroup_subsys *ss;

        mutex_lock(&cgroup_mutex);
        for_each_subsys(root, ss)
                seq_printf(seq, ",%s", ss->name);
        if (test_bit(ROOT_NOPREFIX, &root->flags))
                seq_puts(seq, ",noprefix");
        if (strlen(root->release_agent_path))
                seq_printf(seq, ",release_agent=%s", root->release_agent_path);
        if (clone_children(&root->top_cgroup))
                seq_puts(seq, ",clone_children");
        if (strlen(root->name))
                seq_printf(seq, ",name=%s", root->name);
        mutex_unlock(&cgroup_mutex);
        return 0;
}

struct cgroup_sb_opts {
        unsigned long subsys_bits;
        unsigned long flags;
        char *release_agent;
        bool clone_children;
        char *name;
        /* User explicitly requested empty subsystem */
        bool none;

        struct cgroupfs_root *new_root;

};

/*
 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
 * with cgroup_mutex held to protect the subsys[] array. This function takes
 * refcounts on subsystems to be used, unless it returns error, in which case
 * no refcounts are taken.
 */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
        char *token, *o = data;
        bool all_ss = false, one_ss = false;
        unsigned long mask = (unsigned long)-1;
        int i;
        bool module_pin_failed = false;

        BUG_ON(!mutex_is_locked(&cgroup_mutex));

#ifdef CONFIG_CPUSETS
        mask = ~(1UL << cpuset_subsys_id);
#endif

        memset(opts, 0, sizeof(*opts));

        while ((token = strsep(&o, ",")) != NULL) {
                if (!*token)
                        return -EINVAL;
                if (!strcmp(token, "none")) {
                        /* Explicitly have no subsystems */
                        opts->none = true;
                        continue;
                }
                if (!strcmp(token, "all")) {
                        /* Mutually exclusive option 'all' + subsystem name */
                        if (one_ss)
                                return -EINVAL;
                        all_ss = true;
                        continue;
                }
                if (!strcmp(token, "noprefix")) {
                        set_bit(ROOT_NOPREFIX, &opts->flags);
                        continue;
                }
                if (!strcmp(token, "clone_children")) {
                        opts->clone_children = true;
                        continue;
                }
                if (!strncmp(token, "release_agent=", 14)) {
                        /* Specifying two release agents is forbidden */
                        if (opts->release_agent)
                                return -EINVAL;
                        opts->release_agent =
                                kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
                        if (!opts->release_agent)
                                return -ENOMEM;
                        continue;
                }
                if (!strncmp(token, "name=", 5)) {
                        const char *name = token + 5;
                        /* Can't specify an empty name */
                        if (!strlen(name))
                                return -EINVAL;
                        /* Must match [\w.-]+ */
                        for (i = 0; i < strlen(name); i++) {
                                char c = name[i];
                                if (isalnum(c))
                                        continue;
                                if ((c == '.') || (c == '-') || (c == '_'))
                                        continue;
                                return -EINVAL;
                        }
                        /* Specifying two names is forbidden */
                        if (opts->name)
                                return -EINVAL;
                        opts->name = kstrndup(name,
                                              MAX_CGROUP_ROOT_NAMELEN - 1,
                                              GFP_KERNEL);
                        if (!opts->name)
                                return -ENOMEM;

                        continue;
                }

                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss == NULL)
                                continue;
                        if (strcmp(token, ss->name))
                                continue;
                        if (ss->disabled)
                                continue;

                        /* Mutually exclusive option 'all' + subsystem name */
                        if (all_ss)
                                return -EINVAL;
                        set_bit(i, &opts->subsys_bits);
                        one_ss = true;

                        break;
                }
                if (i == CGROUP_SUBSYS_COUNT)
                        return -ENOENT;
        }

        /*
         * If the 'all' option was specified select all the subsystems,
         * otherwise if 'all', 'none' and a subsystem name option were
         * not specified, let's default to 'all'
         */
        if (all_ss || (!all_ss && !one_ss && !opts->none)) {
                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss == NULL)
                                continue;
                        if (ss->disabled)
                                continue;
                        set_bit(i, &opts->subsys_bits);
                }
        }

        /* Consistency checks */

        /*
         * Option noprefix was introduced just for backward compatibility
         * with the old cpuset, so we allow noprefix only if mounting just
         * the cpuset subsystem.
         */
        if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
            (opts->subsys_bits & mask))
                return -EINVAL;


        /* Can't specify "none" and some subsystems */
        if (opts->subsys_bits && opts->none)
                return -EINVAL;

        /*
         * We either have to specify by name or by subsystems. (So all
         * empty hierarchies must have a name).
         */
        if (!opts->subsys_bits && !opts->name)
                return -EINVAL;

        /*
         * Grab references on all the modules we'll need, so the subsystems
         * don't dance around before rebind_subsystems attaches them. This may
         * take duplicate reference counts on a subsystem that's already used,
         * but rebind_subsystems handles this case.
         */
        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
                unsigned long bit = 1UL << i;

                if (!(bit & opts->subsys_bits))
                        continue;
                if (!try_module_get(subsys[i]->module)) {
                        module_pin_failed = true;
                        break;
                }
        }
        if (module_pin_failed) {
                /*
                 * oops, one of the modules was going away. this means that we
                 * raced with a module_delete call, and to the user this is
                 * essentially a "subsystem doesn't exist" case.
                 */
                for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
                        /* drop refcounts only on the ones we took */
                        unsigned long bit = 1UL << i;

                        if (!(bit & opts->subsys_bits))
                                continue;
                        module_put(subsys[i]->module);
                }
                return -ENOENT;
        }

        return 0;
}
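
/*
 * Illustrative option strings this parser accepts (added; the agent path
 * and hierarchy name are hypothetical examples, not mandated by the code):
 *
 *        "cpuset,noprefix"        - cpuset hierarchy with legacy unprefixed files
 *        "all,clone_children"     - every registered, enabled subsystem
 *        "none,name=mytree"       - a named hierarchy with no subsystems
 *        "cpu,release_agent=/sbin/cgroup_release"
 */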

static void drop_parsed_module_refcounts(unsigned long subsys_bits)
{
        int i;
        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
                unsigned long bit = 1UL << i;

                if (!(bit & subsys_bits))
                        continue;
                module_put(subsys[i]->module);
        }
}

static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
        int ret = 0;
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
        struct cgroup_sb_opts opts;

        mutex_lock(&cgrp->dentry->d_inode->i_mutex);
        mutex_lock(&cgroup_mutex);

        /* See what subsystems are wanted */
        ret = parse_cgroupfs_options(data, &opts);
        if (ret)
                goto out_unlock;

        /* Don't allow flags or name to change at remount */
        if (opts.flags != root->flags ||
            (opts.name && strcmp(opts.name, root->name))) {
                ret = -EINVAL;
                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
        }

        ret = rebind_subsystems(root, opts.subsys_bits);
        if (ret) {
                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
        }

        /* (re)populate subsystem files */
        cgroup_populate_dir(cgrp);

        if (opts.release_agent)
                strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
        kfree(opts.release_agent);
        kfree(opts.name);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
        return ret;
}

static const struct super_operations cgroup_ops = {
        .statfs = simple_statfs,
        .drop_inode = generic_delete_inode,
        .show_options = cgroup_show_options,
        .remount_fs = cgroup_remount,
};

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
        INIT_LIST_HEAD(&cgrp->sibling);
        INIT_LIST_HEAD(&cgrp->children);
        INIT_LIST_HEAD(&cgrp->css_sets);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
        INIT_LIST_HEAD(&cgrp->event_list);
        spin_lock_init(&cgrp->event_list_lock);
}

static void init_cgroup_root(struct cgroupfs_root *root)
{
        struct cgroup *cgrp = &root->top_cgroup;
        INIT_LIST_HEAD(&root->subsys_list);
        INIT_LIST_HEAD(&root->root_list);
        root->number_of_cgroups = 1;
        cgrp->root = root;
        cgrp->top_cgroup = cgrp;
        init_cgroup_housekeeping(cgrp);
}

static bool init_root_id(struct cgroupfs_root *root)
{
        int ret = 0;

        do {
                if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
                        return false;
                spin_lock(&hierarchy_id_lock);
                /* Try to allocate the next unused ID */
                ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
                                        &root->hierarchy_id);
                if (ret == -ENOSPC)
                        /* Try again starting from 0 */
                        ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
                if (!ret) {
                        next_hierarchy_id = root->hierarchy_id + 1;
                } else if (ret != -EAGAIN) {
                        /* Can only get here if the 31-bit IDR is full ... */
                        BUG_ON(ret);
                }
                spin_unlock(&hierarchy_id_lock);
        } while (ret);
        return true;
}

static int cgroup_test_super(struct super_block *sb, void *data)
{
        struct cgroup_sb_opts *opts = data;
        struct cgroupfs_root *root = sb->s_fs_info;

        /* If we asked for a name then it must match */
        if (opts->name && strcmp(opts->name, root->name))
                return 0;

        /*
         * If we asked for subsystems (or explicitly for no
         * subsystems) then they must match
         */
        if ((opts->subsys_bits || opts->none)
            && (opts->subsys_bits != root->subsys_bits))
                return 0;

        return 1;
}

static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
        struct cgroupfs_root *root;

        if (!opts->subsys_bits && !opts->none)
                return NULL;

        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        if (!init_root_id(root)) {
                kfree(root);
                return ERR_PTR(-ENOMEM);
        }
        init_cgroup_root(root);

        root->subsys_bits = opts->subsys_bits;
        root->flags = opts->flags;
        if (opts->release_agent)
                strcpy(root->release_agent_path, opts->release_agent);
        if (opts->name)
                strcpy(root->name, opts->name);
        if (opts->clone_children)
                set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
        return root;
}

static void cgroup_drop_root(struct cgroupfs_root *root)
{
        if (!root)
                return;

        BUG_ON(!root->hierarchy_id);
        spin_lock(&hierarchy_id_lock);
        ida_remove(&hierarchy_ida, root->hierarchy_id);
        spin_unlock(&hierarchy_id_lock);
        kfree(root);
}

static int cgroup_set_super(struct super_block *sb, void *data)
{
        int ret;
        struct cgroup_sb_opts *opts = data;

        /* If we don't have a new root, we can't set up a new sb */
        if (!opts->new_root)
                return -EINVAL;

        BUG_ON(!opts->subsys_bits && !opts->none);

        ret = set_anon_super(sb, NULL);
        if (ret)
                return ret;

        sb->s_fs_info = opts->new_root;
        opts->new_root->sb = sb;

        sb->s_blocksize = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
        sb->s_magic = CGROUP_SUPER_MAGIC;
        sb->s_op = &cgroup_ops;

        return 0;
}

static int cgroup_get_rootdir(struct super_block *sb)
{
        static const struct dentry_operations cgroup_dops = {
                .d_iput = cgroup_diput,
                .d_delete = cgroup_delete,
        };

        struct inode *inode =
                cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
        struct dentry *dentry;

        if (!inode)
                return -ENOMEM;

        inode->i_fop = &simple_dir_operations;
        inode->i_op = &cgroup_dir_inode_operations;
        /* directories start off with i_nlink == 2 (for "." entry) */
        inc_nlink(inode);
        dentry = d_alloc_root(inode);
        if (!dentry) {
                iput(inode);
                return -ENOMEM;
        }
        sb->s_root = dentry;
        /* for everything else we want ->d_op set */
        sb->s_d_op = &cgroup_dops;
        return 0;
}

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                                   int flags, const char *unused_dev_name,
                                   void *data)
{
        struct cgroup_sb_opts opts;
        struct cgroupfs_root *root;
        int ret = 0;
        struct super_block *sb;
        struct cgroupfs_root *new_root;

        /* First find the desired set of subsystems */
        mutex_lock(&cgroup_mutex);
        ret = parse_cgroupfs_options(data, &opts);
        mutex_unlock(&cgroup_mutex);
        if (ret)
                goto out_err;

        /*
         * Allocate a new cgroup root. We may not need it if we're
         * reusing an existing hierarchy.
         */
        new_root = cgroup_root_from_opts(&opts);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                goto drop_modules;
        }
        opts.new_root = new_root;

        /* Locate an existing or new sb for this hierarchy */
        sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                cgroup_drop_root(opts.new_root);
                goto drop_modules;
        }

        root = sb->s_fs_info;
        BUG_ON(!root);
        if (root == opts.new_root) {
                /* We used the new root structure, so this is a new hierarchy */
                struct list_head tmp_cg_links;
                struct cgroup *root_cgrp = &root->top_cgroup;
                struct inode *inode;
                struct cgroupfs_root *existing_root;
                int i;

                BUG_ON(sb->s_root != NULL);

                ret = cgroup_get_rootdir(sb);
                if (ret)
                        goto drop_new_super;
                inode = sb->s_root->d_inode;

                mutex_lock(&inode->i_mutex);
                mutex_lock(&cgroup_mutex);

                if (strlen(root->name)) {
                        /* Check for name clashes with existing mounts */
                        for_each_active_root(existing_root) {
                                if (!strcmp(existing_root->name, root->name)) {
                                        ret = -EBUSY;
                                        mutex_unlock(&cgroup_mutex);
                                        mutex_unlock(&inode->i_mutex);
                                        goto drop_new_super;
                                }
                        }
                }

                /*
                 * We're accessing css_set_count without locking
                 * css_set_lock here, but that's OK - it can only be
                 * increased by someone holding cgroup_lock, and
                 * that's us. The worst that can happen is that we
                 * have some link structures left over
                 */
                ret = allocate_cg_links(css_set_count, &tmp_cg_links);
                if (ret) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&inode->i_mutex);
                        goto drop_new_super;
                }

                ret = rebind_subsystems(root, root->subsys_bits);
                if (ret == -EBUSY) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&inode->i_mutex);
                        free_cg_links(&tmp_cg_links);
                        goto drop_new_super;
                }
                /*
                 * There must be no failure case after here, since rebinding
                 * takes care of subsystems' refcounts, which are explicitly
                 * dropped in the failure exit path.
                 */

                /* EBUSY should be the only error here */
                BUG_ON(ret);

                list_add(&root->root_list, &roots);
                root_count++;

                sb->s_root->d_fsdata = root_cgrp;
                root->top_cgroup.dentry = sb->s_root;

                /* Link the top cgroup in this hierarchy into all
                 * the css_set objects */
                write_lock(&css_set_lock);
                for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
                        struct hlist_head *hhead = &css_set_table[i];
                        struct hlist_node *node;
                        struct css_set *cg;

                        hlist_for_each_entry(cg, node, hhead, hlist)
                                link_css_set(&tmp_cg_links, cg, root_cgrp);
                }
                write_unlock(&css_set_lock);

                free_cg_links(&tmp_cg_links);

                BUG_ON(!list_empty(&root_cgrp->sibling));
                BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);

                cgroup_populate_dir(root_cgrp);
                mutex_unlock(&cgroup_mutex);
                mutex_unlock(&inode->i_mutex);
        } else {
                /*
                 * We re-used an existing hierarchy - the new root (if
                 * any) is not needed
                 */
                cgroup_drop_root(opts.new_root);
                /* no subsys rebinding, so refcounts don't change */
                drop_parsed_module_refcounts(opts.subsys_bits);
        }

        kfree(opts.release_agent);
        kfree(opts.name);
        return dget(sb->s_root);

 drop_new_super:
        deactivate_locked_super(sb);
 drop_modules:
        drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
        kfree(opts.release_agent);
        kfree(opts.name);
        return ERR_PTR(ret);
}

static void cgroup_kill_sb(struct super_block *sb) {
        struct cgroupfs_root *root = sb->s_fs_info;
        struct cgroup *cgrp = &root->top_cgroup;
        int ret;
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;

        BUG_ON(!root);

        BUG_ON(root->number_of_cgroups != 1);
        BUG_ON(!list_empty(&cgrp->children));
        BUG_ON(!list_empty(&cgrp->sibling));

        mutex_lock(&cgroup_mutex);

        /* Rebind all subsystems back to the default hierarchy */
        ret = rebind_subsystems(root, 0);
        /* Shouldn't be able to fail ... */
        BUG_ON(ret);

        /*
         * Release all the links from css_sets to this hierarchy's
         * root cgroup
         */
        write_lock(&css_set_lock);

        list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
                                 cgrp_link_list) {
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
        write_unlock(&css_set_lock);

        if (!list_empty(&root->root_list)) {
                list_del(&root->root_list);
                root_count--;
        }

        mutex_unlock(&cgroup_mutex);

        kill_litter_super(sb);
        cgroup_drop_root(root);
}

static struct file_system_type cgroup_fs_type = {
        .name = "cgroup",
        .mount = cgroup_mount,
        .kill_sb = cgroup_kill_sb,
};

static struct kobject *cgroup_kobj;

static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
        return dentry->d_fsdata;
}

static inline struct cftype *__d_cft(struct dentry *dentry)
{
        return dentry->d_fsdata;
}

/**
 * cgroup_path - generate the path of a cgroup
 * @cgrp: the cgroup in question
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Called with cgroup_mutex held or else with an RCU-protected cgroup
 * reference. Writes path of cgroup into buf. Returns 0 on success,
 * -errno on error.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
        char *start;
        struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
                                                      rcu_read_lock_held() ||
                                                      cgroup_lock_is_held());

        if (!dentry || cgrp == dummytop) {
                /*
                 * Inactive subsystems have no dentry for their root
                 * cgroup
                 */
                strcpy(buf, "/");
                return 0;
        }

        start = buf + buflen;

        *--start = '\0';
        for (;;) {
                int len = dentry->d_name.len;

                if ((start -= len) < buf)
                        return -ENAMETOOLONG;
                memcpy(start, dentry->d_name.name, len);
                cgrp = cgrp->parent;
                if (!cgrp)
                        break;

                dentry = rcu_dereference_check(cgrp->dentry,
                                               rcu_read_lock_held() ||
                                               cgroup_lock_is_held());
                if (!cgrp->parent)
                        continue;
                if (--start < buf)
                        return -ENAMETOOLONG;
                *start = '/';
        }
        memmove(buf, start, buf + buflen - start);
        return 0;
}
EXPORT_SYMBOL_GPL(cgroup_path);
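
/*
 * Illustrative caller sketch (added, hypothetical; any real caller must
 * satisfy the locking rule stated in the kernel-doc above):
 *
 *        char buf[PATH_MAX];
 *
 *        cgroup_lock();
 *        if (!cgroup_path(cgrp, buf, sizeof(buf)))
 *                printk(KERN_INFO "cgroup: %s\n", buf);
 *        cgroup_unlock();
 */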

/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 *
 * 'guarantee' is set if the caller promises that a new css_set for the task
 * will already exist. If not set, this function might sleep, and can fail with
 * -ENOMEM. Otherwise, it can only fail with -ESRCH.
 */
static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
                               struct task_struct *tsk, bool guarantee)
{
        struct css_set *oldcg;
        struct css_set *newcg;

        /*
         * get old css_set. we need to take task_lock and refcount it, because
         * an exiting task can change its css_set to init_css_set and drop its
         * old one without taking cgroup_mutex.
         */
        task_lock(tsk);
        oldcg = tsk->cgroups;
        get_css_set(oldcg);
        task_unlock(tsk);

        /* locate or allocate a new css_set for this task. */
        if (guarantee) {
                /* we know the css_set we want already exists. */
                struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
                read_lock(&css_set_lock);
                newcg = find_existing_css_set(oldcg, cgrp, template);
                BUG_ON(!newcg);
                get_css_set(newcg);
                read_unlock(&css_set_lock);
        } else {
                might_sleep();
                /* find_css_set will give us newcg already referenced. */
                newcg = find_css_set(oldcg, cgrp);
                if (!newcg) {
                        put_css_set(oldcg);
                        return -ENOMEM;
                }
        }
        put_css_set(oldcg);

        /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
        task_lock(tsk);
        if (tsk->flags & PF_EXITING) {
                task_unlock(tsk);
                put_css_set(newcg);
                return -ESRCH;
        }
        rcu_assign_pointer(tsk->cgroups, newcg);
        task_unlock(tsk);

        /* Update the css_set linked lists if we're using them */
        write_lock(&css_set_lock);
        if (!list_empty(&tsk->cg_list))
                list_move(&tsk->cg_list, &newcg->tasks);
        write_unlock(&css_set_lock);

        /*
         * We just gained a reference on oldcg by taking it from the task. As
         * trading it for newcg is protected by cgroup_mutex, we're safe to drop
         * it here; it will be freed under RCU.
         */
        put_css_set(oldcg);

        set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
        return 0;
}

/**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
 * @cgrp: the cgroup the task is attaching to
 * @tsk: the task to be attached
 *
 * Call holding cgroup_mutex. May take task_lock of
 * the task 'tsk' during call.
 */
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        int retval;
        struct cgroup_subsys *ss, *failed_ss = NULL;
        struct cgroup *oldcgrp;
        struct cgroupfs_root *root = cgrp->root;

        /* Nothing to do if the task is already in that cgroup */
        oldcgrp = task_cgroup_from_root(tsk, root);
        if (cgrp == oldcgrp)
                return 0;

        for_each_subsys(root, ss) {
                if (ss->can_attach) {
                        retval = ss->can_attach(ss, cgrp, tsk);
                        if (retval) {
                                /*
                                 * Remember on which subsystem the can_attach()
                                 * failed, so that we only call cancel_attach()
                                 * against the subsystems whose can_attach()
                                 * succeeded. (See below)
                                 */
                                failed_ss = ss;
                                goto out;
                        }
                }
                if (ss->can_attach_task) {
                        retval = ss->can_attach_task(cgrp, tsk);
                        if (retval) {
                                failed_ss = ss;
                                goto out;
                        }
                }
        }

        retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
        if (retval)
                goto out;

        for_each_subsys(root, ss) {
                if (ss->pre_attach)
                        ss->pre_attach(cgrp);
                if (ss->attach_task)
                        ss->attach_task(cgrp, tsk);
                if (ss->attach)
                        ss->attach(ss, cgrp, oldcgrp, tsk);
        }

        synchronize_rcu();

        /*
         * wake up rmdir() waiter. the rmdir should fail since the cgroup
         * is no longer empty.
         */
        cgroup_wakeup_rmdir_waiter(cgrp);
out:
        if (retval) {
                for_each_subsys(root, ss) {
                        if (ss == failed_ss)
                                /*
                                 * This subsystem was the one that failed the
                                 * can_attach() check earlier, so we don't need
                                 * to call cancel_attach() against it or any
                                 * remaining subsystems.
                                 */
                                break;
                        if (ss->cancel_attach)
                                ss->cancel_attach(ss, cgrp, tsk);
                }
        }
        return retval;
}
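
/*
 * Explanatory note on the error path above (added): failed_ss records the
 * first subsystem whose can_attach()/can_attach_task() rejected the move,
 * so the rollback loop calls cancel_attach() only for the subsystems that
 * had already agreed, stopping when it reaches the one that failed.
 */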

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
        struct cgroupfs_root *root;
        int retval = 0;

        cgroup_lock();
        for_each_active_root(root) {
                struct cgroup *from_cg = task_cgroup_from_root(from, root);

                retval = cgroup_attach_task(from_cg, tsk);
                if (retval)
                        break;
        }
        cgroup_unlock();

        return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

/*
 * cgroup_attach_proc works in two stages, the first of which prefetches all
 * new css_sets needed (to make sure we have enough memory before committing
 * to the move) and stores them in a list of entries of the following type.
 * TODO: possible optimization: use css_set->rcu_head for chaining instead
 */
struct cg_list_entry {
        struct css_set *cg;
        struct list_head links;
};

static bool css_set_check_fetched(struct cgroup *cgrp,
                                  struct task_struct *tsk, struct css_set *cg,
                                  struct list_head *newcg_list)
{
        struct css_set *newcg;
        struct cg_list_entry *cg_entry;
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];

        read_lock(&css_set_lock);
        newcg = find_existing_css_set(cg, cgrp, template);
        if (newcg)
                get_css_set(newcg);
        read_unlock(&css_set_lock);

        /* doesn't exist at all? */
        if (!newcg)
                return false;
        /* see if it's already in the list */
        list_for_each_entry(cg_entry, newcg_list, links) {
                if (cg_entry->cg == newcg) {
                        put_css_set(newcg);
                        return true;
                }
        }

        /* not found */
        put_css_set(newcg);
        return false;
}

/*
 * Find the new css_set and store it in the list in preparation for moving the
 * given task to the given cgroup. Returns 0 or -ENOMEM.
 */
static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
                            struct list_head *newcg_list)
{
        struct css_set *newcg;
        struct cg_list_entry *cg_entry;

        /* ensure a new css_set will exist for this thread */
        newcg = find_css_set(cg, cgrp);
        if (!newcg)
                return -ENOMEM;
        /* add it to the list */
        cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
        if (!cg_entry) {
                put_css_set(newcg);
                return -ENOMEM;
        }
        cg_entry->cg = newcg;
        list_add(&cg_entry->links, newcg_list);
        return 0;
}

/**
 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @leader: the threadgroup leader task_struct of the group to be attached
 *
 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
 * take task_lock of each thread in leader's threadgroup individually in turn.
 */
int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
        int retval, i, group_size;
        struct cgroup_subsys *ss, *failed_ss = NULL;
        bool cancel_failed_ss = false;
        /* guaranteed to be initialized later, but the compiler needs this */
        struct cgroup *oldcgrp = NULL;
        struct css_set *oldcg;
        struct cgroupfs_root *root = cgrp->root;
        /* threadgroup list cursor and array */
        struct task_struct *tsk;
        struct flex_array *group;
        /*
         * we need to make sure we have css_sets for all the tasks we're
         * going to move -before- we actually start moving them, so that in
         * case we get an ENOMEM we can bail out before making any changes.
         */
        struct list_head newcg_list;
        struct cg_list_entry *cg_entry, *temp_nobe;

        /*
         * step 0: in order to do expensive, possibly blocking operations for
         * every thread, we cannot iterate the thread group list, since it needs
         * rcu or tasklist locked. instead, build an array of all threads in the
         * group - threadgroup_fork_lock prevents new threads from appearing,
         * and if threads exit, this will just be an over-estimate.
         */
        group_size = get_nr_threads(leader);
        /* flex_array supports very large thread-groups better than kmalloc. */
        group = flex_array_alloc(sizeof(struct task_struct *), group_size,
2018
GFP_KERNEL);
2019
if (!group)
2020
return -ENOMEM;
2021
/* pre-allocate to guarantee space while iterating in rcu read-side. */
2022
retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
2023
if (retval)
2024
goto out_free_group_list;
2025
2026
/* prevent changes to the threadgroup list while we take a snapshot. */
2027
rcu_read_lock();
2028
if (!thread_group_leader(leader)) {
2029
/*
2030
* a race with de_thread from another thread's exec() may strip
2031
* us of our leadership, making while_each_thread unsafe to use
2032
* on this task. if this happens, there is no choice but to
2033
* throw this task away and try again (from cgroup_procs_write);
2034
* this is "double-double-toil-and-trouble-check locking".
2035
*/
2036
rcu_read_unlock();
2037
retval = -EAGAIN;
2038
goto out_free_group_list;
2039
}
2040
/* take a reference on each task in the group to go in the array. */
2041
tsk = leader;
2042
i = 0;
2043
do {
2044
/* as per above, nr_threads may decrease, but not increase. */
2045
BUG_ON(i >= group_size);
2046
get_task_struct(tsk);
2047
/*
2048
* saying GFP_ATOMIC has no effect here because we did prealloc
2049
* earlier, but it's good form to communicate our expectations.
2050
*/
2051
retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
2052
BUG_ON(retval != 0);
2053
i++;
2054
} while_each_thread(leader, tsk);
2055
/* remember the number of threads in the array for later. */
2056
group_size = i;
2057
rcu_read_unlock();
2058
2059
/*
2060
* step 1: check that we can legitimately attach to the cgroup.
2061
*/
2062
for_each_subsys(root, ss) {
2063
if (ss->can_attach) {
2064
retval = ss->can_attach(ss, cgrp, leader);
2065
if (retval) {
2066
failed_ss = ss;
2067
goto out_cancel_attach;
2068
}
2069
}
2070
/* a callback to be run on every thread in the threadgroup. */
2071
if (ss->can_attach_task) {
2072
/* run on each task in the threadgroup. */
2073
for (i = 0; i < group_size; i++) {
2074
tsk = flex_array_get_ptr(group, i);
2075
retval = ss->can_attach_task(cgrp, tsk);
2076
if (retval) {
2077
failed_ss = ss;
2078
cancel_failed_ss = true;
2079
goto out_cancel_attach;
2080
}
2081
}
2082
}
2083
}
2084
2085
/*
2086
* step 2: make sure css_sets exist for all threads to be migrated.
2087
* we use find_css_set, which allocates a new one if necessary.
2088
*/
2089
INIT_LIST_HEAD(&newcg_list);
2090
for (i = 0; i < group_size; i++) {
2091
tsk = flex_array_get_ptr(group, i);
2092
/* nothing to do if this task is already in the cgroup */
2093
oldcgrp = task_cgroup_from_root(tsk, root);
2094
if (cgrp == oldcgrp)
2095
continue;
2096
/* get old css_set pointer */
2097
task_lock(tsk);
2098
if (tsk->flags & PF_EXITING) {
2099
/* ignore this task if it's going away */
2100
task_unlock(tsk);
2101
continue;
2102
}
2103
oldcg = tsk->cgroups;
2104
get_css_set(oldcg);
2105
task_unlock(tsk);
2106
/* see if the new css_set for this task is already in the list */
2107
if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2108
/* was already there, nothing to do. */
2109
put_css_set(oldcg);
2110
} else {
2111
/* we don't already have it. get new one. */
2112
retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2113
put_css_set(oldcg);
2114
if (retval)
2115
goto out_list_teardown;
2116
}
2117
}
2118
2119
/*
2120
* step 3: now that we're guaranteed success wrt the css_sets, proceed
2121
* to move all tasks to the new cgroup, calling ss->attach_task for each
2122
* one along the way. there are no failure cases after here, so this is
2123
* the commit point.
2124
*/
2125
for_each_subsys(root, ss) {
2126
if (ss->pre_attach)
2127
ss->pre_attach(cgrp);
2128
}
2129
for (i = 0; i < group_size; i++) {
2130
tsk = flex_array_get_ptr(group, i);
2131
/* leave current thread as it is if it's already there */
2132
oldcgrp = task_cgroup_from_root(tsk, root);
2133
if (cgrp == oldcgrp)
2134
continue;
2135
/* attach each task to each subsystem */
2136
for_each_subsys(root, ss) {
2137
if (ss->attach_task)
2138
ss->attach_task(cgrp, tsk);
2139
}
2140
/* if the thread is PF_EXITING, it can just get skipped. */
2141
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2142
BUG_ON(retval != 0 && retval != -ESRCH);
2143
}
2144
/* nothing is sensitive to fork() after this point. */
2145
2146
/*
2147
* step 4: do expensive, non-thread-specific subsystem callbacks.
2148
* TODO: if ever a subsystem needs to know the oldcgrp for each task
2149
* being moved, this call will need to be reworked to communicate that.
2150
*/
2151
for_each_subsys(root, ss) {
2152
if (ss->attach)
2153
ss->attach(ss, cgrp, oldcgrp, leader);
2154
}
2155
2156
/*
2157
* step 5: success! and cleanup
2158
*/
2159
synchronize_rcu();
2160
cgroup_wakeup_rmdir_waiter(cgrp);
2161
retval = 0;
2162
out_list_teardown:
2163
/* clean up the list of prefetched css_sets. */
2164
list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
2165
list_del(&cg_entry->links);
2166
put_css_set(cg_entry->cg);
2167
kfree(cg_entry);
2168
}
2169
out_cancel_attach:
2170
/* same deal as in cgroup_attach_task */
2171
if (retval) {
2172
for_each_subsys(root, ss) {
2173
if (ss == failed_ss) {
2174
if (cancel_failed_ss && ss->cancel_attach)
2175
ss->cancel_attach(ss, cgrp, leader);
2176
break;
2177
}
2178
if (ss->cancel_attach)
2179
ss->cancel_attach(ss, cgrp, leader);
2180
}
2181
}
2182
/* clean up the array of referenced threads in the group. */
2183
for (i = 0; i < group_size; i++) {
2184
tsk = flex_array_get_ptr(group, i);
2185
put_task_struct(tsk);
2186
}
2187
out_free_group_list:
2188
flex_array_free(group);
2189
return retval;
2190
}
2191
2192
/*
2193
* Find the task_struct of the task to attach by vpid and pass it along to the
2194
* function to attach either it or all tasks in its threadgroup. Will take
2195
* cgroup_mutex; may take task_lock of task.
2196
*/
2197
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2198
{
2199
struct task_struct *tsk;
2200
const struct cred *cred = current_cred(), *tcred;
2201
int ret;
2202
2203
if (!cgroup_lock_live_group(cgrp))
2204
return -ENODEV;
2205
2206
if (pid) {
2207
rcu_read_lock();
2208
tsk = find_task_by_vpid(pid);
2209
if (!tsk) {
2210
rcu_read_unlock();
2211
cgroup_unlock();
2212
return -ESRCH;
2213
}
2214
if (threadgroup) {
2215
/*
2216
* RCU protects this access, since tsk was found in the
2217
* tid map. a race with de_thread may cause group_leader
2218
* to stop being the leader, but cgroup_attach_proc will
2219
* detect it later.
2220
*/
2221
tsk = tsk->group_leader;
2222
} else if (tsk->flags & PF_EXITING) {
2223
/* optimization for the single-task-only case */
2224
rcu_read_unlock();
2225
cgroup_unlock();
2226
return -ESRCH;
2227
}
2228
2229
/*
2230
* even if we're attaching all tasks in the thread group, we
2231
* only need to check permissions on one of them.
2232
*/
2233
tcred = __task_cred(tsk);
2234
if (cred->euid &&
2235
cred->euid != tcred->uid &&
2236
cred->euid != tcred->suid) {
2237
rcu_read_unlock();
2238
cgroup_unlock();
2239
return -EACCES;
2240
}
2241
get_task_struct(tsk);
2242
rcu_read_unlock();
2243
} else {
2244
if (threadgroup)
2245
tsk = current->group_leader;
2246
else
2247
tsk = current;
2248
get_task_struct(tsk);
2249
}
2250
2251
if (threadgroup) {
2252
threadgroup_fork_write_lock(tsk);
2253
ret = cgroup_attach_proc(cgrp, tsk);
2254
threadgroup_fork_write_unlock(tsk);
2255
} else {
2256
ret = cgroup_attach_task(cgrp, tsk);
2257
}
2258
put_task_struct(tsk);
2259
cgroup_unlock();
2260
return ret;
2261
}
2262
2263
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2264
{
2265
return attach_task_by_pid(cgrp, pid, false);
2266
}
2267
2268
static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2269
{
2270
int ret;
2271
do {
2272
/*
2273
* attach_proc fails with -EAGAIN if threadgroup leadership
2274
* changes in the middle of the operation, in which case we need
2275
* to find the task_struct for the new leader and start over.
2276
*/
2277
ret = attach_task_by_pid(cgrp, tgid, true);
2278
} while (ret == -EAGAIN);
2279
return ret;
2280
}
2281
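/*
 * From userspace the -EAGAIN retry above is invisible: moving a whole
 * thread group is a single write of the leader's TGID to the cgroup's
 * "cgroup.procs" file. A minimal sketch; the mount point and group
 * name below are assumptions:
 */
#if 0
#include <stdio.h>

static int example_move_threadgroup(pid_t tgid)
{
	FILE *f = fopen("/cgroup/mygroup/cgroup.procs", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", tgid);	/* writing 0 moves the caller's group */
	return fclose(f);
}
#endif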
2282
/**
2283
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2284
* @cgrp: the cgroup to be checked for liveness
2285
*
2286
* On success, returns true; the lock should be later released with
2287
* cgroup_unlock(). On failure returns false with no lock held.
2288
*/
2289
bool cgroup_lock_live_group(struct cgroup *cgrp)
2290
{
2291
mutex_lock(&cgroup_mutex);
2292
if (cgroup_is_removed(cgrp)) {
2293
mutex_unlock(&cgroup_mutex);
2294
return false;
2295
}
2296
return true;
2297
}
2298
EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2299
2300
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301
const char *buffer)
2302
{
2303
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2304
if (strlen(buffer) >= PATH_MAX)
2305
return -EINVAL;
2306
if (!cgroup_lock_live_group(cgrp))
2307
return -ENODEV;
2308
strcpy(cgrp->root->release_agent_path, buffer);
2309
cgroup_unlock();
2310
return 0;
2311
}
2312
2313
static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2314
struct seq_file *seq)
2315
{
2316
if (!cgroup_lock_live_group(cgrp))
2317
return -ENODEV;
2318
seq_puts(seq, cgrp->root->release_agent_path);
2319
seq_putc(seq, '\n');
2320
cgroup_unlock();
2321
return 0;
2322
}
2323
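/*
 * Usage note: writing an absolute path (at most PATH_MAX - 1 bytes),
 * e.g. "/sbin/example_agent" (a made-up name), into this file sets the
 * program the kernel spawns when a notify_on_release cgroup becomes
 * empty; reading the file returns the current path followed by a
 * newline.
 */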
2324
/* A buffer size big enough for numbers or short strings */
2325
#define CGROUP_LOCAL_BUFFER_SIZE 64
2326
2327
static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2328
struct file *file,
2329
const char __user *userbuf,
2330
size_t nbytes, loff_t *unused_ppos)
2331
{
2332
char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2333
int retval = 0;
2334
char *end;
2335
2336
if (!nbytes)
2337
return -EINVAL;
2338
if (nbytes >= sizeof(buffer))
2339
return -E2BIG;
2340
if (copy_from_user(buffer, userbuf, nbytes))
2341
return -EFAULT;
2342
2343
buffer[nbytes] = 0; /* nul-terminate */
2344
if (cft->write_u64) {
2345
u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2346
if (*end)
2347
return -EINVAL;
2348
retval = cft->write_u64(cgrp, cft, val);
2349
} else {
2350
s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2351
if (*end)
2352
return -EINVAL;
2353
retval = cft->write_s64(cgrp, cft, val);
2354
}
2355
if (!retval)
2356
retval = nbytes;
2357
return retval;
2358
}
2359
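/*
 * Example of the parsing above: writing "  42\n" to a write_u64 file
 * is stripped to "42" and results in cft->write_u64(cgrp, cft, 42),
 * while "42x" leaves *end pointing at 'x' and fails with -EINVAL.
 */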
2360
static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2361
struct file *file,
2362
const char __user *userbuf,
2363
size_t nbytes, loff_t *unused_ppos)
2364
{
2365
char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2366
int retval = 0;
2367
size_t max_bytes = cft->max_write_len;
2368
char *buffer = local_buffer;
2369
2370
if (!max_bytes)
2371
max_bytes = sizeof(local_buffer) - 1;
2372
if (nbytes >= max_bytes)
2373
return -E2BIG;
2374
/* Allocate a dynamic buffer if we need one */
2375
if (nbytes >= sizeof(local_buffer)) {
2376
buffer = kmalloc(nbytes + 1, GFP_KERNEL);
2377
if (buffer == NULL)
2378
return -ENOMEM;
2379
}
2380
if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
2381
retval = -EFAULT;
2382
goto out;
2383
}
2384
2385
buffer[nbytes] = 0; /* nul-terminate */
2386
retval = cft->write_string(cgrp, cft, strstrip(buffer));
2387
if (!retval)
2388
retval = nbytes;
2389
out:
2390
if (buffer != local_buffer)
2391
kfree(buffer);
2392
return retval;
2393
}
2394
2395
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2396
size_t nbytes, loff_t *ppos)
2397
{
2398
struct cftype *cft = __d_cft(file->f_dentry);
2399
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2400
2401
if (cgroup_is_removed(cgrp))
2402
return -ENODEV;
2403
if (cft->write)
2404
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
2405
if (cft->write_u64 || cft->write_s64)
2406
return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
2407
if (cft->write_string)
2408
return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
2409
if (cft->trigger) {
2410
int ret = cft->trigger(cgrp, (unsigned int)cft->private);
2411
return ret ? ret : nbytes;
2412
}
2413
return -EINVAL;
2414
}
2415
2416
static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
2417
struct file *file,
2418
char __user *buf, size_t nbytes,
2419
loff_t *ppos)
2420
{
2421
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2422
u64 val = cft->read_u64(cgrp, cft);
2423
int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2424
2425
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2426
}
2427
2428
static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
2429
struct file *file,
2430
char __user *buf, size_t nbytes,
2431
loff_t *ppos)
2432
{
2433
char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2434
s64 val = cft->read_s64(cgrp, cft);
2435
int len = sprintf(tmp, "%lld\n", (long long) val);
2436
2437
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2438
}
2439
2440
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2441
size_t nbytes, loff_t *ppos)
2442
{
2443
struct cftype *cft = __d_cft(file->f_dentry);
2444
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2445
2446
if (cgroup_is_removed(cgrp))
2447
return -ENODEV;
2448
2449
if (cft->read)
2450
return cft->read(cgrp, cft, file, buf, nbytes, ppos);
2451
if (cft->read_u64)
2452
return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
2453
if (cft->read_s64)
2454
return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
2455
return -EINVAL;
2456
}
2457
2458
/*
2459
* seqfile ops/methods for returning structured data. Currently just
2460
* supports string->u64 maps, but can be extended in future.
2461
*/
2462
2463
struct cgroup_seqfile_state {
2464
struct cftype *cft;
2465
struct cgroup *cgroup;
2466
};
2467
2468
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2469
{
2470
struct seq_file *sf = cb->state;
2471
return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
2472
}
2473
2474
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2475
{
2476
struct cgroup_seqfile_state *state = m->private;
2477
struct cftype *cft = state->cft;
2478
if (cft->read_map) {
2479
struct cgroup_map_cb cb = {
2480
.fill = cgroup_map_add,
2481
.state = m,
2482
};
2483
return cft->read_map(state->cgroup, cft, &cb);
2484
}
2485
return cft->read_seq_string(state->cgroup, cft, m);
2486
}
2487
2488
static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2489
{
2490
struct seq_file *seq = file->private_data;
2491
kfree(seq->private);
2492
return single_release(inode, file);
2493
}
2494
2495
static const struct file_operations cgroup_seqfile_operations = {
2496
.read = seq_read,
2497
.write = cgroup_file_write,
2498
.llseek = seq_lseek,
2499
.release = cgroup_seqfile_release,
2500
};
2501
2502
static int cgroup_file_open(struct inode *inode, struct file *file)
2503
{
2504
int err;
2505
struct cftype *cft;
2506
2507
err = generic_file_open(inode, file);
2508
if (err)
2509
return err;
2510
cft = __d_cft(file->f_dentry);
2511
2512
if (cft->read_map || cft->read_seq_string) {
2513
struct cgroup_seqfile_state *state =
2514
kzalloc(sizeof(*state), GFP_USER);
2515
if (!state)
2516
return -ENOMEM;
2517
state->cft = cft;
2518
state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2519
file->f_op = &cgroup_seqfile_operations;
2520
err = single_open(file, cgroup_seqfile_show, state);
2521
if (err < 0)
2522
kfree(state);
2523
} else if (cft->open)
2524
err = cft->open(inode, file);
2525
else
2526
err = 0;
2527
2528
return err;
2529
}
2530
2531
static int cgroup_file_release(struct inode *inode, struct file *file)
2532
{
2533
struct cftype *cft = __d_cft(file->f_dentry);
2534
if (cft->release)
2535
return cft->release(inode, file);
2536
return 0;
2537
}
2538
2539
/*
2540
* cgroup_rename - Only allow simple rename of directories in place.
2541
*/
2542
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2543
struct inode *new_dir, struct dentry *new_dentry)
2544
{
2545
if (!S_ISDIR(old_dentry->d_inode->i_mode))
2546
return -ENOTDIR;
2547
if (new_dentry->d_inode)
2548
return -EEXIST;
2549
if (old_dir != new_dir)
2550
return -EIO;
2551
return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2552
}
2553
2554
static const struct file_operations cgroup_file_operations = {
2555
.read = cgroup_file_read,
2556
.write = cgroup_file_write,
2557
.llseek = generic_file_llseek,
2558
.open = cgroup_file_open,
2559
.release = cgroup_file_release,
2560
};
2561
2562
static const struct inode_operations cgroup_dir_inode_operations = {
2563
.lookup = cgroup_lookup,
2564
.mkdir = cgroup_mkdir,
2565
.rmdir = cgroup_rmdir,
2566
.rename = cgroup_rename,
2567
};
2568
2569
static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
2570
{
2571
if (dentry->d_name.len > NAME_MAX)
2572
return ERR_PTR(-ENAMETOOLONG);
2573
d_add(dentry, NULL);
2574
return NULL;
2575
}
2576
2577
/*
2578
* Check if a file is a control file
2579
*/
2580
static inline struct cftype *__file_cft(struct file *file)
2581
{
2582
if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2583
return ERR_PTR(-EINVAL);
2584
return __d_cft(file->f_dentry);
2585
}
2586
2587
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2588
struct super_block *sb)
2589
{
2590
struct inode *inode;
2591
2592
if (!dentry)
2593
return -ENOENT;
2594
if (dentry->d_inode)
2595
return -EEXIST;
2596
2597
inode = cgroup_new_inode(mode, sb);
2598
if (!inode)
2599
return -ENOMEM;
2600
2601
if (S_ISDIR(mode)) {
2602
inode->i_op = &cgroup_dir_inode_operations;
2603
inode->i_fop = &simple_dir_operations;
2604
2605
/* start off with i_nlink == 2 (for "." entry) */
2606
inc_nlink(inode);
2607
2608
/* start with the directory inode held, so that we can
2609
* populate it without racing with another mkdir */
2610
mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2611
} else if (S_ISREG(mode)) {
2612
inode->i_size = 0;
2613
inode->i_fop = &cgroup_file_operations;
2614
}
2615
d_instantiate(dentry, inode);
2616
dget(dentry); /* Extra count - pin the dentry in core */
2617
return 0;
2618
}
2619
2620
/*
2621
* cgroup_create_dir - create a directory for an object.
2622
* @cgrp: the cgroup we create the directory for. It must have a valid
2623
* ->parent field. And we are going to fill its ->dentry field.
2624
* @dentry: dentry of the new cgroup
2625
* @mode: mode to set on new directory.
2626
*/
2627
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2628
mode_t mode)
2629
{
2630
struct dentry *parent;
2631
int error = 0;
2632
2633
parent = cgrp->parent->dentry;
2634
error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2635
if (!error) {
2636
dentry->d_fsdata = cgrp;
2637
inc_nlink(parent->d_inode);
2638
rcu_assign_pointer(cgrp->dentry, dentry);
2639
dget(dentry);
2640
}
2641
dput(dentry);
2642
2643
return error;
2644
}
2645
2646
/**
2647
* cgroup_file_mode - deduce file mode of a control file
2648
* @cft: the control file in question
2649
*
2650
* returns cft->mode if ->mode is not 0
2651
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2652
* returns S_IRUGO if it has only a read handler
2653
* returns S_IWUSR if it has only a write handler
2654
*/
2655
static mode_t cgroup_file_mode(const struct cftype *cft)
2656
{
2657
mode_t mode = 0;
2658
2659
if (cft->mode)
2660
return cft->mode;
2661
2662
if (cft->read || cft->read_u64 || cft->read_s64 ||
2663
cft->read_map || cft->read_seq_string)
2664
mode |= S_IRUGO;
2665
2666
if (cft->write || cft->write_u64 || cft->write_s64 ||
2667
cft->write_string || cft->trigger)
2668
mode |= S_IWUSR;
2669
2670
return mode;
2671
}
2672
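/*
 * For example, the "notify_on_release" file defined later in this file
 * has read_u64 and write_u64 handlers and no explicit ->mode, so its
 * mode is deduced as S_IRUGO | S_IWUSR (0644); a file with only a read
 * handler would get S_IRUGO (0444).
 */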
2673
int cgroup_add_file(struct cgroup *cgrp,
2674
struct cgroup_subsys *subsys,
2675
const struct cftype *cft)
2676
{
2677
struct dentry *dir = cgrp->dentry;
2678
struct dentry *dentry;
2679
int error;
2680
mode_t mode;
2681
2682
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2683
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2684
strcpy(name, subsys->name);
2685
strcat(name, ".");
2686
}
2687
strcat(name, cft->name);
2688
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2689
dentry = lookup_one_len(name, dir, strlen(name));
2690
if (!IS_ERR(dentry)) {
2691
mode = cgroup_file_mode(cft);
2692
error = cgroup_create_file(dentry, mode | S_IFREG,
2693
cgrp->root->sb);
2694
if (!error)
2695
dentry->d_fsdata = (void *)cft;
2696
dput(dentry);
2697
} else
2698
error = PTR_ERR(dentry);
2699
return error;
2700
}
2701
EXPORT_SYMBOL_GPL(cgroup_add_file);
2702
2703
int cgroup_add_files(struct cgroup *cgrp,
2704
struct cgroup_subsys *subsys,
2705
const struct cftype cft[],
2706
int count)
2707
{
2708
int i, err;
2709
for (i = 0; i < count; i++) {
2710
err = cgroup_add_file(cgrp, subsys, &cft[i]);
2711
if (err)
2712
return err;
2713
}
2714
return 0;
2715
}
2716
EXPORT_SYMBOL_GPL(cgroup_add_files);
2717
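/*
 * Illustrative sketch of a subsystem exporting a control file through
 * this interface; the subsystem, file name, helper and field are all
 * made up:
 */
#if 0
static u64 example_limit_read(struct cgroup *cgrp, struct cftype *cft)
{
	return example_css(cgrp)->limit;	/* hypothetical accessor */
}

static struct cftype example_files[] = {
	{
		.name = "limit",
		.read_u64 = example_limit_read,
	},
};

/* called back from cgroup_populate_dir() for each cgroup directory */
static int example_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* creates "example.limit" unless the root was mounted noprefix */
	return cgroup_add_files(cgrp, ss, example_files,
				ARRAY_SIZE(example_files));
}
#endif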
2718
/**
2719
* cgroup_task_count - count the number of tasks in a cgroup.
2720
* @cgrp: the cgroup in question
2721
*
2722
* Return the number of tasks in the cgroup.
2723
*/
2724
int cgroup_task_count(const struct cgroup *cgrp)
2725
{
2726
int count = 0;
2727
struct cg_cgroup_link *link;
2728
2729
read_lock(&css_set_lock);
2730
list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
2731
count += atomic_read(&link->cg->refcount);
2732
}
2733
read_unlock(&css_set_lock);
2734
return count;
2735
}
2736
2737
/*
2738
* Advance a list_head iterator. The iterator should be positioned at
2739
* the start of a css_set
2740
*/
2741
static void cgroup_advance_iter(struct cgroup *cgrp,
2742
struct cgroup_iter *it)
2743
{
2744
struct list_head *l = it->cg_link;
2745
struct cg_cgroup_link *link;
2746
struct css_set *cg;
2747
2748
/* Advance to the next non-empty css_set */
2749
do {
2750
l = l->next;
2751
if (l == &cgrp->css_sets) {
2752
it->cg_link = NULL;
2753
return;
2754
}
2755
link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
2756
cg = link->cg;
2757
} while (list_empty(&cg->tasks));
2758
it->cg_link = l;
2759
it->task = cg->tasks.next;
2760
}
2761
2762
/*
2763
* To reduce the fork() overhead for systems that are not actually
2764
* using their cgroups capability, we don't maintain the lists running
2765
* through each css_set to its tasks until we see the list actually
2766
* used - in other words after the first call to cgroup_iter_start().
2767
*
2768
* The tasklist_lock is not held here, as do_each_thread() and
2769
* while_each_thread() are protected by RCU.
2770
*/
2771
static void cgroup_enable_task_cg_lists(void)
2772
{
2773
struct task_struct *p, *g;
2774
write_lock(&css_set_lock);
2775
use_task_css_set_links = 1;
2776
do_each_thread(g, p) {
2777
task_lock(p);
2778
/*
2779
* We should check if the process is exiting, otherwise
2780
* it will race with cgroup_exit() in that the list
2781
* entry won't be deleted though the process has exited.
2782
*/
2783
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2784
list_add(&p->cg_list, &p->cgroups->tasks);
2785
task_unlock(p);
2786
} while_each_thread(g, p);
2787
write_unlock(&css_set_lock);
2788
}
2789
2790
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2791
{
2792
/*
2793
* The first time anyone tries to iterate across a cgroup,
2794
* we need to enable the list linking each css_set to its
2795
* tasks, and fix up all existing tasks.
2796
*/
2797
if (!use_task_css_set_links)
2798
cgroup_enable_task_cg_lists();
2799
2800
read_lock(&css_set_lock);
2801
it->cg_link = &cgrp->css_sets;
2802
cgroup_advance_iter(cgrp, it);
2803
}
2804
2805
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2806
struct cgroup_iter *it)
2807
{
2808
struct task_struct *res;
2809
struct list_head *l = it->task;
2810
struct cg_cgroup_link *link;
2811
2812
/* If the iterator's cg_link is NULL, we have no tasks */
2813
if (!it->cg_link)
2814
return NULL;
2815
res = list_entry(l, struct task_struct, cg_list);
2816
/* Advance iterator to find next entry */
2817
l = l->next;
2818
link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
2819
if (l == &link->cg->tasks) {
2820
/* We reached the end of this task list - move on to
2821
* the next cg_cgroup_link */
2822
cgroup_advance_iter(cgrp, it);
2823
} else {
2824
it->task = l;
2825
}
2826
return res;
2827
}
2828
2829
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2830
{
2831
read_unlock(&css_set_lock);
2832
}
2833
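/*
 * Canonical use of the iterator above, as a sketch (compare
 * cgroupstats_build() later in this file). css_set_lock is read-held
 * from cgroup_iter_start() to cgroup_iter_end(), so the loop body must
 * not sleep.
 */
#if 0
static int example_count_running(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;
	int nr_running = 0;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		if (tsk->state == TASK_RUNNING)
			nr_running++;
	}
	cgroup_iter_end(cgrp, &it);
	return nr_running;
}
#endif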
2834
static inline int started_after_time(struct task_struct *t1,
2835
struct timespec *time,
2836
struct task_struct *t2)
2837
{
2838
int start_diff = timespec_compare(&t1->start_time, time);
2839
if (start_diff > 0) {
2840
return 1;
2841
} else if (start_diff < 0) {
2842
return 0;
2843
} else {
2844
/*
2845
* Arbitrarily, if two processes started at the same
2846
* time, we'll say that the lower pointer value
2847
* started first. Note that t2 may have exited by now
2848
* so this may not be a valid pointer any longer, but
2849
* that's fine - it still serves to distinguish
2850
* between two tasks started (effectively) simultaneously.
2851
*/
2852
return t1 > t2;
2853
}
2854
}
2855
2856
/*
2857
* This function is a callback from heap_insert() and is used to order
2858
* the heap.
2859
* In this case we order the heap in descending task start time.
2860
*/
2861
static inline int started_after(void *p1, void *p2)
2862
{
2863
struct task_struct *t1 = p1;
2864
struct task_struct *t2 = p2;
2865
return started_after_time(t1, &t2->start_time, t2);
2866
}
2867
2868
/**
2869
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
2870
* @scan: struct cgroup_scanner containing arguments for the scan
2871
*
2872
* Arguments include pointers to callback functions test_task() and
2873
* process_task().
2874
* Iterate through all the tasks in a cgroup, calling test_task() for each,
2875
* and if it returns true, call process_task() for it also.
2876
* The test_task pointer may be NULL, meaning always true (select all tasks).
2877
* Effectively duplicates cgroup_iter_{start,next,end}()
2878
* but does not lock css_set_lock for the call to process_task().
2879
* The struct cgroup_scanner may be embedded in any structure of the caller's
2880
* creation.
2881
* It is guaranteed that process_task() will act on every task that
2882
* is a member of the cgroup for the duration of this call. This
2883
* function may or may not call process_task() for tasks that exit
2884
* or move to a different cgroup during the call, or are forked or
2885
* move into the cgroup during the call.
2886
*
2887
* Note that test_task() may be called with locks held, and may in some
2888
* situations be called multiple times for the same task, so it should
2889
* be cheap.
2890
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
2891
* pre-allocated and will be used for heap operations (and its "gt" member will
2892
* be overwritten), else a temporary heap will be used (allocation of which
2893
* may cause this function to fail).
2894
*/
2895
int cgroup_scan_tasks(struct cgroup_scanner *scan)
2896
{
2897
int retval, i;
2898
struct cgroup_iter it;
2899
struct task_struct *p, *dropped;
2900
/* Never dereference latest_task, since it's not refcounted */
2901
struct task_struct *latest_task = NULL;
2902
struct ptr_heap tmp_heap;
2903
struct ptr_heap *heap;
2904
struct timespec latest_time = { 0, 0 };
2905
2906
if (scan->heap) {
2907
/* The caller supplied our heap and pre-allocated its memory */
2908
heap = scan->heap;
2909
heap->gt = &started_after;
2910
} else {
2911
/* We need to allocate our own heap memory */
2912
heap = &tmp_heap;
2913
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
2914
if (retval)
2915
/* cannot allocate the heap */
2916
return retval;
2917
}
2918
2919
again:
2920
/*
2921
* Scan tasks in the cgroup, using the scanner's "test_task" callback
2922
* to determine which are of interest, and using the scanner's
2923
* "process_task" callback to process any of them that need an update.
2924
* Since we don't want to hold any locks during the task updates,
2925
* gather tasks to be processed in a heap structure.
2926
* The heap is sorted by descending task start time.
2927
* If the statically-sized heap fills up, we overflow tasks that
2928
* started later, and in future iterations only consider tasks that
2929
* started after the latest task in the previous pass. This
2930
* guarantees forward progress and that we don't miss any tasks.
2931
*/
2932
heap->size = 0;
2933
cgroup_iter_start(scan->cg, &it);
2934
while ((p = cgroup_iter_next(scan->cg, &it))) {
2935
/*
2936
* Only affect tasks that qualify per the caller's callback,
2937
* if the caller provided one
2938
*/
2939
if (scan->test_task && !scan->test_task(p, scan))
2940
continue;
2941
/*
2942
* Only process tasks that started after the last task
2943
* we processed
2944
*/
2945
if (!started_after_time(p, &latest_time, latest_task))
2946
continue;
2947
dropped = heap_insert(heap, p);
2948
if (dropped == NULL) {
2949
/*
2950
* The new task was inserted; the heap wasn't
2951
* previously full
2952
*/
2953
get_task_struct(p);
2954
} else if (dropped != p) {
2955
/*
2956
* The new task was inserted, and pushed out a
2957
* different task
2958
*/
2959
get_task_struct(p);
2960
put_task_struct(dropped);
2961
}
2962
/*
2963
* Else the new task was newer than anything already in
2964
* the heap and wasn't inserted
2965
*/
2966
}
2967
cgroup_iter_end(scan->cg, &it);
2968
2969
if (heap->size) {
2970
for (i = 0; i < heap->size; i++) {
2971
struct task_struct *q = heap->ptrs[i];
2972
if (i == 0) {
2973
latest_time = q->start_time;
2974
latest_task = q;
2975
}
2976
/* Process the task per the caller's callback */
2977
scan->process_task(q, scan);
2978
put_task_struct(q);
2979
}
2980
/*
2981
* If we had to process any tasks at all, scan again
2982
* in case some of them were in the middle of forking
2983
* children that didn't get processed.
2984
* Not the most efficient way to do it, but it avoids
2985
* having to take callback_mutex in the fork path
2986
*/
2987
goto again;
2988
}
2989
if (heap == &tmp_heap)
2990
heap_free(&tmp_heap);
2991
return 0;
2992
}
2993
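/*
 * Illustrative caller of cgroup_scan_tasks(); the callbacks here are
 * made up (cpuset uses this pattern for real). test_task() may run
 * under css_set_lock and should be cheap; process_task() runs without
 * it and may do heavier work.
 */
#if 0
static int example_test(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_KTHREAD);	/* skip kernel threads */
}

static void example_process(struct task_struct *p,
			    struct cgroup_scanner *scan)
{
	/* update per-task state here; no css_set_lock held */
}

static int example_scan(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg = cgrp,
		.test_task = example_test,
		.process_task = example_process,
		.heap = NULL,	/* let cgroup_scan_tasks() allocate one */
	};

	return cgroup_scan_tasks(&scan);
}
#endif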
2994
/*
2995
* Stuff for reading the 'tasks'/'procs' files.
2996
*
2997
* Reading this file can return large amounts of data if a cgroup has
2998
* *lots* of attached tasks. So it may need several calls to read(),
2999
* but we cannot guarantee that the information we produce is correct
3000
* unless we produce it entirely atomically.
3001
*
3002
*/
3003
3004
/*
3005
* The following two functions "fix" the issue where there are more pids
3006
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3007
* TODO: replace with a kernel-wide solution to this problem
3008
*/
3009
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3010
static void *pidlist_allocate(int count)
3011
{
3012
if (PIDLIST_TOO_LARGE(count))
3013
return vmalloc(count * sizeof(pid_t));
3014
else
3015
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3016
}
3017
static void pidlist_free(void *p)
3018
{
3019
if (is_vmalloc_addr(p))
3020
vfree(p);
3021
else
3022
kfree(p);
3023
}
3024
static void *pidlist_resize(void *p, int newcount)
3025
{
3026
void *newlist;
3027
/* note: if new alloc fails, old p will still be valid either way */
3028
if (is_vmalloc_addr(p)) {
3029
newlist = vmalloc(newcount * sizeof(pid_t));
3030
if (!newlist)
3031
return NULL;
3032
memcpy(newlist, p, newcount * sizeof(pid_t));
3033
vfree(p);
3034
} else {
3035
newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3036
}
3037
return newlist;
3038
}
3039
3040
/*
3041
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3042
* If the new stripped list is sufficiently smaller and there's enough memory
3043
* to allocate a new buffer, will let go of the unneeded memory. Returns the
3044
* number of unique elements.
3045
*/
3046
/* is the size difference enough that we should re-allocate the array? */
3047
#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3048
static int pidlist_uniq(pid_t **p, int length)
3049
{
3050
int src, dest = 1;
3051
pid_t *list = *p;
3052
pid_t *newlist;
3053
3054
/*
3055
* we presume the 0th element is unique, so src starts at 1. trivial
3056
* edge cases first; no work needs to be done for either
3057
*/
3058
if (length == 0 || length == 1)
3059
return length;
3060
/* src and dest walk down the list; dest counts unique elements */
3061
for (src = 1; src < length; src++) {
3062
/* find next unique element */
3063
while (list[src] == list[src-1]) {
3064
src++;
3065
if (src == length)
3066
goto after;
3067
}
3068
/* dest always points to where the next unique element goes */
3069
list[dest] = list[src];
3070
dest++;
3071
}
3072
after:
3073
/*
3074
* if the length difference is large enough, we want to allocate a
3075
* smaller buffer to save memory. if this fails due to out of memory,
3076
* we'll just stay with what we've got.
3077
*/
3078
if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3079
newlist = pidlist_resize(list, dest);
3080
if (newlist)
3081
*p = newlist;
3082
}
3083
return dest;
3084
}
3085
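/*
 * Worked example: given the sorted input {3, 3, 5, 7, 7, 7} with
 * length == 6, the loop above compacts the array to {3, 5, 7, ...}
 * and returns dest == 3. The stale tail entries stay allocated unless
 * the list is large enough for the shrinking reallocation to trigger.
 */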
3086
static int cmppid(const void *a, const void *b)
3087
{
3088
return *(pid_t *)a - *(pid_t *)b;
3089
}
3090
3091
/*
3092
* find the appropriate pidlist for our purpose (given procs vs tasks)
3093
* returns with the lock on that pidlist already held, and takes care
3094
* of the use count, or returns NULL with no locks held if we're out of
3095
* memory.
3096
*/
3097
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3098
enum cgroup_filetype type)
3099
{
3100
struct cgroup_pidlist *l;
3101
/* don't need task_nsproxy() if we're looking at ourselves */
3102
struct pid_namespace *ns = current->nsproxy->pid_ns;
3103
3104
/*
3105
* We can't drop the pidlist_mutex before taking the l->mutex in case
3106
* the last ref-holder is trying to remove l from the list at the same
3107
* time. Holding the pidlist_mutex precludes somebody taking whichever
3108
* list we find out from under us - compare cgroup_release_pid_array().
3109
*/
3110
mutex_lock(&cgrp->pidlist_mutex);
3111
list_for_each_entry(l, &cgrp->pidlists, links) {
3112
if (l->key.type == type && l->key.ns == ns) {
3113
/* make sure l doesn't vanish out from under us */
3114
down_write(&l->mutex);
3115
mutex_unlock(&cgrp->pidlist_mutex);
3116
return l;
3117
}
3118
}
3119
/* entry not found; create a new one */
3120
l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3121
if (!l) {
3122
mutex_unlock(&cgrp->pidlist_mutex);
3123
return l;
3124
}
3125
init_rwsem(&l->mutex);
3126
down_write(&l->mutex);
3127
l->key.type = type;
3128
l->key.ns = get_pid_ns(ns);
3129
l->use_count = 0; /* don't increment here */
3130
l->list = NULL;
3131
l->owner = cgrp;
3132
list_add(&l->links, &cgrp->pidlists);
3133
mutex_unlock(&cgrp->pidlist_mutex);
3134
return l;
3135
}
3136
3137
/*
3138
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
3139
*/
3140
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3141
struct cgroup_pidlist **lp)
3142
{
3143
pid_t *array;
3144
int length;
3145
int pid, n = 0; /* used for populating the array */
3146
struct cgroup_iter it;
3147
struct task_struct *tsk;
3148
struct cgroup_pidlist *l;
3149
3150
/*
3151
* If cgroup gets more users after we read count, we won't have
3152
* enough space - tough. This race is indistinguishable to the
3153
* caller from the case that the additional cgroup users didn't
3154
* show up until sometime later on.
3155
*/
3156
length = cgroup_task_count(cgrp);
3157
array = pidlist_allocate(length);
3158
if (!array)
3159
return -ENOMEM;
3160
/* now, populate the array */
3161
cgroup_iter_start(cgrp, &it);
3162
while ((tsk = cgroup_iter_next(cgrp, &it))) {
3163
if (unlikely(n == length))
3164
break;
3165
/* get tgid or pid for procs or tasks file respectively */
3166
if (type == CGROUP_FILE_PROCS)
3167
pid = task_tgid_vnr(tsk);
3168
else
3169
pid = task_pid_vnr(tsk);
3170
if (pid > 0) /* make sure to only use valid results */
3171
array[n++] = pid;
3172
}
3173
cgroup_iter_end(cgrp, &it);
3174
length = n;
3175
/* now sort & (if procs) strip out duplicates */
3176
sort(array, length, sizeof(pid_t), cmppid, NULL);
3177
if (type == CGROUP_FILE_PROCS)
3178
length = pidlist_uniq(&array, length);
3179
l = cgroup_pidlist_find(cgrp, type);
3180
if (!l) {
3181
pidlist_free(array);
3182
return -ENOMEM;
3183
}
3184
/* store array, freeing old if necessary - lock already held */
3185
pidlist_free(l->list);
3186
l->list = array;
3187
l->length = length;
3188
l->use_count++;
3189
up_write(&l->mutex);
3190
*lp = l;
3191
return 0;
3192
}
3193
3194
/**
3195
* cgroupstats_build - build and fill cgroupstats
3196
* @stats: cgroupstats to fill information into
3197
* @dentry: A dentry entry belonging to the cgroup for which stats have
3198
* been requested.
3199
*
3200
* Build and fill cgroupstats so that taskstats can export it to user
3201
* space.
3202
*/
3203
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3204
{
3205
int ret = -EINVAL;
3206
struct cgroup *cgrp;
3207
struct cgroup_iter it;
3208
struct task_struct *tsk;
3209
3210
/*
3211
* Validate dentry by checking the superblock operations,
3212
* and make sure it's a directory.
3213
*/
3214
if (dentry->d_sb->s_op != &cgroup_ops ||
3215
!S_ISDIR(dentry->d_inode->i_mode))
3216
goto err;
3217
3218
ret = 0;
3219
cgrp = dentry->d_fsdata;
3220
3221
cgroup_iter_start(cgrp, &it);
3222
while ((tsk = cgroup_iter_next(cgrp, &it))) {
3223
switch (tsk->state) {
3224
case TASK_RUNNING:
3225
stats->nr_running++;
3226
break;
3227
case TASK_INTERRUPTIBLE:
3228
stats->nr_sleeping++;
3229
break;
3230
case TASK_UNINTERRUPTIBLE:
3231
stats->nr_uninterruptible++;
3232
break;
3233
case TASK_STOPPED:
3234
stats->nr_stopped++;
3235
break;
3236
default:
3237
if (delayacct_is_task_waiting_on_io(tsk))
3238
stats->nr_io_wait++;
3239
break;
3240
}
3241
}
3242
cgroup_iter_end(cgrp, &it);
3243
3244
err:
3245
return ret;
3246
}
3247
3248
3249
/*
3250
* seq_file methods for the tasks/procs files. The seq_file position is the
3251
* next pid to display; the seq_file iterator is a pointer to the pid
3252
* in the pidlist's ->list array.
3253
*/
3254
3255
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3256
{
3257
/*
3258
* Initially we receive a position value that corresponds to
3259
* one more than the last pid shown (or 0 on the first call or
3260
* after a seek to the start). Use a binary-search to find the
3261
* next pid to display, if any
3262
*/
3263
struct cgroup_pidlist *l = s->private;
3264
int index = 0, pid = *pos;
3265
int *iter;
3266
3267
down_read(&l->mutex);
3268
if (pid) {
3269
int end = l->length;
3270
3271
while (index < end) {
3272
int mid = (index + end) / 2;
3273
if (l->list[mid] == pid) {
3274
index = mid;
3275
break;
3276
} else if (l->list[mid] <= pid)
3277
index = mid + 1;
3278
else
3279
end = mid;
3280
}
3281
}
3282
/* If we're off the end of the array, we're done */
3283
if (index >= l->length)
3284
return NULL;
3285
/* Update the abstract position to be the actual pid that we found */
3286
iter = l->list + index;
3287
*pos = *iter;
3288
return iter;
3289
}
3290
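/*
 * Worked example of the search above: with l->list == {2, 5, 9} and a
 * saved position of *pos == 6 (one past the last pid shown, which has
 * since disappeared from the list), the binary search converges on
 * index 2, so iteration resumes at pid 9 and *pos is updated to 9.
 */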
3291
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3292
{
3293
struct cgroup_pidlist *l = s->private;
3294
up_read(&l->mutex);
3295
}
3296
3297
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3298
{
3299
struct cgroup_pidlist *l = s->private;
3300
pid_t *p = v;
3301
pid_t *end = l->list + l->length;
3302
/*
3303
* Advance to the next pid in the array. If this goes off the
3304
* end, we're done
3305
*/
3306
p++;
3307
if (p >= end) {
3308
return NULL;
3309
} else {
3310
*pos = *p;
3311
return p;
3312
}
3313
}
3314
3315
static int cgroup_pidlist_show(struct seq_file *s, void *v)
3316
{
3317
return seq_printf(s, "%d\n", *(int *)v);
3318
}
3319
3320
/*
3321
* seq_operations functions for iterating on pidlists through seq_file -
3322
* independent of whether it's tasks or procs
3323
*/
3324
static const struct seq_operations cgroup_pidlist_seq_operations = {
3325
.start = cgroup_pidlist_start,
3326
.stop = cgroup_pidlist_stop,
3327
.next = cgroup_pidlist_next,
3328
.show = cgroup_pidlist_show,
3329
};
3330
3331
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3332
{
3333
/*
3334
* the case where we're the last user of this particular pidlist will
3335
* have us remove it from the cgroup's list, which entails taking the
3336
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
3337
* pidlist_mutex, we have to take pidlist_mutex first.
3338
*/
3339
mutex_lock(&l->owner->pidlist_mutex);
3340
down_write(&l->mutex);
3341
BUG_ON(!l->use_count);
3342
if (!--l->use_count) {
3343
/* we're the last user if refcount is 0; remove and free */
3344
list_del(&l->links);
3345
mutex_unlock(&l->owner->pidlist_mutex);
3346
pidlist_free(l->list);
3347
put_pid_ns(l->key.ns);
3348
up_write(&l->mutex);
3349
kfree(l);
3350
return;
3351
}
3352
mutex_unlock(&l->owner->pidlist_mutex);
3353
up_write(&l->mutex);
3354
}
3355
3356
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
3357
{
3358
struct cgroup_pidlist *l;
3359
if (!(file->f_mode & FMODE_READ))
3360
return 0;
3361
/*
3362
* the seq_file will only be initialized if the file was opened for
3363
* reading, so only then does file->private_data point at one.
3364
*/
3365
l = ((struct seq_file *)file->private_data)->private;
3366
cgroup_release_pid_array(l);
3367
return seq_release(inode, file);
3368
}
3369
3370
static const struct file_operations cgroup_pidlist_operations = {
3371
.read = seq_read,
3372
.llseek = seq_lseek,
3373
.write = cgroup_file_write,
3374
.release = cgroup_pidlist_release,
3375
};
3376
3377
/*
3378
* The following functions handle opens on a file that displays a pidlist
3379
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
3380
* in the cgroup.
3381
*/
3382
/* helper function for the two below it */
3383
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
3384
{
3385
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
3386
struct cgroup_pidlist *l;
3387
int retval;
3388
3389
/* Nothing to do for write-only files */
3390
if (!(file->f_mode & FMODE_READ))
3391
return 0;
3392
3393
/* have the array populated */
3394
retval = pidlist_array_load(cgrp, type, &l);
3395
if (retval)
3396
return retval;
3397
/* configure file information */
3398
file->f_op = &cgroup_pidlist_operations;
3399
3400
retval = seq_open(file, &cgroup_pidlist_seq_operations);
3401
if (retval) {
3402
cgroup_release_pid_array(l);
3403
return retval;
3404
}
3405
((struct seq_file *)file->private_data)->private = l;
3406
return 0;
3407
}
3408
static int cgroup_tasks_open(struct inode *unused, struct file *file)
3409
{
3410
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
3411
}
3412
static int cgroup_procs_open(struct inode *unused, struct file *file)
3413
{
3414
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3415
}
3416
3417
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
3418
struct cftype *cft)
3419
{
3420
return notify_on_release(cgrp);
3421
}
3422
3423
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3424
struct cftype *cft,
3425
u64 val)
3426
{
3427
clear_bit(CGRP_RELEASABLE, &cgrp->flags);
3428
if (val)
3429
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3430
else
3431
clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3432
return 0;
3433
}
3434
3435
/*
3436
* Unregister event and free resources.
3437
*
3438
* Gets called from workqueue.
3439
*/
3440
static void cgroup_event_remove(struct work_struct *work)
3441
{
3442
struct cgroup_event *event = container_of(work, struct cgroup_event,
3443
remove);
3444
struct cgroup *cgrp = event->cgrp;
3445
3446
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3447
3448
eventfd_ctx_put(event->eventfd);
3449
kfree(event);
3450
dput(cgrp->dentry);
3451
}
3452
3453
/*
3454
* Gets called on POLLHUP on eventfd when user closes it.
3455
*
3456
* Called with wqh->lock held and interrupts disabled.
3457
*/
3458
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3459
int sync, void *key)
3460
{
3461
struct cgroup_event *event = container_of(wait,
3462
struct cgroup_event, wait);
3463
struct cgroup *cgrp = event->cgrp;
3464
unsigned long flags = (unsigned long)key;
3465
3466
if (flags & POLLHUP) {
3467
__remove_wait_queue(event->wqh, &event->wait);
3468
spin_lock(&cgrp->event_list_lock);
3469
list_del(&event->list);
3470
spin_unlock(&cgrp->event_list_lock);
3471
/*
3472
* We are in atomic context, but cgroup_event_remove() may
3473
* sleep, so we have to defer it to a workqueue.
3474
*/
3475
schedule_work(&event->remove);
3476
}
3477
3478
return 0;
3479
}
3480
3481
static void cgroup_event_ptable_queue_proc(struct file *file,
3482
wait_queue_head_t *wqh, poll_table *pt)
3483
{
3484
struct cgroup_event *event = container_of(pt,
3485
struct cgroup_event, pt);
3486
3487
event->wqh = wqh;
3488
add_wait_queue(wqh, &event->wait);
3489
}
3490
3491
/*
3492
* Parse input and register new cgroup event handler.
3493
*
3494
* Input must be in format '<event_fd> <control_fd> <args>'.
3495
* Interpretation of args is defined by control file implementation.
3496
*/
3497
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3498
const char *buffer)
3499
{
3500
struct cgroup_event *event = NULL;
3501
unsigned int efd, cfd;
3502
struct file *efile = NULL;
3503
struct file *cfile = NULL;
3504
char *endp;
3505
int ret;
3506
3507
efd = simple_strtoul(buffer, &endp, 10);
3508
if (*endp != ' ')
3509
return -EINVAL;
3510
buffer = endp + 1;
3511
3512
cfd = simple_strtoul(buffer, &endp, 10);
3513
if ((*endp != ' ') && (*endp != '\0'))
3514
return -EINVAL;
3515
buffer = endp + 1;
3516
3517
event = kzalloc(sizeof(*event), GFP_KERNEL);
3518
if (!event)
3519
return -ENOMEM;
3520
event->cgrp = cgrp;
3521
INIT_LIST_HEAD(&event->list);
3522
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3523
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3524
INIT_WORK(&event->remove, cgroup_event_remove);
3525
3526
efile = eventfd_fget(efd);
3527
if (IS_ERR(efile)) {
3528
ret = PTR_ERR(efile);
3529
goto fail;
3530
}
3531
3532
event->eventfd = eventfd_ctx_fileget(efile);
3533
if (IS_ERR(event->eventfd)) {
3534
ret = PTR_ERR(event->eventfd);
3535
goto fail;
3536
}
3537
3538
cfile = fget(cfd);
3539
if (!cfile) {
3540
ret = -EBADF;
3541
goto fail;
3542
}
3543
3544
/* the process needs read permission on the control file */
3545
ret = file_permission(cfile, MAY_READ);
3546
if (ret < 0)
3547
goto fail;
3548
3549
event->cft = __file_cft(cfile);
3550
if (IS_ERR(event->cft)) {
3551
ret = PTR_ERR(event->cft);
3552
goto fail;
3553
}
3554
3555
if (!event->cft->register_event || !event->cft->unregister_event) {
3556
ret = -EINVAL;
3557
goto fail;
3558
}
3559
3560
ret = event->cft->register_event(cgrp, event->cft,
3561
event->eventfd, buffer);
3562
if (ret)
3563
goto fail;
3564
3565
if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3566
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3567
ret = 0;
3568
goto fail;
3569
}
3570
3571
/*
3572
* Events should be removed after rmdir of cgroup directory, but before
3573
* destroying subsystem state objects. Let's take a reference to the cgroup
3574
* directory dentry to do that.
3575
*/
3576
dget(cgrp->dentry);
3577
3578
spin_lock(&cgrp->event_list_lock);
3579
list_add(&event->list, &cgrp->event_list);
3580
spin_unlock(&cgrp->event_list_lock);
3581
3582
fput(cfile);
3583
fput(efile);
3584
3585
return 0;
3586
3587
fail:
3588
if (cfile)
3589
fput(cfile);
3590
3591
if (event && event->eventfd && !IS_ERR(event->eventfd))
3592
eventfd_ctx_put(event->eventfd);
3593
3594
if (!IS_ERR_OR_NULL(efile))
3595
fput(efile);
3596
3597
kfree(event);
3598
3599
return ret;
3600
}
3601
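/*
 * Userspace sketch of the "<event_fd> <control_fd> <args>" protocol
 * parsed above. The memcg usage file is one example of a control file
 * implementing register_event(); the mount point and threshold below
 * are assumptions:
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int example_wait_threshold(void)
{
	char buf[64];
	uint64_t ticks;
	int efd = eventfd(0, 0);
	int cfd = open("/cgroup/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ctl = open("/cgroup/mygroup/cgroup.event_control", O_WRONLY);

	if (efd < 0 || cfd < 0 || ctl < 0)
		return -1;
	snprintf(buf, sizeof(buf), "%d %d 1048576", efd, cfd);
	if (write(ctl, buf, strlen(buf)) < 0)
		return -1;
	read(efd, &ticks, sizeof(ticks));	/* blocks until threshold fires */
	return 0;
}
#endif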
3602
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3603
struct cftype *cft)
3604
{
3605
return clone_children(cgrp);
3606
}
3607
3608
static int cgroup_clone_children_write(struct cgroup *cgrp,
3609
struct cftype *cft,
3610
u64 val)
3611
{
3612
if (val)
3613
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3614
else
3615
clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3616
return 0;
3617
}
3618
3619
/*
3620
* for the common functions, 'private' gives the type of file
3621
*/
3622
/* for hysterical raisins, we can't put this on the older files */
3623
#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3624
static struct cftype files[] = {
3625
{
3626
.name = "tasks",
3627
.open = cgroup_tasks_open,
3628
.write_u64 = cgroup_tasks_write,
3629
.release = cgroup_pidlist_release,
3630
.mode = S_IRUGO | S_IWUSR,
3631
},
3632
{
3633
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
3634
.open = cgroup_procs_open,
3635
.write_u64 = cgroup_procs_write,
3636
.release = cgroup_pidlist_release,
3637
.mode = S_IRUGO | S_IWUSR,
3638
},
3639
{
3640
.name = "notify_on_release",
3641
.read_u64 = cgroup_read_notify_on_release,
3642
.write_u64 = cgroup_write_notify_on_release,
3643
},
3644
{
3645
.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3646
.write_string = cgroup_write_event_control,
3647
.mode = S_IWUGO,
3648
},
3649
{
3650
.name = "cgroup.clone_children",
3651
.read_u64 = cgroup_clone_children_read,
3652
.write_u64 = cgroup_clone_children_write,
3653
},
3654
};
3655
3656
static struct cftype cft_release_agent = {
3657
.name = "release_agent",
3658
.read_seq_string = cgroup_release_agent_show,
3659
.write_string = cgroup_release_agent_write,
3660
.max_write_len = PATH_MAX,
3661
};
3662
3663
static int cgroup_populate_dir(struct cgroup *cgrp)
3664
{
3665
int err;
3666
struct cgroup_subsys *ss;
3667
3668
/* First clear out any existing files */
3669
cgroup_clear_directory(cgrp->dentry);
3670
3671
err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
3672
if (err < 0)
3673
return err;
3674
3675
if (cgrp == cgrp->top_cgroup) {
3676
if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
3677
return err;
3678
}
3679
3680
for_each_subsys(cgrp->root, ss) {
3681
if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
3682
return err;
3683
}
3684
/* This cgroup is ready now */
3685
for_each_subsys(cgrp->root, ss) {
3686
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
3687
/*
3688
* Update id->css pointer and make this css visible from
3689
* CSS ID functions. This pointer will be dereferenced
3690
* from RCU-read-side without locks.
3691
*/
3692
if (css->id)
3693
rcu_assign_pointer(css->id->css, css);
3694
}
3695
3696
return 0;
3697
}
3698
3699
static void init_cgroup_css(struct cgroup_subsys_state *css,
3700
struct cgroup_subsys *ss,
3701
struct cgroup *cgrp)
3702
{
3703
css->cgroup = cgrp;
3704
atomic_set(&css->refcnt, 1);
3705
css->flags = 0;
3706
css->id = NULL;
3707
if (cgrp == dummytop)
3708
set_bit(CSS_ROOT, &css->flags);
3709
BUG_ON(cgrp->subsys[ss->subsys_id]);
3710
cgrp->subsys[ss->subsys_id] = css;
3711
}
3712
3713
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
3714
{
3715
/* We need to take each hierarchy_mutex in a consistent order */
3716
int i;
3717
3718
/*
3719
* No worry about a race with rebind_subsystems that might mess up the
3720
* locking order, since both parties are under cgroup_mutex.
3721
*/
3722
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3723
struct cgroup_subsys *ss = subsys[i];
3724
if (ss == NULL)
3725
continue;
3726
if (ss->root == root)
3727
mutex_lock(&ss->hierarchy_mutex);
3728
}
3729
}
3730
3731
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3732
{
3733
int i;
3734
3735
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3736
struct cgroup_subsys *ss = subsys[i];
3737
if (ss == NULL)
3738
continue;
3739
if (ss->root == root)
3740
mutex_unlock(&ss->hierarchy_mutex);
3741
}
3742
}
3743
/*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
 * @dentry: dentry of the new cgroup
 * @mode: mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 */
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			  mode_t mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	/* Grab a reference on the superblock so the hierarchy doesn't
	 * get deleted on unmount if there are child cgroups.  This
	 * can be done outside cgroup_mutex, since the sb can't
	 * disappear while someone has an open control file on the
	 * fs */
	atomic_inc(&sb->s_active);

	mutex_lock(&cgroup_mutex);

	init_cgroup_housekeeping(cgrp);

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	if (clone_children(parent))
		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);

	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css = ss->create(ss, cgrp);

		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			goto err_destroy;
		}
		init_cgroup_css(css, ss, cgrp);
		if (ss->use_id) {
			err = alloc_css_id(ss, parent, cgrp);
			if (err)
				goto err_destroy;
		}
		/* At error, ->destroy() callback has to free assigned ID. */
		if (clone_children(parent) && ss->post_clone)
			ss->post_clone(ss, cgrp);
	}

	cgroup_lock_hierarchy(root);
	list_add(&cgrp->sibling, &cgrp->parent->children);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups++;

	err = cgroup_create_dir(cgrp, dentry, mode);
	if (err < 0)
		goto err_remove;

	/* The cgroup directory was pre-locked for us */
	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

	err = cgroup_populate_dir(cgrp);
	/* If err < 0, we have a half-filled directory - oh well ;) */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	return 0;

 err_remove:

	cgroup_lock_hierarchy(root);
	list_del(&cgrp->sibling);
	cgroup_unlock_hierarchy(root);
	root->number_of_cgroups--;

 err_destroy:

	for_each_subsys(root, ss) {
		if (cgrp->subsys[ss->subsys_id])
			ss->destroy(ss, cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);

	kfree(cgrp);
	return err;
}

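/*
 * Note on the unwind order in cgroup_create() above: err_remove undoes
 * the sibling-list linkage and the cgroup count, then falls through to
 * err_destroy, which calls ->destroy() for every subsystem state that
 * was successfully created, and finally drops the superblock reference
 * and frees the cgroup.
 */
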
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct cgroup *c_parent = dentry->d_parent->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}

static int cgroup_has_css_refs(struct cgroup *cgrp)
{
	/* Check the reference count on each subsystem. Since we
	 * already established that there are no tasks in the
	 * cgroup, if the css refcount is also 1, then there should
	 * be no outstanding references, so the subsystem is safe to
	 * destroy. We scan across all subsystems rather than using
	 * the per-hierarchy linked list of mounted subsystems since
	 * we can be called via check_for_release() with no
	 * synchronization other than RCU, and the subsystem linked
	 * list isn't RCU-safe */
	int i;
	/*
	 * We won't need to lock the subsys array, because the subsystems
	 * we're concerned about aren't going anywhere since our cgroup root
	 * has a reference on them.
	 */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		struct cgroup_subsys_state *css;
		/* Skip subsystems not present or not in this hierarchy */
		if (ss == NULL || ss->root != cgrp->root)
			continue;
		css = cgrp->subsys[ss->subsys_id];
		/* When called from check_for_release() it's possible
		 * that by this point the cgroup has been removed
		 * and the css deleted. But a false-positive doesn't
		 * matter, since it can only happen if the cgroup
		 * has been deleted and hence no longer needs the
		 * release agent to be called anyway. */
		if (css && (atomic_read(&css->refcnt) > 1))
			return 1;
	}
	return 0;
}

/*
 * Atomically mark all (or else none) of the cgroup's CSS objects as
 * CSS_REMOVED. Return true on success, or false if the cgroup has
 * busy subsystems. Call with cgroup_mutex held
 */

static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	unsigned long flags;
	bool failed = false;
	local_irq_save(flags);
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		int refcnt;
		while (1) {
			/* We can only remove a CSS with a refcnt==1 */
			refcnt = atomic_read(&css->refcnt);
			if (refcnt > 1) {
				failed = true;
				goto done;
			}
			BUG_ON(!refcnt);
			/*
			 * Drop the refcnt to 0 while we check other
			 * subsystems. This will cause any racing
			 * css_tryget() to spin until we set the
			 * CSS_REMOVED bits or abort
			 */
			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
				break;
			cpu_relax();
		}
	}
 done:
	for_each_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
		if (failed) {
			/*
			 * Restore old refcnt if we previously managed
			 * to clear it from 1 to 0
			 */
			if (!atomic_read(&css->refcnt))
				atomic_set(&css->refcnt, 1);
		} else {
			/* Commit the fact that the CSS is removed */
			set_bit(CSS_REMOVED, &css->flags);
		}
	}
	local_irq_restore(flags);
	return !failed;
}

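/*
 * The protocol above, in short: dropping each refcnt from 1 to 0 with
 * cmpxchg "freezes" the css while the remaining subsystems are checked.
 * A concurrent css_tryget() that sees refcnt == 0 spins until either
 * CSS_REMOVED is set (tryget then fails) or the refcnt is restored to 1
 * (tryget then succeeds), which is what gives the removal check its
 * all-or-nothing semantics.
 */
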
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cgrp = dentry->d_fsdata;
	struct dentry *d;
	struct cgroup *parent;
	DEFINE_WAIT(wait);
	struct cgroup_event *event, *tmp;
	int ret;

	/* the vfs holds i_mutex on both the parent dir and the dentry already */
again:
	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	if (!list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	mutex_unlock(&cgroup_mutex);

	/*
	 * In general, a subsystem holds no css->refcnt after pre_destroy().
	 * But in racy cases a subsystem may take a css->refcnt again after
	 * pre_destroy(), which makes rmdir return -EBUSY; this can happen
	 * often enough to be a nuisance. To avoid that, we use a waitqueue
	 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR synchronizes rmdir with the
	 * subsystems' reference count handling; see the css_get/put,
	 * css_tryget() and cgroup_wakeup_rmdir_waiter() implementations.
	 */
	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	/*
	 * Call the pre_destroy handlers of each subsys, notifying the
	 * subsystems that an rmdir() request has come in.
	 */
	ret = cgroup_call_pre_destroy(cgrp);
	if (ret) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		return ret;
	}

	mutex_lock(&cgroup_mutex);
	parent = cgrp->parent;
	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
	if (!cgroup_clear_css_refs(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		/*
		 * Because someone may call cgroup_wakeup_rmdir_waiter() before
		 * prepare_to_wait(), we need to check this flag.
		 */
		if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
			schedule();
		finish_wait(&cgroup_rmdir_waitq, &wait);
		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
		if (signal_pending(current))
			return -EINTR;
		goto again;
	}
	/* No css_tryget() can succeed after this point. */
	finish_wait(&cgroup_rmdir_waitq, &wait);
	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

	spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
	spin_unlock(&release_list_lock);

	cgroup_lock_hierarchy(cgrp->root);
	/* delete this cgroup from parent->children */
	list_del_init(&cgrp->sibling);
	cgroup_unlock_hierarchy(cgrp->root);

	d = dget(cgrp->dentry);

	cgroup_d_remove_dir(d);
	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory, to avoid a race between userspace and
	 * kernelspace.
	 */
	spin_lock(&cgrp->event_list_lock);
	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		list_del(&event->list);
		remove_wait_queue(event->wqh, &event->wait);
		eventfd_signal(event->eventfd, 1);
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

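/*
 * A condensed sketch of the CGRP_WAIT_ON_RMDIR handshake implemented
 * above: rmdir sets the flag, runs pre_destroy(), and then tries
 * cgroup_clear_css_refs(). If a racing css_get() keeps a refcount
 * raised, rmdir sleeps on cgroup_rmdir_waitq; the subsystem's final
 * css_put() calls cgroup_wakeup_rmdir_waiter(), and rmdir retries from
 * the "again" label.
 */
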
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;
	css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

	need_forkexit_callback |= ss->fork || ss->exit;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	mutex_init(&ss->hierarchy_mutex);
	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
	ss->active = 1;

	/* this function shouldn't be used with modular subsystems, since they
	 * need to register a subsys_id, among other things */
	BUG_ON(ss->module);
}

/**
 * cgroup_load_subsys: load and register a modular subsystem at runtime
 * @ss: the subsystem to load
 *
 * This function should be called in a modular subsystem's initcall. If the
 * subsystem is built as a module, it will be assigned a new subsys_id and set
 * up for use. If the subsystem is built-in anyway, work is delegated to the
 * simpler cgroup_init_subsys.
 */
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
	int i;
	struct cgroup_subsys_state *css;

	/* check name and function validity */
	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
	    ss->create == NULL || ss->destroy == NULL)
		return -EINVAL;

	/*
	 * we don't support callbacks in modular subsystems. this check is
	 * before the ss->module check for consistency; a subsystem that could
	 * be a module should still have no callbacks even if the user isn't
	 * compiling it as one.
	 */
	if (ss->fork || ss->exit)
		return -EINVAL;

	/*
	 * an optionally modular subsystem is built-in: we want to do nothing,
	 * since cgroup_init_subsys will have already taken care of it.
	 */
	if (ss->module == NULL) {
		/* a few sanity checks */
		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
		BUG_ON(subsys[ss->subsys_id] != ss);
		return 0;
	}

	/*
	 * need to register a subsys id before anything else - for example,
	 * init_cgroup_css needs it.
	 */
	mutex_lock(&cgroup_mutex);
	/* find the first empty slot in the array */
	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
		if (subsys[i] == NULL)
			break;
	}
	if (i == CGROUP_SUBSYS_COUNT) {
		/* maximum number of subsystems already registered! */
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	/* assign ourselves the subsys_id */
	ss->subsys_id = i;
	subsys[i] = ss;

	/*
	 * no ss->create seems to need anything important in the ss struct, so
	 * this can happen first (i.e. before the rootnode attachment).
	 */
	css = ss->create(ss, dummytop);
	if (IS_ERR(css)) {
		/* failure case - need to deassign the subsys[] slot. */
		subsys[i] = NULL;
		mutex_unlock(&cgroup_mutex);
		return PTR_ERR(css);
	}

	list_add(&ss->sibling, &rootnode.subsys_list);
	ss->root = &rootnode;

	/* our new subsystem will be attached to the dummy hierarchy. */
	init_cgroup_css(css, ss, dummytop);
	/* init_idr must be after init_cgroup_css because it sets css->id. */
	if (ss->use_id) {
		int ret = cgroup_init_idr(ss, css);
		if (ret) {
			dummytop->subsys[ss->subsys_id] = NULL;
			ss->destroy(ss, dummytop);
			subsys[i] = NULL;
			mutex_unlock(&cgroup_mutex);
			return ret;
		}
	}

	/*
	 * Now we need to entangle the css into the existing css_sets. unlike
	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
	 * will need a new pointer to it; done by iterating the css_set_table.
	 * furthermore, modifying the existing css_sets will corrupt the hash
	 * table state, so each changed css_set will need its hash recomputed.
	 * this is all done under the css_set_lock.
	 */
	write_lock(&css_set_lock);
	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
		struct css_set *cg;
		struct hlist_node *node, *tmp;
		struct hlist_head *bucket = &css_set_table[i], *new_bucket;

		hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
			/* skip entries that we already rehashed */
			if (cg->subsys[ss->subsys_id])
				continue;
			/* remove existing entry */
			hlist_del(&cg->hlist);
			/* set new value */
			cg->subsys[ss->subsys_id] = css;
			/* recompute hash and restore entry */
			new_bucket = css_set_hash(cg->subsys);
			hlist_add_head(&cg->hlist, new_bucket);
		}
	}
	write_unlock(&css_set_lock);

	mutex_init(&ss->hierarchy_mutex);
	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
	ss->active = 1;

	/* success! */
	mutex_unlock(&cgroup_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);

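/*
 * Example (illustrative sketch only; the "foo" names are hypothetical):
 * a modular subsystem registers itself from its module init/exit hooks.
 * Note that fork/exit callbacks must be NULL, per the checks above.
 *
 *	struct cgroup_subsys foo_subsys = {
 *		.name = "foo",
 *		.create = foo_create,
 *		.destroy = foo_destroy,
 *		.module = THIS_MODULE,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return cgroup_load_subsys(&foo_subsys);
 *	}
 *	module_init(foo_init);
 *
 *	static void __exit foo_exit(void)
 *	{
 *		cgroup_unload_subsys(&foo_subsys);
 *	}
 *	module_exit(foo_exit);
 */
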
/**
 * cgroup_unload_subsys: unload a modular subsystem
 * @ss: the subsystem to unload
 *
 * This function should be called in a modular subsystem's exitcall. When this
 * function is invoked, the refcount on the subsystem's module will be 0, so
 * the subsystem will not be attached to any hierarchy.
 */
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
	struct cg_cgroup_link *link;
	struct hlist_head *hhead;

	BUG_ON(ss->module == NULL);

	/*
	 * we shouldn't be called if the subsystem is in use, and the use of
	 * try_module_get in parse_cgroupfs_options should ensure that it
	 * doesn't start being used while we're killing it off.
	 */
	BUG_ON(ss->root != &rootnode);

	mutex_lock(&cgroup_mutex);
	/* deassign the subsys_id */
	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
	subsys[ss->subsys_id] = NULL;

	/* remove subsystem from rootnode's list of subsystems */
	list_del_init(&ss->sibling);

	/*
	 * disentangle the css from all css_sets attached to the dummytop. as
	 * in loading, we need to pay our respects to the hashtable gods.
	 */
	write_lock(&css_set_lock);
	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
		struct css_set *cg = link->cg;

		hlist_del(&cg->hlist);
		BUG_ON(!cg->subsys[ss->subsys_id]);
		cg->subsys[ss->subsys_id] = NULL;
		hhead = css_set_hash(cg->subsys);
		hlist_add_head(&cg->hlist, hhead);
	}
	write_unlock(&css_set_lock);

	/*
	 * remove the subsystem's css from the dummytop and free it - we need
	 * to destroy before clearing the pointer because ss->destroy needs
	 * the cgrp->subsys pointer to find its state. note that this also
	 * takes care of freeing the css_id.
	 */
	ss->destroy(ss, dummytop);
	dummytop->subsys[ss->subsys_id] = NULL;

	mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	int i;
	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&rootnode);
	root_count = 1;
	init_task.cgroups = &init_css_set;

	init_css_set_link.cg = &init_css_set;
	init_css_set_link.cgrp = dummytop;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&css_set_table[i]);

	/* at bootup time, we don't worry about modular subsystems */
	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->create);
		BUG_ON(!ss->destroy);
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	int err;
	int i;
	struct hlist_head *hhead;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	/* at bootup time, we don't worry about modular subsystems */
	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (!ss->early_init)
			cgroup_init_subsys(ss);
		if (ss->use_id)
			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
	}

	/* Add init_css_set to the hash table */
	hhead = css_set_hash(init_css_set.subsys);
	hlist_add_head(&init_css_set.hlist, hhead);
	BUG_ON(!init_root_id(&rootnode));

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
	if (!cgroup_kobj) {
		err = -ENOMEM;
		goto out;
	}

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		kobject_put(cgroup_kobj);
		goto out;
	}

	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

out:
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);

	return err;
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *    doesn't really matter if tsk->cgroup changes after we read it,
 *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
 *    cgroup to top_cgroup.
 */

/* TODO: Use a proper seq_file iterator */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);

	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int count = 0;

		seq_printf(m, "%d:", root->hierarchy_id);
		for_each_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');
		cgrp = task_cgroup_from_root(tsk, root);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}

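/*
 * Example (sketch): with the seq_printf() calls above, one line of
 * /proc/<pid>/cgroup looks like (hierarchy id : subsystems : path; the
 * values shown are hypothetical):
 *
 *	3:cpu,cpuacct:/daemons
 */
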
static int cgroup_open(struct inode *inode, struct file *file)
{
	struct pid *pid = PROC_I(inode)->pid;
	return single_open(file, proc_cgroup_show, pid);
}

const struct file_operations proc_cgroup_operations = {
	.open		= cgroup_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss == NULL)
			continue;
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->name, ss->root->hierarchy_id,
			   ss->root->number_of_cgroups, !ss->disabled);
	}
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

static const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

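/*
 * Example (sketch): /proc/cgroups, as produced above, is a tab-separated
 * table (the rows shown are hypothetical):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset	1	4	1
 *	cpu	2	12	1
 */
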
/**
 * cgroup_fork - attach newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the newly forked child process.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct(). However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced css_set to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	child->cgroups = current->cgroups;
	get_css_set(child->cgroups);
	task_unlock(current);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_fork_callbacks - run fork callbacks
 * @child: the new task
 *
 * Called on a new task very soon before adding it to the
 * tasklist. No need to take any locks since no-one can
 * be operating on this task.
 */
void cgroup_fork_callbacks(struct task_struct *child)
{
	if (need_forkexit_callback) {
		int i;
		/*
		 * forkexit callbacks are only supported for builtin
		 * subsystems, and the builtin section of the subsys array is
		 * immutable, so we don't need to lock the subsys array here.
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->fork)
				ss->fork(ss, child);
		}
	}
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary.
 * Has to be after the task is visible on the task list in case we race
 * with the first call to cgroup_iter_start() - to guarantee that the
 * new task ends up on its list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &child->cgroups->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: run exit callbacks?
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
 *
 * We call cgroup_exit() while the task is still competent to
 * handle notify_on_release(), then leave the task attached to the
 * root cgroup in each hierarchy for the remainder of its exit.
 *
 * To do this properly, we would increment the reference count on
 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
 * code we would add a second cgroup function call, to drop that
 * reference.  This would just create an unnecessary hot spot on
 * the top_cgroup reference count, to no avail.
 *
 * Normally, holding a reference to a cgroup without bumping its
 * count is unsafe.  The cgroup could go away, or someone could
 * attach us to a different cgroup, decrementing the count on
 * the first cgroup that we never incremented.  But in this case,
 * top_cgroup isn't going away, and either the task has PF_EXITING set,
 * which wards off any cgroup_attach_task() attempts, or the task is a
 * failed fork, never visible to cgroup_attach_task.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	struct css_set *cg;
	int i;

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cg = tsk->cgroups;
	tsk->cgroups = &init_css_set;

	if (run_callbacks && need_forkexit_callback) {
		/*
		 * modular subsystems can't use callbacks, so no need to lock
		 * the subsys array
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];
			if (ss->exit) {
				struct cgroup *old_cgrp =
					rcu_dereference_raw(cg->subsys[i])->cgroup;
				struct cgroup *cgrp = task_cgroup(tsk, i);
				ss->exit(ss, cgrp, old_cgrp, tsk);
			}
		}
	}
	task_unlock(tsk);

	if (cg)
		put_css_set_taskexit(cg);
}

/**
 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
 * @cgrp: the cgroup in question
 * @task: the task in question
 *
 * See if @cgrp is a descendant of @task's cgroup in the appropriate
 * hierarchy.
 *
 * If we are sending in dummytop, then presumably we are creating
 * the top cgroup in the subsystem.
 *
 * Called only by the ns (nsproxy) cgroup.
 */
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
	int ret;
	struct cgroup *target;

	if (cgrp == dummytop)
		return 1;

	target = task_cgroup_from_root(task, cgrp->root);
	while (cgrp != target && cgrp != cgrp->top_cgroup)
		cgrp = cgrp->parent;
	ret = (cgrp == target);
	return ret;
}

static void check_for_release(struct cgroup *cgrp)
{
	/* All of these checks rely on RCU to keep the cgroup
	 * structure alive */
	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
		/* Control Group is currently removable. If it's not
		 * already queued for a userspace notification, queue
		 * it now */
		int need_schedule_work = 0;
		spin_lock(&release_list_lock);
		if (!cgroup_is_removed(cgrp) &&
		    list_empty(&cgrp->release_list)) {
			list_add(&cgrp->release_list, &release_list);
			need_schedule_work = 1;
		}
		spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}

/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css, int count)
{
	struct cgroup *cgrp = css->cgroup;
	int val;
	rcu_read_lock();
	val = atomic_sub_return(count, &css->refcnt);
	if (val == 1) {
		if (notify_on_release(cgrp)) {
			set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}
		cgroup_wakeup_rmdir_waiter(cgrp);
	}
	rcu_read_unlock();
	WARN_ON_ONCE(val < 1);
}
EXPORT_SYMBOL_GPL(__css_put);

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
		list_del_init(&cgrp->release_list);
		spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}

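/*
 * Example (sketch, userspace side; the mount point and agent path are
 * hypothetical): with a hierarchy mounted at /cgroup,
 *
 *	echo /sbin/cgroup-release-agent > /cgroup/release_agent
 *	echo 1 > /cgroup/mygroup/notify_on_release
 *
 * arranges for the agent to be run with "/mygroup" (the path relative
 * to the hierarchy root) as its argument once mygroup becomes unused,
 * exactly as argv[] is built above.
 */
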
static int __init cgroup_disable(char *str)
{
	int i;
	char *token;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;
		/*
		 * cgroup_disable, being at boot time, can't know about module
		 * subsystems, so we don't worry about them.
		 */
		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];

			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);

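/*
 * Example (sketch): one or more built-in subsystems can be turned off
 * from the kernel command line, e.g.
 *
 *	cgroup_disable=memory,cpuset
 *
 * (the names here are illustrative; the parser above accepts any
 * comma-separated list of built-in subsystem names).
 */
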
/*
 * Functions for CSS ID.
 */

/*
 * To get an ID other than 0, this should be called when !cgroup_is_removed().
 */
unsigned short css_id(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;

	/*
	 * css_id() can return a correct value when someone holds a refcnt
	 * on this css or when running under rcu_read_lock(). Once css->id
	 * is allocated, it is unchanged until freed.
	 */
	cssid = rcu_dereference_check(css->id,
			rcu_read_lock_held() || atomic_read(&css->refcnt));

	if (cssid)
		return cssid->id;
	return 0;
}
EXPORT_SYMBOL_GPL(css_id);

unsigned short css_depth(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;

	cssid = rcu_dereference_check(css->id,
			rcu_read_lock_held() || atomic_read(&css->refcnt));

	if (cssid)
		return cssid->depth;
	return 0;
}
EXPORT_SYMBOL_GPL(css_depth);

/**
 * css_is_ancestor - test whether "root" css is an ancestor of "child"
 * @child: the css to be tested.
 * @root: the css supposed to be an ancestor of the child.
 *
 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
 * But, considering usual usage, the csses should be valid objects after the
 * test. Assuming that the caller will do some action on the child if this
 * returns true, the caller must take a reference on "child".
 * If "child" is a valid object and this returns true, "root" is valid, too.
 */

bool css_is_ancestor(struct cgroup_subsys_state *child,
		    const struct cgroup_subsys_state *root)
{
	struct css_id *child_id;
	struct css_id *root_id;
	bool ret = true;

	rcu_read_lock();
	child_id = rcu_dereference(child->id);
	root_id = rcu_dereference(root->id);
	if (!child_id
	    || !root_id
	    || (child_id->depth < root_id->depth)
	    || (child_id->stack[root_id->depth] != root_id->id))
		ret = false;
	rcu_read_unlock();
	return ret;
}

void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
	struct css_id *id = css->id;
	/* When this is called before css_id initialization, id can be NULL */
	if (!id)
		return;

	BUG_ON(!ss->use_id);

	rcu_assign_pointer(id->css, NULL);
	rcu_assign_pointer(css->id, NULL);
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	spin_unlock(&ss->id_lock);
	kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);

/*
 * This is called by init or create(), so calls to this function are
 * always serialized (by cgroup_mutex at create()).
 */

static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
	struct css_id *newid;
	int myid, error, size;

	BUG_ON(!ss->use_id);

	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
	newid = kzalloc(size, GFP_KERNEL);
	if (!newid)
		return ERR_PTR(-ENOMEM);
	/* get id */
	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
		error = -ENOMEM;
		goto err_out;
	}
	spin_lock(&ss->id_lock);
	/* Don't use 0; allocate an ID in the range 1-65535. */
	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
	spin_unlock(&ss->id_lock);

	/* Returns an error when there is no free space for a new ID. */
	if (error) {
		error = -ENOSPC;
		goto err_out;
	}
	if (myid > CSS_ID_MAX)
		goto remove_idr;

	newid->id = myid;
	newid->depth = depth;
	return newid;
remove_idr:
	error = -ENOSPC;
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, myid);
	spin_unlock(&ss->id_lock);
err_out:
	kfree(newid);
	return ERR_PTR(error);
}

static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
					    struct cgroup_subsys_state *rootcss)
{
	struct css_id *newid;

	spin_lock_init(&ss->id_lock);
	idr_init(&ss->idr);

	newid = get_new_cssid(ss, 0);
	if (IS_ERR(newid))
		return PTR_ERR(newid);

	newid->stack[0] = newid->id;
	newid->css = rootcss;
	rootcss->id = newid;
	return 0;
}

static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
			struct cgroup *child)
{
	int subsys_id, i, depth = 0;
	struct cgroup_subsys_state *parent_css, *child_css;
	struct css_id *child_id, *parent_id;

	subsys_id = ss->subsys_id;
	parent_css = parent->subsys[subsys_id];
	child_css = child->subsys[subsys_id];
	parent_id = parent_css->id;
	depth = parent_id->depth + 1;

	child_id = get_new_cssid(ss, depth);
	if (IS_ERR(child_id))
		return PTR_ERR(child_id);

	for (i = 0; i < depth; i++)
		child_id->stack[i] = parent_id->stack[i];
	child_id->stack[depth] = child_id->id;
	/*
	 * child_id->css pointer will be set after this cgroup is available
	 * see cgroup_populate_dir()
	 */
	rcu_assign_pointer(child_css->id, child_id);

	return 0;
}

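/*
 * Example (sketch, hypothetical id values): stack[] records the id of
 * every ancestor followed by the css's own id. A css at depth 2 whose
 * root and parent were assigned ids 1 and 5, and whose own id is 9,
 * ends up with:
 *
 *	depth = 2
 *	stack[0] = 1	(root)
 *	stack[1] = 5	(parent)
 *	stack[2] = 9	(self)
 *
 * which is why css_is_ancestor() only needs to compare
 * stack[root->depth] with the root's id.
 */
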
/**
 * css_lookup - lookup css by id
 * @ss: cgroup subsys to be looked into.
 * @id: the id
 *
 * Returns a pointer to the cgroup_subsys_state if there is a valid one
 * with the given id, NULL if not. Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
{
	struct css_id *cssid = NULL;

	BUG_ON(!ss->use_id);
	cssid = idr_find(&ss->idr, id);

	if (unlikely(!cssid))
		return NULL;

	return rcu_dereference(cssid->css);
}
EXPORT_SYMBOL_GPL(css_lookup);

/**
 * css_get_next - lookup next cgroup under specified hierarchy.
 * @ss: pointer to subsystem
 * @id: current position of iteration.
 * @root: pointer to a css; search the tree under this css.
 * @foundid: position of found object.
 *
 * Search for the next css under the hierarchy rooted at @root. Calling
 * under rcu_read_lock() is necessary. Returns NULL when the end of the
 * tree is reached.
 */
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
	     struct cgroup_subsys_state *root, int *foundid)
{
	struct cgroup_subsys_state *ret = NULL;
	struct css_id *tmp;
	int tmpid;
	int rootid = css_id(root);
	int depth = css_depth(root);

	if (!rootid)
		return NULL;

	BUG_ON(!ss->use_id);
	/* fill start point for scan */
	tmpid = id;
	while (1) {
		/*
		 * scan the next entry from the idr tree; tmpid is updated
		 * by idr_get_next().
		 */
		spin_lock(&ss->id_lock);
		tmp = idr_get_next(&ss->idr, &tmpid);
		spin_unlock(&ss->id_lock);

		if (!tmp)
			break;
		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
			ret = rcu_dereference(tmp->css);
			if (ret) {
				*foundid = tmpid;
				break;
			}
		}
		/* continue to scan from the next id */
		tmpid = tmpid + 1;
	}
	return ret;
}

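/*
 * Example (sketch): a typical caller iterates over every css under a
 * root by restarting the scan just past the last position found:
 *
 *	int id = 1, found;
 *	struct cgroup_subsys_state *css;
 *
 *	rcu_read_lock();
 *	while ((css = css_get_next(ss, id, root, &found)) != NULL) {
 *		... use css ...
 *		id = found + 1;
 *	}
 *	rcu_read_unlock();
 */
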
/*
 * get corresponding css from file open on cgroupfs directory
 */
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
{
	struct cgroup *cgrp;
	struct inode *inode;
	struct cgroup_subsys_state *css;

	inode = f->f_dentry->d_inode;
	/* check in cgroup filesystem dir */
	if (inode->i_op != &cgroup_dir_inode_operations)
		return ERR_PTR(-EBADF);

	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
		return ERR_PTR(-EINVAL);

	/* get cgroup */
	cgrp = __d_cgrp(f->f_dentry);
	css = cgrp->subsys[id];
	return css ? css : ERR_PTR(-ENOENT);
}

#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
						struct cgroup *cont)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	kfree(cont->subsys[debug_subsys_id]);
}

static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
{
	return atomic_read(&cont->count);
}

static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
{
	return cgroup_task_count(cont);
}

static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup *cont,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&current->cgroups->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct cgroup *cont,
					 struct cftype *cft,
					 struct seq_file *seq)
{
	struct cg_cgroup_link *link;
	struct css_set *cg;

	read_lock(&css_set_lock);
	rcu_read_lock();
	cg = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
		struct cgroup *c = link->cgrp;
		const char *name;

		if (c->dentry)
			name = c->dentry->d_name.name;
		else
			name = "?";
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name);
	}
	rcu_read_unlock();
	read_unlock(&css_set_lock);
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct cgroup *cont,
				 struct cftype *cft,
				 struct seq_file *seq)
{
	struct cg_cgroup_link *link;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
		struct css_set *cg = link->cg;
		struct task_struct *task;
		int count = 0;
		seq_printf(seq, "css_set %p\n", cg);
		list_for_each_entry(task, &cg->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
				seq_puts(seq, "  ...\n");
				break;
			} else {
				seq_printf(seq, "  task %d\n",
					   task_pid_vnr(task));
			}
		}
	}
	read_unlock(&css_set_lock);
	return 0;
}

static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}

static struct cftype debug_files[] = {
	{
		.name = "cgroup_refcount",
		.read_u64 = cgroup_refcount_read,
	},
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.read_seq_string = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.read_seq_string = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},
};

static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, debug_files,
				ARRAY_SIZE(debug_files));
}

struct cgroup_subsys debug_subsys = {
	.name = "debug",
	.create = debug_create,
	.destroy = debug_destroy,
	.populate = debug_populate,
	.subsys_id = debug_subsys_id,
};
#endif /* CONFIG_CGROUP_DEBUG */
