GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/mempolicy.c
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
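/*
 * A minimal user-space sketch (not from this file) of how the policy modes
 * described above are selected via set_mempolicy(2).  It assumes libnuma's
 * <numaif.h> declarations (link with -lnuma) and that nodes 0 and 1 exist;
 * the node mask value is only illustrative.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* set_mempolicy(), MPOL_* constants */
#include <stdio.h>

int main(void)
{
	/* one bit per node; here nodes 0 and 1 (assumed to exist) */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* interleave future allocations of this process over the mask;
	 * MPOL_BIND or MPOL_PREFERRED would be passed the same way */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy(MPOL_INTERLEAVE)");

	/* ... allocate and touch memory here ... */

	/* back to the default (local) policy */
	if (set_mempolicy(MPOL_DEFAULT, NULL, 0))
		perror("set_mempolicy(MPOL_DEFAULT)");
	return 0;
}
#endif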

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task will rebind task->mempolicy in two steps.  The
	 * first step sets all the newly allowed nodes, and the second step
	 * clears all the disallowed nodes.  This way we never end up with
	 * an empty nodemask to allocate pages from.
	 * If we have a lock to protect task->mempolicy on the read side,
	 * we rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
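/*
 * A stand-alone sketch (not from this file) of what the fold+onto pair above
 * computes for MPOL_F_RELATIVE_NODES: the user's node numbers are taken as
 * positions relative to the allowed node set.  It uses a 64-bit mask instead
 * of nodemask_t, the helper name is invented, and the wrap-around that
 * nodes_fold() performs for positions beyond the allowed weight is omitted.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <stdint.h>
#include <stdio.h>

/* map bit n of "rel_request" onto the n-th set bit of "allowed" */
static uint64_t remap_relative(uint64_t rel_request, uint64_t allowed)
{
	uint64_t out = 0;
	int pos = 0;	/* ordinal position within "allowed" */

	for (int node = 0; node < 64; node++) {
		if (!(allowed & (1ULL << node)))
			continue;
		if (rel_request & (1ULL << pos))
			out |= 1ULL << node;
		pos++;
	}
	return out;
}

int main(void)
{
	/* user asks for relative nodes {0,2}; cpuset allows nodes {4..7} */
	uint64_t result = remap_relative(0x5, 0xF0);

	printf("%#llx\n", (unsigned long long)result);	/* 0x50 -> nodes {4,6} */
	return 0;
}
#endif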
169
170
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
171
{
172
if (nodes_empty(*nodes))
173
return -EINVAL;
174
pol->v.nodes = *nodes;
175
return 0;
176
}
177
178
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
179
{
180
if (!nodes)
181
pol->flags |= MPOL_F_LOCAL; /* local allocation */
182
else if (nodes_empty(*nodes))
183
return -EINVAL; /* no allowed nodes */
184
else
185
pol->v.preferred_node = first_node(*nodes);
186
return 0;
187
}
188
189
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
190
{
191
if (!is_valid_nodemask(nodes))
192
return -EINVAL;
193
pol->v.nodes = *nodes;
194
return 0;
195
}
196
197
/*
198
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
199
* any, for the new policy. mpol_new() has already validated the nodes
200
* parameter with respect to the policy mode and flags. But, we need to
201
* handle an empty nodemask with MPOL_PREFERRED here.
202
*
203
* Must be called holding task's alloc_lock to protect task's mems_allowed
204
* and mempolicy. May also be called holding the mmap_semaphore for write.
205
*/
206
static int mpol_set_nodemask(struct mempolicy *pol,
207
const nodemask_t *nodes, struct nodemask_scratch *nsc)
208
{
209
int ret;
210
211
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
212
if (pol == NULL)
213
return 0;
214
/* Check N_HIGH_MEMORY */
215
nodes_and(nsc->mask1,
216
cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
217
218
VM_BUG_ON(!nodes);
219
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
220
nodes = NULL; /* explicit local allocation */
221
else {
222
if (pol->flags & MPOL_F_RELATIVE_NODES)
223
mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
224
else
225
nodes_and(nsc->mask2, *nodes, nsc->mask1);
226
227
if (mpol_store_user_nodemask(pol))
228
pol->w.user_nodemask = *nodes;
229
else
230
pol->w.cpuset_mems_allowed =
231
cpuset_current_mems_allowed;
232
}
233
234
if (nodes)
235
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
236
else
237
ret = mpol_ops[pol->mode].create(pol, NULL);
238
return ret;
239
}
240
241
/*
242
* This function just creates a new policy, does some check and simple
243
* initialization. You must invoke mpol_set_nodemask() to set nodes.
244
*/
245
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
246
nodemask_t *nodes)
247
{
248
struct mempolicy *policy;
249
250
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
251
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
252
253
if (mode == MPOL_DEFAULT) {
254
if (nodes && !nodes_empty(*nodes))
255
return ERR_PTR(-EINVAL);
256
return NULL; /* simply delete any existing policy */
257
}
258
VM_BUG_ON(!nodes);
259
260
/*
261
* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
262
* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
263
* All other modes require a valid pointer to a non-empty nodemask.
264
*/
265
if (mode == MPOL_PREFERRED) {
266
if (nodes_empty(*nodes)) {
267
if (((flags & MPOL_F_STATIC_NODES) ||
268
(flags & MPOL_F_RELATIVE_NODES)))
269
return ERR_PTR(-EINVAL);
270
}
271
} else if (nodes_empty(*nodes))
272
return ERR_PTR(-EINVAL);
273
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
274
if (!policy)
275
return ERR_PTR(-ENOMEM);
276
atomic_set(&policy->refcnt, 1);
277
policy->mode = mode;
278
policy->flags = flags;
279
280
return policy;
281
}
282
283
/* Slow path of a mpol destructor. */
284
void __mpol_put(struct mempolicy *p)
285
{
286
if (!atomic_dec_and_test(&p->refcnt))
287
return;
288
kmem_cache_free(policy_cache, p);
289
}
290
291
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
292
enum mpol_rebind_step step)
293
{
294
}
295
296
/*
297
* step:
298
* MPOL_REBIND_ONCE - do rebind work at once
299
* MPOL_REBIND_STEP1 - set all the newly nodes
300
* MPOL_REBIND_STEP2 - clean all the disallowed nodes
301
*/
302
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
303
enum mpol_rebind_step step)
304
{
305
nodemask_t tmp;
306
307
if (pol->flags & MPOL_F_STATIC_NODES)
308
nodes_and(tmp, pol->w.user_nodemask, *nodes);
309
else if (pol->flags & MPOL_F_RELATIVE_NODES)
310
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
311
else {
312
/*
313
* if step == 1, we use ->w.cpuset_mems_allowed to cache the
314
* result
315
*/
316
if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
317
nodes_remap(tmp, pol->v.nodes,
318
pol->w.cpuset_mems_allowed, *nodes);
319
pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
320
} else if (step == MPOL_REBIND_STEP2) {
321
tmp = pol->w.cpuset_mems_allowed;
322
pol->w.cpuset_mems_allowed = *nodes;
323
} else
324
BUG();
325
}
326
327
if (nodes_empty(tmp))
328
tmp = *nodes;
329
330
if (step == MPOL_REBIND_STEP1)
331
nodes_or(pol->v.nodes, pol->v.nodes, tmp);
332
else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
333
pol->v.nodes = tmp;
334
else
335
BUG();
336
337
if (!node_isset(current->il_next, tmp)) {
338
current->il_next = next_node(current->il_next, tmp);
339
if (current->il_next >= MAX_NUMNODES)
340
current->il_next = first_node(tmp);
341
if (current->il_next >= MAX_NUMNODES)
342
current->il_next = numa_node_id();
343
}
344
}
345
346
static void mpol_rebind_preferred(struct mempolicy *pol,
347
const nodemask_t *nodes,
348
enum mpol_rebind_step step)
349
{
350
nodemask_t tmp;
351
352
if (pol->flags & MPOL_F_STATIC_NODES) {
353
int node = first_node(pol->w.user_nodemask);
354
355
if (node_isset(node, *nodes)) {
356
pol->v.preferred_node = node;
357
pol->flags &= ~MPOL_F_LOCAL;
358
} else
359
pol->flags |= MPOL_F_LOCAL;
360
} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
361
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
362
pol->v.preferred_node = first_node(tmp);
363
} else if (!(pol->flags & MPOL_F_LOCAL)) {
364
pol->v.preferred_node = node_remap(pol->v.preferred_node,
365
pol->w.cpuset_mems_allowed,
366
*nodes);
367
pol->w.cpuset_mems_allowed = *nodes;
368
}
369
}
370
371
/*
372
* mpol_rebind_policy - Migrate a policy to a different set of nodes
373
*
374
* If read-side task has no lock to protect task->mempolicy, write-side
375
* task will rebind the task->mempolicy by two step. The first step is
376
* setting all the newly nodes, and the second step is cleaning all the
377
* disallowed nodes. In this way, we can avoid finding no node to alloc
378
* page.
379
* If we have a lock to protect task->mempolicy in read-side, we do
380
* rebind directly.
381
*
382
* step:
383
* MPOL_REBIND_ONCE - do rebind work at once
384
* MPOL_REBIND_STEP1 - set all the newly nodes
385
* MPOL_REBIND_STEP2 - clean all the disallowed nodes
386
*/
387
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
388
enum mpol_rebind_step step)
389
{
390
if (!pol)
391
return;
392
if (!mpol_store_user_nodemask(pol) && step == 0 &&
393
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
394
return;
395
396
if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
397
return;
398
399
if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
400
BUG();
401
402
if (step == MPOL_REBIND_STEP1)
403
pol->flags |= MPOL_F_REBINDING;
404
else if (step == MPOL_REBIND_STEP2)
405
pol->flags &= ~MPOL_F_REBINDING;
406
else if (step >= MPOL_REBIND_NSTEP)
407
BUG();
408
409
mpol_ops[pol->mode].rebind(pol, newmask, step);
410
}
411
412
/*
413
* Wrapper for mpol_rebind_policy() that just requires task
414
* pointer, and updates task mempolicy.
415
*
416
* Called with task's alloc_lock held.
417
*/
418
419
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
420
enum mpol_rebind_step step)
421
{
422
mpol_rebind_policy(tsk->mempolicy, new, step);
423
}
424
425
/*
426
* Rebind each vma in mm to new nodemask.
427
*
428
* Call holding a reference to mm. Takes mm->mmap_sem during call.
429
*/
430
431
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
432
{
433
struct vm_area_struct *vma;
434
435
down_write(&mm->mmap_sem);
436
for (vma = mm->mmap; vma; vma = vma->vm_next)
437
mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
438
up_write(&mm->mmap_sem);
439
}
440
441
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
442
[MPOL_DEFAULT] = {
443
.rebind = mpol_rebind_default,
444
},
445
[MPOL_INTERLEAVE] = {
446
.create = mpol_new_interleave,
447
.rebind = mpol_rebind_nodemask,
448
},
449
[MPOL_PREFERRED] = {
450
.create = mpol_new_preferred,
451
.rebind = mpol_rebind_preferred,
452
},
453
[MPOL_BIND] = {
454
.create = mpol_new_bind,
455
.rebind = mpol_rebind_nodemask,
456
},
457
};
458
459
static void migrate_page_add(struct page *page, struct list_head *pagelist,
460
unsigned long flags);
461
462
/* Scan through pages checking if pages follow certain conditions. */
463
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
464
unsigned long addr, unsigned long end,
465
const nodemask_t *nodes, unsigned long flags,
466
void *private)
467
{
468
pte_t *orig_pte;
469
pte_t *pte;
470
spinlock_t *ptl;
471
472
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
473
do {
474
struct page *page;
475
int nid;
476
477
if (!pte_present(*pte))
478
continue;
479
page = vm_normal_page(vma, addr, *pte);
480
if (!page)
481
continue;
482
/*
483
* vm_normal_page() filters out zero pages, but there might
484
* still be PageReserved pages to skip, perhaps in a VDSO.
485
* And we cannot move PageKsm pages sensibly or safely yet.
486
*/
487
if (PageReserved(page) || PageKsm(page))
488
continue;
489
nid = page_to_nid(page);
490
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
491
continue;
492
493
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
494
migrate_page_add(page, private, flags);
495
else
496
break;
497
} while (pte++, addr += PAGE_SIZE, addr != end);
498
pte_unmap_unlock(orig_pte, ptl);
499
return addr != end;
500
}
501
502
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
503
unsigned long addr, unsigned long end,
504
const nodemask_t *nodes, unsigned long flags,
505
void *private)
506
{
507
pmd_t *pmd;
508
unsigned long next;
509
510
pmd = pmd_offset(pud, addr);
511
do {
512
next = pmd_addr_end(addr, end);
513
split_huge_page_pmd(vma->vm_mm, pmd);
514
if (pmd_none_or_clear_bad(pmd))
515
continue;
516
if (check_pte_range(vma, pmd, addr, next, nodes,
517
flags, private))
518
return -EIO;
519
} while (pmd++, addr = next, addr != end);
520
return 0;
521
}
522
523
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
524
unsigned long addr, unsigned long end,
525
const nodemask_t *nodes, unsigned long flags,
526
void *private)
527
{
528
pud_t *pud;
529
unsigned long next;
530
531
pud = pud_offset(pgd, addr);
532
do {
533
next = pud_addr_end(addr, end);
534
if (pud_none_or_clear_bad(pud))
535
continue;
536
if (check_pmd_range(vma, pud, addr, next, nodes,
537
flags, private))
538
return -EIO;
539
} while (pud++, addr = next, addr != end);
540
return 0;
541
}
542
543
static inline int check_pgd_range(struct vm_area_struct *vma,
544
unsigned long addr, unsigned long end,
545
const nodemask_t *nodes, unsigned long flags,
546
void *private)
547
{
548
pgd_t *pgd;
549
unsigned long next;
550
551
pgd = pgd_offset(vma->vm_mm, addr);
552
do {
553
next = pgd_addr_end(addr, end);
554
if (pgd_none_or_clear_bad(pgd))
555
continue;
556
if (check_pud_range(vma, pgd, addr, next, nodes,
557
flags, private))
558
return -EIO;
559
} while (pgd++, addr = next, addr != end);
560
return 0;
561
}
562
563
/*
564
* Check if all pages in a range are on a set of nodes.
565
* If pagelist != NULL then isolate pages from the LRU and
566
* put them on the pagelist.
567
*/
568
static struct vm_area_struct *
569
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
570
const nodemask_t *nodes, unsigned long flags, void *private)
571
{
572
int err;
573
struct vm_area_struct *first, *vma, *prev;
574
575
576
first = find_vma(mm, start);
577
if (!first)
578
return ERR_PTR(-EFAULT);
579
prev = NULL;
580
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
581
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
582
if (!vma->vm_next && vma->vm_end < end)
583
return ERR_PTR(-EFAULT);
584
if (prev && prev->vm_end < vma->vm_start)
585
return ERR_PTR(-EFAULT);
586
}
587
if (!is_vm_hugetlb_page(vma) &&
588
((flags & MPOL_MF_STRICT) ||
589
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
590
vma_migratable(vma)))) {
591
unsigned long endvma = vma->vm_end;
592
593
if (endvma > end)
594
endvma = end;
595
if (vma->vm_start > start)
596
start = vma->vm_start;
597
err = check_pgd_range(vma, start, endvma, nodes,
598
flags, private);
599
if (err) {
600
first = ERR_PTR(err);
601
break;
602
}
603
}
604
prev = vma;
605
}
606
return first;
607
}
608
609
/* Apply policy to a single VMA */
610
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
611
{
612
int err = 0;
613
struct mempolicy *old = vma->vm_policy;
614
615
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
616
vma->vm_start, vma->vm_end, vma->vm_pgoff,
617
vma->vm_ops, vma->vm_file,
618
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
619
620
if (vma->vm_ops && vma->vm_ops->set_policy)
621
err = vma->vm_ops->set_policy(vma, new);
622
if (!err) {
623
mpol_get(new);
624
vma->vm_policy = new;
625
mpol_put(old);
626
}
627
return err;
628
}
629
630
/* Step 2: apply policy to a range and do splits. */
631
static int mbind_range(struct mm_struct *mm, unsigned long start,
632
unsigned long end, struct mempolicy *new_pol)
633
{
634
struct vm_area_struct *next;
635
struct vm_area_struct *prev;
636
struct vm_area_struct *vma;
637
int err = 0;
638
pgoff_t pgoff;
639
unsigned long vmstart;
640
unsigned long vmend;
641
642
vma = find_vma_prev(mm, start, &prev);
643
if (!vma || vma->vm_start > start)
644
return -EFAULT;
645
646
for (; vma && vma->vm_start < end; prev = vma, vma = next) {
647
next = vma->vm_next;
648
vmstart = max(start, vma->vm_start);
649
vmend = min(end, vma->vm_end);
650
651
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
652
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653
vma->anon_vma, vma->vm_file, pgoff, new_pol);
654
if (prev) {
655
vma = prev;
656
next = vma->vm_next;
657
continue;
658
}
659
if (vma->vm_start != vmstart) {
660
err = split_vma(vma->vm_mm, vma, vmstart, 1);
661
if (err)
662
goto out;
663
}
664
if (vma->vm_end != vmend) {
665
err = split_vma(vma->vm_mm, vma, vmend, 0);
666
if (err)
667
goto out;
668
}
669
err = policy_vma(vma, new_pol);
670
if (err)
671
goto out;
672
}
673
674
out:
675
return err;
676
}
677
678
/*
679
* Update task->flags PF_MEMPOLICY bit: set iff non-default
680
* mempolicy. Allows more rapid checking of this (combined perhaps
681
* with other PF_* flag bits) on memory allocation hot code paths.
682
*
683
* If called from outside this file, the task 'p' should -only- be
684
* a newly forked child not yet visible on the task list, because
685
* manipulating the task flags of a visible task is not safe.
686
*
687
* The above limitation is why this routine has the funny name
688
* mpol_fix_fork_child_flag().
689
*
690
* It is also safe to call this with a task pointer of current,
691
* which the static wrapper mpol_set_task_struct_flag() does,
692
* for use within this file.
693
*/
694
695
void mpol_fix_fork_child_flag(struct task_struct *p)
696
{
697
if (p->mempolicy)
698
p->flags |= PF_MEMPOLICY;
699
else
700
p->flags &= ~PF_MEMPOLICY;
701
}
702
703
static void mpol_set_task_struct_flag(void)
704
{
705
mpol_fix_fork_child_flag(current);
706
}
707
708
/* Set the process memory policy */
709
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
710
nodemask_t *nodes)
711
{
712
struct mempolicy *new, *old;
713
struct mm_struct *mm = current->mm;
714
NODEMASK_SCRATCH(scratch);
715
int ret;
716
717
if (!scratch)
718
return -ENOMEM;
719
720
new = mpol_new(mode, flags, nodes);
721
if (IS_ERR(new)) {
722
ret = PTR_ERR(new);
723
goto out;
724
}
725
/*
726
* prevent changing our mempolicy while show_numa_maps()
727
* is using it.
728
* Note: do_set_mempolicy() can be called at init time
729
* with no 'mm'.
730
*/
731
if (mm)
732
down_write(&mm->mmap_sem);
733
task_lock(current);
734
ret = mpol_set_nodemask(new, nodes, scratch);
735
if (ret) {
736
task_unlock(current);
737
if (mm)
738
up_write(&mm->mmap_sem);
739
mpol_put(new);
740
goto out;
741
}
742
old = current->mempolicy;
743
current->mempolicy = new;
744
mpol_set_task_struct_flag();
745
if (new && new->mode == MPOL_INTERLEAVE &&
746
nodes_weight(new->v.nodes))
747
current->il_next = first_node(new->v.nodes);
748
task_unlock(current);
749
if (mm)
750
up_write(&mm->mmap_sem);
751
752
mpol_put(old);
753
ret = 0;
754
out:
755
NODEMASK_SCRATCH_FREE(scratch);
756
return ret;
757
}
758
759
/*
760
* Return nodemask for policy for get_mempolicy() query
761
*
762
* Called with task's alloc_lock held
763
*/
764
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
765
{
766
nodes_clear(*nodes);
767
if (p == &default_policy)
768
return;
769
770
switch (p->mode) {
771
case MPOL_BIND:
772
/* Fall through */
773
case MPOL_INTERLEAVE:
774
*nodes = p->v.nodes;
775
break;
776
case MPOL_PREFERRED:
777
if (!(p->flags & MPOL_F_LOCAL))
778
node_set(p->v.preferred_node, *nodes);
779
/* else return empty node mask for local allocation */
780
break;
781
default:
782
BUG();
783
}
784
}
785
786
static int lookup_node(struct mm_struct *mm, unsigned long addr)
787
{
788
struct page *p;
789
int err;
790
791
err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
792
if (err >= 0) {
793
err = page_to_nid(p);
794
put_page(p);
795
}
796
return err;
797
}
798
799
/* Retrieve NUMA policy */
800
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
801
unsigned long addr, unsigned long flags)
802
{
803
int err;
804
struct mm_struct *mm = current->mm;
805
struct vm_area_struct *vma = NULL;
806
struct mempolicy *pol = current->mempolicy;
807
808
if (flags &
809
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
810
return -EINVAL;
811
812
if (flags & MPOL_F_MEMS_ALLOWED) {
813
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
814
return -EINVAL;
815
*policy = 0; /* just so it's initialized */
816
task_lock(current);
817
*nmask = cpuset_current_mems_allowed;
818
task_unlock(current);
819
return 0;
820
}
821
822
if (flags & MPOL_F_ADDR) {
823
/*
824
* Do NOT fall back to task policy if the
825
* vma/shared policy at addr is NULL. We
826
* want to return MPOL_DEFAULT in this case.
827
*/
828
down_read(&mm->mmap_sem);
829
vma = find_vma_intersection(mm, addr, addr+1);
830
if (!vma) {
831
up_read(&mm->mmap_sem);
832
return -EFAULT;
833
}
834
if (vma->vm_ops && vma->vm_ops->get_policy)
835
pol = vma->vm_ops->get_policy(vma, addr);
836
else
837
pol = vma->vm_policy;
838
} else if (addr)
839
return -EINVAL;
840
841
if (!pol)
842
pol = &default_policy; /* indicates default behavior */
843
844
if (flags & MPOL_F_NODE) {
845
if (flags & MPOL_F_ADDR) {
846
err = lookup_node(mm, addr);
847
if (err < 0)
848
goto out;
849
*policy = err;
850
} else if (pol == current->mempolicy &&
851
pol->mode == MPOL_INTERLEAVE) {
852
*policy = current->il_next;
853
} else {
854
err = -EINVAL;
855
goto out;
856
}
857
} else {
858
*policy = pol == &default_policy ? MPOL_DEFAULT :
859
pol->mode;
860
/*
861
* Internal mempolicy flags must be masked off before exposing
862
* the policy to userspace.
863
*/
864
*policy |= (pol->flags & MPOL_MODE_FLAGS);
865
}
866
867
if (vma) {
868
up_read(&current->mm->mmap_sem);
869
vma = NULL;
870
}
871
872
err = 0;
873
if (nmask) {
874
if (mpol_store_user_nodemask(pol)) {
875
*nmask = pol->w.user_nodemask;
876
} else {
877
task_lock(current);
878
get_policy_nodemask(pol, nmask);
879
task_unlock(current);
880
}
881
}
882
883
out:
884
mpol_cond_put(pol);
885
if (vma)
886
up_read(&current->mm->mmap_sem);
887
return err;
888
}
889
890
#ifdef CONFIG_MIGRATION
891
/*
892
* page migration
893
*/
894
static void migrate_page_add(struct page *page, struct list_head *pagelist,
895
unsigned long flags)
896
{
897
/*
898
* Avoid migrating a page that is shared with others.
899
*/
900
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
901
if (!isolate_lru_page(page)) {
902
list_add_tail(&page->lru, pagelist);
903
inc_zone_page_state(page, NR_ISOLATED_ANON +
904
page_is_file_cache(page));
905
}
906
}
907
}
908
909
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
910
{
911
return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
912
}
913
914
/*
915
* Migrate pages from one node to a target node.
916
* Returns error or the number of pages not migrated.
917
*/
918
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
919
int flags)
920
{
921
nodemask_t nmask;
922
LIST_HEAD(pagelist);
923
int err = 0;
924
struct vm_area_struct *vma;
925
926
nodes_clear(nmask);
927
node_set(source, nmask);
928
929
vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
930
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
931
if (IS_ERR(vma))
932
return PTR_ERR(vma);
933
934
if (!list_empty(&pagelist)) {
935
err = migrate_pages(&pagelist, new_node_page, dest,
936
false, true);
937
if (err)
938
putback_lru_pages(&pagelist);
939
}
940
941
return err;
942
}
943
944
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
950
int do_migrate_pages(struct mm_struct *mm,
951
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
952
{
953
int busy = 0;
954
int err;
955
nodemask_t tmp;
956
957
err = migrate_prep();
958
if (err)
959
return err;
960
961
down_read(&mm->mmap_sem);
962
963
err = migrate_vmas(mm, from_nodes, to_nodes, flags);
964
if (err)
965
goto out;
966
967
	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need to move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out immediately with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the most
	 * recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */
997
998
tmp = *from_nodes;
999
while (!nodes_empty(tmp)) {
1000
int s,d;
1001
int source = -1;
1002
int dest = 0;
1003
1004
for_each_node_mask(s, tmp) {
1005
d = node_remap(s, *from_nodes, *to_nodes);
1006
if (s == d)
1007
continue;
1008
1009
source = s; /* Node moved. Memorize */
1010
dest = d;
1011
1012
/* dest not in remaining from nodes? */
1013
if (!node_isset(dest, tmp))
1014
break;
1015
}
1016
if (source == -1)
1017
break;
1018
1019
node_clear(source, tmp);
1020
err = migrate_to_node(mm, source, dest, flags);
1021
if (err > 0)
1022
busy += err;
1023
if (err < 0)
1024
break;
1025
}
1026
out:
1027
up_read(&mm->mmap_sem);
1028
if (err < 0)
1029
return err;
1030
return busy;
1031
1032
}
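/*
 * A stand-alone sketch (not from this file) of the <source, dest>
 * pair-picking loop in do_migrate_pages() above, using 64-bit masks in
 * place of nodemask_t.  Helper names are invented here; the kernel uses
 * node_remap() and for_each_node_mask() instead, and relies on the
 * GCC/Clang popcount builtin only for this illustration.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <stdint.h>
#include <stdio.h>

/* ordinal position of bit s within mask (count of set bits below s) */
static int ordinal(uint64_t mask, int s)
{
	return __builtin_popcountll(mask & ((1ULL << s) - 1));
}

/* n-th set bit of mask, wrapping modulo its weight (node_remap() model) */
static int nth_bit(uint64_t mask, int n)
{
	n %= __builtin_popcountll(mask);
	for (int b = 0; b < 64; b++)
		if ((mask & (1ULL << b)) && n-- == 0)
			return b;
	return -1;
}

int main(void)
{
	uint64_t from = 0x0F, to = 0xF0;	/* move nodes 0-3 onto 4-7 */
	uint64_t tmp = from;

	while (tmp) {
		int source = -1, dest = 0;

		for (int s = 0; s < 64; s++) {
			int d;

			if (!(tmp & (1ULL << s)))
				continue;
			d = nth_bit(to, ordinal(from, s));
			if (s == d)
				continue;
			source = s;		/* node moved, memorize */
			dest = d;
			if (!(tmp & (1ULL << d)))
				break;		/* dest is not a remaining source */
		}
		if (source == -1)
			break;
		tmp &= ~(1ULL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
	return 0;
}
#endif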
1033
1034
/*
1035
* Allocate a new page for page migration based on vma policy.
1036
* Start assuming that page is mapped by vma pointed to by @private.
1037
* Search forward from there, if not. N.B., this assumes that the
1038
* list of pages handed to migrate_pages()--which is how we get here--
1039
* is in virtual address order.
1040
*/
1041
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1042
{
1043
struct vm_area_struct *vma = (struct vm_area_struct *)private;
1044
unsigned long uninitialized_var(address);
1045
1046
while (vma) {
1047
address = page_address_in_vma(page, vma);
1048
if (address != -EFAULT)
1049
break;
1050
vma = vma->vm_next;
1051
}
1052
1053
/*
1054
* if !vma, alloc_page_vma() will use task or system default policy
1055
*/
1056
return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1057
}
1058
#else
1059
1060
static void migrate_page_add(struct page *page, struct list_head *pagelist,
1061
unsigned long flags)
1062
{
1063
}
1064
1065
int do_migrate_pages(struct mm_struct *mm,
1066
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1067
{
1068
return -ENOSYS;
1069
}
1070
1071
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1072
{
1073
return NULL;
1074
}
1075
#endif
1076
1077
static long do_mbind(unsigned long start, unsigned long len,
1078
unsigned short mode, unsigned short mode_flags,
1079
nodemask_t *nmask, unsigned long flags)
1080
{
1081
struct vm_area_struct *vma;
1082
struct mm_struct *mm = current->mm;
1083
struct mempolicy *new;
1084
unsigned long end;
1085
int err;
1086
LIST_HEAD(pagelist);
1087
1088
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1089
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1090
return -EINVAL;
1091
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1092
return -EPERM;
1093
1094
if (start & ~PAGE_MASK)
1095
return -EINVAL;
1096
1097
if (mode == MPOL_DEFAULT)
1098
flags &= ~MPOL_MF_STRICT;
1099
1100
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1101
end = start + len;
1102
1103
if (end < start)
1104
return -EINVAL;
1105
if (end == start)
1106
return 0;
1107
1108
new = mpol_new(mode, mode_flags, nmask);
1109
if (IS_ERR(new))
1110
return PTR_ERR(new);
1111
1112
/*
1113
* If we are using the default policy then operation
1114
* on discontinuous address spaces is okay after all
1115
*/
1116
if (!new)
1117
flags |= MPOL_MF_DISCONTIG_OK;
1118
1119
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1120
start, start + len, mode, mode_flags,
1121
nmask ? nodes_addr(*nmask)[0] : -1);
1122
1123
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1124
1125
err = migrate_prep();
1126
if (err)
1127
goto mpol_out;
1128
}
1129
{
1130
NODEMASK_SCRATCH(scratch);
1131
if (scratch) {
1132
down_write(&mm->mmap_sem);
1133
task_lock(current);
1134
err = mpol_set_nodemask(new, nmask, scratch);
1135
task_unlock(current);
1136
if (err)
1137
up_write(&mm->mmap_sem);
1138
} else
1139
err = -ENOMEM;
1140
NODEMASK_SCRATCH_FREE(scratch);
1141
}
1142
if (err)
1143
goto mpol_out;
1144
1145
vma = check_range(mm, start, end, nmask,
1146
flags | MPOL_MF_INVERT, &pagelist);
1147
1148
err = PTR_ERR(vma);
1149
if (!IS_ERR(vma)) {
1150
int nr_failed = 0;
1151
1152
err = mbind_range(mm, start, end, new);
1153
1154
if (!list_empty(&pagelist)) {
1155
nr_failed = migrate_pages(&pagelist, new_vma_page,
1156
(unsigned long)vma,
1157
false, true);
1158
if (nr_failed)
1159
putback_lru_pages(&pagelist);
1160
}
1161
1162
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1163
err = -EIO;
1164
} else
1165
putback_lru_pages(&pagelist);
1166
1167
up_write(&mm->mmap_sem);
1168
mpol_out:
1169
mpol_put(new);
1170
return err;
1171
}
1172
1173
/*
1174
* User space interface with variable sized bitmaps for nodelists.
1175
*/
1176
1177
/* Copy a node mask from user space. */
1178
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1179
unsigned long maxnode)
1180
{
1181
unsigned long k;
1182
unsigned long nlongs;
1183
unsigned long endmask;
1184
1185
--maxnode;
1186
nodes_clear(*nodes);
1187
if (maxnode == 0 || !nmask)
1188
return 0;
1189
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1190
return -EINVAL;
1191
1192
nlongs = BITS_TO_LONGS(maxnode);
1193
if ((maxnode % BITS_PER_LONG) == 0)
1194
endmask = ~0UL;
1195
else
1196
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1197
1198
/* When the user specified more nodes than supported just check
1199
if the non supported part is all zero. */
1200
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1201
if (nlongs > PAGE_SIZE/sizeof(long))
1202
return -EINVAL;
1203
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1204
unsigned long t;
1205
if (get_user(t, nmask + k))
1206
return -EFAULT;
1207
if (k == nlongs - 1) {
1208
if (t & endmask)
1209
return -EINVAL;
1210
} else if (t)
1211
return -EINVAL;
1212
}
1213
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1214
endmask = ~0UL;
1215
}
1216
1217
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1218
return -EFAULT;
1219
nodes_addr(*nodes)[nlongs-1] &= endmask;
1220
return 0;
1221
}
1222
1223
/* Copy a kernel node mask to user space */
1224
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1225
nodemask_t *nodes)
1226
{
1227
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1228
const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1229
1230
if (copy > nbytes) {
1231
if (copy > PAGE_SIZE)
1232
return -EINVAL;
1233
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1234
return -EFAULT;
1235
copy = nbytes;
1236
}
1237
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1238
}
1239
1240
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1241
unsigned long, mode, unsigned long __user *, nmask,
1242
unsigned long, maxnode, unsigned, flags)
1243
{
1244
nodemask_t nodes;
1245
int err;
1246
unsigned short mode_flags;
1247
1248
mode_flags = mode & MPOL_MODE_FLAGS;
1249
mode &= ~MPOL_MODE_FLAGS;
1250
if (mode >= MPOL_MAX)
1251
return -EINVAL;
1252
if ((mode_flags & MPOL_F_STATIC_NODES) &&
1253
(mode_flags & MPOL_F_RELATIVE_NODES))
1254
return -EINVAL;
1255
err = get_nodes(&nodes, nmask, maxnode);
1256
if (err)
1257
return err;
1258
return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1259
}
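/*
 * A hedged user-space sketch (not from this file) of calling mbind(2) on an
 * anonymous mapping, matching the argument order decoded by the syscall
 * above.  It assumes libnuma's <numaif.h> declarations (link with -lnuma)
 * and that node 0 exists.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_STRICT */
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t len = 4096 * 16;
	unsigned long nodemask = 1UL << 0;	/* bind to node 0 */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* maxnode counts the bits in nodemask; MPOL_MF_STRICT reports an
	 * error if already-present pages violate the policy */
	if (mbind(buf, len, MPOL_BIND, &nodemask,
		  8 * sizeof(nodemask), MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}
#endif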
1260
1261
/* Set the process memory policy */
1262
SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1263
unsigned long, maxnode)
1264
{
1265
int err;
1266
nodemask_t nodes;
1267
unsigned short flags;
1268
1269
flags = mode & MPOL_MODE_FLAGS;
1270
mode &= ~MPOL_MODE_FLAGS;
1271
if ((unsigned int)mode >= MPOL_MAX)
1272
return -EINVAL;
1273
if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1274
return -EINVAL;
1275
err = get_nodes(&nodes, nmask, maxnode);
1276
if (err)
1277
return err;
1278
return do_set_mempolicy(mode, flags, &nodes);
1279
}
1280
1281
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1282
const unsigned long __user *, old_nodes,
1283
const unsigned long __user *, new_nodes)
1284
{
1285
const struct cred *cred = current_cred(), *tcred;
1286
struct mm_struct *mm = NULL;
1287
struct task_struct *task;
1288
nodemask_t task_nodes;
1289
int err;
1290
nodemask_t *old;
1291
nodemask_t *new;
1292
NODEMASK_SCRATCH(scratch);
1293
1294
if (!scratch)
1295
return -ENOMEM;
1296
1297
old = &scratch->mask1;
1298
new = &scratch->mask2;
1299
1300
err = get_nodes(old, old_nodes, maxnode);
1301
if (err)
1302
goto out;
1303
1304
err = get_nodes(new, new_nodes, maxnode);
1305
if (err)
1306
goto out;
1307
1308
/* Find the mm_struct */
1309
rcu_read_lock();
1310
task = pid ? find_task_by_vpid(pid) : current;
1311
if (!task) {
1312
rcu_read_unlock();
1313
err = -ESRCH;
1314
goto out;
1315
}
1316
mm = get_task_mm(task);
1317
rcu_read_unlock();
1318
1319
err = -EINVAL;
1320
if (!mm)
1321
goto out;
1322
1323
/*
1324
* Check if this process has the right to modify the specified
1325
* process. The right exists if the process has administrative
1326
* capabilities, superuser privileges or the same
1327
* userid as the target process.
1328
*/
1329
rcu_read_lock();
1330
tcred = __task_cred(task);
1331
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1332
cred->uid != tcred->suid && cred->uid != tcred->uid &&
1333
!capable(CAP_SYS_NICE)) {
1334
rcu_read_unlock();
1335
err = -EPERM;
1336
goto out;
1337
}
1338
rcu_read_unlock();
1339
1340
task_nodes = cpuset_mems_allowed(task);
1341
/* Is the user allowed to access the target nodes? */
1342
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1343
err = -EPERM;
1344
goto out;
1345
}
1346
1347
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1348
err = -EINVAL;
1349
goto out;
1350
}
1351
1352
err = security_task_movememory(task);
1353
if (err)
1354
goto out;
1355
1356
err = do_migrate_pages(mm, old, new,
1357
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1358
out:
1359
if (mm)
1360
mmput(mm);
1361
NODEMASK_SCRATCH_FREE(scratch);
1362
1363
return err;
1364
}
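/*
 * A hedged user-space sketch (not from this file) of migrate_pages(2), which
 * lands in the handler above.  It assumes libnuma's <numaif.h> wrapper and
 * that nodes 0 and 1 both exist; pid 0 means the calling process.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* migrate_pages() */
#include <stdio.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* move pages off node 0 */
	unsigned long new_nodes = 1UL << 1;	/* ... onto node 1 */
	long left;

	left = migrate_pages(0 /* self */, 8 * sizeof(unsigned long),
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}
#endif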
1365
1366
1367
/* Retrieve NUMA policy */
1368
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1369
unsigned long __user *, nmask, unsigned long, maxnode,
1370
unsigned long, addr, unsigned long, flags)
1371
{
1372
int err;
1373
int uninitialized_var(pval);
1374
nodemask_t nodes;
1375
1376
if (nmask != NULL && maxnode < MAX_NUMNODES)
1377
return -EINVAL;
1378
1379
err = do_get_mempolicy(&pval, &nodes, addr, flags);
1380
1381
if (err)
1382
return err;
1383
1384
if (policy && put_user(pval, policy))
1385
return -EFAULT;
1386
1387
if (nmask)
1388
err = copy_nodes_to_user(nmask, maxnode, &nodes);
1389
1390
return err;
1391
}
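/*
 * A hedged user-space sketch (not from this file) of the
 * MPOL_F_NODE|MPOL_F_ADDR query decoded by the syscall above: it reports
 * which node currently backs a given address.  It assumes libnuma's
 * <numaif.h> declarations.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int node = -1;
	char *p = malloc(4096);

	p[0] = 1;		/* fault the page in so it has a node */
	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR))
		perror("get_mempolicy");
	else
		printf("page at %p is on node %d\n", (void *)p, node);
	free(p);
	return 0;
}
#endif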
1392
1393
#ifdef CONFIG_COMPAT
1394
1395
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1396
compat_ulong_t __user *nmask,
1397
compat_ulong_t maxnode,
1398
compat_ulong_t addr, compat_ulong_t flags)
1399
{
1400
long err;
1401
unsigned long __user *nm = NULL;
1402
unsigned long nr_bits, alloc_size;
1403
DECLARE_BITMAP(bm, MAX_NUMNODES);
1404
1405
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1406
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1407
1408
if (nmask)
1409
nm = compat_alloc_user_space(alloc_size);
1410
1411
err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412
1413
if (!err && nmask) {
1414
err = copy_from_user(bm, nm, alloc_size);
1415
/* ensure entire bitmap is zeroed */
1416
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417
err |= compat_put_bitmap(nmask, bm, nr_bits);
1418
}
1419
1420
return err;
1421
}
1422
1423
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1424
compat_ulong_t maxnode)
1425
{
1426
long err = 0;
1427
unsigned long __user *nm = NULL;
1428
unsigned long nr_bits, alloc_size;
1429
DECLARE_BITMAP(bm, MAX_NUMNODES);
1430
1431
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1432
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1433
1434
if (nmask) {
1435
err = compat_get_bitmap(bm, nmask, nr_bits);
1436
nm = compat_alloc_user_space(alloc_size);
1437
err |= copy_to_user(nm, bm, alloc_size);
1438
}
1439
1440
if (err)
1441
return -EFAULT;
1442
1443
return sys_set_mempolicy(mode, nm, nr_bits+1);
1444
}
1445
1446
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1447
compat_ulong_t mode, compat_ulong_t __user *nmask,
1448
compat_ulong_t maxnode, compat_ulong_t flags)
1449
{
1450
long err = 0;
1451
unsigned long __user *nm = NULL;
1452
unsigned long nr_bits, alloc_size;
1453
nodemask_t bm;
1454
1455
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1456
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1457
1458
if (nmask) {
1459
err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1460
nm = compat_alloc_user_space(alloc_size);
1461
err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1462
}
1463
1464
if (err)
1465
return -EFAULT;
1466
1467
return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1468
}
1469
1470
#endif
1471
1472
/*
1473
* get_vma_policy(@task, @vma, @addr)
1474
* @task - task for fallback if vma policy == default
1475
* @vma - virtual memory area whose policy is sought
1476
* @addr - address in @vma for shared policy lookup
1477
*
1478
* Returns effective policy for a VMA at specified address.
1479
* Falls back to @task or system default policy, as necessary.
1480
* Current or other task's task mempolicy and non-shared vma policies
1481
* are protected by the task's mmap_sem, which must be held for read by
1482
* the caller.
1483
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1484
* count--added by the get_policy() vm_op, as appropriate--to protect against
1485
* freeing by another task. It is the caller's responsibility to free the
1486
* extra reference for shared policies.
1487
*/
1488
struct mempolicy *get_vma_policy(struct task_struct *task,
1489
struct vm_area_struct *vma, unsigned long addr)
1490
{
1491
struct mempolicy *pol = task->mempolicy;
1492
1493
if (vma) {
1494
if (vma->vm_ops && vma->vm_ops->get_policy) {
1495
struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1496
addr);
1497
if (vpol)
1498
pol = vpol;
1499
} else if (vma->vm_policy)
1500
pol = vma->vm_policy;
1501
}
1502
if (!pol)
1503
pol = &default_policy;
1504
return pol;
1505
}
1506
1507
/*
1508
* Return a nodemask representing a mempolicy for filtering nodes for
1509
* page allocation
1510
*/
1511
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1512
{
1513
/* Lower zones don't get a nodemask applied for MPOL_BIND */
1514
if (unlikely(policy->mode == MPOL_BIND) &&
1515
gfp_zone(gfp) >= policy_zone &&
1516
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1517
return &policy->v.nodes;
1518
1519
return NULL;
1520
}
1521
1522
/* Return a zonelist indicated by gfp for node representing a mempolicy */
1523
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1524
int nd)
1525
{
1526
switch (policy->mode) {
1527
case MPOL_PREFERRED:
1528
if (!(policy->flags & MPOL_F_LOCAL))
1529
nd = policy->v.preferred_node;
1530
break;
1531
case MPOL_BIND:
1532
/*
1533
* Normally, MPOL_BIND allocations are node-local within the
1534
* allowed nodemask. However, if __GFP_THISNODE is set and the
1535
* current node isn't part of the mask, we use the zonelist for
1536
* the first node in the mask instead.
1537
*/
1538
if (unlikely(gfp & __GFP_THISNODE) &&
1539
unlikely(!node_isset(nd, policy->v.nodes)))
1540
nd = first_node(policy->v.nodes);
1541
break;
1542
default:
1543
BUG();
1544
}
1545
return node_zonelist(nd, gfp);
1546
}
1547
1548
/* Do dynamic interleaving for a process */
1549
static unsigned interleave_nodes(struct mempolicy *policy)
1550
{
1551
unsigned nid, next;
1552
struct task_struct *me = current;
1553
1554
nid = me->il_next;
1555
next = next_node(nid, policy->v.nodes);
1556
if (next >= MAX_NUMNODES)
1557
next = first_node(policy->v.nodes);
1558
if (next < MAX_NUMNODES)
1559
me->il_next = next;
1560
return nid;
1561
}
1562
1563
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
1571
unsigned slab_node(struct mempolicy *policy)
1572
{
1573
if (!policy || policy->flags & MPOL_F_LOCAL)
1574
return numa_node_id();
1575
1576
switch (policy->mode) {
1577
case MPOL_PREFERRED:
1578
/*
1579
* handled MPOL_F_LOCAL above
1580
*/
1581
return policy->v.preferred_node;
1582
1583
case MPOL_INTERLEAVE:
1584
return interleave_nodes(policy);
1585
1586
case MPOL_BIND: {
1587
/*
1588
* Follow bind policy behavior and start allocation at the
1589
* first node.
1590
*/
1591
struct zonelist *zonelist;
1592
struct zone *zone;
1593
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1594
zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1595
(void)first_zones_zonelist(zonelist, highest_zoneidx,
1596
&policy->v.nodes,
1597
&zone);
1598
return zone ? zone->node : numa_node_id();
1599
}
1600
1601
default:
1602
BUG();
1603
}
1604
}
1605
1606
/* Do static interleaving for a VMA with known offset. */
1607
static unsigned offset_il_node(struct mempolicy *pol,
1608
struct vm_area_struct *vma, unsigned long off)
1609
{
1610
unsigned nnodes = nodes_weight(pol->v.nodes);
1611
unsigned target;
1612
int c;
1613
int nid = -1;
1614
1615
if (!nnodes)
1616
return numa_node_id();
1617
target = (unsigned int)off % nnodes;
1618
c = 0;
1619
do {
1620
nid = next_node(nid, pol->v.nodes);
1621
c++;
1622
} while (c <= target);
1623
return nid;
1624
}
1625
1626
/* Determine a node number for interleave */
1627
static inline unsigned interleave_nid(struct mempolicy *pol,
1628
struct vm_area_struct *vma, unsigned long addr, int shift)
1629
{
1630
if (vma) {
1631
unsigned long off;
1632
1633
/*
1634
* for small pages, there is no difference between
1635
* shift and PAGE_SHIFT, so the bit-shift is safe.
1636
* for huge pages, since vm_pgoff is in units of small
1637
* pages, we need to shift off the always 0 bits to get
1638
* a useful offset.
1639
*/
1640
BUG_ON(shift < PAGE_SHIFT);
1641
off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1642
off += (addr - vma->vm_start) >> shift;
1643
return offset_il_node(pol, vma, off);
1644
} else
1645
return interleave_nodes(pol);
1646
}
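/*
 * A stand-alone sketch (not from this file) of the static interleave
 * calculation in offset_il_node()/interleave_nid() above, using a 64-bit
 * mask instead of nodemask_t.  The helper name is invented; offsets map
 * onto the interleave set round-robin.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <stdint.h>
#include <stdio.h>

/* pick the (off % weight)-th set bit of mask, like offset_il_node() */
static int interleave_node_for_offset(uint64_t mask, unsigned long off)
{
	int target = off % __builtin_popcountll(mask);

	for (int node = 0; node < 64; node++)
		if ((mask & (1ULL << node)) && target-- == 0)
			return node;
	return -1;
}

int main(void)
{
	uint64_t nodes = (1ULL << 0) | (1ULL << 2) | (1ULL << 3);

	/* page offsets 0..5 land on nodes 0,2,3,0,2,3 */
	for (unsigned long off = 0; off < 6; off++)
		printf("offset %lu -> node %d\n", off,
		       interleave_node_for_offset(nodes, off));
	return 0;
}
#endif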
1647
1648
#ifdef CONFIG_HUGETLBFS
1649
/*
1650
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1651
* @vma = virtual memory area whose policy is sought
1652
* @addr = address in @vma for shared policy lookup and interleave policy
1653
* @gfp_flags = for requested zone
1654
* @mpol = pointer to mempolicy pointer for reference counted mempolicy
1655
* @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1656
*
1657
* Returns a zonelist suitable for a huge page allocation and a pointer
1658
* to the struct mempolicy for conditional unref after allocation.
1659
* If the effective policy is 'BIND', returns a pointer to the mempolicy's
1660
* @nodemask for filtering the zonelist.
1661
*
1662
* Must be protected by get_mems_allowed()
1663
*/
1664
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1665
gfp_t gfp_flags, struct mempolicy **mpol,
1666
nodemask_t **nodemask)
1667
{
1668
struct zonelist *zl;
1669
1670
*mpol = get_vma_policy(current, vma, addr);
1671
*nodemask = NULL; /* assume !MPOL_BIND */
1672
1673
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1674
zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1675
huge_page_shift(hstate_vma(vma))), gfp_flags);
1676
} else {
1677
zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1678
if ((*mpol)->mode == MPOL_BIND)
1679
*nodemask = &(*mpol)->v.nodes;
1680
}
1681
return zl;
1682
}
1683
1684
/*
1685
* init_nodemask_of_mempolicy
1686
*
1687
* If the current task's mempolicy is "default" [NULL], return 'false'
1688
* to indicate default policy. Otherwise, extract the policy nodemask
1689
* for 'bind' or 'interleave' policy into the argument nodemask, or
1690
* initialize the argument nodemask to contain the single node for
1691
* 'preferred' or 'local' policy and return 'true' to indicate presence
1692
* of non-default mempolicy.
1693
*
1694
* We don't bother with reference counting the mempolicy [mpol_get/put]
* because the current task is examining its own mempolicy and a task's
* mempolicy is only ever changed by the task itself.
1697
*
1698
* N.B., it is the caller's responsibility to free a returned nodemask.
1699
*/
1700
bool init_nodemask_of_mempolicy(nodemask_t *mask)
1701
{
1702
struct mempolicy *mempolicy;
1703
int nid;
1704
1705
if (!(mask && current->mempolicy))
1706
return false;
1707
1708
task_lock(current);
1709
mempolicy = current->mempolicy;
1710
switch (mempolicy->mode) {
1711
case MPOL_PREFERRED:
1712
if (mempolicy->flags & MPOL_F_LOCAL)
1713
nid = numa_node_id();
1714
else
1715
nid = mempolicy->v.preferred_node;
1716
init_nodemask_of_node(mask, nid);
1717
break;
1718
1719
case MPOL_BIND:
1720
/* Fall through */
1721
case MPOL_INTERLEAVE:
1722
*mask = mempolicy->v.nodes;
1723
break;
1724
1725
default:
1726
BUG();
1727
}
1728
task_unlock(current);
1729
1730
return true;
1731
}
1732
#endif
1733
1734
/*
1735
* mempolicy_nodemask_intersects
1736
*
1737
* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1738
* policy. Otherwise, check for intersection between mask and the policy
1739
* nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1740
* policy, always return true since it may allocate elsewhere on fallback.
1741
*
1742
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
1743
*/
1744
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1745
const nodemask_t *mask)
1746
{
1747
struct mempolicy *mempolicy;
1748
bool ret = true;
1749
1750
if (!mask)
1751
return ret;
1752
task_lock(tsk);
1753
mempolicy = tsk->mempolicy;
1754
if (!mempolicy)
1755
goto out;
1756
1757
switch (mempolicy->mode) {
1758
case MPOL_PREFERRED:
1759
/*
1760
* MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1761
* allocate from, they may fallback to other nodes when oom.
1762
* Thus, it's possible for tsk to have allocated memory from
1763
* nodes in mask.
1764
*/
1765
break;
1766
case MPOL_BIND:
1767
case MPOL_INTERLEAVE:
1768
ret = nodes_intersects(mempolicy->v.nodes, *mask);
1769
break;
1770
default:
1771
BUG();
1772
}
1773
out:
1774
task_unlock(tsk);
1775
return ret;
1776
}
1777
1778
/* Allocate a page in interleaved policy.
1779
Own path because it needs to do special accounting. */
1780
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1781
unsigned nid)
1782
{
1783
struct zonelist *zl;
1784
struct page *page;
1785
1786
zl = node_zonelist(nid, gfp);
1787
page = __alloc_pages(gfp, order, zl);
1788
if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1789
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1790
return page;
1791
}
1792
1793
/**
1794
* alloc_pages_vma - Allocate a page for a VMA.
1795
*
1796
* @gfp:
1797
* %GFP_USER user allocation.
1798
* %GFP_KERNEL kernel allocations,
1799
* %GFP_HIGHMEM highmem/user allocations,
1800
* %GFP_FS allocation should not call back into a file system.
1801
* %GFP_ATOMIC don't sleep.
1802
*
1803
* @order:Order of the GFP allocation.
1804
* @vma: Pointer to VMA or NULL if not available.
1805
* @addr: Virtual Address of the allocation. Must be inside the VMA.
1806
*
1807
* This function allocates a page from the kernel page pool and applies
1808
* a NUMA policy associated with the VMA or the current process.
1809
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
1810
* mm_struct of the VMA to prevent it from going away. Should be used for
1811
* all allocations for pages that will be mapped into
1812
* user space. Returns NULL when no page can be allocated.
1813
*
1814
* Should be called with the mmap_sem of the vma held.
1815
*/
1816
struct page *
1817
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818
unsigned long addr, int node)
1819
{
1820
struct mempolicy *pol = get_vma_policy(current, vma, addr);
1821
struct zonelist *zl;
1822
struct page *page;
1823
1824
get_mems_allowed();
1825
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1826
unsigned nid;
1827
1828
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1829
mpol_cond_put(pol);
1830
page = alloc_page_interleave(gfp, order, nid);
1831
put_mems_allowed();
1832
return page;
1833
}
1834
zl = policy_zonelist(gfp, pol, node);
1835
if (unlikely(mpol_needs_cond_ref(pol))) {
1836
/*
1837
* slow path: ref counted shared policy
1838
*/
1839
struct page *page = __alloc_pages_nodemask(gfp, order,
1840
zl, policy_nodemask(gfp, pol));
1841
__mpol_put(pol);
1842
put_mems_allowed();
1843
return page;
1844
}
1845
/*
1846
* fast path: default or task policy
1847
*/
1848
page = __alloc_pages_nodemask(gfp, order, zl,
1849
policy_nodemask(gfp, pol));
1850
put_mems_allowed();
1851
return page;
1852
}
1853
1854
/**
1855
* alloc_pages_current - Allocate pages.
1856
*
1857
* @gfp:
1858
* %GFP_USER user allocation,
1859
* %GFP_KERNEL kernel allocation,
1860
* %GFP_HIGHMEM highmem allocation,
1861
* %GFP_FS don't call back into a file system.
1862
* %GFP_ATOMIC don't sleep.
1863
* @order: Power of two of allocation size in pages. 0 is a single page.
1864
*
1865
* Allocate a page from the kernel page pool.  When not in interrupt
* context, apply the current process' NUMA policy.
1867
* Returns NULL when no page can be allocated.
1868
*
1869
* Don't call cpuset_update_task_memory_state() unless
1870
* 1) it's ok to take cpuset_sem (can WAIT), and
1871
* 2) allocating for current task (not interrupt).
1872
*/
1873
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1874
{
1875
struct mempolicy *pol = current->mempolicy;
1876
struct page *page;
1877
1878
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1879
pol = &default_policy;
1880
1881
get_mems_allowed();
1882
/*
1883
* No reference counting needed for current->mempolicy
1884
* nor system default_policy
1885
*/
1886
if (pol->mode == MPOL_INTERLEAVE)
1887
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1888
else
1889
page = __alloc_pages_nodemask(gfp, order,
1890
policy_zonelist(gfp, pol, numa_node_id()),
1891
policy_nodemask(gfp, pol));
1892
put_mems_allowed();
1893
return page;
1894
}
1895
EXPORT_SYMBOL(alloc_pages_current);
1896
1897
/*
1898
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy it is copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebound by another task (the task that changes
* cpuset's mems), so we needn't do rebind work for the current task.
1906
*/
1907
1908
/* Slow path of a mempolicy duplicate */
1909
struct mempolicy *__mpol_dup(struct mempolicy *old)
1910
{
1911
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1912
1913
if (!new)
1914
return ERR_PTR(-ENOMEM);
1915
1916
/* task's mempolicy is protected by alloc_lock */
1917
if (old == current->mempolicy) {
1918
task_lock(current);
1919
*new = *old;
1920
task_unlock(current);
1921
} else
1922
*new = *old;
1923
1924
rcu_read_lock();
1925
if (current_cpuset_is_being_rebound()) {
1926
nodemask_t mems = cpuset_mems_allowed(current);
1927
if (new->flags & MPOL_F_REBINDING)
1928
mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1929
else
1930
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1931
}
1932
rcu_read_unlock();
1933
atomic_set(&new->refcnt, 1);
1934
return new;
1935
}
1936
1937
/*
 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 * eliminate the MPOL_F_* flags that require conditional ref and
 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
 * after return.  Use the returned value.
 *
 * Allows use of a mempolicy for, e.g., multiple allocations with a single
 * policy lookup, even if the policy needs/has extra ref on lookup.
 * shmem_readahead needs this.
 */
struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
                                   struct mempolicy *frompol)
{
        if (!mpol_needs_cond_ref(frompol))
                return frompol;

        *tompol = *frompol;
        tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
        __mpol_put(frompol);
        return tompol;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->mode != b->mode)
                return 0;
        if (a->flags != b->flags)
                return 0;
        if (mpol_store_user_nodemask(a))
                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                        return 0;

        switch (a->mode) {
        case MPOL_BIND:
                /* Fall through */
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the tree. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->mode : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

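/* Remove a node from the tree, drop its policy reference and free it. */
/* Caller holds sp->lock */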
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        pr_debug("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_put(n->policy);
        kmem_cache_free(sn_cache, n);
}

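/* Allocate a node for the range [start, end) and take a shared reference on @pol. */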
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        pol->flags |= MPOL_F_SHARED;    /* for unref */
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_put(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() time, so we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
        spin_lock_init(&sp->lock);

        if (mpol) {
                struct vm_area_struct pvma;
                struct mempolicy *new;
                NODEMASK_SCRATCH(scratch);

                if (!scratch)
                        goto put_mpol;
                /* contextualize the tmpfs mount point mempolicy */
                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(new))
                        goto free_scratch; /* no valid nodemask intersection */

                task_lock(current);
                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_new;

                /* Create pseudo-vma that contains just the policy */
                memset(&pvma, 0, sizeof(struct vm_area_struct));
                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
                mpol_put(new);                  /* drop initial ref */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);         /* drop our incoming ref on sb mpol */
        }
}

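/* Install @npol as the shared policy for the range covered by @vma. */
/* A NULL @npol resets that range to the default policy. */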
int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->mode : -1,
                 npol ? npol->flags : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_put(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes;
        unsigned long largest = 0;
        int nid, prefer = 0;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_HIGH_MEMORY) {
                unsigned long total_pages = node_present_pages(nid);

                /* Preserve the largest node */
                if (largest < total_pages) {
                        largest = total_pages;
                        prefer = nid;
                }

                /* Interleave this node? */
                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(prefer, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
 * Used only for mpol_parse_str() and mpol_to_str()
 */
#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_LOCAL]      = "local"
};


#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 * @no_context: flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *      <mode>[=<flags>][:<nodelist>]
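 *      e.g. "bind:0-3", "interleave=static:0,2" or "prefer:1"
 *      (node numbers above are purely illustrative)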
 *
 * if @no_context is true, save the input nodemask in w.user_nodemask in
 * the returned mempolicy.  This will be used to "clone" the mempolicy in
 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
 * mount option.  Note that if 'static' or 'relative' mode flags were
 * specified, the input nodemask will already have been saved.  Saving
 * it again is redundant, but safe.
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
        struct mempolicy *new = NULL;
        unsigned short mode;
        unsigned short uninitialized_var(mode_flags);
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1;

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        for (mode = 0; mode <= MPOL_LOCAL; mode++) {
                if (!strcmp(str, policy_modes[mode])) {
                        break;
                }
        }
        if (mode > MPOL_LOCAL)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_HIGH_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist; mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                mode = MPOL_PREFERRED;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on an empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        if (no_context) {
                /* save for contextualization */
                new->w.user_nodemask = nodes;
        } else {
                int ret;
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        task_lock(current);
                        ret = mpol_set_nodemask(new, &nodes, scratch);
                        task_unlock(current);
                } else
                        ret = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
                if (ret) {
                        mpol_put(new);
                        goto out;
                }
        }
        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: to contain formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
 *
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
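 * e.g. an interleave policy over nodes 0-3 formats as "interleave:0-3"
 * (node numbers here are illustrative)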
 */
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        unsigned short mode;
        unsigned short flags = pol ? pol->flags : 0;

        /*
         * Sanity check: room for longest mode, flag and some nodes
         */
        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);

        if (!pol || pol == &default_policy)
                mode = MPOL_DEFAULT;
        else
                mode = pol->mode;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                if (flags & MPOL_F_LOCAL)
                        mode = MPOL_LOCAL;      /* pseudo-policy */
                else
                        node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                /* Fall through */
        case MPOL_INTERLEAVE:
                if (no_context)
                        nodes = pol->w.user_nodemask;
                else
                        nodes = pol->v.nodes;
                break;

        default:
                BUG();
        }

        l = strlen(policy_modes[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_modes[mode]);
        p += l;

        if (flags & MPOL_MODE_FLAGS) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';

                /*
                 * Currently, the only defined flags are mutually exclusive
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += snprintf(p, buffer + maxlen - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += snprintf(p, buffer + maxlen - p, "relative");
        }

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = ':';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}
