GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/mempolicy.c
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
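/*
 * A minimal user-space sketch (not from this file) of how the policy modes
 * described above are selected via set_mempolicy(2).  It assumes libnuma's
 * <numaif.h> declarations (link with -lnuma) and that nodes 0 and 1 exist;
 * the node mask value is only illustrative.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* set_mempolicy(), MPOL_* constants */
#include <stdio.h>

int main(void)
{
	/* one bit per node; here nodes 0 and 1 (assumed to exist) */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* interleave future allocations of this process over the mask;
	 * MPOL_BIND or MPOL_PREFERRED would be passed the same way */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy(MPOL_INTERLEAVE)");

	/* ... allocate and touch memory here ... */

	/* back to the default (local) policy */
	if (set_mempolicy(MPOL_DEFAULT, NULL, 0))
		perror("set_mempolicy(MPOL_DEFAULT)");
	return 0;
}
#endif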

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task will rebind task->mempolicy in two steps.  The
	 * first step sets all the newly allowed nodes, and the second step
	 * clears all the disallowed nodes.  This way we never end up with
	 * an empty nodemask to allocate pages from.
	 * If we have a lock to protect task->mempolicy on the read side,
	 * we rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
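/*
 * A stand-alone sketch (not from this file) of what the fold+onto pair above
 * computes for MPOL_F_RELATIVE_NODES: the user's node numbers are taken as
 * positions relative to the allowed node set.  It uses a 64-bit mask instead
 * of nodemask_t, the helper name is invented, and the wrap-around that
 * nodes_fold() performs for positions beyond the allowed weight is omitted.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <stdint.h>
#include <stdio.h>

/* map bit n of "rel_request" onto the n-th set bit of "allowed" */
static uint64_t remap_relative(uint64_t rel_request, uint64_t allowed)
{
	uint64_t out = 0;
	int pos = 0;	/* ordinal position within "allowed" */

	for (int node = 0; node < 64; node++) {
		if (!(allowed & (1ULL << node)))
			continue;
		if (rel_request & (1ULL << pos))
			out |= 1ULL << node;
		pos++;
	}
	return out;
}

int main(void)
{
	/* user asks for relative nodes {0,2}; cpuset allows nodes {4..7} */
	uint64_t result = remap_relative(0x5, 0xF0);

	printf("%#llx\n", (unsigned long long)result);	/* 0x50 -> nodes {4,6} */
	return 0;
}
#endif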
169
170
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
171
{
172
if (nodes_empty(*nodes))
173
return -EINVAL;
174
pol->v.nodes = *nodes;
175
return 0;
176
}
177
178
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
179
{
180
if (!nodes)
181
pol->flags |= MPOL_F_LOCAL; /* local allocation */
182
else if (nodes_empty(*nodes))
183
return -EINVAL; /* no allowed nodes */
184
else
185
pol->v.preferred_node = first_node(*nodes);
186
return 0;
187
}
188
189
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
190
{
191
if (!is_valid_nodemask(nodes))
192
return -EINVAL;
193
pol->v.nodes = *nodes;
194
return 0;
195
}
196
197
/*
198
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
199
* any, for the new policy. mpol_new() has already validated the nodes
200
* parameter with respect to the policy mode and flags. But, we need to
201
* handle an empty nodemask with MPOL_PREFERRED here.
202
*
203
* Must be called holding task's alloc_lock to protect task's mems_allowed
204
* and mempolicy. May also be called holding the mmap_semaphore for write.
205
*/
206
static int mpol_set_nodemask(struct mempolicy *pol,
207
const nodemask_t *nodes, struct nodemask_scratch *nsc)
208
{
209
int ret;
210
211
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
212
if (pol == NULL)
213
return 0;
214
/* Check N_HIGH_MEMORY */
215
nodes_and(nsc->mask1,
216
cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
217
218
VM_BUG_ON(!nodes);
219
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
220
nodes = NULL; /* explicit local allocation */
221
else {
222
if (pol->flags & MPOL_F_RELATIVE_NODES)
223
mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
224
else
225
nodes_and(nsc->mask2, *nodes, nsc->mask1);
226
227
if (mpol_store_user_nodemask(pol))
228
pol->w.user_nodemask = *nodes;
229
else
230
pol->w.cpuset_mems_allowed =
231
cpuset_current_mems_allowed;
232
}
233
234
if (nodes)
235
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
236
else
237
ret = mpol_ops[pol->mode].create(pol, NULL);
238
return ret;
239
}
240
241
/*
242
* This function just creates a new policy, does some check and simple
243
* initialization. You must invoke mpol_set_nodemask() to set nodes.
244
*/
245
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
246
nodemask_t *nodes)
247
{
248
struct mempolicy *policy;
249
250
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
251
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
252
253
if (mode == MPOL_DEFAULT) {
254
if (nodes && !nodes_empty(*nodes))
255
return ERR_PTR(-EINVAL);
256
return NULL; /* simply delete any existing policy */
257
}
258
VM_BUG_ON(!nodes);
259
260
/*
261
* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
262
* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
263
* All other modes require a valid pointer to a non-empty nodemask.
264
*/
265
if (mode == MPOL_PREFERRED) {
266
if (nodes_empty(*nodes)) {
267
if (((flags & MPOL_F_STATIC_NODES) ||
268
(flags & MPOL_F_RELATIVE_NODES)))
269
return ERR_PTR(-EINVAL);
270
}
271
} else if (nodes_empty(*nodes))
272
return ERR_PTR(-EINVAL);
273
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
274
if (!policy)
275
return ERR_PTR(-ENOMEM);
276
atomic_set(&policy->refcnt, 1);
277
policy->mode = mode;
278
policy->flags = flags;
279
280
return policy;
281
}
282
283
/* Slow path of a mpol destructor. */
284
void __mpol_put(struct mempolicy *p)
285
{
286
if (!atomic_dec_and_test(&p->refcnt))
287
return;
288
kmem_cache_free(policy_cache, p);
289
}
290
291
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
292
enum mpol_rebind_step step)
293
{
294
}
295
296
/*
297
* step:
298
* MPOL_REBIND_ONCE - do rebind work at once
299
* MPOL_REBIND_STEP1 - set all the newly nodes
300
* MPOL_REBIND_STEP2 - clean all the disallowed nodes
301
*/
302
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
303
enum mpol_rebind_step step)
304
{
305
nodemask_t tmp;
306
307
if (pol->flags & MPOL_F_STATIC_NODES)
308
nodes_and(tmp, pol->w.user_nodemask, *nodes);
309
else if (pol->flags & MPOL_F_RELATIVE_NODES)
310
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
311
else {
312
/*
313
* if step == 1, we use ->w.cpuset_mems_allowed to cache the
314
* result
315
*/
316
if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
317
nodes_remap(tmp, pol->v.nodes,
318
pol->w.cpuset_mems_allowed, *nodes);
319
pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
320
} else if (step == MPOL_REBIND_STEP2) {
321
tmp = pol->w.cpuset_mems_allowed;
322
pol->w.cpuset_mems_allowed = *nodes;
323
} else
324
BUG();
325
}
326
327
if (nodes_empty(tmp))
328
tmp = *nodes;
329
330
if (step == MPOL_REBIND_STEP1)
331
nodes_or(pol->v.nodes, pol->v.nodes, tmp);
332
else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
333
pol->v.nodes = tmp;
334
else
335
BUG();
336
337
if (!node_isset(current->il_next, tmp)) {
338
current->il_next = next_node(current->il_next, tmp);
339
if (current->il_next >= MAX_NUMNODES)
340
current->il_next = first_node(tmp);
341
if (current->il_next >= MAX_NUMNODES)
342
current->il_next = numa_node_id();
343
}
344
}
345
346
static void mpol_rebind_preferred(struct mempolicy *pol,
347
const nodemask_t *nodes,
348
enum mpol_rebind_step step)
349
{
350
nodemask_t tmp;
351
352
if (pol->flags & MPOL_F_STATIC_NODES) {
353
int node = first_node(pol->w.user_nodemask);
354
355
if (node_isset(node, *nodes)) {
356
pol->v.preferred_node = node;
357
pol->flags &= ~MPOL_F_LOCAL;
358
} else
359
pol->flags |= MPOL_F_LOCAL;
360
} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
361
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
362
pol->v.preferred_node = first_node(tmp);
363
} else if (!(pol->flags & MPOL_F_LOCAL)) {
364
pol->v.preferred_node = node_remap(pol->v.preferred_node,
365
pol->w.cpuset_mems_allowed,
366
*nodes);
367
pol->w.cpuset_mems_allowed = *nodes;
368
}
369
}
370
371
/*
372
* mpol_rebind_policy - Migrate a policy to a different set of nodes
373
*
374
* If read-side task has no lock to protect task->mempolicy, write-side
375
* task will rebind the task->mempolicy by two step. The first step is
376
* setting all the newly nodes, and the second step is cleaning all the
377
* disallowed nodes. In this way, we can avoid finding no node to alloc
378
* page.
379
* If we have a lock to protect task->mempolicy in read-side, we do
380
* rebind directly.
381
*
382
* step:
383
* MPOL_REBIND_ONCE - do rebind work at once
384
* MPOL_REBIND_STEP1 - set all the newly nodes
385
* MPOL_REBIND_STEP2 - clean all the disallowed nodes
386
*/
387
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
388
enum mpol_rebind_step step)
389
{
390
if (!pol)
391
return;
392
if (!mpol_store_user_nodemask(pol) && step == 0 &&
393
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
394
return;
395
396
if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
397
return;
398
399
if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
400
BUG();
401
402
if (step == MPOL_REBIND_STEP1)
403
pol->flags |= MPOL_F_REBINDING;
404
else if (step == MPOL_REBIND_STEP2)
405
pol->flags &= ~MPOL_F_REBINDING;
406
else if (step >= MPOL_REBIND_NSTEP)
407
BUG();
408
409
mpol_ops[pol->mode].rebind(pol, newmask, step);
410
}
411
412
/*
413
* Wrapper for mpol_rebind_policy() that just requires task
414
* pointer, and updates task mempolicy.
415
*
416
* Called with task's alloc_lock held.
417
*/
418
419
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
420
enum mpol_rebind_step step)
421
{
422
mpol_rebind_policy(tsk->mempolicy, new, step);
423
}
424
425
/*
426
* Rebind each vma in mm to new nodemask.
427
*
428
* Call holding a reference to mm. Takes mm->mmap_sem during call.
429
*/
430
431
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
432
{
433
struct vm_area_struct *vma;
434
435
down_write(&mm->mmap_sem);
436
for (vma = mm->mmap; vma; vma = vma->vm_next)
437
mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
438
up_write(&mm->mmap_sem);
439
}
440
441
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
442
[MPOL_DEFAULT] = {
443
.rebind = mpol_rebind_default,
444
},
445
[MPOL_INTERLEAVE] = {
446
.create = mpol_new_interleave,
447
.rebind = mpol_rebind_nodemask,
448
},
449
[MPOL_PREFERRED] = {
450
.create = mpol_new_preferred,
451
.rebind = mpol_rebind_preferred,
452
},
453
[MPOL_BIND] = {
454
.create = mpol_new_bind,
455
.rebind = mpol_rebind_nodemask,
456
},
457
};
458
459
static void migrate_page_add(struct page *page, struct list_head *pagelist,
460
unsigned long flags);
461
462
/* Scan through pages checking if pages follow certain conditions. */
463
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
464
unsigned long addr, unsigned long end,
465
const nodemask_t *nodes, unsigned long flags,
466
void *private)
467
{
468
pte_t *orig_pte;
469
pte_t *pte;
470
spinlock_t *ptl;
471
472
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
473
do {
474
struct page *page;
475
int nid;
476
477
if (!pte_present(*pte))
478
continue;
479
page = vm_normal_page(vma, addr, *pte);
480
if (!page)
481
continue;
482
/*
483
* vm_normal_page() filters out zero pages, but there might
484
* still be PageReserved pages to skip, perhaps in a VDSO.
485
* And we cannot move PageKsm pages sensibly or safely yet.
486
*/
487
if (PageReserved(page) || PageKsm(page))
488
continue;
489
nid = page_to_nid(page);
490
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
491
continue;
492
493
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
494
migrate_page_add(page, private, flags);
495
else
496
break;
497
} while (pte++, addr += PAGE_SIZE, addr != end);
498
pte_unmap_unlock(orig_pte, ptl);
499
return addr != end;
500
}
501
502
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
503
unsigned long addr, unsigned long end,
504
const nodemask_t *nodes, unsigned long flags,
505
void *private)
506
{
507
pmd_t *pmd;
508
unsigned long next;
509
510
pmd = pmd_offset(pud, addr);
511
do {
512
next = pmd_addr_end(addr, end);
513
split_huge_page_pmd(vma->vm_mm, pmd);
514
if (pmd_none_or_clear_bad(pmd))
515
continue;
516
if (check_pte_range(vma, pmd, addr, next, nodes,
517
flags, private))
518
return -EIO;
519
} while (pmd++, addr = next, addr != end);
520
return 0;
521
}
522
523
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
524
unsigned long addr, unsigned long end,
525
const nodemask_t *nodes, unsigned long flags,
526
void *private)
527
{
528
pud_t *pud;
529
unsigned long next;
530
531
pud = pud_offset(pgd, addr);
532
do {
533
next = pud_addr_end(addr, end);
534
if (pud_none_or_clear_bad(pud))
535
continue;
536
if (check_pmd_range(vma, pud, addr, next, nodes,
537
flags, private))
538
return -EIO;
539
} while (pud++, addr = next, addr != end);
540
return 0;
541
}
542
543
static inline int check_pgd_range(struct vm_area_struct *vma,
544
unsigned long addr, unsigned long end,
545
const nodemask_t *nodes, unsigned long flags,
546
void *private)
547
{
548
pgd_t *pgd;
549
unsigned long next;
550
551
pgd = pgd_offset(vma->vm_mm, addr);
552
do {
553
next = pgd_addr_end(addr, end);
554
if (pgd_none_or_clear_bad(pgd))
555
continue;
556
if (check_pud_range(vma, pgd, addr, next, nodes,
557
flags, private))
558
return -EIO;
559
} while (pgd++, addr = next, addr != end);
560
return 0;
561
}
562
563
/*
564
* Check if all pages in a range are on a set of nodes.
565
* If pagelist != NULL then isolate pages from the LRU and
566
* put them on the pagelist.
567
*/
568
static struct vm_area_struct *
569
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
570
const nodemask_t *nodes, unsigned long flags, void *private)
571
{
572
int err;
573
struct vm_area_struct *first, *vma, *prev;
574
575
576
first = find_vma(mm, start);
577
if (!first)
578
return ERR_PTR(-EFAULT);
579
prev = NULL;
580
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
581
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
582
if (!vma->vm_next && vma->vm_end < end)
583
return ERR_PTR(-EFAULT);
584
if (prev && prev->vm_end < vma->vm_start)
585
return ERR_PTR(-EFAULT);
586
}
587
if (!is_vm_hugetlb_page(vma) &&
588
((flags & MPOL_MF_STRICT) ||
589
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
590
vma_migratable(vma)))) {
591
unsigned long endvma = vma->vm_end;
592
593
if (endvma > end)
594
endvma = end;
595
if (vma->vm_start > start)
596
start = vma->vm_start;
597
err = check_pgd_range(vma, start, endvma, nodes,
598
flags, private);
599
if (err) {
600
first = ERR_PTR(err);
601
break;
602
}
603
}
604
prev = vma;
605
}
606
return first;
607
}
608
609
/* Apply policy to a single VMA */
610
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
611
{
612
int err = 0;
613
struct mempolicy *old = vma->vm_policy;
614
615
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
616
vma->vm_start, vma->vm_end, vma->vm_pgoff,
617
vma->vm_ops, vma->vm_file,
618
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
619
620
if (vma->vm_ops && vma->vm_ops->set_policy)
621
err = vma->vm_ops->set_policy(vma, new);
622
if (!err) {
623
mpol_get(new);
624
vma->vm_policy = new;
625
mpol_put(old);
626
}
627
return err;
628
}
629
630
/* Step 2: apply policy to a range and do splits. */
631
static int mbind_range(struct mm_struct *mm, unsigned long start,
632
unsigned long end, struct mempolicy *new_pol)
633
{
634
struct vm_area_struct *next;
635
struct vm_area_struct *prev;
636
struct vm_area_struct *vma;
637
int err = 0;
638
pgoff_t pgoff;
639
unsigned long vmstart;
640
unsigned long vmend;
641
642
vma = find_vma_prev(mm, start, &prev);
643
if (!vma || vma->vm_start > start)
644
return -EFAULT;
645
646
for (; vma && vma->vm_start < end; prev = vma, vma = next) {
647
next = vma->vm_next;
648
vmstart = max(start, vma->vm_start);
649
vmend = min(end, vma->vm_end);
650
651
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
652
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653
vma->anon_vma, vma->vm_file, pgoff, new_pol);
654
if (prev) {
655
vma = prev;
656
next = vma->vm_next;
657
continue;
658
}
659
if (vma->vm_start != vmstart) {
660
err = split_vma(vma->vm_mm, vma, vmstart, 1);
661
if (err)
662
goto out;
663
}
664
if (vma->vm_end != vmend) {
665
err = split_vma(vma->vm_mm, vma, vmend, 0);
666
if (err)
667
goto out;
668
}
669
err = policy_vma(vma, new_pol);
670
if (err)
671
goto out;
672
}
673
674
out:
675
return err;
676
}
677
678
/*
679
* Update task->flags PF_MEMPOLICY bit: set iff non-default
680
* mempolicy. Allows more rapid checking of this (combined perhaps
681
* with other PF_* flag bits) on memory allocation hot code paths.
682
*
683
* If called from outside this file, the task 'p' should -only- be
684
* a newly forked child not yet visible on the task list, because
685
* manipulating the task flags of a visible task is not safe.
686
*
687
* The above limitation is why this routine has the funny name
688
* mpol_fix_fork_child_flag().
689
*
690
* It is also safe to call this with a task pointer of current,
691
* which the static wrapper mpol_set_task_struct_flag() does,
692
* for use within this file.
693
*/
694
695
void mpol_fix_fork_child_flag(struct task_struct *p)
696
{
697
if (p->mempolicy)
698
p->flags |= PF_MEMPOLICY;
699
else
700
p->flags &= ~PF_MEMPOLICY;
701
}
702
703
static void mpol_set_task_struct_flag(void)
704
{
705
mpol_fix_fork_child_flag(current);
706
}
707
708
/* Set the process memory policy */
709
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
710
nodemask_t *nodes)
711
{
712
struct mempolicy *new, *old;
713
struct mm_struct *mm = current->mm;
714
NODEMASK_SCRATCH(scratch);
715
int ret;
716
717
if (!scratch)
718
return -ENOMEM;
719
720
new = mpol_new(mode, flags, nodes);
721
if (IS_ERR(new)) {
722
ret = PTR_ERR(new);
723
goto out;
724
}
725
/*
726
* prevent changing our mempolicy while show_numa_maps()
727
* is using it.
728
* Note: do_set_mempolicy() can be called at init time
729
* with no 'mm'.
730
*/
731
if (mm)
732
down_write(&mm->mmap_sem);
733
task_lock(current);
734
ret = mpol_set_nodemask(new, nodes, scratch);
735
if (ret) {
736
task_unlock(current);
737
if (mm)
738
up_write(&mm->mmap_sem);
739
mpol_put(new);
740
goto out;
741
}
742
old = current->mempolicy;
743
current->mempolicy = new;
744
mpol_set_task_struct_flag();
745
if (new && new->mode == MPOL_INTERLEAVE &&
746
nodes_weight(new->v.nodes))
747
current->il_next = first_node(new->v.nodes);
748
task_unlock(current);
749
if (mm)
750
up_write(&mm->mmap_sem);
751
752
mpol_put(old);
753
ret = 0;
754
out:
755
NODEMASK_SCRATCH_FREE(scratch);
756
return ret;
757
}
758
759
/*
760
* Return nodemask for policy for get_mempolicy() query
761
*
762
* Called with task's alloc_lock held
763
*/
764
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
765
{
766
nodes_clear(*nodes);
767
if (p == &default_policy)
768
return;
769
770
switch (p->mode) {
771
case MPOL_BIND:
772
/* Fall through */
773
case MPOL_INTERLEAVE:
774
*nodes = p->v.nodes;
775
break;
776
case MPOL_PREFERRED:
777
if (!(p->flags & MPOL_F_LOCAL))
778
node_set(p->v.preferred_node, *nodes);
779
/* else return empty node mask for local allocation */
780
break;
781
default:
782
BUG();
783
}
784
}
785
786
static int lookup_node(struct mm_struct *mm, unsigned long addr)
787
{
788
struct page *p;
789
int err;
790
791
err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
792
if (err >= 0) {
793
err = page_to_nid(p);
794
put_page(p);
795
}
796
return err;
797
}
798
799
/* Retrieve NUMA policy */
800
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
801
unsigned long addr, unsigned long flags)
802
{
803
int err;
804
struct mm_struct *mm = current->mm;
805
struct vm_area_struct *vma = NULL;
806
struct mempolicy *pol = current->mempolicy;
807
808
if (flags &
809
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
810
return -EINVAL;
811
812
if (flags & MPOL_F_MEMS_ALLOWED) {
813
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
814
return -EINVAL;
815
*policy = 0; /* just so it's initialized */
816
task_lock(current);
817
*nmask = cpuset_current_mems_allowed;
818
task_unlock(current);
819
return 0;
820
}
821
822
if (flags & MPOL_F_ADDR) {
823
/*
824
* Do NOT fall back to task policy if the
825
* vma/shared policy at addr is NULL. We
826
* want to return MPOL_DEFAULT in this case.
827
*/
828
down_read(&mm->mmap_sem);
829
vma = find_vma_intersection(mm, addr, addr+1);
830
if (!vma) {
831
up_read(&mm->mmap_sem);
832
return -EFAULT;
833
}
834
if (vma->vm_ops && vma->vm_ops->get_policy)
835
pol = vma->vm_ops->get_policy(vma, addr);
836
else
837
pol = vma->vm_policy;
838
} else if (addr)
839
return -EINVAL;
840
841
if (!pol)
842
pol = &default_policy; /* indicates default behavior */
843
844
if (flags & MPOL_F_NODE) {
845
if (flags & MPOL_F_ADDR) {
846
err = lookup_node(mm, addr);
847
if (err < 0)
848
goto out;
849
*policy = err;
850
} else if (pol == current->mempolicy &&
851
pol->mode == MPOL_INTERLEAVE) {
852
*policy = current->il_next;
853
} else {
854
err = -EINVAL;
855
goto out;
856
}
857
} else {
858
*policy = pol == &default_policy ? MPOL_DEFAULT :
859
pol->mode;
860
/*
861
* Internal mempolicy flags must be masked off before exposing
862
* the policy to userspace.
863
*/
864
*policy |= (pol->flags & MPOL_MODE_FLAGS);
865
}
866
867
if (vma) {
868
up_read(&current->mm->mmap_sem);
869
vma = NULL;
870
}
871
872
err = 0;
873
if (nmask) {
874
if (mpol_store_user_nodemask(pol)) {
875
*nmask = pol->w.user_nodemask;
876
} else {
877
task_lock(current);
878
get_policy_nodemask(pol, nmask);
879
task_unlock(current);
880
}
881
}
882
883
out:
884
mpol_cond_put(pol);
885
if (vma)
886
up_read(&current->mm->mmap_sem);
887
return err;
888
}
889
890
#ifdef CONFIG_MIGRATION
891
/*
892
* page migration
893
*/
894
static void migrate_page_add(struct page *page, struct list_head *pagelist,
895
unsigned long flags)
896
{
897
/*
898
* Avoid migrating a page that is shared with others.
899
*/
900
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
901
if (!isolate_lru_page(page)) {
902
list_add_tail(&page->lru, pagelist);
903
inc_zone_page_state(page, NR_ISOLATED_ANON +
904
page_is_file_cache(page));
905
}
906
}
907
}
908
909
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
910
{
911
return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
912
}
913
914
/*
915
* Migrate pages from one node to a target node.
916
* Returns error or the number of pages not migrated.
917
*/
918
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
919
int flags)
920
{
921
nodemask_t nmask;
922
LIST_HEAD(pagelist);
923
int err = 0;
924
struct vm_area_struct *vma;
925
926
nodes_clear(nmask);
927
node_set(source, nmask);
928
929
vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
930
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
931
if (IS_ERR(vma))
932
return PTR_ERR(vma);
933
934
if (!list_empty(&pagelist)) {
935
err = migrate_pages(&pagelist, new_node_page, dest,
936
false, true);
937
if (err)
938
putback_lru_pages(&pagelist);
939
}
940
941
return err;
942
}
943
944
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
950
int do_migrate_pages(struct mm_struct *mm,
951
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
952
{
953
int busy = 0;
954
int err;
955
nodemask_t tmp;
956
957
err = migrate_prep();
958
if (err)
959
return err;
960
961
down_read(&mm->mmap_sem);
962
963
err = migrate_vmas(mm, from_nodes, to_nodes, flags);
964
if (err)
965
goto out;
966
967
	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need to move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out immediately with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the most
	 * recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */
997
998
tmp = *from_nodes;
999
while (!nodes_empty(tmp)) {
1000
int s,d;
1001
int source = -1;
1002
int dest = 0;
1003
1004
for_each_node_mask(s, tmp) {
1005
d = node_remap(s, *from_nodes, *to_nodes);
1006
if (s == d)
1007
continue;
1008
1009
source = s; /* Node moved. Memorize */
1010
dest = d;
1011
1012
/* dest not in remaining from nodes? */
1013
if (!node_isset(dest, tmp))
1014
break;
1015
}
1016
if (source == -1)
1017
break;
1018
1019
node_clear(source, tmp);
1020
err = migrate_to_node(mm, source, dest, flags);
1021
if (err > 0)
1022
busy += err;
1023
if (err < 0)
1024
break;
1025
}
1026
out:
1027
up_read(&mm->mmap_sem);
1028
if (err < 0)
1029
return err;
1030
return busy;
1031
1032
}
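/*
 * A stand-alone sketch (not from this file) of the <source, dest>
 * pair-picking loop in do_migrate_pages() above, using 64-bit masks in
 * place of nodemask_t.  Helper names are invented here; the kernel uses
 * node_remap() and for_each_node_mask() instead, and relies on the
 * GCC/Clang popcount builtin only for this illustration.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <stdint.h>
#include <stdio.h>

/* ordinal position of bit s within mask (count of set bits below s) */
static int ordinal(uint64_t mask, int s)
{
	return __builtin_popcountll(mask & ((1ULL << s) - 1));
}

/* n-th set bit of mask, wrapping modulo its weight (node_remap() model) */
static int nth_bit(uint64_t mask, int n)
{
	n %= __builtin_popcountll(mask);
	for (int b = 0; b < 64; b++)
		if ((mask & (1ULL << b)) && n-- == 0)
			return b;
	return -1;
}

int main(void)
{
	uint64_t from = 0x0F, to = 0xF0;	/* move nodes 0-3 onto 4-7 */
	uint64_t tmp = from;

	while (tmp) {
		int source = -1, dest = 0;

		for (int s = 0; s < 64; s++) {
			int d;

			if (!(tmp & (1ULL << s)))
				continue;
			d = nth_bit(to, ordinal(from, s));
			if (s == d)
				continue;
			source = s;		/* node moved, memorize */
			dest = d;
			if (!(tmp & (1ULL << d)))
				break;		/* dest is not a remaining source */
		}
		if (source == -1)
			break;
		tmp &= ~(1ULL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
	return 0;
}
#endif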
1033
1034
/*
1035
* Allocate a new page for page migration based on vma policy.
1036
* Start assuming that page is mapped by vma pointed to by @private.
1037
* Search forward from there, if not. N.B., this assumes that the
1038
* list of pages handed to migrate_pages()--which is how we get here--
1039
* is in virtual address order.
1040
*/
1041
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1042
{
1043
struct vm_area_struct *vma = (struct vm_area_struct *)private;
1044
unsigned long uninitialized_var(address);
1045
1046
while (vma) {
1047
address = page_address_in_vma(page, vma);
1048
if (address != -EFAULT)
1049
break;
1050
vma = vma->vm_next;
1051
}
1052
1053
/*
1054
* if !vma, alloc_page_vma() will use task or system default policy
1055
*/
1056
return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1057
}
1058
#else
1059
1060
static void migrate_page_add(struct page *page, struct list_head *pagelist,
1061
unsigned long flags)
1062
{
1063
}
1064
1065
int do_migrate_pages(struct mm_struct *mm,
1066
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1067
{
1068
return -ENOSYS;
1069
}
1070
1071
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1072
{
1073
return NULL;
1074
}
1075
#endif
1076
1077
static long do_mbind(unsigned long start, unsigned long len,
1078
unsigned short mode, unsigned short mode_flags,
1079
nodemask_t *nmask, unsigned long flags)
1080
{
1081
struct vm_area_struct *vma;
1082
struct mm_struct *mm = current->mm;
1083
struct mempolicy *new;
1084
unsigned long end;
1085
int err;
1086
LIST_HEAD(pagelist);
1087
1088
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1089
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1090
return -EINVAL;
1091
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1092
return -EPERM;
1093
1094
if (start & ~PAGE_MASK)
1095
return -EINVAL;
1096
1097
if (mode == MPOL_DEFAULT)
1098
flags &= ~MPOL_MF_STRICT;
1099
1100
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1101
end = start + len;
1102
1103
if (end < start)
1104
return -EINVAL;
1105
if (end == start)
1106
return 0;
1107
1108
new = mpol_new(mode, mode_flags, nmask);
1109
if (IS_ERR(new))
1110
return PTR_ERR(new);
1111
1112
/*
1113
* If we are using the default policy then operation
1114
* on discontinuous address spaces is okay after all
1115
*/
1116
if (!new)
1117
flags |= MPOL_MF_DISCONTIG_OK;
1118
1119
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1120
start, start + len, mode, mode_flags,
1121
nmask ? nodes_addr(*nmask)[0] : -1);
1122
1123
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1124
1125
err = migrate_prep();
1126
if (err)
1127
goto mpol_out;
1128
}
1129
{
1130
NODEMASK_SCRATCH(scratch);
1131
if (scratch) {
1132
down_write(&mm->mmap_sem);
1133
task_lock(current);
1134
err = mpol_set_nodemask(new, nmask, scratch);
1135
task_unlock(current);
1136
if (err)
1137
up_write(&mm->mmap_sem);
1138
} else
1139
err = -ENOMEM;
1140
NODEMASK_SCRATCH_FREE(scratch);
1141
}
1142
if (err)
1143
goto mpol_out;
1144
1145
vma = check_range(mm, start, end, nmask,
1146
flags | MPOL_MF_INVERT, &pagelist);
1147
1148
err = PTR_ERR(vma);
1149
if (!IS_ERR(vma)) {
1150
int nr_failed = 0;
1151
1152
err = mbind_range(mm, start, end, new);
1153
1154
if (!list_empty(&pagelist)) {
1155
nr_failed = migrate_pages(&pagelist, new_vma_page,
1156
(unsigned long)vma,
1157
false, true);
1158
if (nr_failed)
1159
putback_lru_pages(&pagelist);
1160
}
1161
1162
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1163
err = -EIO;
1164
} else
1165
putback_lru_pages(&pagelist);
1166
1167
up_write(&mm->mmap_sem);
1168
mpol_out:
1169
mpol_put(new);
1170
return err;
1171
}
1172
1173
/*
1174
* User space interface with variable sized bitmaps for nodelists.
1175
*/
1176
1177
/* Copy a node mask from user space. */
1178
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1179
unsigned long maxnode)
1180
{
1181
unsigned long k;
1182
unsigned long nlongs;
1183
unsigned long endmask;
1184
1185
--maxnode;
1186
nodes_clear(*nodes);
1187
if (maxnode == 0 || !nmask)
1188
return 0;
1189
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1190
return -EINVAL;
1191
1192
nlongs = BITS_TO_LONGS(maxnode);
1193
if ((maxnode % BITS_PER_LONG) == 0)
1194
endmask = ~0UL;
1195
else
1196
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1197
1198
/* When the user specified more nodes than supported just check
1199
if the non supported part is all zero. */
1200
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1201
if (nlongs > PAGE_SIZE/sizeof(long))
1202
return -EINVAL;
1203
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1204
unsigned long t;
1205
if (get_user(t, nmask + k))
1206
return -EFAULT;
1207
if (k == nlongs - 1) {
1208
if (t & endmask)
1209
return -EINVAL;
1210
} else if (t)
1211
return -EINVAL;
1212
}
1213
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1214
endmask = ~0UL;
1215
}
1216
1217
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1218
return -EFAULT;
1219
nodes_addr(*nodes)[nlongs-1] &= endmask;
1220
return 0;
1221
}
1222
1223
/* Copy a kernel node mask to user space */
1224
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1225
nodemask_t *nodes)
1226
{
1227
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1228
const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1229
1230
if (copy > nbytes) {
1231
if (copy > PAGE_SIZE)
1232
return -EINVAL;
1233
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1234
return -EFAULT;
1235
copy = nbytes;
1236
}
1237
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1238
}
1239
1240
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1241
unsigned long, mode, unsigned long __user *, nmask,
1242
unsigned long, maxnode, unsigned, flags)
1243
{
1244
nodemask_t nodes;
1245
int err;
1246
unsigned short mode_flags;
1247
1248
mode_flags = mode & MPOL_MODE_FLAGS;
1249
mode &= ~MPOL_MODE_FLAGS;
1250
if (mode >= MPOL_MAX)
1251
return -EINVAL;
1252
if ((mode_flags & MPOL_F_STATIC_NODES) &&
1253
(mode_flags & MPOL_F_RELATIVE_NODES))
1254
return -EINVAL;
1255
err = get_nodes(&nodes, nmask, maxnode);
1256
if (err)
1257
return err;
1258
return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1259
}
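/*
 * A hedged user-space sketch (not from this file) of calling mbind(2) on an
 * anonymous mapping, matching the argument order decoded by the syscall
 * above.  It assumes libnuma's <numaif.h> declarations (link with -lnuma)
 * and that node 0 exists.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_STRICT */
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t len = 4096 * 16;
	unsigned long nodemask = 1UL << 0;	/* bind to node 0 */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* maxnode counts the bits in nodemask; MPOL_MF_STRICT reports an
	 * error if already-present pages violate the policy */
	if (mbind(buf, len, MPOL_BIND, &nodemask,
		  8 * sizeof(nodemask), MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}
#endif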
1260
1261
/* Set the process memory policy */
1262
SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1263
unsigned long, maxnode)
1264
{
1265
int err;
1266
nodemask_t nodes;
1267
unsigned short flags;
1268
1269
flags = mode & MPOL_MODE_FLAGS;
1270
mode &= ~MPOL_MODE_FLAGS;
1271
if ((unsigned int)mode >= MPOL_MAX)
1272
return -EINVAL;
1273
if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1274
return -EINVAL;
1275
err = get_nodes(&nodes, nmask, maxnode);
1276
if (err)
1277
return err;
1278
return do_set_mempolicy(mode, flags, &nodes);
1279
}
1280
1281
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1282
const unsigned long __user *, old_nodes,
1283
const unsigned long __user *, new_nodes)
1284
{
1285
const struct cred *cred = current_cred(), *tcred;
1286
struct mm_struct *mm = NULL;
1287
struct task_struct *task;
1288
nodemask_t task_nodes;
1289
int err;
1290
nodemask_t *old;
1291
nodemask_t *new;
1292
NODEMASK_SCRATCH(scratch);
1293
1294
if (!scratch)
1295
return -ENOMEM;
1296
1297
old = &scratch->mask1;
1298
new = &scratch->mask2;
1299
1300
err = get_nodes(old, old_nodes, maxnode);
1301
if (err)
1302
goto out;
1303
1304
err = get_nodes(new, new_nodes, maxnode);
1305
if (err)
1306
goto out;
1307
1308
/* Find the mm_struct */
1309
rcu_read_lock();
1310
task = pid ? find_task_by_vpid(pid) : current;
1311
if (!task) {
1312
rcu_read_unlock();
1313
err = -ESRCH;
1314
goto out;
1315
}
1316
mm = get_task_mm(task);
1317
rcu_read_unlock();
1318
1319
err = -EINVAL;
1320
if (!mm)
1321
goto out;
1322
1323
/*
1324
* Check if this process has the right to modify the specified
1325
* process. The right exists if the process has administrative
1326
* capabilities, superuser privileges or the same
1327
* userid as the target process.
1328
*/
1329
rcu_read_lock();
1330
tcred = __task_cred(task);
1331
if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1332
cred->uid != tcred->suid && cred->uid != tcred->uid &&
1333
!capable(CAP_SYS_NICE)) {
1334
rcu_read_unlock();
1335
err = -EPERM;
1336
goto out;
1337
}
1338
rcu_read_unlock();
1339
1340
task_nodes = cpuset_mems_allowed(task);
1341
/* Is the user allowed to access the target nodes? */
1342
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1343
err = -EPERM;
1344
goto out;
1345
}
1346
1347
if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1348
err = -EINVAL;
1349
goto out;
1350
}
1351
1352
err = security_task_movememory(task);
1353
if (err)
1354
goto out;
1355
1356
err = do_migrate_pages(mm, old, new,
1357
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1358
out:
1359
if (mm)
1360
mmput(mm);
1361
NODEMASK_SCRATCH_FREE(scratch);
1362
1363
return err;
1364
}
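/*
 * A hedged user-space sketch (not from this file) of migrate_pages(2), which
 * lands in the handler above.  It assumes libnuma's <numaif.h> wrapper and
 * that nodes 0 and 1 both exist; pid 0 means the calling process.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* migrate_pages() */
#include <stdio.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* move pages off node 0 */
	unsigned long new_nodes = 1UL << 1;	/* ... onto node 1 */
	long left;

	left = migrate_pages(0 /* self */, 8 * sizeof(unsigned long),
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}
#endif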
1365
1366
1367
/* Retrieve NUMA policy */
1368
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1369
unsigned long __user *, nmask, unsigned long, maxnode,
1370
unsigned long, addr, unsigned long, flags)
1371
{
1372
int err;
1373
int uninitialized_var(pval);
1374
nodemask_t nodes;
1375
1376
if (nmask != NULL && maxnode < MAX_NUMNODES)
1377
return -EINVAL;
1378
1379
err = do_get_mempolicy(&pval, &nodes, addr, flags);
1380
1381
if (err)
1382
return err;
1383
1384
if (policy && put_user(pval, policy))
1385
return -EFAULT;
1386
1387
if (nmask)
1388
err = copy_nodes_to_user(nmask, maxnode, &nodes);
1389
1390
return err;
1391
}
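/*
 * A hedged user-space sketch (not from this file) of the
 * MPOL_F_NODE|MPOL_F_ADDR query decoded by the syscall above: it reports
 * which node currently backs a given address.  It assumes libnuma's
 * <numaif.h> declarations.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <numaif.h>		/* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int node = -1;
	char *p = malloc(4096);

	p[0] = 1;		/* fault the page in so it has a node */
	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR))
		perror("get_mempolicy");
	else
		printf("page at %p is on node %d\n", (void *)p, node);
	free(p);
	return 0;
}
#endif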
1392
1393
#ifdef CONFIG_COMPAT
1394
1395
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1396
compat_ulong_t __user *nmask,
1397
compat_ulong_t maxnode,
1398
compat_ulong_t addr, compat_ulong_t flags)
1399
{
1400
long err;
1401
unsigned long __user *nm = NULL;
1402
unsigned long nr_bits, alloc_size;
1403
DECLARE_BITMAP(bm, MAX_NUMNODES);
1404
1405
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1406
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1407
1408
if (nmask)
1409
nm = compat_alloc_user_space(alloc_size);
1410
1411
err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412
1413
if (!err && nmask) {
1414
err = copy_from_user(bm, nm, alloc_size);
1415
/* ensure entire bitmap is zeroed */
1416
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417
err |= compat_put_bitmap(nmask, bm, nr_bits);
1418
}
1419
1420
return err;
1421
}
1422
1423
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1424
compat_ulong_t maxnode)
1425
{
1426
long err = 0;
1427
unsigned long __user *nm = NULL;
1428
unsigned long nr_bits, alloc_size;
1429
DECLARE_BITMAP(bm, MAX_NUMNODES);
1430
1431
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1432
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1433
1434
if (nmask) {
1435
err = compat_get_bitmap(bm, nmask, nr_bits);
1436
nm = compat_alloc_user_space(alloc_size);
1437
err |= copy_to_user(nm, bm, alloc_size);
1438
}
1439
1440
if (err)
1441
return -EFAULT;
1442
1443
return sys_set_mempolicy(mode, nm, nr_bits+1);
1444
}
1445
1446
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1447
compat_ulong_t mode, compat_ulong_t __user *nmask,
1448
compat_ulong_t maxnode, compat_ulong_t flags)
1449
{
1450
long err = 0;
1451
unsigned long __user *nm = NULL;
1452
unsigned long nr_bits, alloc_size;
1453
nodemask_t bm;
1454
1455
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1456
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1457
1458
if (nmask) {
1459
err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1460
nm = compat_alloc_user_space(alloc_size);
1461
err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1462
}
1463
1464
if (err)
1465
return -EFAULT;
1466
1467
return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1468
}
1469
1470
#endif
1471
1472
/*
1473
* get_vma_policy(@task, @vma, @addr)
1474
* @task - task for fallback if vma policy == default
1475
* @vma - virtual memory area whose policy is sought
1476
* @addr - address in @vma for shared policy lookup
1477
*
1478
* Returns effective policy for a VMA at specified address.
1479
* Falls back to @task or system default policy, as necessary.
1480
* Current or other task's task mempolicy and non-shared vma policies
1481
* are protected by the task's mmap_sem, which must be held for read by
1482
* the caller.
1483
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1484
* count--added by the get_policy() vm_op, as appropriate--to protect against
1485
* freeing by another task. It is the caller's responsibility to free the
1486
* extra reference for shared policies.
1487
*/
1488
struct mempolicy *get_vma_policy(struct task_struct *task,
1489
struct vm_area_struct *vma, unsigned long addr)
1490
{
1491
struct mempolicy *pol = task->mempolicy;
1492
1493
if (vma) {
1494
if (vma->vm_ops && vma->vm_ops->get_policy) {
1495
struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1496
addr);
1497
if (vpol)
1498
pol = vpol;
1499
} else if (vma->vm_policy)
1500
pol = vma->vm_policy;
1501
}
1502
if (!pol)
1503
pol = &default_policy;
1504
return pol;
1505
}
1506
1507
/*
1508
* Return a nodemask representing a mempolicy for filtering nodes for
1509
* page allocation
1510
*/
1511
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1512
{
1513
/* Lower zones don't get a nodemask applied for MPOL_BIND */
1514
if (unlikely(policy->mode == MPOL_BIND) &&
1515
gfp_zone(gfp) >= policy_zone &&
1516
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1517
return &policy->v.nodes;
1518
1519
return NULL;
1520
}
1521
1522
/* Return a zonelist indicated by gfp for node representing a mempolicy */
1523
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1524
int nd)
1525
{
1526
switch (policy->mode) {
1527
case MPOL_PREFERRED:
1528
if (!(policy->flags & MPOL_F_LOCAL))
1529
nd = policy->v.preferred_node;
1530
break;
1531
case MPOL_BIND:
1532
/*
1533
* Normally, MPOL_BIND allocations are node-local within the
1534
* allowed nodemask. However, if __GFP_THISNODE is set and the
1535
* current node isn't part of the mask, we use the zonelist for
1536
* the first node in the mask instead.
1537
*/
1538
if (unlikely(gfp & __GFP_THISNODE) &&
1539
unlikely(!node_isset(nd, policy->v.nodes)))
1540
nd = first_node(policy->v.nodes);
1541
break;
1542
default:
1543
BUG();
1544
}
1545
return node_zonelist(nd, gfp);
1546
}
1547
1548
/* Do dynamic interleaving for a process */
1549
static unsigned interleave_nodes(struct mempolicy *policy)
1550
{
1551
unsigned nid, next;
1552
struct task_struct *me = current;
1553
1554
nid = me->il_next;
1555
next = next_node(nid, policy->v.nodes);
1556
if (next >= MAX_NUMNODES)
1557
next = first_node(policy->v.nodes);
1558
if (next < MAX_NUMNODES)
1559
me->il_next = next;
1560
return nid;
1561
}
1562
1563
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
1571
unsigned slab_node(struct mempolicy *policy)
1572
{
1573
if (!policy || policy->flags & MPOL_F_LOCAL)
1574
return numa_node_id();
1575
1576
switch (policy->mode) {
1577
case MPOL_PREFERRED:
1578
/*
1579
* handled MPOL_F_LOCAL above
1580
*/
1581
return policy->v.preferred_node;
1582
1583
case MPOL_INTERLEAVE:
1584
return interleave_nodes(policy);
1585
1586
case MPOL_BIND: {
1587
/*
1588
* Follow bind policy behavior and start allocation at the
1589
* first node.
1590
*/
1591
struct zonelist *zonelist;
1592
struct zone *zone;
1593
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1594
zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1595
(void)first_zones_zonelist(zonelist, highest_zoneidx,
1596
&policy->v.nodes,
1597
&zone);
1598
return zone ? zone->node : numa_node_id();
1599
}
1600
1601
default:
1602
BUG();
1603
}
1604
}
1605
1606
/* Do static interleaving for a VMA with known offset. */
1607
static unsigned offset_il_node(struct mempolicy *pol,
1608
struct vm_area_struct *vma, unsigned long off)
1609
{
1610
unsigned nnodes = nodes_weight(pol->v.nodes);
1611
unsigned target;
1612
int c;
1613
int nid = -1;
1614
1615
if (!nnodes)
1616
return numa_node_id();
1617
target = (unsigned int)off % nnodes;
1618
c = 0;
1619
do {
1620
nid = next_node(nid, pol->v.nodes);
1621
c++;
1622
} while (c <= target);
1623
return nid;
1624
}
1625
1626
/* Determine a node number for interleave */
1627
static inline unsigned interleave_nid(struct mempolicy *pol,
1628
struct vm_area_struct *vma, unsigned long addr, int shift)
1629
{
1630
if (vma) {
1631
unsigned long off;
1632
1633
/*
1634
* for small pages, there is no difference between
1635
* shift and PAGE_SHIFT, so the bit-shift is safe.
1636
* for huge pages, since vm_pgoff is in units of small
1637
* pages, we need to shift off the always 0 bits to get
1638
* a useful offset.
1639
*/
1640
BUG_ON(shift < PAGE_SHIFT);
1641
off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1642
off += (addr - vma->vm_start) >> shift;
1643
return offset_il_node(pol, vma, off);
1644
} else
1645
return interleave_nodes(pol);
1646
}
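/*
 * A stand-alone sketch (not from this file) of the static interleave
 * calculation in offset_il_node()/interleave_nid() above, using a 64-bit
 * mask instead of nodemask_t.  The helper name is invented; offsets map
 * onto the interleave set round-robin.
 */
#if 0	/* illustrative user-space sketch, not compiled with the kernel */
#include <stdint.h>
#include <stdio.h>

/* pick the (off % weight)-th set bit of mask, like offset_il_node() */
static int interleave_node_for_offset(uint64_t mask, unsigned long off)
{
	int target = off % __builtin_popcountll(mask);

	for (int node = 0; node < 64; node++)
		if ((mask & (1ULL << node)) && target-- == 0)
			return node;
	return -1;
}

int main(void)
{
	uint64_t nodes = (1ULL << 0) | (1ULL << 2) | (1ULL << 3);

	/* page offsets 0..5 land on nodes 0,2,3,0,2,3 */
	for (unsigned long off = 0; off < 6; off++)
		printf("offset %lu -> node %d\n", off,
		       interleave_node_for_offset(nodes, off));
	return 0;
}
#endif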
1647
1648
#ifdef CONFIG_HUGETLBFS
1649
/*
1650
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1651
* @vma = virtual memory area whose policy is sought
1652
* @addr = address in @vma for shared policy lookup and interleave policy
1653
* @gfp_flags = for requested zone
1654
* @mpol = pointer to mempolicy pointer for reference counted mempolicy
1655
* @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1656
*
1657
* Returns a zonelist suitable for a huge page allocation and a pointer
1658
* to the struct mempolicy for conditional unref after allocation.
1659
* If the effective policy is 'BIND', returns a pointer to the mempolicy's
1660
* @nodemask for filtering the zonelist.
1661
*
1662
* Must be protected by get_mems_allowed()
1663
*/
1664
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1665
gfp_t gfp_flags, struct mempolicy **mpol,
1666
nodemask_t **nodemask)
1667
{
1668
struct zonelist *zl;
1669
1670
*mpol = get_vma_policy(current, vma, addr);
1671
*nodemask = NULL; /* assume !MPOL_BIND */
1672
1673
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1674
zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1675
huge_page_shift(hstate_vma(vma))), gfp_flags);
1676
} else {
1677
zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1678
if ((*mpol)->mode == MPOL_BIND)
1679
*nodemask = &(*mpol)->v.nodes;
1680
}
1681
return zl;
1682
}
1683
1684
/*
1685
* init_nodemask_of_mempolicy
1686
*
1687
* If the current task's mempolicy is "default" [NULL], return 'false'
1688
* to indicate default policy. Otherwise, extract the policy nodemask
1689
* for 'bind' or 'interleave' policy into the argument nodemask, or
1690
* initialize the argument nodemask to contain the single node for
1691
* 'preferred' or 'local' policy and return 'true' to indicate presence
1692
* of non-default mempolicy.
1693
*
1694
* We don't bother with reference counting the mempolicy [mpol_get/put]
* because the current task is examining its own mempolicy and a task's
* mempolicy is only ever changed by the task itself.
1697
*
1698
* N.B., it is the caller's responsibility to free a returned nodemask.
1699
*/
1700
bool init_nodemask_of_mempolicy(nodemask_t *mask)
1701
{
1702
struct mempolicy *mempolicy;
1703
int nid;
1704
1705
if (!(mask && current->mempolicy))
1706
return false;
1707
1708
task_lock(current);
1709
mempolicy = current->mempolicy;
1710
switch (mempolicy->mode) {
1711
case MPOL_PREFERRED:
1712
if (mempolicy->flags & MPOL_F_LOCAL)
1713
nid = numa_node_id();
1714
else
1715
nid = mempolicy->v.preferred_node;
1716
init_nodemask_of_node(mask, nid);
1717
break;
1718
1719
case MPOL_BIND:
1720
/* Fall through */
1721
case MPOL_INTERLEAVE:
1722
*mask = mempolicy->v.nodes;
1723
break;
1724
1725
default:
1726
BUG();
1727
}
1728
task_unlock(current);
1729
1730
return true;
1731
}
1732
#endif
1733
1734
/*
1735
* mempolicy_nodemask_intersects
1736
*
1737
* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1738
* policy. Otherwise, check for intersection between mask and the policy
1739
* nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1740
* policy, always return true since it may allocate elsewhere on fallback.
1741
*
1742
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
1743
*/
1744
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1745
const nodemask_t *mask)
1746
{
1747
struct mempolicy *mempolicy;
1748
bool ret = true;
1749
1750
if (!mask)
1751
return ret;
1752
task_lock(tsk);
1753
mempolicy = tsk->mempolicy;
1754
if (!mempolicy)
1755
goto out;
1756
1757
switch (mempolicy->mode) {
1758
case MPOL_PREFERRED:
1759
/*
1760
* MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1761
* allocate from, they may fallback to other nodes when oom.
1762
* Thus, it's possible for tsk to have allocated memory from
1763
* nodes in mask.
1764
*/
1765
break;
1766
case MPOL_BIND:
1767
case MPOL_INTERLEAVE:
1768
ret = nodes_intersects(mempolicy->v.nodes, *mask);
1769
break;
1770
default:
1771
BUG();
1772
}
1773
out:
1774
task_unlock(tsk);
1775
return ret;
1776
}
1777
1778
/* Allocate a page in interleaved policy.
1779
Own path because it needs to do special accounting. */
1780
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1781
unsigned nid)
1782
{
1783
struct zonelist *zl;
1784
struct page *page;
1785
1786
zl = node_zonelist(nid, gfp);
1787
page = __alloc_pages(gfp, order, zl);
1788
if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1789
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1790
return page;
1791
}
1792
1793
/**
1794
* alloc_pages_vma - Allocate a page for a VMA.
1795
*
1796
* @gfp:
1797
* %GFP_USER user allocation.
1798
* %GFP_KERNEL kernel allocations,
1799
* %GFP_HIGHMEM highmem/user allocations,
1800
* %GFP_FS allocation should not call back into a file system.
1801
* %GFP_ATOMIC don't sleep.
1802
*
1803
* @order:Order of the GFP allocation.
1804
* @vma: Pointer to VMA or NULL if not available.
1805
* @addr: Virtual Address of the allocation. Must be inside the VMA.
1806
*
1807
* This function allocates a page from the kernel page pool and applies
1808
* a NUMA policy associated with the VMA or the current process.
1809
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
1810
* mm_struct of the VMA to prevent it from going away. Should be used for
1811
* all allocations for pages that will be mapped into
1812
* user space. Returns NULL when no page can be allocated.
1813
*
1814
* Should be called with the mmap_sem of the vma held.
1815
*/
1816
struct page *
1817
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818
unsigned long addr, int node)
1819
{
1820
struct mempolicy *pol = get_vma_policy(current, vma, addr);
1821
struct zonelist *zl;
1822
struct page *page;
1823
1824
get_mems_allowed();
1825
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1826
unsigned nid;
1827
1828
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1829
mpol_cond_put(pol);
1830
page = alloc_page_interleave(gfp, order, nid);
1831
put_mems_allowed();
1832
return page;
1833
}
1834
zl = policy_zonelist(gfp, pol, node);
1835
if (unlikely(mpol_needs_cond_ref(pol))) {
1836
/*
1837
* slow path: ref counted shared policy
1838
*/
1839
struct page *page = __alloc_pages_nodemask(gfp, order,
1840
zl, policy_nodemask(gfp, pol));
1841
__mpol_put(pol);
1842
put_mems_allowed();
1843
return page;
1844
}
1845
/*
1846
* fast path: default or task policy
1847
*/
1848
page = __alloc_pages_nodemask(gfp, order, zl,
1849
policy_nodemask(gfp, pol));
1850
put_mems_allowed();
1851
return page;
1852
}
1853
1854
/**
1855
* alloc_pages_current - Allocate pages.
1856
*
1857
* @gfp:
1858
* %GFP_USER user allocation,
1859
* %GFP_KERNEL kernel allocation,
1860
* %GFP_HIGHMEM highmem allocation,
1861
* %GFP_FS don't call back into a file system.
1862
* %GFP_ATOMIC don't sleep.
1863
* @order: Power of two of allocation size in pages. 0 is a single page.
1864
*
1865
* Allocate a page from the kernel page pool.  When not in interrupt
* context, apply the current process' NUMA policy.
1867
* Returns NULL when no page can be allocated.
1868
*
1869
* Don't call cpuset_update_task_memory_state() unless
1870
* 1) it's ok to take cpuset_sem (can WAIT), and
1871
* 2) allocating for current task (not interrupt).
1872
*/
1873
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1874
{
1875
struct mempolicy *pol = current->mempolicy;
1876
struct page *page;
1877
1878
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1879
pol = &default_policy;
1880
1881
get_mems_allowed();
1882
/*
1883
* No reference counting needed for current->mempolicy
1884
* nor system default_policy
1885
*/
1886
if (pol->mode == MPOL_INTERLEAVE)
1887
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1888
else
1889
page = __alloc_pages_nodemask(gfp, order,
1890
policy_zonelist(gfp, pol, numa_node_id()),
1891
policy_nodemask(gfp, pol));
1892
put_mems_allowed();
1893
return page;
1894
}
1895
EXPORT_SYMBOL(alloc_pages_current);
1896
1897
/*
1898
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy it is copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebound by another task (the task that changes
* cpuset's mems), so we needn't do rebind work for the current task.
1906
*/
1907
1908
/* Slow path of a mempolicy duplicate */
1909
struct mempolicy *__mpol_dup(struct mempolicy *old)
1910
{
1911
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1912
1913
if (!new)
1914
return ERR_PTR(-ENOMEM);
1915
1916
/* task's mempolicy is protected by alloc_lock */
1917
if (old == current->mempolicy) {
1918
task_lock(current);
1919
*new = *old;
1920
task_unlock(current);
1921
} else
1922
*new = *old;
1923
1924
rcu_read_lock();
1925
if (current_cpuset_is_being_rebound()) {
1926
nodemask_t mems = cpuset_mems_allowed(current);
1927
if (new->flags & MPOL_F_REBINDING)
1928
mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1929
else
1930
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1931
}
1932
rcu_read_unlock();
1933
atomic_set(&new->refcnt, 1);
1934
return new;
1935
}
1936
1937
/*
 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 * eliminate the MPOL_F_* flags that require conditional ref and
 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
 * after return.  Use the returned value.
 *
 * Allows use of a mempolicy for, e.g., multiple allocations with a single
 * policy lookup, even if the policy needs/has extra ref on lookup.
 * shmem_readahead needs this.
 */
struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
                                   struct mempolicy *frompol)
{
        if (!mpol_needs_cond_ref(frompol))
                return frompol;

        *tompol = *frompol;
        tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
        __mpol_put(frompol);
        return tompol;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->mode != b->mode)
                return 0;
        if (a->flags != b->flags)
                return 0;
        if (mpol_store_user_nodemask(a))
                if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                        return 0;

        switch (a->mode) {
        case MPOL_BIND:
                /* Fall through */
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the tree. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->mode : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

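/* Remove a node from the tree, drop its policy reference and free it. */
/* Caller holds sp->lock */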
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        pr_debug("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_put(n->policy);
        kmem_cache_free(sn_cache, n);
}

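/* Allocate a node for the range [start, end) and take a shared reference on @pol. */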
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        pol->flags |= MPOL_F_SHARED;    /* for unref */
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_put(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() time, so we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
        spin_lock_init(&sp->lock);

        if (mpol) {
                struct vm_area_struct pvma;
                struct mempolicy *new;
                NODEMASK_SCRATCH(scratch);

                if (!scratch)
                        goto put_mpol;
                /* contextualize the tmpfs mount point mempolicy */
                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(new))
                        goto free_scratch; /* no valid nodemask intersection */

                task_lock(current);
                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_new;

                /* Create pseudo-vma that contains just the policy */
                memset(&pvma, 0, sizeof(struct vm_area_struct));
                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
                mpol_put(new);                  /* drop initial ref */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);         /* drop our incoming ref on sb mpol */
        }
}

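/* Install @npol as the shared policy for the range covered by @vma. */
/* A NULL @npol resets that range to the default policy. */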
int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->mode : -1,
                 npol ? npol->flags : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_put(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes;
        unsigned long largest = 0;
        int nid, prefer = 0;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_HIGH_MEMORY) {
                unsigned long total_pages = node_present_pages(nid);

                /* Preserve the largest node */
                if (largest < total_pages) {
                        largest = total_pages;
                        prefer = nid;
                }

                /* Interleave this node? */
                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(prefer, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
 * Used only for mpol_parse_str() and mpol_to_str()
 */
#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_LOCAL]      = "local"
};


#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 * @no_context: flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *      <mode>[=<flags>][:<nodelist>]
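 *      e.g. "bind:0-3", "interleave=static:0,2" or "prefer:1"
 *      (node numbers above are purely illustrative)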
 *
 * if @no_context is true, save the input nodemask in w.user_nodemask in
 * the returned mempolicy.  This will be used to "clone" the mempolicy in
 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
 * mount option.  Note that if 'static' or 'relative' mode flags were
 * specified, the input nodemask will already have been saved.  Saving
 * it again is redundant, but safe.
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
        struct mempolicy *new = NULL;
        unsigned short mode;
        unsigned short uninitialized_var(mode_flags);
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1;

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        for (mode = 0; mode <= MPOL_LOCAL; mode++) {
                if (!strcmp(str, policy_modes[mode])) {
                        break;
                }
        }
        if (mode > MPOL_LOCAL)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_HIGH_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist; mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                mode = MPOL_PREFERRED;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on an empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        if (no_context) {
                /* save for contextualization */
                new->w.user_nodemask = nodes;
        } else {
                int ret;
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        task_lock(current);
                        ret = mpol_set_nodemask(new, &nodes, scratch);
                        task_unlock(current);
                } else
                        ret = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
                if (ret) {
                        mpol_put(new);
                        goto out;
                }
        }
        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: to contain formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
 *
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
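 * e.g. an interleave policy over nodes 0-3 formats as "interleave:0-3"
 * (node numbers here are illustrative)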
 */
int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        unsigned short mode;
        unsigned short flags = pol ? pol->flags : 0;

        /*
         * Sanity check: room for longest mode, flag and some nodes
         */
        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);

        if (!pol || pol == &default_policy)
                mode = MPOL_DEFAULT;
        else
                mode = pol->mode;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                if (flags & MPOL_F_LOCAL)
                        mode = MPOL_LOCAL;      /* pseudo-policy */
                else
                        node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                /* Fall through */
        case MPOL_INTERLEAVE:
                if (no_context)
                        nodes = pol->w.user_nodemask;
                else
                        nodes = pol->v.nodes;
                break;

        default:
                BUG();
        }

        l = strlen(policy_modes[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_modes[mode]);
        p += l;

        if (flags & MPOL_MODE_FLAGS) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';

                /*
                 * Currently, the only defined flags are mutually exclusive
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += snprintf(p, buffer + maxlen - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += snprintf(p, buffer + maxlen - p, "relative");
        }

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = ':';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}
