GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/nommu.c
/*
 * linux/mm/nommu.c
 *
 * Replacement code for mm functions to support CPUs that don't
 * have any form of memory management unit (thus no virtual memory).
 *
 * See Documentation/nommu-mmap.txt
 *
 * Copyright (c) 2004-2008 David Howells <[email protected]>
 * Copyright (c) 2000-2003 David McCullough <[email protected]>
 * Copyright (c) 2000-2001 D Jeff Dionne <[email protected]>
 * Copyright (c) 2002 Greg Ungerer <[email protected]>
 * Copyright (c) 2007-2010 Paul Mundt <[email protected]>
 */
15
16
#include <linux/module.h>
17
#include <linux/mm.h>
18
#include <linux/mman.h>
19
#include <linux/swap.h>
20
#include <linux/file.h>
21
#include <linux/highmem.h>
22
#include <linux/pagemap.h>
23
#include <linux/slab.h>
24
#include <linux/vmalloc.h>
25
#include <linux/tracehook.h>
26
#include <linux/blkdev.h>
27
#include <linux/backing-dev.h>
28
#include <linux/mount.h>
29
#include <linux/personality.h>
30
#include <linux/security.h>
31
#include <linux/syscalls.h>
32
#include <linux/audit.h>
33
34
#include <asm/uaccess.h>
35
#include <asm/tlb.h>
36
#include <asm/tlbflush.h>
37
#include <asm/mmu_context.h>
38
#include "internal.h"
39
40
#if 0
41
#define kenter(FMT, ...) \
42
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
43
#define kleave(FMT, ...) \
44
printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
45
#define kdebug(FMT, ...) \
46
printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
47
#else
48
#define kenter(FMT, ...) \
49
no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
50
#define kleave(FMT, ...) \
51
no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
52
#define kdebug(FMT, ...) \
53
no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
54
#endif
55
56
void *high_memory;
57
struct page *mem_map;
58
unsigned long max_mapnr;
59
unsigned long num_physpages;
60
unsigned long highest_memmap_pfn;
61
struct percpu_counter vm_committed_as;
62
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
63
int sysctl_overcommit_ratio = 50; /* default is 50% */
64
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
65
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
66
int heap_stack_gap = 0;
67
68
atomic_long_t mmap_pages_allocated;
69
70
EXPORT_SYMBOL(mem_map);
71
EXPORT_SYMBOL(num_physpages);
72
73
/* list of mapped, potentially shareable regions */
74
static struct kmem_cache *vm_region_jar;
75
struct rb_root nommu_region_tree = RB_ROOT;
76
DECLARE_RWSEM(nommu_region_sem);
77
78
const struct vm_operations_struct generic_file_vm_ops = {
79
};
80
81
/*
82
* Return the total memory allocated for this pointer, not
83
* just what the caller asked for.
84
*
85
* Doesn't have to be accurate, i.e. may have races.
86
*/
87
unsigned int kobjsize(const void *objp)
88
{
89
struct page *page;
90
91
/*
92
* If the object we have should not have ksize performed on it,
93
* return size of 0
94
*/
95
if (!objp || !virt_addr_valid(objp))
96
return 0;
97
98
page = virt_to_head_page(objp);
99
100
/*
101
* If the allocator sets PageSlab, we know the pointer came from
102
* kmalloc().
103
*/
104
if (PageSlab(page))
105
return ksize(objp);
106
107
/*
108
* If it's not a compound page, see if we have a matching VMA
109
* region. This test is intentionally done in reverse order,
110
* so if there's no VMA, we still fall through and hand back
111
* PAGE_SIZE for 0-order pages.
112
*/
113
if (!PageCompound(page)) {
114
struct vm_area_struct *vma;
115
116
vma = find_vma(current->mm, (unsigned long)objp);
117
if (vma)
118
return vma->vm_end - vma->vm_start;
119
}
120
121
/*
122
* The ksize() function is only guaranteed to work for pointers
123
* returned by kmalloc(). So handle arbitrary pointers here.
124
*/
125
return PAGE_SIZE << compound_order(page);
126
}
127
128
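/*
 * pin the pages backing an address range in the given task's mm
 * - permissions are checked against the covering VMA's flags first
 * - with no MMU, user addresses are physical, so each page is obtained
 *   directly with virt_to_page() and a reference taken on it
 */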
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
129
unsigned long start, int nr_pages, unsigned int foll_flags,
130
struct page **pages, struct vm_area_struct **vmas,
131
int *retry)
132
{
133
struct vm_area_struct *vma;
134
unsigned long vm_flags;
135
int i;
136
137
/* calculate required read or write permissions.
138
* If FOLL_FORCE is set, we only require the "MAY" flags.
139
*/
140
vm_flags = (foll_flags & FOLL_WRITE) ?
141
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
142
vm_flags &= (foll_flags & FOLL_FORCE) ?
143
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
144
145
for (i = 0; i < nr_pages; i++) {
146
vma = find_vma(mm, start);
147
if (!vma)
148
goto finish_or_fault;
149
150
/* protect what we can, including chardevs */
151
if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
152
!(vm_flags & vma->vm_flags))
153
goto finish_or_fault;
154
155
if (pages) {
156
pages[i] = virt_to_page(start);
157
if (pages[i])
158
page_cache_get(pages[i]);
159
}
160
if (vmas)
161
vmas[i] = vma;
162
start = (start + PAGE_SIZE) & PAGE_MASK;
163
}
164
165
return i;
166
167
finish_or_fault:
168
return i ? : -EFAULT;
169
}
170
171
/*
172
* get a list of pages in an address range belonging to the specified process
173
* and indicate the VMA that covers each page
174
* - this is potentially dodgy as we may end up incrementing the page count of a
175
* slab page or a secondary page from a compound page
176
* - don't permit access to VMAs that don't support it, such as I/O mappings
177
*/
178
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
179
unsigned long start, int nr_pages, int write, int force,
180
struct page **pages, struct vm_area_struct **vmas)
181
{
182
int flags = 0;
183
184
if (write)
185
flags |= FOLL_WRITE;
186
if (force)
187
flags |= FOLL_FORCE;
188
189
return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190
NULL);
191
}
192
EXPORT_SYMBOL(get_user_pages);
193
194
/**
195
* follow_pfn - look up PFN at a user virtual address
196
* @vma: memory mapping
197
* @address: user virtual address
198
* @pfn: location to store found PFN
199
*
200
* Only IO mappings and raw PFN mappings are allowed.
201
*
202
* Returns zero and the pfn at @pfn on success, -ve otherwise.
203
*/
204
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
205
unsigned long *pfn)
206
{
207
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
208
return -EINVAL;
209
210
*pfn = address >> PAGE_SHIFT;
211
return 0;
212
}
213
EXPORT_SYMBOL(follow_pfn);
214
215
DEFINE_RWLOCK(vmlist_lock);
216
struct vm_struct *vmlist;
217
218
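/*
 * with no MMU, __vmalloc() below is backed by kmalloc(), so freeing a
 * "vmalloc" allocation is simply a kfree()
 */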
void vfree(const void *addr)
219
{
220
kfree(addr);
221
}
222
EXPORT_SYMBOL(vfree);
223
224
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
225
{
226
/*
227
* You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
228
* returns only a logical address.
229
*/
230
return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
231
}
232
EXPORT_SYMBOL(__vmalloc);
233
234
void *vmalloc_user(unsigned long size)
235
{
236
void *ret;
237
238
ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
239
PAGE_KERNEL);
240
if (ret) {
241
struct vm_area_struct *vma;
242
243
down_write(&current->mm->mmap_sem);
244
vma = find_vma(current->mm, (unsigned long)ret);
245
if (vma)
246
vma->vm_flags |= VM_USERMAP;
247
up_write(&current->mm->mmap_sem);
248
}
249
250
return ret;
251
}
252
EXPORT_SYMBOL(vmalloc_user);
253
254
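/*
 * kmalloc()-backed "vmalloc" memory is physically contiguous, so the
 * backing page and PFN can be derived directly from the address
 */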
struct page *vmalloc_to_page(const void *addr)
255
{
256
return virt_to_page(addr);
257
}
258
EXPORT_SYMBOL(vmalloc_to_page);
259
260
unsigned long vmalloc_to_pfn(const void *addr)
261
{
262
return page_to_pfn(virt_to_page(addr));
263
}
264
EXPORT_SYMBOL(vmalloc_to_pfn);
265
266
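/*
 * there is no separate vmalloc address space on NOMMU, so reading and
 * writing "vmalloc" memory reduces to a plain memcpy()
 */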
long vread(char *buf, char *addr, unsigned long count)
267
{
268
memcpy(buf, addr, count);
269
return count;
270
}
271
272
long vwrite(char *buf, char *addr, unsigned long count)
273
{
274
/* Don't allow overflow */
275
if ((unsigned long) addr + count < count)
276
count = -(unsigned long) addr;
277
278
memcpy(addr, buf, count);
279
return(count);
280
}
281
282
/*
283
* vmalloc - allocate virtually contiguous memory
284
*
285
* @size: allocation size
286
*
287
* Allocate enough pages to cover @size from the page level
288
* allocator and map them into contiguous kernel virtual space.
289
*
290
* For tight control over page level allocator and protection flags
291
* use __vmalloc() instead.
292
*/
293
void *vmalloc(unsigned long size)
294
{
295
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
296
}
297
EXPORT_SYMBOL(vmalloc);
298
299
/*
300
* vzalloc - allocate virtually contiguous memory with zero fill
301
*
302
* @size: allocation size
303
*
304
* Allocate enough pages to cover @size from the page level
305
* allocator and map them into contiguous kernel virtual space.
306
* The memory allocated is set to zero.
307
*
308
* For tight control over page level allocator and protection flags
309
* use __vmalloc() instead.
310
*/
311
void *vzalloc(unsigned long size)
312
{
313
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
314
PAGE_KERNEL);
315
}
316
EXPORT_SYMBOL(vzalloc);
317
318
/**
319
* vmalloc_node - allocate memory on a specific node
320
* @size: allocation size
321
* @node: numa node
322
*
323
* Allocate enough pages to cover @size from the page level
324
* allocator and map them into contiguous kernel virtual space.
325
*
326
* For tight control over page level allocator and protection flags
327
* use __vmalloc() instead.
328
*/
329
void *vmalloc_node(unsigned long size, int node)
330
{
331
return vmalloc(size);
332
}
333
EXPORT_SYMBOL(vmalloc_node);
334
335
/**
336
* vzalloc_node - allocate memory on a specific node with zero fill
337
* @size: allocation size
338
* @node: numa node
339
*
340
* Allocate enough pages to cover @size from the page level
341
* allocator and map them into contiguous kernel virtual space.
342
* The memory allocated is set to zero.
343
*
344
* For tight control over page level allocator and protection flags
345
* use __vmalloc() instead.
346
*/
347
void *vzalloc_node(unsigned long size, int node)
348
{
349
return vzalloc(size);
350
}
351
EXPORT_SYMBOL(vzalloc_node);
352
353
#ifndef PAGE_KERNEL_EXEC
354
# define PAGE_KERNEL_EXEC PAGE_KERNEL
355
#endif
356
357
/**
358
* vmalloc_exec - allocate virtually contiguous, executable memory
359
* @size: allocation size
360
*
361
* Kernel-internal function to allocate enough pages to cover @size from
362
* the page level allocator and map them into contiguous and
363
* executable kernel virtual space.
364
*
365
* For tight control over page level allocator and protection flags
366
* use __vmalloc() instead.
367
*/
368
369
void *vmalloc_exec(unsigned long size)
370
{
371
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
372
}
373
374
/**
375
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
376
* @size: allocation size
377
*
378
* Allocate enough 32bit PA addressable pages to cover @size from the
379
* page level allocator and map them into contiguous kernel virtual space.
380
*/
381
void *vmalloc_32(unsigned long size)
382
{
383
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
384
}
385
EXPORT_SYMBOL(vmalloc_32);
386
387
/**
388
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
389
* @size: allocation size
390
*
391
* The resulting memory area is 32bit addressable and zeroed so it can be
392
* mapped to userspace without leaking data.
393
*
394
* VM_USERMAP is set on the corresponding VMA so that subsequent calls to
395
* remap_vmalloc_range() are permissible.
396
*/
397
void *vmalloc_32_user(unsigned long size)
398
{
399
/*
400
* We'll have to sort out the ZONE_DMA bits for 64-bit,
401
* but for now this can simply use vmalloc_user() directly.
402
*/
403
return vmalloc_user(size);
404
}
405
EXPORT_SYMBOL(vmalloc_32_user);
406
407
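/*
 * mapping an arbitrary set of pages into a contiguous kernel range
 * needs an MMU, so vmap()/vunmap() and vm_map_ram()/vm_unmap_ram()
 * cannot be supported here and BUG() if called
 */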
void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
408
{
409
BUG();
410
return NULL;
411
}
412
EXPORT_SYMBOL(vmap);
413
414
void vunmap(const void *addr)
415
{
416
BUG();
417
}
418
EXPORT_SYMBOL(vunmap);
419
420
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
421
{
422
BUG();
423
return NULL;
424
}
425
EXPORT_SYMBOL(vm_map_ram);
426
427
void vm_unmap_ram(const void *mem, unsigned int count)
428
{
429
BUG();
430
}
431
EXPORT_SYMBOL(vm_unmap_ram);
432
433
void vm_unmap_aliases(void)
434
{
435
}
436
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
437
438
/*
439
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
440
* have one.
441
*/
442
void __attribute__((weak)) vmalloc_sync_all(void)
443
{
444
}
445
446
/**
447
* alloc_vm_area - allocate a range of kernel address space
448
* @size: size of the area
449
*
450
* Returns: NULL on failure, vm_struct on success
451
*
452
* This function reserves a range of kernel address space, and
453
* allocates pagetables to map that range. No actual mappings
454
* are created. If the kernel address space is not shared
455
* between processes, it syncs the pagetable across all
456
* processes.
457
*/
458
struct vm_struct *alloc_vm_area(size_t size)
459
{
460
BUG();
461
return NULL;
462
}
463
EXPORT_SYMBOL_GPL(alloc_vm_area);
464
465
void free_vm_area(struct vm_struct *area)
466
{
467
BUG();
468
}
469
EXPORT_SYMBOL_GPL(free_vm_area);
470
471
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
472
struct page *page)
473
{
474
return -EINVAL;
475
}
476
EXPORT_SYMBOL(vm_insert_page);
477
478
/*
479
* sys_brk() for the most part doesn't need the global kernel
480
* lock, except when an application is doing something nasty
481
* like trying to un-brk an area that has already been mapped
482
* to a regular file. In this case, the unmapping will need
483
* to invoke file system routines that need the global lock.
484
*/
485
SYSCALL_DEFINE1(brk, unsigned long, brk)
486
{
487
struct mm_struct *mm = current->mm;
488
489
if (brk < mm->start_brk || brk > mm->context.end_brk)
490
return mm->brk;
491
492
if (mm->brk == brk)
493
return mm->brk;
494
495
/*
496
* Always allow shrinking brk
497
*/
498
if (brk <= mm->brk) {
499
mm->brk = brk;
500
return brk;
501
}
502
503
/*
504
* Ok, looks good - let it rip.
505
*/
506
flush_icache_range(mm->brk, brk);
507
return mm->brk = brk;
508
}
509
510
/*
511
* initialise the VMA and region record slabs
512
*/
513
void __init mmap_init(void)
514
{
515
int ret;
516
517
ret = percpu_counter_init(&vm_committed_as, 0);
518
VM_BUG_ON(ret);
519
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
520
}
521
522
/*
523
* validate the region tree
524
* - the caller must hold the region lock
525
*/
526
#ifdef CONFIG_DEBUG_NOMMU_REGIONS
527
static noinline void validate_nommu_regions(void)
528
{
529
struct vm_region *region, *last;
530
struct rb_node *p, *lastp;
531
532
lastp = rb_first(&nommu_region_tree);
533
if (!lastp)
534
return;
535
536
last = rb_entry(lastp, struct vm_region, vm_rb);
537
BUG_ON(unlikely(last->vm_end <= last->vm_start));
538
BUG_ON(unlikely(last->vm_top < last->vm_end));
539
540
while ((p = rb_next(lastp))) {
541
region = rb_entry(p, struct vm_region, vm_rb);
542
last = rb_entry(lastp, struct vm_region, vm_rb);
543
544
BUG_ON(unlikely(region->vm_end <= region->vm_start));
545
BUG_ON(unlikely(region->vm_top < region->vm_end));
546
BUG_ON(unlikely(region->vm_start < last->vm_top));
547
548
lastp = p;
549
}
550
}
551
#else
552
static void validate_nommu_regions(void)
553
{
554
}
555
#endif
556
557
/*
558
* add a region into the global tree
559
*/
560
static void add_nommu_region(struct vm_region *region)
561
{
562
struct vm_region *pregion;
563
struct rb_node **p, *parent;
564
565
validate_nommu_regions();
566
567
parent = NULL;
568
p = &nommu_region_tree.rb_node;
569
while (*p) {
570
parent = *p;
571
pregion = rb_entry(parent, struct vm_region, vm_rb);
572
if (region->vm_start < pregion->vm_start)
573
p = &(*p)->rb_left;
574
else if (region->vm_start > pregion->vm_start)
575
p = &(*p)->rb_right;
576
else if (pregion == region)
577
return;
578
else
579
BUG();
580
}
581
582
rb_link_node(&region->vm_rb, parent, p);
583
rb_insert_color(&region->vm_rb, &nommu_region_tree);
584
585
validate_nommu_regions();
586
}
587
588
/*
589
* delete a region from the global tree
590
*/
591
static void delete_nommu_region(struct vm_region *region)
592
{
593
BUG_ON(!nommu_region_tree.rb_node);
594
595
validate_nommu_regions();
596
rb_erase(&region->vm_rb, &nommu_region_tree);
597
validate_nommu_regions();
598
}
599
600
/*
601
* free a contiguous series of pages
602
*/
603
static void free_page_series(unsigned long from, unsigned long to)
604
{
605
for (; from < to; from += PAGE_SIZE) {
606
struct page *page = virt_to_page(from);
607
608
kdebug("- free %lx", from);
609
atomic_long_dec(&mmap_pages_allocated);
610
if (page_count(page) != 1)
611
kdebug("free page %p: refcount not one: %d",
612
page, page_count(page));
613
put_page(page);
614
}
615
}
616
617
/*
618
* release a reference to a region
619
* - the caller must hold the region semaphore for writing, which this releases
620
* - the region may not have been added to the tree yet, in which case vm_top
621
* will equal vm_start
622
*/
623
static void __put_nommu_region(struct vm_region *region)
624
__releases(nommu_region_sem)
625
{
626
kenter("%p{%d}", region, region->vm_usage);
627
628
BUG_ON(!nommu_region_tree.rb_node);
629
630
if (--region->vm_usage == 0) {
631
if (region->vm_top > region->vm_start)
632
delete_nommu_region(region);
633
up_write(&nommu_region_sem);
634
635
if (region->vm_file)
636
fput(region->vm_file);
637
638
/* IO memory and memory shared directly out of the pagecache
639
* from ramfs/tmpfs mustn't be released here */
640
if (region->vm_flags & VM_MAPPED_COPY) {
641
kdebug("free series");
642
free_page_series(region->vm_start, region->vm_top);
643
}
644
kmem_cache_free(vm_region_jar, region);
645
} else {
646
up_write(&nommu_region_sem);
647
}
648
}
649
650
/*
651
* release a reference to a region
652
*/
653
static void put_nommu_region(struct vm_region *region)
654
{
655
down_write(&nommu_region_sem);
656
__put_nommu_region(region);
657
}
658
659
/*
660
* update protection on a vma
661
*/
662
static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
663
{
664
#ifdef CONFIG_MPU
665
struct mm_struct *mm = vma->vm_mm;
666
long start = vma->vm_start & PAGE_MASK;
667
while (start < vma->vm_end) {
668
protect_page(mm, start, flags);
669
start += PAGE_SIZE;
670
}
671
update_protections(mm);
672
#endif
673
}
674
675
/*
676
* add a VMA into a process's mm_struct in the appropriate place in the list
677
* and tree and add to the address space's page tree also if not an anonymous
678
* page
679
* - should be called with mm->mmap_sem held writelocked
680
*/
681
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
682
{
683
struct vm_area_struct *pvma, *prev;
684
struct address_space *mapping;
685
struct rb_node **p, *parent, *rb_prev;
686
687
kenter(",%p", vma);
688
689
BUG_ON(!vma->vm_region);
690
691
mm->map_count++;
692
vma->vm_mm = mm;
693
694
protect_vma(vma, vma->vm_flags);
695
696
/* add the VMA to the mapping */
697
if (vma->vm_file) {
698
mapping = vma->vm_file->f_mapping;
699
700
flush_dcache_mmap_lock(mapping);
701
vma_prio_tree_insert(vma, &mapping->i_mmap);
702
flush_dcache_mmap_unlock(mapping);
703
}
704
705
/* add the VMA to the tree */
706
parent = rb_prev = NULL;
707
p = &mm->mm_rb.rb_node;
708
while (*p) {
709
parent = *p;
710
pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
711
712
/* sort by: start addr, end addr, VMA struct addr in that order
713
* (the latter is necessary as we may get identical VMAs) */
714
if (vma->vm_start < pvma->vm_start)
715
p = &(*p)->rb_left;
716
else if (vma->vm_start > pvma->vm_start) {
717
rb_prev = parent;
718
p = &(*p)->rb_right;
719
} else if (vma->vm_end < pvma->vm_end)
720
p = &(*p)->rb_left;
721
else if (vma->vm_end > pvma->vm_end) {
722
rb_prev = parent;
723
p = &(*p)->rb_right;
724
} else if (vma < pvma)
725
p = &(*p)->rb_left;
726
else if (vma > pvma) {
727
rb_prev = parent;
728
p = &(*p)->rb_right;
729
} else
730
BUG();
731
}
732
733
rb_link_node(&vma->vm_rb, parent, p);
734
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
735
736
/* add VMA to the VMA list also */
737
prev = NULL;
738
if (rb_prev)
739
prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
740
741
__vma_link_list(mm, vma, prev, parent);
742
}
743
744
/*
745
* delete a VMA from its owning mm_struct and address space
746
*/
747
static void delete_vma_from_mm(struct vm_area_struct *vma)
748
{
749
struct address_space *mapping;
750
struct mm_struct *mm = vma->vm_mm;
751
752
kenter("%p", vma);
753
754
protect_vma(vma, 0);
755
756
mm->map_count--;
757
if (mm->mmap_cache == vma)
758
mm->mmap_cache = NULL;
759
760
/* remove the VMA from the mapping */
761
if (vma->vm_file) {
762
mapping = vma->vm_file->f_mapping;
763
764
flush_dcache_mmap_lock(mapping);
765
vma_prio_tree_remove(vma, &mapping->i_mmap);
766
flush_dcache_mmap_unlock(mapping);
767
}
768
769
/* remove from the MM's tree and list */
770
rb_erase(&vma->vm_rb, &mm->mm_rb);
771
772
if (vma->vm_prev)
773
vma->vm_prev->vm_next = vma->vm_next;
774
else
775
mm->mmap = vma->vm_next;
776
777
if (vma->vm_next)
778
vma->vm_next->vm_prev = vma->vm_prev;
779
780
vma->vm_mm = NULL;
781
}
782
783
/*
784
* destroy a VMA record
785
*/
786
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
787
{
788
kenter("%p", vma);
789
if (vma->vm_ops && vma->vm_ops->close)
790
vma->vm_ops->close(vma);
791
if (vma->vm_file) {
792
fput(vma->vm_file);
793
if (vma->vm_flags & VM_EXECUTABLE)
794
removed_exe_file_vma(mm);
795
}
796
put_nommu_region(vma->vm_region);
797
kmem_cache_free(vm_area_cachep, vma);
798
}
799
800
/*
801
* look up the first VMA in which addr resides, NULL if none
802
* - should be called with mm->mmap_sem at least held readlocked
803
*/
804
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
805
{
806
struct vm_area_struct *vma;
807
808
/* check the cache first */
809
vma = mm->mmap_cache;
810
if (vma && vma->vm_start <= addr && vma->vm_end > addr)
811
return vma;
812
813
/* trawl the list (there may be multiple mappings in which addr
814
* resides) */
815
for (vma = mm->mmap; vma; vma = vma->vm_next) {
816
if (vma->vm_start > addr)
817
return NULL;
818
if (vma->vm_end > addr) {
819
mm->mmap_cache = vma;
820
return vma;
821
}
822
}
823
824
return NULL;
825
}
826
EXPORT_SYMBOL(find_vma);
827
828
/*
829
* find a VMA
830
* - we don't extend stack VMAs under NOMMU conditions
831
*/
832
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
833
{
834
return find_vma(mm, addr);
835
}
836
837
/*
838
* expand a stack to a given address
839
* - not supported under NOMMU conditions
840
*/
841
int expand_stack(struct vm_area_struct *vma, unsigned long address)
842
{
843
return -ENOMEM;
844
}
845
846
/*
847
* look up the first VMA that exactly matches addr
848
* - should be called with mm->mmap_sem at least held readlocked
849
*/
850
static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
851
unsigned long addr,
852
unsigned long len)
853
{
854
struct vm_area_struct *vma;
855
unsigned long end = addr + len;
856
857
/* check the cache first */
858
vma = mm->mmap_cache;
859
if (vma && vma->vm_start == addr && vma->vm_end == end)
860
return vma;
861
862
/* trawl the list (there may be multiple mappings in which addr
863
* resides) */
864
for (vma = mm->mmap; vma; vma = vma->vm_next) {
865
if (vma->vm_start < addr)
866
continue;
867
if (vma->vm_start > addr)
868
return NULL;
869
if (vma->vm_end == end) {
870
mm->mmap_cache = vma;
871
return vma;
872
}
873
}
874
875
return NULL;
876
}
877
878
/*
879
* determine whether a mapping should be permitted and, if so, what sort of
880
* mapping we're capable of supporting
881
*/
882
static int validate_mmap_request(struct file *file,
883
unsigned long addr,
884
unsigned long len,
885
unsigned long prot,
886
unsigned long flags,
887
unsigned long pgoff,
888
unsigned long *_capabilities)
889
{
890
unsigned long capabilities, rlen;
891
unsigned long reqprot = prot;
892
int ret;
893
894
/* do the simple checks first */
895
if (flags & MAP_FIXED) {
896
printk(KERN_DEBUG
897
"%d: Can't do fixed-address/overlay mmap of RAM\n",
898
current->pid);
899
return -EINVAL;
900
}
901
902
if ((flags & MAP_TYPE) != MAP_PRIVATE &&
903
(flags & MAP_TYPE) != MAP_SHARED)
904
return -EINVAL;
905
906
if (!len)
907
return -EINVAL;
908
909
/* Careful about overflows.. */
910
rlen = PAGE_ALIGN(len);
911
if (!rlen || rlen > TASK_SIZE)
912
return -ENOMEM;
913
914
/* offset overflow? */
915
if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
916
return -EOVERFLOW;
917
918
if (file) {
919
/* validate file mapping requests */
920
struct address_space *mapping;
921
922
/* files must support mmap */
923
if (!file->f_op || !file->f_op->mmap)
924
return -ENODEV;
925
926
/* work out if what we've got could possibly be shared
927
* - we support chardevs that provide their own "memory"
928
* - we support files/blockdevs that are memory backed
929
*/
930
mapping = file->f_mapping;
931
if (!mapping)
932
mapping = file->f_path.dentry->d_inode->i_mapping;
933
934
capabilities = 0;
935
if (mapping && mapping->backing_dev_info)
936
capabilities = mapping->backing_dev_info->capabilities;
937
938
if (!capabilities) {
939
/* no explicit capabilities set, so assume some
940
* defaults */
941
switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
942
case S_IFREG:
943
case S_IFBLK:
944
capabilities = BDI_CAP_MAP_COPY;
945
break;
946
947
case S_IFCHR:
948
capabilities =
949
BDI_CAP_MAP_DIRECT |
950
BDI_CAP_READ_MAP |
951
BDI_CAP_WRITE_MAP;
952
break;
953
954
default:
955
return -EINVAL;
956
}
957
}
958
959
/* eliminate any capabilities that we can't support on this
960
* device */
961
if (!file->f_op->get_unmapped_area)
962
capabilities &= ~BDI_CAP_MAP_DIRECT;
963
if (!file->f_op->read)
964
capabilities &= ~BDI_CAP_MAP_COPY;
965
966
/* The file shall have been opened with read permission. */
967
if (!(file->f_mode & FMODE_READ))
968
return -EACCES;
969
970
if (flags & MAP_SHARED) {
971
/* do checks for writing, appending and locking */
972
if ((prot & PROT_WRITE) &&
973
!(file->f_mode & FMODE_WRITE))
974
return -EACCES;
975
976
if (IS_APPEND(file->f_path.dentry->d_inode) &&
977
(file->f_mode & FMODE_WRITE))
978
return -EACCES;
979
980
if (locks_verify_locked(file->f_path.dentry->d_inode))
981
return -EAGAIN;
982
983
if (!(capabilities & BDI_CAP_MAP_DIRECT))
984
return -ENODEV;
985
986
/* we mustn't privatise shared mappings */
987
capabilities &= ~BDI_CAP_MAP_COPY;
988
}
989
else {
990
/* we're going to read the file into private memory we
991
* allocate */
992
if (!(capabilities & BDI_CAP_MAP_COPY))
993
return -ENODEV;
994
995
/* we don't permit a private writable mapping to be
996
* shared with the backing device */
997
if (prot & PROT_WRITE)
998
capabilities &= ~BDI_CAP_MAP_DIRECT;
999
}
1000
1001
if (capabilities & BDI_CAP_MAP_DIRECT) {
1002
if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
1003
((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
1004
((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
1005
) {
1006
capabilities &= ~BDI_CAP_MAP_DIRECT;
1007
if (flags & MAP_SHARED) {
1008
printk(KERN_WARNING
1009
"MAP_SHARED not completely supported on !MMU\n");
1010
return -EINVAL;
1011
}
1012
}
1013
}
1014
1015
/* handle executable mappings and implied executable
1016
* mappings */
1017
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1018
if (prot & PROT_EXEC)
1019
return -EPERM;
1020
}
1021
else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
1022
/* handle implication of PROT_EXEC by PROT_READ */
1023
if (current->personality & READ_IMPLIES_EXEC) {
1024
if (capabilities & BDI_CAP_EXEC_MAP)
1025
prot |= PROT_EXEC;
1026
}
1027
}
1028
else if ((prot & PROT_READ) &&
1029
(prot & PROT_EXEC) &&
1030
!(capabilities & BDI_CAP_EXEC_MAP)
1031
) {
1032
/* backing file is not executable, try to copy */
1033
capabilities &= ~BDI_CAP_MAP_DIRECT;
1034
}
1035
}
1036
else {
1037
/* anonymous mappings are always memory backed and can be
1038
* privately mapped
1039
*/
1040
capabilities = BDI_CAP_MAP_COPY;
1041
1042
/* handle PROT_EXEC implication by PROT_READ */
1043
if ((prot & PROT_READ) &&
1044
(current->personality & READ_IMPLIES_EXEC))
1045
prot |= PROT_EXEC;
1046
}
1047
1048
/* allow the security API to have its say */
1049
ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1050
if (ret < 0)
1051
return ret;
1052
1053
/* looks okay */
1054
*_capabilities = capabilities;
1055
return 0;
1056
}
1057
1058
/*
1059
* we've determined that we can make the mapping, now translate what we
1060
* now know into VMA flags
1061
*/
1062
static unsigned long determine_vm_flags(struct file *file,
1063
unsigned long prot,
1064
unsigned long flags,
1065
unsigned long capabilities)
1066
{
1067
unsigned long vm_flags;
1068
1069
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
1070
/* vm_flags |= mm->def_flags; */
1071
1072
if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
1073
/* attempt to share read-only copies of mapped file chunks */
1074
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1075
if (file && !(prot & PROT_WRITE))
1076
vm_flags |= VM_MAYSHARE;
1077
} else {
1078
/* overlay a shareable mapping on the backing device or inode
1079
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
1080
* romfs/cramfs */
1081
vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
1082
if (flags & MAP_SHARED)
1083
vm_flags |= VM_SHARED;
1084
}
1085
1086
/* refuse to let anyone share private mappings with this process if
1087
* it's being traced - otherwise breakpoints set in it may interfere
1088
* with another untraced process
1089
*/
1090
if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
1091
vm_flags &= ~VM_MAYSHARE;
1092
1093
return vm_flags;
1094
}
1095
1096
/*
1097
* set up a shared mapping on a file (the driver or filesystem provides and
1098
* pins the storage)
1099
*/
1100
static int do_mmap_shared_file(struct vm_area_struct *vma)
1101
{
1102
int ret;
1103
1104
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1105
if (ret == 0) {
1106
vma->vm_region->vm_top = vma->vm_region->vm_end;
1107
return 0;
1108
}
1109
if (ret != -ENOSYS)
1110
return ret;
1111
1112
/* getting -ENOSYS indicates that direct mmap isn't possible (as
1113
* opposed to tried but failed) so we can only give a suitable error as
1114
* it's not possible to make a private copy if MAP_SHARED was given */
1115
return -ENODEV;
1116
}
1117
1118
/*
1119
* set up a private mapping or an anonymous shared mapping
1120
*/
1121
static int do_mmap_private(struct vm_area_struct *vma,
1122
struct vm_region *region,
1123
unsigned long len,
1124
unsigned long capabilities)
1125
{
1126
struct page *pages;
1127
unsigned long total, point, n;
1128
void *base;
1129
int ret, order;
1130
1131
/* invoke the file's mapping function so that it can keep track of
1132
* shared mappings on devices or memory
1133
* - VM_MAYSHARE will be set if it may attempt to share
1134
*/
1135
if (capabilities & BDI_CAP_MAP_DIRECT) {
1136
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1137
if (ret == 0) {
1138
/* shouldn't return success if we're not sharing */
1139
BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1140
vma->vm_region->vm_top = vma->vm_region->vm_end;
1141
return 0;
1142
}
1143
if (ret != -ENOSYS)
1144
return ret;
1145
1146
/* getting an ENOSYS error indicates that direct mmap isn't
1147
* possible (as opposed to tried but failed) so we'll try to
1148
* make a private copy of the data and map that instead */
1149
}
1150
1151
1152
/* allocate some memory to hold the mapping
1153
* - note that this may not return a page-aligned address if the object
1154
* we're allocating is smaller than a page
1155
*/
1156
order = get_order(len);
1157
kdebug("alloc order %d for %lx", order, len);
1158
1159
pages = alloc_pages(GFP_KERNEL, order);
1160
if (!pages)
1161
goto enomem;
1162
1163
total = 1 << order;
1164
atomic_long_add(total, &mmap_pages_allocated);
1165
1166
point = len >> PAGE_SHIFT;
1167
1168
/* we allocated a power-of-2 sized page set, so we may want to trim off
1169
* the excess */
1170
if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1171
while (total > point) {
1172
order = ilog2(total - point);
1173
n = 1 << order;
1174
kdebug("shave %lu/%lu @%lu", n, total - point, total);
1175
atomic_long_sub(n, &mmap_pages_allocated);
1176
total -= n;
1177
set_page_refcounted(pages + total);
1178
__free_pages(pages + total, order);
1179
}
1180
}
1181
1182
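	/* give each page after the head its own reference so that
	 * free_page_series() can release them individually */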
for (point = 1; point < total; point++)
1183
set_page_refcounted(&pages[point]);
1184
1185
base = page_address(pages);
1186
region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1187
region->vm_start = (unsigned long) base;
1188
region->vm_end = region->vm_start + len;
1189
region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1190
1191
vma->vm_start = region->vm_start;
1192
vma->vm_end = region->vm_start + len;
1193
1194
if (vma->vm_file) {
1195
/* read the contents of a file into the copy */
1196
mm_segment_t old_fs;
1197
loff_t fpos;
1198
1199
fpos = vma->vm_pgoff;
1200
fpos <<= PAGE_SHIFT;
1201
1202
old_fs = get_fs();
1203
set_fs(KERNEL_DS);
1204
ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
1205
set_fs(old_fs);
1206
1207
if (ret < 0)
1208
goto error_free;
1209
1210
/* clear the last little bit */
1211
if (ret < len)
1212
memset(base + ret, 0, len - ret);
1213
1214
}
1215
1216
return 0;
1217
1218
error_free:
1219
free_page_series(region->vm_start, region->vm_top);
1220
region->vm_start = vma->vm_start = 0;
1221
region->vm_end = vma->vm_end = 0;
1222
region->vm_top = 0;
1223
return ret;
1224
1225
enomem:
1226
printk("Allocation of length %lu from process %d (%s) failed\n",
1227
len, current->pid, current->comm);
1228
show_free_areas(0);
1229
return -ENOMEM;
1230
}
1231
1232
/*
1233
* handle mapping creation for uClinux
1234
*/
1235
unsigned long do_mmap_pgoff(struct file *file,
1236
unsigned long addr,
1237
unsigned long len,
1238
unsigned long prot,
1239
unsigned long flags,
1240
unsigned long pgoff)
1241
{
1242
struct vm_area_struct *vma;
1243
struct vm_region *region;
1244
struct rb_node *rb;
1245
unsigned long capabilities, vm_flags, result;
1246
int ret;
1247
1248
kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1249
1250
/* decide whether we should attempt the mapping, and if so what sort of
1251
* mapping */
1252
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1253
&capabilities);
1254
if (ret < 0) {
1255
kleave(" = %d [val]", ret);
1256
return ret;
1257
}
1258
1259
/* we ignore the address hint */
1260
addr = 0;
1261
len = PAGE_ALIGN(len);
1262
1263
/* we've determined that we can make the mapping, now translate what we
1264
* now know into VMA flags */
1265
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
1266
1267
/* we're going to need to record the mapping */
1268
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
1269
if (!region)
1270
goto error_getting_region;
1271
1272
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1273
if (!vma)
1274
goto error_getting_vma;
1275
1276
region->vm_usage = 1;
1277
region->vm_flags = vm_flags;
1278
region->vm_pgoff = pgoff;
1279
1280
INIT_LIST_HEAD(&vma->anon_vma_chain);
1281
vma->vm_flags = vm_flags;
1282
vma->vm_pgoff = pgoff;
1283
1284
if (file) {
1285
region->vm_file = file;
1286
get_file(file);
1287
vma->vm_file = file;
1288
get_file(file);
1289
if (vm_flags & VM_EXECUTABLE) {
1290
added_exe_file_vma(current->mm);
1291
vma->vm_mm = current->mm;
1292
}
1293
}
1294
1295
down_write(&nommu_region_sem);
1296
1297
/* if we want to share, we need to check for regions created by other
1298
* mmap() calls that overlap with our proposed mapping
1299
* - we can only share with a superset match on most regular files
1300
* - shared mappings on character devices and memory backed files are
1301
* permitted to overlap inexactly as far as we are concerned, for in
* these cases sharing is handled in the driver or filesystem rather
1303
* than here
1304
*/
1305
if (vm_flags & VM_MAYSHARE) {
1306
struct vm_region *pregion;
1307
unsigned long pglen, rpglen, pgend, rpgend, start;
1308
1309
pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1310
pgend = pgoff + pglen;
1311
1312
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
1313
pregion = rb_entry(rb, struct vm_region, vm_rb);
1314
1315
if (!(pregion->vm_flags & VM_MAYSHARE))
1316
continue;
1317
1318
/* search for overlapping mappings on the same file */
1319
if (pregion->vm_file->f_path.dentry->d_inode !=
1320
file->f_path.dentry->d_inode)
1321
continue;
1322
1323
if (pregion->vm_pgoff >= pgend)
1324
continue;
1325
1326
rpglen = pregion->vm_end - pregion->vm_start;
1327
rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1328
rpgend = pregion->vm_pgoff + rpglen;
1329
if (pgoff >= rpgend)
1330
continue;
1331
1332
/* handle inexactly overlapping matches between
1333
* mappings */
1334
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1335
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1336
/* new mapping is not a subset of the region */
1337
if (!(capabilities & BDI_CAP_MAP_DIRECT))
1338
goto sharing_violation;
1339
continue;
1340
}
1341
1342
/* we've found a region we can share */
1343
pregion->vm_usage++;
1344
vma->vm_region = pregion;
1345
start = pregion->vm_start;
1346
start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
1347
vma->vm_start = start;
1348
vma->vm_end = start + len;
1349
1350
if (pregion->vm_flags & VM_MAPPED_COPY) {
1351
kdebug("share copy");
1352
vma->vm_flags |= VM_MAPPED_COPY;
1353
} else {
1354
kdebug("share mmap");
1355
ret = do_mmap_shared_file(vma);
1356
if (ret < 0) {
1357
vma->vm_region = NULL;
1358
vma->vm_start = 0;
1359
vma->vm_end = 0;
1360
pregion->vm_usage--;
1361
pregion = NULL;
1362
goto error_just_free;
1363
}
1364
}
1365
fput(region->vm_file);
1366
kmem_cache_free(vm_region_jar, region);
1367
region = pregion;
1368
result = start;
1369
goto share;
1370
}
1371
1372
/* obtain the address at which to make a shared mapping
1373
* - this is the hook for quasi-memory character devices to
1374
* tell us the location of a shared mapping
1375
*/
1376
if (capabilities & BDI_CAP_MAP_DIRECT) {
1377
addr = file->f_op->get_unmapped_area(file, addr, len,
1378
pgoff, flags);
1379
if (IS_ERR_VALUE(addr)) {
1380
ret = addr;
1381
if (ret != -ENOSYS)
1382
goto error_just_free;
1383
1384
/* the driver refused to tell us where to site
1385
* the mapping so we'll have to attempt to copy
1386
* it */
1387
ret = -ENODEV;
1388
if (!(capabilities & BDI_CAP_MAP_COPY))
1389
goto error_just_free;
1390
1391
capabilities &= ~BDI_CAP_MAP_DIRECT;
1392
} else {
1393
vma->vm_start = region->vm_start = addr;
1394
vma->vm_end = region->vm_end = addr + len;
1395
}
1396
}
1397
}
1398
1399
vma->vm_region = region;
1400
1401
/* set up the mapping
1402
* - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1403
*/
1404
if (file && vma->vm_flags & VM_SHARED)
1405
ret = do_mmap_shared_file(vma);
1406
else
1407
ret = do_mmap_private(vma, region, len, capabilities);
1408
if (ret < 0)
1409
goto error_just_free;
1410
add_nommu_region(region);
1411
1412
/* clear anonymous mappings that don't ask for uninitialized data */
1413
if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1414
memset((void *)region->vm_start, 0,
1415
region->vm_end - region->vm_start);
1416
1417
/* okay... we have a mapping; now we have to register it */
1418
result = vma->vm_start;
1419
1420
current->mm->total_vm += len >> PAGE_SHIFT;
1421
1422
share:
1423
add_vma_to_mm(current->mm, vma);
1424
1425
/* we flush the region from the icache only when the first executable
1426
* mapping of it is made */
1427
if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1428
flush_icache_range(region->vm_start, region->vm_end);
1429
region->vm_icache_flushed = true;
1430
}
1431
1432
up_write(&nommu_region_sem);
1433
1434
kleave(" = %lx", result);
1435
return result;
1436
1437
error_just_free:
1438
up_write(&nommu_region_sem);
1439
error:
1440
if (region->vm_file)
1441
fput(region->vm_file);
1442
kmem_cache_free(vm_region_jar, region);
1443
if (vma->vm_file)
1444
fput(vma->vm_file);
1445
if (vma->vm_flags & VM_EXECUTABLE)
1446
removed_exe_file_vma(vma->vm_mm);
1447
kmem_cache_free(vm_area_cachep, vma);
1448
kleave(" = %d", ret);
1449
return ret;
1450
1451
sharing_violation:
1452
up_write(&nommu_region_sem);
1453
printk(KERN_WARNING "Attempt to share mismatched mappings\n");
1454
ret = -EINVAL;
1455
goto error;
1456
1457
error_getting_vma:
1458
kmem_cache_free(vm_region_jar, region);
1459
printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1460
" from process %d failed\n",
1461
len, current->pid);
1462
show_free_areas(0);
1463
return -ENOMEM;
1464
1465
error_getting_region:
1466
printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1467
" from process %d failed\n",
1468
len, current->pid);
1469
show_free_areas(0);
1470
return -ENOMEM;
1471
}
1472
EXPORT_SYMBOL(do_mmap_pgoff);
1473
1474
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1475
unsigned long, prot, unsigned long, flags,
1476
unsigned long, fd, unsigned long, pgoff)
1477
{
1478
struct file *file = NULL;
1479
unsigned long retval = -EBADF;
1480
1481
audit_mmap_fd(fd, flags);
1482
if (!(flags & MAP_ANONYMOUS)) {
1483
file = fget(fd);
1484
if (!file)
1485
goto out;
1486
}
1487
1488
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1489
1490
down_write(&current->mm->mmap_sem);
1491
retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1492
up_write(&current->mm->mmap_sem);
1493
1494
if (file)
1495
fput(file);
1496
out:
1497
return retval;
1498
}
1499
1500
#ifdef __ARCH_WANT_SYS_OLD_MMAP
1501
struct mmap_arg_struct {
1502
unsigned long addr;
1503
unsigned long len;
1504
unsigned long prot;
1505
unsigned long flags;
1506
unsigned long fd;
1507
unsigned long offset;
1508
};
1509
1510
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1511
{
1512
struct mmap_arg_struct a;
1513
1514
if (copy_from_user(&a, arg, sizeof(a)))
1515
return -EFAULT;
1516
if (a.offset & ~PAGE_MASK)
1517
return -EINVAL;
1518
1519
return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1520
a.offset >> PAGE_SHIFT);
1521
}
1522
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1523
1524
/*
1525
* split a VMA into two pieces at address 'addr'; a new VMA is allocated
* for either the first part or the tail.
1527
*/
1528
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1529
unsigned long addr, int new_below)
1530
{
1531
struct vm_area_struct *new;
1532
struct vm_region *region;
1533
unsigned long npages;
1534
1535
kenter("");
1536
1537
/* we're only permitted to split anonymous regions (these should have
1538
* only a single usage on the region) */
1539
if (vma->vm_file)
1540
return -ENOMEM;
1541
1542
if (mm->map_count >= sysctl_max_map_count)
1543
return -ENOMEM;
1544
1545
region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1546
if (!region)
1547
return -ENOMEM;
1548
1549
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1550
if (!new) {
1551
kmem_cache_free(vm_region_jar, region);
1552
return -ENOMEM;
1553
}
1554
1555
/* most fields are the same, copy all, and then fixup */
1556
*new = *vma;
1557
*region = *vma->vm_region;
1558
new->vm_region = region;
1559
1560
npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1561
1562
if (new_below) {
1563
region->vm_top = region->vm_end = new->vm_end = addr;
1564
} else {
1565
region->vm_start = new->vm_start = addr;
1566
region->vm_pgoff = new->vm_pgoff += npages;
1567
}
1568
1569
if (new->vm_ops && new->vm_ops->open)
1570
new->vm_ops->open(new);
1571
1572
delete_vma_from_mm(vma);
1573
down_write(&nommu_region_sem);
1574
delete_nommu_region(vma->vm_region);
1575
if (new_below) {
1576
vma->vm_region->vm_start = vma->vm_start = addr;
1577
vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1578
} else {
1579
vma->vm_region->vm_end = vma->vm_end = addr;
1580
vma->vm_region->vm_top = addr;
1581
}
1582
add_nommu_region(vma->vm_region);
1583
add_nommu_region(new->vm_region);
1584
up_write(&nommu_region_sem);
1585
add_vma_to_mm(mm, vma);
1586
add_vma_to_mm(mm, new);
1587
return 0;
1588
}
1589
1590
/*
1591
* shrink a VMA by removing the specified chunk from either the beginning or
1592
* the end
1593
*/
1594
static int shrink_vma(struct mm_struct *mm,
1595
struct vm_area_struct *vma,
1596
unsigned long from, unsigned long to)
1597
{
1598
struct vm_region *region;
1599
1600
kenter("");
1601
1602
/* adjust the VMA's pointers, which may reposition it in the MM's tree
1603
* and list */
1604
delete_vma_from_mm(vma);
1605
if (from > vma->vm_start)
1606
vma->vm_end = from;
1607
else
1608
vma->vm_start = to;
1609
add_vma_to_mm(mm, vma);
1610
1611
/* cut the backing region down to size */
1612
region = vma->vm_region;
1613
BUG_ON(region->vm_usage != 1);
1614
1615
down_write(&nommu_region_sem);
1616
delete_nommu_region(region);
1617
if (from > region->vm_start) {
1618
to = region->vm_top;
1619
region->vm_top = region->vm_end = from;
1620
} else {
1621
region->vm_start = to;
1622
}
1623
add_nommu_region(region);
1624
up_write(&nommu_region_sem);
1625
1626
free_page_series(from, to);
1627
return 0;
1628
}
1629
1630
/*
1631
* release a mapping
1632
* - under NOMMU conditions the chunk to be unmapped must be backed by a single
1633
* VMA, though it need not cover the whole VMA
1634
*/
1635
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1636
{
1637
struct vm_area_struct *vma;
1638
unsigned long end;
1639
int ret;
1640
1641
kenter(",%lx,%zx", start, len);
1642
1643
len = PAGE_ALIGN(len);
1644
if (len == 0)
1645
return -EINVAL;
1646
1647
end = start + len;
1648
1649
/* find the first potentially overlapping VMA */
1650
vma = find_vma(mm, start);
1651
if (!vma) {
1652
static int limit = 0;
1653
if (limit < 5) {
1654
printk(KERN_WARNING
1655
"munmap of memory not mmapped by process %d"
1656
" (%s): 0x%lx-0x%lx\n",
1657
current->pid, current->comm,
1658
start, start + len - 1);
1659
limit++;
1660
}
1661
return -EINVAL;
1662
}
1663
1664
/* we're allowed to split an anonymous VMA but not a file-backed one */
1665
if (vma->vm_file) {
1666
do {
1667
if (start > vma->vm_start) {
1668
kleave(" = -EINVAL [miss]");
1669
return -EINVAL;
1670
}
1671
if (end == vma->vm_end)
1672
goto erase_whole_vma;
1673
vma = vma->vm_next;
1674
} while (vma);
1675
kleave(" = -EINVAL [split file]");
1676
return -EINVAL;
1677
} else {
1678
/* the chunk must be a subset of the VMA found */
1679
if (start == vma->vm_start && end == vma->vm_end)
1680
goto erase_whole_vma;
1681
if (start < vma->vm_start || end > vma->vm_end) {
1682
kleave(" = -EINVAL [superset]");
1683
return -EINVAL;
1684
}
1685
if (start & ~PAGE_MASK) {
1686
kleave(" = -EINVAL [unaligned start]");
1687
return -EINVAL;
1688
}
1689
if (end != vma->vm_end && end & ~PAGE_MASK) {
1690
kleave(" = -EINVAL [unaligned split]");
1691
return -EINVAL;
1692
}
1693
if (start != vma->vm_start && end != vma->vm_end) {
1694
ret = split_vma(mm, vma, start, 1);
1695
if (ret < 0) {
1696
kleave(" = %d [split]", ret);
1697
return ret;
1698
}
1699
}
1700
return shrink_vma(mm, vma, start, end);
1701
}
1702
1703
erase_whole_vma:
1704
delete_vma_from_mm(vma);
1705
delete_vma(mm, vma);
1706
kleave(" = 0");
1707
return 0;
1708
}
1709
EXPORT_SYMBOL(do_munmap);
1710
1711
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1712
{
1713
int ret;
1714
struct mm_struct *mm = current->mm;
1715
1716
down_write(&mm->mmap_sem);
1717
ret = do_munmap(mm, addr, len);
1718
up_write(&mm->mmap_sem);
1719
return ret;
1720
}
1721
1722
/*
1723
* release all the mappings made in a process's VM space
1724
*/
1725
void exit_mmap(struct mm_struct *mm)
1726
{
1727
struct vm_area_struct *vma;
1728
1729
if (!mm)
1730
return;
1731
1732
kenter("");
1733
1734
mm->total_vm = 0;
1735
1736
while ((vma = mm->mmap)) {
1737
mm->mmap = vma->vm_next;
1738
delete_vma_from_mm(vma);
1739
delete_vma(mm, vma);
1740
cond_resched();
1741
}
1742
1743
kleave("");
1744
}
1745
1746
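/*
 * the brk segment cannot be grown dynamically on NOMMU; sys_brk() above
 * only moves the break within the window reserved up to
 * mm->context.end_brk, so do_brk() always fails
 */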
unsigned long do_brk(unsigned long addr, unsigned long len)
1747
{
1748
return -ENOMEM;
1749
}
1750
1751
/*
1752
* expand (or shrink) an existing mapping, potentially moving it at the same
1753
* time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1754
*
1755
* under NOMMU conditions, we only permit changing a mapping's size, and only
1756
* as long as it stays within the region allocated by do_mmap_private() and the
1757
* block is not shareable
1758
*
1759
* MREMAP_FIXED is not supported under NOMMU conditions
1760
*/
1761
unsigned long do_mremap(unsigned long addr,
1762
unsigned long old_len, unsigned long new_len,
1763
unsigned long flags, unsigned long new_addr)
1764
{
1765
struct vm_area_struct *vma;
1766
1767
/* insanity checks first */
1768
old_len = PAGE_ALIGN(old_len);
1769
new_len = PAGE_ALIGN(new_len);
1770
if (old_len == 0 || new_len == 0)
1771
return (unsigned long) -EINVAL;
1772
1773
if (addr & ~PAGE_MASK)
1774
return -EINVAL;
1775
1776
if (flags & MREMAP_FIXED && new_addr != addr)
1777
return (unsigned long) -EINVAL;
1778
1779
vma = find_vma_exact(current->mm, addr, old_len);
1780
if (!vma)
1781
return (unsigned long) -EINVAL;
1782
1783
if (vma->vm_end != vma->vm_start + old_len)
1784
return (unsigned long) -EFAULT;
1785
1786
if (vma->vm_flags & VM_MAYSHARE)
1787
return (unsigned long) -EPERM;
1788
1789
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1790
return (unsigned long) -ENOMEM;
1791
1792
/* all checks complete - do it */
1793
vma->vm_end = vma->vm_start + new_len;
1794
return vma->vm_start;
1795
}
1796
EXPORT_SYMBOL(do_mremap);
1797
1798
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1799
unsigned long, new_len, unsigned long, flags,
1800
unsigned long, new_addr)
1801
{
1802
unsigned long ret;
1803
1804
down_write(&current->mm->mmap_sem);
1805
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1806
up_write(&current->mm->mmap_sem);
1807
return ret;
1808
}
1809
1810
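/*
 * there are no page tables to walk on NOMMU, so there is nothing for
 * follow_page() to look up
 */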
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1811
unsigned int foll_flags)
1812
{
1813
return NULL;
1814
}
1815
1816
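/*
 * without an MMU a PFN range can only be "mapped" at its identity
 * address, so just validate that and mark the VMA accordingly
 */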
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1817
unsigned long pfn, unsigned long size, pgprot_t prot)
1818
{
1819
if (addr != (pfn << PAGE_SHIFT))
1820
return -EINVAL;
1821
1822
vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1823
return 0;
1824
}
1825
EXPORT_SYMBOL(remap_pfn_range);
1826
1827
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1828
unsigned long pgoff)
1829
{
1830
unsigned int size = vma->vm_end - vma->vm_start;
1831
1832
if (!(vma->vm_flags & VM_USERMAP))
1833
return -EINVAL;
1834
1835
vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1836
vma->vm_end = vma->vm_start + size;
1837
1838
return 0;
1839
}
1840
EXPORT_SYMBOL(remap_vmalloc_range);
1841
1842
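/*
 * address space layout hooks have nothing to do on NOMMU; mappings are
 * placed wherever the backing store dictates (see do_mmap_pgoff())
 */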
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1843
unsigned long len, unsigned long pgoff, unsigned long flags)
1844
{
1845
return -ENOMEM;
1846
}
1847
1848
void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1849
{
1850
}
1851
1852
void unmap_mapping_range(struct address_space *mapping,
1853
loff_t const holebegin, loff_t const holelen,
1854
int even_cows)
1855
{
1856
}
1857
EXPORT_SYMBOL(unmap_mapping_range);
1858
1859
/*
1860
* Check that a process has enough memory to allocate a new virtual
1861
* mapping. 0 means there is enough memory for the allocation to
1862
* succeed and -ENOMEM implies there is not.
1863
*
1864
* We currently support three overcommit policies, which are set via the
1865
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
1866
*
1867
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
1868
* Additional code 2002 Jul 20 by Robert Love.
1869
*
1870
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
1871
*
1872
* Note this is a helper function intended to be used by LSMs which
1873
* wish to use this logic.
1874
*/
1875
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1876
{
1877
unsigned long free, allowed;
1878
1879
vm_acct_memory(pages);
1880
1881
/*
1882
* Sometimes we want to use more memory than we have
1883
*/
1884
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
1885
return 0;
1886
1887
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1888
unsigned long n;
1889
1890
free = global_page_state(NR_FILE_PAGES);
1891
free += nr_swap_pages;
1892
1893
/*
1894
* Any slabs which are created with the
1895
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
1896
* which are reclaimable, under pressure. The dentry
1897
* cache and most inode caches should fall into this
1898
*/
1899
free += global_page_state(NR_SLAB_RECLAIMABLE);
1900
1901
/*
1902
* Leave the last 3% for root
1903
*/
1904
if (!cap_sys_admin)
1905
free -= free / 32;
1906
1907
if (free > pages)
1908
return 0;
1909
1910
/*
1911
* nr_free_pages() is very expensive on large systems,
1912
* only call if we're about to fail.
1913
*/
1914
n = nr_free_pages();
1915
1916
/*
1917
* Leave reserved pages. The pages are not for anonymous pages.
1918
*/
1919
if (n <= totalreserve_pages)
1920
goto error;
1921
else
1922
n -= totalreserve_pages;
1923
1924
/*
1925
* Leave the last 3% for root
1926
*/
1927
if (!cap_sys_admin)
1928
n -= n / 32;
1929
free += n;
1930
1931
if (free > pages)
1932
return 0;
1933
1934
goto error;
1935
}
1936
1937
allowed = totalram_pages * sysctl_overcommit_ratio / 100;
1938
/*
1939
* Leave the last 3% for root
1940
*/
1941
if (!cap_sys_admin)
1942
allowed -= allowed / 32;
1943
allowed += total_swap_pages;
1944
1945
/* Don't let a single process grow too big:
1946
leave 3% of the size of this process for other processes */
1947
if (mm)
1948
allowed -= mm->total_vm / 32;
1949
1950
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1951
return 0;
1952
1953
error:
1954
vm_unacct_memory(pages);
1955
1956
return -ENOMEM;
1957
}
1958
1959
int in_gate_area_no_mm(unsigned long addr)
1960
{
1961
return 0;
1962
}
1963
1964
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1965
{
1966
BUG();
1967
return 0;
1968
}
1969
EXPORT_SYMBOL(filemap_fault);
1970
1971
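/*
 * copy data to or from another process's address space
 * - the access is clamped to the VMA containing addr and is only
 *   performed if the VMA permits it (VM_MAYREAD/VM_MAYWRITE)
 */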
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1972
unsigned long addr, void *buf, int len, int write)
1973
{
1974
struct vm_area_struct *vma;
1975
1976
down_read(&mm->mmap_sem);
1977
1978
/* the access must start within one of the target process's mappings */
1979
vma = find_vma(mm, addr);
1980
if (vma) {
1981
/* don't overrun this mapping */
1982
if (addr + len >= vma->vm_end)
1983
len = vma->vm_end - addr;
1984
1985
/* only read or write mappings where it is permitted */
1986
if (write && vma->vm_flags & VM_MAYWRITE)
1987
copy_to_user_page(vma, NULL, addr,
1988
(void *) addr, buf, len);
1989
else if (!write && vma->vm_flags & VM_MAYREAD)
1990
copy_from_user_page(vma, NULL, addr,
1991
buf, (void *) addr, len);
1992
else
1993
len = 0;
1994
} else {
1995
len = 0;
1996
}
1997
1998
up_read(&mm->mmap_sem);
1999
2000
return len;
2001
}
2002
2003
/**
2004
* access_remote_vm - access another process' address space
2005
* @mm: the mm_struct of the target address space
2006
* @addr: start address to access
2007
* @buf: source or destination buffer
2008
* @len: number of bytes to transfer
2009
* @write: whether the access is a write
2010
*
2011
* The caller must hold a reference on @mm.
2012
*/
2013
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2014
void *buf, int len, int write)
2015
{
2016
return __access_remote_vm(NULL, mm, addr, buf, len, write);
2017
}
2018
2019
/*
2020
* Access another process' address space.
2021
* - source/target buffer must be kernel space
2022
*/
2023
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2024
{
2025
struct mm_struct *mm;
2026
2027
if (addr + len < addr)
2028
return 0;
2029
2030
mm = get_task_mm(tsk);
2031
if (!mm)
2032
return 0;
2033
2034
len = __access_remote_vm(tsk, mm, addr, buf, len, write);
2035
2036
mmput(mm);
2037
return len;
2038
}
2039
2040
/**
2041
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
2042
* @inode: The inode to check
2043
* @size: The current filesize of the inode
2044
* @newsize: The proposed filesize of the inode
2045
*
2046
* Check the shared mappings on an inode on behalf of a shrinking truncate to
2047
* make sure that any outstanding VMAs aren't broken and then shrink the
* vm_regions that extend beyond the new size so that do_mmap_pgoff() doesn't
2049
* automatically grant mappings that are too large.
2050
*/
2051
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2052
size_t newsize)
2053
{
2054
struct vm_area_struct *vma;
2055
struct prio_tree_iter iter;
2056
struct vm_region *region;
2057
pgoff_t low, high;
2058
size_t r_size, r_top;
2059
2060
low = newsize >> PAGE_SHIFT;
2061
high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
2062
2063
down_write(&nommu_region_sem);
2064
2065
/* search for VMAs that fall within the dead zone */
2066
vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
2067
low, high) {
2068
/* found one - only interested if it's shared out of the page
2069
* cache */
2070
if (vma->vm_flags & VM_SHARED) {
2071
up_write(&nommu_region_sem);
2072
return -ETXTBSY; /* not quite true, but near enough */
2073
}
2074
}
2075
2076
/* reduce any regions that overlap the dead zone - if in existence,
2077
* these will be pointed to by VMAs that don't overlap the dead zone
2078
*
2079
* we don't check for any regions that start beyond the EOF as there
2080
* shouldn't be any
2081
*/
2082
vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
2083
0, ULONG_MAX) {
2084
if (!(vma->vm_flags & VM_SHARED))
2085
continue;
2086
2087
region = vma->vm_region;
2088
r_size = region->vm_top - region->vm_start;
2089
r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
2090
2091
if (r_top > newsize) {
2092
region->vm_top -= r_top - newsize;
2093
if (region->vm_end > region->vm_top)
2094
region->vm_end = region->vm_top;
2095
}
2096
}
2097
2098
up_write(&nommu_region_sem);
2099
return 0;
2100
}
2101
2102