GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/madvise.c
/*
 * linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct * mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma));
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto out;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto out;
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
                             struct vm_area_struct ** prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

        if (!file)
                return -EBADF;

        if (file->f_mapping->a_ops->get_xip_mem) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}

/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
                             struct vm_area_struct ** prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
                struct zap_details details = {
                        .nonlinear_vma = vma,
                        .last_index = ULONG_MAX,
                };
                zap_page_range(vma, start, end - start, &details);
        } else
                zap_page_range(vma, start, end - start, NULL);
        return 0;
}

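/*
 * Illustrative caller pattern for the behaviour described above (a sketch,
 * not part of this source file; commit_to_backing_store() is a hypothetical
 * application helper):
 *
 *	// buf points into a MAP_PRIVATE | MAP_ANONYMOUS mapping of length len
 *	commit_to_backing_store(fd, buf, len);	// application's own commit step
 *	fsync(fd);				// committed data is durable
 *	madvise(buf, len, MADV_DONTNEED);	// dirty anon pages are simply
 *						// discarded, never written to swap
 */
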
/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        struct address_space *mapping;
        loff_t offset, endoff;
        int error;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                return -EINVAL;

        if (!vma->vm_file || !vma->vm_file->f_mapping
                || !vma->vm_file->f_mapping->host) {
                        return -EINVAL;
        }

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        mapping = vma->vm_file->f_mapping;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        endoff = (loff_t)(end - vma->vm_start - 1)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
        up_read(&current->mm->mmap_sem);
        error = vmtruncate_range(mapping->host, offset, endoff);
        down_read(&current->mm->mmap_sem);
        return error;
}

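/*
 * Illustrative use of MADV_REMOVE (a sketch, not part of this source file;
 * the file path and the hole_off/hole_len variables are hypothetical): the
 * mapping must be MAP_SHARED with PROT_WRITE on shmfs/tmpfs, and offsets
 * should be page aligned:
 *
 *	int fd = open("/dev/shm/scratch", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, len);
 *	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	madvise(map + hole_off, hole_len, MADV_REMOVE);	// frees the pages and
 *							// their backing store
 */
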
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
        int ret = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        for (; start < end; start += PAGE_SIZE) {
                struct page *p;
                int ret = get_user_pages_fast(start, 1, 0, &p);
                if (ret != 1)
                        return ret;
                if (bhv == MADV_SOFT_OFFLINE) {
                        printk(KERN_INFO "Soft offlining page %lx at %lx\n",
                               page_to_pfn(p), start);
                        ret = soft_offline_page(p, MF_COUNT_INCREASED);
                        if (ret)
                                break;
                        continue;
                }
                printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
                       page_to_pfn(p), start);
                /* Ignore return value for now */
                __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
        }
        return ret;
}
#endif

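/*
 * Illustrative use of the error injection path above (a sketch, not part of
 * this source file; requires CAP_SYS_ADMIN and a kernel built with
 * CONFIG_MEMORY_FAILURE):
 *
 *	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	*(volatile char *)page = 1;		// fault the page in first
 *	madvise(page, 4096, MADV_HWPOISON);	// poison it as if hardware failed
 */
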
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static int
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
                return 1;

        default:
                return 0;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area. The idea is to help the kernel
 * use appropriate read-ahead and caching techniques. The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters. This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the appli-
 *              cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
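/*
 * Illustrative userspace usage (a sketch, not part of this source file; the
 * 4096-byte page size and the "data.bin" file name are assumptions made for
 * the example):
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 64 * 4096;
 *		int fd = open("data.bin", O_RDONLY);
 *		void *map;
 *
 *		if (fd < 0)
 *			return 1;
 *		map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *		if (map == MAP_FAILED)
 *			return 1;
 *		if (madvise(map, len, MADV_SEQUENTIAL))	// hint: sequential scan
 *			perror("madvise");
 *		madvise(map, len, MADV_WILLNEED);	// start readahead now
 *		// ... scan the mapping ...
 *		munmap(map, len);
 *		return 0;
 *	}
 */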
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct * vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_hwpoison(behavior, start, start+len_in);
#endif
        if (!madvise_behavior_valid(behavior))
                return error;

        write = madvise_need_mmap_write(behavior);
        if (write)
                down_write(&current->mm->mmap_sem);
        else
                down_read(&current->mm->mmap_sem);

        if (start & ~PAGE_MASK)
                goto out;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                goto out;

        end = start + len;
        if (end < start)
                goto out;

        error = 0;
        if (end == start)
                goto out;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}