Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
awilliam
GitHub Repository: awilliam/linux-vfio
Path: blob/master/mm/huge_memory.c
10814 views
1
/*
2
* Copyright (C) 2009 Red Hat, Inc.
3
*
4
* This work is licensed under the terms of the GNU GPL, version 2. See
5
* the COPYING file in the top-level directory.
6
*/
7
8
#include <linux/mm.h>
9
#include <linux/sched.h>
10
#include <linux/highmem.h>
11
#include <linux/hugetlb.h>
12
#include <linux/mmu_notifier.h>
13
#include <linux/rmap.h>
14
#include <linux/swap.h>
15
#include <linux/mm_inline.h>
16
#include <linux/kthread.h>
17
#include <linux/khugepaged.h>
18
#include <linux/freezer.h>
19
#include <linux/mman.h>
20
#include <asm/tlb.h>
21
#include <asm/pgalloc.h>
22
#include "internal.h"
23
24
/*
25
* By default transparent hugepage support is enabled for all mappings
26
* and khugepaged scans all mappings. Defrag is only invoked by
27
* khugepaged hugepage allocations and by page faults inside
28
* MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29
* allocations.
30
*/
31
unsigned long transparent_hugepage_flags __read_mostly =
32
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33
(1<<TRANSPARENT_HUGEPAGE_FLAG)|
34
#endif
35
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37
#endif
38
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
41
/* default scan 8*512 pte (or vmas) every 30 second */
42
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43
static unsigned int khugepaged_pages_collapsed;
44
static unsigned int khugepaged_full_scans;
45
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46
/* during fragmentation poll the hugepage allocator once every minute */
47
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48
static struct task_struct *khugepaged_thread __read_mostly;
49
static DEFINE_MUTEX(khugepaged_mutex);
50
static DEFINE_SPINLOCK(khugepaged_mm_lock);
51
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52
/*
53
* default collapse hugepages if there is at least one pte mapped like
54
* it would have happened if the vma was large enough during page
55
* fault.
56
*/
57
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59
static int khugepaged(void *none);
60
static int mm_slots_hash_init(void);
61
static int khugepaged_slab_init(void);
62
static void khugepaged_slab_free(void);
63
64
#define MM_SLOTS_HASH_HEADS 1024
65
static struct hlist_head *mm_slots_hash __read_mostly;
66
static struct kmem_cache *mm_slot_cache __read_mostly;
67
68
/**
69
* struct mm_slot - hash lookup from mm to mm_slot
70
* @hash: hash collision list
71
* @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72
* @mm: the mm that this information is valid for
73
*/
74
struct mm_slot {
75
struct hlist_node hash;
76
struct list_head mm_node;
77
struct mm_struct *mm;
78
};
79
80
/**
81
* struct khugepaged_scan - cursor for scanning
82
* @mm_head: the head of the mm list to scan
83
* @mm_slot: the current mm_slot we are scanning
84
* @address: the next address inside that to be scanned
85
*
86
* There is only the one khugepaged_scan instance of this cursor structure.
87
*/
88
struct khugepaged_scan {
89
struct list_head mm_head;
90
struct mm_slot *mm_slot;
91
unsigned long address;
92
} khugepaged_scan = {
93
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94
};
95
96
97
static int set_recommended_min_free_kbytes(void)
98
{
99
struct zone *zone;
100
int nr_zones = 0;
101
unsigned long recommended_min;
102
extern int min_free_kbytes;
103
104
if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105
&transparent_hugepage_flags) &&
106
!test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107
&transparent_hugepage_flags))
108
return 0;
109
110
for_each_populated_zone(zone)
111
nr_zones++;
112
113
/* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114
recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116
/*
117
* Make sure that on average at least two pageblocks are almost free
118
* of another type, one for a migratetype to fall back to and a
119
* second to avoid subsequent fallbacks of other types There are 3
120
* MIGRATE_TYPES we care about.
121
*/
122
recommended_min += pageblock_nr_pages * nr_zones *
123
MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125
/* don't ever allow to reserve more than 5% of the lowmem */
126
recommended_min = min(recommended_min,
127
(unsigned long) nr_free_buffer_pages() / 20);
128
recommended_min <<= (PAGE_SHIFT-10);
129
130
if (recommended_min > min_free_kbytes)
131
min_free_kbytes = recommended_min;
132
setup_per_zone_wmarks();
133
return 0;
134
}
135
late_initcall(set_recommended_min_free_kbytes);
136
137
static int start_khugepaged(void)
138
{
139
int err = 0;
140
if (khugepaged_enabled()) {
141
int wakeup;
142
if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143
err = -ENOMEM;
144
goto out;
145
}
146
mutex_lock(&khugepaged_mutex);
147
if (!khugepaged_thread)
148
khugepaged_thread = kthread_run(khugepaged, NULL,
149
"khugepaged");
150
if (unlikely(IS_ERR(khugepaged_thread))) {
151
printk(KERN_ERR
152
"khugepaged: kthread_run(khugepaged) failed\n");
153
err = PTR_ERR(khugepaged_thread);
154
khugepaged_thread = NULL;
155
}
156
wakeup = !list_empty(&khugepaged_scan.mm_head);
157
mutex_unlock(&khugepaged_mutex);
158
if (wakeup)
159
wake_up_interruptible(&khugepaged_wait);
160
161
set_recommended_min_free_kbytes();
162
} else
163
/* wakeup to exit */
164
wake_up_interruptible(&khugepaged_wait);
165
out:
166
return err;
167
}
168
169
#ifdef CONFIG_SYSFS
170
171
static ssize_t double_flag_show(struct kobject *kobj,
172
struct kobj_attribute *attr, char *buf,
173
enum transparent_hugepage_flag enabled,
174
enum transparent_hugepage_flag req_madv)
175
{
176
if (test_bit(enabled, &transparent_hugepage_flags)) {
177
VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178
return sprintf(buf, "[always] madvise never\n");
179
} else if (test_bit(req_madv, &transparent_hugepage_flags))
180
return sprintf(buf, "always [madvise] never\n");
181
else
182
return sprintf(buf, "always madvise [never]\n");
183
}
184
/*
 * Return non-zero when the first @kwlen bytes of @buf spell @kw.  Input
 * shorter than the keyword never matches, so an empty write or a bare
 * prefix such as "a" is not mistaken for "always".
 */
static inline int thp_keyword_match(const char *kw, size_t kwlen,
				    const char *buf, size_t count)
{
	return count >= kwlen && !memcmp(kw, buf, kwlen);
}

/*
 * Parse "always", "madvise" or "never" from @buf and update the pair of
 * bits (@enabled, @req_madv) in transparent_hugepage_flags accordingly:
 *
 *   always  -> @enabled set,     @req_madv cleared
 *   madvise -> @enabled cleared, @req_madv set
 *   never   -> both cleared
 *
 * Anything following the keyword (typically the trailing newline added by
 * echo) is ignored, as before.  Returns @count on success, -EINVAL when
 * the input does not start with one of the complete keywords.
 */
static ssize_t double_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag req_madv)
{
	if (thp_keyword_match("always", sizeof("always") - 1, buf, count)) {
		set_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else if (thp_keyword_match("madvise", sizeof("madvise") - 1,
				     buf, count)) {
		clear_bit(enabled, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (thp_keyword_match("never", sizeof("never") - 1,
				     buf, count)) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
207
208
static ssize_t enabled_show(struct kobject *kobj,
209
struct kobj_attribute *attr, char *buf)
210
{
211
return double_flag_show(kobj, attr, buf,
212
TRANSPARENT_HUGEPAGE_FLAG,
213
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214
}
215
static ssize_t enabled_store(struct kobject *kobj,
216
struct kobj_attribute *attr,
217
const char *buf, size_t count)
218
{
219
ssize_t ret;
220
221
ret = double_flag_store(kobj, attr, buf, count,
222
TRANSPARENT_HUGEPAGE_FLAG,
223
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225
if (ret > 0) {
226
int err = start_khugepaged();
227
if (err)
228
ret = err;
229
}
230
231
if (ret > 0 &&
232
(test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233
&transparent_hugepage_flags) ||
234
test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235
&transparent_hugepage_flags)))
236
set_recommended_min_free_kbytes();
237
238
return ret;
239
}
240
static struct kobj_attribute enabled_attr =
241
__ATTR(enabled, 0644, enabled_show, enabled_store);
242
243
static ssize_t single_flag_show(struct kobject *kobj,
244
struct kobj_attribute *attr, char *buf,
245
enum transparent_hugepage_flag flag)
246
{
247
return sprintf(buf, "%d\n",
248
!!test_bit(flag, &transparent_hugepage_flags));
249
}
250
251
/*
 * Parse a decimal 0 or 1 from @buf and clear or set @flag in
 * transparent_hugepage_flags.  Returns @count on success, the kstrtoul()
 * error when the input is not a number, or -EINVAL for any other value.
 */
static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int err;

	err = kstrtoul(buf, 10, &value);
	if (err < 0)
		return err;

	switch (value) {
	case 0:
		clear_bit(flag, &transparent_hugepage_flags);
		break;
	case 1:
		set_bit(flag, &transparent_hugepage_flags);
		break;
	default:
		return -EINVAL;
	}

	return count;
}
272
273
/*
274
* Currently defrag only disables __GFP_NOWAIT for allocation. A blind
275
* __GFP_REPEAT is too aggressive, it's never worth swapping tons of
276
* memory just to allocate one more hugepage.
277
*/
278
static ssize_t defrag_show(struct kobject *kobj,
279
struct kobj_attribute *attr, char *buf)
280
{
281
return double_flag_show(kobj, attr, buf,
282
TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
283
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
284
}
285
static ssize_t defrag_store(struct kobject *kobj,
286
struct kobj_attribute *attr,
287
const char *buf, size_t count)
288
{
289
return double_flag_store(kobj, attr, buf, count,
290
TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
291
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
292
}
293
static struct kobj_attribute defrag_attr =
294
__ATTR(defrag, 0644, defrag_show, defrag_store);
295
296
#ifdef CONFIG_DEBUG_VM
297
static ssize_t debug_cow_show(struct kobject *kobj,
298
struct kobj_attribute *attr, char *buf)
299
{
300
return single_flag_show(kobj, attr, buf,
301
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
302
}
303
static ssize_t debug_cow_store(struct kobject *kobj,
304
struct kobj_attribute *attr,
305
const char *buf, size_t count)
306
{
307
return single_flag_store(kobj, attr, buf, count,
308
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
309
}
310
static struct kobj_attribute debug_cow_attr =
311
__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
312
#endif /* CONFIG_DEBUG_VM */
313
314
static struct attribute *hugepage_attr[] = {
315
&enabled_attr.attr,
316
&defrag_attr.attr,
317
#ifdef CONFIG_DEBUG_VM
318
&debug_cow_attr.attr,
319
#endif
320
NULL,
321
};
322
323
static struct attribute_group hugepage_attr_group = {
324
.attrs = hugepage_attr,
325
};
326
327
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
328
struct kobj_attribute *attr,
329
char *buf)
330
{
331
return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
332
}
333
334
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
335
struct kobj_attribute *attr,
336
const char *buf, size_t count)
337
{
338
unsigned long msecs;
339
int err;
340
341
err = strict_strtoul(buf, 10, &msecs);
342
if (err || msecs > UINT_MAX)
343
return -EINVAL;
344
345
khugepaged_scan_sleep_millisecs = msecs;
346
wake_up_interruptible(&khugepaged_wait);
347
348
return count;
349
}
350
static struct kobj_attribute scan_sleep_millisecs_attr =
351
__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
352
scan_sleep_millisecs_store);
353
354
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
355
struct kobj_attribute *attr,
356
char *buf)
357
{
358
return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
359
}
360
361
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
362
struct kobj_attribute *attr,
363
const char *buf, size_t count)
364
{
365
unsigned long msecs;
366
int err;
367
368
err = strict_strtoul(buf, 10, &msecs);
369
if (err || msecs > UINT_MAX)
370
return -EINVAL;
371
372
khugepaged_alloc_sleep_millisecs = msecs;
373
wake_up_interruptible(&khugepaged_wait);
374
375
return count;
376
}
377
static struct kobj_attribute alloc_sleep_millisecs_attr =
378
__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
379
alloc_sleep_millisecs_store);
380
381
static ssize_t pages_to_scan_show(struct kobject *kobj,
382
struct kobj_attribute *attr,
383
char *buf)
384
{
385
return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
386
}
387
static ssize_t pages_to_scan_store(struct kobject *kobj,
388
struct kobj_attribute *attr,
389
const char *buf, size_t count)
390
{
391
int err;
392
unsigned long pages;
393
394
err = strict_strtoul(buf, 10, &pages);
395
if (err || !pages || pages > UINT_MAX)
396
return -EINVAL;
397
398
khugepaged_pages_to_scan = pages;
399
400
return count;
401
}
402
static struct kobj_attribute pages_to_scan_attr =
403
__ATTR(pages_to_scan, 0644, pages_to_scan_show,
404
pages_to_scan_store);
405
406
static ssize_t pages_collapsed_show(struct kobject *kobj,
407
struct kobj_attribute *attr,
408
char *buf)
409
{
410
return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
411
}
412
static struct kobj_attribute pages_collapsed_attr =
413
__ATTR_RO(pages_collapsed);
414
415
static ssize_t full_scans_show(struct kobject *kobj,
416
struct kobj_attribute *attr,
417
char *buf)
418
{
419
return sprintf(buf, "%u\n", khugepaged_full_scans);
420
}
421
static struct kobj_attribute full_scans_attr =
422
__ATTR_RO(full_scans);
423
424
static ssize_t khugepaged_defrag_show(struct kobject *kobj,
425
struct kobj_attribute *attr, char *buf)
426
{
427
return single_flag_show(kobj, attr, buf,
428
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
429
}
430
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
431
struct kobj_attribute *attr,
432
const char *buf, size_t count)
433
{
434
return single_flag_store(kobj, attr, buf, count,
435
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
436
}
437
static struct kobj_attribute khugepaged_defrag_attr =
438
__ATTR(defrag, 0644, khugepaged_defrag_show,
439
khugepaged_defrag_store);
440
441
/*
442
* max_ptes_none controls if khugepaged should collapse hugepages over
443
* any unmapped ptes in turn potentially increasing the memory
444
* footprint of the vmas. When max_ptes_none is 0 khugepaged will not
445
* reduce the available free memory in the system as it
446
* runs. Increasing max_ptes_none will instead potentially reduce the
447
* free memory in the system during the khugepaged scan.
448
*/
449
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
450
struct kobj_attribute *attr,
451
char *buf)
452
{
453
return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
454
}
455
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
456
struct kobj_attribute *attr,
457
const char *buf, size_t count)
458
{
459
int err;
460
unsigned long max_ptes_none;
461
462
err = strict_strtoul(buf, 10, &max_ptes_none);
463
if (err || max_ptes_none > HPAGE_PMD_NR-1)
464
return -EINVAL;
465
466
khugepaged_max_ptes_none = max_ptes_none;
467
468
return count;
469
}
470
static struct kobj_attribute khugepaged_max_ptes_none_attr =
471
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
472
khugepaged_max_ptes_none_store);
473
474
static struct attribute *khugepaged_attr[] = {
475
&khugepaged_defrag_attr.attr,
476
&khugepaged_max_ptes_none_attr.attr,
477
&pages_to_scan_attr.attr,
478
&pages_collapsed_attr.attr,
479
&full_scans_attr.attr,
480
&scan_sleep_millisecs_attr.attr,
481
&alloc_sleep_millisecs_attr.attr,
482
NULL,
483
};
484
485
static struct attribute_group khugepaged_attr_group = {
486
.attrs = khugepaged_attr,
487
.name = "khugepaged",
488
};
489
#endif /* CONFIG_SYSFS */
490
491
static int __init hugepage_init(void)
492
{
493
int err;
494
#ifdef CONFIG_SYSFS
495
static struct kobject *hugepage_kobj;
496
#endif
497
498
err = -EINVAL;
499
if (!has_transparent_hugepage()) {
500
transparent_hugepage_flags = 0;
501
goto out;
502
}
503
504
#ifdef CONFIG_SYSFS
505
err = -ENOMEM;
506
hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
507
if (unlikely(!hugepage_kobj)) {
508
printk(KERN_ERR "hugepage: failed kobject create\n");
509
goto out;
510
}
511
512
err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
513
if (err) {
514
printk(KERN_ERR "hugepage: failed register hugeage group\n");
515
goto out;
516
}
517
518
err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
519
if (err) {
520
printk(KERN_ERR "hugepage: failed register hugeage group\n");
521
goto out;
522
}
523
#endif
524
525
err = khugepaged_slab_init();
526
if (err)
527
goto out;
528
529
err = mm_slots_hash_init();
530
if (err) {
531
khugepaged_slab_free();
532
goto out;
533
}
534
535
/*
536
* By default disable transparent hugepages on smaller systems,
537
* where the extra memory used could hurt more than TLB overhead
538
* is likely to save. The admin can still enable it through /sys.
539
*/
540
if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
541
transparent_hugepage_flags = 0;
542
543
start_khugepaged();
544
545
set_recommended_min_free_kbytes();
546
547
out:
548
return err;
549
}
550
module_init(hugepage_init)
551
552
static int __init setup_transparent_hugepage(char *str)
553
{
554
int ret = 0;
555
if (!str)
556
goto out;
557
if (!strcmp(str, "always")) {
558
set_bit(TRANSPARENT_HUGEPAGE_FLAG,
559
&transparent_hugepage_flags);
560
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
561
&transparent_hugepage_flags);
562
ret = 1;
563
} else if (!strcmp(str, "madvise")) {
564
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
565
&transparent_hugepage_flags);
566
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
567
&transparent_hugepage_flags);
568
ret = 1;
569
} else if (!strcmp(str, "never")) {
570
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
571
&transparent_hugepage_flags);
572
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
573
&transparent_hugepage_flags);
574
ret = 1;
575
}
576
out:
577
if (!ret)
578
printk(KERN_WARNING
579
"transparent_hugepage= cannot parse, ignored\n");
580
return ret;
581
}
582
__setup("transparent_hugepage=", setup_transparent_hugepage);
583
584
/*
 * Stash a preallocated pte page table on mm->pmd_huge_pte.
 *
 * Caller must hold mm->page_table_lock.  The first table stashed starts
 * a fresh list; later ones are linked right after the current head.  In
 * both cases @pgtable becomes the new head.
 */
static void prepare_pmd_huge_pte(pgtable_t pgtable,
				 struct mm_struct *mm)
{
	assert_spin_locked(&mm->page_table_lock);

	if (mm->pmd_huge_pte)
		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
	else
		INIT_LIST_HEAD(&pgtable->lru);

	mm->pmd_huge_pte = pgtable;
}
596
597
/* Return @pmd made writable when @vma has VM_WRITE, unchanged otherwise. */
static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	return likely(vma->vm_flags & VM_WRITE) ? pmd_mkwrite(pmd) : pmd;
}
603
604
/*
 * Map the freshly allocated (and already memcg-charged) compound @page
 * at @haddr as a huge pmd.
 *
 * A pte page table is preallocated and stashed on the mm for later use.
 * If the pmd turns out to be populated once the page_table_lock is
 * taken, @page and the page table are released and the fault is left
 * to be retried.
 *
 * Returns VM_FAULT_OOM if the page table cannot be allocated (the page
 * is uncharged and released in that case), 0 otherwise.
 */
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long haddr, pmd_t *pmd,
					struct page *page)
{
	pgtable_t pgtable;
	pmd_t entry;

	VM_BUG_ON(!PageCompound(page));

	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable)) {
		mem_cgroup_uncharge_page(page);
		put_page(page);
		return VM_FAULT_OOM;
	}

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	__SetPageUptodate(page);

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_none(*pmd))) {
		/* somebody else populated the pmd first: undo and bail out */
		spin_unlock(&mm->page_table_lock);
		mem_cgroup_uncharge_page(page);
		put_page(page);
		pte_free(mm, pgtable);
		return 0;
	}

	entry = mk_pmd(page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	entry = pmd_mkhuge(entry);
	/*
	 * The spinlocking to take the lru_lock inside
	 * page_add_new_anon_rmap() acts as a full memory
	 * barrier to be sure clear_huge_page writes become
	 * visible after the set_pmd_at() write.
	 */
	page_add_new_anon_rmap(page, vma, haddr);
	set_pmd_at(mm, haddr, pmd, entry);
	prepare_pmd_huge_pte(pgtable, mm);
	add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
	spin_unlock(&mm->page_table_lock);

	return 0;
}
649
650
/*
 * Build the gfp mask for a hugepage allocation: GFP_TRANSHUGE, with
 * __GFP_WAIT stripped when @defrag is zero, plus any @extra_gfp bits.
 */
static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
	gfp_t gfp = GFP_TRANSHUGE;

	if (!defrag)
		gfp &= ~__GFP_WAIT;

	return gfp | extra_gfp;
}
654
655
static inline struct page *alloc_hugepage_vma(int defrag,
656
struct vm_area_struct *vma,
657
unsigned long haddr, int nd,
658
gfp_t extra_gfp)
659
{
660
return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
661
HPAGE_PMD_ORDER, vma, haddr, nd);
662
}
663
664
#ifndef CONFIG_NUMA
665
static inline struct page *alloc_hugepage(int defrag)
666
{
667
return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
668
HPAGE_PMD_ORDER);
669
}
670
#endif
671
672
/*
 * Anonymous page fault on an empty pmd.
 *
 * If the hugepage-aligned range around @address fits entirely inside
 * @vma, try to allocate and map a huge page.  When the range does not
 * fit, or the allocation or memcg charge fails, fall back to a regular
 * pte fault.
 *
 * Returns a VM_FAULT_* mask (VM_FAULT_OOM on allocation failures of the
 * supporting structures), or 0 when there is nothing more to do.
 */
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       unsigned int flags)
{
	unsigned long haddr = address & HPAGE_PMD_MASK;
	struct page *hpage;
	pte_t *pte;

	/* the whole huge page must lie inside the vma */
	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		goto small_page;

	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma)))
		return VM_FAULT_OOM;

	hpage = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
				   vma, haddr, numa_node_id(), 0);
	if (unlikely(!hpage)) {
		count_vm_event(THP_FAULT_FALLBACK);
		goto small_page;
	}
	count_vm_event(THP_FAULT_ALLOC);

	if (unlikely(mem_cgroup_newpage_charge(hpage, mm, GFP_KERNEL))) {
		put_page(hpage);
		goto small_page;
	}

	return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, hpage);

small_page:
	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if an huge pmd could
	 * materialize from under us from a different thread.
	 */
	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
		return VM_FAULT_OOM;

	/* if an huge pmd materialized from under us just retry later */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;

	/*
	 * A regular pmd is established and it can't morph into a huge pmd
	 * from under us anymore at this point because we hold the mmap_sem
	 * read mode and khugepaged takes it in write mode. So now it's
	 * safe to run pte_offset_map().
	 */
	pte = pte_offset_map(pmd, address);
	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
719
720
/*
 * Duplicate a huge pmd mapping from @src_mm into @dst_mm.
 *
 * The source page gains a reference and an rmap duplicate.  Both the
 * source pmd and the new destination pmd are write-protected.  A pte
 * page table is preallocated and stashed on @dst_mm.
 *
 * Returns 0 on success, -ENOMEM if the page table cannot be allocated,
 * or -EAGAIN when the source pmd is no longer a huge pmd or is being
 * split (in the latter case the split is waited for before returning).
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	struct page *src_page;
	pgtable_t pgtable;
	pmd_t src_entry;
	int ret = -ENOMEM;

	pgtable = pte_alloc_one(dst_mm, addr);
	if (unlikely(!pgtable))
		return ret;

	/* destination lock first, source lock nested inside it */
	spin_lock(&dst_mm->page_table_lock);
	spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	src_entry = *src_pmd;
	if (unlikely(!pmd_trans_huge(src_entry))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	if (unlikely(pmd_trans_splitting(src_entry))) {
		/* split huge page running from under us */
		spin_unlock(&src_mm->page_table_lock);
		spin_unlock(&dst_mm->page_table_lock);
		pte_free(dst_mm, pgtable);

		wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
		return ret;
	}

	src_page = pmd_page(src_entry);
	VM_BUG_ON(!PageHead(src_page));
	get_page(src_page);
	page_dup_rmap(src_page);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	src_entry = pmd_mkold(pmd_wrprotect(src_entry));
	set_pmd_at(dst_mm, addr, dst_pmd, src_entry);
	prepare_pmd_huge_pte(pgtable, dst_mm);

	ret = 0;
out_unlock:
	spin_unlock(&src_mm->page_table_lock);
	spin_unlock(&dst_mm->page_table_lock);
	return ret;
}
770
771
/* no "address" argument so destroys page coloring of some arch */
772
pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
773
{
774
pgtable_t pgtable;
775
776
assert_spin_locked(&mm->page_table_lock);
777
778
/* FIFO */
779
pgtable = mm->pmd_huge_pte;
780
if (list_empty(&pgtable->lru))
781
mm->pmd_huge_pte = NULL;
782
else {
783
mm->pmd_huge_pte = list_entry(pgtable->lru.next,
784
struct page, lru);
785
list_del(&pgtable->lru);
786
}
787
return pgtable;
788
}
789
790
/*
 * Copy-on-write fallback for a huge pmd when no new huge page is used:
 * copy the huge @page into HPAGE_PMD_NR freshly allocated small pages
 * and replace the huge pmd with a regular page table mapping them.
 *
 * @address is the faulting address, @haddr the hugepage-aligned base.
 * @orig_pmd is the pmd value sampled by the caller; if *@pmd no longer
 * matches it once the page_table_lock is taken, the new pages are
 * released and nothing is changed.
 *
 * Returns VM_FAULT_WRITE when the mapping was replaced, VM_FAULT_OOM on
 * allocation or memcg charge failure, 0 if the pmd changed under us.
 * On success the mapping reference on @page is dropped here.
 */
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
					struct page *page,
					unsigned long haddr)
{
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE,
					       vma, address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_newpage_charge(pages[i], mm,
						       GFP_KERNEL))) {
			/* page i was allocated but not charged */
			if (pages[i])
				put_page(pages[i]);
			/* pages 0..i-1 were both allocated and charged */
			mem_cgroup_uncharge_start();
			while (--i >= 0) {
				mem_cgroup_uncharge_page(pages[i]);
				put_page(pages[i]);
			}
			mem_cgroup_uncharge_end();
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		/*
		 * The user virtual address of subpage i is i pages past
		 * haddr, so the stride is PAGE_SIZE (not PAGE_SHIFT).
		 */
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON(!PageHead(page));

	pmdp_clear_flush_notify(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	/* fill the stashed page table through a local pmd first */
	pgtable = get_pmd_huge_pte(mm);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		page_add_new_anon_rmap(pages[i], vma, haddr);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	kfree(pages);

	mm->nr_ptes++;
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	page_remove_rmap(page);
	spin_unlock(&mm->page_table_lock);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(&mm->page_table_lock);
	mem_cgroup_uncharge_start();
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mem_cgroup_uncharge_page(pages[i]);
		put_page(pages[i]);
	}
	mem_cgroup_uncharge_end();
	kfree(pages);
	goto out;
}
883
884
/*
 * Write fault on a write-protected huge pmd.
 *
 * If the huge page has a mapcount of 1 the pmd is simply made young,
 * dirty and (if the vma allows it) writable.  Otherwise a new huge page
 * is allocated and the data copied into it; when THP is not enabled for
 * the vma, debug_cow is set, or the allocation fails, the small-page
 * fallback is used instead.
 *
 * @orig_pmd is the pmd value sampled by the caller; whenever *@pmd no
 * longer matches it under the page_table_lock nothing is changed.
 *
 * Returns a VM_FAULT_* mask: VM_FAULT_WRITE when the write can proceed,
 * VM_FAULT_OOM when the memcg charge fails, the fallback's result when
 * that path is taken, 0 if the pmd changed under us.
 */
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	struct page *old_page;
	struct page *copy = NULL;
	unsigned long haddr;
	pmd_t entry;
	int ret = 0;

	VM_BUG_ON(!vma->anon_vma);
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	old_page = pmd_page(orig_pmd);
	VM_BUG_ON(!PageCompound(old_page) || !PageHead(old_page));
	haddr = address & HPAGE_PMD_MASK;

	if (page_mapcount(old_page) == 1) {
		/* sole mapping: reuse the page in place */
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}

	/* pin the old page while the lock is dropped for the copy */
	get_page(old_page);
	spin_unlock(&mm->page_table_lock);

	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow())
		copy = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
					  vma, haddr, numa_node_id(), 0);

	if (unlikely(!copy)) {
		count_vm_event(THP_FAULT_FALLBACK);
		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
						   pmd, orig_pmd, old_page,
						   haddr);
		put_page(old_page);
		return ret;
	}
	count_vm_event(THP_FAULT_ALLOC);

	if (unlikely(mem_cgroup_newpage_charge(copy, mm, GFP_KERNEL))) {
		put_page(copy);
		put_page(old_page);
		ret |= VM_FAULT_OOM;
		return ret;
	}

	copy_user_huge_page(copy, old_page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(copy);

	spin_lock(&mm->page_table_lock);
	/* drop the pin taken above */
	put_page(old_page);
	if (likely(pmd_same(*pmd, orig_pmd))) {
		VM_BUG_ON(!PageHead(old_page));
		entry = mk_pmd(copy, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		entry = pmd_mkhuge(entry);
		pmdp_clear_flush_notify(vma, haddr, pmd);
		page_add_new_anon_rmap(copy, vma, haddr);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache(vma, address, entry);
		page_remove_rmap(old_page);
		/* drop the reference held by the replaced mapping */
		put_page(old_page);
		ret |= VM_FAULT_WRITE;
	} else {
		/* the pmd changed while we were copying: discard the copy */
		mem_cgroup_uncharge_page(copy);
		put_page(copy);
	}
out_unlock:
	spin_unlock(&mm->page_table_lock);
	return ret;
}
961
962
struct page *follow_trans_huge_pmd(struct mm_struct *mm,
963
unsigned long addr,
964
pmd_t *pmd,
965
unsigned int flags)
966
{
967
struct page *page = NULL;
968
969
assert_spin_locked(&mm->page_table_lock);
970
971
if (flags & FOLL_WRITE && !pmd_write(*pmd))
972
goto out;
973
974
page = pmd_page(*pmd);
975
VM_BUG_ON(!PageHead(page));
976
if (flags & FOLL_TOUCH) {
977
pmd_t _pmd;
978
/*
979
* We should set the dirty bit only for FOLL_WRITE but
980
* for now the dirty bit in the pmd is meaningless.
981
* And if the dirty bit will become meaningful and
982
* we'll only set it with FOLL_WRITE, an atomic
983
* set_bit will be required on the pmd to set the
984
* young bit, instead of the current set_pmd_at.
985
*/
986
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
987
set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
988
}
989
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990
VM_BUG_ON(!PageCompound(page));
991
if (flags & FOLL_GET)
992
get_page(page);
993
994
out:
995
return page;
996
}
997
998
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
999
pmd_t *pmd)
1000
{
1001
int ret = 0;
1002
1003
spin_lock(&tlb->mm->page_table_lock);
1004
if (likely(pmd_trans_huge(*pmd))) {
1005
if (unlikely(pmd_trans_splitting(*pmd))) {
1006
spin_unlock(&tlb->mm->page_table_lock);
1007
wait_split_huge_page(vma->anon_vma,
1008
pmd);
1009
} else {
1010
struct page *page;
1011
pgtable_t pgtable;
1012
pgtable = get_pmd_huge_pte(tlb->mm);
1013
page = pmd_page(*pmd);
1014
pmd_clear(pmd);
1015
page_remove_rmap(page);
1016
VM_BUG_ON(page_mapcount(page) < 0);
1017
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1018
VM_BUG_ON(!PageHead(page));
1019
spin_unlock(&tlb->mm->page_table_lock);
1020
tlb_remove_page(tlb, page);
1021
pte_free(tlb->mm, pgtable);
1022
ret = 1;
1023
}
1024
} else
1025
spin_unlock(&tlb->mm->page_table_lock);
1026
1027
return ret;
1028
}
1029
1030
/*
 * mincore() helper for a huge pmd.
 *
 * Returns 1 and marks every page of [@addr, @end) as present in @vec
 * when @pmd is a stable huge pmd.  Returns 0 when @pmd is not a huge
 * pmd, or when it is being split (the split is waited for first); @vec
 * is left untouched in those cases.
 */
int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		     unsigned long addr, unsigned long end,
		     unsigned char *vec)
{
	int stable;

	spin_lock(&vma->vm_mm->page_table_lock);
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	stable = !pmd_trans_splitting(*pmd);
	spin_unlock(&vma->vm_mm->page_table_lock);

	if (unlikely(!stable)) {
		wait_split_huge_page(vma->anon_vma, pmd);
		return stable;
	}

	/*
	 * All logical pages in the range are present
	 * if backed by a huge page.
	 */
	memset(vec, 1, (end - addr) >> PAGE_SHIFT);
	return stable;
}
1054
1055
/*
 * Apply @newprot to a huge pmd.
 *
 * Returns 1 when @pmd was a stable huge pmd and its protection has been
 * changed (followed by a TLB flush of the HPAGE_PMD_SIZE range starting
 * at @addr).  Returns 0 when @pmd is not a huge pmd, or when it is
 * being split (the split is waited for first).
 */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		    unsigned long addr, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;

	spin_lock(&mm->page_table_lock);

	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(&mm->page_table_lock);
		return 0;
	}

	if (unlikely(pmd_trans_splitting(*pmd))) {
		spin_unlock(&mm->page_table_lock);
		wait_split_huge_page(vma->anon_vma, pmd);
		return 0;
	}

	entry = pmdp_get_and_clear(mm, addr, pmd);
	entry = pmd_modify(entry, newprot);
	set_pmd_at(mm, addr, pmd, entry);
	spin_unlock(&mm->page_table_lock);

	flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
	return 1;
}
1081
1082
pmd_t *page_check_address_pmd(struct page *page,
1083
struct mm_struct *mm,
1084
unsigned long address,
1085
enum page_check_address_pmd_flag flag)
1086
{
1087
pgd_t *pgd;
1088
pud_t *pud;
1089
pmd_t *pmd, *ret = NULL;
1090
1091
if (address & ~HPAGE_PMD_MASK)
1092
goto out;
1093
1094
pgd = pgd_offset(mm, address);
1095
if (!pgd_present(*pgd))
1096
goto out;
1097
1098
pud = pud_offset(pgd, address);
1099
if (!pud_present(*pud))
1100
goto out;
1101
1102
pmd = pmd_offset(pud, address);
1103
if (pmd_none(*pmd))
1104
goto out;
1105
if (pmd_page(*pmd) != page)
1106
goto out;
1107
/*
1108
* split_vma() may create temporary aliased mappings. There is
1109
* no risk as long as all huge pmd are found and have their
1110
* splitting bit set before __split_huge_page_refcount
1111
* runs. Finding the same huge pmd more than once during the
1112
* same rmap walk is not a problem.
1113
*/
1114
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1115
pmd_trans_splitting(*pmd))
1116
goto out;
1117
if (pmd_trans_huge(*pmd)) {
1118
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1119
!pmd_trans_splitting(*pmd));
1120
ret = pmd;
1121
}
1122
out:
1123
return ret;
1124
}
1125
1126
static int __split_huge_page_splitting(struct page *page,
1127
struct vm_area_struct *vma,
1128
unsigned long address)
1129
{
1130
struct mm_struct *mm = vma->vm_mm;
1131
pmd_t *pmd;
1132
int ret = 0;
1133
1134
spin_lock(&mm->page_table_lock);
1135
pmd = page_check_address_pmd(page, mm, address,
1136
PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1137
if (pmd) {
1138
/*
1139
* We can't temporarily set the pmd to null in order
1140
* to split it, the pmd must remain marked huge at all
1141
* times or the VM won't take the pmd_trans_huge paths
1142
* and it won't wait on the anon_vma->root->mutex to
1143
* serialize against split_huge_page*.
1144
*/
1145
pmdp_splitting_flush_notify(vma, address, pmd);
1146
ret = 1;
1147
}
1148
spin_unlock(&mm->page_table_lock);
1149
1150
return ret;
1151
}
1152
1153
static void __split_huge_page_refcount(struct page *page)
1154
{
1155
int i;
1156
unsigned long head_index = page->index;
1157
struct zone *zone = page_zone(page);
1158
int zonestat;
1159
1160
/* prevent PageLRU to go away from under us, and freeze lru stats */
1161
spin_lock_irq(&zone->lru_lock);
1162
compound_lock(page);
1163
1164
for (i = 1; i < HPAGE_PMD_NR; i++) {
1165
struct page *page_tail = page + i;
1166
1167
/* tail_page->_count cannot change */
1168
atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1169
BUG_ON(page_count(page) <= 0);
1170
atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1171
BUG_ON(atomic_read(&page_tail->_count) <= 0);
1172
1173
/* after clearing PageTail the gup refcount can be released */
1174
smp_mb();
1175
1176
/*
1177
* retain hwpoison flag of the poisoned tail page:
1178
* fix for the unsuitable process killed on Guest Machine(KVM)
1179
* by the memory-failure.
1180
*/
1181
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1182
page_tail->flags |= (page->flags &
1183
((1L << PG_referenced) |
1184
(1L << PG_swapbacked) |
1185
(1L << PG_mlocked) |
1186
(1L << PG_uptodate)));
1187
page_tail->flags |= (1L << PG_dirty);
1188
1189
/*
1190
* 1) clear PageTail before overwriting first_page
1191
* 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192
*/
1193
smp_wmb();
1194
1195
/*
1196
* __split_huge_page_splitting() already set the
1197
* splitting bit in all pmd that could map this
1198
* hugepage, that will ensure no CPU can alter the
1199
* mapcount on the head page. The mapcount is only
1200
* accounted in the head page and it has to be
1201
* transferred to all tail pages in the below code. So
1202
* for this code to be safe, the split the mapcount
1203
* can't change. But that doesn't mean userland can't
1204
* keep changing and reading the page contents while
1205
* we transfer the mapcount, so the pmd splitting
1206
* status is achieved setting a reserved bit in the
1207
* pmd, not by clearing the present bit.
1208
*/
1209
BUG_ON(page_mapcount(page_tail));
1210
page_tail->_mapcount = page->_mapcount;
1211
1212
BUG_ON(page_tail->mapping);
1213
page_tail->mapping = page->mapping;
1214
1215
page_tail->index = ++head_index;
1216
1217
BUG_ON(!PageAnon(page_tail));
1218
BUG_ON(!PageUptodate(page_tail));
1219
BUG_ON(!PageDirty(page_tail));
1220
BUG_ON(!PageSwapBacked(page_tail));
1221
1222
mem_cgroup_split_huge_fixup(page, page_tail);
1223
1224
lru_add_page_tail(zone, page, page_tail);
1225
}
1226
1227
__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228
__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1229
1230
/*
1231
* A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1232
* so adjust those appropriately if this page is on the LRU.
1233
*/
1234
if (PageLRU(page)) {
1235
zonestat = NR_LRU_BASE + page_lru(page);
1236
__mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1237
}
1238
1239
ClearPageCompound(page);
1240
compound_unlock(page);
1241
spin_unlock_irq(&zone->lru_lock);
1242
1243
for (i = 1; i < HPAGE_PMD_NR; i++) {
1244
struct page *page_tail = page + i;
1245
BUG_ON(page_count(page_tail) <= 0);
1246
/*
1247
* Tail pages may be freed if there wasn't any mapping
1248
* like if add_to_swap() is running on a lru page that
1249
* had its mapping zapped. And freeing these pages
1250
* requires taking the lru_lock so we do the put_page
1251
* of the tail pages after the split is complete.
1252
*/
1253
put_page(page_tail);
1254
}
1255
1256
/*
1257
* Only the head page (now become a regular page) is required
1258
* to be pinned by the caller.
1259
*/
1260
BUG_ON(page_count(page) <= 0);
1261
}
1262
1263
static int __split_huge_page_map(struct page *page,
1264
struct vm_area_struct *vma,
1265
unsigned long address)
1266
{
1267
struct mm_struct *mm = vma->vm_mm;
1268
pmd_t *pmd, _pmd;
1269
int ret = 0, i;
1270
pgtable_t pgtable;
1271
unsigned long haddr;
1272
1273
spin_lock(&mm->page_table_lock);
1274
pmd = page_check_address_pmd(page, mm, address,
1275
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1276
if (pmd) {
1277
pgtable = get_pmd_huge_pte(mm);
1278
pmd_populate(mm, &_pmd, pgtable);
1279
1280
for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1281
i++, haddr += PAGE_SIZE) {
1282
pte_t *pte, entry;
1283
BUG_ON(PageCompound(page+i));
1284
entry = mk_pte(page + i, vma->vm_page_prot);
1285
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1286
if (!pmd_write(*pmd))
1287
entry = pte_wrprotect(entry);
1288
else
1289
BUG_ON(page_mapcount(page) != 1);
1290
if (!pmd_young(*pmd))
1291
entry = pte_mkold(entry);
1292
pte = pte_offset_map(&_pmd, haddr);
1293
BUG_ON(!pte_none(*pte));
1294
set_pte_at(mm, haddr, pte, entry);
1295
pte_unmap(pte);
1296
}
1297
1298
mm->nr_ptes++;
1299
smp_wmb(); /* make pte visible before pmd */
1300
/*
1301
* Up to this point the pmd is present and huge and
1302
* userland has the whole access to the hugepage
1303
* during the split (which happens in place). If we
1304
* overwrite the pmd with the not-huge version
1305
* pointing to the pte here (which of course we could
1306
* if all CPUs were bug free), userland could trigger
1307
* a small page size TLB miss on the small sized TLB
1308
* while the hugepage TLB entry is still established
1309
* in the huge TLB. Some CPU doesn't like that. See
1310
* http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1311
* Erratum 383 on page 93. Intel should be safe but is
1312
* also warns that it's only safe if the permission
1313
* and cache attributes of the two entries loaded in
1314
* the two TLB is identical (which should be the case
1315
* here). But it is generally safer to never allow
1316
* small and huge TLB entries for the same virtual
1317
* address to be loaded simultaneously. So instead of
1318
* doing "pmd_populate(); flush_tlb_range();" we first
1319
* mark the current pmd notpresent (atomically because
1320
* here the pmd_trans_huge and pmd_trans_splitting
1321
* must remain set at all times on the pmd until the
1322
* split is complete for this pmd), then we flush the
1323
* SMP TLB and finally we write the non-huge version
1324
* of the pmd entry with pmd_populate.
1325
*/
1326
set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1327
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1328
pmd_populate(mm, pmd, pgtable);
1329
ret = 1;
1330
}
1331
spin_unlock(&mm->page_table_lock);
1332
1333
return ret;
1334
}
1335
1336
/* must be called with anon_vma->root->mutex hold */
1337
static void __split_huge_page(struct page *page,
1338
struct anon_vma *anon_vma)
1339
{
1340
int mapcount, mapcount2;
1341
struct anon_vma_chain *avc;
1342
1343
BUG_ON(!PageHead(page));
1344
BUG_ON(PageTail(page));
1345
1346
mapcount = 0;
1347
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1348
struct vm_area_struct *vma = avc->vma;
1349
unsigned long addr = vma_address(page, vma);
1350
BUG_ON(is_vma_temporary_stack(vma));
1351
if (addr == -EFAULT)
1352
continue;
1353
mapcount += __split_huge_page_splitting(page, vma, addr);
1354
}
1355
/*
1356
* It is critical that new vmas are added to the tail of the
1357
* anon_vma list. This guarantes that if copy_huge_pmd() runs
1358
* and establishes a child pmd before
1359
* __split_huge_page_splitting() freezes the parent pmd (so if
1360
* we fail to prevent copy_huge_pmd() from running until the
1361
* whole __split_huge_page() is complete), we will still see
1362
* the newly established pmd of the child later during the
1363
* walk, to be able to set it as pmd_trans_splitting too.
1364
*/
1365
if (mapcount != page_mapcount(page))
1366
printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1367
mapcount, page_mapcount(page));
1368
BUG_ON(mapcount != page_mapcount(page));
1369
1370
__split_huge_page_refcount(page);
1371
1372
mapcount2 = 0;
1373
list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1374
struct vm_area_struct *vma = avc->vma;
1375
unsigned long addr = vma_address(page, vma);
1376
BUG_ON(is_vma_temporary_stack(vma));
1377
if (addr == -EFAULT)
1378
continue;
1379
mapcount2 += __split_huge_page_map(page, vma, addr);
1380
}
1381
if (mapcount != mapcount2)
1382
printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1383
mapcount, mapcount2, page_mapcount(page));
1384
BUG_ON(mapcount != mapcount2);
1385
}
1386
1387
int split_huge_page(struct page *page)
1388
{
1389
struct anon_vma *anon_vma;
1390
int ret = 1;
1391
1392
BUG_ON(!PageAnon(page));
1393
anon_vma = page_lock_anon_vma(page);
1394
if (!anon_vma)
1395
goto out;
1396
ret = 0;
1397
if (!PageCompound(page))
1398
goto out_unlock;
1399
1400
BUG_ON(!PageSwapBacked(page));
1401
__split_huge_page(page, anon_vma);
1402
count_vm_event(THP_SPLIT);
1403
1404
BUG_ON(PageCompound(page));
1405
out_unlock:
1406
page_unlock_anon_vma(anon_vma);
1407
out:
1408
return ret;
1409
}
1410
1411
/*
 * vm_flags for which hugepage_madvise() rejects the advice and which
 * khugepaged asserts never to find on a vma it works on.
 */
#define VM_NO_THP							\
	(VM_SPECIAL | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO |		\
	 VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
1413
1414
int hugepage_madvise(struct vm_area_struct *vma,
1415
unsigned long *vm_flags, int advice)
1416
{
1417
switch (advice) {
1418
case MADV_HUGEPAGE:
1419
/*
1420
* Be somewhat over-protective like KSM for now!
1421
*/
1422
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1423
return -EINVAL;
1424
*vm_flags &= ~VM_NOHUGEPAGE;
1425
*vm_flags |= VM_HUGEPAGE;
1426
/*
1427
* If the vma become good for khugepaged to scan,
1428
* register it here without waiting a page fault that
1429
* may not happen any time soon.
1430
*/
1431
if (unlikely(khugepaged_enter_vma_merge(vma)))
1432
return -ENOMEM;
1433
break;
1434
case MADV_NOHUGEPAGE:
1435
/*
1436
* Be somewhat over-protective like KSM for now!
1437
*/
1438
if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1439
return -EINVAL;
1440
*vm_flags &= ~VM_HUGEPAGE;
1441
*vm_flags |= VM_NOHUGEPAGE;
1442
/*
1443
* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1444
* this vma even if we leave the mm registered in khugepaged if
1445
* it got registered before VM_NOHUGEPAGE was set.
1446
*/
1447
break;
1448
}
1449
1450
return 0;
1451
}
1452
1453
/*
 * Create the slab cache used for struct mm_slot objects.
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int __init khugepaged_slab_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);

	return mm_slot_cache ? 0 : -ENOMEM;
}
1463
1464
/*
 * Destroy the mm_slot slab cache and reset the pointer, so that
 * alloc_mm_slot() sees the cache as unavailable afterwards.
 */
static void __init khugepaged_slab_free(void)
{
	struct kmem_cache *cache = mm_slot_cache;

	kmem_cache_destroy(cache);
	mm_slot_cache = NULL;
}
1469
1470
static inline struct mm_slot *alloc_mm_slot(void)
1471
{
1472
if (!mm_slot_cache) /* initialization failed */
1473
return NULL;
1474
return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1475
}
1476
1477
static inline void free_mm_slot(struct mm_slot *mm_slot)
1478
{
1479
kmem_cache_free(mm_slot_cache, mm_slot);
1480
}
1481
1482
/*
 * Allocate the zero-filled array of MM_SLOTS_HASH_HEADS hash heads.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int __init mm_slots_hash_init(void)
{
	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				GFP_KERNEL);

	return mm_slots_hash ? 0 : -ENOMEM;
}
1490
1491
#if 0
/* Compiled out: release the mm_slots hash array and reset the pointer. */
static void __init mm_slots_hash_free(void)
{
	kfree(mm_slots_hash);
	mm_slots_hash = NULL;
}
#endif
1498
1499
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1500
{
1501
struct mm_slot *mm_slot;
1502
struct hlist_head *bucket;
1503
struct hlist_node *node;
1504
1505
bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506
% MM_SLOTS_HASH_HEADS];
1507
hlist_for_each_entry(mm_slot, node, bucket, hash) {
1508
if (mm == mm_slot->mm)
1509
return mm_slot;
1510
}
1511
return NULL;
1512
}
1513
1514
static void insert_to_mm_slots_hash(struct mm_struct *mm,
1515
struct mm_slot *mm_slot)
1516
{
1517
struct hlist_head *bucket;
1518
1519
bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1520
% MM_SLOTS_HASH_HEADS];
1521
mm_slot->mm = mm;
1522
hlist_add_head(&mm_slot->hash, bucket);
1523
}
1524
1525
static inline int khugepaged_test_exit(struct mm_struct *mm)
1526
{
1527
return atomic_read(&mm->mm_users) == 0;
1528
}
1529
1530
int __khugepaged_enter(struct mm_struct *mm)
1531
{
1532
struct mm_slot *mm_slot;
1533
int wakeup;
1534
1535
mm_slot = alloc_mm_slot();
1536
if (!mm_slot)
1537
return -ENOMEM;
1538
1539
/* __khugepaged_exit() must not run from under us */
1540
VM_BUG_ON(khugepaged_test_exit(mm));
1541
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1542
free_mm_slot(mm_slot);
1543
return 0;
1544
}
1545
1546
spin_lock(&khugepaged_mm_lock);
1547
insert_to_mm_slots_hash(mm, mm_slot);
1548
/*
1549
* Insert just behind the scanning cursor, to let the area settle
1550
* down a little.
1551
*/
1552
wakeup = list_empty(&khugepaged_scan.mm_head);
1553
list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1554
spin_unlock(&khugepaged_mm_lock);
1555
1556
atomic_inc(&mm->mm_count);
1557
if (wakeup)
1558
wake_up_interruptible(&khugepaged_wait);
1559
1560
return 0;
1561
}
1562
1563
int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1564
{
1565
unsigned long hstart, hend;
1566
if (!vma->anon_vma)
1567
/*
1568
* Not yet faulted in so we will register later in the
1569
* page fault if needed.
1570
*/
1571
return 0;
1572
if (vma->vm_ops)
1573
/* khugepaged not yet working on file or special mappings */
1574
return 0;
1575
/*
1576
* If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1577
* true too, verify it here.
1578
*/
1579
VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1580
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1581
hend = vma->vm_end & HPAGE_PMD_MASK;
1582
if (hstart < hend)
1583
return khugepaged_enter(vma);
1584
return 0;
1585
}
1586
1587
void __khugepaged_exit(struct mm_struct *mm)
1588
{
1589
struct mm_slot *mm_slot;
1590
int free = 0;
1591
1592
spin_lock(&khugepaged_mm_lock);
1593
mm_slot = get_mm_slot(mm);
1594
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1595
hlist_del(&mm_slot->hash);
1596
list_del(&mm_slot->mm_node);
1597
free = 1;
1598
}
1599
1600
if (free) {
1601
spin_unlock(&khugepaged_mm_lock);
1602
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603
free_mm_slot(mm_slot);
1604
mmdrop(mm);
1605
} else if (mm_slot) {
1606
spin_unlock(&khugepaged_mm_lock);
1607
/*
1608
* This is required to serialize against
1609
* khugepaged_test_exit() (which is guaranteed to run
1610
* under mmap sem read mode). Stop here (after we
1611
* return all pagetables will be destroyed) until
1612
* khugepaged has finished working on the pagetables
1613
* under the mmap_sem.
1614
*/
1615
down_write(&mm->mmap_sem);
1616
up_write(&mm->mmap_sem);
1617
} else
1618
spin_unlock(&khugepaged_mm_lock);
1619
}
1620
1621
static void release_pte_page(struct page *page)
1622
{
1623
/* 0 stands for page_is_file_cache(page) == false */
1624
dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1625
unlock_page(page);
1626
putback_lru_page(page);
1627
}
1628
1629
/*
 * Release the pages of all non-empty ptes in [@pte, @_pte), walking the
 * range backwards from the entry just below @_pte.
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
	pte_t *cur;

	for (cur = _pte - 1; cur >= pte; cur--) {
		pte_t pteval = *cur;

		if (pte_none(pteval))
			continue;
		release_pte_page(pte_page(pteval));
	}
}
1637
1638
/* Release the pages of all HPAGE_PMD_NR ptes starting at @pte. */
static void release_all_pte_pages(pte_t *pte)
{
	pte_t *end = pte + HPAGE_PMD_NR;

	release_pte_pages(pte, end);
}
1642
1643
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1644
unsigned long address,
1645
pte_t *pte)
1646
{
1647
struct page *page;
1648
pte_t *_pte;
1649
int referenced = 0, isolated = 0, none = 0;
1650
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1651
_pte++, address += PAGE_SIZE) {
1652
pte_t pteval = *_pte;
1653
if (pte_none(pteval)) {
1654
if (++none <= khugepaged_max_ptes_none)
1655
continue;
1656
else {
1657
release_pte_pages(pte, _pte);
1658
goto out;
1659
}
1660
}
1661
if (!pte_present(pteval) || !pte_write(pteval)) {
1662
release_pte_pages(pte, _pte);
1663
goto out;
1664
}
1665
page = vm_normal_page(vma, address, pteval);
1666
if (unlikely(!page)) {
1667
release_pte_pages(pte, _pte);
1668
goto out;
1669
}
1670
VM_BUG_ON(PageCompound(page));
1671
BUG_ON(!PageAnon(page));
1672
VM_BUG_ON(!PageSwapBacked(page));
1673
1674
/* cannot use mapcount: can't collapse if there's a gup pin */
1675
if (page_count(page) != 1) {
1676
release_pte_pages(pte, _pte);
1677
goto out;
1678
}
1679
/*
1680
* We can do it before isolate_lru_page because the
1681
* page can't be freed from under us. NOTE: PG_lock
1682
* is needed to serialize against split_huge_page
1683
* when invoked from the VM.
1684
*/
1685
if (!trylock_page(page)) {
1686
release_pte_pages(pte, _pte);
1687
goto out;
1688
}
1689
/*
1690
* Isolate the page to avoid collapsing an hugepage
1691
* currently in use by the VM.
1692
*/
1693
if (isolate_lru_page(page)) {
1694
unlock_page(page);
1695
release_pte_pages(pte, _pte);
1696
goto out;
1697
}
1698
/* 0 stands for page_is_file_cache(page) == false */
1699
inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1700
VM_BUG_ON(!PageLocked(page));
1701
VM_BUG_ON(PageLRU(page));
1702
1703
/* If there is no mapped pte young don't collapse the page */
1704
if (pte_young(pteval) || PageReferenced(page) ||
1705
mmu_notifier_test_young(vma->vm_mm, address))
1706
referenced = 1;
1707
}
1708
if (unlikely(!referenced))
1709
release_all_pte_pages(pte);
1710
else
1711
isolated = 1;
1712
out:
1713
return isolated;
1714
}
1715
1716
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1717
struct vm_area_struct *vma,
1718
unsigned long address,
1719
spinlock_t *ptl)
1720
{
1721
pte_t *_pte;
1722
for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1723
pte_t pteval = *_pte;
1724
struct page *src_page;
1725
1726
if (pte_none(pteval)) {
1727
clear_user_highpage(page, address);
1728
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1729
} else {
1730
src_page = pte_page(pteval);
1731
copy_user_highpage(page, src_page, address, vma);
1732
VM_BUG_ON(page_mapcount(src_page) != 1);
1733
VM_BUG_ON(page_count(src_page) != 2);
1734
release_pte_page(src_page);
1735
/*
1736
* ptl mostly unnecessary, but preempt has to
1737
* be disabled to update the per-cpu stats
1738
* inside page_remove_rmap().
1739
*/
1740
spin_lock(ptl);
1741
/*
1742
* paravirt calls inside pte_clear here are
1743
* superfluous.
1744
*/
1745
pte_clear(vma->vm_mm, address, _pte);
1746
page_remove_rmap(src_page);
1747
spin_unlock(ptl);
1748
free_page_and_swap_cache(src_page);
1749
}
1750
1751
address += PAGE_SIZE;
1752
page++;
1753
}
1754
}
1755
1756
static void collapse_huge_page(struct mm_struct *mm,
1757
unsigned long address,
1758
struct page **hpage,
1759
struct vm_area_struct *vma,
1760
int node)
1761
{
1762
pgd_t *pgd;
1763
pud_t *pud;
1764
pmd_t *pmd, _pmd;
1765
pte_t *pte;
1766
pgtable_t pgtable;
1767
struct page *new_page;
1768
spinlock_t *ptl;
1769
int isolated;
1770
unsigned long hstart, hend;
1771
1772
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1773
#ifndef CONFIG_NUMA
1774
up_read(&mm->mmap_sem);
1775
VM_BUG_ON(!*hpage);
1776
new_page = *hpage;
1777
#else
1778
VM_BUG_ON(*hpage);
1779
/*
1780
* Allocate the page while the vma is still valid and under
1781
* the mmap_sem read mode so there is no memory allocation
1782
* later when we take the mmap_sem in write mode. This is more
1783
* friendly behavior (OTOH it may actually hide bugs) to
1784
* filesystems in userland with daemons allocating memory in
1785
* the userland I/O paths. Allocating memory with the
1786
* mmap_sem in read mode is good idea also to allow greater
1787
* scalability.
1788
*/
1789
new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1790
node, __GFP_OTHER_NODE);
1791
1792
/*
1793
* After allocating the hugepage, release the mmap_sem read lock in
1794
* preparation for taking it in write mode.
1795
*/
1796
up_read(&mm->mmap_sem);
1797
if (unlikely(!new_page)) {
1798
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1799
*hpage = ERR_PTR(-ENOMEM);
1800
return;
1801
}
1802
#endif
1803
1804
count_vm_event(THP_COLLAPSE_ALLOC);
1805
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1806
#ifdef CONFIG_NUMA
1807
put_page(new_page);
1808
#endif
1809
return;
1810
}
1811
1812
/*
1813
* Prevent all access to pagetables with the exception of
1814
* gup_fast later hanlded by the ptep_clear_flush and the VM
1815
* handled by the anon_vma lock + PG_lock.
1816
*/
1817
down_write(&mm->mmap_sem);
1818
if (unlikely(khugepaged_test_exit(mm)))
1819
goto out;
1820
1821
vma = find_vma(mm, address);
1822
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1823
hend = vma->vm_end & HPAGE_PMD_MASK;
1824
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1825
goto out;
1826
1827
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1828
(vma->vm_flags & VM_NOHUGEPAGE))
1829
goto out;
1830
1831
if (!vma->anon_vma || vma->vm_ops)
1832
goto out;
1833
if (is_vma_temporary_stack(vma))
1834
goto out;
1835
/*
1836
* If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1837
* true too, verify it here.
1838
*/
1839
VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1840
1841
pgd = pgd_offset(mm, address);
1842
if (!pgd_present(*pgd))
1843
goto out;
1844
1845
pud = pud_offset(pgd, address);
1846
if (!pud_present(*pud))
1847
goto out;
1848
1849
pmd = pmd_offset(pud, address);
1850
/* pmd can't go away or become huge under us */
1851
if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1852
goto out;
1853
1854
anon_vma_lock(vma->anon_vma);
1855
1856
pte = pte_offset_map(pmd, address);
1857
ptl = pte_lockptr(mm, pmd);
1858
1859
spin_lock(&mm->page_table_lock); /* probably unnecessary */
1860
/*
1861
* After this gup_fast can't run anymore. This also removes
1862
* any huge TLB entry from the CPU so we won't allow
1863
* huge and small TLB entries for the same virtual address
1864
* to avoid the risk of CPU bugs in that area.
1865
*/
1866
_pmd = pmdp_clear_flush_notify(vma, address, pmd);
1867
spin_unlock(&mm->page_table_lock);
1868
1869
spin_lock(ptl);
1870
isolated = __collapse_huge_page_isolate(vma, address, pte);
1871
spin_unlock(ptl);
1872
1873
if (unlikely(!isolated)) {
1874
pte_unmap(pte);
1875
spin_lock(&mm->page_table_lock);
1876
BUG_ON(!pmd_none(*pmd));
1877
set_pmd_at(mm, address, pmd, _pmd);
1878
spin_unlock(&mm->page_table_lock);
1879
anon_vma_unlock(vma->anon_vma);
1880
goto out;
1881
}
1882
1883
/*
1884
* All pages are isolated and locked so anon_vma rmap
1885
* can't run anymore.
1886
*/
1887
anon_vma_unlock(vma->anon_vma);
1888
1889
__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1890
pte_unmap(pte);
1891
__SetPageUptodate(new_page);
1892
pgtable = pmd_pgtable(_pmd);
1893
VM_BUG_ON(page_count(pgtable) != 1);
1894
VM_BUG_ON(page_mapcount(pgtable) != 0);
1895
1896
_pmd = mk_pmd(new_page, vma->vm_page_prot);
1897
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1898
_pmd = pmd_mkhuge(_pmd);
1899
1900
/*
1901
* spin_lock() below is not the equivalent of smp_wmb(), so
1902
* this is needed to avoid the copy_huge_page writes to become
1903
* visible after the set_pmd_at() write.
1904
*/
1905
smp_wmb();
1906
1907
spin_lock(&mm->page_table_lock);
1908
BUG_ON(!pmd_none(*pmd));
1909
page_add_new_anon_rmap(new_page, vma, address);
1910
set_pmd_at(mm, address, pmd, _pmd);
1911
update_mmu_cache(vma, address, entry);
1912
prepare_pmd_huge_pte(pgtable, mm);
1913
mm->nr_ptes--;
1914
spin_unlock(&mm->page_table_lock);
1915
1916
#ifndef CONFIG_NUMA
1917
*hpage = NULL;
1918
#endif
1919
khugepaged_pages_collapsed++;
1920
out_up_write:
1921
up_write(&mm->mmap_sem);
1922
return;
1923
1924
out:
1925
mem_cgroup_uncharge_page(new_page);
1926
#ifdef CONFIG_NUMA
1927
put_page(new_page);
1928
#endif
1929
goto out_up_write;
1930
}
1931
1932
static int khugepaged_scan_pmd(struct mm_struct *mm,
1933
struct vm_area_struct *vma,
1934
unsigned long address,
1935
struct page **hpage)
1936
{
1937
pgd_t *pgd;
1938
pud_t *pud;
1939
pmd_t *pmd;
1940
pte_t *pte, *_pte;
1941
int ret = 0, referenced = 0, none = 0;
1942
struct page *page;
1943
unsigned long _address;
1944
spinlock_t *ptl;
1945
int node = -1;
1946
1947
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1948
1949
pgd = pgd_offset(mm, address);
1950
if (!pgd_present(*pgd))
1951
goto out;
1952
1953
pud = pud_offset(pgd, address);
1954
if (!pud_present(*pud))
1955
goto out;
1956
1957
pmd = pmd_offset(pud, address);
1958
if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1959
goto out;
1960
1961
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1962
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1963
_pte++, _address += PAGE_SIZE) {
1964
pte_t pteval = *_pte;
1965
if (pte_none(pteval)) {
1966
if (++none <= khugepaged_max_ptes_none)
1967
continue;
1968
else
1969
goto out_unmap;
1970
}
1971
if (!pte_present(pteval) || !pte_write(pteval))
1972
goto out_unmap;
1973
page = vm_normal_page(vma, _address, pteval);
1974
if (unlikely(!page))
1975
goto out_unmap;
1976
/*
1977
* Chose the node of the first page. This could
1978
* be more sophisticated and look at more pages,
1979
* but isn't for now.
1980
*/
1981
if (node == -1)
1982
node = page_to_nid(page);
1983
VM_BUG_ON(PageCompound(page));
1984
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1985
goto out_unmap;
1986
/* cannot use mapcount: can't collapse if there's a gup pin */
1987
if (page_count(page) != 1)
1988
goto out_unmap;
1989
if (pte_young(pteval) || PageReferenced(page) ||
1990
mmu_notifier_test_young(vma->vm_mm, address))
1991
referenced = 1;
1992
}
1993
if (referenced)
1994
ret = 1;
1995
out_unmap:
1996
pte_unmap_unlock(pte, ptl);
1997
if (ret)
1998
/* collapse_huge_page will return with the mmap_sem released */
1999
collapse_huge_page(mm, address, hpage, vma, node);
2000
out:
2001
return ret;
2002
}
2003
2004
static void collect_mm_slot(struct mm_slot *mm_slot)
2005
{
2006
struct mm_struct *mm = mm_slot->mm;
2007
2008
VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2009
2010
if (khugepaged_test_exit(mm)) {
2011
/* free mm_slot */
2012
hlist_del(&mm_slot->hash);
2013
list_del(&mm_slot->mm_node);
2014
2015
/*
2016
* Not strictly needed because the mm exited already.
2017
*
2018
* clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2019
*/
2020
2021
/* khugepaged_mm_lock actually not necessary for the below */
2022
free_mm_slot(mm_slot);
2023
mmdrop(mm);
2024
}
2025
}
2026
2027
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2028
struct page **hpage)
2029
{
2030
struct mm_slot *mm_slot;
2031
struct mm_struct *mm;
2032
struct vm_area_struct *vma;
2033
int progress = 0;
2034
2035
VM_BUG_ON(!pages);
2036
VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2037
2038
if (khugepaged_scan.mm_slot)
2039
mm_slot = khugepaged_scan.mm_slot;
2040
else {
2041
mm_slot = list_entry(khugepaged_scan.mm_head.next,
2042
struct mm_slot, mm_node);
2043
khugepaged_scan.address = 0;
2044
khugepaged_scan.mm_slot = mm_slot;
2045
}
2046
spin_unlock(&khugepaged_mm_lock);
2047
2048
mm = mm_slot->mm;
2049
down_read(&mm->mmap_sem);
2050
if (unlikely(khugepaged_test_exit(mm)))
2051
vma = NULL;
2052
else
2053
vma = find_vma(mm, khugepaged_scan.address);
2054
2055
progress++;
2056
for (; vma; vma = vma->vm_next) {
2057
unsigned long hstart, hend;
2058
2059
cond_resched();
2060
if (unlikely(khugepaged_test_exit(mm))) {
2061
progress++;
2062
break;
2063
}
2064
2065
if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2066
!khugepaged_always()) ||
2067
(vma->vm_flags & VM_NOHUGEPAGE)) {
2068
skip:
2069
progress++;
2070
continue;
2071
}
2072
if (!vma->anon_vma || vma->vm_ops)
2073
goto skip;
2074
if (is_vma_temporary_stack(vma))
2075
goto skip;
2076
/*
2077
* If is_pfn_mapping() is true is_learn_pfn_mapping()
2078
* must be true too, verify it here.
2079
*/
2080
VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2081
vma->vm_flags & VM_NO_THP);
2082
2083
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2084
hend = vma->vm_end & HPAGE_PMD_MASK;
2085
if (hstart >= hend)
2086
goto skip;
2087
if (khugepaged_scan.address > hend)
2088
goto skip;
2089
if (khugepaged_scan.address < hstart)
2090
khugepaged_scan.address = hstart;
2091
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2092
2093
while (khugepaged_scan.address < hend) {
2094
int ret;
2095
cond_resched();
2096
if (unlikely(khugepaged_test_exit(mm)))
2097
goto breakouterloop;
2098
2099
VM_BUG_ON(khugepaged_scan.address < hstart ||
2100
khugepaged_scan.address + HPAGE_PMD_SIZE >
2101
hend);
2102
ret = khugepaged_scan_pmd(mm, vma,
2103
khugepaged_scan.address,
2104
hpage);
2105
/* move to next address */
2106
khugepaged_scan.address += HPAGE_PMD_SIZE;
2107
progress += HPAGE_PMD_NR;
2108
if (ret)
2109
/* we released mmap_sem so break loop */
2110
goto breakouterloop_mmap_sem;
2111
if (progress >= pages)
2112
goto breakouterloop;
2113
}
2114
}
2115
breakouterloop:
2116
up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2117
breakouterloop_mmap_sem:
2118
2119
spin_lock(&khugepaged_mm_lock);
2120
VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2121
/*
2122
* Release the current mm_slot if this mm is about to die, or
2123
* if we scanned all vmas of this mm.
2124
*/
2125
if (khugepaged_test_exit(mm) || !vma) {
2126
/*
2127
* Make sure that if mm_users is reaching zero while
2128
* khugepaged runs here, khugepaged_exit will find
2129
* mm_slot not pointing to the exiting mm.
2130
*/
2131
if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2132
khugepaged_scan.mm_slot = list_entry(
2133
mm_slot->mm_node.next,
2134
struct mm_slot, mm_node);
2135
khugepaged_scan.address = 0;
2136
} else {
2137
khugepaged_scan.mm_slot = NULL;
2138
khugepaged_full_scans++;
2139
}
2140
2141
collect_mm_slot(mm_slot);
2142
}
2143
2144
return progress;
2145
}
2146
2147
static int khugepaged_has_work(void)
2148
{
2149
return !list_empty(&khugepaged_scan.mm_head) &&
2150
khugepaged_enabled();
2151
}
2152
2153
static int khugepaged_wait_event(void)
2154
{
2155
return !list_empty(&khugepaged_scan.mm_head) ||
2156
!khugepaged_enabled();
2157
}
2158
2159
static void khugepaged_do_scan(struct page **hpage)
2160
{
2161
unsigned int progress = 0, pass_through_head = 0;
2162
unsigned int pages = khugepaged_pages_to_scan;
2163
2164
barrier(); /* write khugepaged_pages_to_scan to local stack */
2165
2166
while (progress < pages) {
2167
cond_resched();
2168
2169
#ifndef CONFIG_NUMA
2170
if (!*hpage) {
2171
*hpage = alloc_hugepage(khugepaged_defrag());
2172
if (unlikely(!*hpage)) {
2173
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2174
break;
2175
}
2176
count_vm_event(THP_COLLAPSE_ALLOC);
2177
}
2178
#else
2179
if (IS_ERR(*hpage))
2180
break;
2181
#endif
2182
2183
if (unlikely(kthread_should_stop() || freezing(current)))
2184
break;
2185
2186
spin_lock(&khugepaged_mm_lock);
2187
if (!khugepaged_scan.mm_slot)
2188
pass_through_head++;
2189
if (khugepaged_has_work() &&
2190
pass_through_head < 2)
2191
progress += khugepaged_scan_mm_slot(pages - progress,
2192
hpage);
2193
else
2194
progress = pages;
2195
spin_unlock(&khugepaged_mm_lock);
2196
}
2197
}
2198
2199
static void khugepaged_alloc_sleep(void)
2200
{
2201
DEFINE_WAIT(wait);
2202
add_wait_queue(&khugepaged_wait, &wait);
2203
schedule_timeout_interruptible(
2204
msecs_to_jiffies(
2205
khugepaged_alloc_sleep_millisecs));
2206
remove_wait_queue(&khugepaged_wait, &wait);
2207
}
2208
2209
#ifndef CONFIG_NUMA
2210
static struct page *khugepaged_alloc_hugepage(void)
2211
{
2212
struct page *hpage;
2213
2214
do {
2215
hpage = alloc_hugepage(khugepaged_defrag());
2216
if (!hpage) {
2217
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2218
khugepaged_alloc_sleep();
2219
} else
2220
count_vm_event(THP_COLLAPSE_ALLOC);
2221
} while (unlikely(!hpage) &&
2222
likely(khugepaged_enabled()));
2223
return hpage;
2224
}
2225
#endif
2226
2227
static void khugepaged_loop(void)
2228
{
2229
struct page *hpage;
2230
2231
#ifdef CONFIG_NUMA
2232
hpage = NULL;
2233
#endif
2234
while (likely(khugepaged_enabled())) {
2235
#ifndef CONFIG_NUMA
2236
hpage = khugepaged_alloc_hugepage();
2237
if (unlikely(!hpage))
2238
break;
2239
#else
2240
if (IS_ERR(hpage)) {
2241
khugepaged_alloc_sleep();
2242
hpage = NULL;
2243
}
2244
#endif
2245
2246
khugepaged_do_scan(&hpage);
2247
#ifndef CONFIG_NUMA
2248
if (hpage)
2249
put_page(hpage);
2250
#endif
2251
try_to_freeze();
2252
if (unlikely(kthread_should_stop()))
2253
break;
2254
if (khugepaged_has_work()) {
2255
DEFINE_WAIT(wait);
2256
if (!khugepaged_scan_sleep_millisecs)
2257
continue;
2258
add_wait_queue(&khugepaged_wait, &wait);
2259
schedule_timeout_interruptible(
2260
msecs_to_jiffies(
2261
khugepaged_scan_sleep_millisecs));
2262
remove_wait_queue(&khugepaged_wait, &wait);
2263
} else if (khugepaged_enabled())
2264
wait_event_freezable(khugepaged_wait,
2265
khugepaged_wait_event());
2266
}
2267
}
2268
2269
static int khugepaged(void *none)
2270
{
2271
struct mm_slot *mm_slot;
2272
2273
set_freezable();
2274
set_user_nice(current, 19);
2275
2276
/* serialize with start_khugepaged() */
2277
mutex_lock(&khugepaged_mutex);
2278
2279
for (;;) {
2280
mutex_unlock(&khugepaged_mutex);
2281
VM_BUG_ON(khugepaged_thread != current);
2282
khugepaged_loop();
2283
VM_BUG_ON(khugepaged_thread != current);
2284
2285
mutex_lock(&khugepaged_mutex);
2286
if (!khugepaged_enabled())
2287
break;
2288
if (unlikely(kthread_should_stop()))
2289
break;
2290
}
2291
2292
spin_lock(&khugepaged_mm_lock);
2293
mm_slot = khugepaged_scan.mm_slot;
2294
khugepaged_scan.mm_slot = NULL;
2295
if (mm_slot)
2296
collect_mm_slot(mm_slot);
2297
spin_unlock(&khugepaged_mm_lock);
2298
2299
khugepaged_thread = NULL;
2300
mutex_unlock(&khugepaged_mutex);
2301
2302
return 0;
2303
}
2304
2305
/*
 * Split the huge page mapped by @pmd, if @pmd is a transparent huge pmd.
 *
 * @mm:  address space that owns @pmd.
 * @pmd: pmd entry to examine.
 *
 * The pmd is checked and a reference on its page is taken under
 * mm->page_table_lock; the split itself runs after the lock is dropped.
 * Does nothing if the pmd is not huge when checked under the lock.
 */
void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
{
	struct page *page = NULL;
	int is_huge;

	spin_lock(&mm->page_table_lock);
	is_huge = pmd_trans_huge(*pmd);
	if (likely(is_huge)) {
		page = pmd_page(*pmd);
		VM_BUG_ON(!page_count(page));
		/* Hold a reference across the unlocked split below. */
		get_page(page);
	}
	spin_unlock(&mm->page_table_lock);

	if (unlikely(!is_huge))
		return;

	split_huge_page(page);

	put_page(page);
	/* The pmd must no longer be huge at this point. */
	BUG_ON(pmd_trans_huge(*pmd));
}
2324
2325
static void split_huge_page_address(struct mm_struct *mm,
2326
unsigned long address)
2327
{
2328
pgd_t *pgd;
2329
pud_t *pud;
2330
pmd_t *pmd;
2331
2332
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2333
2334
pgd = pgd_offset(mm, address);
2335
if (!pgd_present(*pgd))
2336
return;
2337
2338
pud = pud_offset(pgd, address);
2339
if (!pud_present(*pud))
2340
return;
2341
2342
pmd = pmd_offset(pud, address);
2343
if (!pmd_present(*pmd))
2344
return;
2345
/*
2346
* Caller holds the mmap_sem write mode, so a huge pmd cannot
2347
* materialize from under us.
2348
*/
2349
split_huge_page_pmd(mm, pmd);
2350
}
2351
2352
void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2353
unsigned long start,
2354
unsigned long end,
2355
long adjust_next)
2356
{
2357
/*
2358
* If the new start address isn't hpage aligned and it could
2359
* previously contain an hugepage: check if we need to split
2360
* an huge pmd.
2361
*/
2362
if (start & ~HPAGE_PMD_MASK &&
2363
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
2364
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2365
split_huge_page_address(vma->vm_mm, start);
2366
2367
/*
2368
* If the new end address isn't hpage aligned and it could
2369
* previously contain an hugepage: check if we need to split
2370
* an huge pmd.
2371
*/
2372
if (end & ~HPAGE_PMD_MASK &&
2373
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
2374
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2375
split_huge_page_address(vma->vm_mm, end);
2376
2377
/*
2378
* If we're also updating the vma->vm_next->vm_start, if the new
2379
* vm_next->vm_start isn't page aligned and it could previously
2380
* contain an hugepage: check if we need to split an huge pmd.
2381
*/
2382
if (adjust_next > 0) {
2383
struct vm_area_struct *next = vma->vm_next;
2384
unsigned long nstart = next->vm_start;
2385
nstart += adjust_next << PAGE_SHIFT;
2386
if (nstart & ~HPAGE_PMD_MASK &&
2387
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2388
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2389
split_huge_page_address(next->vm_mm, nstart);
2390
}
2391
}
2392
2393