Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/mm/huge_memory.c
26131 views
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Copyright (C) 2009 Red Hat, Inc.
4
*/
5
6
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8
#include <linux/mm.h>
9
#include <linux/sched.h>
10
#include <linux/sched/mm.h>
11
#include <linux/sched/numa_balancing.h>
12
#include <linux/highmem.h>
13
#include <linux/hugetlb.h>
14
#include <linux/mmu_notifier.h>
15
#include <linux/rmap.h>
16
#include <linux/swap.h>
17
#include <linux/shrinker.h>
18
#include <linux/mm_inline.h>
19
#include <linux/swapops.h>
20
#include <linux/backing-dev.h>
21
#include <linux/dax.h>
22
#include <linux/mm_types.h>
23
#include <linux/khugepaged.h>
24
#include <linux/freezer.h>
25
#include <linux/mman.h>
26
#include <linux/memremap.h>
27
#include <linux/pagemap.h>
28
#include <linux/debugfs.h>
29
#include <linux/migrate.h>
30
#include <linux/hashtable.h>
31
#include <linux/userfaultfd_k.h>
32
#include <linux/page_idle.h>
33
#include <linux/shmem_fs.h>
34
#include <linux/oom.h>
35
#include <linux/numa.h>
36
#include <linux/page_owner.h>
37
#include <linux/sched/sysctl.h>
38
#include <linux/memory-tiers.h>
39
#include <linux/compat.h>
40
#include <linux/pgalloc_tag.h>
41
#include <linux/pagewalk.h>
42
43
#include <asm/tlb.h>
44
#include <asm/pgalloc.h>
45
#include "internal.h"
46
#include "swap.h"
47
48
#define CREATE_TRACE_POINTS
49
#include <trace/events/thp.h>
50
51
/*
52
* By default, transparent hugepage support is disabled in order to avoid
53
* risking an increased memory footprint for applications that are not
54
* guaranteed to benefit from it. When transparent hugepage support is
55
* enabled, it is for all mappings, and khugepaged scans all mappings.
56
* Defrag is invoked by khugepaged hugepage allocations and by page faults
57
* for all hugepage allocations.
58
*/
59
unsigned long transparent_hugepage_flags __read_mostly =
60
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
61
(1<<TRANSPARENT_HUGEPAGE_FLAG)|
62
#endif
63
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
64
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
65
#endif
66
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
67
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
68
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
69
70
static struct shrinker *deferred_split_shrinker;
71
static unsigned long deferred_split_count(struct shrinker *shrink,
72
struct shrink_control *sc);
73
static unsigned long deferred_split_scan(struct shrinker *shrink,
74
struct shrink_control *sc);
75
static bool split_underused_thp = true;
76
77
static atomic_t huge_zero_refcount;
78
struct folio *huge_zero_folio __read_mostly;
79
unsigned long huge_zero_pfn __read_mostly = ~0UL;
80
unsigned long huge_anon_orders_always __read_mostly;
81
unsigned long huge_anon_orders_madvise __read_mostly;
82
unsigned long huge_anon_orders_inherit __read_mostly;
83
static bool anon_orders_configured __initdata;
84
85
static inline bool file_thp_enabled(struct vm_area_struct *vma)
86
{
87
struct inode *inode;
88
89
if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
90
return false;
91
92
if (!vma->vm_file)
93
return false;
94
95
inode = file_inode(vma->vm_file);
96
97
return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
98
}
99
100
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
101
vm_flags_t vm_flags,
102
unsigned long tva_flags,
103
unsigned long orders)
104
{
105
bool smaps = tva_flags & TVA_SMAPS;
106
bool in_pf = tva_flags & TVA_IN_PF;
107
bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
108
unsigned long supported_orders;
109
110
/* Check the intersection of requested and supported orders. */
111
if (vma_is_anonymous(vma))
112
supported_orders = THP_ORDERS_ALL_ANON;
113
else if (vma_is_special_huge(vma))
114
supported_orders = THP_ORDERS_ALL_SPECIAL;
115
else
116
supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
117
118
orders &= supported_orders;
119
if (!orders)
120
return 0;
121
122
if (!vma->vm_mm) /* vdso */
123
return 0;
124
125
if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
126
return 0;
127
128
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
129
if (vma_is_dax(vma))
130
return in_pf ? orders : 0;
131
132
/*
133
* khugepaged special VMA and hugetlb VMA.
134
* Must be checked after dax since some dax mappings may have
135
* VM_MIXEDMAP set.
136
*/
137
if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
138
return 0;
139
140
/*
141
* Check alignment for file vma and size for both file and anon vma by
142
* filtering out the unsuitable orders.
143
*
144
* Skip the check for page fault. Huge fault does the check in fault
145
* handlers.
146
*/
147
if (!in_pf) {
148
int order = highest_order(orders);
149
unsigned long addr;
150
151
while (orders) {
152
addr = vma->vm_end - (PAGE_SIZE << order);
153
if (thp_vma_suitable_order(vma, addr, order))
154
break;
155
order = next_order(&orders, order);
156
}
157
158
if (!orders)
159
return 0;
160
}
161
162
/*
163
* Enabled via shmem mount options or sysfs settings.
164
* Must be done before hugepage flags check since shmem has its
165
* own flags.
166
*/
167
if (!in_pf && shmem_file(vma->vm_file))
168
return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
169
vma, vma->vm_pgoff, 0,
170
!enforce_sysfs);
171
172
if (!vma_is_anonymous(vma)) {
173
/*
174
* Enforce sysfs THP requirements as necessary. Anonymous vmas
175
* were already handled in thp_vma_allowable_orders().
176
*/
177
if (enforce_sysfs &&
178
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
179
!hugepage_global_always())))
180
return 0;
181
182
/*
183
* Trust that ->huge_fault() handlers know what they are doing
184
* in fault path.
185
*/
186
if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
187
return orders;
188
/* Only regular file is valid in collapse path */
189
if (((!in_pf || smaps)) && file_thp_enabled(vma))
190
return orders;
191
return 0;
192
}
193
194
if (vma_is_temporary_stack(vma))
195
return 0;
196
197
/*
198
* THPeligible bit of smaps should show 1 for proper VMAs even
199
* though anon_vma is not initialized yet.
200
*
201
* Allow page fault since anon_vma may be not initialized until
202
* the first page fault.
203
*/
204
if (!vma->anon_vma)
205
return (smaps || in_pf) ? orders : 0;
206
207
return orders;
208
}
209
210
static bool get_huge_zero_page(void)
211
{
212
struct folio *zero_folio;
213
retry:
214
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
215
return true;
216
217
zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
218
HPAGE_PMD_ORDER);
219
if (!zero_folio) {
220
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
221
return false;
222
}
223
/* Ensure zero folio won't have large_rmappable flag set. */
224
folio_clear_large_rmappable(zero_folio);
225
preempt_disable();
226
if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
227
preempt_enable();
228
folio_put(zero_folio);
229
goto retry;
230
}
231
WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
232
233
/* We take additional reference here. It will be put back by shrinker */
234
atomic_set(&huge_zero_refcount, 2);
235
preempt_enable();
236
count_vm_event(THP_ZERO_PAGE_ALLOC);
237
return true;
238
}
239
240
static void put_huge_zero_page(void)
241
{
242
/*
243
* Counter should never go to zero here. Only shrinker can put
244
* last reference.
245
*/
246
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
247
}
248
249
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
250
{
251
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
252
return READ_ONCE(huge_zero_folio);
253
254
if (!get_huge_zero_page())
255
return NULL;
256
257
if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
258
put_huge_zero_page();
259
260
return READ_ONCE(huge_zero_folio);
261
}
262
263
void mm_put_huge_zero_folio(struct mm_struct *mm)
264
{
265
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
266
put_huge_zero_page();
267
}
268
269
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
270
struct shrink_control *sc)
271
{
272
/* we can free zero page only if last reference remains */
273
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
274
}
275
276
static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
277
struct shrink_control *sc)
278
{
279
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
280
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
281
BUG_ON(zero_folio == NULL);
282
WRITE_ONCE(huge_zero_pfn, ~0UL);
283
folio_put(zero_folio);
284
return HPAGE_PMD_NR;
285
}
286
287
return 0;
288
}
289
290
static struct shrinker *huge_zero_page_shrinker;
291
292
#ifdef CONFIG_SYSFS
293
static ssize_t enabled_show(struct kobject *kobj,
294
struct kobj_attribute *attr, char *buf)
295
{
296
const char *output;
297
298
if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
299
output = "[always] madvise never";
300
else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
301
&transparent_hugepage_flags))
302
output = "always [madvise] never";
303
else
304
output = "always madvise [never]";
305
306
return sysfs_emit(buf, "%s\n", output);
307
}
308
309
static ssize_t enabled_store(struct kobject *kobj,
310
struct kobj_attribute *attr,
311
const char *buf, size_t count)
312
{
313
ssize_t ret = count;
314
315
if (sysfs_streq(buf, "always")) {
316
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
317
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
318
} else if (sysfs_streq(buf, "madvise")) {
319
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
320
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
321
} else if (sysfs_streq(buf, "never")) {
322
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
323
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
324
} else
325
ret = -EINVAL;
326
327
if (ret > 0) {
328
int err = start_stop_khugepaged();
329
if (err)
330
ret = err;
331
}
332
return ret;
333
}
334
335
static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
336
337
ssize_t single_hugepage_flag_show(struct kobject *kobj,
338
struct kobj_attribute *attr, char *buf,
339
enum transparent_hugepage_flag flag)
340
{
341
return sysfs_emit(buf, "%d\n",
342
!!test_bit(flag, &transparent_hugepage_flags));
343
}
344
345
ssize_t single_hugepage_flag_store(struct kobject *kobj,
346
struct kobj_attribute *attr,
347
const char *buf, size_t count,
348
enum transparent_hugepage_flag flag)
349
{
350
unsigned long value;
351
int ret;
352
353
ret = kstrtoul(buf, 10, &value);
354
if (ret < 0)
355
return ret;
356
if (value > 1)
357
return -EINVAL;
358
359
if (value)
360
set_bit(flag, &transparent_hugepage_flags);
361
else
362
clear_bit(flag, &transparent_hugepage_flags);
363
364
return count;
365
}
366
367
static ssize_t defrag_show(struct kobject *kobj,
368
struct kobj_attribute *attr, char *buf)
369
{
370
const char *output;
371
372
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
373
&transparent_hugepage_flags))
374
output = "[always] defer defer+madvise madvise never";
375
else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
376
&transparent_hugepage_flags))
377
output = "always [defer] defer+madvise madvise never";
378
else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
379
&transparent_hugepage_flags))
380
output = "always defer [defer+madvise] madvise never";
381
else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
382
&transparent_hugepage_flags))
383
output = "always defer defer+madvise [madvise] never";
384
else
385
output = "always defer defer+madvise madvise [never]";
386
387
return sysfs_emit(buf, "%s\n", output);
388
}
389
390
static ssize_t defrag_store(struct kobject *kobj,
391
struct kobj_attribute *attr,
392
const char *buf, size_t count)
393
{
394
if (sysfs_streq(buf, "always")) {
395
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
396
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
397
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
398
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
399
} else if (sysfs_streq(buf, "defer+madvise")) {
400
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
401
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
402
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
403
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
404
} else if (sysfs_streq(buf, "defer")) {
405
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
406
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
407
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
408
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
409
} else if (sysfs_streq(buf, "madvise")) {
410
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
411
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
412
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
413
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
414
} else if (sysfs_streq(buf, "never")) {
415
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
416
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
417
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
418
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
419
} else
420
return -EINVAL;
421
422
return count;
423
}
424
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
425
426
static ssize_t use_zero_page_show(struct kobject *kobj,
427
struct kobj_attribute *attr, char *buf)
428
{
429
return single_hugepage_flag_show(kobj, attr, buf,
430
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
431
}
432
static ssize_t use_zero_page_store(struct kobject *kobj,
433
struct kobj_attribute *attr, const char *buf, size_t count)
434
{
435
return single_hugepage_flag_store(kobj, attr, buf, count,
436
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
437
}
438
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
439
440
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
441
struct kobj_attribute *attr, char *buf)
442
{
443
return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
444
}
445
static struct kobj_attribute hpage_pmd_size_attr =
446
__ATTR_RO(hpage_pmd_size);
447
448
static ssize_t split_underused_thp_show(struct kobject *kobj,
449
struct kobj_attribute *attr, char *buf)
450
{
451
return sysfs_emit(buf, "%d\n", split_underused_thp);
452
}
453
454
static ssize_t split_underused_thp_store(struct kobject *kobj,
455
struct kobj_attribute *attr,
456
const char *buf, size_t count)
457
{
458
int err = kstrtobool(buf, &split_underused_thp);
459
460
if (err < 0)
461
return err;
462
463
return count;
464
}
465
466
static struct kobj_attribute split_underused_thp_attr = __ATTR(
467
shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
468
469
static struct attribute *hugepage_attr[] = {
470
&enabled_attr.attr,
471
&defrag_attr.attr,
472
&use_zero_page_attr.attr,
473
&hpage_pmd_size_attr.attr,
474
#ifdef CONFIG_SHMEM
475
&shmem_enabled_attr.attr,
476
#endif
477
&split_underused_thp_attr.attr,
478
NULL,
479
};
480
481
static const struct attribute_group hugepage_attr_group = {
482
.attrs = hugepage_attr,
483
};
484
485
static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
486
static void thpsize_release(struct kobject *kobj);
487
static DEFINE_SPINLOCK(huge_anon_orders_lock);
488
static LIST_HEAD(thpsize_list);
489
490
static ssize_t anon_enabled_show(struct kobject *kobj,
491
struct kobj_attribute *attr, char *buf)
492
{
493
int order = to_thpsize(kobj)->order;
494
const char *output;
495
496
if (test_bit(order, &huge_anon_orders_always))
497
output = "[always] inherit madvise never";
498
else if (test_bit(order, &huge_anon_orders_inherit))
499
output = "always [inherit] madvise never";
500
else if (test_bit(order, &huge_anon_orders_madvise))
501
output = "always inherit [madvise] never";
502
else
503
output = "always inherit madvise [never]";
504
505
return sysfs_emit(buf, "%s\n", output);
506
}
507
508
static ssize_t anon_enabled_store(struct kobject *kobj,
509
struct kobj_attribute *attr,
510
const char *buf, size_t count)
511
{
512
int order = to_thpsize(kobj)->order;
513
ssize_t ret = count;
514
515
if (sysfs_streq(buf, "always")) {
516
spin_lock(&huge_anon_orders_lock);
517
clear_bit(order, &huge_anon_orders_inherit);
518
clear_bit(order, &huge_anon_orders_madvise);
519
set_bit(order, &huge_anon_orders_always);
520
spin_unlock(&huge_anon_orders_lock);
521
} else if (sysfs_streq(buf, "inherit")) {
522
spin_lock(&huge_anon_orders_lock);
523
clear_bit(order, &huge_anon_orders_always);
524
clear_bit(order, &huge_anon_orders_madvise);
525
set_bit(order, &huge_anon_orders_inherit);
526
spin_unlock(&huge_anon_orders_lock);
527
} else if (sysfs_streq(buf, "madvise")) {
528
spin_lock(&huge_anon_orders_lock);
529
clear_bit(order, &huge_anon_orders_always);
530
clear_bit(order, &huge_anon_orders_inherit);
531
set_bit(order, &huge_anon_orders_madvise);
532
spin_unlock(&huge_anon_orders_lock);
533
} else if (sysfs_streq(buf, "never")) {
534
spin_lock(&huge_anon_orders_lock);
535
clear_bit(order, &huge_anon_orders_always);
536
clear_bit(order, &huge_anon_orders_inherit);
537
clear_bit(order, &huge_anon_orders_madvise);
538
spin_unlock(&huge_anon_orders_lock);
539
} else
540
ret = -EINVAL;
541
542
if (ret > 0) {
543
int err;
544
545
err = start_stop_khugepaged();
546
if (err)
547
ret = err;
548
}
549
return ret;
550
}
551
552
static struct kobj_attribute anon_enabled_attr =
553
__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
554
555
static struct attribute *anon_ctrl_attrs[] = {
556
&anon_enabled_attr.attr,
557
NULL,
558
};
559
560
static const struct attribute_group anon_ctrl_attr_grp = {
561
.attrs = anon_ctrl_attrs,
562
};
563
564
static struct attribute *file_ctrl_attrs[] = {
565
#ifdef CONFIG_SHMEM
566
&thpsize_shmem_enabled_attr.attr,
567
#endif
568
NULL,
569
};
570
571
static const struct attribute_group file_ctrl_attr_grp = {
572
.attrs = file_ctrl_attrs,
573
};
574
575
static struct attribute *any_ctrl_attrs[] = {
576
NULL,
577
};
578
579
static const struct attribute_group any_ctrl_attr_grp = {
580
.attrs = any_ctrl_attrs,
581
};
582
583
static const struct kobj_type thpsize_ktype = {
584
.release = &thpsize_release,
585
.sysfs_ops = &kobj_sysfs_ops,
586
};
587
588
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
589
590
static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
591
{
592
unsigned long sum = 0;
593
int cpu;
594
595
for_each_possible_cpu(cpu) {
596
struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
597
598
sum += this->stats[order][item];
599
}
600
601
return sum;
602
}
603
604
#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
605
static ssize_t _name##_show(struct kobject *kobj, \
606
struct kobj_attribute *attr, char *buf) \
607
{ \
608
int order = to_thpsize(kobj)->order; \
609
\
610
return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
611
} \
612
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
613
614
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
615
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
616
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
617
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
618
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
619
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
620
DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
621
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
622
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
623
#ifdef CONFIG_SHMEM
624
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
625
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
626
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
627
#endif
628
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
629
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
630
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
631
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
632
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
633
634
static struct attribute *anon_stats_attrs[] = {
635
&anon_fault_alloc_attr.attr,
636
&anon_fault_fallback_attr.attr,
637
&anon_fault_fallback_charge_attr.attr,
638
#ifndef CONFIG_SHMEM
639
&zswpout_attr.attr,
640
&swpin_attr.attr,
641
&swpin_fallback_attr.attr,
642
&swpin_fallback_charge_attr.attr,
643
&swpout_attr.attr,
644
&swpout_fallback_attr.attr,
645
#endif
646
&split_deferred_attr.attr,
647
&nr_anon_attr.attr,
648
&nr_anon_partially_mapped_attr.attr,
649
NULL,
650
};
651
652
static struct attribute_group anon_stats_attr_grp = {
653
.name = "stats",
654
.attrs = anon_stats_attrs,
655
};
656
657
static struct attribute *file_stats_attrs[] = {
658
#ifdef CONFIG_SHMEM
659
&shmem_alloc_attr.attr,
660
&shmem_fallback_attr.attr,
661
&shmem_fallback_charge_attr.attr,
662
#endif
663
NULL,
664
};
665
666
static struct attribute_group file_stats_attr_grp = {
667
.name = "stats",
668
.attrs = file_stats_attrs,
669
};
670
671
static struct attribute *any_stats_attrs[] = {
672
#ifdef CONFIG_SHMEM
673
&zswpout_attr.attr,
674
&swpin_attr.attr,
675
&swpin_fallback_attr.attr,
676
&swpin_fallback_charge_attr.attr,
677
&swpout_attr.attr,
678
&swpout_fallback_attr.attr,
679
#endif
680
&split_attr.attr,
681
&split_failed_attr.attr,
682
NULL,
683
};
684
685
static struct attribute_group any_stats_attr_grp = {
686
.name = "stats",
687
.attrs = any_stats_attrs,
688
};
689
690
static int sysfs_add_group(struct kobject *kobj,
691
const struct attribute_group *grp)
692
{
693
int ret = -ENOENT;
694
695
/*
696
* If the group is named, try to merge first, assuming the subdirectory
697
* was already created. This avoids the warning emitted by
698
* sysfs_create_group() if the directory already exists.
699
*/
700
if (grp->name)
701
ret = sysfs_merge_group(kobj, grp);
702
if (ret)
703
ret = sysfs_create_group(kobj, grp);
704
705
return ret;
706
}
707
708
static struct thpsize *thpsize_create(int order, struct kobject *parent)
709
{
710
unsigned long size = (PAGE_SIZE << order) / SZ_1K;
711
struct thpsize *thpsize;
712
int ret = -ENOMEM;
713
714
thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
715
if (!thpsize)
716
goto err;
717
718
thpsize->order = order;
719
720
ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
721
"hugepages-%lukB", size);
722
if (ret) {
723
kfree(thpsize);
724
goto err;
725
}
726
727
728
ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
729
if (ret)
730
goto err_put;
731
732
ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
733
if (ret)
734
goto err_put;
735
736
if (BIT(order) & THP_ORDERS_ALL_ANON) {
737
ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
738
if (ret)
739
goto err_put;
740
741
ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
742
if (ret)
743
goto err_put;
744
}
745
746
if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
747
ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
748
if (ret)
749
goto err_put;
750
751
ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
752
if (ret)
753
goto err_put;
754
}
755
756
return thpsize;
757
err_put:
758
kobject_put(&thpsize->kobj);
759
err:
760
return ERR_PTR(ret);
761
}
762
763
static void thpsize_release(struct kobject *kobj)
764
{
765
kfree(to_thpsize(kobj));
766
}
767
768
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
769
{
770
int err;
771
struct thpsize *thpsize;
772
unsigned long orders;
773
int order;
774
775
/*
776
* Default to setting PMD-sized THP to inherit the global setting and
777
* disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
778
* constant so we have to do this here.
779
*/
780
if (!anon_orders_configured)
781
huge_anon_orders_inherit = BIT(PMD_ORDER);
782
783
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
784
if (unlikely(!*hugepage_kobj)) {
785
pr_err("failed to create transparent hugepage kobject\n");
786
return -ENOMEM;
787
}
788
789
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
790
if (err) {
791
pr_err("failed to register transparent hugepage group\n");
792
goto delete_obj;
793
}
794
795
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
796
if (err) {
797
pr_err("failed to register transparent hugepage group\n");
798
goto remove_hp_group;
799
}
800
801
orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
802
order = highest_order(orders);
803
while (orders) {
804
thpsize = thpsize_create(order, *hugepage_kobj);
805
if (IS_ERR(thpsize)) {
806
pr_err("failed to create thpsize for order %d\n", order);
807
err = PTR_ERR(thpsize);
808
goto remove_all;
809
}
810
list_add(&thpsize->node, &thpsize_list);
811
order = next_order(&orders, order);
812
}
813
814
return 0;
815
816
remove_all:
817
hugepage_exit_sysfs(*hugepage_kobj);
818
return err;
819
remove_hp_group:
820
sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
821
delete_obj:
822
kobject_put(*hugepage_kobj);
823
return err;
824
}
825
826
static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
827
{
828
struct thpsize *thpsize, *tmp;
829
830
list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
831
list_del(&thpsize->node);
832
kobject_put(&thpsize->kobj);
833
}
834
835
sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
836
sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
837
kobject_put(hugepage_kobj);
838
}
839
#else
840
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
841
{
842
return 0;
843
}
844
845
static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
846
{
847
}
848
#endif /* CONFIG_SYSFS */
849
850
static int __init thp_shrinker_init(void)
851
{
852
huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
853
if (!huge_zero_page_shrinker)
854
return -ENOMEM;
855
856
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
857
SHRINKER_MEMCG_AWARE |
858
SHRINKER_NONSLAB,
859
"thp-deferred_split");
860
if (!deferred_split_shrinker) {
861
shrinker_free(huge_zero_page_shrinker);
862
return -ENOMEM;
863
}
864
865
huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
866
huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
867
shrinker_register(huge_zero_page_shrinker);
868
869
deferred_split_shrinker->count_objects = deferred_split_count;
870
deferred_split_shrinker->scan_objects = deferred_split_scan;
871
shrinker_register(deferred_split_shrinker);
872
873
return 0;
874
}
875
876
static void __init thp_shrinker_exit(void)
877
{
878
shrinker_free(huge_zero_page_shrinker);
879
shrinker_free(deferred_split_shrinker);
880
}
881
882
static int __init hugepage_init(void)
883
{
884
int err;
885
struct kobject *hugepage_kobj;
886
887
if (!has_transparent_hugepage()) {
888
transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
889
return -EINVAL;
890
}
891
892
/*
893
* hugepages can't be allocated by the buddy allocator
894
*/
895
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
896
897
err = hugepage_init_sysfs(&hugepage_kobj);
898
if (err)
899
goto err_sysfs;
900
901
err = khugepaged_init();
902
if (err)
903
goto err_slab;
904
905
err = thp_shrinker_init();
906
if (err)
907
goto err_shrinker;
908
909
/*
910
* By default disable transparent hugepages on smaller systems,
911
* where the extra memory used could hurt more than TLB overhead
912
* is likely to save. The admin can still enable it through /sys.
913
*/
914
if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
915
transparent_hugepage_flags = 0;
916
return 0;
917
}
918
919
err = start_stop_khugepaged();
920
if (err)
921
goto err_khugepaged;
922
923
return 0;
924
err_khugepaged:
925
thp_shrinker_exit();
926
err_shrinker:
927
khugepaged_destroy();
928
err_slab:
929
hugepage_exit_sysfs(hugepage_kobj);
930
err_sysfs:
931
return err;
932
}
933
subsys_initcall(hugepage_init);
934
935
static int __init setup_transparent_hugepage(char *str)
936
{
937
int ret = 0;
938
if (!str)
939
goto out;
940
if (!strcmp(str, "always")) {
941
set_bit(TRANSPARENT_HUGEPAGE_FLAG,
942
&transparent_hugepage_flags);
943
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
944
&transparent_hugepage_flags);
945
ret = 1;
946
} else if (!strcmp(str, "madvise")) {
947
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
948
&transparent_hugepage_flags);
949
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
950
&transparent_hugepage_flags);
951
ret = 1;
952
} else if (!strcmp(str, "never")) {
953
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
954
&transparent_hugepage_flags);
955
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
956
&transparent_hugepage_flags);
957
ret = 1;
958
}
959
out:
960
if (!ret)
961
pr_warn("transparent_hugepage= cannot parse, ignored\n");
962
return ret;
963
}
964
__setup("transparent_hugepage=", setup_transparent_hugepage);
965
966
static char str_dup[PAGE_SIZE] __initdata;
967
static int __init setup_thp_anon(char *str)
968
{
969
char *token, *range, *policy, *subtoken;
970
unsigned long always, inherit, madvise;
971
char *start_size, *end_size;
972
int start, end, nr;
973
char *p;
974
975
if (!str || strlen(str) + 1 > PAGE_SIZE)
976
goto err;
977
strscpy(str_dup, str);
978
979
always = huge_anon_orders_always;
980
madvise = huge_anon_orders_madvise;
981
inherit = huge_anon_orders_inherit;
982
p = str_dup;
983
while ((token = strsep(&p, ";")) != NULL) {
984
range = strsep(&token, ":");
985
policy = token;
986
987
if (!policy)
988
goto err;
989
990
while ((subtoken = strsep(&range, ",")) != NULL) {
991
if (strchr(subtoken, '-')) {
992
start_size = strsep(&subtoken, "-");
993
end_size = subtoken;
994
995
start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
996
end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
997
} else {
998
start_size = end_size = subtoken;
999
start = end = get_order_from_str(subtoken,
1000
THP_ORDERS_ALL_ANON);
1001
}
1002
1003
if (start == -EINVAL) {
1004
pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
1005
goto err;
1006
}
1007
1008
if (end == -EINVAL) {
1009
pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
1010
goto err;
1011
}
1012
1013
if (start < 0 || end < 0 || start > end)
1014
goto err;
1015
1016
nr = end - start + 1;
1017
if (!strcmp(policy, "always")) {
1018
bitmap_set(&always, start, nr);
1019
bitmap_clear(&inherit, start, nr);
1020
bitmap_clear(&madvise, start, nr);
1021
} else if (!strcmp(policy, "madvise")) {
1022
bitmap_set(&madvise, start, nr);
1023
bitmap_clear(&inherit, start, nr);
1024
bitmap_clear(&always, start, nr);
1025
} else if (!strcmp(policy, "inherit")) {
1026
bitmap_set(&inherit, start, nr);
1027
bitmap_clear(&madvise, start, nr);
1028
bitmap_clear(&always, start, nr);
1029
} else if (!strcmp(policy, "never")) {
1030
bitmap_clear(&inherit, start, nr);
1031
bitmap_clear(&madvise, start, nr);
1032
bitmap_clear(&always, start, nr);
1033
} else {
1034
pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
1035
goto err;
1036
}
1037
}
1038
}
1039
1040
huge_anon_orders_always = always;
1041
huge_anon_orders_madvise = madvise;
1042
huge_anon_orders_inherit = inherit;
1043
anon_orders_configured = true;
1044
return 1;
1045
1046
err:
1047
pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
1048
return 0;
1049
}
1050
__setup("thp_anon=", setup_thp_anon);
1051
1052
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
1053
{
1054
if (likely(vma->vm_flags & VM_WRITE))
1055
pmd = pmd_mkwrite(pmd, vma);
1056
return pmd;
1057
}
1058
1059
#ifdef CONFIG_MEMCG
1060
static inline
1061
struct deferred_split *get_deferred_split_queue(struct folio *folio)
1062
{
1063
struct mem_cgroup *memcg = folio_memcg(folio);
1064
struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
1065
1066
if (memcg)
1067
return &memcg->deferred_split_queue;
1068
else
1069
return &pgdat->deferred_split_queue;
1070
}
1071
#else
1072
static inline
1073
struct deferred_split *get_deferred_split_queue(struct folio *folio)
1074
{
1075
struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
1076
1077
return &pgdat->deferred_split_queue;
1078
}
1079
#endif
1080
1081
static inline bool is_transparent_hugepage(const struct folio *folio)
1082
{
1083
if (!folio_test_large(folio))
1084
return false;
1085
1086
return is_huge_zero_folio(folio) ||
1087
folio_test_large_rmappable(folio);
1088
}
1089
1090
static unsigned long __thp_get_unmapped_area(struct file *filp,
1091
unsigned long addr, unsigned long len,
1092
loff_t off, unsigned long flags, unsigned long size,
1093
vm_flags_t vm_flags)
1094
{
1095
loff_t off_end = off + len;
1096
loff_t off_align = round_up(off, size);
1097
unsigned long len_pad, ret, off_sub;
1098
1099
if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
1100
return 0;
1101
1102
if (off_end <= off_align || (off_end - off_align) < size)
1103
return 0;
1104
1105
len_pad = len + size;
1106
if (len_pad < len || (off + len_pad) < off)
1107
return 0;
1108
1109
ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
1110
off >> PAGE_SHIFT, flags, vm_flags);
1111
1112
/*
1113
* The failure might be due to length padding. The caller will retry
1114
* without the padding.
1115
*/
1116
if (IS_ERR_VALUE(ret))
1117
return 0;
1118
1119
/*
1120
* Do not try to align to THP boundary if allocation at the address
1121
* hint succeeds.
1122
*/
1123
if (ret == addr)
1124
return addr;
1125
1126
off_sub = (off - ret) & (size - 1);
1127
1128
if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
1129
return ret + size;
1130
1131
ret += off_sub;
1132
return ret;
1133
}
1134
1135
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
1136
unsigned long len, unsigned long pgoff, unsigned long flags,
1137
vm_flags_t vm_flags)
1138
{
1139
unsigned long ret;
1140
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
1141
1142
ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
1143
if (ret)
1144
return ret;
1145
1146
return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
1147
vm_flags);
1148
}
1149
1150
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
1151
unsigned long len, unsigned long pgoff, unsigned long flags)
1152
{
1153
return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
1154
}
1155
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
1156
1157
static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
1158
unsigned long addr)
1159
{
1160
gfp_t gfp = vma_thp_gfp_mask(vma);
1161
const int order = HPAGE_PMD_ORDER;
1162
struct folio *folio;
1163
1164
folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
1165
1166
if (unlikely(!folio)) {
1167
count_vm_event(THP_FAULT_FALLBACK);
1168
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
1169
return NULL;
1170
}
1171
1172
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
1173
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
1174
folio_put(folio);
1175
count_vm_event(THP_FAULT_FALLBACK);
1176
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
1177
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
1178
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
1179
return NULL;
1180
}
1181
folio_throttle_swaprate(folio, gfp);
1182
1183
/*
1184
* When a folio is not zeroed during allocation (__GFP_ZERO not used)
1185
* or user folios require special handling, folio_zero_user() is used to
1186
* make sure that the page corresponding to the faulting address will be
1187
* hot in the cache after zeroing.
1188
*/
1189
if (user_alloc_needs_zeroing())
1190
folio_zero_user(folio, addr);
1191
/*
1192
* The memory barrier inside __folio_mark_uptodate makes sure that
1193
* folio_zero_user writes become visible before the set_pmd_at()
1194
* write.
1195
*/
1196
__folio_mark_uptodate(folio);
1197
return folio;
1198
}
1199
1200
static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
1201
struct vm_area_struct *vma, unsigned long haddr)
1202
{
1203
pmd_t entry;
1204
1205
entry = folio_mk_pmd(folio, vma->vm_page_prot);
1206
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1207
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
1208
folio_add_lru_vma(folio, vma);
1209
set_pmd_at(vma->vm_mm, haddr, pmd, entry);
1210
update_mmu_cache_pmd(vma, haddr, pmd);
1211
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1212
count_vm_event(THP_FAULT_ALLOC);
1213
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
1214
count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
1215
}
1216
1217
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1218
{
1219
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1220
struct vm_area_struct *vma = vmf->vma;
1221
struct folio *folio;
1222
pgtable_t pgtable;
1223
vm_fault_t ret = 0;
1224
1225
folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
1226
if (unlikely(!folio))
1227
return VM_FAULT_FALLBACK;
1228
1229
pgtable = pte_alloc_one(vma->vm_mm);
1230
if (unlikely(!pgtable)) {
1231
ret = VM_FAULT_OOM;
1232
goto release;
1233
}
1234
1235
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1236
if (unlikely(!pmd_none(*vmf->pmd))) {
1237
goto unlock_release;
1238
} else {
1239
ret = check_stable_address_space(vma->vm_mm);
1240
if (ret)
1241
goto unlock_release;
1242
1243
/* Deliver the page fault to userland */
1244
if (userfaultfd_missing(vma)) {
1245
spin_unlock(vmf->ptl);
1246
folio_put(folio);
1247
pte_free(vma->vm_mm, pgtable);
1248
ret = handle_userfault(vmf, VM_UFFD_MISSING);
1249
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1250
return ret;
1251
}
1252
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
1253
map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
1254
mm_inc_nr_ptes(vma->vm_mm);
1255
deferred_split_folio(folio, false);
1256
spin_unlock(vmf->ptl);
1257
}
1258
1259
return 0;
1260
unlock_release:
1261
spin_unlock(vmf->ptl);
1262
release:
1263
if (pgtable)
1264
pte_free(vma->vm_mm, pgtable);
1265
folio_put(folio);
1266
return ret;
1267
1268
}
1269
1270
/*
1271
* always: directly stall for all thp allocations
1272
* defer: wake kswapd and fail if not immediately available
1273
* defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
1274
* fail if not immediately available
1275
* madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
1276
* available
1277
* never: never stall for any thp allocation
1278
*/
1279
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
1280
{
1281
const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
1282
1283
/* Always do synchronous compaction */
1284
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
1285
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
1286
1287
/* Kick kcompactd and fail quickly */
1288
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
1289
return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
1290
1291
/* Synchronous compaction if madvised, otherwise kick kcompactd */
1292
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
1293
return GFP_TRANSHUGE_LIGHT |
1294
(vma_madvised ? __GFP_DIRECT_RECLAIM :
1295
__GFP_KSWAPD_RECLAIM);
1296
1297
/* Only do synchronous compaction if madvised */
1298
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
1299
return GFP_TRANSHUGE_LIGHT |
1300
(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
1301
1302
return GFP_TRANSHUGE_LIGHT;
1303
}
1304
1305
/* Caller must hold page table lock. */
1306
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
1307
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
1308
struct folio *zero_folio)
1309
{
1310
pmd_t entry;
1311
entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
1312
pgtable_trans_huge_deposit(mm, pmd, pgtable);
1313
set_pmd_at(mm, haddr, pmd, entry);
1314
mm_inc_nr_ptes(mm);
1315
}
1316
1317
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
1318
{
1319
struct vm_area_struct *vma = vmf->vma;
1320
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1321
vm_fault_t ret;
1322
1323
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
1324
return VM_FAULT_FALLBACK;
1325
ret = vmf_anon_prepare(vmf);
1326
if (ret)
1327
return ret;
1328
khugepaged_enter_vma(vma, vma->vm_flags);
1329
1330
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
1331
!mm_forbids_zeropage(vma->vm_mm) &&
1332
transparent_hugepage_use_zero_page()) {
1333
pgtable_t pgtable;
1334
struct folio *zero_folio;
1335
vm_fault_t ret;
1336
1337
pgtable = pte_alloc_one(vma->vm_mm);
1338
if (unlikely(!pgtable))
1339
return VM_FAULT_OOM;
1340
zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
1341
if (unlikely(!zero_folio)) {
1342
pte_free(vma->vm_mm, pgtable);
1343
count_vm_event(THP_FAULT_FALLBACK);
1344
return VM_FAULT_FALLBACK;
1345
}
1346
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1347
ret = 0;
1348
if (pmd_none(*vmf->pmd)) {
1349
ret = check_stable_address_space(vma->vm_mm);
1350
if (ret) {
1351
spin_unlock(vmf->ptl);
1352
pte_free(vma->vm_mm, pgtable);
1353
} else if (userfaultfd_missing(vma)) {
1354
spin_unlock(vmf->ptl);
1355
pte_free(vma->vm_mm, pgtable);
1356
ret = handle_userfault(vmf, VM_UFFD_MISSING);
1357
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1358
} else {
1359
set_huge_zero_folio(pgtable, vma->vm_mm, vma,
1360
haddr, vmf->pmd, zero_folio);
1361
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1362
spin_unlock(vmf->ptl);
1363
}
1364
} else {
1365
spin_unlock(vmf->ptl);
1366
pte_free(vma->vm_mm, pgtable);
1367
}
1368
return ret;
1369
}
1370
1371
return __do_huge_pmd_anonymous_page(vmf);
1372
}
1373
1374
struct folio_or_pfn {
1375
union {
1376
struct folio *folio;
1377
unsigned long pfn;
1378
};
1379
bool is_folio;
1380
};
1381
1382
static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
1383
pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
1384
bool write, pgtable_t pgtable)
1385
{
1386
struct mm_struct *mm = vma->vm_mm;
1387
pmd_t entry;
1388
1389
lockdep_assert_held(pmd_lockptr(mm, pmd));
1390
1391
if (!pmd_none(*pmd)) {
1392
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
1393
fop.pfn;
1394
1395
if (write) {
1396
if (pmd_pfn(*pmd) != pfn) {
1397
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1398
return -EEXIST;
1399
}
1400
entry = pmd_mkyoung(*pmd);
1401
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1402
if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1403
update_mmu_cache_pmd(vma, addr, pmd);
1404
}
1405
1406
return -EEXIST;
1407
}
1408
1409
if (fop.is_folio) {
1410
entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
1411
1412
folio_get(fop.folio);
1413
folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
1414
add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
1415
} else {
1416
entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
1417
entry = pmd_mkspecial(entry);
1418
}
1419
if (write) {
1420
entry = pmd_mkyoung(pmd_mkdirty(entry));
1421
entry = maybe_pmd_mkwrite(entry, vma);
1422
}
1423
1424
if (pgtable) {
1425
pgtable_trans_huge_deposit(mm, pmd, pgtable);
1426
mm_inc_nr_ptes(mm);
1427
}
1428
1429
set_pmd_at(mm, addr, pmd, entry);
1430
update_mmu_cache_pmd(vma, addr, pmd);
1431
return 0;
1432
}
1433
1434
/**
1435
* vmf_insert_pfn_pmd - insert a pmd size pfn
1436
* @vmf: Structure describing the fault
1437
* @pfn: pfn to insert
1438
* @write: whether it's a write fault
1439
*
1440
* Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
1441
*
1442
* Return: vm_fault_t value.
1443
*/
1444
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
1445
bool write)
1446
{
1447
unsigned long addr = vmf->address & PMD_MASK;
1448
struct vm_area_struct *vma = vmf->vma;
1449
pgprot_t pgprot = vma->vm_page_prot;
1450
struct folio_or_pfn fop = {
1451
.pfn = pfn,
1452
};
1453
pgtable_t pgtable = NULL;
1454
spinlock_t *ptl;
1455
int error;
1456
1457
/*
1458
* If we had pmd_special, we could avoid all these restrictions,
1459
* but we need to be consistent with PTEs and architectures that
1460
* can't support a 'special' bit.
1461
*/
1462
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1463
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1464
(VM_PFNMAP|VM_MIXEDMAP));
1465
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1466
1467
if (addr < vma->vm_start || addr >= vma->vm_end)
1468
return VM_FAULT_SIGBUS;
1469
1470
if (arch_needs_pgtable_deposit()) {
1471
pgtable = pte_alloc_one(vma->vm_mm);
1472
if (!pgtable)
1473
return VM_FAULT_OOM;
1474
}
1475
1476
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
1477
1478
ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1479
error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
1480
pgtable);
1481
spin_unlock(ptl);
1482
if (error && pgtable)
1483
pte_free(vma->vm_mm, pgtable);
1484
1485
return VM_FAULT_NOPAGE;
1486
}
1487
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
1488
1489
vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
1490
bool write)
1491
{
1492
struct vm_area_struct *vma = vmf->vma;
1493
unsigned long addr = vmf->address & PMD_MASK;
1494
struct mm_struct *mm = vma->vm_mm;
1495
struct folio_or_pfn fop = {
1496
.folio = folio,
1497
.is_folio = true,
1498
};
1499
spinlock_t *ptl;
1500
pgtable_t pgtable = NULL;
1501
int error;
1502
1503
if (addr < vma->vm_start || addr >= vma->vm_end)
1504
return VM_FAULT_SIGBUS;
1505
1506
if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
1507
return VM_FAULT_SIGBUS;
1508
1509
if (arch_needs_pgtable_deposit()) {
1510
pgtable = pte_alloc_one(vma->vm_mm);
1511
if (!pgtable)
1512
return VM_FAULT_OOM;
1513
}
1514
1515
ptl = pmd_lock(mm, vmf->pmd);
1516
error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
1517
write, pgtable);
1518
spin_unlock(ptl);
1519
if (error && pgtable)
1520
pte_free(mm, pgtable);
1521
1522
return VM_FAULT_NOPAGE;
1523
}
1524
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
1525
1526
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1527
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
1528
{
1529
if (likely(vma->vm_flags & VM_WRITE))
1530
pud = pud_mkwrite(pud);
1531
return pud;
1532
}
1533
1534
static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
1535
pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
1536
{
1537
struct mm_struct *mm = vma->vm_mm;
1538
pud_t entry;
1539
1540
if (!pud_none(*pud)) {
1541
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
1542
fop.pfn;
1543
1544
if (write) {
1545
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
1546
return;
1547
entry = pud_mkyoung(*pud);
1548
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1549
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1550
update_mmu_cache_pud(vma, addr, pud);
1551
}
1552
return;
1553
}
1554
1555
if (fop.is_folio) {
1556
entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
1557
1558
folio_get(fop.folio);
1559
folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
1560
add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
1561
} else {
1562
entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
1563
entry = pud_mkspecial(entry);
1564
}
1565
if (write) {
1566
entry = pud_mkyoung(pud_mkdirty(entry));
1567
entry = maybe_pud_mkwrite(entry, vma);
1568
}
1569
set_pud_at(mm, addr, pud, entry);
1570
update_mmu_cache_pud(vma, addr, pud);
1571
}
1572
1573
/**
1574
* vmf_insert_pfn_pud - insert a pud size pfn
1575
* @vmf: Structure describing the fault
1576
* @pfn: pfn to insert
1577
* @write: whether it's a write fault
1578
*
1579
* Insert a pud size pfn. See vmf_insert_pfn() for additional info.
1580
*
1581
* Return: vm_fault_t value.
1582
*/
1583
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
1584
bool write)
1585
{
1586
unsigned long addr = vmf->address & PUD_MASK;
1587
struct vm_area_struct *vma = vmf->vma;
1588
pgprot_t pgprot = vma->vm_page_prot;
1589
struct folio_or_pfn fop = {
1590
.pfn = pfn,
1591
};
1592
spinlock_t *ptl;
1593
1594
/*
1595
* If we had pud_special, we could avoid all these restrictions,
1596
* but we need to be consistent with PTEs and architectures that
1597
* can't support a 'special' bit.
1598
*/
1599
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1600
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1601
(VM_PFNMAP|VM_MIXEDMAP));
1602
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1603
1604
if (addr < vma->vm_start || addr >= vma->vm_end)
1605
return VM_FAULT_SIGBUS;
1606
1607
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
1608
1609
ptl = pud_lock(vma->vm_mm, vmf->pud);
1610
insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
1611
spin_unlock(ptl);
1612
1613
return VM_FAULT_NOPAGE;
1614
}
1615
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
1616
1617
/**
1618
* vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
1619
* @vmf: Structure describing the fault
1620
* @folio: folio to insert
1621
* @write: whether it's a write fault
1622
*
1623
* Return: vm_fault_t value.
1624
*/
1625
vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
1626
bool write)
1627
{
1628
struct vm_area_struct *vma = vmf->vma;
1629
unsigned long addr = vmf->address & PUD_MASK;
1630
pud_t *pud = vmf->pud;
1631
struct mm_struct *mm = vma->vm_mm;
1632
struct folio_or_pfn fop = {
1633
.folio = folio,
1634
.is_folio = true,
1635
};
1636
spinlock_t *ptl;
1637
1638
if (addr < vma->vm_start || addr >= vma->vm_end)
1639
return VM_FAULT_SIGBUS;
1640
1641
if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
1642
return VM_FAULT_SIGBUS;
1643
1644
ptl = pud_lock(mm, pud);
1645
insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
1646
spin_unlock(ptl);
1647
1648
return VM_FAULT_NOPAGE;
1649
}
1650
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
1651
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1652
1653
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1654
pmd_t *pmd, bool write)
1655
{
1656
pmd_t _pmd;
1657
1658
_pmd = pmd_mkyoung(*pmd);
1659
if (write)
1660
_pmd = pmd_mkdirty(_pmd);
1661
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
1662
pmd, _pmd, write))
1663
update_mmu_cache_pmd(vma, addr, pmd);
1664
}
1665
1666
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1667
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
1668
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
1669
{
1670
spinlock_t *dst_ptl, *src_ptl;
1671
struct page *src_page;
1672
struct folio *src_folio;
1673
pmd_t pmd;
1674
pgtable_t pgtable = NULL;
1675
int ret = -ENOMEM;
1676
1677
pmd = pmdp_get_lockless(src_pmd);
1678
if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
1679
dst_ptl = pmd_lock(dst_mm, dst_pmd);
1680
src_ptl = pmd_lockptr(src_mm, src_pmd);
1681
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1682
/*
1683
* No need to recheck the pmd, it can't change with write
1684
* mmap lock held here.
1685
*
1686
* Meanwhile, making sure it's not a CoW VMA with writable
1687
* mapping, otherwise it means either the anon page wrongly
1688
* applied special bit, or we made the PRIVATE mapping be
1689
* able to wrongly write to the backend MMIO.
1690
*/
1691
VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
1692
goto set_pmd;
1693
}
1694
1695
/* Skip if can be re-fill on fault */
1696
if (!vma_is_anonymous(dst_vma))
1697
return 0;
1698
1699
pgtable = pte_alloc_one(dst_mm);
1700
if (unlikely(!pgtable))
1701
goto out;
1702
1703
dst_ptl = pmd_lock(dst_mm, dst_pmd);
1704
src_ptl = pmd_lockptr(src_mm, src_pmd);
1705
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1706
1707
ret = -EAGAIN;
1708
pmd = *src_pmd;
1709
1710
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1711
if (unlikely(is_swap_pmd(pmd))) {
1712
swp_entry_t entry = pmd_to_swp_entry(pmd);
1713
1714
VM_BUG_ON(!is_pmd_migration_entry(pmd));
1715
if (!is_readable_migration_entry(entry)) {
1716
entry = make_readable_migration_entry(
1717
swp_offset(entry));
1718
pmd = swp_entry_to_pmd(entry);
1719
if (pmd_swp_soft_dirty(*src_pmd))
1720
pmd = pmd_swp_mksoft_dirty(pmd);
1721
if (pmd_swp_uffd_wp(*src_pmd))
1722
pmd = pmd_swp_mkuffd_wp(pmd);
1723
set_pmd_at(src_mm, addr, src_pmd, pmd);
1724
}
1725
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1726
mm_inc_nr_ptes(dst_mm);
1727
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1728
if (!userfaultfd_wp(dst_vma))
1729
pmd = pmd_swp_clear_uffd_wp(pmd);
1730
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1731
ret = 0;
1732
goto out_unlock;
1733
}
1734
#endif
1735
1736
if (unlikely(!pmd_trans_huge(pmd))) {
1737
pte_free(dst_mm, pgtable);
1738
goto out_unlock;
1739
}
1740
/*
1741
* When page table lock is held, the huge zero pmd should not be
1742
* under splitting since we don't split the page itself, only pmd to
1743
* a page table.
1744
*/
1745
if (is_huge_zero_pmd(pmd)) {
1746
/*
1747
* mm_get_huge_zero_folio() will never allocate a new
1748
* folio here, since we already have a zero page to
1749
* copy. It just takes a reference.
1750
*/
1751
mm_get_huge_zero_folio(dst_mm);
1752
goto out_zero_page;
1753
}
1754
1755
src_page = pmd_page(pmd);
1756
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1757
src_folio = page_folio(src_page);
1758
1759
folio_get(src_folio);
1760
if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
1761
/* Page maybe pinned: split and retry the fault on PTEs. */
1762
folio_put(src_folio);
1763
pte_free(dst_mm, pgtable);
1764
spin_unlock(src_ptl);
1765
spin_unlock(dst_ptl);
1766
__split_huge_pmd(src_vma, src_pmd, addr, false);
1767
return -EAGAIN;
1768
}
1769
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1770
out_zero_page:
1771
mm_inc_nr_ptes(dst_mm);
1772
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1773
pmdp_set_wrprotect(src_mm, addr, src_pmd);
1774
if (!userfaultfd_wp(dst_vma))
1775
pmd = pmd_clear_uffd_wp(pmd);
1776
pmd = pmd_wrprotect(pmd);
1777
set_pmd:
1778
pmd = pmd_mkold(pmd);
1779
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1780
1781
ret = 0;
1782
out_unlock:
1783
spin_unlock(src_ptl);
1784
spin_unlock(dst_ptl);
1785
out:
1786
return ret;
1787
}
1788
1789
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1790
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1791
pud_t *pud, bool write)
1792
{
1793
pud_t _pud;
1794
1795
_pud = pud_mkyoung(*pud);
1796
if (write)
1797
_pud = pud_mkdirty(_pud);
1798
if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1799
pud, _pud, write))
1800
update_mmu_cache_pud(vma, addr, pud);
1801
}
1802
1803
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1804
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1805
struct vm_area_struct *vma)
1806
{
1807
spinlock_t *dst_ptl, *src_ptl;
1808
pud_t pud;
1809
int ret;
1810
1811
dst_ptl = pud_lock(dst_mm, dst_pud);
1812
src_ptl = pud_lockptr(src_mm, src_pud);
1813
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1814
1815
ret = -EAGAIN;
1816
pud = *src_pud;
1817
if (unlikely(!pud_trans_huge(pud)))
1818
goto out_unlock;
1819
1820
/*
1821
* TODO: once we support anonymous pages, use
1822
* folio_try_dup_anon_rmap_*() and split if duplicating fails.
1823
*/
1824
if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
1825
pudp_set_wrprotect(src_mm, addr, src_pud);
1826
pud = pud_wrprotect(pud);
1827
}
1828
pud = pud_mkold(pud);
1829
set_pud_at(dst_mm, addr, dst_pud, pud);
1830
1831
ret = 0;
1832
out_unlock:
1833
spin_unlock(src_ptl);
1834
spin_unlock(dst_ptl);
1835
return ret;
1836
}
1837
1838
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1839
{
1840
bool write = vmf->flags & FAULT_FLAG_WRITE;
1841
1842
vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1843
if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1844
goto unlock;
1845
1846
touch_pud(vmf->vma, vmf->address, vmf->pud, write);
1847
unlock:
1848
spin_unlock(vmf->ptl);
1849
}
1850
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1851
1852
void huge_pmd_set_accessed(struct vm_fault *vmf)
1853
{
1854
bool write = vmf->flags & FAULT_FLAG_WRITE;
1855
1856
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1857
if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
1858
goto unlock;
1859
1860
touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
1861
1862
unlock:
1863
spin_unlock(vmf->ptl);
1864
}
1865
1866
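/*
* Write fault on the huge zero page: allocate a fresh anonymous PMD-sized
* folio and map it in place of the zero page, or return VM_FAULT_FALLBACK
* so the fault is retried after splitting the PMD.
*/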
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
1867
{
1868
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1869
struct vm_area_struct *vma = vmf->vma;
1870
struct mmu_notifier_range range;
1871
struct folio *folio;
1872
vm_fault_t ret = 0;
1873
1874
folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
1875
if (unlikely(!folio))
1876
return VM_FAULT_FALLBACK;
1877
1878
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
1879
haddr + HPAGE_PMD_SIZE);
1880
mmu_notifier_invalidate_range_start(&range);
1881
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1882
if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
1883
goto release;
1884
ret = check_stable_address_space(vma->vm_mm);
1885
if (ret)
1886
goto release;
1887
(void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
1888
map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
1889
goto unlock;
1890
release:
1891
folio_put(folio);
1892
unlock:
1893
spin_unlock(vmf->ptl);
1894
mmu_notifier_invalidate_range_end(&range);
1895
return ret;
1896
}
1897
1898
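/*
* Write (or unshare) fault on an anonymous huge PMD: the huge zero page is
* replaced via do_huge_zero_wp_pmd(); otherwise the folio is reused when we
* can claim it exclusively, or the PMD is split and the fault retried at
* PTE granularity.
*/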
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
1899
{
1900
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
1901
struct vm_area_struct *vma = vmf->vma;
1902
struct folio *folio;
1903
struct page *page;
1904
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1905
pmd_t orig_pmd = vmf->orig_pmd;
1906
1907
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1908
VM_BUG_ON_VMA(!vma->anon_vma, vma);
1909
1910
if (is_huge_zero_pmd(orig_pmd)) {
1911
vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
1912
1913
if (!(ret & VM_FAULT_FALLBACK))
1914
return ret;
1915
1916
/* Fallback to splitting PMD if THP cannot be allocated */
1917
goto fallback;
1918
}
1919
1920
spin_lock(vmf->ptl);
1921
1922
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1923
spin_unlock(vmf->ptl);
1924
return 0;
1925
}
1926
1927
page = pmd_page(orig_pmd);
1928
folio = page_folio(page);
1929
VM_BUG_ON_PAGE(!PageHead(page), page);
1930
1931
/* Early check when only holding the PT lock. */
1932
if (PageAnonExclusive(page))
1933
goto reuse;
1934
1935
if (!folio_trylock(folio)) {
1936
folio_get(folio);
1937
spin_unlock(vmf->ptl);
1938
folio_lock(folio);
1939
spin_lock(vmf->ptl);
1940
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1941
spin_unlock(vmf->ptl);
1942
folio_unlock(folio);
1943
folio_put(folio);
1944
return 0;
1945
}
1946
folio_put(folio);
1947
}
1948
1949
/* Recheck after temporarily dropping the PT lock. */
1950
if (PageAnonExclusive(page)) {
1951
folio_unlock(folio);
1952
goto reuse;
1953
}
1954
1955
/*
1956
* See do_wp_page(): we can only reuse the folio exclusively if
1957
* there are no additional references. Note that we always drain
1958
* the LRU cache immediately after adding a THP.
1959
*/
1960
if (folio_ref_count(folio) >
1961
1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
1962
goto unlock_fallback;
1963
if (folio_test_swapcache(folio))
1964
folio_free_swap(folio);
1965
if (folio_ref_count(folio) == 1) {
1966
pmd_t entry;
1967
1968
folio_move_anon_rmap(folio, vma);
1969
SetPageAnonExclusive(page);
1970
folio_unlock(folio);
1971
reuse:
1972
if (unlikely(unshare)) {
1973
spin_unlock(vmf->ptl);
1974
return 0;
1975
}
1976
entry = pmd_mkyoung(orig_pmd);
1977
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1978
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1979
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1980
spin_unlock(vmf->ptl);
1981
return 0;
1982
}
1983
1984
unlock_fallback:
1985
folio_unlock(folio);
1986
spin_unlock(vmf->ptl);
1987
fallback:
1988
__split_huge_pmd(vma, vmf->pmd, vmf->address, false);
1989
return VM_FAULT_FALLBACK;
1990
}
1991
1992
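/*
* Huge PMD variant of can_change_pte_writable(): decide whether the PMD may
* be mapped writable immediately, without forcing a later write fault.
*/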
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1993
unsigned long addr, pmd_t pmd)
1994
{
1995
struct page *page;
1996
1997
if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1998
return false;
1999
2000
/* Don't touch entries that are not even readable (NUMA hinting). */
2001
if (pmd_protnone(pmd))
2002
return false;
2003
2004
/* Do we need write faults for softdirty tracking? */
2005
if (pmd_needs_soft_dirty_wp(vma, pmd))
2006
return false;
2007
2008
/* Do we need write faults for uffd-wp tracking? */
2009
if (userfaultfd_huge_pmd_wp(vma, pmd))
2010
return false;
2011
2012
if (!(vma->vm_flags & VM_SHARED)) {
2013
/* See can_change_pte_writable(). */
2014
page = vm_normal_page_pmd(vma, addr, pmd);
2015
return page && PageAnon(page) && PageAnonExclusive(page);
2016
}
2017
2018
/* See can_change_pte_writable(). */
2019
return pmd_dirty(pmd);
2020
}
2021
2022
/* NUMA hinting page fault entry point for trans huge pmds */
2023
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
2024
{
2025
struct vm_area_struct *vma = vmf->vma;
2026
struct folio *folio;
2027
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2028
int nid = NUMA_NO_NODE;
2029
int target_nid, last_cpupid;
2030
pmd_t pmd, old_pmd;
2031
bool writable = false;
2032
int flags = 0;
2033
2034
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2035
old_pmd = pmdp_get(vmf->pmd);
2036
2037
if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
2038
spin_unlock(vmf->ptl);
2039
return 0;
2040
}
2041
2042
pmd = pmd_modify(old_pmd, vma->vm_page_prot);
2043
2044
/*
2045
* Detect now whether the PMD could be writable; this information
2046
* is only valid while holding the PT lock.
2047
*/
2048
writable = pmd_write(pmd);
2049
if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
2050
can_change_pmd_writable(vma, vmf->address, pmd))
2051
writable = true;
2052
2053
folio = vm_normal_folio_pmd(vma, haddr, pmd);
2054
if (!folio)
2055
goto out_map;
2056
2057
nid = folio_nid(folio);
2058
2059
target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
2060
&last_cpupid);
2061
if (target_nid == NUMA_NO_NODE)
2062
goto out_map;
2063
if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
2064
flags |= TNF_MIGRATE_FAIL;
2065
goto out_map;
2066
}
2067
/* The folio is isolated and isolation code holds a folio reference. */
2068
spin_unlock(vmf->ptl);
2069
writable = false;
2070
2071
if (!migrate_misplaced_folio(folio, target_nid)) {
2072
flags |= TNF_MIGRATED;
2073
nid = target_nid;
2074
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
2075
return 0;
2076
}
2077
2078
flags |= TNF_MIGRATE_FAIL;
2079
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2080
if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
2081
spin_unlock(vmf->ptl);
2082
return 0;
2083
}
2084
out_map:
2085
/* Restore the PMD */
2086
pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
2087
pmd = pmd_mkyoung(pmd);
2088
if (writable)
2089
pmd = pmd_mkwrite(pmd, vma);
2090
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
2091
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
2092
spin_unlock(vmf->ptl);
2093
2094
if (nid != NUMA_NO_NODE)
2095
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
2096
return 0;
2097
}
2098
2099
/*
2100
* Return true if we do MADV_FREE successfully on the entire pmd page.
2101
* Otherwise, return false.
2102
*/
2103
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2104
pmd_t *pmd, unsigned long addr, unsigned long next)
2105
{
2106
spinlock_t *ptl;
2107
pmd_t orig_pmd;
2108
struct folio *folio;
2109
struct mm_struct *mm = tlb->mm;
2110
bool ret = false;
2111
2112
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2113
2114
ptl = pmd_trans_huge_lock(pmd, vma);
2115
if (!ptl)
2116
goto out_unlocked;
2117
2118
orig_pmd = *pmd;
2119
if (is_huge_zero_pmd(orig_pmd))
2120
goto out;
2121
2122
if (unlikely(!pmd_present(orig_pmd))) {
2123
VM_BUG_ON(thp_migration_supported() &&
2124
!is_pmd_migration_entry(orig_pmd));
2125
goto out;
2126
}
2127
2128
folio = pmd_folio(orig_pmd);
2129
/*
2130
* If other processes are mapping this folio, we can't discard
2131
* the folio unless they all do MADV_FREE so let's skip the folio.
2132
*/
2133
if (folio_maybe_mapped_shared(folio))
2134
goto out;
2135
2136
if (!folio_trylock(folio))
2137
goto out;
2138
2139
/*
2140
* If the user wants to discard part of the THP's pages, split it so MADV_FREE
2141
* will deactivate only them.
2142
*/
2143
if (next - addr != HPAGE_PMD_SIZE) {
2144
folio_get(folio);
2145
spin_unlock(ptl);
2146
split_folio(folio);
2147
folio_unlock(folio);
2148
folio_put(folio);
2149
goto out_unlocked;
2150
}
2151
2152
if (folio_test_dirty(folio))
2153
folio_clear_dirty(folio);
2154
folio_unlock(folio);
2155
2156
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
2157
pmdp_invalidate(vma, addr, pmd);
2158
orig_pmd = pmd_mkold(orig_pmd);
2159
orig_pmd = pmd_mkclean(orig_pmd);
2160
2161
set_pmd_at(mm, addr, pmd, orig_pmd);
2162
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2163
}
2164
2165
folio_mark_lazyfree(folio);
2166
ret = true;
2167
out:
2168
spin_unlock(ptl);
2169
out_unlocked:
2170
return ret;
2171
}
2172
2173
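/*
* Withdraw the page table deposited for a huge PMD, free it and drop the
* mm's page-table count.
*/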
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
2174
{
2175
pgtable_t pgtable;
2176
2177
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2178
pte_free(mm, pgtable);
2179
mm_dec_nr_ptes(mm);
2180
}
2181
2182
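/*
* Zap one huge PMD entry under the PT lock: clear it, update rmap and
* counters, and (for a normal folio) hand the page to the mmu_gather for
* freeing. Returns 1 if a huge entry was zapped, 0 if the PMD was not
* (or no longer is) huge.
*/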
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2183
pmd_t *pmd, unsigned long addr)
2184
{
2185
pmd_t orig_pmd;
2186
spinlock_t *ptl;
2187
2188
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2189
2190
ptl = __pmd_trans_huge_lock(pmd, vma);
2191
if (!ptl)
2192
return 0;
2193
/*
2194
* For architectures like ppc64 we look at deposited pgtable
2195
* when calling pmdp_huge_get_and_clear. So do the
2196
* pgtable_trans_huge_withdraw after finishing pmdp related
2197
* operations.
2198
*/
2199
orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
2200
tlb->fullmm);
2201
arch_check_zapped_pmd(vma, orig_pmd);
2202
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2203
if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2204
if (arch_needs_pgtable_deposit())
2205
zap_deposited_table(tlb->mm, pmd);
2206
spin_unlock(ptl);
2207
} else if (is_huge_zero_pmd(orig_pmd)) {
2208
if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
2209
zap_deposited_table(tlb->mm, pmd);
2210
spin_unlock(ptl);
2211
} else {
2212
struct folio *folio = NULL;
2213
int flush_needed = 1;
2214
2215
if (pmd_present(orig_pmd)) {
2216
struct page *page = pmd_page(orig_pmd);
2217
2218
folio = page_folio(page);
2219
folio_remove_rmap_pmd(folio, page, vma);
2220
WARN_ON_ONCE(folio_mapcount(folio) < 0);
2221
VM_BUG_ON_PAGE(!PageHead(page), page);
2222
} else if (thp_migration_supported()) {
2223
swp_entry_t entry;
2224
2225
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
2226
entry = pmd_to_swp_entry(orig_pmd);
2227
folio = pfn_swap_entry_folio(entry);
2228
flush_needed = 0;
2229
} else
2230
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
2231
2232
if (folio_test_anon(folio)) {
2233
zap_deposited_table(tlb->mm, pmd);
2234
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2235
} else {
2236
if (arch_needs_pgtable_deposit())
2237
zap_deposited_table(tlb->mm, pmd);
2238
add_mm_counter(tlb->mm, mm_counter_file(folio),
2239
-HPAGE_PMD_NR);
2240
2241
/*
2242
* Use flush_needed to indicate whether the PMD entry
2243
* is present, instead of checking pmd_present() again.
2244
*/
2245
if (flush_needed && pmd_young(orig_pmd) &&
2246
likely(vma_has_recency(vma)))
2247
folio_mark_accessed(folio);
2248
}
2249
2250
spin_unlock(ptl);
2251
if (flush_needed)
2252
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
2253
}
2254
return 1;
2255
}
2256
2257
#ifndef pmd_move_must_withdraw
2258
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2259
spinlock_t *old_pmd_ptl,
2260
struct vm_area_struct *vma)
2261
{
2262
/*
2263
* With split pmd lock we also need to move preallocated
2264
* PTE page table if new_pmd is on a different PMD page table.
2265
*
2266
* We also don't deposit and withdraw tables for file pages.
2267
*/
2268
return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2269
}
2270
#endif
2271
2272
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2273
{
2274
#ifdef CONFIG_MEM_SOFT_DIRTY
2275
if (unlikely(is_pmd_migration_entry(pmd)))
2276
pmd = pmd_swp_mksoft_dirty(pmd);
2277
else if (pmd_present(pmd))
2278
pmd = pmd_mksoft_dirty(pmd);
2279
#endif
2280
return pmd;
2281
}
2282
2283
static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
2284
{
2285
if (pmd_present(pmd))
2286
pmd = pmd_clear_uffd_wp(pmd);
2287
else if (is_swap_pmd(pmd))
2288
pmd = pmd_swp_clear_uffd_wp(pmd);
2289
2290
return pmd;
2291
}
2292
2293
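/*
* mremap() helper: move a huge PMD entry (and, when required, its deposited
* page table) from old_addr to new_addr. Returns true if the entry was moved
* as a whole, false if the caller must fall back to moving at a smaller
* granularity.
*/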
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
2294
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
2295
{
2296
spinlock_t *old_ptl, *new_ptl;
2297
pmd_t pmd;
2298
struct mm_struct *mm = vma->vm_mm;
2299
bool force_flush = false;
2300
2301
/*
2302
* The destination pmd shouldn't be established, free_pgtables()
2303
* should have released it; but move_page_tables() might have already
2304
* inserted a page table, if racing against shmem/file collapse.
2305
*/
2306
if (!pmd_none(*new_pmd)) {
2307
VM_BUG_ON(pmd_trans_huge(*new_pmd));
2308
return false;
2309
}
2310
2311
/*
2312
* We don't have to worry about the ordering of src and dst
2313
* ptlocks because exclusive mmap_lock prevents deadlock.
2314
*/
2315
old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2316
if (old_ptl) {
2317
new_ptl = pmd_lockptr(mm, new_pmd);
2318
if (new_ptl != old_ptl)
2319
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
2320
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
2321
if (pmd_present(pmd))
2322
force_flush = true;
2323
VM_BUG_ON(!pmd_none(*new_pmd));
2324
2325
if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
2326
pgtable_t pgtable;
2327
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2328
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
2329
}
2330
pmd = move_soft_dirty_pmd(pmd);
2331
if (vma_has_uffd_without_event_remap(vma))
2332
pmd = clear_uffd_wp_pmd(pmd);
2333
set_pmd_at(mm, new_addr, new_pmd, pmd);
2334
if (force_flush)
2335
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
2336
if (new_ptl != old_ptl)
2337
spin_unlock(new_ptl);
2338
spin_unlock(old_ptl);
2339
return true;
2340
}
2341
return false;
2342
}
2343
2344
/*
2345
* Returns
2346
* - 0 if PMD could not be locked
2347
* - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
2348
* or if prot_numa but THP migration is not supported
2349
* - HPAGE_PMD_NR if protections changed and TLB flush necessary
2350
*/
2351
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2352
pmd_t *pmd, unsigned long addr, pgprot_t newprot,
2353
unsigned long cp_flags)
2354
{
2355
struct mm_struct *mm = vma->vm_mm;
2356
spinlock_t *ptl;
2357
pmd_t oldpmd, entry;
2358
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
2359
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2360
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
2361
int ret = 1;
2362
2363
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2364
2365
if (prot_numa && !thp_migration_supported())
2366
return 1;
2367
2368
ptl = __pmd_trans_huge_lock(pmd, vma);
2369
if (!ptl)
2370
return 0;
2371
2372
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2373
if (is_swap_pmd(*pmd)) {
2374
swp_entry_t entry = pmd_to_swp_entry(*pmd);
2375
struct folio *folio = pfn_swap_entry_folio(entry);
2376
pmd_t newpmd;
2377
2378
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
2379
if (is_writable_migration_entry(entry)) {
2380
/*
2381
* A protection check is difficult so
2382
* just be safe and disable write
2383
*/
2384
if (folio_test_anon(folio))
2385
entry = make_readable_exclusive_migration_entry(swp_offset(entry));
2386
else
2387
entry = make_readable_migration_entry(swp_offset(entry));
2388
newpmd = swp_entry_to_pmd(entry);
2389
if (pmd_swp_soft_dirty(*pmd))
2390
newpmd = pmd_swp_mksoft_dirty(newpmd);
2391
} else {
2392
newpmd = *pmd;
2393
}
2394
2395
if (uffd_wp)
2396
newpmd = pmd_swp_mkuffd_wp(newpmd);
2397
else if (uffd_wp_resolve)
2398
newpmd = pmd_swp_clear_uffd_wp(newpmd);
2399
if (!pmd_same(*pmd, newpmd))
2400
set_pmd_at(mm, addr, pmd, newpmd);
2401
goto unlock;
2402
}
2403
#endif
2404
2405
if (prot_numa) {
2406
struct folio *folio;
2407
bool toptier;
2408
/*
2409
* Avoid trapping faults against the zero page. The read-only
2410
* data is likely to be read-cached on the local CPU and
2411
* local/remote hits to the zero page are not interesting.
2412
*/
2413
if (is_huge_zero_pmd(*pmd))
2414
goto unlock;
2415
2416
if (pmd_protnone(*pmd))
2417
goto unlock;
2418
2419
folio = pmd_folio(*pmd);
2420
toptier = node_is_toptier(folio_nid(folio));
2421
/*
2422
* Skip scanning top tier node if normal numa
2423
* balancing is disabled
2424
*/
2425
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
2426
toptier)
2427
goto unlock;
2428
2429
if (folio_use_access_time(folio))
2430
folio_xchg_access_time(folio,
2431
jiffies_to_msecs(jiffies));
2432
}
2433
/*
2434
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
2435
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
2436
* which is also under mmap_read_lock(mm):
2437
*
2438
* CPU0: CPU1:
2439
* change_huge_pmd(prot_numa=1)
2440
* pmdp_huge_get_and_clear_notify()
2441
* madvise_dontneed()
2442
* zap_pmd_range()
2443
* pmd_trans_huge(*pmd) == 0 (without ptl)
2444
* // skip the pmd
2445
* set_pmd_at();
2446
* // pmd is re-established
2447
*
2448
* The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
2449
* which may break userspace.
2450
*
2451
* pmdp_invalidate_ad() is required to make sure we don't miss
2452
* dirty/young flags set by hardware.
2453
*/
2454
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
2455
2456
entry = pmd_modify(oldpmd, newprot);
2457
if (uffd_wp)
2458
entry = pmd_mkuffd_wp(entry);
2459
else if (uffd_wp_resolve)
2460
/*
2461
* Leave the write bit to be handled by the page fault
2462
* handler, so that things like COW can then be properly
2463
* handled.
2464
*/
2465
entry = pmd_clear_uffd_wp(entry);
2466
2467
/* See change_pte_range(). */
2468
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2469
can_change_pmd_writable(vma, addr, entry))
2470
entry = pmd_mkwrite(entry, vma);
2471
2472
ret = HPAGE_PMD_NR;
2473
set_pmd_at(mm, addr, pmd, entry);
2474
2475
if (huge_pmd_needs_flush(oldpmd, entry))
2476
tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
2477
unlock:
2478
spin_unlock(ptl);
2479
return ret;
2480
}
2481
2482
/*
2483
* Returns:
2484
*
2485
* - 0: if pud leaf changed from under us
2486
* - 1: if pud can be skipped
2487
* - HPAGE_PUD_NR: if pud was successfully processed
2488
*/
2489
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2490
int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2491
pud_t *pudp, unsigned long addr, pgprot_t newprot,
2492
unsigned long cp_flags)
2493
{
2494
struct mm_struct *mm = vma->vm_mm;
2495
pud_t oldpud, entry;
2496
spinlock_t *ptl;
2497
2498
tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
2499
2500
/* NUMA balancing doesn't apply to dax */
2501
if (cp_flags & MM_CP_PROT_NUMA)
2502
return 1;
2503
2504
/*
2505
* Huge uffd-wp entries only work with anonymous memory, while we
2506
* don't have anonymous PUDs yet.
2507
*/
2508
if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
2509
return 1;
2510
2511
ptl = __pud_trans_huge_lock(pudp, vma);
2512
if (!ptl)
2513
return 0;
2514
2515
/*
2516
* Can't clear PUD or it can race with concurrent zapping. See
2517
* change_huge_pmd().
2518
*/
2519
oldpud = pudp_invalidate(vma, addr, pudp);
2520
entry = pud_modify(oldpud, newprot);
2521
set_pud_at(mm, addr, pudp, entry);
2522
tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);
2523
2524
spin_unlock(ptl);
2525
return HPAGE_PUD_NR;
2526
}
2527
#endif
2528
2529
#ifdef CONFIG_USERFAULTFD
2530
/*
2531
* The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
2532
* the caller, but it must return after releasing the page_table_lock.
2533
* Just move the page from src_pmd to dst_pmd if possible.
2534
* Return zero if succeeded in moving the page, -EAGAIN if it needs to be
2535
* repeated by the caller, or other errors in case of failure.
2536
*/
2537
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2538
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2539
unsigned long dst_addr, unsigned long src_addr)
2540
{
2541
pmd_t _dst_pmd, src_pmdval;
2542
struct page *src_page;
2543
struct folio *src_folio;
2544
struct anon_vma *src_anon_vma;
2545
spinlock_t *src_ptl, *dst_ptl;
2546
pgtable_t src_pgtable;
2547
struct mmu_notifier_range range;
2548
int err = 0;
2549
2550
src_pmdval = *src_pmd;
2551
src_ptl = pmd_lockptr(mm, src_pmd);
2552
2553
lockdep_assert_held(src_ptl);
2554
vma_assert_locked(src_vma);
2555
vma_assert_locked(dst_vma);
2556
2557
/* Sanity checks before the operation */
2558
if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2559
WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2560
spin_unlock(src_ptl);
2561
return -EINVAL;
2562
}
2563
2564
if (!pmd_trans_huge(src_pmdval)) {
2565
spin_unlock(src_ptl);
2566
if (is_pmd_migration_entry(src_pmdval)) {
2567
pmd_migration_entry_wait(mm, &src_pmdval);
2568
return -EAGAIN;
2569
}
2570
return -ENOENT;
2571
}
2572
2573
src_page = pmd_page(src_pmdval);
2574
2575
if (!is_huge_zero_pmd(src_pmdval)) {
2576
if (unlikely(!PageAnonExclusive(src_page))) {
2577
spin_unlock(src_ptl);
2578
return -EBUSY;
2579
}
2580
2581
src_folio = page_folio(src_page);
2582
folio_get(src_folio);
2583
} else
2584
src_folio = NULL;
2585
2586
spin_unlock(src_ptl);
2587
2588
flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2589
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2590
src_addr + HPAGE_PMD_SIZE);
2591
mmu_notifier_invalidate_range_start(&range);
2592
2593
if (src_folio) {
2594
folio_lock(src_folio);
2595
2596
/*
2597
* split_huge_page walks the anon_vma chain without the page
2598
* lock. Serialize against it with the anon_vma lock; the page
2599
* lock is not enough.
2600
*/
2601
src_anon_vma = folio_get_anon_vma(src_folio);
2602
if (!src_anon_vma) {
2603
err = -EAGAIN;
2604
goto unlock_folio;
2605
}
2606
anon_vma_lock_write(src_anon_vma);
2607
} else
2608
src_anon_vma = NULL;
2609
2610
dst_ptl = pmd_lockptr(mm, dst_pmd);
2611
double_pt_lock(src_ptl, dst_ptl);
2612
if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2613
!pmd_same(*dst_pmd, dst_pmdval))) {
2614
err = -EAGAIN;
2615
goto unlock_ptls;
2616
}
2617
if (src_folio) {
2618
if (folio_maybe_dma_pinned(src_folio) ||
2619
!PageAnonExclusive(&src_folio->page)) {
2620
err = -EBUSY;
2621
goto unlock_ptls;
2622
}
2623
2624
if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2625
WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2626
err = -EBUSY;
2627
goto unlock_ptls;
2628
}
2629
2630
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2631
/* Folio got pinned from under us. Put it back and fail the move. */
2632
if (folio_maybe_dma_pinned(src_folio)) {
2633
set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2634
err = -EBUSY;
2635
goto unlock_ptls;
2636
}
2637
2638
folio_move_anon_rmap(src_folio, dst_vma);
2639
src_folio->index = linear_page_index(dst_vma, dst_addr);
2640
2641
_dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
2642
/* Follow mremap() behavior and treat the entry as dirty after the move */
2643
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2644
} else {
2645
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2646
_dst_pmd = folio_mk_pmd(src_folio, dst_vma->vm_page_prot);
2647
}
2648
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2649
2650
src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2651
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2652
unlock_ptls:
2653
double_pt_unlock(src_ptl, dst_ptl);
2654
if (src_anon_vma) {
2655
anon_vma_unlock_write(src_anon_vma);
2656
put_anon_vma(src_anon_vma);
2657
}
2658
unlock_folio:
2659
/* unblock rmap walks */
2660
if (src_folio)
2661
folio_unlock(src_folio);
2662
mmu_notifier_invalidate_range_end(&range);
2663
if (src_folio)
2664
folio_put(src_folio);
2665
return err;
2666
}
2667
#endif /* CONFIG_USERFAULTFD */
2668
2669
/*
2670
* Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2671
*
2672
* Note that if it returns a page table lock pointer, this routine returns without
2673
* unlocking the page table lock, so callers must unlock it.
2674
*/
2675
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2676
{
2677
spinlock_t *ptl;
2678
ptl = pmd_lock(vma->vm_mm, pmd);
2679
if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
2680
return ptl;
2681
spin_unlock(ptl);
2682
return NULL;
2683
}
2684
2685
/*
2686
* Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
2687
*
2688
* Note that if it returns a page table lock pointer, this routine returns without
2689
* unlocking the page table lock, so callers must unlock it.
2690
*/
2691
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2692
{
2693
spinlock_t *ptl;
2694
2695
ptl = pud_lock(vma->vm_mm, pud);
2696
if (likely(pud_trans_huge(*pud)))
2697
return ptl;
2698
spin_unlock(ptl);
2699
return NULL;
2700
}
2701
2702
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2703
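/*
* Zap one huge PUD entry. Only file/DAX mappings are handled here; anonymous
* huge PUDs are not supported yet.
*/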
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2704
pud_t *pud, unsigned long addr)
2705
{
2706
spinlock_t *ptl;
2707
pud_t orig_pud;
2708
2709
ptl = __pud_trans_huge_lock(pud, vma);
2710
if (!ptl)
2711
return 0;
2712
2713
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2714
arch_check_zapped_pud(vma, orig_pud);
2715
tlb_remove_pud_tlb_entry(tlb, pud, addr);
2716
if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
2717
spin_unlock(ptl);
2718
/* No zero page support yet */
2719
} else {
2720
struct page *page = NULL;
2721
struct folio *folio;
2722
2723
/* No support for anonymous PUD pages or migration yet */
2724
VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
2725
!pud_present(orig_pud));
2726
2727
page = pud_page(orig_pud);
2728
folio = page_folio(page);
2729
folio_remove_rmap_pud(folio, page, vma);
2730
add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
2731
2732
spin_unlock(ptl);
2733
tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
2734
}
2735
return 1;
2736
}
2737
2738
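/*
* Split a huge PUD with the PUD lock held: the entry is cleared and, for DAX,
* the dirty/accessed bits are transferred to the folio and its rmap and
* reference dropped.
*/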
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2739
unsigned long haddr)
2740
{
2741
struct folio *folio;
2742
struct page *page;
2743
pud_t old_pud;
2744
2745
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2746
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2747
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2748
VM_BUG_ON(!pud_trans_huge(*pud));
2749
2750
count_vm_event(THP_SPLIT_PUD);
2751
2752
old_pud = pudp_huge_clear_flush(vma, haddr, pud);
2753
2754
if (!vma_is_dax(vma))
2755
return;
2756
2757
page = pud_page(old_pud);
2758
folio = page_folio(page);
2759
2760
if (!folio_test_dirty(folio) && pud_dirty(old_pud))
2761
folio_mark_dirty(folio);
2762
if (!folio_test_referenced(folio) && pud_young(old_pud))
2763
folio_set_referenced(folio);
2764
folio_remove_rmap_pud(folio, page, vma);
2765
folio_put(folio);
2766
add_mm_counter(vma->vm_mm, mm_counter_file(folio),
2767
-HPAGE_PUD_NR);
2768
}
2769
2770
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2771
unsigned long address)
2772
{
2773
spinlock_t *ptl;
2774
struct mmu_notifier_range range;
2775
2776
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
2777
address & HPAGE_PUD_MASK,
2778
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2779
mmu_notifier_invalidate_range_start(&range);
2780
ptl = pud_lock(vma->vm_mm, pud);
2781
if (unlikely(!pud_trans_huge(*pud)))
2782
goto out;
2783
__split_huge_pud_locked(vma, pud, range.start);
2784
2785
out:
2786
spin_unlock(ptl);
2787
mmu_notifier_invalidate_range_end(&range);
2788
}
2789
#else
2790
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2791
unsigned long address)
2792
{
2793
}
2794
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2795
2796
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2797
unsigned long haddr, pmd_t *pmd)
2798
{
2799
struct mm_struct *mm = vma->vm_mm;
2800
pgtable_t pgtable;
2801
pmd_t _pmd, old_pmd;
2802
unsigned long addr;
2803
pte_t *pte;
2804
int i;
2805
2806
/*
2807
* Leave the pmd empty until the ptes are filled. Note that it is fine to delay
2808
* notification until mmu_notifier_invalidate_range_end() as we are
2809
* replacing a zero pmd write protected page with a zero pte write
2810
* protected page.
2811
*
2812
* See Documentation/mm/mmu_notifier.rst
2813
*/
2814
old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2815
2816
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2817
pmd_populate(mm, &_pmd, pgtable);
2818
2819
pte = pte_offset_map(&_pmd, haddr);
2820
VM_BUG_ON(!pte);
2821
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2822
pte_t entry;
2823
2824
entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
2825
entry = pte_mkspecial(entry);
2826
if (pmd_uffd_wp(old_pmd))
2827
entry = pte_mkuffd_wp(entry);
2828
VM_BUG_ON(!pte_none(ptep_get(pte)));
2829
set_pte_at(mm, addr, pte, entry);
2830
pte++;
2831
}
2832
pte_unmap(pte - 1);
2833
smp_wmb(); /* make pte visible before pmd */
2834
pmd_populate(mm, pmd, pgtable);
2835
}
2836
2837
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2838
unsigned long haddr, bool freeze)
2839
{
2840
struct mm_struct *mm = vma->vm_mm;
2841
struct folio *folio;
2842
struct page *page;
2843
pgtable_t pgtable;
2844
pmd_t old_pmd, _pmd;
2845
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
2846
bool anon_exclusive = false, dirty = false;
2847
unsigned long addr;
2848
pte_t *pte;
2849
int i;
2850
2851
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2852
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2853
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2854
VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
2855
2856
count_vm_event(THP_SPLIT_PMD);
2857
2858
if (!vma_is_anonymous(vma)) {
2859
old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
2860
/*
2861
* We are going to unmap this huge page. So
2862
* just go ahead and zap it
2863
*/
2864
if (arch_needs_pgtable_deposit())
2865
zap_deposited_table(mm, pmd);
2866
if (!vma_is_dax(vma) && vma_is_special_huge(vma))
2867
return;
2868
if (unlikely(is_pmd_migration_entry(old_pmd))) {
2869
swp_entry_t entry;
2870
2871
entry = pmd_to_swp_entry(old_pmd);
2872
folio = pfn_swap_entry_folio(entry);
2873
} else if (is_huge_zero_pmd(old_pmd)) {
2874
return;
2875
} else {
2876
page = pmd_page(old_pmd);
2877
folio = page_folio(page);
2878
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
2879
folio_mark_dirty(folio);
2880
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2881
folio_set_referenced(folio);
2882
folio_remove_rmap_pmd(folio, page, vma);
2883
folio_put(folio);
2884
}
2885
add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
2886
return;
2887
}
2888
2889
if (is_huge_zero_pmd(*pmd)) {
2890
/*
2891
* FIXME: Do we want to invalidate secondary mmu by calling
2892
* mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
2893
* inside __split_huge_pmd() ?
2894
*
2895
* We are going from a zero huge page write protected to zero
2896
* small pages that are also write protected, so it does not seem useful
2897
* to invalidate secondary mmu at this time.
2898
*/
2899
return __split_huge_zero_page_pmd(vma, haddr, pmd);
2900
}
2901
2902
pmd_migration = is_pmd_migration_entry(*pmd);
2903
if (unlikely(pmd_migration)) {
2904
swp_entry_t entry;
2905
2906
old_pmd = *pmd;
2907
entry = pmd_to_swp_entry(old_pmd);
2908
page = pfn_swap_entry_to_page(entry);
2909
write = is_writable_migration_entry(entry);
2910
if (PageAnon(page))
2911
anon_exclusive = is_readable_exclusive_migration_entry(entry);
2912
young = is_migration_entry_young(entry);
2913
dirty = is_migration_entry_dirty(entry);
2914
soft_dirty = pmd_swp_soft_dirty(old_pmd);
2915
uffd_wp = pmd_swp_uffd_wp(old_pmd);
2916
} else {
2917
/*
2918
* Up to this point the pmd is present and huge and userland has
2919
* full access to the hugepage during the split (which
2920
* happens in place). If we overwrite the pmd with the not-huge
2921
* version pointing to the pte here (which of course we could if
2922
* all CPUs were bug free), userland could trigger a small page
2923
* size TLB miss on the small sized TLB while the hugepage TLB
2924
* entry is still established in the huge TLB. Some CPUs don't
2925
* like that. See
2926
* http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2927
* 383 on page 105. Intel should be safe, but also warns that
2928
* it's only safe if the permission and cache attributes of the
2929
* two entries loaded in the two TLBs are identical (which should
2930
* be the case here). But it is generally safer to never allow
2931
* small and huge TLB entries for the same virtual address to be
2932
* loaded simultaneously. So instead of doing "pmd_populate();
2933
* flush_pmd_tlb_range();" we first mark the current pmd
2934
* notpresent (atomically because here the pmd_trans_huge must
2935
* remain set at all times on the pmd until the split is
2936
* complete for this pmd), then we flush the SMP TLB and finally
2937
* we write the non-huge version of the pmd entry with
2938
* pmd_populate.
2939
*/
2940
old_pmd = pmdp_invalidate(vma, haddr, pmd);
2941
page = pmd_page(old_pmd);
2942
folio = page_folio(page);
2943
if (pmd_dirty(old_pmd)) {
2944
dirty = true;
2945
folio_set_dirty(folio);
2946
}
2947
write = pmd_write(old_pmd);
2948
young = pmd_young(old_pmd);
2949
soft_dirty = pmd_soft_dirty(old_pmd);
2950
uffd_wp = pmd_uffd_wp(old_pmd);
2951
2952
VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2953
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2954
2955
/*
2956
* Without "freeze", we'll simply split the PMD, propagating the
2957
* PageAnonExclusive() flag for each PTE by setting it for
2958
* each subpage -- no need to (temporarily) clear.
2959
*
2960
* With "freeze" we want to replace mapped pages by
2961
* migration entries right away. This is only possible if we
2962
* managed to clear PageAnonExclusive() -- see
2963
* set_pmd_migration_entry().
2964
*
2965
* In case we cannot clear PageAnonExclusive(), split the PMD
2966
* only and let try_to_migrate_one() fail later.
2967
*
2968
* See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
2969
*/
2970
anon_exclusive = PageAnonExclusive(page);
2971
if (freeze && anon_exclusive &&
2972
folio_try_share_anon_rmap_pmd(folio, page))
2973
freeze = false;
2974
if (!freeze) {
2975
rmap_t rmap_flags = RMAP_NONE;
2976
2977
folio_ref_add(folio, HPAGE_PMD_NR - 1);
2978
if (anon_exclusive)
2979
rmap_flags |= RMAP_EXCLUSIVE;
2980
folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2981
vma, haddr, rmap_flags);
2982
}
2983
}
2984
2985
/*
2986
* Withdraw the table only after we mark the pmd entry invalid.
2987
* This is critical for some architectures (Power).
2988
*/
2989
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2990
pmd_populate(mm, &_pmd, pgtable);
2991
2992
pte = pte_offset_map(&_pmd, haddr);
2993
VM_BUG_ON(!pte);
2994
2995
/*
2996
* Note that NUMA hinting access restrictions are not transferred to
2997
* avoid any possibility of altering permissions across VMAs.
2998
*/
2999
if (freeze || pmd_migration) {
3000
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
3001
pte_t entry;
3002
swp_entry_t swp_entry;
3003
3004
if (write)
3005
swp_entry = make_writable_migration_entry(
3006
page_to_pfn(page + i));
3007
else if (anon_exclusive)
3008
swp_entry = make_readable_exclusive_migration_entry(
3009
page_to_pfn(page + i));
3010
else
3011
swp_entry = make_readable_migration_entry(
3012
page_to_pfn(page + i));
3013
if (young)
3014
swp_entry = make_migration_entry_young(swp_entry);
3015
if (dirty)
3016
swp_entry = make_migration_entry_dirty(swp_entry);
3017
entry = swp_entry_to_pte(swp_entry);
3018
if (soft_dirty)
3019
entry = pte_swp_mksoft_dirty(entry);
3020
if (uffd_wp)
3021
entry = pte_swp_mkuffd_wp(entry);
3022
3023
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3024
set_pte_at(mm, addr, pte + i, entry);
3025
}
3026
} else {
3027
pte_t entry;
3028
3029
entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
3030
if (write)
3031
entry = pte_mkwrite(entry, vma);
3032
if (!young)
3033
entry = pte_mkold(entry);
3034
/* NOTE: this may set soft-dirty too on some archs */
3035
if (dirty)
3036
entry = pte_mkdirty(entry);
3037
if (soft_dirty)
3038
entry = pte_mksoft_dirty(entry);
3039
if (uffd_wp)
3040
entry = pte_mkuffd_wp(entry);
3041
3042
for (i = 0; i < HPAGE_PMD_NR; i++)
3043
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
3044
3045
set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
3046
}
3047
pte_unmap(pte);
3048
3049
if (!pmd_migration)
3050
folio_remove_rmap_pmd(folio, page, vma);
3051
if (freeze)
3052
put_page(page);
3053
3054
smp_wmb(); /* make pte visible before pmd */
3055
pmd_populate(mm, pmd, pgtable);
3056
}
3057
3058
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
3059
pmd_t *pmd, bool freeze)
3060
{
3061
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
3062
if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
3063
__split_huge_pmd_locked(vma, pmd, address, freeze);
3064
}
3065
3066
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
3067
unsigned long address, bool freeze)
3068
{
3069
spinlock_t *ptl;
3070
struct mmu_notifier_range range;
3071
3072
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
3073
address & HPAGE_PMD_MASK,
3074
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
3075
mmu_notifier_invalidate_range_start(&range);
3076
ptl = pmd_lock(vma->vm_mm, pmd);
3077
split_huge_pmd_locked(vma, range.start, pmd, freeze);
3078
spin_unlock(ptl);
3079
mmu_notifier_invalidate_range_end(&range);
3080
}
3081
3082
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
3083
bool freeze)
3084
{
3085
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
3086
3087
if (!pmd)
3088
return;
3089
3090
__split_huge_pmd(vma, pmd, address, freeze);
3091
}
3092
3093
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
3094
{
3095
/*
3096
* If the new address isn't hpage aligned and it could previously
3097
* contain a hugepage: check if we need to split a huge pmd.
3098
*/
3099
if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
3100
range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
3101
ALIGN(address, HPAGE_PMD_SIZE)))
3102
split_huge_pmd_address(vma, address, false);
3103
}
3104
3105
void vma_adjust_trans_huge(struct vm_area_struct *vma,
3106
unsigned long start,
3107
unsigned long end,
3108
struct vm_area_struct *next)
3109
{
3110
/* Check if we need to split start first. */
3111
split_huge_pmd_if_needed(vma, start);
3112
3113
/* Check if we need to split end next. */
3114
split_huge_pmd_if_needed(vma, end);
3115
3116
/* If we're incrementing next->vm_start, we might need to split it. */
3117
if (next)
3118
split_huge_pmd_if_needed(next, end);
3119
}
3120
3121
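/* Fully unmap @folio in preparation for splitting it. */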
static void unmap_folio(struct folio *folio)
3122
{
3123
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
3124
TTU_BATCH_FLUSH;
3125
3126
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3127
3128
if (folio_test_pmd_mappable(folio))
3129
ttu_flags |= TTU_SPLIT_HUGE_PMD;
3130
3131
/*
3132
* Anon pages need migration entries to preserve them, but file
3133
* pages can simply be left unmapped, then faulted back on demand.
3134
* If that is ever changed (perhaps for mlock), update remap_page().
3135
*/
3136
if (folio_test_anon(folio))
3137
try_to_migrate(folio, ttu_flags);
3138
else
3139
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
3140
3141
try_to_unmap_flush();
3142
}
3143
3144
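/*
* Try to free a clean, exclusively mapped anonymous PMD-mapped folio right
* away instead of splitting it: clear the PMD, re-check that nothing
* redirtied the folio or took extra references, then drop the last
* reference. On failure the PMD is restored and false is returned.
*/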
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
3145
unsigned long addr, pmd_t *pmdp,
3146
struct folio *folio)
3147
{
3148
struct mm_struct *mm = vma->vm_mm;
3149
int ref_count, map_count;
3150
pmd_t orig_pmd = *pmdp;
3151
3152
if (pmd_dirty(orig_pmd))
3153
folio_set_dirty(folio);
3154
if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3155
folio_set_swapbacked(folio);
3156
return false;
3157
}
3158
3159
orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
3160
3161
/*
3162
* Syncing against concurrent GUP-fast:
3163
* - clear PMD; barrier; read refcount
3164
* - inc refcount; barrier; read PMD
3165
*/
3166
smp_mb();
3167
3168
ref_count = folio_ref_count(folio);
3169
map_count = folio_mapcount(folio);
3170
3171
/*
3172
* Order reads for folio refcount and dirty flag
3173
* (see comments in __remove_mapping()).
3174
*/
3175
smp_rmb();
3176
3177
/*
3178
* If the folio or its PMD is redirtied at this point, or if there
3179
* are unexpected references, we give up on discarding this folio
3180
* and remap it.
3181
*
3182
* The only folio refs must be one from isolation plus the rmap(s).
3183
*/
3184
if (pmd_dirty(orig_pmd))
3185
folio_set_dirty(folio);
3186
if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
3187
folio_set_swapbacked(folio);
3188
set_pmd_at(mm, addr, pmdp, orig_pmd);
3189
return false;
3190
}
3191
3192
if (ref_count != map_count + 1) {
3193
set_pmd_at(mm, addr, pmdp, orig_pmd);
3194
return false;
3195
}
3196
3197
folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
3198
zap_deposited_table(mm, pmdp);
3199
add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
3200
if (vma->vm_flags & VM_LOCKED)
3201
mlock_drain_local();
3202
folio_put(folio);
3203
3204
return true;
3205
}
3206
3207
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
3208
pmd_t *pmdp, struct folio *folio)
3209
{
3210
VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
3211
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
3212
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
3213
VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
3214
VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
3215
3216
return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
3217
}
3218
3219
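/*
* Restore migration entries for @nr pages worth of (possibly already split)
* folios after a split attempt; only anonymous folios need this, file folios
* were simply left unmapped.
*/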
static void remap_page(struct folio *folio, unsigned long nr, int flags)
3220
{
3221
int i = 0;
3222
3223
/* If unmap_folio() uses try_to_migrate() on file, remove this check */
3224
if (!folio_test_anon(folio))
3225
return;
3226
for (;;) {
3227
remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
3228
i += folio_nr_pages(folio);
3229
if (i >= nr)
3230
break;
3231
folio = folio_next(folio);
3232
}
3233
}
3234
3235
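/*
* Place a freshly split @new_folio on the LRU next to the original @folio,
* or on the caller's reclaim @list when one is supplied.
*/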
static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
3236
struct lruvec *lruvec, struct list_head *list)
3237
{
3238
VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
3239
lockdep_assert_held(&lruvec->lru_lock);
3240
3241
if (list) {
3242
/* page reclaim is reclaiming a huge page */
3243
VM_WARN_ON(folio_test_lru(folio));
3244
folio_get(new_folio);
3245
list_add_tail(&new_folio->lru, list);
3246
} else {
3247
/* head is still on lru (and we have it frozen) */
3248
VM_WARN_ON(!folio_test_lru(folio));
3249
if (folio_test_unevictable(folio))
3250
new_folio->mlock_count = 0;
3251
else
3252
list_add_tail(&new_folio->lru, &folio->lru);
3253
folio_set_lru(new_folio);
3254
}
3255
}
3256
3257
/* Racy check whether the huge page can be split */
3258
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
3259
{
3260
int extra_pins;
3261
3262
/* Additional pins from page cache */
3263
if (folio_test_anon(folio))
3264
extra_pins = folio_test_swapcache(folio) ?
3265
folio_nr_pages(folio) : 0;
3266
else
3267
extra_pins = folio_nr_pages(folio);
3268
if (pextra_pins)
3269
*pextra_pins = extra_pins;
3270
return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
3271
caller_pins;
3272
}
3273
3274
/*
3275
* It splits @folio into @new_order folios and copies the @folio metadata to
3276
* all the resulting folios.
3277
*/
3278
static void __split_folio_to_order(struct folio *folio, int old_order,
3279
int new_order)
3280
{
3281
long new_nr_pages = 1 << new_order;
3282
long nr_pages = 1 << old_order;
3283
long i;
3284
3285
/*
3286
* Skip the first new_nr_pages, since the new folio made from them has all
3287
* the flags from the original folio.
3288
*/
3289
for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
3290
struct page *new_head = &folio->page + i;
3291
3292
/*
3293
* Careful: new_folio is not a "real" folio until we clear PageTail.
3294
* Don't pass it around before clear_compound_head().
3295
*/
3296
struct folio *new_folio = (struct folio *)new_head;
3297
3298
VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
3299
3300
/*
3301
* Clone page flags before unfreezing refcount.
3302
*
3303
* After successful get_page_unless_zero() might follow flags change,
3304
* for example lock_page() which set PG_waiters.
3305
*
3306
* Note that for mapped sub-pages of an anonymous THP,
3307
* PG_anon_exclusive has been cleared in unmap_folio() and is stored in
3308
* the migration entry instead from where remap_page() will restore it.
3309
* We can still have PG_anon_exclusive set on effectively unmapped and
3310
* unreferenced sub-pages of an anonymous THP: we can simply drop
3311
* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
3312
*/
3313
new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3314
new_folio->flags |= (folio->flags &
3315
((1L << PG_referenced) |
3316
(1L << PG_swapbacked) |
3317
(1L << PG_swapcache) |
3318
(1L << PG_mlocked) |
3319
(1L << PG_uptodate) |
3320
(1L << PG_active) |
3321
(1L << PG_workingset) |
3322
(1L << PG_locked) |
3323
(1L << PG_unevictable) |
3324
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
3325
(1L << PG_arch_2) |
3326
#endif
3327
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
3328
(1L << PG_arch_3) |
3329
#endif
3330
(1L << PG_dirty) |
3331
LRU_GEN_MASK | LRU_REFS_MASK));
3332
3333
new_folio->mapping = folio->mapping;
3334
new_folio->index = folio->index + i;
3335
3336
/*
3337
* page->private should not be set in tail pages. Fix up and warn once
3338
* if private is unexpectedly set.
3339
*/
3340
if (unlikely(new_folio->private)) {
3341
VM_WARN_ON_ONCE_PAGE(true, new_head);
3342
new_folio->private = NULL;
3343
}
3344
3345
if (folio_test_swapcache(folio))
3346
new_folio->swap.val = folio->swap.val + i;
3347
3348
/* Page flags must be visible before we make the page non-compound. */
3349
smp_wmb();
3350
3351
/*
3352
* Clear PageTail before unfreezing page refcount.
3353
*
3354
* After successful get_page_unless_zero() might follow put_page()
3355
* which needs correct compound_head().
3356
*/
3357
clear_compound_head(new_head);
3358
if (new_order) {
3359
prep_compound_page(new_head, new_order);
3360
folio_set_large_rmappable(new_folio);
3361
}
3362
3363
if (folio_test_young(folio))
3364
folio_set_young(new_folio);
3365
if (folio_test_idle(folio))
3366
folio_set_idle(new_folio);
3367
#ifdef CONFIG_MEMCG
3368
new_folio->memcg_data = folio->memcg_data;
3369
#endif
3370
3371
folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
3372
}
3373
3374
if (new_order)
3375
folio_set_order(folio, new_order);
3376
else
3377
ClearPageCompound(&folio->page);
3378
}
3379
3380
/*
3381
* It splits an unmapped @folio into lower-order smaller folios in two ways.
3382
* @folio: the to-be-split folio
3383
* @new_order: the smallest order of the after split folios (since buddy
3384
* allocator like split generates folios with orders from @folio's
3385
* order - 1 to new_order).
3386
* @split_at: in buddy allocator like split, the folio containing @split_at
3387
* will be split until its order becomes @new_order.
3388
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
3389
* @mapping: @folio->mapping
3390
* @uniform_split: if the split is uniform or not (buddy allocator like split)
3391
*
3392
*
3393
* 1. uniform split: the given @folio into multiple @new_order small folios,
3394
* where all small folios have the same order. This is done when
3395
* uniform_split is true.
3396
* 2. buddy allocator like (non-uniform) split: the given @folio is split into
3397
* half and one of the half (containing the given page) is split into half
3398
* until the given @page's order becomes @new_order. This is done when
3399
* uniform_split is false.
3400
*
3401
* The high-level flow for these two methods is:
3402
* 1. uniform split: a single __split_folio_to_order() is called to split the
3403
* @folio into @new_order, then we traverse all the resulting folios one by
3404
* one in PFN ascending order and perform stats, unfreeze, adding to list,
3405
* and file mapping index operations.
3406
* 2. non-uniform split: in general, folio_order - @new_order calls to
3407
* __split_folio_to_order() are made in a for loop to split the @folio
3408
* to one lower order at a time. The resulting small folios are processed
3409
* like what is done during the traversal in 1, except the one containing
3410
* @page, which is split in the next iteration of the loop.
3411
*
3412
* After splitting, the caller's folio reference will be transferred to the
3413
* folio containing @page. The caller needs to unlock and/or free after-split
3414
* folios if necessary.
3415
*
3416
* For !uniform_split, when -ENOMEM is returned, the original folio might be
3417
* split. The caller needs to check the input folio.
3418
*/
3419
static int __split_unmapped_folio(struct folio *folio, int new_order,
3420
struct page *split_at, struct xa_state *xas,
3421
struct address_space *mapping, bool uniform_split)
3422
{
3423
int order = folio_order(folio);
3424
int start_order = uniform_split ? new_order : order - 1;
3425
bool stop_split = false;
3426
struct folio *next;
3427
int split_order;
3428
int ret = 0;
3429
3430
if (folio_test_anon(folio))
3431
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
3432
3433
folio_clear_has_hwpoisoned(folio);
3434
3435
/*
3436
* split to new_order one order at a time. For uniform split,
3437
* folio is split to new_order directly.
3438
*/
3439
for (split_order = start_order;
3440
split_order >= new_order && !stop_split;
3441
split_order--) {
3442
struct folio *end_folio = folio_next(folio);
3443
int old_order = folio_order(folio);
3444
struct folio *new_folio;
3445
3446
/* order-1 anonymous folio is not supported */
3447
if (folio_test_anon(folio) && split_order == 1)
3448
continue;
3449
if (uniform_split && split_order != new_order)
3450
continue;
3451
3452
if (mapping) {
3453
/*
3454
* uniform split has xas_split_alloc() called before
3455
* irq is disabled to allocate enough memory, whereas
3456
* non-uniform split can handle ENOMEM.
3457
*/
3458
if (uniform_split)
3459
xas_split(xas, folio, old_order);
3460
else {
3461
xas_set_order(xas, folio->index, split_order);
3462
xas_try_split(xas, folio, old_order);
3463
if (xas_error(xas)) {
3464
ret = xas_error(xas);
3465
stop_split = true;
3466
}
3467
}
3468
}
3469
3470
if (!stop_split) {
3471
folio_split_memcg_refs(folio, old_order, split_order);
3472
split_page_owner(&folio->page, old_order, split_order);
3473
pgalloc_tag_split(folio, old_order, split_order);
3474
3475
__split_folio_to_order(folio, old_order, split_order);
3476
}
3477
3478
/*
3479
* Iterate through after-split folios and update folio stats.
3480
* But in buddy allocator like split, the folio
3481
* containing the specified page is skipped until its order
3482
* is new_order, since the folio will be worked on in next
3483
* iteration.
3484
*/
3485
for (new_folio = folio; new_folio != end_folio; new_folio = next) {
3486
next = folio_next(new_folio);
3487
/*
3488
* for buddy allocator like split, new_folio containing
3489
* @split_at page could be split again, thus do not
3490
* change stats yet. Wait until new_folio's order is
3491
* @new_order or stop_split is set to true by the above
3492
* xas_split() failure.
3493
*/
3494
if (new_folio == page_folio(split_at)) {
3495
folio = new_folio;
3496
if (split_order != new_order && !stop_split)
3497
continue;
3498
}
3499
if (folio_test_anon(new_folio))
3500
mod_mthp_stat(folio_order(new_folio),
3501
MTHP_STAT_NR_ANON, 1);
3502
}
3503
}
3504
3505
return ret;
3506
}
3507
3508
bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
3509
bool warns)
3510
{
3511
if (folio_test_anon(folio)) {
3512
/* order-1 is not supported for anonymous THP. */
3513
VM_WARN_ONCE(warns && new_order == 1,
3514
"Cannot split to order-1 folio");
3515
return new_order != 1;
3516
} else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3517
!mapping_large_folio_support(folio->mapping)) {
3518
/*
3519
* No split if the file system does not support large folios.
3520
* Note that we might still have THPs in such mappings due to
3521
* CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
3522
* does not actually support large folios properly.
3523
*/
3524
VM_WARN_ONCE(warns,
3525
"Cannot split file folio to non-0 order");
3526
return false;
3527
}
3528
3529
/* Only swapping a whole PMD-mapped folio is supported */
3530
if (folio_test_swapcache(folio)) {
3531
VM_WARN_ONCE(warns,
3532
"Cannot split swapcache folio to non-0 order");
3533
return false;
3534
}
3535
3536
return true;
3537
}
3538
3539
/* See comments in non_uniform_split_supported() */
3540
bool uniform_split_supported(struct folio *folio, unsigned int new_order,
3541
bool warns)
3542
{
3543
if (folio_test_anon(folio)) {
3544
VM_WARN_ONCE(warns && new_order == 1,
3545
"Cannot split to order-1 folio");
3546
return new_order != 1;
3547
} else if (new_order) {
3548
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3549
!mapping_large_folio_support(folio->mapping)) {
3550
VM_WARN_ONCE(warns,
3551
"Cannot split file folio to non-0 order");
3552
return false;
3553
}
3554
}
3555
3556
if (new_order && folio_test_swapcache(folio)) {
3557
VM_WARN_ONCE(warns,
3558
"Cannot split swapcache folio to non-0 order");
3559
return false;
3560
}
3561
3562
return true;
3563
}
3564
3565
/*
3566
* __folio_split: split a folio at @split_at to a @new_order folio
3567
* @folio: folio to split
3568
* @new_order: the order of the new folio
3569
* @split_at: a page within the new folio
3570
* @lock_at: a page within @folio to be left locked to caller
3571
* @list: after-split folios will be put on it if non-NULL
3572
* @uniform_split: perform uniform split or not (non-uniform split)
3573
*
3574
* It calls __split_unmapped_folio() to perform uniform and non-uniform split.
3575
* It is in charge of checking whether the split is supported or not and
3576
* preparing @folio for __split_unmapped_folio().
3577
*
3578
* After splitting, the after-split folio containing @lock_at remains locked
3579
* and others are unlocked:
3580
* 1. for uniform split, @lock_at points to one of @folio's subpages;
3581
* 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
3582
*
3583
* return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
3584
* split but not to @new_order, the caller needs to check)
3585
*/
3586
static int __folio_split(struct folio *folio, unsigned int new_order,
3587
struct page *split_at, struct page *lock_at,
3588
struct list_head *list, bool uniform_split)
3589
{
3590
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
3591
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
3592
struct folio *end_folio = folio_next(folio);
3593
bool is_anon = folio_test_anon(folio);
3594
struct address_space *mapping = NULL;
3595
struct anon_vma *anon_vma = NULL;
3596
int order = folio_order(folio);
3597
struct folio *new_folio, *next;
3598
int nr_shmem_dropped = 0;
3599
int remap_flags = 0;
3600
int extra_pins, ret;
3601
pgoff_t end;
3602
bool is_hzp;
3603
3604
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
3605
VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
3606
3607
if (folio != page_folio(split_at) || folio != page_folio(lock_at))
3608
return -EINVAL;
3609
3610
if (new_order >= folio_order(folio))
3611
return -EINVAL;
3612
3613
if (uniform_split && !uniform_split_supported(folio, new_order, true))
3614
return -EINVAL;
3615
3616
if (!uniform_split &&
3617
!non_uniform_split_supported(folio, new_order, true))
3618
return -EINVAL;
3619
3620
is_hzp = is_huge_zero_folio(folio);
3621
if (is_hzp) {
3622
pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
3623
return -EBUSY;
3624
}
3625
3626
if (folio_test_writeback(folio))
3627
return -EBUSY;
3628
3629
if (is_anon) {
3630
/*
3631
* The caller does not necessarily hold an mmap_lock that would
3632
* prevent the anon_vma disappearing, so we first take a
3633
* reference to it and then lock the anon_vma for write. This
3634
* is similar to folio_lock_anon_vma_read except the write lock
3635
* is taken to serialise against parallel split or collapse
3636
* operations.
3637
*/
3638
anon_vma = folio_get_anon_vma(folio);
3639
if (!anon_vma) {
3640
ret = -EBUSY;
3641
goto out;
3642
}
3643
mapping = NULL;
3644
anon_vma_lock_write(anon_vma);
3645
} else {
3646
unsigned int min_order;
3647
gfp_t gfp;
3648
3649
mapping = folio->mapping;
3650
3651
/* Truncated ? */
3652
/*
3653
* TODO: add support for large shmem folio in swap cache.
3654
* When shmem is in swap cache, mapping is NULL and
3655
* folio_test_swapcache() is true.
3656
*/
3657
if (!mapping) {
3658
ret = -EBUSY;
3659
goto out;
3660
}
3661
3662
min_order = mapping_min_folio_order(folio->mapping);
3663
if (new_order < min_order) {
3664
VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
3665
min_order);
3666
ret = -EINVAL;
3667
goto out;
3668
}
3669
3670
gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3671
GFP_RECLAIM_MASK);
3672
3673
if (!filemap_release_folio(folio, gfp)) {
3674
ret = -EBUSY;
3675
goto out;
3676
}
3677
3678
if (uniform_split) {
3679
xas_set_order(&xas, folio->index, new_order);
3680
xas_split_alloc(&xas, folio, folio_order(folio), gfp);
3681
if (xas_error(&xas)) {
3682
ret = xas_error(&xas);
3683
goto out;
3684
}
3685
}
3686
3687
anon_vma = NULL;
3688
i_mmap_lock_read(mapping);
3689
3690
/*
3691
* __split_unmapped_folio() may need to trim off pages beyond
3692
* EOF: but on 32-bit, i_size_read() takes an irq-unsafe
3693
* seqlock, which cannot be nested inside the page tree lock.
3694
* So note end now: i_size itself may be changed at any moment,
3695
* but folio lock is good enough to serialize the trimming.
3696
*/
3697
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
3698
if (shmem_mapping(mapping))
3699
end = shmem_fallocend(mapping->host, end);
3700
}
3701
3702
/*
3703
* Racy check if we can split the page, before unmap_folio() will
3704
* split PMDs
3705
*/
3706
if (!can_split_folio(folio, 1, &extra_pins)) {
3707
ret = -EAGAIN;
3708
goto out_unlock;
3709
}
3710
3711
unmap_folio(folio);
3712
3713
/* block interrupt reentry in xa_lock and spinlock */
3714
local_irq_disable();
3715
if (mapping) {
3716
/*
3717
* Check if the folio is present in page cache.
3718
* We assume all tails are present too, if the folio is there.
3719
*/
3720
xas_lock(&xas);
3721
xas_reset(&xas);
3722
if (xas_load(&xas) != folio) {
3723
ret = -EAGAIN;
3724
goto fail;
3725
}
3726
}
3727
3728
/* Prevent deferred_split_scan() touching ->_refcount */
3729
spin_lock(&ds_queue->split_queue_lock);
3730
if (folio_ref_freeze(folio, 1 + extra_pins)) {
3731
struct address_space *swap_cache = NULL;
3732
struct lruvec *lruvec;
3733
int expected_refs;
3734
3735
if (folio_order(folio) > 1 &&
3736
!list_empty(&folio->_deferred_list)) {
3737
ds_queue->split_queue_len--;
3738
if (folio_test_partially_mapped(folio)) {
3739
folio_clear_partially_mapped(folio);
3740
mod_mthp_stat(folio_order(folio),
3741
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
3742
}
3743
/*
3744
* Reinitialize page_deferred_list after removing the
3745
* page from the split_queue, otherwise a subsequent
3746
* split will see list corruption when checking the
3747
* page_deferred_list.
3748
*/
3749
list_del_init(&folio->_deferred_list);
3750
}
3751
spin_unlock(&ds_queue->split_queue_lock);
3752
if (mapping) {
3753
int nr = folio_nr_pages(folio);
3754
3755
if (folio_test_pmd_mappable(folio) &&
3756
new_order < HPAGE_PMD_ORDER) {
3757
if (folio_test_swapbacked(folio)) {
3758
__lruvec_stat_mod_folio(folio,
3759
NR_SHMEM_THPS, -nr);
3760
} else {
3761
__lruvec_stat_mod_folio(folio,
3762
NR_FILE_THPS, -nr);
3763
filemap_nr_thps_dec(mapping);
3764
}
3765
}
3766
}
3767
3768
if (folio_test_swapcache(folio)) {
3769
if (mapping) {
3770
VM_WARN_ON_ONCE_FOLIO(mapping, folio);
3771
ret = -EINVAL;
3772
goto fail;
3773
}
3774
3775
swap_cache = swap_address_space(folio->swap);
3776
xa_lock(&swap_cache->i_pages);
3777
}
3778
3779
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
3780
lruvec = folio_lruvec_lock(folio);
3781
3782
ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
3783
mapping, uniform_split);
3784
3785
/*
3786
* Unfreeze after-split folios and put them back to the right
3787
* list. @folio should be kept frozen until page cache
3788
* entries are updated with all the other after-split folios
3789
* to prevent others seeing stale page cache entries.
3790
* As a result, new_folio starts from the next folio of
3791
* @folio.
3792
*/
3793
for (new_folio = folio_next(folio); new_folio != end_folio;
3794
new_folio = next) {
3795
unsigned long nr_pages = folio_nr_pages(new_folio);
3796
3797
next = folio_next(new_folio);
3798
3799
expected_refs = folio_expected_ref_count(new_folio) + 1;
3800
folio_ref_unfreeze(new_folio, expected_refs);
3801
3802
lru_add_split_folio(folio, new_folio, lruvec, list);
3803
3804
/*
3805
* Anonymous folio with swap cache.
3806
* NOTE: shmem in swap cache is not supported yet.
3807
*/
3808
if (swap_cache) {
3809
__xa_store(&swap_cache->i_pages,
3810
swap_cache_index(new_folio->swap),
3811
new_folio, 0);
3812
continue;
3813
}
3814
3815
/* Anonymous folio without swap cache */
3816
if (!mapping)
3817
continue;
3818
3819
/* Add the new folio to the page cache. */
3820
if (new_folio->index < end) {
3821
__xa_store(&mapping->i_pages, new_folio->index,
3822
new_folio, 0);
3823
continue;
3824
}
3825
3826
/* Drop folio beyond EOF: ->index >= end */
3827
if (shmem_mapping(mapping))
3828
nr_shmem_dropped += nr_pages;
3829
else if (folio_test_clear_dirty(new_folio))
3830
folio_account_cleaned(
3831
new_folio, inode_to_wb(mapping->host));
3832
__filemap_remove_folio(new_folio, NULL);
3833
folio_put_refs(new_folio, nr_pages);
3834
}
3835
/*
3836
* Unfreeze @folio only after all page cache entries, which
3837
* used to point to it, have been updated with new folios.
3838
* Otherwise, a parallel folio_try_get() can grab @folio
3839
* and its caller can see stale page cache entries.
3840
*/
3841
expected_refs = folio_expected_ref_count(folio) + 1;
3842
folio_ref_unfreeze(folio, expected_refs);
3843
3844
unlock_page_lruvec(lruvec);
3845
3846
if (swap_cache)
3847
xa_unlock(&swap_cache->i_pages);
3848
} else {
3849
spin_unlock(&ds_queue->split_queue_lock);
3850
ret = -EAGAIN;
3851
}
3852
fail:
3853
if (mapping)
3854
xas_unlock(&xas);
3855
3856
local_irq_enable();
3857
3858
if (nr_shmem_dropped)
3859
shmem_uncharge(mapping->host, nr_shmem_dropped);
3860
3861
if (!ret && is_anon)
3862
remap_flags = RMP_USE_SHARED_ZEROPAGE;
3863
remap_page(folio, 1 << order, remap_flags);
3864
3865
/*
3866
* Unlock all after-split folios except the one containing
3867
* @lock_at page. If @folio is not split, it will be kept locked.
3868
*/
3869
for (new_folio = folio; new_folio != end_folio; new_folio = next) {
3870
next = folio_next(new_folio);
3871
if (new_folio == page_folio(lock_at))
3872
continue;
3873
3874
folio_unlock(new_folio);
3875
/*
3876
* Subpages may be freed if there wasn't any mapping
3877
* like when add_to_swap() is running on an LRU page that
3878
* had its mapping zapped. And freeing these pages
3879
* requires taking the lru_lock so we do the put_page
3880
* of the tail pages after the split is complete.
3881
*/
3882
free_folio_and_swap_cache(new_folio);
3883
}
3884
3885
out_unlock:
3886
if (anon_vma) {
3887
anon_vma_unlock_write(anon_vma);
3888
put_anon_vma(anon_vma);
3889
}
3890
if (mapping)
3891
i_mmap_unlock_read(mapping);
3892
out:
3893
xas_destroy(&xas);
3894
if (order == HPAGE_PMD_ORDER)
3895
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3896
count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
3897
return ret;
3898
}
3899
3900
/*
3901
* This function splits a large folio into smaller folios of order @new_order.
3902
* @page can point to any page of the large folio to split. The split operation
3903
* does not change the position of @page.
3904
*
3905
* Prerequisites:
3906
*
3907
* 1) The caller must hold a reference on the @page's owning folio, also known
3908
* as the large folio.
3909
*
3910
* 2) The large folio must be locked.
3911
*
3912
* 3) The folio must not be pinned. Any unexpected folio references, including
3913
* GUP pins, will result in the folio not getting split; instead, the caller
3914
* will receive an -EAGAIN.
3915
*
3916
* 4) @new_order > 1, usually. Splitting anonymous folios to order-1 is not
3917
* supported, because folio->_deferred_list, which
3918
* is used by partially mapped folios, is stored in subpage 2, but an order-1
3919
* folio only has subpages 0 and 1. File-backed order-1 folios are supported,
3920
* since they do not use _deferred_list.
3921
*
3922
* After splitting, the caller's folio reference will be transferred to @page,
3923
* resulting in a raised refcount of @page after this call. The other pages may
3924
* be freed if they are not mapped.
3925
*
3926
* If @list is NULL, the after-split folios will be added to the LRU list; otherwise, to @list.
3927
*
3928
* Folios of @new_order will inherit the mapping, flags, and so on from the
3929
* huge page.
3930
*
3931
* Returns 0 if the huge page was split successfully.
3932
*
3933
* Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
3934
* the folio was concurrently removed from the page cache.
3935
*
3936
* Returns -EBUSY when trying to split the huge zeropage, if the folio is
3937
* under writeback, if fs-specific folio metadata cannot currently be
3938
* released, or if some unexpected race happened (e.g., anon VMA disappeared,
3939
* truncation).
3940
*
3941
* Callers should ensure that the order respects the address space mapping
3942
* min-order if one is set for non-anonymous folios.
3943
*
3944
* Returns -EINVAL when trying to split to an order that is incompatible
3945
* with the folio. Splitting to order 0 is compatible with all folios.
3946
*/
3947
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
3948
unsigned int new_order)
3949
{
3950
struct folio *folio = page_folio(page);
3951
3952
return __folio_split(folio, new_order, &folio->page, page, list, true);
3953
}
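
/*
 * A minimal usage sketch, not wired into anything in this file: the caller
 * below is assumed to already hold a folio reference (prerequisite 1 above),
 * and example_split_to_base_pages() together with its error handling is
 * hypothetical.
 */
static int __maybe_unused example_split_to_base_pages(struct folio *folio)
{
	int ret;

	folio_lock(folio);
	ret = split_huge_page_to_list_to_order(&folio->page, NULL, 0);
	/*
	 * On success the after-split folio containing the passed page is the
	 * one left locked; on failure the original folio stays locked.  In
	 * both cases that folio still starts at &folio->page, so a single
	 * unlock covers both outcomes.
	 */
	folio_unlock(folio);
	if (ret == -EAGAIN)
		pr_debug("folio has unexpected references, not split\n");
	return ret;
}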
3954
3955
/*
3956
* folio_split: split a folio at @split_at to a @new_order folio
3957
* @folio: folio to split
3958
* @new_order: the order of the new folio
3959
* @split_at: a page within the new folio
3960
*
3961
* return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
3962
* split but not to @new_order, the caller needs to check)
3963
*
3964
* It has the same prerequisites and return values as
3965
* split_huge_page_to_list_to_order().
3966
*
3967
* Split a folio at @split_at to a @new_order folio, leaving the
3968
* remaining subpages of the original folio as large as possible. For example,
3969
* in the case of splitting an order-9 folio at its third order-3 subpage to
3970
* an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
3971
* After the split, there will be a group of folios with different orders and
3972
* the new folio containing @split_at is marked with braces:
3973
* [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
3974
*
3975
* After the split, @folio is left locked for the caller.
3976
*/
3977
int folio_split(struct folio *folio, unsigned int new_order,
3978
struct page *split_at, struct list_head *list)
3979
{
3980
return __folio_split(folio, new_order, split_at, &folio->page, list,
3981
false);
3982
}
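
/*
 * Illustrative sketch of the buddy-style split geometry documented above; it
 * is not used by the split code itself and the helper name is hypothetical.
 * Given the original order, the requested order and the page index of
 * @split_at inside the folio, it records the order of every after-split folio
 * in address order and returns how many there are, which is always
 * (old_order - new_order) + 1.  For old_order = 9, new_order = 3 and a
 * split_idx in [16, 23] it yields [4, 3, 3, 5, 6, 7, 8], matching the
 * example above.
 */
static int __maybe_unused example_nonuniform_split_orders(unsigned int old_order,
							   unsigned int new_order,
							   unsigned long split_idx,
							   unsigned int *orders,
							   int max_orders)
{
	unsigned long lo = 0;
	unsigned int cur;
	int left = 0;			/* next free slot from the front */
	int right = max_orders;		/* next free slot from the back */

	if (new_order >= old_order ||
	    max_orders < (int)(old_order - new_order) + 1)
		return -1;

	for (cur = old_order; cur > new_order; cur--) {
		unsigned long half = 1UL << (cur - 1);

		if (split_idx < lo + half) {
			/* target in the low half: emit the high half */
			orders[--right] = cur - 1;
		} else {
			/* target in the high half: emit the low half */
			orders[left++] = cur - 1;
			lo += half;
		}
	}
	/* finally the folio containing @split_at itself */
	orders[left++] = new_order;
	/* high halves were emitted back-to-front and are already in address order */
	while (right < max_orders)
		orders[left++] = orders[right++];

	return left;
}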
3983
3984
int min_order_for_split(struct folio *folio)
3985
{
3986
if (folio_test_anon(folio))
3987
return 0;
3988
3989
if (!folio->mapping) {
3990
if (folio_test_pmd_mappable(folio))
3991
count_vm_event(THP_SPLIT_PAGE_FAILED);
3992
return -EBUSY;
3993
}
3994
3995
return mapping_min_folio_order(folio->mapping);
3996
}
3997
3998
int split_folio_to_list(struct folio *folio, struct list_head *list)
3999
{
4000
int ret = min_order_for_split(folio);
4001
4002
if (ret < 0)
4003
return ret;
4004
4005
return split_huge_page_to_list_to_order(&folio->page, list, ret);
4006
}
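
/*
 * Hedged sketch (not used here) of honouring a caller-requested order while
 * still respecting the mapping's minimum folio order, mirroring what the
 * debugfs helpers below do; example_split_to_order_clamped() is hypothetical.
 */
static int __maybe_unused example_split_to_order_clamped(struct folio *folio,
							 struct list_head *list,
							 unsigned int new_order)
{
	/* caller holds a folio reference and the folio lock */
	int min_order = min_order_for_split(folio);

	if (min_order < 0)
		return min_order;	/* e.g. truncated: no mapping left */

	return split_huge_page_to_list_to_order(&folio->page, list,
			max_t(unsigned int, new_order, min_order));
}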
4007
4008
/*
4009
* __folio_unqueue_deferred_split() is not to be called directly:
4010
* the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
4011
* limits its calls to those folios which may have a _deferred_list for
4012
* queueing THP splits, and that list is (racily observed to be) non-empty.
4013
*
4014
* It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
4015
* zero: because even when split_queue_lock is held, a non-empty _deferred_list
4016
* might be in use on deferred_split_scan()'s unlocked on-stack list.
4017
*
4018
* If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
4019
* therefore important to unqueue deferred split before changing folio memcg.
4020
*/
4021
bool __folio_unqueue_deferred_split(struct folio *folio)
4022
{
4023
struct deferred_split *ds_queue;
4024
unsigned long flags;
4025
bool unqueued = false;
4026
4027
WARN_ON_ONCE(folio_ref_count(folio));
4028
WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
4029
4030
ds_queue = get_deferred_split_queue(folio);
4031
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4032
if (!list_empty(&folio->_deferred_list)) {
4033
ds_queue->split_queue_len--;
4034
if (folio_test_partially_mapped(folio)) {
4035
folio_clear_partially_mapped(folio);
4036
mod_mthp_stat(folio_order(folio),
4037
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
4038
}
4039
list_del_init(&folio->_deferred_list);
4040
unqueued = true;
4041
}
4042
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4043
4044
return unqueued; /* useful for debug warnings */
4045
}
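
/*
 * Rough sketch of the inline wrapper described above; the real
 * folio_unqueue_deferred_split() lives in mm/internal.h and differs in
 * detail.  As the comment above notes, callers must make sure that
 * deferred_split_scan() cannot still be using the folio's _deferred_list,
 * e.g. because the folio's refcount has already dropped to zero.
 */
static void __maybe_unused example_unqueue_deferred_split(struct folio *folio)
{
	if (folio_order(folio) <= 1)
		return;		/* order-0/1 folios carry no _deferred_list */
	if (list_empty(&folio->_deferred_list))
		return;		/* racy fast path; rechecked under the lock */
	__folio_unqueue_deferred_split(folio);
}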
4046
4047
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
4048
void deferred_split_folio(struct folio *folio, bool partially_mapped)
4049
{
4050
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
4051
#ifdef CONFIG_MEMCG
4052
struct mem_cgroup *memcg = folio_memcg(folio);
4053
#endif
4054
unsigned long flags;
4055
4056
/*
4057
* Order-1 folios have no space for a deferred list, and little is
4058
* lost by never adding them to it.
4059
*/
4060
if (folio_order(folio) <= 1)
4061
return;
4062
4063
if (!partially_mapped && !split_underused_thp)
4064
return;
4065
4066
/*
4067
* Exclude swapcache: originally to avoid a corrupt deferred split
4068
* queue. Nowadays that is fully prevented by memcg1_swapout();
4069
* but if page reclaim is already handling the same folio, it is
4070
* unnecessary to handle it again in the shrinker, so excluding
4071
* swapcache here may still be a useful optimization.
4072
*/
4073
if (folio_test_swapcache(folio))
4074
return;
4075
4076
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4077
if (partially_mapped) {
4078
if (!folio_test_partially_mapped(folio)) {
4079
folio_set_partially_mapped(folio);
4080
if (folio_test_pmd_mappable(folio))
4081
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
4082
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
4083
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
4084
4085
}
4086
} else {
4087
/* partially mapped folios cannot become non-partially mapped */
4088
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
4089
}
4090
if (list_empty(&folio->_deferred_list)) {
4091
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
4092
ds_queue->split_queue_len++;
4093
#ifdef CONFIG_MEMCG
4094
if (memcg)
4095
set_shrinker_bit(memcg, folio_nid(folio),
4096
deferred_split_shrinker->id);
4097
#endif
4098
}
4099
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4100
}
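
/*
 * Hedged illustration of the two calling modes of deferred_split_folio();
 * the real call sites live outside this file and both helper names are
 * hypothetical.
 */
static void __maybe_unused example_note_partial_unmap(struct folio *folio)
{
	/* an unmap left the large folio only partially mapped */
	deferred_split_folio(folio, true);
}

static void __maybe_unused example_note_new_anon_thp(struct folio *folio)
{
	/*
	 * Queue without marking the folio partially mapped, so the shrinker
	 * can later split it if it turns out to be mostly zero-filled; this
	 * is a no-op when split_underused_thp is disabled.
	 */
	deferred_split_folio(folio, false);
}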
4101
4102
static unsigned long deferred_split_count(struct shrinker *shrink,
4103
struct shrink_control *sc)
4104
{
4105
struct pglist_data *pgdata = NODE_DATA(sc->nid);
4106
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4107
4108
#ifdef CONFIG_MEMCG
4109
if (sc->memcg)
4110
ds_queue = &sc->memcg->deferred_split_queue;
4111
#endif
4112
return READ_ONCE(ds_queue->split_queue_len);
4113
}
4114
4115
static bool thp_underused(struct folio *folio)
4116
{
4117
int num_zero_pages = 0, num_filled_pages = 0;
4118
void *kaddr;
4119
int i;
4120
4121
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
4122
return false;
4123
4124
for (i = 0; i < folio_nr_pages(folio); i++) {
4125
kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
4126
if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
4127
num_zero_pages++;
4128
if (num_zero_pages > khugepaged_max_ptes_none) {
4129
kunmap_local(kaddr);
4130
return true;
4131
}
4132
} else {
4133
/*
4134
* Another path for early exit once the number
4135
* of non-zero-filled pages exceeds the threshold.
4136
*/
4137
num_filled_pages++;
4138
if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
4139
kunmap_local(kaddr);
4140
return false;
4141
}
4142
}
4143
kunmap_local(kaddr);
4144
}
4145
return false;
4146
}
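
/*
 * Worked example of the threshold used above, assuming 4K base pages so that
 * HPAGE_PMD_NR == 512: with the default khugepaged_max_ptes_none of
 * HPAGE_PMD_NR - 1 (511) the scan is skipped entirely, while with
 * max_ptes_none lowered to e.g. 255 a THP counts as underused once 256
 * zero-filled pages have been seen, and as used once 512 - 255 = 257
 * non-zero pages have been seen.  The helper below merely restates that
 * decision for already-known counts and is not used by the shrinker.
 */
static bool __maybe_unused example_is_underused(int nr_zero_pages,
						int max_ptes_none)
{
	return nr_zero_pages > max_ptes_none;
}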
4147
4148
static unsigned long deferred_split_scan(struct shrinker *shrink,
4149
struct shrink_control *sc)
4150
{
4151
struct pglist_data *pgdata = NODE_DATA(sc->nid);
4152
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
4153
unsigned long flags;
4154
LIST_HEAD(list);
4155
struct folio *folio, *next, *prev = NULL;
4156
int split = 0, removed = 0;
4157
4158
#ifdef CONFIG_MEMCG
4159
if (sc->memcg)
4160
ds_queue = &sc->memcg->deferred_split_queue;
4161
#endif
4162
4163
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4164
/* Take pin on all head pages to avoid freeing them under us */
4165
list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
4166
_deferred_list) {
4167
if (folio_try_get(folio)) {
4168
list_move(&folio->_deferred_list, &list);
4169
} else {
4170
/* We lost race with folio_put() */
4171
if (folio_test_partially_mapped(folio)) {
4172
folio_clear_partially_mapped(folio);
4173
mod_mthp_stat(folio_order(folio),
4174
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
4175
}
4176
list_del_init(&folio->_deferred_list);
4177
ds_queue->split_queue_len--;
4178
}
4179
if (!--sc->nr_to_scan)
4180
break;
4181
}
4182
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4183
4184
list_for_each_entry_safe(folio, next, &list, _deferred_list) {
4185
bool did_split = false;
4186
bool underused = false;
4187
4188
if (!folio_test_partially_mapped(folio)) {
4189
underused = thp_underused(folio);
4190
if (!underused)
4191
goto next;
4192
}
4193
if (!folio_trylock(folio))
4194
goto next;
4195
if (!split_folio(folio)) {
4196
did_split = true;
4197
if (underused)
4198
count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
4199
split++;
4200
}
4201
folio_unlock(folio);
4202
next:
4203
/*
4204
* split_folio() removes folio from list on success.
4205
* Only add back to the queue if folio is partially mapped.
4206
* If thp_underused returns false, or if split_folio fails
4207
* in the case it was underused, then consider it used and
4208
* don't add it back to split_queue.
4209
*/
4210
if (did_split) {
4211
; /* folio already removed from list */
4212
} else if (!folio_test_partially_mapped(folio)) {
4213
list_del_init(&folio->_deferred_list);
4214
removed++;
4215
} else {
4216
/*
4217
* That unlocked list_del_init() above would be unsafe,
4218
* unless its folio is separated from any earlier folios
4219
* left on the list (which may be concurrently unqueued)
4220
* by one safe folio with refcount still raised.
4221
*/
4222
swap(folio, prev);
4223
}
4224
if (folio)
4225
folio_put(folio);
4226
}
4227
4228
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
4229
list_splice_tail(&list, &ds_queue->split_queue);
4230
ds_queue->split_queue_len -= removed;
4231
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
4232
4233
if (prev)
4234
folio_put(prev);
4235
4236
/*
4237
* Stop the shrinker if we didn't split any page and the queue is now empty.
4238
* This can happen if pages were freed under us.
4239
*/
4240
if (!split && list_empty(&ds_queue->split_queue))
4241
return SHRINK_STOP;
4242
return split;
4243
}
4244
4245
#ifdef CONFIG_DEBUG_FS
4246
static void split_huge_pages_all(void)
4247
{
4248
struct zone *zone;
4249
struct page *page;
4250
struct folio *folio;
4251
unsigned long pfn, max_zone_pfn;
4252
unsigned long total = 0, split = 0;
4253
4254
pr_debug("Split all THPs\n");
4255
for_each_zone(zone) {
4256
if (!managed_zone(zone))
4257
continue;
4258
max_zone_pfn = zone_end_pfn(zone);
4259
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
4260
int nr_pages;
4261
4262
page = pfn_to_online_page(pfn);
4263
if (!page || PageTail(page))
4264
continue;
4265
folio = page_folio(page);
4266
if (!folio_try_get(folio))
4267
continue;
4268
4269
if (unlikely(page_folio(page) != folio))
4270
goto next;
4271
4272
if (zone != folio_zone(folio))
4273
goto next;
4274
4275
if (!folio_test_large(folio)
4276
|| folio_test_hugetlb(folio)
4277
|| !folio_test_lru(folio))
4278
goto next;
4279
4280
total++;
4281
folio_lock(folio);
4282
nr_pages = folio_nr_pages(folio);
4283
if (!split_folio(folio))
4284
split++;
4285
pfn += nr_pages - 1;
4286
folio_unlock(folio);
4287
next:
4288
folio_put(folio);
4289
cond_resched();
4290
}
4291
}
4292
4293
pr_debug("%lu of %lu THP split\n", split, total);
4294
}
4295
4296
static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
4297
{
4298
return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
4299
is_vm_hugetlb_page(vma);
4300
}
4301
4302
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
4303
unsigned long vaddr_end, unsigned int new_order,
4304
long in_folio_offset)
4305
{
4306
int ret = 0;
4307
struct task_struct *task;
4308
struct mm_struct *mm;
4309
unsigned long total = 0, split = 0;
4310
unsigned long addr;
4311
4312
vaddr_start &= PAGE_MASK;
4313
vaddr_end &= PAGE_MASK;
4314
4315
task = find_get_task_by_vpid(pid);
4316
if (!task) {
4317
ret = -ESRCH;
4318
goto out;
4319
}
4320
4321
/* Find the mm_struct */
4322
mm = get_task_mm(task);
4323
put_task_struct(task);
4324
4325
if (!mm) {
4326
ret = -EINVAL;
4327
goto out;
4328
}
4329
4330
pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
4331
pid, vaddr_start, vaddr_end);
4332
4333
mmap_read_lock(mm);
4334
/*
4335
* always increase addr by PAGE_SIZE, since we could have a PTE page
4336
* table filled with PTE-mapped THPs, each of which is distinct.
4337
*/
4338
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
4339
struct vm_area_struct *vma = vma_lookup(mm, addr);
4340
struct folio_walk fw;
4341
struct folio *folio;
4342
struct address_space *mapping;
4343
unsigned int target_order = new_order;
4344
4345
if (!vma)
4346
break;
4347
4348
/* skip special VMA and hugetlb VMA */
4349
if (vma_not_suitable_for_thp_split(vma)) {
4350
addr = vma->vm_end;
4351
continue;
4352
}
4353
4354
folio = folio_walk_start(&fw, vma, addr, 0);
4355
if (!folio)
4356
continue;
4357
4358
if (!is_transparent_hugepage(folio))
4359
goto next;
4360
4361
if (!folio_test_anon(folio)) {
4362
mapping = folio->mapping;
4363
target_order = max(new_order,
4364
mapping_min_folio_order(mapping));
4365
}
4366
4367
if (target_order >= folio_order(folio))
4368
goto next;
4369
4370
total++;
4371
/*
4372
* For folios with private data, split_huge_page_to_list_to_order()
4373
* will try to drop it before split and then check if the folio
4374
* can be split or not. So skip the check here.
4375
*/
4376
if (!folio_test_private(folio) &&
4377
!can_split_folio(folio, 0, NULL))
4378
goto next;
4379
4380
if (!folio_trylock(folio))
4381
goto next;
4382
folio_get(folio);
4383
folio_walk_end(&fw, vma);
4384
4385
if (!folio_test_anon(folio) && folio->mapping != mapping)
4386
goto unlock;
4387
4388
if (in_folio_offset < 0 ||
4389
in_folio_offset >= folio_nr_pages(folio)) {
4390
if (!split_folio_to_order(folio, target_order))
4391
split++;
4392
} else {
4393
struct page *split_at = folio_page(folio,
4394
in_folio_offset);
4395
if (!folio_split(folio, target_order, split_at, NULL))
4396
split++;
4397
}
4398
4399
unlock:
4400
4401
folio_unlock(folio);
4402
folio_put(folio);
4403
4404
cond_resched();
4405
continue;
4406
next:
4407
folio_walk_end(&fw, vma);
4408
cond_resched();
4409
}
4410
mmap_read_unlock(mm);
4411
mmput(mm);
4412
4413
pr_debug("%lu of %lu THP split\n", split, total);
4414
4415
out:
4416
return ret;
4417
}
4418
4419
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
4420
pgoff_t off_end, unsigned int new_order,
4421
long in_folio_offset)
4422
{
4423
struct filename *file;
4424
struct file *candidate;
4425
struct address_space *mapping;
4426
int ret = -EINVAL;
4427
pgoff_t index;
4428
int nr_pages = 1;
4429
unsigned long total = 0, split = 0;
4430
unsigned int min_order;
4431
unsigned int target_order;
4432
4433
file = getname_kernel(file_path);
4434
if (IS_ERR(file))
4435
return ret;
4436
4437
candidate = file_open_name(file, O_RDONLY, 0);
4438
if (IS_ERR(candidate))
4439
goto out;
4440
4441
pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
4442
file_path, off_start, off_end);
4443
4444
mapping = candidate->f_mapping;
4445
min_order = mapping_min_folio_order(mapping);
4446
target_order = max(new_order, min_order);
4447
4448
for (index = off_start; index < off_end; index += nr_pages) {
4449
struct folio *folio = filemap_get_folio(mapping, index);
4450
4451
nr_pages = 1;
4452
if (IS_ERR(folio))
4453
continue;
4454
4455
if (!folio_test_large(folio))
4456
goto next;
4457
4458
total++;
4459
nr_pages = folio_nr_pages(folio);
4460
4461
if (target_order >= folio_order(folio))
4462
goto next;
4463
4464
if (!folio_trylock(folio))
4465
goto next;
4466
4467
if (folio->mapping != mapping)
4468
goto unlock;
4469
4470
if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
4471
if (!split_folio_to_order(folio, target_order))
4472
split++;
4473
} else {
4474
struct page *split_at = folio_page(folio,
4475
in_folio_offset);
4476
if (!folio_split(folio, target_order, split_at, NULL))
4477
split++;
4478
}
4479
4480
unlock:
4481
folio_unlock(folio);
4482
next:
4483
folio_put(folio);
4484
cond_resched();
4485
}
4486
4487
filp_close(candidate, NULL);
4488
ret = 0;
4489
4490
pr_debug("%lu of %lu file-backed THP split\n", split, total);
4491
out:
4492
putname(file);
4493
return ret;
4494
}
4495
4496
#define MAX_INPUT_BUF_SZ 255
4497
4498
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
4499
size_t count, loff_t *ppops)
4500
{
4501
static DEFINE_MUTEX(split_debug_mutex);
4502
ssize_t ret;
4503
/*
4504
* input_buf holds either "pid,vaddr_start,vaddr_end[,new_order[,in_folio_offset]]"
4505
* or "file_path,off_start,off_end[,new_order[,in_folio_offset]]"; see the
* usage sketch after this function.
4506
*/
4507
char input_buf[MAX_INPUT_BUF_SZ];
4508
int pid;
4509
unsigned long vaddr_start, vaddr_end;
4510
unsigned int new_order = 0;
4511
long in_folio_offset = -1;
4512
4513
ret = mutex_lock_interruptible(&split_debug_mutex);
4514
if (ret)
4515
return ret;
4516
4517
ret = -EFAULT;
4518
4519
memset(input_buf, 0, MAX_INPUT_BUF_SZ);
4520
if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
4521
goto out;
4522
4523
input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
4524
4525
if (input_buf[0] == '/') {
4526
char *tok;
4527
char *tok_buf = input_buf;
4528
char file_path[MAX_INPUT_BUF_SZ];
4529
pgoff_t off_start = 0, off_end = 0;
4530
size_t input_len = strlen(input_buf);
4531
4532
tok = strsep(&tok_buf, ",");
4533
if (tok && tok_buf) {
4534
strscpy(file_path, tok);
4535
} else {
4536
ret = -EINVAL;
4537
goto out;
4538
}
4539
4540
ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
4541
&new_order, &in_folio_offset);
4542
if (ret != 2 && ret != 3 && ret != 4) {
4543
ret = -EINVAL;
4544
goto out;
4545
}
4546
ret = split_huge_pages_in_file(file_path, off_start, off_end,
4547
new_order, in_folio_offset);
4548
if (!ret)
4549
ret = input_len;
4550
4551
goto out;
4552
}
4553
4554
ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
4555
&vaddr_end, &new_order, &in_folio_offset);
4556
if (ret == 1 && pid == 1) {
4557
split_huge_pages_all();
4558
ret = strlen(input_buf);
4559
goto out;
4560
} else if (ret != 3 && ret != 4 && ret != 5) {
4561
ret = -EINVAL;
4562
goto out;
4563
}
4564
4565
ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
4566
in_folio_offset);
4567
if (!ret)
4568
ret = strlen(input_buf);
4569
out:
4570
mutex_unlock(&split_debug_mutex);
4571
return ret;
4572
4573
}
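
/*
 * Hedged user-space sketch of driving the interface parsed above; it is not
 * part of this file, and it assumes debugfs is mounted at /sys/kernel/debug.
 * The first command asks for THPs of (hypothetical) pid 1234 in the given
 * virtual address range to be split to order 0; the second splits page-cache
 * folios of /mnt/file between page offsets 0x0 and 0x100 down to order 2.
 * Each command must be issued as a single write.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		dprintf(fd, "1234,0x7f0000000000,0x7f0000200000,0");
 *		dprintf(fd, "/mnt/file,0x0,0x100,2");
 *		close(fd);
 *		return 0;
 *	}
 */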
4574
4575
static const struct file_operations split_huge_pages_fops = {
4576
.owner = THIS_MODULE,
4577
.write = split_huge_pages_write,
4578
};
4579
4580
static int __init split_huge_pages_debugfs(void)
4581
{
4582
debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
4583
&split_huge_pages_fops);
4584
return 0;
4585
}
4586
late_initcall(split_huge_pages_debugfs);
4587
#endif
4588
4589
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
4590
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
4591
struct page *page)
4592
{
4593
struct folio *folio = page_folio(page);
4594
struct vm_area_struct *vma = pvmw->vma;
4595
struct mm_struct *mm = vma->vm_mm;
4596
unsigned long address = pvmw->address;
4597
bool anon_exclusive;
4598
pmd_t pmdval;
4599
swp_entry_t entry;
4600
pmd_t pmdswp;
4601
4602
if (!(pvmw->pmd && !pvmw->pte))
4603
return 0;
4604
4605
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
4606
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
4607
4608
/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
4609
anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
4610
if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
4611
set_pmd_at(mm, address, pvmw->pmd, pmdval);
4612
return -EBUSY;
4613
}
4614
4615
if (pmd_dirty(pmdval))
4616
folio_mark_dirty(folio);
4617
if (pmd_write(pmdval))
4618
entry = make_writable_migration_entry(page_to_pfn(page));
4619
else if (anon_exclusive)
4620
entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
4621
else
4622
entry = make_readable_migration_entry(page_to_pfn(page));
4623
if (pmd_young(pmdval))
4624
entry = make_migration_entry_young(entry);
4625
if (pmd_dirty(pmdval))
4626
entry = make_migration_entry_dirty(entry);
4627
pmdswp = swp_entry_to_pmd(entry);
4628
if (pmd_soft_dirty(pmdval))
4629
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
4630
if (pmd_uffd_wp(pmdval))
4631
pmdswp = pmd_swp_mkuffd_wp(pmdswp);
4632
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
4633
folio_remove_rmap_pmd(folio, page, vma);
4634
folio_put(folio);
4635
trace_set_migration_pmd(address, pmd_val(pmdswp));
4636
4637
return 0;
4638
}
4639
4640
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
4641
{
4642
struct folio *folio = page_folio(new);
4643
struct vm_area_struct *vma = pvmw->vma;
4644
struct mm_struct *mm = vma->vm_mm;
4645
unsigned long address = pvmw->address;
4646
unsigned long haddr = address & HPAGE_PMD_MASK;
4647
pmd_t pmde;
4648
swp_entry_t entry;
4649
4650
if (!(pvmw->pmd && !pvmw->pte))
4651
return;
4652
4653
entry = pmd_to_swp_entry(*pvmw->pmd);
4654
folio_get(folio);
4655
pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
4656
if (pmd_swp_soft_dirty(*pvmw->pmd))
4657
pmde = pmd_mksoft_dirty(pmde);
4658
if (is_writable_migration_entry(entry))
4659
pmde = pmd_mkwrite(pmde, vma);
4660
if (pmd_swp_uffd_wp(*pvmw->pmd))
4661
pmde = pmd_mkuffd_wp(pmde);
4662
if (!is_migration_entry_young(entry))
4663
pmde = pmd_mkold(pmde);
4664
/* NOTE: this may contain setting soft-dirty on some archs */
4665
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
4666
pmde = pmd_mkdirty(pmde);
4667
4668
if (folio_test_anon(folio)) {
4669
rmap_t rmap_flags = RMAP_NONE;
4670
4671
if (!is_readable_migration_entry(entry))
4672
rmap_flags |= RMAP_EXCLUSIVE;
4673
4674
folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
4675
} else {
4676
folio_add_file_rmap_pmd(folio, new, vma);
4677
}
4678
VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
4679
set_pmd_at(mm, haddr, pvmw->pmd, pmde);
4680
4681
/* No need to invalidate - it was non-present before */
4682
update_mmu_cache_pmd(vma, address, pvmw->pmd);
4683
trace_remove_migration_pmd(address, pmd_val(pmde));
4684
}
4685
#endif
4686
4687