GitHub Repository: torvalds/linux
Path: blob/master/mm/hugetlb.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/minmax.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
#include <linux/memory.h>
#include <linux/mm_inline.h>
#include <linux/padata.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
#include "hugetlb_cma.h"
#include <linux/page-isolation.h>

int hugetlb_max_hstate __read_mostly;
59
unsigned int default_hstate_idx;
60
struct hstate hstates[HUGE_MAX_HSTATE];
61
62
__initdata nodemask_t hugetlb_bootmem_nodes;
63
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
64
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
65
66
/*
67
* Due to ordering constraints across the init code for various
68
* architectures, hugetlb hstate cmdline parameters can't simply
69
* be early_param. early_param might call the setup function
70
* before valid hugetlb page sizes are determined, leading to
71
* incorrect rejection of valid hugepagesz= options.
72
*
73
* So, record the parameters early and consume them whenever the
74
* init code is ready for them, by calling hugetlb_parse_params().
75
*/
76
77
/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
78
#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1)
79
struct hugetlb_cmdline {
80
char *val;
81
int (*setup)(char *val);
82
};
83
84
/* for command line parsing */
85
static struct hstate * __initdata parsed_hstate;
86
static unsigned long __initdata default_hstate_max_huge_pages;
87
static bool __initdata parsed_valid_hugepagesz = true;
88
static bool __initdata parsed_default_hugepagesz;
89
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
90
static unsigned long hugepage_allocation_threads __initdata;
91
92
static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
93
static int hstate_cmdline_index __initdata;
94
static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
95
static int hugetlb_param_index __initdata;
96
static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
97
static __init void hugetlb_parse_params(void);
98
99
#define hugetlb_early_param(str, func) \
100
static __init int func##args(char *s) \
101
{ \
102
return hugetlb_add_param(s, func); \
103
} \
104
early_param(str, func##args)
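/*
 * Illustrative sketch (not part of the original file): how a command line
 * handler is registered through the deferred-parsing machinery above. The
 * "example_opt" name and handler are hypothetical; the real handlers
 * (hugepages, hugepagesz, default_hugepagesz) appear later in this file.
 */
static __init int example_opt_setup(char *s)
{
        /* Parse 's' here once hstates are known; return 0 on success. */
        return 0;
}
hugetlb_early_param("example_opt", example_opt_setup);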
105
106
/*
107
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
108
* free_huge_pages, and surplus_huge_pages.
109
*/
110
__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);
111
112
/*
113
* Serializes faults on the same logical page. This is used to
114
* prevent spurious OOMs when the hugepage pool is fully utilized.
115
*/
116
static int num_fault_mutexes __ro_after_init;
117
struct mutex *hugetlb_fault_mutex_table __ro_after_init;
118
119
/* Forward declaration */
120
static int hugetlb_acct_memory(struct hstate *h, long delta);
121
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
122
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
123
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
124
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
125
unsigned long start, unsigned long end, bool take_locks);
126
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
127
128
static void hugetlb_free_folio(struct folio *folio)
129
{
130
if (folio_test_hugetlb_cma(folio)) {
131
hugetlb_cma_free_folio(folio);
132
return;
133
}
134
135
folio_put(folio);
136
}
137
138
static inline bool subpool_is_free(struct hugepage_subpool *spool)
139
{
140
if (spool->count)
141
return false;
142
if (spool->max_hpages != -1)
143
return spool->used_hpages == 0;
144
if (spool->min_hpages != -1)
145
return spool->rsv_hpages == spool->min_hpages;
146
147
return true;
148
}
149
150
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
151
unsigned long irq_flags)
152
{
153
spin_unlock_irqrestore(&spool->lock, irq_flags);
154
155
/*
 * If no pages are used, and no other handles to the subpool
 * remain, give up any reservations based on minimum size and
 * free the subpool.
 */
158
if (subpool_is_free(spool)) {
159
if (spool->min_hpages != -1)
160
hugetlb_acct_memory(spool->hstate,
161
-spool->min_hpages);
162
kfree(spool);
163
}
164
}
165
166
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
167
long min_hpages)
168
{
169
struct hugepage_subpool *spool;
170
171
spool = kzalloc(sizeof(*spool), GFP_KERNEL);
172
if (!spool)
173
return NULL;
174
175
spin_lock_init(&spool->lock);
176
spool->count = 1;
177
spool->max_hpages = max_hpages;
178
spool->hstate = h;
179
spool->min_hpages = min_hpages;
180
181
if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
182
kfree(spool);
183
return NULL;
184
}
185
spool->rsv_hpages = min_hpages;
186
187
return spool;
188
}
189
190
void hugepage_put_subpool(struct hugepage_subpool *spool)
191
{
192
unsigned long flags;
193
194
spin_lock_irqsave(&spool->lock, flags);
195
BUG_ON(!spool->count);
196
spool->count--;
197
unlock_or_release_subpool(spool, flags);
198
}
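/*
 * Illustrative sketch (not part of the original file): subpool lifetime as
 * managed by a hugetlbfs mount. The sizes are arbitrary; -1 would mean
 * "no limit" / "no minimum" as in the checks above.
 */
static void __maybe_unused example_subpool_lifetime(struct hstate *h)
{
        /* Cap the mount at 64 huge pages and pre-reserve a minimum of 8. */
        struct hugepage_subpool *spool = hugepage_new_subpool(h, 64, 8);

        if (!spool)
                return;
        /* ... charge allocations against 'spool' while the mount is live ... */
        hugepage_put_subpool(spool);    /* drops the initial reference */
}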
199
200
/*
201
* Subpool accounting for allocating and reserving pages.
202
* Return -ENOMEM if there are not enough resources to satisfy the
203
* request. Otherwise, return the number of pages by which the
204
* global pools must be adjusted (upward). The returned value may
205
* only be different than the passed value (delta) in the case where
206
* a subpool minimum size must be maintained.
207
*/
208
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
209
long delta)
210
{
211
long ret = delta;
212
213
if (!spool)
214
return ret;
215
216
spin_lock_irq(&spool->lock);
217
218
if (spool->max_hpages != -1) { /* maximum size accounting */
219
if ((spool->used_hpages + delta) <= spool->max_hpages)
220
spool->used_hpages += delta;
221
else {
222
ret = -ENOMEM;
223
goto unlock_ret;
224
}
225
}
226
227
/* minimum size accounting */
228
if (spool->min_hpages != -1 && spool->rsv_hpages) {
229
if (delta > spool->rsv_hpages) {
230
/*
231
* Asking for more reserves than those already taken on
232
* behalf of subpool. Return difference.
233
*/
234
ret = delta - spool->rsv_hpages;
235
spool->rsv_hpages = 0;
236
} else {
237
ret = 0; /* reserves already accounted for */
238
spool->rsv_hpages -= delta;
239
}
240
}
241
242
unlock_ret:
243
spin_unlock_irq(&spool->lock);
244
return ret;
245
}
246
247
/*
248
* Subpool accounting for freeing and unreserving pages.
249
* Return the number of global page reservations that must be dropped.
250
* The return value may only be different than the passed value (delta)
251
* in the case where a subpool minimum size must be maintained.
252
*/
253
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
254
long delta)
255
{
256
long ret = delta;
257
unsigned long flags;
258
259
if (!spool)
260
return delta;
261
262
spin_lock_irqsave(&spool->lock, flags);
263
264
if (spool->max_hpages != -1) /* maximum size accounting */
265
spool->used_hpages -= delta;
266
267
/* minimum size accounting */
268
if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
269
if (spool->rsv_hpages + delta <= spool->min_hpages)
270
ret = 0;
271
else
272
ret = spool->rsv_hpages + delta - spool->min_hpages;
273
274
spool->rsv_hpages += delta;
275
if (spool->rsv_hpages > spool->min_hpages)
276
spool->rsv_hpages = spool->min_hpages;
277
}
278
279
/*
280
* If hugetlbfs_put_super couldn't free spool due to an outstanding
281
* quota reference, free it now.
282
*/
283
unlock_or_release_subpool(spool, flags);
284
285
return ret;
286
}
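/*
 * Illustrative sketch (not part of the original file): how callers such as
 * hugetlb_reserve_pages() pair the subpool helpers above with global
 * accounting. Error paths other than the ones shown are omitted.
 */
static int __maybe_unused example_subpool_charge(struct hstate *h,
                                                 struct hugepage_subpool *spool,
                                                 long npages)
{
        long gbl = hugepage_subpool_get_pages(spool, npages);

        if (gbl < 0)
                return -ENOMEM;

        /* Only 'gbl' pages must be added to the global reserve. */
        if (hugetlb_acct_memory(h, gbl) < 0) {
                hugepage_subpool_put_pages(spool, npages);
                return -ENOMEM;
        }
        return 0;
}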
287
288
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
289
{
290
return subpool_inode(file_inode(vma->vm_file));
291
}
292
293
/*
294
* hugetlb vma_lock helper routines
295
*/
296
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
297
{
298
if (__vma_shareable_lock(vma)) {
299
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
300
301
down_read(&vma_lock->rw_sema);
302
} else if (__vma_private_lock(vma)) {
303
struct resv_map *resv_map = vma_resv_map(vma);
304
305
down_read(&resv_map->rw_sema);
306
}
307
}
308
309
void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
310
{
311
if (__vma_shareable_lock(vma)) {
312
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
313
314
up_read(&vma_lock->rw_sema);
315
} else if (__vma_private_lock(vma)) {
316
struct resv_map *resv_map = vma_resv_map(vma);
317
318
up_read(&resv_map->rw_sema);
319
}
320
}
321
322
void hugetlb_vma_lock_write(struct vm_area_struct *vma)
323
{
324
if (__vma_shareable_lock(vma)) {
325
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
326
327
down_write(&vma_lock->rw_sema);
328
} else if (__vma_private_lock(vma)) {
329
struct resv_map *resv_map = vma_resv_map(vma);
330
331
down_write(&resv_map->rw_sema);
332
}
333
}
334
335
void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
336
{
337
if (__vma_shareable_lock(vma)) {
338
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
339
340
up_write(&vma_lock->rw_sema);
341
} else if (__vma_private_lock(vma)) {
342
struct resv_map *resv_map = vma_resv_map(vma);
343
344
up_write(&resv_map->rw_sema);
345
}
346
}
347
348
int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
349
{
350
351
if (__vma_shareable_lock(vma)) {
352
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
353
354
return down_write_trylock(&vma_lock->rw_sema);
355
} else if (__vma_private_lock(vma)) {
356
struct resv_map *resv_map = vma_resv_map(vma);
357
358
return down_write_trylock(&resv_map->rw_sema);
359
}
360
361
return 1;
362
}
363
364
void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
365
{
366
if (__vma_shareable_lock(vma)) {
367
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
368
369
lockdep_assert_held(&vma_lock->rw_sema);
370
} else if (__vma_private_lock(vma)) {
371
struct resv_map *resv_map = vma_resv_map(vma);
372
373
lockdep_assert_held(&resv_map->rw_sema);
374
}
375
}
376
377
void hugetlb_vma_lock_release(struct kref *kref)
378
{
379
struct hugetlb_vma_lock *vma_lock = container_of(kref,
380
struct hugetlb_vma_lock, refs);
381
382
kfree(vma_lock);
383
}
384
385
static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
386
{
387
struct vm_area_struct *vma = vma_lock->vma;
388
389
/*
390
* The vma_lock structure may or may not be released as a result of the put;
391
* it certainly will no longer be attached to the vma, so clear the pointer.
392
* Semaphore synchronizes access to vma_lock->vma field.
393
*/
394
vma_lock->vma = NULL;
395
vma->vm_private_data = NULL;
396
up_write(&vma_lock->rw_sema);
397
kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
398
}
399
400
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
401
{
402
if (__vma_shareable_lock(vma)) {
403
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
404
405
__hugetlb_vma_unlock_write_put(vma_lock);
406
} else if (__vma_private_lock(vma)) {
407
struct resv_map *resv_map = vma_resv_map(vma);
408
409
/* no free for anon vmas, but still need to unlock */
410
up_write(&resv_map->rw_sema);
411
}
412
}
413
414
static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
415
{
416
/*
417
* Only present in sharable vmas.
418
*/
419
if (!vma || !__vma_shareable_lock(vma))
420
return;
421
422
if (vma->vm_private_data) {
423
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
424
425
down_write(&vma_lock->rw_sema);
426
__hugetlb_vma_unlock_write_put(vma_lock);
427
}
428
}
429
430
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
431
{
432
struct hugetlb_vma_lock *vma_lock;
433
434
/* Only establish in (flags) sharable vmas */
435
if (!vma || !(vma->vm_flags & VM_MAYSHARE))
436
return;
437
438
/* Should never get here with non-NULL vm_private_data */
439
if (vma->vm_private_data)
440
return;
441
442
vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
443
if (!vma_lock) {
444
/*
445
* If we cannot allocate the structure, then the vma cannot
446
* participate in pmd sharing. This is only a possible
447
* performance enhancement and memory saving issue.
448
* However, the lock is also used to synchronize page
449
* faults with truncation. If the lock is not present,
450
* unlikely races could leave pages in a file past i_size
451
* until the file is removed. Warn in the unlikely case of
452
* allocation failure.
453
*/
454
pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
455
return;
456
}
457
458
kref_init(&vma_lock->refs);
459
init_rwsem(&vma_lock->rw_sema);
460
vma_lock->vma = vma;
461
vma->vm_private_data = vma_lock;
462
}
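/*
 * Illustrative sketch (not part of the original file): the read-side locking
 * pattern used by fault paths around work that must not race with PMD
 * unsharing or truncation. The helpers degrade to no-ops when the vma has
 * neither a shared vma_lock nor a private resv_map lock.
 */
static void __maybe_unused example_vma_read_section(struct vm_area_struct *vma)
{
        hugetlb_vma_lock_read(vma);
        /* ... look up / instantiate hugetlb page table entries here ... */
        hugetlb_vma_unlock_read(vma);
}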
463
464
/* Helper that removes a struct file_region from the resv_map cache and returns
465
* it for use.
466
*/
467
static struct file_region *
468
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
469
{
470
struct file_region *nrg;
471
472
VM_BUG_ON(resv->region_cache_count <= 0);
473
474
resv->region_cache_count--;
475
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
476
list_del(&nrg->link);
477
478
nrg->from = from;
479
nrg->to = to;
480
481
return nrg;
482
}
483
484
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
485
struct file_region *rg)
486
{
487
#ifdef CONFIG_CGROUP_HUGETLB
488
nrg->reservation_counter = rg->reservation_counter;
489
nrg->css = rg->css;
490
if (rg->css)
491
css_get(rg->css);
492
#endif
493
}
494
495
/* Helper that records hugetlb_cgroup uncharge info. */
496
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
497
struct hstate *h,
498
struct resv_map *resv,
499
struct file_region *nrg)
500
{
501
#ifdef CONFIG_CGROUP_HUGETLB
502
if (h_cg) {
503
nrg->reservation_counter =
504
&h_cg->rsvd_hugepage[hstate_index(h)];
505
nrg->css = &h_cg->css;
506
/*
507
* The caller will hold exactly one h_cg->css reference for the
508
* whole contiguous reservation region. But this area might be
509
* scattered when some file_regions already reside in
510
* it. As a result, many file_regions may share only one css
511
* reference. In order to ensure that one file_region must hold
512
* exactly one h_cg->css reference, we should do css_get for
513
* each file_region and leave the reference held by caller
514
* untouched.
515
*/
516
css_get(&h_cg->css);
517
if (!resv->pages_per_hpage)
518
resv->pages_per_hpage = pages_per_huge_page(h);
519
/* pages_per_hpage should be the same for all entries in
520
* a resv_map.
521
*/
522
VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
523
} else {
524
nrg->reservation_counter = NULL;
525
nrg->css = NULL;
526
}
527
#endif
528
}
529
530
static void put_uncharge_info(struct file_region *rg)
531
{
532
#ifdef CONFIG_CGROUP_HUGETLB
533
if (rg->css)
534
css_put(rg->css);
535
#endif
536
}
537
538
static bool has_same_uncharge_info(struct file_region *rg,
539
struct file_region *org)
540
{
541
#ifdef CONFIG_CGROUP_HUGETLB
542
return rg->reservation_counter == org->reservation_counter &&
543
rg->css == org->css;
544
545
#else
546
return true;
547
#endif
548
}
549
550
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
551
{
552
struct file_region *nrg, *prg;
553
554
prg = list_prev_entry(rg, link);
555
if (&prg->link != &resv->regions && prg->to == rg->from &&
556
has_same_uncharge_info(prg, rg)) {
557
prg->to = rg->to;
558
559
list_del(&rg->link);
560
put_uncharge_info(rg);
561
kfree(rg);
562
563
rg = prg;
564
}
565
566
nrg = list_next_entry(rg, link);
567
if (&nrg->link != &resv->regions && nrg->from == rg->to &&
568
has_same_uncharge_info(nrg, rg)) {
569
nrg->from = rg->from;
570
571
list_del(&rg->link);
572
put_uncharge_info(rg);
573
kfree(rg);
574
}
575
}
576
577
static inline long
578
hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
579
long to, struct hstate *h, struct hugetlb_cgroup *cg,
580
long *regions_needed)
581
{
582
struct file_region *nrg;
583
584
if (!regions_needed) {
585
nrg = get_file_region_entry_from_cache(map, from, to);
586
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
587
list_add(&nrg->link, rg);
588
coalesce_file_region(map, nrg);
589
} else
590
*regions_needed += 1;
591
592
return to - from;
593
}
594
595
/*
596
* Must be called with resv->lock held.
597
*
598
* Calling this with regions_needed != NULL will count the number of pages
599
* to be added but will not modify the linked list. And regions_needed will
600
* indicate the number of file_regions needed in the cache to add
601
* the regions for this range.
602
*/
603
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
604
struct hugetlb_cgroup *h_cg,
605
struct hstate *h, long *regions_needed)
606
{
607
long add = 0;
608
struct list_head *head = &resv->regions;
609
long last_accounted_offset = f;
610
struct file_region *iter, *trg = NULL;
611
struct list_head *rg = NULL;
612
613
if (regions_needed)
614
*regions_needed = 0;
615
616
/* In this loop, we essentially handle an entry for the range
617
* [last_accounted_offset, iter->from), at every iteration, with some
618
* bounds checking.
619
*/
620
list_for_each_entry_safe(iter, trg, head, link) {
621
/* Skip irrelevant regions that start before our range. */
622
if (iter->from < f) {
623
/* If this region ends after the last accounted offset,
624
* then we need to update last_accounted_offset.
625
*/
626
if (iter->to > last_accounted_offset)
627
last_accounted_offset = iter->to;
628
continue;
629
}
630
631
/* When we find a region that starts beyond our range, we've
632
* finished.
633
*/
634
if (iter->from >= t) {
635
rg = iter->link.prev;
636
break;
637
}
638
639
/* Add an entry for last_accounted_offset -> iter->from, and
640
* update last_accounted_offset.
641
*/
642
if (iter->from > last_accounted_offset)
643
add += hugetlb_resv_map_add(resv, iter->link.prev,
644
last_accounted_offset,
645
iter->from, h, h_cg,
646
regions_needed);
647
648
last_accounted_offset = iter->to;
649
}
650
651
/* Handle the case where our range extends beyond
652
* last_accounted_offset.
653
*/
654
if (!rg)
655
rg = head->prev;
656
if (last_accounted_offset < t)
657
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
658
t, h, h_cg, regions_needed);
659
660
return add;
661
}
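/*
 * Worked example (illustrative, not part of the original file): with existing
 * regions [2, 4) and [7, 9), add_reservation_in_range(resv, 0, 10, ...) covers
 * [0, 2), [4, 7) and [9, 10) and returns 2 + 3 + 1 = 6 pages. In counting mode
 * (regions_needed != NULL) it also reports 3 file_regions needed, without
 * touching the list.
 */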
662
663
/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
664
*/
665
static int allocate_file_region_entries(struct resv_map *resv,
666
int regions_needed)
667
__must_hold(&resv->lock)
668
{
669
LIST_HEAD(allocated_regions);
670
int to_allocate = 0, i = 0;
671
struct file_region *trg = NULL, *rg = NULL;
672
673
VM_BUG_ON(regions_needed < 0);
674
675
/*
676
* Check for sufficient descriptors in the cache to accommodate
677
* the number of in progress add operations plus regions_needed.
678
*
679
* This is a while loop because when we drop the lock, some other call
680
* to region_add or region_del may have consumed some region_entries,
681
* so we keep looping here until we finally have enough entries for
682
* (adds_in_progress + regions_needed).
683
*/
684
while (resv->region_cache_count <
685
(resv->adds_in_progress + regions_needed)) {
686
to_allocate = resv->adds_in_progress + regions_needed -
687
resv->region_cache_count;
688
689
/* At this point, we should have enough entries in the cache
690
* for all the existing adds_in_progress. We should only be
691
* needing to allocate for regions_needed.
692
*/
693
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
694
695
spin_unlock(&resv->lock);
696
for (i = 0; i < to_allocate; i++) {
697
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
698
if (!trg)
699
goto out_of_memory;
700
list_add(&trg->link, &allocated_regions);
701
}
702
703
spin_lock(&resv->lock);
704
705
list_splice(&allocated_regions, &resv->region_cache);
706
resv->region_cache_count += to_allocate;
707
}
708
709
return 0;
710
711
out_of_memory:
712
list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
713
list_del(&rg->link);
714
kfree(rg);
715
}
716
return -ENOMEM;
717
}
718
719
/*
720
* Add the huge page range represented by [f, t) to the reserve
721
* map. Regions will be taken from the cache to fill in this range.
722
* Sufficient regions should exist in the cache due to the previous
723
* call to region_chg with the same range, but in some cases the cache will not
724
* have sufficient entries due to races with other code doing region_add or
725
* region_del. The extra needed entries will be allocated.
726
*
727
* regions_needed is the out value provided by a previous call to region_chg.
728
*
729
* Return the number of new huge pages added to the map. This number is greater
730
* than or equal to zero. If file_region entries needed to be allocated for
731
* this operation and we were not able to allocate, it returns -ENOMEM.
732
* region_add of regions of length 1 never allocates file_regions and cannot
733
* fail; region_chg will always allocate at least 1 entry and a region_add for
734
* 1 page will only require at most 1 entry.
735
*/
736
static long region_add(struct resv_map *resv, long f, long t,
737
long in_regions_needed, struct hstate *h,
738
struct hugetlb_cgroup *h_cg)
739
{
740
long add = 0, actual_regions_needed = 0;
741
742
spin_lock(&resv->lock);
743
retry:
744
745
/* Count how many regions are actually needed to execute this add. */
746
add_reservation_in_range(resv, f, t, NULL, NULL,
747
&actual_regions_needed);
748
749
/*
750
* Check for sufficient descriptors in the cache to accommodate
751
* this add operation. Note that actual_regions_needed may be greater
752
* than in_regions_needed, as the resv_map may have been modified since
753
* the region_chg call. In this case, we need to make sure that we
754
* allocate extra entries, such that we have enough for all the
755
* existing adds_in_progress, plus the excess needed for this
756
* operation.
757
*/
758
if (actual_regions_needed > in_regions_needed &&
759
resv->region_cache_count <
760
resv->adds_in_progress +
761
(actual_regions_needed - in_regions_needed)) {
762
/* region_add operation of range 1 should never need to
763
* allocate file_region entries.
764
*/
765
VM_BUG_ON(t - f <= 1);
766
767
if (allocate_file_region_entries(
768
resv, actual_regions_needed - in_regions_needed)) {
769
return -ENOMEM;
770
}
771
772
goto retry;
773
}
774
775
add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
776
777
resv->adds_in_progress -= in_regions_needed;
778
779
spin_unlock(&resv->lock);
780
return add;
781
}
782
783
/*
784
* Examine the existing reserve map and determine how many
785
* huge pages in the specified range [f, t) are NOT currently
786
* represented. This routine is called before a subsequent
787
* call to region_add that will actually modify the reserve
788
* map to add the specified range [f, t). region_chg does
789
* not change the number of huge pages represented by the
790
* map. A number of new file_region structures are added to the cache as
791
* placeholders for the subsequent region_add call to use. At least 1
792
* file_region structure is added.
793
*
794
* out_regions_needed is the number of regions added to the
795
* resv->adds_in_progress. This value needs to be provided to a follow up call
796
* to region_add or region_abort for proper accounting.
797
*
798
* Returns the number of huge pages that need to be added to the existing
799
* reservation map for the range [f, t). This number is greater than or equal to
800
* zero. -ENOMEM is returned if a new file_region structure or cache entry
801
* is needed and cannot be allocated.
802
*/
803
static long region_chg(struct resv_map *resv, long f, long t,
804
long *out_regions_needed)
805
{
806
long chg = 0;
807
808
spin_lock(&resv->lock);
809
810
/* Count how many hugepages in this range are NOT represented. */
811
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
812
out_regions_needed);
813
814
if (*out_regions_needed == 0)
815
*out_regions_needed = 1;
816
817
if (allocate_file_region_entries(resv, *out_regions_needed))
818
return -ENOMEM;
819
820
resv->adds_in_progress += *out_regions_needed;
821
822
spin_unlock(&resv->lock);
823
return chg;
824
}
825
826
/*
827
* Abort the in progress add operation. The adds_in_progress field
828
* of the resv_map keeps track of the operations in progress between
829
* calls to region_chg and region_add. Operations are sometimes
830
* aborted after the call to region_chg. In such cases, region_abort
831
* is called to decrement the adds_in_progress counter. regions_needed
832
* is the value returned by the region_chg call, it is used to decrement
833
* the adds_in_progress counter.
834
*
835
* NOTE: The range arguments [f, t) are not needed or used in this
836
* routine. They are kept to make reading the calling code easier as
837
* arguments will match the associated region_chg call.
838
*/
839
static void region_abort(struct resv_map *resv, long f, long t,
840
long regions_needed)
841
{
842
spin_lock(&resv->lock);
843
VM_BUG_ON(!resv->region_cache_count);
844
resv->adds_in_progress -= regions_needed;
845
spin_unlock(&resv->lock);
846
}
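/*
 * Illustrative sketch (not part of the original file): the three-step
 * reservation protocol implemented by region_chg()/region_add()/region_abort()
 * above, as used by the higher-level reservation code. Subpool and cgroup
 * charging between the steps is elided; h_cg is passed as NULL here.
 */
static long __maybe_unused example_reserve_range(struct resv_map *resv,
                                                 struct hstate *h,
                                                 long from, long to)
{
        long chg, regions_needed;

        chg = region_chg(resv, from, to, &regions_needed);
        if (chg < 0)
                return chg;

        if (hugetlb_acct_memory(h, chg) < 0) {
                /* Global accounting failed: drop the in-progress marker. */
                region_abort(resv, from, to, regions_needed);
                return -ENOMEM;
        }

        /* Commit the range; the result may differ from 'chg' if we raced. */
        return region_add(resv, from, to, regions_needed, h, NULL);
}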
847
848
/*
849
* Delete the specified range [f, t) from the reserve map. If the
850
* t parameter is LONG_MAX, this indicates that ALL regions after f
851
* should be deleted. Locate the regions which intersect [f, t)
852
* and either trim, delete or split the existing regions.
853
*
854
* Returns the number of huge pages deleted from the reserve map.
855
* In the normal case, the return value is zero or more. In the
856
* case where a region must be split, a new region descriptor must
857
* be allocated. If the allocation fails, -ENOMEM will be returned.
858
* NOTE: If the parameter t == LONG_MAX, then we will never split
859
* a region and possibly return -ENOMEM. Callers specifying
860
* t == LONG_MAX do not need to check for -ENOMEM error.
861
*/
862
static long region_del(struct resv_map *resv, long f, long t)
863
{
864
struct list_head *head = &resv->regions;
865
struct file_region *rg, *trg;
866
struct file_region *nrg = NULL;
867
long del = 0;
868
869
retry:
870
spin_lock(&resv->lock);
871
list_for_each_entry_safe(rg, trg, head, link) {
872
/*
873
* Skip regions before the range to be deleted. file_region
874
* ranges are normally of the form [from, to). However, there
875
* may be a "placeholder" entry in the map which is of the form
876
* (from, to) with from == to. Check for placeholder entries
877
* at the beginning of the range to be deleted.
878
*/
879
if (rg->to <= f && (rg->to != rg->from || rg->to != f))
880
continue;
881
882
if (rg->from >= t)
883
break;
884
885
if (f > rg->from && t < rg->to) { /* Must split region */
886
/*
887
* Check for an entry in the cache before dropping
888
* lock and attempting allocation.
889
*/
890
if (!nrg &&
891
resv->region_cache_count > resv->adds_in_progress) {
892
nrg = list_first_entry(&resv->region_cache,
893
struct file_region,
894
link);
895
list_del(&nrg->link);
896
resv->region_cache_count--;
897
}
898
899
if (!nrg) {
900
spin_unlock(&resv->lock);
901
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
902
if (!nrg)
903
return -ENOMEM;
904
goto retry;
905
}
906
907
del += t - f;
908
hugetlb_cgroup_uncharge_file_region(
909
resv, rg, t - f, false);
910
911
/* New entry for end of split region */
912
nrg->from = t;
913
nrg->to = rg->to;
914
915
copy_hugetlb_cgroup_uncharge_info(nrg, rg);
916
917
INIT_LIST_HEAD(&nrg->link);
918
919
/* Original entry is trimmed */
920
rg->to = f;
921
922
list_add(&nrg->link, &rg->link);
923
nrg = NULL;
924
break;
925
}
926
927
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
928
del += rg->to - rg->from;
929
hugetlb_cgroup_uncharge_file_region(resv, rg,
930
rg->to - rg->from, true);
931
list_del(&rg->link);
932
kfree(rg);
933
continue;
934
}
935
936
if (f <= rg->from) { /* Trim beginning of region */
937
hugetlb_cgroup_uncharge_file_region(resv, rg,
938
t - rg->from, false);
939
940
del += t - rg->from;
941
rg->from = t;
942
} else { /* Trim end of region */
943
hugetlb_cgroup_uncharge_file_region(resv, rg,
944
rg->to - f, false);
945
946
del += rg->to - f;
947
rg->to = f;
948
}
949
}
950
951
spin_unlock(&resv->lock);
952
kfree(nrg);
953
return del;
954
}
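/*
 * Worked example (illustrative, not part of the original file): with a single
 * region [0, 10), region_del(resv, 3, 5) splits it into [0, 3) and [5, 10)
 * using one cached or freshly allocated file_region and returns 2, while
 * region_del(resv, 0, LONG_MAX) drops every region and never needs to split,
 * so it cannot return -ENOMEM.
 */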
955
956
/*
957
* A rare out of memory error was encountered which prevented removal of
958
* the reserve map region for a page. The huge page itself was freed
959
* and removed from the page cache. This routine will adjust the subpool
960
* usage count, and the global reserve count if needed. By incrementing
961
* these counts, the reserve map entry which could not be deleted will
962
* appear as a "reserved" entry instead of simply dangling with incorrect
963
* counts.
964
*/
965
void hugetlb_fix_reserve_counts(struct inode *inode)
966
{
967
struct hugepage_subpool *spool = subpool_inode(inode);
968
long rsv_adjust;
969
bool reserved = false;
970
971
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
972
if (rsv_adjust > 0) {
973
struct hstate *h = hstate_inode(inode);
974
975
if (!hugetlb_acct_memory(h, 1))
976
reserved = true;
977
} else if (!rsv_adjust) {
978
reserved = true;
979
}
980
981
if (!reserved)
982
pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
983
}
984
985
/*
986
* Count and return the number of huge pages in the reserve map
987
* that intersect with the range [f, t).
988
*/
989
static long region_count(struct resv_map *resv, long f, long t)
990
{
991
struct list_head *head = &resv->regions;
992
struct file_region *rg;
993
long chg = 0;
994
995
spin_lock(&resv->lock);
996
/* Locate each segment we overlap with, and count that overlap. */
997
list_for_each_entry(rg, head, link) {
998
long seg_from;
999
long seg_to;
1000
1001
if (rg->to <= f)
1002
continue;
1003
if (rg->from >= t)
1004
break;
1005
1006
seg_from = max(rg->from, f);
1007
seg_to = min(rg->to, t);
1008
1009
chg += seg_to - seg_from;
1010
}
1011
spin_unlock(&resv->lock);
1012
1013
return chg;
1014
}
1015
1016
/*
1017
* Convert the address within this vma to the page offset within
1018
* the mapping, huge page units here.
1019
*/
1020
static pgoff_t vma_hugecache_offset(struct hstate *h,
1021
struct vm_area_struct *vma, unsigned long address)
1022
{
1023
return ((address - vma->vm_start) >> huge_page_shift(h)) +
1024
(vma->vm_pgoff >> huge_page_order(h));
1025
}
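/*
 * Worked example (illustrative, not part of the original file): with 2 MB huge
 * pages (huge_page_shift() == 21, huge_page_order() == 9) and vm_pgoff == 1024
 * (i.e. 4 MB into the file in PAGE_SIZE units), an address 4 MB past vm_start
 * yields (4 MB >> 21) + (1024 >> 9) = 2 + 2 = offset 4 in huge-page units.
 */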
1026
1027
/**
1028
* vma_kernel_pagesize - Page size granularity for this VMA.
1029
* @vma: The user mapping.
1030
*
1031
* Folios in this VMA will be aligned to, and at least the size of, the
1032
* number of bytes returned by this function.
1033
*
1034
* Return: The default size of the folios allocated when backing a VMA.
1035
*/
1036
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
1037
{
1038
if (vma->vm_ops && vma->vm_ops->pagesize)
1039
return vma->vm_ops->pagesize(vma);
1040
return PAGE_SIZE;
1041
}
1042
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
1043
1044
/*
1045
* Return the page size being used by the MMU to back a VMA. In the majority
1046
* of cases, the page size used by the kernel matches the MMU size. On
1047
* architectures where it differs, an architecture-specific 'strong'
1048
* version of this symbol is required.
1049
*/
1050
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
1051
{
1052
return vma_kernel_pagesize(vma);
1053
}
1054
1055
/*
1056
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
1057
* bits of the reservation map pointer, which are always clear due to
1058
* alignment.
1059
*/
1060
#define HPAGE_RESV_OWNER (1UL << 0)
1061
#define HPAGE_RESV_UNMAPPED (1UL << 1)
1062
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
1063
1064
/*
1065
* These helpers are used to track how many pages are reserved for
1066
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
1067
* is guaranteed to have their future faults succeed.
1068
*
1069
* With the exception of hugetlb_dup_vma_private() which is called at fork(),
1070
* the reserve counters are updated with the hugetlb_lock held. It is safe
1071
* to reset the VMA at fork() time as it is not in use yet and there is no
1072
* chance of the global counters getting corrupted as a result of the values.
1073
*
1074
* The private mapping reservation is represented in a subtly different
1075
* manner to a shared mapping. A shared mapping has a region map associated
1076
* with the underlying file; this region map represents the backing file
1077
* pages which have ever had a reservation assigned; this persists even
1078
* after the page is instantiated. A private mapping has a region map
1079
* associated with the original mmap which is attached to all VMAs which
1080
* reference it; this region map represents those offsets which have consumed
1081
* a reservation, i.e. where pages have been instantiated.
1082
*/
1083
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
1084
{
1085
return (unsigned long)vma->vm_private_data;
1086
}
1087
1088
static void set_vma_private_data(struct vm_area_struct *vma,
1089
unsigned long value)
1090
{
1091
vma->vm_private_data = (void *)value;
1092
}
1093
1094
static void
1095
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
1096
struct hugetlb_cgroup *h_cg,
1097
struct hstate *h)
1098
{
1099
#ifdef CONFIG_CGROUP_HUGETLB
1100
if (!h_cg || !h) {
1101
resv_map->reservation_counter = NULL;
1102
resv_map->pages_per_hpage = 0;
1103
resv_map->css = NULL;
1104
} else {
1105
resv_map->reservation_counter =
1106
&h_cg->rsvd_hugepage[hstate_index(h)];
1107
resv_map->pages_per_hpage = pages_per_huge_page(h);
1108
resv_map->css = &h_cg->css;
1109
}
1110
#endif
1111
}
1112
1113
struct resv_map *resv_map_alloc(void)
1114
{
1115
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
1116
struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
1117
1118
if (!resv_map || !rg) {
1119
kfree(resv_map);
1120
kfree(rg);
1121
return NULL;
1122
}
1123
1124
kref_init(&resv_map->refs);
1125
spin_lock_init(&resv_map->lock);
1126
INIT_LIST_HEAD(&resv_map->regions);
1127
init_rwsem(&resv_map->rw_sema);
1128
1129
resv_map->adds_in_progress = 0;
1130
/*
1131
* Initialize these to 0. On shared mappings, 0's here indicate these
1132
* fields don't do cgroup accounting. On private mappings, these will be
1133
* re-initialized to the proper values, to indicate that hugetlb cgroup
1134
* reservations are to be un-charged from here.
1135
*/
1136
resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
1137
1138
INIT_LIST_HEAD(&resv_map->region_cache);
1139
list_add(&rg->link, &resv_map->region_cache);
1140
resv_map->region_cache_count = 1;
1141
1142
return resv_map;
1143
}
1144
1145
void resv_map_release(struct kref *ref)
1146
{
1147
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
1148
struct list_head *head = &resv_map->region_cache;
1149
struct file_region *rg, *trg;
1150
1151
/* Clear out any active regions before we release the map. */
1152
region_del(resv_map, 0, LONG_MAX);
1153
1154
/* ... and any entries left in the cache */
1155
list_for_each_entry_safe(rg, trg, head, link) {
1156
list_del(&rg->link);
1157
kfree(rg);
1158
}
1159
1160
VM_BUG_ON(resv_map->adds_in_progress);
1161
1162
kfree(resv_map);
1163
}
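/*
 * Illustrative sketch (not part of the original file): the typical lifetime of
 * a reserve map for a MAP_PRIVATE mapping. The owner created at mmap() time
 * holds the initial reference; the final kref_put() lands in
 * resv_map_release() above, which purges the regions and the region cache.
 */
static void __maybe_unused example_resv_map_lifetime(void)
{
        struct resv_map *resv = resv_map_alloc();

        if (!resv)
                return;
        /* ... consume/adjust reservations via region_chg()/region_add() ... */
        kref_put(&resv->refs, resv_map_release);
}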
1164
1165
static inline struct resv_map *inode_resv_map(struct inode *inode)
1166
{
1167
/*
1168
* At inode evict time, i_mapping may not point to the original
1169
* address space within the inode. This original address space
1170
* contains the pointer to the resv_map. So, always use the
1171
* address space embedded within the inode.
1172
* The VERY common case is inode->mapping == &inode->i_data, but
1173
* this may not be true for device special inodes.
1174
*/
1175
return (struct resv_map *)(&inode->i_data)->i_private_data;
1176
}
1177
1178
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
1179
{
1180
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1181
if (vma->vm_flags & VM_MAYSHARE) {
1182
struct address_space *mapping = vma->vm_file->f_mapping;
1183
struct inode *inode = mapping->host;
1184
1185
return inode_resv_map(inode);
1186
1187
} else {
1188
return (struct resv_map *)(get_vma_private_data(vma) &
1189
~HPAGE_RESV_MASK);
1190
}
1191
}
1192
1193
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
1194
{
1195
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1196
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1197
1198
set_vma_private_data(vma, (unsigned long)map);
1199
}
1200
1201
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
1202
{
1203
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1204
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1205
1206
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1207
}
1208
1209
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1210
{
1211
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1212
1213
return (get_vma_private_data(vma) & flag) != 0;
1214
}
1215
1216
bool __vma_private_lock(struct vm_area_struct *vma)
1217
{
1218
return !(vma->vm_flags & VM_MAYSHARE) &&
1219
get_vma_private_data(vma) & ~HPAGE_RESV_MASK &&
1220
is_vma_resv_set(vma, HPAGE_RESV_OWNER);
1221
}
1222
1223
void hugetlb_dup_vma_private(struct vm_area_struct *vma)
1224
{
1225
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1226
/*
1227
* Clear vm_private_data
1228
* - For shared mappings this is a per-vma semaphore that may be
1229
* allocated in a subsequent call to hugetlb_vm_op_open.
1230
* Before clearing, make sure pointer is not associated with vma
1231
* as this will leak the structure. This is the case when called
1232
* via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
1233
* been called to allocate a new structure.
1234
* - For MAP_PRIVATE mappings, this is the reserve map which does
1235
* not apply to children. Faults generated by the children are
1236
* not guaranteed to succeed, even if read-only.
1237
*/
1238
if (vma->vm_flags & VM_MAYSHARE) {
1239
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
1240
1241
if (vma_lock && vma_lock->vma != vma)
1242
vma->vm_private_data = NULL;
1243
} else
1244
vma->vm_private_data = NULL;
1245
}
1246
1247
/*
1248
* Reset and decrement one ref on hugepage private reservation.
1249
* Called with mm->mmap_lock writer semaphore held.
1250
* This function should be only used by mremap and operate on
1251
* same sized vma. It should never come here with last ref on the
1252
* reservation.
1253
*/
1254
void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1255
{
1256
/*
1257
* Clear the old hugetlb private page reservation.
1258
* It has already been transferred to new_vma.
1259
*
1260
* During a mremap() operation of a hugetlb vma we call move_vma()
1261
* which copies vma into new_vma and unmaps vma. After the copy
1262
* operation both new_vma and vma share a reference to the resv_map
1263
* struct, and at that point vma is about to be unmapped. We don't
1264
* want to return the reservation to the pool at unmap of vma because
1265
* the reservation still lives on in new_vma, so simply decrement the
1266
* ref here and remove the resv_map reference from this vma.
1267
*/
1268
struct resv_map *reservations = vma_resv_map(vma);
1269
1270
if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1271
resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1272
kref_put(&reservations->refs, resv_map_release);
1273
}
1274
1275
hugetlb_dup_vma_private(vma);
1276
}
1277
1278
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
1279
{
1280
int nid = folio_nid(folio);
1281
1282
lockdep_assert_held(&hugetlb_lock);
1283
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1284
1285
list_move(&folio->lru, &h->hugepage_freelists[nid]);
1286
h->free_huge_pages++;
1287
h->free_huge_pages_node[nid]++;
1288
folio_set_hugetlb_freed(folio);
1289
}
1290
1291
static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
1292
int nid)
1293
{
1294
struct folio *folio;
1295
bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1296
1297
lockdep_assert_held(&hugetlb_lock);
1298
list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
1299
if (pin && !folio_is_longterm_pinnable(folio))
1300
continue;
1301
1302
if (folio_test_hwpoison(folio))
1303
continue;
1304
1305
if (is_migrate_isolate_page(&folio->page))
1306
continue;
1307
1308
list_move(&folio->lru, &h->hugepage_activelist);
1309
folio_ref_unfreeze(folio, 1);
1310
folio_clear_hugetlb_freed(folio);
1311
h->free_huge_pages--;
1312
h->free_huge_pages_node[nid]--;
1313
return folio;
1314
}
1315
1316
return NULL;
1317
}
1318
1319
static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
1320
int nid, nodemask_t *nmask)
1321
{
1322
unsigned int cpuset_mems_cookie;
1323
struct zonelist *zonelist;
1324
struct zone *zone;
1325
struct zoneref *z;
1326
int node = NUMA_NO_NODE;
1327
1328
/* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectify. */
1329
if (nid == NUMA_NO_NODE)
1330
nid = numa_node_id();
1331
1332
zonelist = node_zonelist(nid, gfp_mask);
1333
1334
retry_cpuset:
1335
cpuset_mems_cookie = read_mems_allowed_begin();
1336
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1337
struct folio *folio;
1338
1339
if (!cpuset_zone_allowed(zone, gfp_mask))
1340
continue;
1341
/*
1342
* No need to ask again on the same node. The pool is node rather than
1343
* zone aware.
1344
*/
1345
if (zone_to_nid(zone) == node)
1346
continue;
1347
node = zone_to_nid(zone);
1348
1349
folio = dequeue_hugetlb_folio_node_exact(h, node);
1350
if (folio)
1351
return folio;
1352
}
1353
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1354
goto retry_cpuset;
1355
1356
return NULL;
1357
}
1358
1359
static unsigned long available_huge_pages(struct hstate *h)
1360
{
1361
return h->free_huge_pages - h->resv_huge_pages;
1362
}
1363
1364
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
1365
struct vm_area_struct *vma,
1366
unsigned long address, long gbl_chg)
1367
{
1368
struct folio *folio = NULL;
1369
struct mempolicy *mpol;
1370
gfp_t gfp_mask;
1371
nodemask_t *nodemask;
1372
int nid;
1373
1374
/*
1375
* gbl_chg==1 means the allocation requires a new page that was not
1376
* reserved before. Make sure there is at least one free page.
1377
*/
1378
if (gbl_chg && !available_huge_pages(h))
1379
goto err;
1380
1381
gfp_mask = htlb_alloc_mask(h);
1382
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1383
1384
if (mpol_is_preferred_many(mpol)) {
1385
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1386
nid, nodemask);
1387
1388
/* Fallback to all nodes if page==NULL */
1389
nodemask = NULL;
1390
}
1391
1392
if (!folio)
1393
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1394
nid, nodemask);
1395
1396
mpol_cond_put(mpol);
1397
return folio;
1398
1399
err:
1400
return NULL;
1401
}
1402
1403
/*
1404
* common helper functions for hstate_next_node_to_{alloc|free}.
1405
* We may have allocated or freed a huge page based on a different
1406
* nodes_allowed previously, so h->next_node_to_{alloc|free} might
1407
* be outside of *nodes_allowed. Ensure that we use an allowed
1408
* node for alloc or free.
1409
*/
1410
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1411
{
1412
nid = next_node_in(nid, *nodes_allowed);
1413
VM_BUG_ON(nid >= MAX_NUMNODES);
1414
1415
return nid;
1416
}
1417
1418
static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1419
{
1420
if (!node_isset(nid, *nodes_allowed))
1421
nid = next_node_allowed(nid, nodes_allowed);
1422
return nid;
1423
}
1424
1425
/*
1426
* returns the previously saved node ["this node"] from which to
1427
* allocate a persistent huge page for the pool and advance the
1428
* next node from which to allocate, handling wrap at end of node
1429
* mask.
1430
*/
1431
static int hstate_next_node_to_alloc(int *next_node,
1432
nodemask_t *nodes_allowed)
1433
{
1434
int nid;
1435
1436
VM_BUG_ON(!nodes_allowed);
1437
1438
nid = get_valid_node_allowed(*next_node, nodes_allowed);
1439
*next_node = next_node_allowed(nid, nodes_allowed);
1440
1441
return nid;
1442
}
1443
1444
/*
1445
* helper for remove_pool_hugetlb_folio() - return the previously saved
1446
* node ["this node"] from which to free a huge page. Advance the
1447
* next node id whether or not we find a free huge page to free so
1448
* that the next attempt to free addresses the next node.
1449
*/
1450
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1451
{
1452
int nid;
1453
1454
VM_BUG_ON(!nodes_allowed);
1455
1456
nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1457
h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1458
1459
return nid;
1460
}
1461
1462
#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
1463
for (nr_nodes = nodes_weight(*mask); \
1464
nr_nodes > 0 && \
1465
((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
1466
nr_nodes--)
1467
1468
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1469
for (nr_nodes = nodes_weight(*mask); \
1470
nr_nodes > 0 && \
1471
((node = hstate_next_node_to_free(hs, mask)) || 1); \
1472
nr_nodes--)
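/*
 * Illustrative sketch (not part of the original file): round-robin iteration
 * with the allocation macro above. '*next_node' is the persistent cursor
 * (normally &h->next_nid_to_alloc) that makes successive calls resume where
 * the previous one left off.
 */
static void __maybe_unused example_walk_alloc_nodes(int *next_node,
                                                    nodemask_t *nodes_allowed)
{
        int nr_nodes, node;

        for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed)
                pr_info("hugetlb example: would try to allocate on node %d\n", node);
}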
1473
1474
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1475
#ifdef CONFIG_CONTIG_ALLOC
1476
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
1477
int nid, nodemask_t *nodemask)
1478
{
1479
struct folio *folio;
1480
int order = huge_page_order(h);
1481
bool retried = false;
1482
1483
if (nid == NUMA_NO_NODE)
1484
nid = numa_mem_id();
1485
retry:
1486
folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
1487
if (!folio) {
1488
if (hugetlb_cma_exclusive_alloc())
1489
return NULL;
1490
1491
folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
1492
if (!folio)
1493
return NULL;
1494
}
1495
1496
if (folio_ref_freeze(folio, 1))
1497
return folio;
1498
1499
pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio));
1500
hugetlb_free_folio(folio);
1501
if (!retried) {
1502
retried = true;
1503
goto retry;
1504
}
1505
return NULL;
1506
}
1507
1508
#else /* !CONFIG_CONTIG_ALLOC */
1509
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
1510
int nid, nodemask_t *nodemask)
1511
{
1512
return NULL;
1513
}
1514
#endif /* CONFIG_CONTIG_ALLOC */
1515
1516
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
1517
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
1518
int nid, nodemask_t *nodemask)
1519
{
1520
return NULL;
1521
}
1522
#endif
1523
1524
/*
1525
* Remove hugetlb folio from lists.
1526
* If vmemmap exists for the folio, clear the hugetlb flag so that the
1527
* folio appears as just a compound page. Otherwise, wait until after
1528
* allocating vmemmap to clear the flag.
1529
*
1530
* Must be called with hugetlb lock held.
1531
*/
1532
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
1533
bool adjust_surplus)
1534
{
1535
int nid = folio_nid(folio);
1536
1537
VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
1538
VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
1539
1540
lockdep_assert_held(&hugetlb_lock);
1541
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1542
return;
1543
1544
list_del(&folio->lru);
1545
1546
if (folio_test_hugetlb_freed(folio)) {
1547
folio_clear_hugetlb_freed(folio);
1548
h->free_huge_pages--;
1549
h->free_huge_pages_node[nid]--;
1550
}
1551
if (adjust_surplus) {
1552
h->surplus_huge_pages--;
1553
h->surplus_huge_pages_node[nid]--;
1554
}
1555
1556
/*
1557
* We can only clear the hugetlb flag after allocating vmemmap
1558
* pages. Otherwise, someone (memory error handling) may try to write
1559
* to tail struct pages.
1560
*/
1561
if (!folio_test_hugetlb_vmemmap_optimized(folio))
1562
__folio_clear_hugetlb(folio);
1563
1564
h->nr_huge_pages--;
1565
h->nr_huge_pages_node[nid]--;
1566
}
1567
1568
static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
1569
bool adjust_surplus)
1570
{
1571
int nid = folio_nid(folio);
1572
1573
VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
1574
1575
lockdep_assert_held(&hugetlb_lock);
1576
1577
INIT_LIST_HEAD(&folio->lru);
1578
h->nr_huge_pages++;
1579
h->nr_huge_pages_node[nid]++;
1580
1581
if (adjust_surplus) {
1582
h->surplus_huge_pages++;
1583
h->surplus_huge_pages_node[nid]++;
1584
}
1585
1586
__folio_set_hugetlb(folio);
1587
folio_change_private(folio, NULL);
1588
/*
1589
* We have to set hugetlb_vmemmap_optimized again, as the
1590
* folio_change_private(folio, NULL) call above cleared it.
1591
*/
1592
folio_set_hugetlb_vmemmap_optimized(folio);
1593
1594
arch_clear_hugetlb_flags(folio);
1595
enqueue_hugetlb_folio(h, folio);
1596
}
1597
1598
static void __update_and_free_hugetlb_folio(struct hstate *h,
1599
struct folio *folio)
1600
{
1601
bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
1602
1603
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1604
return;
1605
1606
/*
1607
* If we don't know which subpages are hwpoisoned, we can't free
1608
* the hugepage, so it's leaked intentionally.
1609
*/
1610
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1611
return;
1612
1613
/*
1614
* If folio is not vmemmap optimized (!clear_flag), then the folio
1615
* is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio
1616
* can only be passed hugetlb pages and will BUG otherwise.
1617
*/
1618
if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
1619
spin_lock_irq(&hugetlb_lock);
1620
/*
1621
* If we cannot allocate vmemmap pages, just refuse to free the
1622
* page, put the page back on the hugetlb free list, and treat it
1623
* as a surplus page.
1624
*/
1625
add_hugetlb_folio(h, folio, true);
1626
spin_unlock_irq(&hugetlb_lock);
1627
return;
1628
}
1629
1630
/*
1631
* If vmemmap pages were allocated above, then we need to clear the
1632
* hugetlb flag under the hugetlb lock.
1633
*/
1634
if (folio_test_hugetlb(folio)) {
1635
spin_lock_irq(&hugetlb_lock);
1636
__folio_clear_hugetlb(folio);
1637
spin_unlock_irq(&hugetlb_lock);
1638
}
1639
1640
/*
1641
* Move PageHWPoison flag from head page to the raw error pages,
1642
* which makes any healthy subpages reusable.
1643
*/
1644
if (unlikely(folio_test_hwpoison(folio)))
1645
folio_clear_hugetlb_hwpoison(folio);
1646
1647
folio_ref_unfreeze(folio, 1);
1648
1649
hugetlb_free_folio(folio);
1650
}
1651
1652
/*
1653
* Since update_and_free_hugetlb_folio() can be called under any context, we cannot
1654
* use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
1655
* actual freeing to a workqueue to avoid using GFP_ATOMIC to allocate
1656
* the vmemmap pages.
1657
*
1658
* free_hpage_workfn() locklessly retrieves the linked list of pages to be
1659
* freed and frees them one-by-one. As the page->mapping pointer is going
1660
* to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
1661
* structure of a lockless linked list of huge pages to be freed.
1662
*/
1663
static LLIST_HEAD(hpage_freelist);
1664
1665
static void free_hpage_workfn(struct work_struct *work)
1666
{
1667
struct llist_node *node;
1668
1669
node = llist_del_all(&hpage_freelist);
1670
1671
while (node) {
1672
struct folio *folio;
1673
struct hstate *h;
1674
1675
folio = container_of((struct address_space **)node,
1676
struct folio, mapping);
1677
node = node->next;
1678
folio->mapping = NULL;
1679
/*
1680
* The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
1681
* folio_hstate() is going to trigger because a previous call to
1682
* remove_hugetlb_folio() will clear the hugetlb bit, so do
1683
* not use folio_hstate() directly.
1684
*/
1685
h = size_to_hstate(folio_size(folio));
1686
1687
__update_and_free_hugetlb_folio(h, folio);
1688
1689
cond_resched();
1690
}
1691
}
1692
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1693
1694
static inline void flush_free_hpage_work(struct hstate *h)
1695
{
1696
if (hugetlb_vmemmap_optimizable(h))
1697
flush_work(&free_hpage_work);
1698
}
1699
1700
static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
1701
bool atomic)
1702
{
1703
if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
1704
__update_and_free_hugetlb_folio(h, folio);
1705
return;
1706
}
1707
1708
/*
1709
* Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
1710
*
1711
* Only call schedule_work() if hpage_freelist is previously
1712
* empty. Otherwise, schedule_work() has already been called but the workfn
1713
* hasn't retrieved the list yet.
1714
*/
1715
if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
1716
schedule_work(&free_hpage_work);
1717
}
1718
1719
static void bulk_vmemmap_restore_error(struct hstate *h,
1720
struct list_head *folio_list,
1721
struct list_head *non_hvo_folios)
1722
{
1723
struct folio *folio, *t_folio;
1724
1725
if (!list_empty(non_hvo_folios)) {
1726
/*
1727
* Free any restored hugetlb pages so that restore of the
1728
* entire list can be retried.
1729
* The idea is that in the common case of ENOMEM errors freeing
1730
* hugetlb pages with vmemmap we will free up memory so that we
1731
* can allocate vmemmap for more hugetlb pages.
1732
*/
1733
list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
1734
list_del(&folio->lru);
1735
spin_lock_irq(&hugetlb_lock);
1736
__folio_clear_hugetlb(folio);
1737
spin_unlock_irq(&hugetlb_lock);
1738
update_and_free_hugetlb_folio(h, folio, false);
1739
cond_resched();
1740
}
1741
} else {
1742
/*
1743
* In the case where there are no folios which can be
1744
* immediately freed, we loop through the list trying to restore
1745
* vmemmap individually in the hope that someone elsewhere may
1746
* have done something to cause success (such as freeing some
1747
* memory). If unable to restore a hugetlb page, the hugetlb
1748
* page is made a surplus page and removed from the list.
1749
* If we are able to restore vmemmap and free one hugetlb page, we
1750
* quit processing the list to retry the bulk operation.
1751
*/
1752
list_for_each_entry_safe(folio, t_folio, folio_list, lru)
1753
if (hugetlb_vmemmap_restore_folio(h, folio)) {
1754
list_del(&folio->lru);
1755
spin_lock_irq(&hugetlb_lock);
1756
add_hugetlb_folio(h, folio, true);
1757
spin_unlock_irq(&hugetlb_lock);
1758
} else {
1759
list_del(&folio->lru);
1760
spin_lock_irq(&hugetlb_lock);
1761
__folio_clear_hugetlb(folio);
1762
spin_unlock_irq(&hugetlb_lock);
1763
update_and_free_hugetlb_folio(h, folio, false);
1764
cond_resched();
1765
break;
1766
}
1767
}
1768
}
1769
1770
static void update_and_free_pages_bulk(struct hstate *h,
1771
struct list_head *folio_list)
1772
{
1773
long ret;
1774
struct folio *folio, *t_folio;
1775
LIST_HEAD(non_hvo_folios);
1776
1777
/*
1778
* First allocate required vmemmap (if necessary) for all folios.
1779
* Carefully handle errors and free up any available hugetlb pages
1780
* in an effort to make forward progress.
1781
*/
1782
retry:
1783
ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
1784
if (ret < 0) {
1785
bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
1786
goto retry;
1787
}
1788
1789
/*
1790
* At this point, the list should be empty, ret should be >= 0, and there
1791
* should only be pages on the non_hvo_folios list.
1792
* Do note that the non_hvo_folios list could be empty.
1793
* Without HVO enabled, ret will be 0 and there is no need to call
1794
* __folio_clear_hugetlb as this was done previously.
1795
*/
1796
VM_WARN_ON(!list_empty(folio_list));
1797
VM_WARN_ON(ret < 0);
1798
if (!list_empty(&non_hvo_folios) && ret) {
1799
spin_lock_irq(&hugetlb_lock);
1800
list_for_each_entry(folio, &non_hvo_folios, lru)
1801
__folio_clear_hugetlb(folio);
1802
spin_unlock_irq(&hugetlb_lock);
1803
}
1804
1805
list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
1806
update_and_free_hugetlb_folio(h, folio, false);
1807
cond_resched();
1808
}
1809
}
1810
1811
struct hstate *size_to_hstate(unsigned long size)
1812
{
1813
struct hstate *h;
1814
1815
for_each_hstate(h) {
1816
if (huge_page_size(h) == size)
1817
return h;
1818
}
1819
return NULL;
1820
}
1821
1822
void free_huge_folio(struct folio *folio)
1823
{
1824
/*
1825
* Can't pass hstate in here because it is called from the
1826
* generic mm code.
1827
*/
1828
struct hstate *h = folio_hstate(folio);
1829
int nid = folio_nid(folio);
1830
struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
1831
bool restore_reserve;
1832
unsigned long flags;
1833
1834
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1835
VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
1836
1837
hugetlb_set_folio_subpool(folio, NULL);
1838
if (folio_test_anon(folio))
1839
__ClearPageAnonExclusive(&folio->page);
1840
folio->mapping = NULL;
1841
restore_reserve = folio_test_hugetlb_restore_reserve(folio);
1842
folio_clear_hugetlb_restore_reserve(folio);
1843
1844
/*
1845
* If HPageRestoreReserve was set on page, page allocation consumed a
1846
* reservation. If the page was associated with a subpool, there
1847
* would have been a page reserved in the subpool before allocation
1848
* via hugepage_subpool_get_pages(). Since we are 'restoring' the
1849
* reservation, do not call hugepage_subpool_put_pages() as this will
1850
* remove the reserved page from the subpool.
1851
*/
1852
if (!restore_reserve) {
1853
/*
* A return code of zero implies that the subpool will be
* under its minimum size if the reservation is not restored
* after the page is freed. Therefore, force the restore_reserve
* operation.
*/
1859
if (hugepage_subpool_put_pages(spool, 1) == 0)
1860
restore_reserve = true;
1861
}
1862
1863
spin_lock_irqsave(&hugetlb_lock, flags);
1864
folio_clear_hugetlb_migratable(folio);
1865
hugetlb_cgroup_uncharge_folio(hstate_index(h),
1866
pages_per_huge_page(h), folio);
1867
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
1868
pages_per_huge_page(h), folio);
1869
lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
1870
mem_cgroup_uncharge(folio);
1871
if (restore_reserve)
1872
h->resv_huge_pages++;
1873
1874
if (folio_test_hugetlb_temporary(folio)) {
1875
remove_hugetlb_folio(h, folio, false);
1876
spin_unlock_irqrestore(&hugetlb_lock, flags);
1877
update_and_free_hugetlb_folio(h, folio, true);
1878
} else if (h->surplus_huge_pages_node[nid]) {
1879
/* remove the page from active list */
1880
remove_hugetlb_folio(h, folio, true);
1881
spin_unlock_irqrestore(&hugetlb_lock, flags);
1882
update_and_free_hugetlb_folio(h, folio, true);
1883
} else {
1884
arch_clear_hugetlb_flags(folio);
1885
enqueue_hugetlb_folio(h, folio);
1886
spin_unlock_irqrestore(&hugetlb_lock, flags);
1887
}
1888
}
1889
1890
/*
1891
* Must be called with the hugetlb lock held
1892
*/
1893
static void __prep_account_new_huge_page(struct hstate *h, int nid)
1894
{
1895
lockdep_assert_held(&hugetlb_lock);
1896
h->nr_huge_pages++;
1897
h->nr_huge_pages_node[nid]++;
1898
}
1899
1900
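/*
* Initialize a freshly allocated folio as a hugetlb folio: set the
* hugetlb flag, reset the lru list and clear the subpool and cgroup
* information. Pool counters are not touched here.
*/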
static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
1901
{
1902
__folio_set_hugetlb(folio);
1903
INIT_LIST_HEAD(&folio->lru);
1904
hugetlb_set_folio_subpool(folio, NULL);
1905
set_hugetlb_cgroup(folio, NULL);
1906
set_hugetlb_cgroup_rsvd(folio, NULL);
1907
}
1908
1909
static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
1910
{
1911
init_new_hugetlb_folio(h, folio);
1912
hugetlb_vmemmap_optimize_folio(h, folio);
1913
}
1914
1915
static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
1916
{
1917
__prep_new_hugetlb_folio(h, folio);
1918
spin_lock_irq(&hugetlb_lock);
1919
__prep_account_new_huge_page(h, nid);
1920
spin_unlock_irq(&hugetlb_lock);
1921
}
1922
1923
/*
* Find and lock the address space (mapping) in write mode.
*
* Upon entry, the folio is locked, which means that folio_mapping() is
* stable. Due to locking order, we can only trylock_write. If we cannot
* get the lock, simply return NULL to the caller.
*/
1930
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
1931
{
1932
struct address_space *mapping = folio_mapping(folio);
1933
1934
if (!mapping)
1935
return mapping;
1936
1937
if (i_mmap_trylock_write(mapping))
1938
return mapping;
1939
1940
return NULL;
1941
}
1942
1943
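/*
* Allocate a folio of huge_page_order(h) from the buddy allocator. The
* returned folio is frozen (zero refcount). node_alloc_noretry, if
* provided, tracks nodes where a recent 'try hard' allocation failed so
* that subsequent attempts on that node back off until the state changes.
*/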
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
1944
gfp_t gfp_mask, int nid, nodemask_t *nmask,
1945
nodemask_t *node_alloc_noretry)
1946
{
1947
int order = huge_page_order(h);
1948
struct folio *folio;
1949
bool alloc_try_hard = true;
1950
1951
/*
1952
* By default we always try hard to allocate the folio with
1953
* __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in
1954
* a loop (to adjust global huge page counts) and previous allocation
1955
* failed, do not continue to try hard on the same node. Use the
1956
* node_alloc_noretry bitmap to manage this state information.
1957
*/
1958
if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1959
alloc_try_hard = false;
1960
if (alloc_try_hard)
1961
gfp_mask |= __GFP_RETRY_MAYFAIL;
1962
if (nid == NUMA_NO_NODE)
1963
nid = numa_mem_id();
1964
1965
folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
1966
1967
/*
1968
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a
1969
* folio this indicates an overall state change. Clear bit so
1970
* that we resume normal 'try hard' allocations.
1971
*/
1972
if (node_alloc_noretry && folio && !alloc_try_hard)
1973
node_clear(nid, *node_alloc_noretry);
1974
1975
/*
1976
* If we tried hard to get a folio but failed, set bit so that
1977
* subsequent attempts will not try as hard until there is an
1978
* overall state change.
1979
*/
1980
if (node_alloc_noretry && !folio && alloc_try_hard)
1981
node_set(nid, *node_alloc_noretry);
1982
1983
if (!folio) {
1984
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1985
return NULL;
1986
}
1987
1988
__count_vm_event(HTLB_BUDDY_PGALLOC);
1989
return folio;
1990
}
1991
1992
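/*
* Allocate a fresh hugetlb folio (gigantic or buddy-backed) and initialize
* its hugetlb metadata, but leave pool accounting and vmemmap optimization
* to the caller, which typically performs them in bulk.
*/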
static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
1993
gfp_t gfp_mask, int nid, nodemask_t *nmask,
1994
nodemask_t *node_alloc_noretry)
1995
{
1996
struct folio *folio;
1997
1998
if (hstate_is_gigantic(h))
1999
folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
2000
else
2001
folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry);
2002
if (folio)
2003
init_new_hugetlb_folio(h, folio);
2004
return folio;
2005
}
2006
2007
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages.
*
* Note that the returned page is 'frozen': the ref count of the head page
* and all tail pages is zero.
*/
2014
static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
2015
gfp_t gfp_mask, int nid, nodemask_t *nmask)
2016
{
2017
struct folio *folio;
2018
2019
if (hstate_is_gigantic(h))
2020
folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
2021
else
2022
folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2023
if (!folio)
2024
return NULL;
2025
2026
prep_new_hugetlb_folio(h, folio, folio_nid(folio));
2027
return folio;
2028
}
2029
2030
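/*
* Optimize the vmemmap of all folios on the list in one pass, then account
* them and add them to the free lists within a single lock cycle.
*/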
static void prep_and_add_allocated_folios(struct hstate *h,
2031
struct list_head *folio_list)
2032
{
2033
unsigned long flags;
2034
struct folio *folio, *tmp_f;
2035
2036
/* Send list for bulk vmemmap optimization processing */
2037
hugetlb_vmemmap_optimize_folios(h, folio_list);
2038
2039
/* Add all new pool pages to free lists in one lock cycle */
2040
spin_lock_irqsave(&hugetlb_lock, flags);
2041
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
2042
__prep_account_new_huge_page(h, folio_nid(folio));
2043
enqueue_hugetlb_folio(h, folio);
2044
}
2045
spin_unlock_irqrestore(&hugetlb_lock, flags);
2046
}
2047
2048
/*
2049
* Allocates a fresh hugetlb page in a node interleaved manner. The page
2050
* will later be added to the appropriate hugetlb pool.
2051
*/
2052
static struct folio *alloc_pool_huge_folio(struct hstate *h,
2053
nodemask_t *nodes_allowed,
2054
nodemask_t *node_alloc_noretry,
2055
int *next_node)
2056
{
2057
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2058
int nr_nodes, node;
2059
2060
for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) {
2061
struct folio *folio;
2062
2063
folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
2064
nodes_allowed, node_alloc_noretry);
2065
if (folio)
2066
return folio;
2067
}
2068
2069
return NULL;
2070
}
2071
2072
/*
* Remove a huge page from the pool, taking it from the next node to free.
* Attempt to keep persistent huge pages more or less balanced over the
* allowed nodes.
* This routine only 'removes' the hugetlb page. The caller must make
* an additional call to free the page to the low level allocators.
* Called with hugetlb_lock locked.
*/
2079
static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
2080
nodemask_t *nodes_allowed, bool acct_surplus)
2081
{
2082
int nr_nodes, node;
2083
struct folio *folio = NULL;
2084
2085
lockdep_assert_held(&hugetlb_lock);
2086
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2087
/*
2088
* If we're returning unused surplus pages, only examine
2089
* nodes with surplus pages.
2090
*/
2091
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2092
!list_empty(&h->hugepage_freelists[node])) {
2093
folio = list_entry(h->hugepage_freelists[node].next,
2094
struct folio, lru);
2095
remove_hugetlb_folio(h, folio, acct_surplus);
2096
break;
2097
}
2098
}
2099
2100
return folio;
2101
}
2102
2103
/*
* Dissolve a given free hugetlb folio into free buddy pages. This function
* does nothing for in-use hugetlb folios and non-hugetlb folios.
* This function returns values like below:
*
* -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
* when the system is under memory pressure and the feature of
* freeing unused vmemmap pages associated with each hugetlb page
* is enabled.
* -EBUSY: failed to dissolve the free hugepage or the hugepage is in-use
* (allocated or reserved).
* 0: successfully dissolved the free hugepage, or the page is not a
* hugepage (considered as already dissolved)
*/
2117
int dissolve_free_hugetlb_folio(struct folio *folio)
2118
{
2119
int rc = -EBUSY;
2120
2121
retry:
2122
/* Not to disrupt normal path by vainly holding hugetlb_lock */
2123
if (!folio_test_hugetlb(folio))
2124
return 0;
2125
2126
spin_lock_irq(&hugetlb_lock);
2127
if (!folio_test_hugetlb(folio)) {
2128
rc = 0;
2129
goto out;
2130
}
2131
2132
if (!folio_ref_count(folio)) {
2133
struct hstate *h = folio_hstate(folio);
2134
bool adjust_surplus = false;
2135
2136
if (!available_huge_pages(h))
2137
goto out;
2138
2139
/*
2140
* We should make sure that the page is already on the free list
2141
* when it is dissolved.
2142
*/
2143
if (unlikely(!folio_test_hugetlb_freed(folio))) {
2144
spin_unlock_irq(&hugetlb_lock);
2145
cond_resched();
2146
2147
/*
* Theoretically, we should return -EBUSY when we
* encounter this race. In fact, we have a chance
* to successfully dissolve the page if we retry,
* because the race window is quite small.
* Seizing this opportunity increases the success
* rate of dissolving the page.
*/
2155
goto retry;
2156
}
2157
2158
if (h->surplus_huge_pages_node[folio_nid(folio)])
2159
adjust_surplus = true;
2160
remove_hugetlb_folio(h, folio, adjust_surplus);
2161
h->max_huge_pages--;
2162
spin_unlock_irq(&hugetlb_lock);
2163
2164
/*
* Normally update_and_free_hugetlb_folio will allocate required vmemmap
* before freeing the page. update_and_free_hugetlb_folio will fail to
* free the page if it cannot allocate the required vmemmap. We
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmap here so that we can take
* appropriate action on failure.
*
* The folio_test_hugetlb check here is because
* remove_hugetlb_folio will clear the hugetlb folio flag for
* non-vmemmap optimized hugetlb folios.
*/
2176
if (folio_test_hugetlb(folio)) {
2177
rc = hugetlb_vmemmap_restore_folio(h, folio);
2178
if (rc) {
2179
spin_lock_irq(&hugetlb_lock);
2180
add_hugetlb_folio(h, folio, adjust_surplus);
2181
h->max_huge_pages++;
2182
goto out;
2183
}
2184
} else
2185
rc = 0;
2186
2187
update_and_free_hugetlb_folio(h, folio, false);
2188
return rc;
2189
}
2190
out:
2191
spin_unlock_irq(&hugetlb_lock);
2192
return rc;
2193
}
2194
2195
/*
2196
* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2197
* make specified memory blocks removable from the system.
2198
* Note that this will dissolve a free gigantic hugepage completely, if any
2199
* part of it lies within the given range.
2200
* Also note that if dissolve_free_hugetlb_folio() returns with an error, all
2201
* free hugetlb folios that were dissolved before that error are lost.
2202
*/
2203
int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
2204
{
2205
unsigned long pfn;
2206
struct folio *folio;
2207
int rc = 0;
2208
unsigned int order;
2209
struct hstate *h;
2210
2211
if (!hugepages_supported())
2212
return rc;
2213
2214
order = huge_page_order(&default_hstate);
2215
for_each_hstate(h)
2216
order = min(order, huge_page_order(h));
2217
2218
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
2219
folio = pfn_folio(pfn);
2220
rc = dissolve_free_hugetlb_folio(folio);
2221
if (rc)
2222
break;
2223
}
2224
2225
return rc;
2226
}
2227
2228
/*
2229
* Allocates a fresh surplus page from the page allocator.
2230
*/
2231
static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
2232
gfp_t gfp_mask, int nid, nodemask_t *nmask)
2233
{
2234
struct folio *folio = NULL;
2235
2236
if (hstate_is_gigantic(h))
2237
return NULL;
2238
2239
spin_lock_irq(&hugetlb_lock);
2240
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2241
goto out_unlock;
2242
spin_unlock_irq(&hugetlb_lock);
2243
2244
folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2245
if (!folio)
2246
return NULL;
2247
2248
hugetlb_vmemmap_optimize_folio(h, folio);
2249
2250
spin_lock_irq(&hugetlb_lock);
2251
/*
2252
* nr_huge_pages needs to be adjusted within the same lock cycle
2253
* as surplus_pages, otherwise it might confuse
2254
* persistent_huge_pages() momentarily.
2255
*/
2256
__prep_account_new_huge_page(h, folio_nid(folio));
2257
2258
/*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommitting the surpluses. Abuse
* the temporary page to work around the nasty free_huge_folio
* code flow.
*/
2265
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2266
folio_set_hugetlb_temporary(folio);
2267
spin_unlock_irq(&hugetlb_lock);
2268
free_huge_folio(folio);
2269
return NULL;
2270
}
2271
2272
h->surplus_huge_pages++;
2273
h->surplus_huge_pages_node[folio_nid(folio)]++;
2274
2275
out_unlock:
2276
spin_unlock_irq(&hugetlb_lock);
2277
2278
return folio;
2279
}
2280
2281
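/*
* Allocate a temporary hugetlb folio for migration. The folio is not
* accounted as surplus; it is marked temporary and released back to the
* allocator when the last reference is dropped.
*/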
static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
2282
int nid, nodemask_t *nmask)
2283
{
2284
struct folio *folio;
2285
2286
if (hstate_is_gigantic(h))
2287
return NULL;
2288
2289
folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
2290
if (!folio)
2291
return NULL;
2292
2293
/* fresh huge pages are frozen */
2294
folio_ref_unfreeze(folio, 1);
2295
/*
2296
* We do not account these pages as surplus because they are only
2297
* temporary and will be released properly on the last reference
2298
*/
2299
folio_set_hugetlb_temporary(folio);
2300
2301
return folio;
2302
}
2303
2304
/*
2305
* Use the VMA's mpolicy to allocate a huge page from the buddy.
2306
*/
2307
static
2308
struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
2309
struct vm_area_struct *vma, unsigned long addr)
2310
{
2311
struct folio *folio = NULL;
2312
struct mempolicy *mpol;
2313
gfp_t gfp_mask = htlb_alloc_mask(h);
2314
int nid;
2315
nodemask_t *nodemask;
2316
2317
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2318
if (mpol_is_preferred_many(mpol)) {
2319
gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2320
2321
folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
2322
2323
/* Fallback to all nodes if page==NULL */
2324
nodemask = NULL;
2325
}
2326
2327
if (!folio)
2328
folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
2329
mpol_cond_put(mpol);
2330
return folio;
2331
}
2332
2333
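/*
* Dequeue a hugetlb folio backed by an existing pool reservation. Returns
* NULL if no reservations are outstanding or no suitable folio is free.
*/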
struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
2334
nodemask_t *nmask, gfp_t gfp_mask)
2335
{
2336
struct folio *folio;
2337
2338
spin_lock_irq(&hugetlb_lock);
2339
if (!h->resv_huge_pages) {
2340
spin_unlock_irq(&hugetlb_lock);
2341
return NULL;
2342
}
2343
2344
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
2345
nmask);
2346
if (folio)
2347
h->resv_huge_pages--;
2348
2349
spin_unlock_irq(&hugetlb_lock);
2350
return folio;
2351
}
2352
2353
/* folio migration callback function */
2354
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
2355
nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
2356
{
2357
spin_lock_irq(&hugetlb_lock);
2358
if (available_huge_pages(h)) {
2359
struct folio *folio;
2360
2361
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
2362
preferred_nid, nmask);
2363
if (folio) {
2364
spin_unlock_irq(&hugetlb_lock);
2365
return folio;
2366
}
2367
}
2368
spin_unlock_irq(&hugetlb_lock);
2369
2370
/* We cannot fallback to other nodes, as we could break the per-node pool. */
2371
if (!allow_alloc_fallback)
2372
gfp_mask |= __GFP_THISNODE;
2373
2374
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
2375
}
2376
2377
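/*
* Return the nodemask of the current task's MPOL_BIND policy when it
* applies to this allocation, or NULL to allow all nodes.
*/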
static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
2378
{
2379
#ifdef CONFIG_NUMA
2380
struct mempolicy *mpol = get_task_policy(current);
2381
2382
/*
2383
* Only enforce MPOL_BIND policy which overlaps with cpuset policy
2384
* (from policy_nodemask) specifically for hugetlb case
2385
*/
2386
if (mpol->mode == MPOL_BIND &&
2387
(apply_policy_zone(mpol, gfp_zone(gfp)) &&
2388
cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
2389
return &mpol->nodes;
2390
#endif
2391
return NULL;
2392
}
2393
2394
/*
2395
* Increase the hugetlb pool such that it can accommodate a reservation
2396
* of size 'delta'.
2397
*/
2398
static int gather_surplus_pages(struct hstate *h, long delta)
2399
__must_hold(&hugetlb_lock)
2400
{
2401
LIST_HEAD(surplus_list);
2402
struct folio *folio, *tmp;
2403
int ret;
2404
long i;
2405
long needed, allocated;
2406
bool alloc_ok = true;
2407
nodemask_t *mbind_nodemask, alloc_nodemask;
2408
2409
mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
2410
if (mbind_nodemask)
2411
nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
2412
else
2413
alloc_nodemask = cpuset_current_mems_allowed;
2414
2415
lockdep_assert_held(&hugetlb_lock);
2416
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2417
if (needed <= 0) {
2418
h->resv_huge_pages += delta;
2419
return 0;
2420
}
2421
2422
allocated = 0;
2423
2424
ret = -ENOMEM;
2425
retry:
2426
spin_unlock_irq(&hugetlb_lock);
2427
for (i = 0; i < needed; i++) {
2428
folio = NULL;
2429
2430
/*
2431
* It is okay to use NUMA_NO_NODE because we use numa_mem_id()
2432
* down the road to pick the current node if that is the case.
2433
*/
2434
folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
2435
NUMA_NO_NODE, &alloc_nodemask);
2436
if (!folio) {
2437
alloc_ok = false;
2438
break;
2439
}
2440
list_add(&folio->lru, &surplus_list);
2441
cond_resched();
2442
}
2443
allocated += i;
2444
2445
/*
2446
* After retaking hugetlb_lock, we need to recalculate 'needed'
2447
* because either resv_huge_pages or free_huge_pages may have changed.
2448
*/
2449
spin_lock_irq(&hugetlb_lock);
2450
needed = (h->resv_huge_pages + delta) -
2451
(h->free_huge_pages + allocated);
2452
if (needed > 0) {
2453
if (alloc_ok)
2454
goto retry;
2455
/*
2456
* We were not able to allocate enough pages to
2457
* satisfy the entire reservation so we free what
2458
* we've allocated so far.
2459
*/
2460
goto free;
2461
}
2462
/*
2463
* The surplus_list now contains _at_least_ the number of extra pages
2464
* needed to accommodate the reservation. Add the appropriate number
2465
* of pages to the hugetlb pool and free the extras back to the buddy
2466
* allocator. Commit the entire reservation here to prevent another
2467
* process from stealing the pages as they are added to the pool but
2468
* before they are reserved.
2469
*/
2470
needed += allocated;
2471
h->resv_huge_pages += delta;
2472
ret = 0;
2473
2474
/* Free the needed pages to the hugetlb pool */
2475
list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
2476
if ((--needed) < 0)
2477
break;
2478
/* Add the page to the hugetlb allocator */
2479
enqueue_hugetlb_folio(h, folio);
2480
}
2481
free:
2482
spin_unlock_irq(&hugetlb_lock);
2483
2484
/*
2485
* Free unnecessary surplus pages to the buddy allocator.
2486
* Pages have no ref count, call free_huge_folio directly.
2487
*/
2488
list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
2489
free_huge_folio(folio);
2490
spin_lock_irq(&hugetlb_lock);
2491
2492
return ret;
2493
}
2494
2495
/*
2496
* This routine has two main purposes:
2497
* 1) Decrement the reservation count (resv_huge_pages) by the value passed
2498
* in unused_resv_pages. This corresponds to the prior adjustments made
2499
* to the associated reservation map.
2500
* 2) Free any unused surplus pages that may have been allocated to satisfy
2501
* the reservation. As many as unused_resv_pages may be freed.
2502
*/
2503
static void return_unused_surplus_pages(struct hstate *h,
2504
unsigned long unused_resv_pages)
2505
{
2506
unsigned long nr_pages;
2507
LIST_HEAD(page_list);
2508
2509
lockdep_assert_held(&hugetlb_lock);
2510
/* Uncommit the reservation */
2511
h->resv_huge_pages -= unused_resv_pages;
2512
2513
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2514
goto out;
2515
2516
/*
2517
* Part (or even all) of the reservation could have been backed
2518
* by pre-allocated pages. Only free surplus pages.
2519
*/
2520
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2521
2522
/*
2523
* We want to release as many surplus pages as possible, spread
2524
* evenly across all nodes with memory. Iterate across these nodes
2525
* until we can no longer free unreserved surplus pages. This occurs
2526
* when the nodes with surplus pages have no free pages.
2527
* remove_pool_hugetlb_folio() will balance the freed pages across the
2528
* on-line nodes with memory and will handle the hstate accounting.
2529
*/
2530
while (nr_pages--) {
2531
struct folio *folio;
2532
2533
folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1);
2534
if (!folio)
2535
goto out;
2536
2537
list_add(&folio->lru, &page_list);
2538
}
2539
2540
out:
2541
spin_unlock_irq(&hugetlb_lock);
2542
update_and_free_pages_bulk(h, &page_list);
2543
spin_lock_irq(&hugetlb_lock);
2544
}
2545
2546
2547
/*
2548
* vma_needs_reservation, vma_commit_reservation and vma_end_reservation
2549
* are used by the huge page allocation routines to manage reservations.
2550
*
2551
* vma_needs_reservation is called to determine if the huge page at addr
2552
* within the vma has an associated reservation. If a reservation is
2553
* needed, the value 1 is returned. The caller is then responsible for
2554
* managing the global reservation and subpool usage counts. After
2555
* the huge page has been allocated, vma_commit_reservation is called
2556
* to add the page to the reservation map. If the page allocation fails,
2557
* the reservation must be ended instead of committed. vma_end_reservation
2558
* is called in such cases.
2559
*
2560
* In the normal case, vma_commit_reservation returns the same value
2561
* as the preceding vma_needs_reservation call. The only time this
2562
* is not the case is if a reserve map was changed between calls. It
2563
* is the responsibility of the caller to notice the difference and
2564
* take appropriate action.
2565
*
2566
* vma_add_reservation is used in error paths where a reservation must
2567
* be restored when a newly allocated huge page must be freed. It is
2568
* to be called after calling vma_needs_reservation to determine if a
2569
* reservation exists.
2570
*
2571
* vma_del_reservation is used in error paths where an entry in the reserve
2572
* map was created during huge page allocation and must be removed. It is to
2573
* be called after calling vma_needs_reservation to determine if a reservation
2574
* exists.
2575
*/
2576
enum vma_resv_mode {
2577
VMA_NEEDS_RESV,
2578
VMA_COMMIT_RESV,
2579
VMA_END_RESV,
2580
VMA_ADD_RESV,
2581
VMA_DEL_RESV,
2582
};
2583
static long __vma_reservation_common(struct hstate *h,
2584
struct vm_area_struct *vma, unsigned long addr,
2585
enum vma_resv_mode mode)
2586
{
2587
struct resv_map *resv;
2588
pgoff_t idx;
2589
long ret;
2590
long dummy_out_regions_needed;
2591
2592
resv = vma_resv_map(vma);
2593
if (!resv)
2594
return 1;
2595
2596
idx = vma_hugecache_offset(h, vma, addr);
2597
switch (mode) {
2598
case VMA_NEEDS_RESV:
2599
ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2600
/* We assume that vma_reservation_* routines always operate on
2601
* 1 page, and that adding to resv map a 1 page entry can only
2602
* ever require 1 region.
2603
*/
2604
VM_BUG_ON(dummy_out_regions_needed != 1);
2605
break;
2606
case VMA_COMMIT_RESV:
2607
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2608
/* region_add calls of range 1 should never fail. */
2609
VM_BUG_ON(ret < 0);
2610
break;
2611
case VMA_END_RESV:
2612
region_abort(resv, idx, idx + 1, 1);
2613
ret = 0;
2614
break;
2615
case VMA_ADD_RESV:
2616
if (vma->vm_flags & VM_MAYSHARE) {
2617
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2618
/* region_add calls of range 1 should never fail. */
2619
VM_BUG_ON(ret < 0);
2620
} else {
2621
region_abort(resv, idx, idx + 1, 1);
2622
ret = region_del(resv, idx, idx + 1);
2623
}
2624
break;
2625
case VMA_DEL_RESV:
2626
if (vma->vm_flags & VM_MAYSHARE) {
2627
region_abort(resv, idx, idx + 1, 1);
2628
ret = region_del(resv, idx, idx + 1);
2629
} else {
2630
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2631
/* region_add calls of range 1 should never fail. */
2632
VM_BUG_ON(ret < 0);
2633
}
2634
break;
2635
default:
2636
BUG();
2637
}
2638
2639
if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2640
return ret;
2641
/*
* We know the private mapping must have HPAGE_RESV_OWNER set.
*
* In most cases, reserves always exist for private mappings.
* However, a file associated with the mapping could have been
* hole punched or truncated after the reserves were consumed.
* A subsequent fault on such a range will not use reserves.
* Subtle - The reserve map for private mappings has the
* opposite meaning from that of shared mappings. If NO
* entry is in the reserve map, it means a reservation exists.
* If an entry exists in the reserve map, it means the
* reservation has already been consumed. As a result, the
* return value of this routine is the opposite of the
* value returned from the reserve map manipulation routines above.
*/
2656
if (ret > 0)
2657
return 0;
2658
if (ret == 0)
2659
return 1;
2660
return ret;
2661
}
2662
2663
static long vma_needs_reservation(struct hstate *h,
2664
struct vm_area_struct *vma, unsigned long addr)
2665
{
2666
return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2667
}
2668
2669
static long vma_commit_reservation(struct hstate *h,
2670
struct vm_area_struct *vma, unsigned long addr)
2671
{
2672
return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2673
}
2674
2675
static void vma_end_reservation(struct hstate *h,
2676
struct vm_area_struct *vma, unsigned long addr)
2677
{
2678
(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2679
}
2680
2681
static long vma_add_reservation(struct hstate *h,
2682
struct vm_area_struct *vma, unsigned long addr)
2683
{
2684
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2685
}
2686
2687
static long vma_del_reservation(struct hstate *h,
2688
struct vm_area_struct *vma, unsigned long addr)
2689
{
2690
return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2691
}
2692
2693
/*
2694
* This routine is called to restore reservation information on error paths.
2695
* It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
2696
* and the hugetlb mutex should remain held when calling this routine.
2697
*
2698
* It handles two specific cases:
2699
* 1) A reservation was in place and the folio consumed the reservation.
2700
* hugetlb_restore_reserve is set in the folio.
2701
* 2) No reservation was in place for the page, so hugetlb_restore_reserve is
2702
* not set. However, alloc_hugetlb_folio always updates the reserve map.
2703
*
2704
* In case 1, free_huge_folio later in the error path will increment the
2705
* global reserve count. But, free_huge_folio does not have enough context
2706
* to adjust the reservation map. This case deals primarily with private
2707
* mappings. Adjust the reserve map here to be consistent with global
2708
* reserve count adjustments to be made by free_huge_folio. Make sure the
2709
* reserve map indicates there is a reservation present.
2710
*
2711
* In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
2712
*/
2713
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2714
unsigned long address, struct folio *folio)
2715
{
2716
long rc = vma_needs_reservation(h, vma, address);
2717
2718
if (folio_test_hugetlb_restore_reserve(folio)) {
2719
if (unlikely(rc < 0))
2720
/*
2721
* Rare out of memory condition in reserve map
2722
* manipulation. Clear hugetlb_restore_reserve so
2723
* that global reserve count will not be incremented
2724
* by free_huge_folio. This will make it appear
2725
* as though the reservation for this folio was
2726
* consumed. This may prevent the task from
2727
* faulting in the folio at a later time. This
2728
* is better than inconsistent global huge page
2729
* accounting of reserve counts.
2730
*/
2731
folio_clear_hugetlb_restore_reserve(folio);
2732
else if (rc)
2733
(void)vma_add_reservation(h, vma, address);
2734
else
2735
vma_end_reservation(h, vma, address);
2736
} else {
2737
if (!rc) {
2738
/*
2739
* This indicates there is an entry in the reserve map
2740
* not added by alloc_hugetlb_folio. We know it was added
2741
* before the alloc_hugetlb_folio call, otherwise
2742
* hugetlb_restore_reserve would be set on the folio.
2743
* Remove the entry so that a subsequent allocation
2744
* does not consume a reservation.
2745
*/
2746
rc = vma_del_reservation(h, vma, address);
2747
if (rc < 0)
2748
/*
2749
* VERY rare out of memory condition. Since
2750
* we can not delete the entry, set
2751
* hugetlb_restore_reserve so that the reserve
2752
* count will be incremented when the folio
2753
* is freed. This reserve will be consumed
2754
* on a subsequent allocation.
2755
*/
2756
folio_set_hugetlb_restore_reserve(folio);
2757
} else if (rc < 0) {
2758
/*
2759
* Rare out of memory condition from
2760
* vma_needs_reservation call. Memory allocation is
2761
* only attempted if a new entry is needed. Therefore,
2762
* this implies there is not an entry in the
2763
* reserve map.
2764
*
2765
* For shared mappings, no entry in the map indicates
2766
* no reservation. We are done.
2767
*/
2768
if (!(vma->vm_flags & VM_MAYSHARE))
2769
/*
2770
* For private mappings, no entry indicates
2771
* a reservation is present. Since we can
2772
* not add an entry, set hugetlb_restore_reserve
2773
* on the folio so reserve count will be
2774
* incremented when freed. This reserve will
2775
* be consumed on a subsequent allocation.
2776
*/
2777
folio_set_hugetlb_restore_reserve(folio);
2778
} else
2779
/*
2780
* No reservation present, do nothing
2781
*/
2782
vma_end_reservation(h, vma, address);
2783
}
2784
}
2785
2786
/*
2787
* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
2788
* the old one
2789
* @old_folio: Old folio to dissolve
2790
* @list: List to isolate the page in case we need to
2791
* Returns 0 on success, otherwise negated error.
2792
*/
2793
static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
2794
struct list_head *list)
2795
{
2796
gfp_t gfp_mask;
2797
struct hstate *h;
2798
int nid = folio_nid(old_folio);
2799
struct folio *new_folio = NULL;
2800
int ret = 0;
2801
2802
retry:
2803
/*
2804
* The old_folio might have been dissolved from under our feet, so make sure
2805
* to carefully check the state under the lock.
2806
*/
2807
spin_lock_irq(&hugetlb_lock);
2808
if (!folio_test_hugetlb(old_folio)) {
2809
/*
2810
* Freed from under us. Drop new_folio too.
2811
*/
2812
goto free_new;
2813
} else if (folio_ref_count(old_folio)) {
2814
bool isolated;
2815
2816
/*
2817
* Someone has grabbed the folio, try to isolate it here.
2818
* Fail with -EBUSY if not possible.
2819
*/
2820
spin_unlock_irq(&hugetlb_lock);
2821
isolated = folio_isolate_hugetlb(old_folio, list);
2822
ret = isolated ? 0 : -EBUSY;
2823
spin_lock_irq(&hugetlb_lock);
2824
goto free_new;
2825
} else if (!folio_test_hugetlb_freed(old_folio)) {
2826
/*
2827
* Folio's refcount is 0 but it has not been enqueued in the
2828
* freelist yet. Race window is small, so we can succeed here if
2829
* we retry.
2830
*/
2831
spin_unlock_irq(&hugetlb_lock);
2832
cond_resched();
2833
goto retry;
2834
} else {
2835
h = folio_hstate(old_folio);
2836
if (!new_folio) {
2837
spin_unlock_irq(&hugetlb_lock);
2838
gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2839
new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
2840
NULL, NULL);
2841
if (!new_folio)
2842
return -ENOMEM;
2843
__prep_new_hugetlb_folio(h, new_folio);
2844
goto retry;
2845
}
2846
2847
/*
2848
* Ok, old_folio is still a genuine free hugepage. Remove it from
2849
* the freelist and decrease the counters. These will be
2850
* incremented again when calling __prep_account_new_huge_page()
2851
* and enqueue_hugetlb_folio() for new_folio. The counters will
2852
* remain stable since this happens under the lock.
2853
*/
2854
remove_hugetlb_folio(h, old_folio, false);
2855
2856
/*
2857
* Ref count on new_folio is already zero as it was dropped
2858
* earlier. It can be directly added to the pool free list.
2859
*/
2860
__prep_account_new_huge_page(h, nid);
2861
enqueue_hugetlb_folio(h, new_folio);
2862
2863
/*
2864
* Folio has been replaced, we can safely free the old one.
2865
*/
2866
spin_unlock_irq(&hugetlb_lock);
2867
update_and_free_hugetlb_folio(h, old_folio, false);
2868
}
2869
2870
return ret;
2871
2872
free_new:
2873
spin_unlock_irq(&hugetlb_lock);
2874
if (new_folio)
2875
update_and_free_hugetlb_folio(h, new_folio, false);
2876
2877
return ret;
2878
}
2879
2880
int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
2881
{
2882
int ret = -EBUSY;
2883
2884
/* Not to disrupt normal path by vainly holding hugetlb_lock */
2885
if (!folio_test_hugetlb(folio))
2886
return 0;
2887
2888
/*
2889
* Fence off gigantic pages as there is a cyclic dependency between
2890
* alloc_contig_range and them. Return -ENOMEM as this has the effect
2891
* of bailing out right away without further retrying.
2892
*/
2893
if (folio_order(folio) > MAX_PAGE_ORDER)
2894
return -ENOMEM;
2895
2896
if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
2897
ret = 0;
2898
else if (!folio_ref_count(folio))
2899
ret = alloc_and_dissolve_hugetlb_folio(folio, list);
2900
2901
return ret;
2902
}
2903
2904
/*
2905
* replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
2906
* range with new folios.
2907
* @start_pfn: start pfn of the given pfn range
2908
* @end_pfn: end pfn of the given pfn range
2909
* Returns 0 on success, otherwise negated error.
2910
*/
2911
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
2912
{
2913
struct folio *folio;
2914
int ret = 0;
2915
2916
LIST_HEAD(isolate_list);
2917
2918
while (start_pfn < end_pfn) {
2919
folio = pfn_folio(start_pfn);
2920
2921
/* Not to disrupt normal path by vainly holding hugetlb_lock */
2922
if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
2923
ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
2924
if (ret)
2925
break;
2926
2927
putback_movable_pages(&isolate_list);
2928
}
2929
start_pfn++;
2930
}
2931
2932
return ret;
2933
}
2934
2935
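/*
* Wait until any hugetlb folios queued for deferred freeing (see
* update_and_free_hugetlb_folio) have actually been freed.
*/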
void wait_for_freed_hugetlb_folios(void)
2936
{
2937
if (llist_empty(&hpage_freelist))
2938
return;
2939
2940
flush_work(&free_hpage_work);
2941
}
2942
2943
typedef enum {
/*
* For either 0/1: we checked the per-vma resv map, and one resv
* count either can be reused (0), or an extra one is needed (1).
*/
MAP_CHG_REUSE = 0,
MAP_CHG_NEEDED = 1,
/*
* The per-vma resv count cannot be used, hence a new resv
* count is enforced.
*
* NOTE: This is mostly identical to MAP_CHG_NEEDED, except
* that currently vma_needs_reservation() has an unwanted side
* effect of either using end() or commit() to complete the
* transaction. Hence it needs to be differentiated from NEEDED.
*/
MAP_CHG_ENFORCED = 2,
} map_chg_state;
2961
2962
/*
2963
* NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
2964
* faults of hugetlb private mappings on top of a non-page-cache folio (in
2965
* which case even if there's a private vma resv map it won't cover such
2966
* allocation). New call sites should (probably) never set it to true!!
2967
* When it's set, the allocation will bypass all vma level reservations.
2968
*/
2969
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
2970
unsigned long addr, bool cow_from_owner)
2971
{
2972
struct hugepage_subpool *spool = subpool_vma(vma);
2973
struct hstate *h = hstate_vma(vma);
2974
struct folio *folio;
2975
long retval, gbl_chg, gbl_reserve;
2976
map_chg_state map_chg;
2977
int ret, idx;
2978
struct hugetlb_cgroup *h_cg = NULL;
2979
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
2980
2981
idx = hstate_index(h);
2982
2983
/* Whether we need a separate per-vma reservation? */
2984
if (cow_from_owner) {
2985
/*
2986
* Special case! Since it's a CoW on top of a reserved
2987
* page, the private resv map doesn't count. So it cannot
2988
* consume the per-vma resv map even if it's reserved.
2989
*/
2990
map_chg = MAP_CHG_ENFORCED;
2991
} else {
2992
/*
2993
* Examine the region/reserve map to determine if the process
2994
* has a reservation for the page to be allocated. A return
2995
* code of zero indicates a reservation exists (no change).
2996
*/
2997
retval = vma_needs_reservation(h, vma, addr);
2998
if (retval < 0)
2999
return ERR_PTR(-ENOMEM);
3000
map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
3001
}
3002
3003
/*
3004
* Whether we need a separate global reservation?
3005
*
3006
* Processes that did not create the mapping will have no
3007
* reserves as indicated by the region/reserve map. Check
3008
* that the allocation will not exceed the subpool limit.
3009
* Or if it can get one from the pool reservation directly.
3010
*/
3011
if (map_chg) {
3012
gbl_chg = hugepage_subpool_get_pages(spool, 1);
3013
if (gbl_chg < 0)
3014
goto out_end_reservation;
3015
} else {
3016
/*
3017
* If we have the vma reservation ready, no need for extra
3018
* global reservation.
3019
*/
3020
gbl_chg = 0;
3021
}
3022
3023
/*
3024
* If this allocation is not consuming a per-vma reservation,
3025
* charge the hugetlb cgroup now.
3026
*/
3027
if (map_chg) {
3028
ret = hugetlb_cgroup_charge_cgroup_rsvd(
3029
idx, pages_per_huge_page(h), &h_cg);
3030
if (ret)
3031
goto out_subpool_put;
3032
}
3033
3034
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
3035
if (ret)
3036
goto out_uncharge_cgroup_reservation;
3037
3038
spin_lock_irq(&hugetlb_lock);
3039
/*
* gbl_chg is passed to indicate whether or not a page must be taken
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
3044
folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
3045
if (!folio) {
3046
spin_unlock_irq(&hugetlb_lock);
3047
folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
3048
if (!folio)
3049
goto out_uncharge_cgroup;
3050
spin_lock_irq(&hugetlb_lock);
3051
list_add(&folio->lru, &h->hugepage_activelist);
3052
folio_ref_unfreeze(folio, 1);
3053
/* Fall through */
3054
}
3055
3056
/*
3057
* Either dequeued or buddy-allocated folio needs to add special
3058
* mark to the folio when it consumes a global reservation.
3059
*/
3060
if (!gbl_chg) {
3061
folio_set_hugetlb_restore_reserve(folio);
3062
h->resv_huge_pages--;
3063
}
3064
3065
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
3066
/* If allocation is not consuming a reservation, also store the
3067
* hugetlb_cgroup pointer on the page.
3068
*/
3069
if (map_chg) {
3070
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
3071
h_cg, folio);
3072
}
3073
3074
spin_unlock_irq(&hugetlb_lock);
3075
3076
hugetlb_set_folio_subpool(folio, spool);
3077
3078
if (map_chg != MAP_CHG_ENFORCED) {
3079
/* commit() is only needed if the map_chg is not enforced */
3080
retval = vma_commit_reservation(h, vma, addr);
3081
/*
* Check for possible race conditions. When it happens, the
* page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation, which
* indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
* in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
3091
if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
3092
long rsv_adjust;
3093
3094
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
3095
hugetlb_acct_memory(h, -rsv_adjust);
3096
if (map_chg) {
3097
spin_lock_irq(&hugetlb_lock);
3098
hugetlb_cgroup_uncharge_folio_rsvd(
3099
hstate_index(h), pages_per_huge_page(h),
3100
folio);
3101
spin_unlock_irq(&hugetlb_lock);
3102
}
3103
}
3104
}
3105
3106
ret = mem_cgroup_charge_hugetlb(folio, gfp);
3107
/*
3108
* Unconditionally increment NR_HUGETLB here. If it turns out that
3109
* mem_cgroup_charge_hugetlb failed, then immediately free the page and
3110
* decrement NR_HUGETLB.
3111
*/
3112
lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
3113
3114
if (ret == -ENOMEM) {
3115
free_huge_folio(folio);
3116
return ERR_PTR(-ENOMEM);
3117
}
3118
3119
return folio;
3120
3121
out_uncharge_cgroup:
3122
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
3123
out_uncharge_cgroup_reservation:
3124
if (map_chg)
3125
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
3126
h_cg);
3127
out_subpool_put:
3128
/*
3129
* put page to subpool iff the quota of subpool's rsv_hpages is used
3130
* during hugepage_subpool_get_pages.
3131
*/
3132
if (map_chg && !gbl_chg) {
3133
gbl_reserve = hugepage_subpool_put_pages(spool, 1);
3134
hugetlb_acct_memory(h, -gbl_reserve);
3135
}
3136
3137
3138
out_end_reservation:
3139
if (map_chg != MAP_CHG_ENFORCED)
3140
vma_end_reservation(h, vma, addr);
3141
return ERR_PTR(-ENOSPC);
3142
}
3143
3144
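/*
* Allocate one boot-time huge page worth of memory from early CMA or
* memblock and record it on the per-node huge_boot_pages list so that
* gather_bootmem_prealloc() can turn it into a hugetlb folio later.
*/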
static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
3145
{
3146
struct huge_bootmem_page *m;
3147
int listnode = nid;
3148
3149
if (hugetlb_early_cma(h))
3150
m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
3151
else {
3152
if (node_exact)
3153
m = memblock_alloc_exact_nid_raw(huge_page_size(h),
3154
huge_page_size(h), 0,
3155
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3156
else {
3157
m = memblock_alloc_try_nid_raw(huge_page_size(h),
3158
huge_page_size(h), 0,
3159
MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3160
/*
3161
* For pre-HVO to work correctly, pages need to be on
3162
* the list for the node they were actually allocated
3163
* from. That node may be different in the case of
3164
* fallback by memblock_alloc_try_nid_raw. So,
3165
* extract the actual node first.
3166
*/
3167
if (m)
3168
listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
3169
}
3170
3171
if (m) {
3172
m->flags = 0;
3173
m->cma = NULL;
3174
}
3175
}
3176
3177
if (m) {
3178
/*
3179
* Use the beginning of the huge page to store the
3180
* huge_bootmem_page struct (until gather_bootmem
3181
* puts them into the mem_map).
3182
*
3183
* Put them into a private list first because mem_map
3184
* is not up yet.
3185
*/
3186
INIT_LIST_HEAD(&m->list);
3187
list_add(&m->list, &huge_boot_pages[listnode]);
3188
m->hstate = h;
3189
}
3190
3191
return m;
3192
}
3193
3194
int alloc_bootmem_huge_page(struct hstate *h, int nid)
3195
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
3196
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
3197
{
3198
struct huge_bootmem_page *m = NULL; /* initialize for clang */
3199
int nr_nodes, node = nid;
3200
3201
/* do node specific alloc */
3202
if (nid != NUMA_NO_NODE) {
3203
m = alloc_bootmem(h, node, true);
3204
if (!m)
3205
return 0;
3206
goto found;
3207
}
3208
3209
/* allocate from next node when distributing huge pages */
3210
for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
3211
&hugetlb_bootmem_nodes) {
3212
m = alloc_bootmem(h, node, false);
3213
if (!m)
3214
return 0;
3215
goto found;
3216
}
3217
3218
found:
3219
3220
/*
* Only initialize the head struct page in memmap_init_reserved_pages;
* the rest of the struct pages will be initialized by the HugeTLB
* subsystem itself.
* The head struct page is used by the HugeTLB subsystem to get folio
* information such as the zone id and node id.
*/
3227
memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
3228
huge_page_size(h) - PAGE_SIZE);
3229
3230
return 1;
3231
}
3232
3233
/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */
3234
static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
3235
unsigned long start_page_number,
3236
unsigned long end_page_number)
3237
{
3238
enum zone_type zone = zone_idx(folio_zone(folio));
3239
int nid = folio_nid(folio);
3240
unsigned long head_pfn = folio_pfn(folio);
3241
unsigned long pfn, end_pfn = head_pfn + end_page_number;
3242
int ret;
3243
3244
for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
3245
struct page *page = pfn_to_page(pfn);
3246
3247
__init_single_page(page, pfn, zone, nid);
3248
prep_compound_tail((struct page *)folio, pfn - head_pfn);
3249
ret = page_ref_freeze(page, 1);
3250
VM_BUG_ON(!ret);
3251
}
3252
}
3253
3254
static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
3255
struct hstate *h,
3256
unsigned long nr_pages)
3257
{
3258
int ret;
3259
3260
/* Prepare folio head */
3261
__folio_clear_reserved(folio);
3262
__folio_set_head(folio);
3263
ret = folio_ref_freeze(folio, 1);
3264
VM_BUG_ON(!ret);
3265
/* Initialize the necessary tail struct pages */
3266
hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
3267
prep_compound_head((struct page *)folio, huge_page_order(h));
3268
}
3269
3270
static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
3271
{
3272
return m->flags & HUGE_BOOTMEM_HVO;
3273
}
3274
3275
static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
3276
{
3277
return m->flags & HUGE_BOOTMEM_CMA;
3278
}
3279
3280
/*
3281
* memblock-allocated pageblocks might not have the migrate type set
3282
* if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
3283
* here, or MIGRATE_CMA if this was a page allocated through an early CMA
3284
* reservation.
3285
*
3286
* In case of vmemmap optimized folios, the tail vmemmap pages are mapped
3287
* read-only, but that's ok - for sparse vmemmap this does not write to
3288
* the page structure.
3289
*/
3290
static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
3291
struct hstate *h)
3292
{
3293
unsigned long nr_pages = pages_per_huge_page(h), i;
3294
3295
WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
3296
3297
for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
3298
if (folio_test_hugetlb_cma(folio))
3299
init_cma_pageblock(folio_page(folio, i));
3300
else
3301
init_pageblock_migratetype(folio_page(folio, i),
3302
MIGRATE_MOVABLE, false);
3303
}
3304
}
3305
3306
static void __init prep_and_add_bootmem_folios(struct hstate *h,
3307
struct list_head *folio_list)
3308
{
3309
unsigned long flags;
3310
struct folio *folio, *tmp_f;
3311
3312
/* Send list for bulk vmemmap optimization processing */
3313
hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);
3314
3315
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
3316
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
3317
/*
* If HVO fails, initialize all tail struct pages.
* We do not worry about a potentially long lock hold
* time as this is early in boot and there should
* be no contention.
*/
3323
hugetlb_folio_init_tail_vmemmap(folio,
3324
HUGETLB_VMEMMAP_RESERVE_PAGES,
3325
pages_per_huge_page(h));
3326
}
3327
hugetlb_bootmem_init_migratetype(folio, h);
3328
/* Subdivide locks to achieve better parallel performance */
3329
spin_lock_irqsave(&hugetlb_lock, flags);
3330
__prep_account_new_huge_page(h, folio_nid(folio));
3331
enqueue_hugetlb_folio(h, folio);
3332
spin_unlock_irqrestore(&hugetlb_lock, flags);
3333
}
3334
}
3335
3336
bool __init hugetlb_bootmem_page_zones_valid(int nid,
3337
struct huge_bootmem_page *m)
3338
{
3339
unsigned long start_pfn;
3340
bool valid;
3341
3342
if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
3343
/*
3344
* Already validated, skip check.
3345
*/
3346
return true;
3347
}
3348
3349
if (hugetlb_bootmem_page_earlycma(m)) {
3350
valid = cma_validate_zones(m->cma);
3351
goto out;
3352
}
3353
3354
start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
3355
3356
valid = !pfn_range_intersects_zones(nid, start_pfn,
3357
pages_per_huge_page(m->hstate));
3358
out:
3359
if (!valid)
3360
hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
3361
3362
return valid;
3363
}
3364
3365
/*
3366
* Free a bootmem page that was found to be invalid (intersecting with
3367
* multiple zones).
3368
*
3369
* Since it intersects with multiple zones, we can't just do a free
3370
* operation on all pages at once, but instead have to walk all
3371
* pages, freeing them one by one.
3372
*/
3373
static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
3374
struct hstate *h)
3375
{
3376
unsigned long npages = pages_per_huge_page(h);
3377
unsigned long pfn;
3378
3379
while (npages--) {
3380
pfn = page_to_pfn(page);
3381
__init_page_from_nid(pfn, nid);
3382
free_reserved_page(page);
3383
page++;
3384
}
3385
}
3386
3387
/*
3388
* Put bootmem huge pages into the standard lists after mem_map is up.
3389
* Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
3390
*/
3391
static void __init gather_bootmem_prealloc_node(unsigned long nid)
3392
{
3393
LIST_HEAD(folio_list);
3394
struct huge_bootmem_page *m, *tm;
3395
struct hstate *h = NULL, *prev_h = NULL;
3396
3397
list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
3398
struct page *page = virt_to_page(m);
3399
struct folio *folio = (void *)page;
3400
3401
h = m->hstate;
3402
if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
3403
/*
3404
* Can't use this page. Initialize the
3405
* page structures if that hasn't already
3406
* been done, and give them to the page
3407
* allocator.
3408
*/
3409
hugetlb_bootmem_free_invalid_page(nid, page, h);
3410
continue;
3411
}
3412
3413
/*
3414
* It is possible to have multiple huge page sizes (hstates)
3415
* in this list. If so, process each size separately.
3416
*/
3417
if (h != prev_h && prev_h != NULL)
3418
prep_and_add_bootmem_folios(prev_h, &folio_list);
3419
prev_h = h;
3420
3421
VM_BUG_ON(!hstate_is_gigantic(h));
3422
WARN_ON(folio_ref_count(folio) != 1);
3423
3424
hugetlb_folio_init_vmemmap(folio, h,
3425
HUGETLB_VMEMMAP_RESERVE_PAGES);
3426
init_new_hugetlb_folio(h, folio);
3427
3428
if (hugetlb_bootmem_page_prehvo(m))
3429
/*
3430
* If pre-HVO was done, just set the
3431
* flag, the HVO code will then skip
3432
* this folio.
3433
*/
3434
folio_set_hugetlb_vmemmap_optimized(folio);
3435
3436
if (hugetlb_bootmem_page_earlycma(m))
3437
folio_set_hugetlb_cma(folio);
3438
3439
list_add(&folio->lru, &folio_list);
3440
3441
/*
3442
* We need to restore the 'stolen' pages to totalram_pages
3443
* in order to fix confusing memory reports from free(1) and
3444
* other side-effects, like CommitLimit going negative.
3445
*
3446
* For CMA pages, this is done in init_cma_pageblock
3447
* (via hugetlb_bootmem_init_migratetype), so skip it here.
3448
*/
3449
if (!folio_test_hugetlb_cma(folio))
3450
adjust_managed_page_count(page, pages_per_huge_page(h));
3451
cond_resched();
3452
}
3453
3454
prep_and_add_bootmem_folios(h, &folio_list);
3455
}
3456
3457
static void __init gather_bootmem_prealloc_parallel(unsigned long start,
3458
unsigned long end, void *arg)
3459
{
3460
int nid;
3461
3462
for (nid = start; nid < end; nid++)
3463
gather_bootmem_prealloc_node(nid);
3464
}
3465
3466
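/*
* Process the per-node bootmem huge page lists in parallel, using up to
* one padata worker per node with memory.
*/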
static void __init gather_bootmem_prealloc(void)
3467
{
3468
struct padata_mt_job job = {
3469
.thread_fn = gather_bootmem_prealloc_parallel,
3470
.fn_arg = NULL,
3471
.start = 0,
3472
.size = nr_node_ids,
3473
.align = 1,
3474
.min_chunk = 1,
3475
.max_threads = num_node_state(N_MEMORY),
3476
.numa_aware = true,
3477
};
3478
3479
padata_do_multithreaded(&job);
3480
}
3481
3482
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3483
{
3484
unsigned long i;
3485
char buf[32];
3486
LIST_HEAD(folio_list);
3487
3488
for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3489
if (hstate_is_gigantic(h)) {
3490
if (!alloc_bootmem_huge_page(h, nid))
3491
break;
3492
} else {
3493
struct folio *folio;
3494
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3495
3496
folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
3497
&node_states[N_MEMORY], NULL);
3498
if (!folio)
3499
break;
3500
list_add(&folio->lru, &folio_list);
3501
}
3502
cond_resched();
3503
}
3504
3505
if (!list_empty(&folio_list))
3506
prep_and_add_allocated_folios(h, &folio_list);
3507
3508
if (i == h->max_huge_pages_node[nid])
3509
return;
3510
3511
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3512
pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3513
h->max_huge_pages_node[nid], buf, nid, i);
3514
h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3515
h->max_huge_pages_node[nid] = i;
3516
}
3517
3518
static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h)
3519
{
3520
int i;
3521
bool node_specific_alloc = false;
3522
3523
for_each_online_node(i) {
3524
if (h->max_huge_pages_node[i] > 0) {
3525
hugetlb_hstate_alloc_pages_onenode(h, i);
3526
node_specific_alloc = true;
3527
}
3528
}
3529
3530
return node_specific_alloc;
3531
}
3532
3533
static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h)
3534
{
3535
if (allocated < h->max_huge_pages) {
3536
char buf[32];
3537
3538
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3539
pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3540
h->max_huge_pages, buf, allocated);
3541
h->max_huge_pages = allocated;
3542
}
3543
}
3544
3545
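/*
* padata worker: allocate (end - start) fresh huge pages, interleaving
* across nodes with memory, and add them to the pool in bulk.
*/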
static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
3546
{
3547
struct hstate *h = (struct hstate *)arg;
3548
int i, num = end - start;
3549
nodemask_t node_alloc_noretry;
3550
LIST_HEAD(folio_list);
3551
int next_node = first_online_node;
3552
3553
/* Bit mask controlling how hard we retry per-node allocations.*/
3554
nodes_clear(node_alloc_noretry);
3555
3556
for (i = 0; i < num; ++i) {
3557
struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
3558
&node_alloc_noretry, &next_node);
3559
if (!folio)
3560
break;
3561
3562
list_move(&folio->lru, &folio_list);
3563
cond_resched();
3564
}
3565
3566
prep_and_add_allocated_folios(h, &folio_list);
3567
}
3568
3569
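/*
* Boot-time allocation of gigantic pages from memblock, one at a time.
* Returns the number of pages successfully allocated.
*/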
static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
3570
{
3571
unsigned long i;
3572
3573
for (i = 0; i < h->max_huge_pages; ++i) {
3574
if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3575
break;
3576
cond_resched();
3577
}
3578
3579
return i;
3580
}
3581
3582
static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
3583
{
3584
struct padata_mt_job job = {
3585
.fn_arg = h,
3586
.align = 1,
3587
.numa_aware = true
3588
};
3589
3590
unsigned long jiffies_start;
3591
unsigned long jiffies_end;
3592
3593
job.thread_fn = hugetlb_pages_alloc_boot_node;
3594
job.start = 0;
3595
job.size = h->max_huge_pages;
3596
3597
	/*
	 * job.max_threads is 25% of the available cpu threads by default.
	 *
	 * On large servers with terabytes of memory, huge page allocation
	 * can consume a considerable amount of time.
	 *
	 * Tests below show how long it takes to allocate 1 TiB of memory
	 * with 2MiB huge pages. Using more threads can significantly
	 * improve allocation time.
	 *
	 * +-----------------------+-------+-------+-------+-------+-------+
	 * | threads               |   8   |   16  |   32  |   64  |  128  |
	 * +-----------------------+-------+-------+-------+-------+-------+
	 * | skylake 144 cpus      |   44s |   22s |   16s |   19s |   20s |
	 * | cascade lake 192 cpus |   39s |   20s |   11s |   10s |    9s |
	 * +-----------------------+-------+-------+-------+-------+-------+
	 */
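	/*
	 * Illustrative arithmetic (the machine and counts are hypothetical):
	 * on a system with 64 online CPUs booted with hugepages=524288
	 * (1 TiB of 2MiB pages), the defaults below work out to
	 * 64 / 4 = 16 threads and min_chunk = 524288 / 16 = 32768 pages per
	 * chunk. The hugepage_alloc_threads= boot parameter, parsed later in
	 * this file, can be used to override the 25% default.
	 */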
3613
if (hugepage_allocation_threads == 0) {
3614
hugepage_allocation_threads = num_online_cpus() / 4;
3615
hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
3616
}
3617
3618
job.max_threads = hugepage_allocation_threads;
3619
job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
3620
3621
jiffies_start = jiffies;
3622
padata_do_multithreaded(&job);
3623
jiffies_end = jiffies;
3624
3625
pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
3626
jiffies_to_msecs(jiffies_end - jiffies_start),
3627
hugepage_allocation_threads);
3628
3629
return h->nr_huge_pages;
3630
}
3631
3632
/*
 * NOTE: this routine is called in different contexts for gigantic and
 * non-gigantic pages.
 * - For gigantic pages, this is called early in the boot process and
 *   pages are allocated from the memblock allocator or something similar.
 *   Gigantic pages are actually added to pools later with the routine
 *   gather_bootmem_prealloc.
 * - For non-gigantic pages, this is called later in the boot process after
 *   all of mm is up and functional. Pages are allocated from buddy and
 *   then added to hugetlb pools.
 */
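/*
 * As a rough usage illustration (sizes and counts are hypothetical):
 * booting with "hugepagesz=1G hugepages=8" reserves the eight gigantic
 * pages from memblock via the early path described above, while
 * "hugepagesz=2M hugepages=1024" defers the allocation to the later,
 * buddy-backed path.
 */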
3643
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3644
{
3645
unsigned long allocated;
3646
3647
/*
3648
* Skip gigantic hugepages allocation if early CMA
3649
* reservations are not available.
3650
*/
3651
if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
3652
!hugetlb_early_cma(h)) {
3653
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3654
return;
3655
}
3656
3657
/* do node specific alloc */
3658
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
3659
return;
3660
3661
/* below will do all node balanced alloc */
3662
if (hstate_is_gigantic(h))
3663
allocated = hugetlb_gigantic_pages_alloc_boot(h);
3664
else
3665
allocated = hugetlb_pages_alloc_boot(h);
3666
3667
hugetlb_hstate_alloc_pages_errcheck(allocated, h);
3668
}
3669
3670
static void __init hugetlb_init_hstates(void)
3671
{
3672
struct hstate *h, *h2;
3673
3674
for_each_hstate(h) {
3675
/*
3676
* Always reset to first_memory_node here, even if
3677
* next_nid_to_alloc was set before - we can't
3678
* reference hugetlb_bootmem_nodes after init, and
3679
* first_memory_node is right for all further allocations.
3680
*/
3681
h->next_nid_to_alloc = first_memory_node;
3682
h->next_nid_to_free = first_memory_node;
3683
3684
/* oversize hugepages were init'ed in early boot */
3685
if (!hstate_is_gigantic(h))
3686
hugetlb_hstate_alloc_pages(h);
3687
3688
/*
3689
* Set demote order for each hstate. Note that
3690
* h->demote_order is initially 0.
3691
* - We can not demote gigantic pages if runtime freeing
3692
* is not supported, so skip this.
3693
* - If CMA allocation is possible, we can not demote
3694
* HUGETLB_PAGE_ORDER or smaller size pages.
3695
*/
3696
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3697
continue;
3698
if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
3699
continue;
3700
for_each_hstate(h2) {
3701
if (h2 == h)
3702
continue;
3703
if (h2->order < h->order &&
3704
h2->order > h->demote_order)
3705
h->demote_order = h2->order;
3706
}
3707
}
3708
}
3709
3710
static void __init report_hugepages(void)
3711
{
3712
struct hstate *h;
3713
unsigned long nrinvalid;
3714
3715
for_each_hstate(h) {
3716
char buf[32];
3717
3718
nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
3719
h->max_huge_pages -= nrinvalid;
3720
3721
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3722
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
3723
buf, h->nr_huge_pages);
3724
if (nrinvalid)
3725
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
3726
buf, nrinvalid, str_plural(nrinvalid));
3727
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
3728
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
3729
}
3730
}
3731
3732
#ifdef CONFIG_HIGHMEM
3733
static void try_to_free_low(struct hstate *h, unsigned long count,
3734
nodemask_t *nodes_allowed)
3735
{
3736
int i;
3737
LIST_HEAD(page_list);
3738
3739
lockdep_assert_held(&hugetlb_lock);
3740
if (hstate_is_gigantic(h))
3741
return;
3742
3743
/*
3744
* Collect pages to be freed on a list, and free after dropping lock
3745
*/
3746
for_each_node_mask(i, *nodes_allowed) {
3747
struct folio *folio, *next;
3748
struct list_head *freel = &h->hugepage_freelists[i];
3749
list_for_each_entry_safe(folio, next, freel, lru) {
3750
if (count >= h->nr_huge_pages)
3751
goto out;
3752
if (folio_test_highmem(folio))
3753
continue;
3754
remove_hugetlb_folio(h, folio, false);
3755
list_add(&folio->lru, &page_list);
3756
}
3757
}
3758
3759
out:
3760
spin_unlock_irq(&hugetlb_lock);
3761
update_and_free_pages_bulk(h, &page_list);
3762
spin_lock_irq(&hugetlb_lock);
3763
}
3764
#else
3765
static inline void try_to_free_low(struct hstate *h, unsigned long count,
3766
nodemask_t *nodes_allowed)
3767
{
3768
}
3769
#endif
3770
3771
/*
3772
* Increment or decrement surplus_huge_pages. Keep node-specific counters
3773
* balanced by operating on them in a round-robin fashion.
3774
* Returns 1 if an adjustment was made.
3775
*/
3776
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3777
int delta)
3778
{
3779
int nr_nodes, node;
3780
3781
lockdep_assert_held(&hugetlb_lock);
3782
VM_BUG_ON(delta != -1 && delta != 1);
3783
3784
if (delta < 0) {
3785
for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) {
3786
if (h->surplus_huge_pages_node[node])
3787
goto found;
3788
}
3789
} else {
3790
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3791
if (h->surplus_huge_pages_node[node] <
3792
h->nr_huge_pages_node[node])
3793
goto found;
3794
}
3795
}
3796
return 0;
3797
3798
found:
3799
h->surplus_huge_pages += delta;
3800
h->surplus_huge_pages_node[node] += delta;
3801
return 1;
3802
}
3803
3804
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3805
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3806
nodemask_t *nodes_allowed)
3807
{
3808
unsigned long persistent_free_count;
3809
unsigned long min_count;
3810
unsigned long allocated;
3811
struct folio *folio;
3812
LIST_HEAD(page_list);
3813
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3814
3815
/*
3816
* Bit mask controlling how hard we retry per-node allocations.
3817
* If we can not allocate the bit mask, do not attempt to allocate
3818
* the requested huge pages.
3819
*/
3820
if (node_alloc_noretry)
3821
nodes_clear(*node_alloc_noretry);
3822
else
3823
return -ENOMEM;
3824
3825
/*
3826
* resize_lock mutex prevents concurrent adjustments to number of
3827
* pages in hstate via the proc/sysfs interfaces.
3828
*/
3829
mutex_lock(&h->resize_lock);
3830
flush_free_hpage_work(h);
3831
spin_lock_irq(&hugetlb_lock);
3832
3833
/*
3834
* Check for a node specific request.
3835
* Changing node specific huge page count may require a corresponding
3836
* change to the global count. In any case, the passed node mask
3837
* (nodes_allowed) will restrict alloc/free to the specified node.
3838
*/
3839
if (nid != NUMA_NO_NODE) {
3840
unsigned long old_count = count;
3841
3842
count += persistent_huge_pages(h) -
3843
(h->nr_huge_pages_node[nid] -
3844
h->surplus_huge_pages_node[nid]);
3845
/*
3846
* User may have specified a large count value which caused the
3847
* above calculation to overflow. In this case, they wanted
3848
* to allocate as many huge pages as possible. Set count to
3849
* largest possible value to align with their intention.
3850
*/
3851
if (count < old_count)
3852
count = ULONG_MAX;
3853
}
3854
3855
/*
3856
* Gigantic pages runtime allocation depend on the capability for large
3857
* page range allocation.
3858
* If the system does not provide this feature, return an error when
3859
* the user tries to allocate gigantic pages but let the user free the
3860
* boottime allocated gigantic pages.
3861
*/
3862
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3863
if (count > persistent_huge_pages(h)) {
3864
spin_unlock_irq(&hugetlb_lock);
3865
mutex_unlock(&h->resize_lock);
3866
NODEMASK_FREE(node_alloc_noretry);
3867
return -EINVAL;
3868
}
3869
/* Fall through to decrease pool */
3870
}
3871
3872
/*
3873
* Increase the pool size
3874
* First take pages out of surplus state. Then make up the
3875
* remaining difference by allocating fresh huge pages.
3876
*
3877
* We might race with alloc_surplus_hugetlb_folio() here and be unable
3878
* to convert a surplus huge page to a normal huge page. That is
3879
* not critical, though, it just means the overall size of the
3880
* pool might be one hugepage larger than it needs to be, but
3881
* within all the constraints specified by the sysctls.
3882
*/
3883
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3884
if (!adjust_pool_surplus(h, nodes_allowed, -1))
3885
break;
3886
}
3887
3888
allocated = 0;
3889
while (count > (persistent_huge_pages(h) + allocated)) {
3890
/*
3891
* If this allocation races such that we no longer need the
3892
* page, free_huge_folio will handle it by freeing the page
3893
* and reducing the surplus.
3894
*/
3895
spin_unlock_irq(&hugetlb_lock);
3896
3897
/* yield cpu to avoid soft lockup */
3898
cond_resched();
3899
3900
folio = alloc_pool_huge_folio(h, nodes_allowed,
3901
node_alloc_noretry,
3902
&h->next_nid_to_alloc);
3903
if (!folio) {
3904
prep_and_add_allocated_folios(h, &page_list);
3905
spin_lock_irq(&hugetlb_lock);
3906
goto out;
3907
}
3908
3909
list_add(&folio->lru, &page_list);
3910
allocated++;
3911
3912
/* Bail for signals. Probably ctrl-c from user */
3913
if (signal_pending(current)) {
3914
prep_and_add_allocated_folios(h, &page_list);
3915
spin_lock_irq(&hugetlb_lock);
3916
goto out;
3917
}
3918
3919
spin_lock_irq(&hugetlb_lock);
3920
}
3921
3922
/* Add allocated pages to the pool */
3923
if (!list_empty(&page_list)) {
3924
spin_unlock_irq(&hugetlb_lock);
3925
prep_and_add_allocated_folios(h, &page_list);
3926
spin_lock_irq(&hugetlb_lock);
3927
}
3928
3929
/*
3930
* Decrease the pool size
3931
* First return free pages to the buddy allocator (being careful
3932
* to keep enough around to satisfy reservations). Then place
3933
* pages into surplus state as needed so the pool will shrink
3934
* to the desired size as pages become free.
3935
*
3936
* By placing pages into the surplus state independent of the
3937
* overcommit value, we are allowing the surplus pool size to
3938
* exceed overcommit. There are few sane options here. Since
3939
* alloc_surplus_hugetlb_folio() is checking the global counter,
3940
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else, not until one of the
	 * sysctls is changed or the surplus pages go out of use.
3943
*
3944
* min_count is the expected number of persistent pages, we
3945
* shouldn't calculate min_count by using
3946
* resv_huge_pages + persistent_huge_pages() - free_huge_pages,
3947
* because there may exist free surplus huge pages, and this will
3948
* lead to subtracting twice. Free surplus huge pages come from HVO
3949
* failing to restore vmemmap, see comments in the callers of
3950
* hugetlb_vmemmap_restore_folio(). Thus, we should calculate
3951
* persistent free count first.
3952
*/
3953
persistent_free_count = h->free_huge_pages;
3954
if (h->free_huge_pages > persistent_huge_pages(h)) {
3955
if (h->free_huge_pages > h->surplus_huge_pages)
3956
persistent_free_count -= h->surplus_huge_pages;
3957
else
3958
persistent_free_count = 0;
3959
}
3960
min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
3961
min_count = max(count, min_count);
3962
try_to_free_low(h, min_count, nodes_allowed);
3963
3964
/*
3965
* Collect pages to be removed on list without dropping lock
3966
*/
3967
while (min_count < persistent_huge_pages(h)) {
3968
folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
3969
if (!folio)
3970
break;
3971
3972
list_add(&folio->lru, &page_list);
3973
}
3974
/* free the pages after dropping lock */
3975
spin_unlock_irq(&hugetlb_lock);
3976
update_and_free_pages_bulk(h, &page_list);
3977
flush_free_hpage_work(h);
3978
spin_lock_irq(&hugetlb_lock);
3979
3980
while (count < persistent_huge_pages(h)) {
3981
if (!adjust_pool_surplus(h, nodes_allowed, 1))
3982
break;
3983
}
3984
out:
3985
h->max_huge_pages = persistent_huge_pages(h);
3986
spin_unlock_irq(&hugetlb_lock);
3987
mutex_unlock(&h->resize_lock);
3988
3989
NODEMASK_FREE(node_alloc_noretry);
3990
3991
return 0;
3992
}
3993
3994
static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
3995
struct list_head *src_list)
3996
{
3997
long rc;
3998
struct folio *folio, *next;
3999
LIST_HEAD(dst_list);
4000
LIST_HEAD(ret_list);
4001
4002
rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
4003
list_splice_init(&ret_list, src_list);
4004
4005
/*
4006
* Taking target hstate mutex synchronizes with set_max_huge_pages.
4007
* Without the mutex, pages added to target hstate could be marked
4008
* as surplus.
4009
*
4010
* Note that we already hold src->resize_lock. To prevent deadlock,
4011
* use the convention of always taking larger size hstate mutex first.
4012
*/
4013
mutex_lock(&dst->resize_lock);
4014
4015
list_for_each_entry_safe(folio, next, src_list, lru) {
4016
int i;
4017
bool cma;
4018
4019
if (folio_test_hugetlb_vmemmap_optimized(folio))
4020
continue;
4021
4022
cma = folio_test_hugetlb_cma(folio);
4023
4024
list_del(&folio->lru);
4025
4026
split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
4027
pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));
4028
4029
for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
4030
struct page *page = folio_page(folio, i);
4031
/* Careful: see __split_huge_page_tail() */
4032
struct folio *new_folio = (struct folio *)page;
4033
4034
clear_compound_head(page);
4035
prep_compound_page(page, dst->order);
4036
4037
new_folio->mapping = NULL;
4038
init_new_hugetlb_folio(dst, new_folio);
4039
/* Copy the CMA flag so that it is freed correctly */
4040
if (cma)
4041
folio_set_hugetlb_cma(new_folio);
4042
list_add(&new_folio->lru, &dst_list);
4043
}
4044
}
4045
4046
prep_and_add_allocated_folios(dst, &dst_list);
4047
4048
mutex_unlock(&dst->resize_lock);
4049
4050
return rc;
4051
}
4052
4053
static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
4054
unsigned long nr_to_demote)
4055
__must_hold(&hugetlb_lock)
4056
{
4057
int nr_nodes, node;
4058
struct hstate *dst;
4059
long rc = 0;
4060
long nr_demoted = 0;
4061
4062
lockdep_assert_held(&hugetlb_lock);
4063
4064
/* We should never get here if no demote order */
4065
if (!src->demote_order) {
4066
pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
4067
return -EINVAL; /* internal error */
4068
}
4069
dst = size_to_hstate(PAGE_SIZE << src->demote_order);
4070
4071
for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
4072
LIST_HEAD(list);
4073
struct folio *folio, *next;
4074
4075
list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
4076
if (folio_test_hwpoison(folio))
4077
continue;
4078
4079
remove_hugetlb_folio(src, folio, false);
4080
list_add(&folio->lru, &list);
4081
4082
if (++nr_demoted == nr_to_demote)
4083
break;
4084
}
4085
4086
spin_unlock_irq(&hugetlb_lock);
4087
4088
rc = demote_free_hugetlb_folios(src, dst, &list);
4089
4090
spin_lock_irq(&hugetlb_lock);
4091
4092
list_for_each_entry_safe(folio, next, &list, lru) {
4093
list_del(&folio->lru);
4094
add_hugetlb_folio(src, folio, false);
4095
4096
nr_demoted--;
4097
}
4098
4099
if (rc < 0 || nr_demoted == nr_to_demote)
4100
break;
4101
}
4102
4103
/*
4104
* Not absolutely necessary, but for consistency update max_huge_pages
4105
* based on pool changes for the demoted page.
4106
*/
4107
src->max_huge_pages -= nr_demoted;
4108
dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));
4109
4110
if (rc < 0)
4111
return rc;
4112
4113
if (nr_demoted)
4114
return nr_demoted;
4115
/*
4116
* Only way to get here is if all pages on free lists are poisoned.
4117
* Return -EBUSY so that caller will not retry.
4118
*/
4119
return -EBUSY;
4120
}
4121
4122
#define HSTATE_ATTR_RO(_name) \
4123
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
4124
4125
#define HSTATE_ATTR_WO(_name) \
4126
static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
4127
4128
#define HSTATE_ATTR(_name) \
4129
static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
4130
4131
static struct kobject *hugepages_kobj;
4132
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
4133
4134
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
4135
4136
static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
4137
{
4138
int i;
4139
4140
for (i = 0; i < HUGE_MAX_HSTATE; i++)
4141
if (hstate_kobjs[i] == kobj) {
4142
if (nidp)
4143
*nidp = NUMA_NO_NODE;
4144
return &hstates[i];
4145
}
4146
4147
return kobj_to_node_hstate(kobj, nidp);
4148
}
4149
4150
static ssize_t nr_hugepages_show_common(struct kobject *kobj,
4151
struct kobj_attribute *attr, char *buf)
4152
{
4153
struct hstate *h;
4154
unsigned long nr_huge_pages;
4155
int nid;
4156
4157
h = kobj_to_hstate(kobj, &nid);
4158
if (nid == NUMA_NO_NODE)
4159
nr_huge_pages = h->nr_huge_pages;
4160
else
4161
nr_huge_pages = h->nr_huge_pages_node[nid];
4162
4163
return sysfs_emit(buf, "%lu\n", nr_huge_pages);
4164
}
4165
4166
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
4167
struct hstate *h, int nid,
4168
unsigned long count, size_t len)
4169
{
4170
int err;
4171
nodemask_t nodes_allowed, *n_mask;
4172
4173
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
4174
return -EINVAL;
4175
4176
if (nid == NUMA_NO_NODE) {
4177
/*
4178
* global hstate attribute
4179
*/
4180
if (!(obey_mempolicy &&
4181
init_nodemask_of_mempolicy(&nodes_allowed)))
4182
n_mask = &node_states[N_MEMORY];
4183
else
4184
n_mask = &nodes_allowed;
4185
} else {
4186
/*
4187
* Node specific request. count adjustment happens in
4188
* set_max_huge_pages() after acquiring hugetlb_lock.
4189
*/
4190
init_nodemask_of_node(&nodes_allowed, nid);
4191
n_mask = &nodes_allowed;
4192
}
4193
4194
err = set_max_huge_pages(h, count, nid, n_mask);
4195
4196
return err ? err : len;
4197
}
4198
4199
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
4200
struct kobject *kobj, const char *buf,
4201
size_t len)
4202
{
4203
struct hstate *h;
4204
unsigned long count;
4205
int nid;
4206
int err;
4207
4208
err = kstrtoul(buf, 10, &count);
4209
if (err)
4210
return err;
4211
4212
h = kobj_to_hstate(kobj, &nid);
4213
return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
4214
}
4215
4216
static ssize_t nr_hugepages_show(struct kobject *kobj,
4217
struct kobj_attribute *attr, char *buf)
4218
{
4219
return nr_hugepages_show_common(kobj, attr, buf);
4220
}
4221
4222
static ssize_t nr_hugepages_store(struct kobject *kobj,
4223
struct kobj_attribute *attr, const char *buf, size_t len)
4224
{
4225
return nr_hugepages_store_common(false, kobj, buf, len);
4226
}
4227
HSTATE_ATTR(nr_hugepages);
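/*
 * Sketch of typical usage, assuming the usual sysfs layout (values are
 * illustrative): the pool for a given page size can be resized at runtime
 * with, e.g.,
 *   echo 1024 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 * and, once the per-node attributes registered later in this file exist,
 * per node with
 *   echo 512 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
 */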
4228
4229
#ifdef CONFIG_NUMA
4230
4231
/*
4232
* hstate attribute for optionally mempolicy-based constraint on persistent
4233
* huge page alloc/free.
4234
*/
4235
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
4236
struct kobj_attribute *attr,
4237
char *buf)
4238
{
4239
return nr_hugepages_show_common(kobj, attr, buf);
4240
}
4241
4242
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
4243
struct kobj_attribute *attr, const char *buf, size_t len)
4244
{
4245
return nr_hugepages_store_common(true, kobj, buf, len);
4246
}
4247
HSTATE_ATTR(nr_hugepages_mempolicy);
4248
#endif
4249
4250
4251
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
4252
struct kobj_attribute *attr, char *buf)
4253
{
4254
struct hstate *h = kobj_to_hstate(kobj, NULL);
4255
return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
4256
}
4257
4258
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
4259
struct kobj_attribute *attr, const char *buf, size_t count)
4260
{
4261
int err;
4262
unsigned long input;
4263
struct hstate *h = kobj_to_hstate(kobj, NULL);
4264
4265
if (hstate_is_gigantic(h))
4266
return -EINVAL;
4267
4268
err = kstrtoul(buf, 10, &input);
4269
if (err)
4270
return err;
4271
4272
spin_lock_irq(&hugetlb_lock);
4273
h->nr_overcommit_huge_pages = input;
4274
spin_unlock_irq(&hugetlb_lock);
4275
4276
return count;
4277
}
4278
HSTATE_ATTR(nr_overcommit_hugepages);
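/*
 * Hedged example of the overcommit knob (path and value illustrative):
 *   echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages
 * allows up to 64 surplus 2MiB pages to be allocated from buddy on demand;
 * gigantic hstates reject writes here, as checked above.
 */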
4279
4280
static ssize_t free_hugepages_show(struct kobject *kobj,
4281
struct kobj_attribute *attr, char *buf)
4282
{
4283
struct hstate *h;
4284
unsigned long free_huge_pages;
4285
int nid;
4286
4287
h = kobj_to_hstate(kobj, &nid);
4288
if (nid == NUMA_NO_NODE)
4289
free_huge_pages = h->free_huge_pages;
4290
else
4291
free_huge_pages = h->free_huge_pages_node[nid];
4292
4293
return sysfs_emit(buf, "%lu\n", free_huge_pages);
4294
}
4295
HSTATE_ATTR_RO(free_hugepages);
4296
4297
static ssize_t resv_hugepages_show(struct kobject *kobj,
4298
struct kobj_attribute *attr, char *buf)
4299
{
4300
struct hstate *h = kobj_to_hstate(kobj, NULL);
4301
return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
4302
}
4303
HSTATE_ATTR_RO(resv_hugepages);
4304
4305
static ssize_t surplus_hugepages_show(struct kobject *kobj,
4306
struct kobj_attribute *attr, char *buf)
4307
{
4308
struct hstate *h;
4309
unsigned long surplus_huge_pages;
4310
int nid;
4311
4312
h = kobj_to_hstate(kobj, &nid);
4313
if (nid == NUMA_NO_NODE)
4314
surplus_huge_pages = h->surplus_huge_pages;
4315
else
4316
surplus_huge_pages = h->surplus_huge_pages_node[nid];
4317
4318
return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
4319
}
4320
HSTATE_ATTR_RO(surplus_hugepages);
4321
4322
static ssize_t demote_store(struct kobject *kobj,
4323
struct kobj_attribute *attr, const char *buf, size_t len)
4324
{
4325
unsigned long nr_demote;
4326
unsigned long nr_available;
4327
nodemask_t nodes_allowed, *n_mask;
4328
struct hstate *h;
4329
int err;
4330
int nid;
4331
4332
err = kstrtoul(buf, 10, &nr_demote);
4333
if (err)
4334
return err;
4335
h = kobj_to_hstate(kobj, &nid);
4336
4337
if (nid != NUMA_NO_NODE) {
4338
init_nodemask_of_node(&nodes_allowed, nid);
4339
n_mask = &nodes_allowed;
4340
} else {
4341
n_mask = &node_states[N_MEMORY];
4342
}
4343
4344
/* Synchronize with other sysfs operations modifying huge pages */
4345
mutex_lock(&h->resize_lock);
4346
spin_lock_irq(&hugetlb_lock);
4347
4348
while (nr_demote) {
4349
long rc;
4350
4351
		/*
		 * Check for available pages to demote each time through the
		 * loop as demote_pool_huge_page will drop hugetlb_lock.
		 */
4355
if (nid != NUMA_NO_NODE)
4356
nr_available = h->free_huge_pages_node[nid];
4357
else
4358
nr_available = h->free_huge_pages;
4359
nr_available -= h->resv_huge_pages;
4360
if (!nr_available)
4361
break;
4362
4363
rc = demote_pool_huge_page(h, n_mask, nr_demote);
4364
if (rc < 0) {
4365
err = rc;
4366
break;
4367
}
4368
4369
nr_demote -= rc;
4370
}
4371
4372
spin_unlock_irq(&hugetlb_lock);
4373
mutex_unlock(&h->resize_lock);
4374
4375
if (err)
4376
return err;
4377
return len;
4378
}
4379
HSTATE_ATTR_WO(demote);
4380
4381
static ssize_t demote_size_show(struct kobject *kobj,
4382
struct kobj_attribute *attr, char *buf)
4383
{
4384
struct hstate *h = kobj_to_hstate(kobj, NULL);
4385
unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
4386
4387
return sysfs_emit(buf, "%lukB\n", demote_size);
4388
}
4389
4390
static ssize_t demote_size_store(struct kobject *kobj,
4391
struct kobj_attribute *attr,
4392
const char *buf, size_t count)
4393
{
4394
struct hstate *h, *demote_hstate;
4395
unsigned long demote_size;
4396
unsigned int demote_order;
4397
4398
demote_size = (unsigned long)memparse(buf, NULL);
4399
4400
demote_hstate = size_to_hstate(demote_size);
4401
if (!demote_hstate)
4402
return -EINVAL;
4403
demote_order = demote_hstate->order;
4404
if (demote_order < HUGETLB_PAGE_ORDER)
4405
return -EINVAL;
4406
4407
/* demote order must be smaller than hstate order */
4408
h = kobj_to_hstate(kobj, NULL);
4409
if (demote_order >= h->order)
4410
return -EINVAL;
4411
4412
/* resize_lock synchronizes access to demote size and writes */
4413
mutex_lock(&h->resize_lock);
4414
h->demote_order = demote_order;
4415
mutex_unlock(&h->resize_lock);
4416
4417
return count;
4418
}
4419
HSTATE_ATTR(demote_size);
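/*
 * Illustrative demote usage (paths and sizes assume 1GiB and 2MiB hstates
 * exist):
 *   echo 2048kB > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size
 *   echo 4      > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote
 * would split up to four free 1GiB pages into 2MiB pages via
 * demote_pool_huge_page() above.
 */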
4420
4421
static struct attribute *hstate_attrs[] = {
4422
&nr_hugepages_attr.attr,
4423
&nr_overcommit_hugepages_attr.attr,
4424
&free_hugepages_attr.attr,
4425
&resv_hugepages_attr.attr,
4426
&surplus_hugepages_attr.attr,
4427
#ifdef CONFIG_NUMA
4428
&nr_hugepages_mempolicy_attr.attr,
4429
#endif
4430
NULL,
4431
};
4432
4433
static const struct attribute_group hstate_attr_group = {
4434
.attrs = hstate_attrs,
4435
};
4436
4437
static struct attribute *hstate_demote_attrs[] = {
4438
&demote_size_attr.attr,
4439
&demote_attr.attr,
4440
NULL,
4441
};
4442
4443
static const struct attribute_group hstate_demote_attr_group = {
4444
.attrs = hstate_demote_attrs,
4445
};
4446
4447
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
4448
struct kobject **hstate_kobjs,
4449
const struct attribute_group *hstate_attr_group)
4450
{
4451
int retval;
4452
int hi = hstate_index(h);
4453
4454
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
4455
if (!hstate_kobjs[hi])
4456
return -ENOMEM;
4457
4458
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
4459
if (retval) {
4460
kobject_put(hstate_kobjs[hi]);
4461
hstate_kobjs[hi] = NULL;
4462
return retval;
4463
}
4464
4465
if (h->demote_order) {
4466
retval = sysfs_create_group(hstate_kobjs[hi],
4467
&hstate_demote_attr_group);
4468
if (retval) {
4469
pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
4470
sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
4471
kobject_put(hstate_kobjs[hi]);
4472
hstate_kobjs[hi] = NULL;
4473
return retval;
4474
}
4475
}
4476
4477
return 0;
4478
}
4479
4480
#ifdef CONFIG_NUMA
4481
static bool hugetlb_sysfs_initialized __ro_after_init;
4482
4483
/*
4484
* node_hstate/s - associate per node hstate attributes, via their kobjects,
4485
* with node devices in node_devices[] using a parallel array. The array
4486
* index of a node device or _hstate == node id.
4487
* This is here to avoid any static dependency of the node device driver, in
4488
* the base kernel, on the hugetlb module.
4489
*/
4490
struct node_hstate {
4491
struct kobject *hugepages_kobj;
4492
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
4493
};
4494
static struct node_hstate node_hstates[MAX_NUMNODES];
4495
4496
/*
4497
* A subset of global hstate attributes for node devices
4498
*/
4499
static struct attribute *per_node_hstate_attrs[] = {
4500
&nr_hugepages_attr.attr,
4501
&free_hugepages_attr.attr,
4502
&surplus_hugepages_attr.attr,
4503
NULL,
4504
};
4505
4506
static const struct attribute_group per_node_hstate_attr_group = {
4507
.attrs = per_node_hstate_attrs,
4508
};
4509
4510
/*
4511
* kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
4512
* Returns node id via non-NULL nidp.
4513
*/
4514
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4515
{
4516
int nid;
4517
4518
for (nid = 0; nid < nr_node_ids; nid++) {
4519
struct node_hstate *nhs = &node_hstates[nid];
4520
int i;
4521
for (i = 0; i < HUGE_MAX_HSTATE; i++)
4522
if (nhs->hstate_kobjs[i] == kobj) {
4523
if (nidp)
4524
*nidp = nid;
4525
return &hstates[i];
4526
}
4527
}
4528
4529
BUG();
4530
return NULL;
4531
}
4532
4533
/*
4534
* Unregister hstate attributes from a single node device.
4535
* No-op if no hstate attributes attached.
4536
*/
4537
void hugetlb_unregister_node(struct node *node)
4538
{
4539
struct hstate *h;
4540
struct node_hstate *nhs = &node_hstates[node->dev.id];
4541
4542
if (!nhs->hugepages_kobj)
4543
return; /* no hstate attributes */
4544
4545
for_each_hstate(h) {
4546
int idx = hstate_index(h);
4547
struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
4548
4549
if (!hstate_kobj)
4550
continue;
4551
if (h->demote_order)
4552
sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
4553
sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
4554
kobject_put(hstate_kobj);
4555
nhs->hstate_kobjs[idx] = NULL;
4556
}
4557
4558
kobject_put(nhs->hugepages_kobj);
4559
nhs->hugepages_kobj = NULL;
4560
}
4561
4562
4563
/*
4564
* Register hstate attributes for a single node device.
4565
* No-op if attributes already registered.
4566
*/
4567
void hugetlb_register_node(struct node *node)
4568
{
4569
struct hstate *h;
4570
struct node_hstate *nhs = &node_hstates[node->dev.id];
4571
int err;
4572
4573
if (!hugetlb_sysfs_initialized)
4574
return;
4575
4576
if (nhs->hugepages_kobj)
4577
return; /* already allocated */
4578
4579
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
4580
&node->dev.kobj);
4581
if (!nhs->hugepages_kobj)
4582
return;
4583
4584
for_each_hstate(h) {
4585
err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
4586
nhs->hstate_kobjs,
4587
&per_node_hstate_attr_group);
4588
if (err) {
4589
pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
4590
h->name, node->dev.id);
4591
hugetlb_unregister_node(node);
4592
break;
4593
}
4594
}
4595
}
4596
4597
/*
4598
* hugetlb init time: register hstate attributes for all registered node
4599
* devices of nodes that have memory. All on-line nodes should have
4600
* registered their associated device by this time.
4601
*/
4602
static void __init hugetlb_register_all_nodes(void)
4603
{
4604
int nid;
4605
4606
for_each_online_node(nid)
4607
hugetlb_register_node(node_devices[nid]);
4608
}
4609
#else /* !CONFIG_NUMA */
4610
4611
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4612
{
4613
BUG();
4614
if (nidp)
4615
*nidp = -1;
4616
return NULL;
4617
}
4618
4619
static void hugetlb_register_all_nodes(void) { }
4620
4621
#endif
4622
4623
static void __init hugetlb_sysfs_init(void)
4624
{
4625
struct hstate *h;
4626
int err;
4627
4628
hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
4629
if (!hugepages_kobj)
4630
return;
4631
4632
for_each_hstate(h) {
4633
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
4634
hstate_kobjs, &hstate_attr_group);
4635
if (err)
4636
pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
4637
}
4638
4639
#ifdef CONFIG_NUMA
4640
hugetlb_sysfs_initialized = true;
4641
#endif
4642
hugetlb_register_all_nodes();
4643
}
4644
4645
#ifdef CONFIG_SYSCTL
4646
static void hugetlb_sysctl_init(void);
4647
#else
4648
static inline void hugetlb_sysctl_init(void) { }
4649
#endif
4650
4651
static int __init hugetlb_init(void)
4652
{
4653
int i;
4654
4655
BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4656
__NR_HPAGEFLAGS);
4657
4658
if (!hugepages_supported()) {
4659
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4660
pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4661
return 0;
4662
}
4663
4664
/*
4665
* Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
4666
* architectures depend on setup being done here.
4667
*/
4668
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4669
if (!parsed_default_hugepagesz) {
4670
/*
4671
* If we did not parse a default huge page size, set
4672
* default_hstate_idx to HPAGE_SIZE hstate. And, if the
4673
* number of huge pages for this default size was implicitly
4674
* specified, set that here as well.
4675
* Note that the implicit setting will overwrite an explicit
4676
* setting. A warning will be printed in this case.
4677
*/
4678
default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4679
if (default_hstate_max_huge_pages) {
4680
if (default_hstate.max_huge_pages) {
4681
char buf[32];
4682
4683
string_get_size(huge_page_size(&default_hstate),
4684
1, STRING_UNITS_2, buf, 32);
4685
pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4686
default_hstate.max_huge_pages, buf);
4687
pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4688
default_hstate_max_huge_pages);
4689
}
4690
default_hstate.max_huge_pages =
4691
default_hstate_max_huge_pages;
4692
4693
for_each_online_node(i)
4694
default_hstate.max_huge_pages_node[i] =
4695
default_hugepages_in_node[i];
4696
}
4697
}
4698
4699
hugetlb_cma_check();
4700
hugetlb_init_hstates();
4701
gather_bootmem_prealloc();
4702
report_hugepages();
4703
4704
hugetlb_sysfs_init();
4705
hugetlb_cgroup_file_init();
4706
hugetlb_sysctl_init();
4707
4708
#ifdef CONFIG_SMP
4709
num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4710
#else
4711
num_fault_mutexes = 1;
4712
#endif
4713
hugetlb_fault_mutex_table =
4714
kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4715
GFP_KERNEL);
4716
BUG_ON(!hugetlb_fault_mutex_table);
4717
4718
for (i = 0; i < num_fault_mutexes; i++)
4719
mutex_init(&hugetlb_fault_mutex_table[i]);
4720
return 0;
4721
}
4722
subsys_initcall(hugetlb_init);
4723
4724
/* Overwritten by architectures with more huge page sizes */
4725
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4726
{
4727
return size == HPAGE_SIZE;
4728
}
4729
4730
void __init hugetlb_add_hstate(unsigned int order)
4731
{
4732
struct hstate *h;
4733
unsigned long i;
4734
4735
if (size_to_hstate(PAGE_SIZE << order)) {
4736
return;
4737
}
4738
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4739
BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
4740
h = &hstates[hugetlb_max_hstate++];
4741
__mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
4742
h->order = order;
4743
h->mask = ~(huge_page_size(h) - 1);
4744
for (i = 0; i < MAX_NUMNODES; ++i)
4745
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4746
INIT_LIST_HEAD(&h->hugepage_activelist);
4747
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4748
huge_page_size(h)/SZ_1K);
4749
4750
parsed_hstate = h;
4751
}
4752
4753
bool __init __weak hugetlb_node_alloc_supported(void)
4754
{
4755
return true;
4756
}
4757
4758
static void __init hugepages_clear_pages_in_node(void)
4759
{
4760
if (!hugetlb_max_hstate) {
4761
default_hstate_max_huge_pages = 0;
4762
memset(default_hugepages_in_node, 0,
4763
sizeof(default_hugepages_in_node));
4764
} else {
4765
parsed_hstate->max_huge_pages = 0;
4766
memset(parsed_hstate->max_huge_pages_node, 0,
4767
sizeof(parsed_hstate->max_huge_pages_node));
4768
}
4769
}
4770
4771
static __init int hugetlb_add_param(char *s, int (*setup)(char *))
4772
{
4773
size_t len;
4774
char *p;
4775
4776
if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
4777
return -EINVAL;
4778
4779
len = strlen(s) + 1;
4780
if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
4781
return -EINVAL;
4782
4783
p = &hstate_cmdline_buf[hstate_cmdline_index];
4784
memcpy(p, s, len);
4785
hstate_cmdline_index += len;
4786
4787
hugetlb_params[hugetlb_param_index].val = p;
4788
hugetlb_params[hugetlb_param_index].setup = setup;
4789
4790
hugetlb_param_index++;
4791
4792
return 0;
4793
}
4794
4795
static __init void hugetlb_parse_params(void)
4796
{
4797
int i;
4798
struct hugetlb_cmdline *hcp;
4799
4800
for (i = 0; i < hugetlb_param_index; i++) {
4801
hcp = &hugetlb_params[i];
4802
4803
hcp->setup(hcp->val);
4804
}
4805
4806
hugetlb_cma_validate_params();
4807
}
4808
4809
/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification. If not, ignore the hugepages value. hugepages can also
 * be the first huge page command line option, in which case it implicitly
 * specifies the number of huge pages for the default size.
 */
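/*
 * Hedged examples of the accepted forms (sizes and counts illustrative):
 *   hugepages=1024                  - 1024 pages of the default size
 *   hugepagesz=2M hugepages=512     - 512 pages of 2MiB
 *   hugepagesz=1G hugepages=0:2,1:2 - per-node form handled below:
 *                                     two 1GiB pages each on nodes 0 and 1
 */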
4816
static int __init hugepages_setup(char *s)
4817
{
4818
unsigned long *mhp;
4819
static unsigned long *last_mhp;
4820
int node = NUMA_NO_NODE;
4821
int count;
4822
unsigned long tmp;
4823
char *p = s;
4824
4825
if (!parsed_valid_hugepagesz) {
4826
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4827
parsed_valid_hugepagesz = true;
4828
return -EINVAL;
4829
}
4830
4831
/*
4832
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
4833
* yet, so this hugepages= parameter goes to the "default hstate".
4834
* Otherwise, it goes with the previously parsed hugepagesz or
4835
* default_hugepagesz.
4836
*/
4837
else if (!hugetlb_max_hstate)
4838
mhp = &default_hstate_max_huge_pages;
4839
else
4840
mhp = &parsed_hstate->max_huge_pages;
4841
4842
if (mhp == last_mhp) {
4843
pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4844
return 1;
4845
}
4846
4847
while (*p) {
4848
count = 0;
4849
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4850
goto invalid;
4851
/* Parameter is node format */
4852
if (p[count] == ':') {
4853
if (!hugetlb_node_alloc_supported()) {
4854
pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4855
return 1;
4856
}
4857
if (tmp >= MAX_NUMNODES || !node_online(tmp))
4858
goto invalid;
4859
node = array_index_nospec(tmp, MAX_NUMNODES);
4860
p += count + 1;
4861
/* Parse hugepages */
4862
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4863
goto invalid;
4864
if (!hugetlb_max_hstate)
4865
default_hugepages_in_node[node] = tmp;
4866
else
4867
parsed_hstate->max_huge_pages_node[node] = tmp;
4868
*mhp += tmp;
4869
			/* Go on to parse the next node */
4870
if (p[count] == ',')
4871
p += count + 1;
4872
else
4873
break;
4874
} else {
4875
if (p != s)
4876
goto invalid;
4877
*mhp = tmp;
4878
break;
4879
}
4880
}
4881
4882
last_mhp = mhp;
4883
4884
return 0;
4885
4886
invalid:
4887
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4888
hugepages_clear_pages_in_node();
4889
return -EINVAL;
4890
}
4891
hugetlb_early_param("hugepages", hugepages_setup);
4892
4893
/*
4894
* hugepagesz command line processing
4895
* A specific huge page size can only be specified once with hugepagesz.
4896
* hugepagesz is followed by hugepages on the command line. The global
4897
* variable 'parsed_valid_hugepagesz' is used to determine if prior
4898
* hugepagesz argument was valid.
4899
*/
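/*
 * Illustrative command line (architecture support for each size assumed):
 *   hugepagesz=1G hugepages=16 hugepagesz=2M hugepages=2048
 * sets up two hstates, each hugepages= applying to the hugepagesz= that
 * precedes it, as described above.
 */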
4900
static int __init hugepagesz_setup(char *s)
4901
{
4902
unsigned long size;
4903
struct hstate *h;
4904
4905
parsed_valid_hugepagesz = false;
4906
size = (unsigned long)memparse(s, NULL);
4907
4908
if (!arch_hugetlb_valid_size(size)) {
4909
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4910
return -EINVAL;
4911
}
4912
4913
h = size_to_hstate(size);
4914
if (h) {
4915
/*
4916
* hstate for this size already exists. This is normally
4917
* an error, but is allowed if the existing hstate is the
4918
* default hstate. More specifically, it is only allowed if
4919
* the number of huge pages for the default hstate was not
4920
* previously specified.
4921
*/
4922
if (!parsed_default_hugepagesz || h != &default_hstate ||
4923
default_hstate.max_huge_pages) {
4924
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4925
return -EINVAL;
4926
}
4927
4928
/*
4929
* No need to call hugetlb_add_hstate() as hstate already
4930
* exists. But, do set parsed_hstate so that a following
4931
* hugepages= parameter will be applied to this hstate.
4932
*/
4933
parsed_hstate = h;
4934
parsed_valid_hugepagesz = true;
4935
return 0;
4936
}
4937
4938
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4939
parsed_valid_hugepagesz = true;
4940
return 0;
4941
}
4942
hugetlb_early_param("hugepagesz", hugepagesz_setup);
4943
4944
/*
4945
* default_hugepagesz command line input
4946
* Only one instance of default_hugepagesz allowed on command line.
4947
*/
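/*
 * Example (the size must be valid for the architecture):
 *   default_hugepagesz=1G hugepages=2
 * makes 1GiB the default huge page size and associates the count of 2 with
 * that default hstate; per the handling below, a hugepages= given earlier
 * on the command line is picked up here as well.
 */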
4948
static int __init default_hugepagesz_setup(char *s)
4949
{
4950
unsigned long size;
4951
int i;
4952
4953
parsed_valid_hugepagesz = false;
4954
if (parsed_default_hugepagesz) {
4955
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4956
return -EINVAL;
4957
}
4958
4959
size = (unsigned long)memparse(s, NULL);
4960
4961
if (!arch_hugetlb_valid_size(size)) {
4962
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4963
return -EINVAL;
4964
}
4965
4966
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4967
parsed_valid_hugepagesz = true;
4968
parsed_default_hugepagesz = true;
4969
default_hstate_idx = hstate_index(size_to_hstate(size));
4970
4971
/*
4972
* The number of default huge pages (for this size) could have been
4973
* specified as the first hugetlb parameter: hugepages=X. If so,
4974
* then default_hstate_max_huge_pages is set. If the default huge
4975
* page size is gigantic (> MAX_PAGE_ORDER), then the pages must be
4976
* allocated here from bootmem allocator.
4977
*/
4978
if (default_hstate_max_huge_pages) {
4979
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4980
/*
4981
* Since this is an early parameter, we can't check
4982
* NUMA node state yet, so loop through MAX_NUMNODES.
4983
*/
4984
for (i = 0; i < MAX_NUMNODES; i++) {
4985
if (default_hugepages_in_node[i] != 0)
4986
default_hstate.max_huge_pages_node[i] =
4987
default_hugepages_in_node[i];
4988
}
4989
default_hstate_max_huge_pages = 0;
4990
}
4991
4992
return 0;
4993
}
4994
hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
4995
4996
void __init hugetlb_bootmem_set_nodes(void)
4997
{
4998
int i, nid;
4999
unsigned long start_pfn, end_pfn;
5000
5001
if (!nodes_empty(hugetlb_bootmem_nodes))
5002
return;
5003
5004
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5005
if (end_pfn > start_pfn)
5006
node_set(nid, hugetlb_bootmem_nodes);
5007
}
5008
}
5009
5010
static bool __hugetlb_bootmem_allocated __initdata;
5011
5012
bool __init hugetlb_bootmem_allocated(void)
5013
{
5014
return __hugetlb_bootmem_allocated;
5015
}
5016
5017
void __init hugetlb_bootmem_alloc(void)
5018
{
5019
struct hstate *h;
5020
int i;
5021
5022
if (__hugetlb_bootmem_allocated)
5023
return;
5024
5025
hugetlb_bootmem_set_nodes();
5026
5027
for (i = 0; i < MAX_NUMNODES; i++)
5028
INIT_LIST_HEAD(&huge_boot_pages[i]);
5029
5030
hugetlb_parse_params();
5031
5032
for_each_hstate(h) {
5033
h->next_nid_to_alloc = first_online_node;
5034
5035
if (hstate_is_gigantic(h))
5036
hugetlb_hstate_alloc_pages(h);
5037
}
5038
5039
__hugetlb_bootmem_allocated = true;
5040
}
5041
5042
/*
5043
* hugepage_alloc_threads command line parsing.
5044
*
5045
* When set, use this specific number of threads for the boot
5046
* allocation of hugepages.
5047
*/
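/*
 * Example (the thread count is illustrative): booting with
 *   hugepage_alloc_threads=16
 * makes hugetlb_pages_alloc_boot() use 16 padata threads instead of the
 * default of one quarter of the online CPUs.
 */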
5048
static int __init hugepage_alloc_threads_setup(char *s)
5049
{
5050
unsigned long allocation_threads;
5051
5052
if (kstrtoul(s, 0, &allocation_threads) != 0)
5053
return 1;
5054
5055
if (allocation_threads == 0)
5056
return 1;
5057
5058
hugepage_allocation_threads = allocation_threads;
5059
5060
return 1;
5061
}
5062
__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);
5063
5064
static unsigned int allowed_mems_nr(struct hstate *h)
5065
{
5066
int node;
5067
unsigned int nr = 0;
5068
nodemask_t *mbind_nodemask;
5069
unsigned int *array = h->free_huge_pages_node;
5070
gfp_t gfp_mask = htlb_alloc_mask(h);
5071
5072
mbind_nodemask = policy_mbind_nodemask(gfp_mask);
5073
for_each_node_mask(node, cpuset_current_mems_allowed) {
5074
if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
5075
nr += array[node];
5076
}
5077
5078
return nr;
5079
}
5080
5081
#ifdef CONFIG_SYSCTL
5082
static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
5083
void *buffer, size_t *length,
5084
loff_t *ppos, unsigned long *out)
5085
{
5086
struct ctl_table dup_table;
5087
5088
/*
5089
* In order to avoid races with __do_proc_doulongvec_minmax(), we
5090
* can duplicate the @table and alter the duplicate of it.
5091
*/
5092
dup_table = *table;
5093
dup_table.data = out;
5094
5095
return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
5096
}
5097
5098
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
5099
const struct ctl_table *table, int write,
5100
void *buffer, size_t *length, loff_t *ppos)
5101
{
5102
struct hstate *h = &default_hstate;
5103
unsigned long tmp = h->max_huge_pages;
5104
int ret;
5105
5106
if (!hugepages_supported())
5107
return -EOPNOTSUPP;
5108
5109
ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
5110
&tmp);
5111
if (ret)
5112
goto out;
5113
5114
if (write)
5115
ret = __nr_hugepages_store_common(obey_mempolicy, h,
5116
NUMA_NO_NODE, tmp, *length);
5117
out:
5118
return ret;
5119
}
5120
5121
static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
5122
void *buffer, size_t *length, loff_t *ppos)
5123
{
5124
5125
return hugetlb_sysctl_handler_common(false, table, write,
5126
buffer, length, ppos);
5127
}
5128
5129
#ifdef CONFIG_NUMA
5130
static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
5131
void *buffer, size_t *length, loff_t *ppos)
5132
{
5133
return hugetlb_sysctl_handler_common(true, table, write,
5134
buffer, length, ppos);
5135
}
5136
#endif /* CONFIG_NUMA */
5137
5138
static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
5139
void *buffer, size_t *length, loff_t *ppos)
5140
{
5141
struct hstate *h = &default_hstate;
5142
unsigned long tmp;
5143
int ret;
5144
5145
if (!hugepages_supported())
5146
return -EOPNOTSUPP;
5147
5148
tmp = h->nr_overcommit_huge_pages;
5149
5150
if (write && hstate_is_gigantic(h))
5151
return -EINVAL;
5152
5153
ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
5154
&tmp);
5155
if (ret)
5156
goto out;
5157
5158
if (write) {
5159
spin_lock_irq(&hugetlb_lock);
5160
h->nr_overcommit_huge_pages = tmp;
5161
spin_unlock_irq(&hugetlb_lock);
5162
}
5163
out:
5164
return ret;
5165
}
5166
5167
static const struct ctl_table hugetlb_table[] = {
5168
{
5169
.procname = "nr_hugepages",
5170
.data = NULL,
5171
.maxlen = sizeof(unsigned long),
5172
.mode = 0644,
5173
.proc_handler = hugetlb_sysctl_handler,
5174
},
5175
#ifdef CONFIG_NUMA
5176
{
5177
.procname = "nr_hugepages_mempolicy",
5178
.data = NULL,
5179
.maxlen = sizeof(unsigned long),
5180
.mode = 0644,
5181
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
5182
},
5183
#endif
5184
{
5185
.procname = "hugetlb_shm_group",
5186
.data = &sysctl_hugetlb_shm_group,
5187
.maxlen = sizeof(gid_t),
5188
.mode = 0644,
5189
.proc_handler = proc_dointvec,
5190
},
5191
{
5192
.procname = "nr_overcommit_hugepages",
5193
.data = NULL,
5194
.maxlen = sizeof(unsigned long),
5195
.mode = 0644,
5196
.proc_handler = hugetlb_overcommit_handler,
5197
},
5198
};
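/*
 * These entries live in the "vm" sysctl namespace registered below; for
 * example (values illustrative):
 *   sysctl vm.nr_hugepages=1024
 *   sysctl vm.nr_overcommit_hugepages=64
 *   sysctl vm.hugetlb_shm_group=1001
 * which is equivalent to writing the corresponding /proc/sys/vm/ files.
 */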
5199
5200
static void __init hugetlb_sysctl_init(void)
5201
{
5202
register_sysctl_init("vm", hugetlb_table);
5203
}
5204
#endif /* CONFIG_SYSCTL */
5205
5206
void hugetlb_report_meminfo(struct seq_file *m)
5207
{
5208
struct hstate *h;
5209
unsigned long total = 0;
5210
5211
if (!hugepages_supported())
5212
return;
5213
5214
for_each_hstate(h) {
5215
unsigned long count = h->nr_huge_pages;
5216
5217
total += huge_page_size(h) * count;
5218
5219
if (h == &default_hstate)
5220
seq_printf(m,
5221
"HugePages_Total: %5lu\n"
5222
"HugePages_Free: %5lu\n"
5223
"HugePages_Rsvd: %5lu\n"
5224
"HugePages_Surp: %5lu\n"
5225
"Hugepagesize: %8lu kB\n",
5226
count,
5227
h->free_huge_pages,
5228
h->resv_huge_pages,
5229
h->surplus_huge_pages,
5230
huge_page_size(h) / SZ_1K);
5231
}
5232
5233
seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
5234
}
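/*
 * Sketch of the resulting /proc/meminfo output for the default hstate
 * (all numbers are made up for illustration):
 *   HugePages_Total:    1024
 *   HugePages_Free:     1000
 *   HugePages_Rsvd:       16
 *   HugePages_Surp:        0
 *   Hugepagesize:       2048 kB
 *   Hugetlb:         2097152 kB
 * where the final Hugetlb line sums huge_page_size(h) * nr_huge_pages over
 * all hstates, as computed above.
 */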
5235
5236
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
5237
{
5238
struct hstate *h = &default_hstate;
5239
5240
if (!hugepages_supported())
5241
return 0;
5242
5243
return sysfs_emit_at(buf, len,
5244
"Node %d HugePages_Total: %5u\n"
5245
"Node %d HugePages_Free: %5u\n"
5246
"Node %d HugePages_Surp: %5u\n",
5247
nid, h->nr_huge_pages_node[nid],
5248
nid, h->free_huge_pages_node[nid],
5249
nid, h->surplus_huge_pages_node[nid]);
5250
}
5251
5252
void hugetlb_show_meminfo_node(int nid)
5253
{
5254
struct hstate *h;
5255
5256
if (!hugepages_supported())
5257
return;
5258
5259
for_each_hstate(h)
5260
printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
5261
nid,
5262
h->nr_huge_pages_node[nid],
5263
h->free_huge_pages_node[nid],
5264
h->surplus_huge_pages_node[nid],
5265
huge_page_size(h) / SZ_1K);
5266
}
5267
5268
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
5269
{
5270
seq_printf(m, "HugetlbPages:\t%8lu kB\n",
5271
K(atomic_long_read(&mm->hugetlb_usage)));
5272
}
5273
5274
/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
5275
unsigned long hugetlb_total_pages(void)
5276
{
5277
struct hstate *h;
5278
unsigned long nr_total_pages = 0;
5279
5280
for_each_hstate(h)
5281
nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
5282
return nr_total_pages;
5283
}
5284
5285
static int hugetlb_acct_memory(struct hstate *h, long delta)
5286
{
5287
int ret = -ENOMEM;
5288
5289
if (!delta)
5290
return 0;
5291
5292
spin_lock_irq(&hugetlb_lock);
5293
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. The application can still be OOM'ed by the kernel
	 * if there are not enough free hugetlb pages in the cpuset that the
	 * task is in. Attempting to enforce strict accounting with cpuset is
	 * almost impossible (or too ugly) because cpuset is so fluid that
	 * tasks or memory nodes can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to checking against the current free page availability
	 * as a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 *
	 * Apart from cpuset, we also have the memory policy mechanism that
	 * determines from which node the kernel will allocate memory in a
	 * NUMA system. So, similarly to cpuset, we should also consider the
	 * memory policy of the current task, as described above.
	 */
5316
if (delta > 0) {
5317
if (gather_surplus_pages(h, delta) < 0)
5318
goto out;
5319
5320
if (delta > allowed_mems_nr(h)) {
5321
return_unused_surplus_pages(h, delta);
5322
goto out;
5323
}
5324
}
5325
5326
ret = 0;
5327
if (delta < 0)
5328
return_unused_surplus_pages(h, (unsigned long) -delta);
5329
5330
out:
5331
spin_unlock_irq(&hugetlb_lock);
5332
return ret;
5333
}
5334
5335
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
5336
{
5337
struct resv_map *resv = vma_resv_map(vma);
5338
5339
/*
5340
* HPAGE_RESV_OWNER indicates a private mapping.
5341
* This new VMA should share its siblings reservation map if present.
5342
* The VMA will only ever have a valid reservation map pointer where
5343
* it is being copied for another still existing VMA. As that VMA
5344
* has a reference to the reservation map it cannot disappear until
5345
* after this open call completes. It is therefore safe to take a
5346
* new reference here without additional locking.
5347
*/
5348
if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
5349
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
5350
kref_get(&resv->refs);
5351
}
5352
5353
/*
5354
* vma_lock structure for sharable mappings is vma specific.
5355
* Clear old pointer (if copied via vm_area_dup) and allocate
5356
* new structure. Before clearing, make sure vma_lock is not
5357
* for this vma.
5358
*/
5359
if (vma->vm_flags & VM_MAYSHARE) {
5360
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
5361
5362
if (vma_lock) {
5363
if (vma_lock->vma != vma) {
5364
vma->vm_private_data = NULL;
5365
hugetlb_vma_lock_alloc(vma);
5366
} else
5367
pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
5368
} else
5369
hugetlb_vma_lock_alloc(vma);
5370
}
5371
}
5372
5373
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
5374
{
5375
struct hstate *h = hstate_vma(vma);
5376
struct resv_map *resv;
5377
struct hugepage_subpool *spool = subpool_vma(vma);
5378
unsigned long reserve, start, end;
5379
long gbl_reserve;
5380
5381
hugetlb_vma_lock_free(vma);
5382
5383
resv = vma_resv_map(vma);
5384
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
5385
return;
5386
5387
start = vma_hugecache_offset(h, vma, vma->vm_start);
5388
end = vma_hugecache_offset(h, vma, vma->vm_end);
5389
5390
reserve = (end - start) - region_count(resv, start, end);
5391
hugetlb_cgroup_uncharge_counter(resv, start, end);
5392
if (reserve) {
5393
/*
5394
* Decrement reserve counts. The global reserve count may be
5395
* adjusted if the subpool has a minimum size.
5396
*/
5397
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
5398
hugetlb_acct_memory(h, -gbl_reserve);
5399
}
5400
5401
kref_put(&resv->refs, resv_map_release);
5402
}
5403
5404
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
5405
{
5406
if (addr & ~(huge_page_mask(hstate_vma(vma))))
5407
return -EINVAL;
5408
return 0;
5409
}
5410
5411
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
5412
{
5413
/*
5414
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
5415
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
5416
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
5417
* This function is called in the middle of a VMA split operation, with
5418
* MM, VMA and rmap all write-locked to prevent concurrent page table
5419
* walks (except hardware and gup_fast()).
5420
*/
5421
vma_assert_write_locked(vma);
5422
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
5423
5424
if (addr & ~PUD_MASK) {
5425
unsigned long floor = addr & PUD_MASK;
5426
unsigned long ceil = floor + PUD_SIZE;
5427
5428
if (floor >= vma->vm_start && ceil <= vma->vm_end) {
5429
/*
5430
* Locking:
5431
* Use take_locks=false here.
5432
* The file rmap lock is already held.
5433
* The hugetlb VMA lock can't be taken when we already
5434
* hold the file rmap lock, and we don't need it because
5435
* its purpose is to synchronize against concurrent page
5436
* table walks, which are not possible thanks to the
5437
* locks held by our caller.
5438
*/
5439
hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
5440
}
5441
}
5442
}
5443
5444
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
5445
{
5446
return huge_page_size(hstate_vma(vma));
5447
}
5448
5449
/*
 * We cannot handle pagefaults against hugetlb pages at all. They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
5455
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
5456
{
5457
BUG();
5458
return 0;
5459
}
5460
5461
/*
5462
* When a new function is introduced to vm_operations_struct and added
5463
* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
5464
* This is because under System V memory model, mappings created via
5465
* shmget/shmat with "huge page" specified are backed by hugetlbfs files,
5466
* their original vm_ops are overwritten with shm_vm_ops.
5467
*/
5468
const struct vm_operations_struct hugetlb_vm_ops = {
5469
.fault = hugetlb_vm_op_fault,
5470
.open = hugetlb_vm_op_open,
5471
.close = hugetlb_vm_op_close,
5472
.may_split = hugetlb_vm_op_split,
5473
.pagesize = hugetlb_vm_op_pagesize,
5474
};
5475
5476
static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
5477
bool try_mkwrite)
5478
{
5479
pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
5480
unsigned int shift = huge_page_shift(hstate_vma(vma));
5481
5482
if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
5483
entry = pte_mkwrite_novma(pte_mkdirty(entry));
5484
} else {
5485
entry = pte_wrprotect(entry);
5486
}
5487
entry = pte_mkyoung(entry);
5488
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
5489
5490
return entry;
5491
}
5492
5493
static void set_huge_ptep_writable(struct vm_area_struct *vma,
5494
unsigned long address, pte_t *ptep)
5495
{
5496
pte_t entry;
5497
5498
entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep)));
5499
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
5500
update_mmu_cache(vma, address, ptep);
5501
}
5502
5503
static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
5504
unsigned long address, pte_t *ptep)
5505
{
5506
if (vma->vm_flags & VM_WRITE)
5507
set_huge_ptep_writable(vma, address, ptep);
5508
}
5509
5510
bool is_hugetlb_entry_migration(pte_t pte)
5511
{
5512
swp_entry_t swp;
5513
5514
if (huge_pte_none(pte) || pte_present(pte))
5515
return false;
5516
swp = pte_to_swp_entry(pte);
5517
if (is_migration_entry(swp))
5518
return true;
5519
else
5520
return false;
5521
}
5522
5523
bool is_hugetlb_entry_hwpoisoned(pte_t pte)
5524
{
5525
swp_entry_t swp;
5526
5527
if (huge_pte_none(pte) || pte_present(pte))
5528
return false;
5529
swp = pte_to_swp_entry(pte);
5530
if (is_hwpoison_entry(swp))
5531
return true;
5532
else
5533
return false;
5534
}
5535
5536
static void
5537
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
5538
struct folio *new_folio, pte_t old, unsigned long sz)
5539
{
5540
pte_t newpte = make_huge_pte(vma, new_folio, true);
5541
5542
__folio_mark_uptodate(new_folio);
5543
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
5544
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
5545
newpte = huge_pte_mkuffd_wp(newpte);
5546
set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
5547
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
5548
folio_set_hugetlb_migratable(new_folio);
5549
}
5550
5551
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
5552
struct vm_area_struct *dst_vma,
5553
struct vm_area_struct *src_vma)
5554
{
5555
pte_t *src_pte, *dst_pte, entry;
5556
struct folio *pte_folio;
5557
unsigned long addr;
5558
bool cow = is_cow_mapping(src_vma->vm_flags);
5559
struct hstate *h = hstate_vma(src_vma);
5560
unsigned long sz = huge_page_size(h);
5561
unsigned long npages = pages_per_huge_page(h);
5562
struct mmu_notifier_range range;
5563
unsigned long last_addr_mask;
5564
int ret = 0;
5565
5566
if (cow) {
5567
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
5568
src_vma->vm_start,
5569
src_vma->vm_end);
5570
mmu_notifier_invalidate_range_start(&range);
5571
vma_assert_write_locked(src_vma);
5572
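/* As in copy_page_range(): write_protect_seq lets GUP-fast detect a concurrent fork wr-protecting PTEs and retry. */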
raw_write_seqcount_begin(&src->write_protect_seq);
5573
} else {
5574
/*
5575
* For shared mappings the vma lock must be held before
5576
* calling hugetlb_walk() in the src vma. Otherwise, the
5577
* returned ptep could go away if part of a shared pmd and
5578
* another thread calls huge_pmd_unshare.
5579
*/
5580
hugetlb_vma_lock_read(src_vma);
5581
}
5582
5583
last_addr_mask = hugetlb_mask_last_page(h);
5584
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
5585
spinlock_t *src_ptl, *dst_ptl;
5586
src_pte = hugetlb_walk(src_vma, addr, sz);
5587
if (!src_pte) {
5588
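/* No page table here: jump to the last huge page covered by this page-table page so the loop increment skips to the next one. */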
addr |= last_addr_mask;
5589
continue;
5590
}
5591
dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
5592
if (!dst_pte) {
5593
ret = -ENOMEM;
5594
break;
5595
}
5596
5597
/*
5598
* If the pagetables are shared don't copy or take references.
5599
*
5600
* dst_pte == src_pte is the common case of src/dest sharing.
5601
* However, src could have 'unshared' and dst shares with
5602
* another vma. So page_count of ptep page is checked instead
5603
* to reliably determine whether pte is shared.
5604
*/
5605
if (page_count(virt_to_page(dst_pte)) > 1) {
5606
addr |= last_addr_mask;
5607
continue;
5608
}
5609
5610
dst_ptl = huge_pte_lock(h, dst, dst_pte);
5611
src_ptl = huge_pte_lockptr(h, src, src_pte);
5612
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5613
entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
5614
again:
5615
if (huge_pte_none(entry)) {
5616
/*
5617
* Skip if the src entry is none.
5618
*/
5619
;
5620
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
5621
if (!userfaultfd_wp(dst_vma))
5622
entry = huge_pte_clear_uffd_wp(entry);
5623
set_huge_pte_at(dst, addr, dst_pte, entry, sz);
5624
} else if (unlikely(is_hugetlb_entry_migration(entry))) {
5625
swp_entry_t swp_entry = pte_to_swp_entry(entry);
5626
bool uffd_wp = pte_swp_uffd_wp(entry);
5627
5628
if (!is_readable_migration_entry(swp_entry) && cow) {
5629
/*
5630
* COW mappings require pages in both
5631
* parent and child to be set to read.
5632
*/
5633
swp_entry = make_readable_migration_entry(
5634
swp_offset(swp_entry));
5635
entry = swp_entry_to_pte(swp_entry);
5636
if (userfaultfd_wp(src_vma) && uffd_wp)
5637
entry = pte_swp_mkuffd_wp(entry);
5638
set_huge_pte_at(src, addr, src_pte, entry, sz);
5639
}
5640
if (!userfaultfd_wp(dst_vma))
5641
entry = huge_pte_clear_uffd_wp(entry);
5642
set_huge_pte_at(dst, addr, dst_pte, entry, sz);
5643
} else if (unlikely(is_pte_marker(entry))) {
5644
pte_marker marker = copy_pte_marker(
5645
pte_to_swp_entry(entry), dst_vma);
5646
5647
if (marker)
5648
set_huge_pte_at(dst, addr, dst_pte,
5649
make_pte_marker(marker), sz);
5650
} else {
5651
entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
5652
pte_folio = page_folio(pte_page(entry));
5653
folio_get(pte_folio);
5654
5655
/*
5656
* Failing to duplicate the anon rmap is a rare case
5657
* where we see pinned hugetlb pages while they're
5658
* prone to COW. We need to do the COW earlier during
5659
* fork.
5660
*
5661
* When pre-allocating the page or copying data, we
5662
* need to be without the pgtable locks since we could
5663
* sleep during the process.
5664
*/
5665
if (!folio_test_anon(pte_folio)) {
5666
hugetlb_add_file_rmap(pte_folio);
5667
} else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) {
5668
pte_t src_pte_old = entry;
5669
struct folio *new_folio;
5670
5671
spin_unlock(src_ptl);
5672
spin_unlock(dst_ptl);
5673
/* Do not use the reserve as it's privately owned */
5674
new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
5675
if (IS_ERR(new_folio)) {
5676
folio_put(pte_folio);
5677
ret = PTR_ERR(new_folio);
5678
break;
5679
}
5680
ret = copy_user_large_folio(new_folio, pte_folio,
5681
addr, dst_vma);
5682
folio_put(pte_folio);
5683
if (ret) {
5684
folio_put(new_folio);
5685
break;
5686
}
5687
5688
/* Install the new hugetlb folio if src pte stable */
5689
dst_ptl = huge_pte_lock(h, dst, dst_pte);
5690
src_ptl = huge_pte_lockptr(h, src, src_pte);
5691
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5692
entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
5693
if (!pte_same(src_pte_old, entry)) {
5694
restore_reserve_on_error(h, dst_vma, addr,
5695
new_folio);
5696
folio_put(new_folio);
5697
/* huge_ptep of dst_pte won't change as in child */
5698
goto again;
5699
}
5700
hugetlb_install_folio(dst_vma, dst_pte, addr,
5701
new_folio, src_pte_old, sz);
5702
spin_unlock(src_ptl);
5703
spin_unlock(dst_ptl);
5704
continue;
5705
}
5706
5707
if (cow) {
5708
/*
5709
* No need to notify as we are downgrading page
5710
* table protection not changing it to point
5711
* to a new page.
5712
*
5713
* See Documentation/mm/mmu_notifier.rst
5714
*/
5715
huge_ptep_set_wrprotect(src, addr, src_pte);
5716
entry = huge_pte_wrprotect(entry);
5717
}
5718
5719
if (!userfaultfd_wp(dst_vma))
5720
entry = huge_pte_clear_uffd_wp(entry);
5721
5722
set_huge_pte_at(dst, addr, dst_pte, entry, sz);
5723
hugetlb_count_add(npages, dst);
5724
}
5725
spin_unlock(src_ptl);
5726
spin_unlock(dst_ptl);
5727
}
5728
5729
if (cow) {
5730
raw_write_seqcount_end(&src->write_protect_seq);
5731
mmu_notifier_invalidate_range_end(&range);
5732
} else {
5733
hugetlb_vma_unlock_read(src_vma);
5734
}
5735
5736
return ret;
5737
}
5738
5739
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
5740
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
5741
unsigned long sz)
5742
{
5743
bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
5744
struct hstate *h = hstate_vma(vma);
5745
struct mm_struct *mm = vma->vm_mm;
5746
spinlock_t *src_ptl, *dst_ptl;
5747
pte_t pte;
5748
5749
dst_ptl = huge_pte_lock(h, mm, dst_pte);
5750
src_ptl = huge_pte_lockptr(h, mm, src_pte);
5751
5752
/*
5753
* We don't have to worry about the ordering of src and dst ptlocks
5754
* because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
5755
*/
5756
if (src_ptl != dst_ptl)
5757
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5758
5759
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
5760
5761
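/* When userfaultfd gets no remap event, the uffd-wp state must not follow the move: drop a bare wp marker and clear the wp bit on present/swap entries. */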
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
5762
huge_pte_clear(mm, new_addr, dst_pte, sz);
5763
else {
5764
if (need_clear_uffd_wp) {
5765
if (pte_present(pte))
5766
pte = huge_pte_clear_uffd_wp(pte);
5767
else if (is_swap_pte(pte))
5768
pte = pte_swp_clear_uffd_wp(pte);
5769
}
5770
set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
5771
}
5772
5773
if (src_ptl != dst_ptl)
5774
spin_unlock(src_ptl);
5775
spin_unlock(dst_ptl);
5776
}
5777
5778
int move_hugetlb_page_tables(struct vm_area_struct *vma,
5779
struct vm_area_struct *new_vma,
5780
unsigned long old_addr, unsigned long new_addr,
5781
unsigned long len)
5782
{
5783
struct hstate *h = hstate_vma(vma);
5784
struct address_space *mapping = vma->vm_file->f_mapping;
5785
unsigned long sz = huge_page_size(h);
5786
struct mm_struct *mm = vma->vm_mm;
5787
unsigned long old_end = old_addr + len;
5788
unsigned long last_addr_mask;
5789
pte_t *src_pte, *dst_pte;
5790
struct mmu_notifier_range range;
5791
bool shared_pmd = false;
5792
5793
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
5794
old_end);
5795
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5796
/*
5797
* In case of shared PMDs, we should cover the maximum possible
5798
* range.
5799
*/
5800
flush_cache_range(vma, range.start, range.end);
5801
5802
mmu_notifier_invalidate_range_start(&range);
5803
last_addr_mask = hugetlb_mask_last_page(h);
5804
/* Prevent race with file truncation */
5805
hugetlb_vma_lock_write(vma);
5806
i_mmap_lock_write(mapping);
5807
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
5808
src_pte = hugetlb_walk(vma, old_addr, sz);
5809
if (!src_pte) {
5810
old_addr |= last_addr_mask;
5811
new_addr |= last_addr_mask;
5812
continue;
5813
}
5814
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
5815
continue;
5816
5817
if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
5818
shared_pmd = true;
5819
old_addr |= last_addr_mask;
5820
new_addr |= last_addr_mask;
5821
continue;
5822
}
5823
5824
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
5825
if (!dst_pte)
5826
break;
5827
5828
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
5829
}
5830
5831
if (shared_pmd)
5832
flush_hugetlb_tlb_range(vma, range.start, range.end);
5833
else
5834
flush_hugetlb_tlb_range(vma, old_end - len, old_end);
5835
mmu_notifier_invalidate_range_end(&range);
5836
i_mmap_unlock_write(mapping);
5837
hugetlb_vma_unlock_write(vma);
5838
5839
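/* Report how many bytes were actually moved; old_addr stops early if huge_pte_alloc() fails. */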
return len + old_addr - old_end;
5840
}
5841
5842
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
5843
unsigned long start, unsigned long end,
5844
struct folio *folio, zap_flags_t zap_flags)
5845
{
5846
struct mm_struct *mm = vma->vm_mm;
5847
const bool folio_provided = !!folio;
5848
unsigned long address;
5849
pte_t *ptep;
5850
pte_t pte;
5851
spinlock_t *ptl;
5852
struct hstate *h = hstate_vma(vma);
5853
unsigned long sz = huge_page_size(h);
5854
bool adjust_reservation = false;
5855
unsigned long last_addr_mask;
5856
bool force_flush = false;
5857
5858
WARN_ON(!is_vm_hugetlb_page(vma));
5859
BUG_ON(start & ~huge_page_mask(h));
5860
BUG_ON(end & ~huge_page_mask(h));
5861
5862
/*
5863
* This is a hugetlb vma; all the pte entries should point
5864
* to huge pages.
5865
*/
5866
tlb_change_page_size(tlb, sz);
5867
tlb_start_vma(tlb, vma);
5868
5869
last_addr_mask = hugetlb_mask_last_page(h);
5870
address = start;
5871
for (; address < end; address += sz) {
5872
ptep = hugetlb_walk(vma, address, sz);
5873
if (!ptep) {
5874
address |= last_addr_mask;
5875
continue;
5876
}
5877
5878
ptl = huge_pte_lock(h, mm, ptep);
5879
if (huge_pmd_unshare(mm, vma, address, ptep)) {
5880
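/* The PMD page was unshared: record a TLB flush for the covering PUD range, then skip to its last huge page. */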
spin_unlock(ptl);
5881
tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
5882
force_flush = true;
5883
address |= last_addr_mask;
5884
continue;
5885
}
5886
5887
pte = huge_ptep_get(mm, address, ptep);
5888
if (huge_pte_none(pte)) {
5889
spin_unlock(ptl);
5890
continue;
5891
}
5892
5893
/*
5894
* A migrating or HWPoisoned hugepage is already
5895
* unmapped and its refcount is dropped, so just clear pte here.
5896
*/
5897
if (unlikely(!pte_present(pte))) {
5898
/*
5899
* If the pte was wr-protected by uffd-wp in any of the
5900
* swap forms, meanwhile the caller does not want to
5901
* drop the uffd-wp bit in this zap, then replace the
5902
* pte with a marker.
5903
*/
5904
if (pte_swp_uffd_wp_any(pte) &&
5905
!(zap_flags & ZAP_FLAG_DROP_MARKER))
5906
set_huge_pte_at(mm, address, ptep,
5907
make_pte_marker(PTE_MARKER_UFFD_WP),
5908
sz);
5909
else
5910
huge_pte_clear(mm, address, ptep, sz);
5911
spin_unlock(ptl);
5912
continue;
5913
}
5914
5915
/*
5916
* If a folio is supplied, it is because a specific
5917
* folio is being unmapped, not a range. Ensure the folio we
5918
* are about to unmap is the actual folio of interest.
5919
*/
5920
if (folio_provided) {
5921
if (folio != page_folio(pte_page(pte))) {
5922
spin_unlock(ptl);
5923
continue;
5924
}
5925
/*
5926
* Mark the VMA as having unmapped its page so that
5927
* future faults in this VMA will fail rather than
5928
* looking like data was lost
5929
*/
5930
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5931
} else {
5932
folio = page_folio(pte_page(pte));
5933
}
5934
5935
pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
5936
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5937
if (huge_pte_dirty(pte))
5938
folio_mark_dirty(folio);
5939
/* Leave a uffd-wp pte marker if needed */
5940
if (huge_pte_uffd_wp(pte) &&
5941
!(zap_flags & ZAP_FLAG_DROP_MARKER))
5942
set_huge_pte_at(mm, address, ptep,
5943
make_pte_marker(PTE_MARKER_UFFD_WP),
5944
sz);
5945
hugetlb_count_sub(pages_per_huge_page(h), mm);
5946
hugetlb_remove_rmap(folio);
5947
5948
/*
5949
* Restore the reservation for an anonymous page, otherwise the
5950
* backing page could be stolen by someone.
5951
* If we are freeing a surplus, do not set the restore
5952
* reservation bit.
5953
*/
5954
if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
5955
folio_test_anon(folio)) {
5956
folio_set_hugetlb_restore_reserve(folio);
5957
/* Reservation to be adjusted after the spin lock */
5958
adjust_reservation = true;
5959
}
5960
5961
spin_unlock(ptl);
5962
5963
/*
5964
* Adjust the reservation for the region that will have the
5965
* reserve restored. Keep in mind that vma_needs_reservation() changes
5966
* resv->adds_in_progress if it succeeds. If this is not done,
5967
* do_exit() will not see it, and will keep the reservation
5968
* forever.
5969
*/
5970
if (adjust_reservation) {
5971
int rc = vma_needs_reservation(h, vma, address);
5972
5973
if (rc < 0)
5974
/* Presumably allocate_file_region_entries failed
5975
* to allocate a file_region struct. Clear
5976
* hugetlb_restore_reserve so that global reserve
5977
* count will not be incremented by free_huge_folio.
5978
* Act as if we consumed the reservation.
5979
*/
5980
folio_clear_hugetlb_restore_reserve(folio);
5981
else if (rc)
5982
vma_add_reservation(h, vma, address);
5983
}
5984
5985
tlb_remove_page_size(tlb, folio_page(folio, 0),
5986
folio_size(folio));
5987
/*
5988
* If we were instructed to unmap a specific folio, we're done.
5989
*/
5990
if (folio_provided)
5991
break;
5992
}
5993
tlb_end_vma(tlb, vma);
5994
5995
/*
5996
* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
5997
* could defer the flush until now, since by holding i_mmap_rwsem we
5998
* guaranteed that the last reference would not be dropped. But we must
5999
* do the flushing before we return, as otherwise i_mmap_rwsem will be
6000
* dropped and the last reference to the shared PMDs page might be
6001
* dropped as well.
6002
*
6003
* In theory we could defer the freeing of the PMD pages as well, but
6004
* huge_pmd_unshare() relies on the exact page_count for the PMD page to
6005
* detect sharing, so we cannot defer the release of the page either.
6006
* Instead, do flush now.
6007
*/
6008
if (force_flush)
6009
tlb_flush_mmu_tlbonly(tlb);
6010
}
6011
6012
void __hugetlb_zap_begin(struct vm_area_struct *vma,
6013
unsigned long *start, unsigned long *end)
6014
{
6015
if (!vma->vm_file) /* hugetlbfs_file_mmap error */
6016
return;
6017
6018
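/* If shared PMDs are possible, widen the zap range to PUD boundaries so that unsharing a PMD page is fully covered. */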
adjust_range_if_pmd_sharing_possible(vma, start, end);
6019
hugetlb_vma_lock_write(vma);
6020
if (vma->vm_file)
6021
i_mmap_lock_write(vma->vm_file->f_mapping);
6022
}
6023
6024
void __hugetlb_zap_end(struct vm_area_struct *vma,
6025
struct zap_details *details)
6026
{
6027
zap_flags_t zap_flags = details ? details->zap_flags : 0;
6028
6029
if (!vma->vm_file) /* hugetlbfs_file_mmap error */
6030
return;
6031
6032
if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
6033
/*
6034
* Unlock and free the vma lock before releasing i_mmap_rwsem.
6035
* When the vma_lock is freed, this makes the vma ineligible
6036
* for pmd sharing. And, i_mmap_rwsem is required to set up
6037
* pmd sharing. This is important as page tables for this
6038
* unmapped range will be asynchronously deleted. If the page
6039
* tables are shared, there will be issues when accessed by
6040
* someone else.
6041
*/
6042
__hugetlb_vma_unlock_write_free(vma);
6043
} else {
6044
hugetlb_vma_unlock_write(vma);
6045
}
6046
6047
if (vma->vm_file)
6048
i_mmap_unlock_write(vma->vm_file->f_mapping);
6049
}
6050
6051
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
6052
unsigned long end, struct folio *folio,
6053
zap_flags_t zap_flags)
6054
{
6055
struct mmu_notifier_range range;
6056
struct mmu_gather tlb;
6057
6058
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
6059
start, end);
6060
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6061
mmu_notifier_invalidate_range_start(&range);
6062
tlb_gather_mmu(&tlb, vma->vm_mm);
6063
6064
__unmap_hugepage_range(&tlb, vma, start, end,
6065
folio, zap_flags);
6066
6067
mmu_notifier_invalidate_range_end(&range);
6068
tlb_finish_mmu(&tlb);
6069
}
6070
6071
/*
6072
* This is called when the original mapper is failing to COW a MAP_PRIVATE
6073
* mapping it owns the reserve page for. The intention is to unmap the page
6074
* from other VMAs and let the children be SIGKILLed if they are faulting the
6075
* same region.
6076
*/
6077
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
6078
struct folio *folio, unsigned long address)
6079
{
6080
struct hstate *h = hstate_vma(vma);
6081
struct vm_area_struct *iter_vma;
6082
struct address_space *mapping;
6083
pgoff_t pgoff;
6084
6085
/*
6086
* vm_pgoff is in PAGE_SIZE units, hence the different calculation
6087
* from page cache lookup which is in HPAGE_SIZE units.
6088
*/
6089
address = address & huge_page_mask(h);
6090
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
6091
vma->vm_pgoff;
6092
mapping = vma->vm_file->f_mapping;
6093
6094
/*
6095
* Take the mapping lock for the duration of the table walk. As
6096
* this mapping should be shared between all the VMAs,
6097
* __unmap_hugepage_range() is called as the lock is already held
6098
*/
6099
i_mmap_lock_write(mapping);
6100
vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
6101
/* Do not unmap the current VMA */
6102
if (iter_vma == vma)
6103
continue;
6104
6105
/*
6106
* Shared VMAs have their own reserves and do not affect
6107
* MAP_PRIVATE accounting but it is possible that a shared
6108
* VMA is using the same page so check and skip such VMAs.
6109
*/
6110
if (iter_vma->vm_flags & VM_MAYSHARE)
6111
continue;
6112
6113
/*
6114
* Unmap the page from other VMAs without their own reserves.
6115
* They get marked to be SIGKILLed if they fault in these
6116
* areas. This is because a future no-page fault on this VMA
6117
* could insert a zeroed page instead of the data existing
6118
* from the time of fork. This would look like data corruption
6119
*/
6120
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
6121
unmap_hugepage_range(iter_vma, address,
6122
address + huge_page_size(h),
6123
folio, 0);
6124
}
6125
i_mmap_unlock_write(mapping);
6126
}
6127
6128
/*
6129
* hugetlb_wp() should be called with the page lock of the original hugepage held.
6130
* Called with hugetlb_fault_mutex_table held and pte_page locked so we
6131
* cannot race with other handlers or page migration.
6132
* Keep the pte_same checks anyway to make transition from the mutex easier.
6133
*/
6134
static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
6135
{
6136
struct vm_area_struct *vma = vmf->vma;
6137
struct mm_struct *mm = vma->vm_mm;
6138
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
6139
pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte);
6140
struct hstate *h = hstate_vma(vma);
6141
struct folio *old_folio;
6142
struct folio *new_folio;
6143
bool cow_from_owner = 0;
6144
vm_fault_t ret = 0;
6145
struct mmu_notifier_range range;
6146
6147
/*
6148
* Never handle CoW for uffd-wp protected pages. It should be only
6149
* handled when the uffd-wp protection is removed.
6150
*
6151
* Note that only the CoW optimization path (in hugetlb_no_page())
6152
* can trigger this, because hugetlb_fault() will always resolve
6153
* uffd-wp bit first.
6154
*/
6155
if (!unshare && huge_pte_uffd_wp(pte))
6156
return 0;
6157
6158
/* Let's take out MAP_SHARED mappings first. */
6159
if (vma->vm_flags & VM_MAYSHARE) {
6160
set_huge_ptep_writable(vma, vmf->address, vmf->pte);
6161
return 0;
6162
}
6163
6164
old_folio = page_folio(pte_page(pte));
6165
6166
delayacct_wpcopy_start();
6167
6168
retry_avoidcopy:
6169
/*
6170
* If no-one else is actually using this page, we're the exclusive
6171
* owner and can reuse this page.
6172
*
6173
* Note that we don't rely on the (safer) folio refcount here, because
6174
* copying the hugetlb folio when there are unexpected (temporary)
6175
* folio references could harm simple fork()+exit() users when
6176
* we run out of free hugetlb folios: we would have to kill processes
6177
* in scenarios that used to work. As a side effect, there can still
6178
* be leaks between processes, for example, with FOLL_GET users.
6179
*/
6180
if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
6181
if (!PageAnonExclusive(&old_folio->page)) {
6182
folio_move_anon_rmap(old_folio, vma);
6183
SetPageAnonExclusive(&old_folio->page);
6184
}
6185
if (likely(!unshare))
6186
set_huge_ptep_maybe_writable(vma, vmf->address,
6187
vmf->pte);
6188
6189
delayacct_wpcopy_end();
6190
return 0;
6191
}
6192
VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
6193
PageAnonExclusive(&old_folio->page), &old_folio->page);
6194
6195
/*
6196
* If the process that created a MAP_PRIVATE mapping is about to perform
6197
* a COW due to a shared page count, attempt to satisfy the allocation
6198
* without using the existing reserves.
6199
* In order to determine whether this is a COW on a MAP_PRIVATE mapping it
6200
* is enough to check whether the old_folio is anonymous. This means that
6201
* the reserve for this address was consumed. If reserves were used, a
6202
* partially faulted mapping at the time of fork() could consume its reserves
6203
* on COW instead of the full address range.
6204
*/
6205
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
6206
folio_test_anon(old_folio))
6207
cow_from_owner = true;
6208
6209
folio_get(old_folio);
6210
6211
/*
6212
* Drop page table lock as buddy allocator may be called. It will
6213
* be acquired again before returning to the caller, as expected.
6214
*/
6215
spin_unlock(vmf->ptl);
6216
new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
6217
6218
if (IS_ERR(new_folio)) {
6219
/*
6220
* If a process owning a MAP_PRIVATE mapping fails to COW,
6221
* it is due to references held by a child and an insufficient
6222
* huge page pool. To guarantee the original mappers
6223
* reliability, unmap the page from child processes. The child
6224
* may get SIGKILLed if it later faults.
6225
*/
6226
if (cow_from_owner) {
6227
struct address_space *mapping = vma->vm_file->f_mapping;
6228
pgoff_t idx;
6229
u32 hash;
6230
6231
folio_put(old_folio);
6232
/*
6233
* Drop hugetlb_fault_mutex and vma_lock before
6234
* unmapping. unmapping needs to hold vma_lock
6235
* in write mode. Dropping vma_lock in read mode
6236
* here is OK as COW mappings do not interact with
6237
* PMD sharing.
6238
*
6239
* Reacquire both after unmap operation.
6240
*/
6241
idx = vma_hugecache_offset(h, vma, vmf->address);
6242
hash = hugetlb_fault_mutex_hash(mapping, idx);
6243
hugetlb_vma_unlock_read(vma);
6244
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6245
6246
unmap_ref_private(mm, vma, old_folio, vmf->address);
6247
6248
mutex_lock(&hugetlb_fault_mutex_table[hash]);
6249
hugetlb_vma_lock_read(vma);
6250
spin_lock(vmf->ptl);
6251
vmf->pte = hugetlb_walk(vma, vmf->address,
6252
huge_page_size(h));
6253
if (likely(vmf->pte &&
6254
pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte)))
6255
goto retry_avoidcopy;
6256
/*
6257
* A race occurred while re-acquiring the page table
6258
* lock, and our job is done.
6259
*/
6260
delayacct_wpcopy_end();
6261
return 0;
6262
}
6263
6264
ret = vmf_error(PTR_ERR(new_folio));
6265
goto out_release_old;
6266
}
6267
6268
/*
6269
* When the original hugepage is a shared one, it does not have
6270
* anon_vma prepared.
6271
*/
6272
ret = __vmf_anon_prepare(vmf);
6273
if (unlikely(ret))
6274
goto out_release_all;
6275
6276
if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
6277
ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));
6278
goto out_release_all;
6279
}
6280
__folio_mark_uptodate(new_folio);
6281
6282
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
6283
vmf->address + huge_page_size(h));
6284
mmu_notifier_invalidate_range_start(&range);
6285
6286
/*
6287
* Retake the page table lock to check for racing updates
6288
* before the page tables are altered
6289
*/
6290
spin_lock(vmf->ptl);
6291
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
6292
if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
6293
pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
6294
6295
/* Break COW or unshare */
6296
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
6297
hugetlb_remove_rmap(old_folio);
6298
hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
6299
if (huge_pte_uffd_wp(pte))
6300
newpte = huge_pte_mkuffd_wp(newpte);
6301
set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
6302
huge_page_size(h));
6303
folio_set_hugetlb_migratable(new_folio);
6304
/* Make the old page be freed below */
6305
new_folio = old_folio;
6306
}
6307
spin_unlock(vmf->ptl);
6308
mmu_notifier_invalidate_range_end(&range);
6309
out_release_all:
6310
/*
6311
* No restore in case of successful pagetable update (Break COW or
6312
* unshare)
6313
*/
6314
if (new_folio != old_folio)
6315
restore_reserve_on_error(h, vma, vmf->address, new_folio);
6316
folio_put(new_folio);
6317
out_release_old:
6318
folio_put(old_folio);
6319
6320
spin_lock(vmf->ptl); /* Caller expects lock to be held */
6321
6322
delayacct_wpcopy_end();
6323
return ret;
6324
}
6325
6326
/*
6327
* Return whether there is a pagecache page to back given address within VMA.
6328
*/
6329
bool hugetlbfs_pagecache_present(struct hstate *h,
6330
struct vm_area_struct *vma, unsigned long address)
6331
{
6332
struct address_space *mapping = vma->vm_file->f_mapping;
6333
pgoff_t idx = linear_page_index(vma, address);
6334
struct folio *folio;
6335
6336
folio = filemap_get_folio(mapping, idx);
6337
if (IS_ERR(folio))
6338
return false;
6339
folio_put(folio);
6340
return true;
6341
}
6342
6343
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
6344
pgoff_t idx)
6345
{
6346
struct inode *inode = mapping->host;
6347
struct hstate *h = hstate_inode(inode);
6348
int err;
6349
6350
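/* The page cache indexes folios in base pages; convert the huge page index. */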
idx <<= huge_page_order(h);
6351
__folio_set_locked(folio);
6352
err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
6353
6354
if (unlikely(err)) {
6355
__folio_clear_locked(folio);
6356
return err;
6357
}
6358
folio_clear_hugetlb_restore_reserve(folio);
6359
6360
/*
6361
* mark folio dirty so that it will not be removed from cache/file
6362
* by non-hugetlbfs specific code paths.
6363
*/
6364
folio_mark_dirty(folio);
6365
6366
spin_lock(&inode->i_lock);
6367
inode->i_blocks += blocks_per_huge_page(h);
6368
spin_unlock(&inode->i_lock);
6369
return 0;
6370
}
6371
6372
static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf,
6373
struct address_space *mapping,
6374
unsigned long reason)
6375
{
6376
u32 hash;
6377
6378
/*
6379
* vma_lock and hugetlb_fault_mutex must be dropped before handling
6380
* userfault. Also mmap_lock could be dropped due to handling
6381
* userfault, any vma operation should be careful from here.
6382
*/
6383
hugetlb_vma_unlock_read(vmf->vma);
6384
hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
6385
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6386
return handle_userfault(vmf, reason);
6387
}
6388
6389
/*
6390
* Recheck pte with pgtable lock. Returns true if pte didn't change, or
6391
* false if pte changed or is changing.
6392
*/
6393
static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr,
6394
pte_t *ptep, pte_t old_pte)
6395
{
6396
spinlock_t *ptl;
6397
bool same;
6398
6399
ptl = huge_pte_lock(h, mm, ptep);
6400
same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte);
6401
spin_unlock(ptl);
6402
6403
return same;
6404
}
6405
6406
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
6407
struct vm_fault *vmf)
6408
{
6409
u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
6410
bool new_folio, new_anon_folio = false;
6411
struct vm_area_struct *vma = vmf->vma;
6412
struct mm_struct *mm = vma->vm_mm;
6413
struct hstate *h = hstate_vma(vma);
6414
vm_fault_t ret = VM_FAULT_SIGBUS;
6415
bool folio_locked = true;
6416
struct folio *folio;
6417
unsigned long size;
6418
pte_t new_pte;
6419
6420
/*
6421
* Currently, we are forced to kill the process in the event the
6422
* original mapper has unmapped pages from the child due to a failed
6423
* COW/unsharing. Warn that such a situation has occurred as it may not
6424
* be obvious.
6425
*/
6426
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
6427
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
6428
current->pid);
6429
goto out;
6430
}
6431
6432
/*
6433
* Use page lock to guard against racing truncation
6434
* before we get page_table_lock.
6435
*/
6436
new_folio = false;
6437
folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
6438
if (IS_ERR(folio)) {
6439
size = i_size_read(mapping->host) >> huge_page_shift(h);
6440
if (vmf->pgoff >= size)
6441
goto out;
6442
/* Check for page in userfault range */
6443
if (userfaultfd_missing(vma)) {
6444
/*
6445
* Since hugetlb_no_page() was examining pte
6446
* without pgtable lock, we need to re-test under
6447
* lock because the pte may not be stable and could
6448
* have changed from under us. Try to detect
6449
* either changed or during-changing ptes and retry
6450
* properly when needed.
6451
*
6452
* Note that userfaultfd is actually fine with
6453
* false positives (e.g. caused by pte changed),
6454
* but not wrong logical events (e.g. caused by
6455
* reading a pte during changing). The latter can
6456
* confuse the userspace, so the strictness is very
6457
* much preferred. E.g., MISSING event should
6458
* never happen on the page after UFFDIO_COPY has
6459
* correctly installed the page and returned.
6460
*/
6461
if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
6462
ret = 0;
6463
goto out;
6464
}
6465
6466
return hugetlb_handle_userfault(vmf, mapping,
6467
VM_UFFD_MISSING);
6468
}
6469
6470
if (!(vma->vm_flags & VM_MAYSHARE)) {
6471
ret = __vmf_anon_prepare(vmf);
6472
if (unlikely(ret))
6473
goto out;
6474
}
6475
6476
folio = alloc_hugetlb_folio(vma, vmf->address, false);
6477
if (IS_ERR(folio)) {
6478
/*
6479
* Returning error will result in faulting task being
6480
* sent SIGBUS. The hugetlb fault mutex prevents two
6481
* tasks from racing to fault in the same page which
6482
* could result in false 'unable to allocate' errors.
6483
* Page migration does not take the fault mutex, but
6484
* does a clear then write of pte's under page table
6485
* lock. Page fault code could race with migration,
6486
* notice the clear pte and try to allocate a page
6487
* here. Before returning error, get ptl and make
6488
* sure there really is no pte entry.
6489
*/
6490
if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte))
6491
ret = vmf_error(PTR_ERR(folio));
6492
else
6493
ret = 0;
6494
goto out;
6495
}
6496
folio_zero_user(folio, vmf->real_address);
6497
__folio_mark_uptodate(folio);
6498
new_folio = true;
6499
6500
if (vma->vm_flags & VM_MAYSHARE) {
6501
int err = hugetlb_add_to_page_cache(folio, mapping,
6502
vmf->pgoff);
6503
if (err) {
6504
/*
6505
* err can't be -EEXIST which implies someone
6506
* else consumed the reservation since hugetlb
6507
* fault mutex is held when adding a hugetlb page
6508
* to the page cache. So it's safe to call
6509
* restore_reserve_on_error() here.
6510
*/
6511
restore_reserve_on_error(h, vma, vmf->address,
6512
folio);
6513
folio_put(folio);
6514
ret = VM_FAULT_SIGBUS;
6515
goto out;
6516
}
6517
} else {
6518
new_anon_folio = true;
6519
folio_lock(folio);
6520
}
6521
} else {
6522
/*
6523
* If a memory error occurs between mmap() and fault, some processes
6524
* don't have a hwpoisoned swap entry for the errored virtual address.
6525
* So we need to block hugepage fault by PG_hwpoison bit check.
6526
*/
6527
if (unlikely(folio_test_hwpoison(folio))) {
6528
ret = VM_FAULT_HWPOISON_LARGE |
6529
VM_FAULT_SET_HINDEX(hstate_index(h));
6530
goto backout_unlocked;
6531
}
6532
6533
/* Check for page in userfault range. */
6534
if (userfaultfd_minor(vma)) {
6535
folio_unlock(folio);
6536
folio_put(folio);
6537
/* See comment in userfaultfd_missing() block above */
6538
if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
6539
ret = 0;
6540
goto out;
6541
}
6542
return hugetlb_handle_userfault(vmf, mapping,
6543
VM_UFFD_MINOR);
6544
}
6545
}
6546
6547
/*
6548
* If we are going to COW a private mapping later, we examine the
6549
* pending reservations for this page now. This will ensure that
6550
* any allocations necessary to record that reservation occur outside
6551
* the spinlock.
6552
*/
6553
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6554
if (vma_needs_reservation(h, vma, vmf->address) < 0) {
6555
ret = VM_FAULT_OOM;
6556
goto backout_unlocked;
6557
}
6558
/* Just decrements count, does not deallocate */
6559
vma_end_reservation(h, vma, vmf->address);
6560
}
6561
6562
vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
6563
ret = 0;
6564
/* If pte changed from under us, retry */
6565
if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
6566
goto backout;
6567
6568
if (new_anon_folio)
6569
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
6570
else
6571
hugetlb_add_file_rmap(folio);
6572
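/* Only shared mappings may get a writable pte here; a private write fault is finished by hugetlb_wp() just below. */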
new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
6573
/*
6574
* If this pte was previously wr-protected, keep it wr-protected even
6575
* if populated.
6576
*/
6577
if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
6578
new_pte = huge_pte_mkuffd_wp(new_pte);
6579
set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
6580
6581
hugetlb_count_add(pages_per_huge_page(h), mm);
6582
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6583
/*
6584
* No need to keep file folios locked. See comment in
6585
* hugetlb_fault().
6586
*/
6587
if (!new_anon_folio) {
6588
folio_locked = false;
6589
folio_unlock(folio);
6590
}
6591
/* Optimization, do the COW without a second fault */
6592
ret = hugetlb_wp(vmf);
6593
}
6594
6595
spin_unlock(vmf->ptl);
6596
6597
/*
6598
* Only set hugetlb_migratable in newly allocated pages. Existing pages
6599
* found in the pagecache may not have hugetlb_migratable if they have
6600
* been isolated for migration.
6601
*/
6602
if (new_folio)
6603
folio_set_hugetlb_migratable(folio);
6604
6605
if (folio_locked)
6606
folio_unlock(folio);
6607
out:
6608
hugetlb_vma_unlock_read(vma);
6609
6610
/*
6611
* We must check to release the per-VMA lock. __vmf_anon_prepare() is
6612
* the only way ret can be set to VM_FAULT_RETRY.
6613
*/
6614
if (unlikely(ret & VM_FAULT_RETRY))
6615
vma_end_read(vma);
6616
6617
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6618
return ret;
6619
6620
backout:
6621
spin_unlock(vmf->ptl);
6622
backout_unlocked:
6623
/* We only need to restore reservations for private mappings */
6624
if (new_anon_folio)
6625
restore_reserve_on_error(h, vma, vmf->address, folio);
6626
6627
folio_unlock(folio);
6628
folio_put(folio);
6629
goto out;
6630
}
6631
6632
#ifdef CONFIG_SMP
6633
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
6634
{
6635
unsigned long key[2];
6636
u32 hash;
6637
6638
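/* Hash on (mapping, index) so concurrent faults on the same page serialize on the same mutex. */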
key[0] = (unsigned long) mapping;
6639
key[1] = idx;
6640
6641
hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
6642
6643
return hash & (num_fault_mutexes - 1);
6644
}
6645
#else
6646
/*
6647
* For uniprocessor systems we always use a single mutex, so just
6648
* return 0 and avoid the hashing overhead.
6649
*/
6650
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
6651
{
6652
return 0;
6653
}
6654
#endif
6655
6656
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
6657
unsigned long address, unsigned int flags)
6658
{
6659
vm_fault_t ret;
6660
u32 hash;
6661
struct folio *folio = NULL;
6662
struct hstate *h = hstate_vma(vma);
6663
struct address_space *mapping;
6664
bool need_wait_lock = false;
6665
struct vm_fault vmf = {
6666
.vma = vma,
6667
.address = address & huge_page_mask(h),
6668
.real_address = address,
6669
.flags = flags,
6670
.pgoff = vma_hugecache_offset(h, vma,
6671
address & huge_page_mask(h)),
6672
/* TODO: Track hugetlb faults using vm_fault */
6673
6674
/*
6675
* Some fields may not be initialized; be careful, as it may
6676
* be hard to debug if called functions make assumptions.
6677
*/
6678
};
6679
6680
/*
6681
* Serialize hugepage allocation and instantiation, so that we don't
6682
* get spurious allocation failures if two CPUs race to instantiate
6683
* the same page in the page cache.
6684
*/
6685
mapping = vma->vm_file->f_mapping;
6686
hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff);
6687
mutex_lock(&hugetlb_fault_mutex_table[hash]);
6688
6689
/*
6690
* Acquire vma lock before calling huge_pte_alloc and hold
6691
* until finished with vmf.pte. This prevents huge_pmd_unshare from
6692
* being called elsewhere and making the vmf.pte no longer valid.
6693
*/
6694
hugetlb_vma_lock_read(vma);
6695
vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
6696
if (!vmf.pte) {
6697
hugetlb_vma_unlock_read(vma);
6698
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6699
return VM_FAULT_OOM;
6700
}
6701
6702
vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
6703
if (huge_pte_none_mostly(vmf.orig_pte)) {
6704
if (is_pte_marker(vmf.orig_pte)) {
6705
pte_marker marker =
6706
pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
6707
6708
if (marker & PTE_MARKER_POISONED) {
6709
ret = VM_FAULT_HWPOISON_LARGE |
6710
VM_FAULT_SET_HINDEX(hstate_index(h));
6711
goto out_mutex;
6712
} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
6713
/* This isn't supported in hugetlb. */
6714
ret = VM_FAULT_SIGSEGV;
6715
goto out_mutex;
6716
}
6717
}
6718
6719
/*
6720
* Other PTE markers should be handled the same way as a none PTE.
6721
*
6722
* hugetlb_no_page will drop vma lock and hugetlb fault
6723
* mutex internally, which makes us return immediately.
6724
*/
6725
return hugetlb_no_page(mapping, &vmf);
6726
}
6727
6728
ret = 0;
6729
6730
/* Not present, either a migration or a hwpoisoned entry */
6731
if (!pte_present(vmf.orig_pte)) {
6732
if (is_hugetlb_entry_migration(vmf.orig_pte)) {
6733
/*
6734
* Release the hugetlb fault lock now, but retain
6735
* the vma lock, because it is needed to guard the
6736
* huge_pte_lockptr() later in
6737
* migration_entry_wait_huge(). The vma lock will
6738
* be released there.
6739
*/
6740
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6741
migration_entry_wait_huge(vma, vmf.address, vmf.pte);
6742
return 0;
6743
} else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
6744
ret = VM_FAULT_HWPOISON_LARGE |
6745
VM_FAULT_SET_HINDEX(hstate_index(h));
6746
goto out_mutex;
6747
}
6748
6749
/*
6750
* If we are going to COW/unshare the mapping later, we examine the
6751
* pending reservations for this page now. This will ensure that any
6752
* allocations necessary to record that reservation occur outside the
6753
* spinlock.
6754
*/
6755
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
6756
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
6757
if (vma_needs_reservation(h, vma, vmf.address) < 0) {
6758
ret = VM_FAULT_OOM;
6759
goto out_mutex;
6760
}
6761
/* Just decrements count, does not deallocate */
6762
vma_end_reservation(h, vma, vmf.address);
6763
}
6764
6765
vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
6766
6767
/* Check for a racing update before calling hugetlb_wp() */
6768
if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte))))
6769
goto out_ptl;
6770
6771
/* Handle userfault-wp first, before trying to lock more pages */
6772
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) &&
6773
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
6774
if (!userfaultfd_wp_async(vma)) {
6775
spin_unlock(vmf.ptl);
6776
hugetlb_vma_unlock_read(vma);
6777
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6778
return handle_userfault(&vmf, VM_UFFD_WP);
6779
}
6780
6781
vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
6782
set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
6783
huge_page_size(hstate_vma(vma)));
6784
/* Fallthrough to CoW */
6785
}
6786
6787
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
6788
if (!huge_pte_write(vmf.orig_pte)) {
6789
/*
6790
* Anonymous folios need to be locked since hugetlb_wp()
6791
* checks whether we can re-use the folio exclusively
6792
* for us in case we are the only user of it.
6793
*/
6794
folio = page_folio(pte_page(vmf.orig_pte));
6795
if (folio_test_anon(folio) && !folio_trylock(folio)) {
6796
need_wait_lock = true;
6797
goto out_ptl;
6798
}
6799
folio_get(folio);
6800
ret = hugetlb_wp(&vmf);
6801
if (folio_test_anon(folio))
6802
folio_unlock(folio);
6803
folio_put(folio);
6804
goto out_ptl;
6805
} else if (likely(flags & FAULT_FLAG_WRITE)) {
6806
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
6807
}
6808
}
6809
vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
6810
if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
6811
flags & FAULT_FLAG_WRITE))
6812
update_mmu_cache(vma, vmf.address, vmf.pte);
6813
out_ptl:
6814
spin_unlock(vmf.ptl);
6815
out_mutex:
6816
hugetlb_vma_unlock_read(vma);
6817
6818
/*
6819
* We must check to release the per-VMA lock. __vmf_anon_prepare() in
6820
* hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY.
6821
*/
6822
if (unlikely(ret & VM_FAULT_RETRY))
6823
vma_end_read(vma);
6824
6825
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6826
/*
6827
* hugetlb_wp drops all the locks except the folio lock before trying to
6828
* unmap the folio from other processes. During that window, if another
6829
* process mapping that folio faults in, it will take the mutex and then
6830
* it will wait on folio_lock, causing an ABBA deadlock.
6831
* Use trylock instead and bail out if we fail.
6832
*
6833
* Ideally, we should hold a refcount on the folio we wait for, but we do
6834
* not want to use the folio after it becomes unlocked, but rather just
6835
* wait for it to become unlocked, so hopefully the next fault succeeds on
6836
* the trylock.
6837
*/
6838
if (need_wait_lock)
6839
folio_wait_locked(folio);
6840
return ret;
6841
}
6842
6843
#ifdef CONFIG_USERFAULTFD
6844
/*
6845
* Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
6846
*/
6847
static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
6848
struct vm_area_struct *vma, unsigned long address)
6849
{
6850
struct mempolicy *mpol;
6851
nodemask_t *nodemask;
6852
struct folio *folio;
6853
gfp_t gfp_mask;
6854
int node;
6855
6856
gfp_mask = htlb_alloc_mask(h);
6857
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
6858
/*
6859
* This is used to allocate a temporary hugetlb to hold the copied
6860
* content, which will then be copied again to the final hugetlb
6861
* consuming a reservation. Set the alloc_fallback to false to indicate
6862
* that breaking the per-node hugetlb pool is not allowed in this case.
6863
*/
6864
folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
6865
mpol_cond_put(mpol);
6866
6867
return folio;
6868
}
6869
6870
/*
6871
* Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
6872
* with modifications for hugetlb pages.
6873
*/
6874
int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
6875
struct vm_area_struct *dst_vma,
6876
unsigned long dst_addr,
6877
unsigned long src_addr,
6878
uffd_flags_t flags,
6879
struct folio **foliop)
6880
{
6881
struct mm_struct *dst_mm = dst_vma->vm_mm;
6882
bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
6883
bool wp_enabled = (flags & MFILL_ATOMIC_WP);
6884
struct hstate *h = hstate_vma(dst_vma);
6885
struct address_space *mapping = dst_vma->vm_file->f_mapping;
6886
pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
6887
unsigned long size = huge_page_size(h);
6888
int vm_shared = dst_vma->vm_flags & VM_SHARED;
6889
pte_t _dst_pte;
6890
spinlock_t *ptl;
6891
int ret = -ENOMEM;
6892
struct folio *folio;
6893
bool folio_in_pagecache = false;
6894
6895
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
6896
ptl = huge_pte_lock(h, dst_mm, dst_pte);
6897
6898
/* Don't overwrite any existing PTEs (even markers) */
6899
if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
6900
spin_unlock(ptl);
6901
return -EEXIST;
6902
}
6903
6904
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
6905
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
6906
6907
/* No need to invalidate - it was non-present before */
6908
update_mmu_cache(dst_vma, dst_addr, dst_pte);
6909
6910
spin_unlock(ptl);
6911
return 0;
6912
}
6913
6914
if (is_continue) {
6915
ret = -EFAULT;
6916
folio = filemap_lock_hugetlb_folio(h, mapping, idx);
6917
if (IS_ERR(folio))
6918
goto out;
6919
folio_in_pagecache = true;
6920
} else if (!*foliop) {
6921
/* If a folio already exists, then it's UFFDIO_COPY for
6922
* a non-missing case. Return -EEXIST.
6923
*/
6924
if (vm_shared &&
6925
hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
6926
ret = -EEXIST;
6927
goto out;
6928
}
6929
6930
folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
6931
if (IS_ERR(folio)) {
6932
ret = -ENOMEM;
6933
goto out;
6934
}
6935
6936
ret = copy_folio_from_user(folio, (const void __user *) src_addr,
6937
false);
6938
6939
/* fallback to copy_from_user outside mmap_lock */
6940
if (unlikely(ret)) {
6941
ret = -ENOENT;
6942
/* Free the allocated folio which may have
6943
* consumed a reservation.
6944
*/
6945
restore_reserve_on_error(h, dst_vma, dst_addr, folio);
6946
folio_put(folio);
6947
6948
/* Allocate a temporary folio to hold the copied
6949
* contents.
6950
*/
6951
folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
6952
if (!folio) {
6953
ret = -ENOMEM;
6954
goto out;
6955
}
6956
*foliop = folio;
6957
/* Set the outparam foliop and return to the caller to
6958
* copy the contents outside the lock. Don't free the
6959
* folio.
6960
*/
6961
goto out;
6962
}
6963
} else {
6964
if (vm_shared &&
6965
hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
6966
folio_put(*foliop);
6967
ret = -EEXIST;
6968
*foliop = NULL;
6969
goto out;
6970
}
6971
6972
folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
6973
if (IS_ERR(folio)) {
6974
folio_put(*foliop);
6975
ret = -ENOMEM;
6976
*foliop = NULL;
6977
goto out;
6978
}
6979
ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
6980
folio_put(*foliop);
6981
*foliop = NULL;
6982
if (ret) {
6983
folio_put(folio);
6984
goto out;
6985
}
6986
}
6987
6988
/*
6989
* If we just allocated a new page, we need a memory barrier to ensure
6990
* that preceding stores to the page become visible before the
6991
* set_pte_at() write. The memory barrier inside __folio_mark_uptodate
6992
* is what we need.
6993
*
6994
* In the case where we have not allocated a new page (is_continue),
6995
* the page must already be uptodate. UFFDIO_CONTINUE already includes
6996
* an earlier smp_wmb() to ensure that prior stores will be visible
6997
* before the set_pte_at() write.
6998
*/
6999
if (!is_continue)
7000
__folio_mark_uptodate(folio);
7001
else
7002
WARN_ON_ONCE(!folio_test_uptodate(folio));
7003
7004
/* Add shared, newly allocated pages to the page cache. */
7005
if (vm_shared && !is_continue) {
7006
ret = -EFAULT;
7007
if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h)))
7008
goto out_release_nounlock;
7009
7010
/*
7011
* Serialization between remove_inode_hugepages() and
7012
* hugetlb_add_to_page_cache() below happens through the
7013
* hugetlb_fault_mutex_table that must be held here by
7014
* the caller.
7015
*/
7016
ret = hugetlb_add_to_page_cache(folio, mapping, idx);
7017
if (ret)
7018
goto out_release_nounlock;
7019
folio_in_pagecache = true;
7020
}
7021
7022
ptl = huge_pte_lock(h, dst_mm, dst_pte);
7023
7024
ret = -EIO;
7025
if (folio_test_hwpoison(folio))
7026
goto out_release_unlock;
7027
7028
/*
7029
* We allow overwriting a pte marker: consider when both MISSING|WP are
7030
* registered, we first wr-protect a none pte which has no page cache
7031
* page backing it, then access the page.
7032
*/
7033
ret = -EEXIST;
7034
if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
7035
goto out_release_unlock;
7036
7037
if (folio_in_pagecache)
7038
hugetlb_add_file_rmap(folio);
7039
else
7040
hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr);
7041
7042
/*
7043
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
7044
* with wp flag set, don't set pte write bit.
7045
*/
7046
_dst_pte = make_huge_pte(dst_vma, folio,
7047
!wp_enabled && !(is_continue && !vm_shared));
7048
/*
7049
* Always mark UFFDIO_COPY page dirty; note that this may not be
7050
* extremely important for hugetlbfs for now since swapping is not
7051
* supported, but we should still be clear in that this page cannot be
7052
* thrown away at will, even if write bit not set.
7053
*/
7054
_dst_pte = huge_pte_mkdirty(_dst_pte);
7055
_dst_pte = pte_mkyoung(_dst_pte);
7056
7057
if (wp_enabled)
7058
_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
7059
7060
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
7061
7062
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
7063
7064
/* No need to invalidate - it was non-present before */
7065
update_mmu_cache(dst_vma, dst_addr, dst_pte);
7066
7067
spin_unlock(ptl);
7068
if (!is_continue)
7069
folio_set_hugetlb_migratable(folio);
7070
if (vm_shared || is_continue)
7071
folio_unlock(folio);
7072
ret = 0;
7073
out:
7074
return ret;
7075
out_release_unlock:
7076
spin_unlock(ptl);
7077
if (vm_shared || is_continue)
7078
folio_unlock(folio);
7079
out_release_nounlock:
7080
if (!folio_in_pagecache)
7081
restore_reserve_on_error(h, dst_vma, dst_addr, folio);
7082
folio_put(folio);
7083
goto out;
7084
}
7085
#endif /* CONFIG_USERFAULTFD */
7086
7087
long hugetlb_change_protection(struct vm_area_struct *vma,
7088
unsigned long address, unsigned long end,
7089
pgprot_t newprot, unsigned long cp_flags)
7090
{
7091
struct mm_struct *mm = vma->vm_mm;
7092
unsigned long start = address;
7093
pte_t *ptep;
7094
pte_t pte;
7095
struct hstate *h = hstate_vma(vma);
7096
long pages = 0, psize = huge_page_size(h);
7097
bool shared_pmd = false;
7098
struct mmu_notifier_range range;
7099
unsigned long last_addr_mask;
7100
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
7101
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
7102
7103
/*
7104
* In the case of shared PMDs, the area to flush could be beyond
7105
* start/end. Set range.start/range.end to cover the maximum possible
7106
* range if PMD sharing is possible.
7107
*/
7108
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
7109
0, mm, start, end);
7110
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
7111
7112
BUG_ON(address >= end);
7113
flush_cache_range(vma, range.start, range.end);
7114
7115
mmu_notifier_invalidate_range_start(&range);
7116
hugetlb_vma_lock_write(vma);
7117
i_mmap_lock_write(vma->vm_file->f_mapping);
7118
last_addr_mask = hugetlb_mask_last_page(h);
7119
for (; address < end; address += psize) {
7120
spinlock_t *ptl;
7121
ptep = hugetlb_walk(vma, address, psize);
7122
if (!ptep) {
7123
if (!uffd_wp) {
7124
address |= last_addr_mask;
7125
continue;
7126
}
7127
/*
7128
* Userfaultfd wr-protect requires pgtable
7129
* pre-allocations to install pte markers.
7130
*/
7131
ptep = huge_pte_alloc(mm, vma, address, psize);
7132
if (!ptep) {
7133
pages = -ENOMEM;
7134
break;
7135
}
7136
}
7137
ptl = huge_pte_lock(h, mm, ptep);
7138
if (huge_pmd_unshare(mm, vma, address, ptep)) {
7139
/*
7140
* When uffd-wp is enabled on the vma, unshare
7141
* shouldn't happen at all. Warn about it if it
7142
* happens for some reason.
7143
*/
7144
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
7145
pages++;
7146
spin_unlock(ptl);
7147
shared_pmd = true;
7148
address |= last_addr_mask;
7149
continue;
7150
}
7151
pte = huge_ptep_get(mm, address, ptep);
7152
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
7153
/* Nothing to do. */
7154
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
7155
swp_entry_t entry = pte_to_swp_entry(pte);
7156
struct folio *folio = pfn_swap_entry_folio(entry);
7157
pte_t newpte = pte;
7158
7159
if (is_writable_migration_entry(entry)) {
7160
if (folio_test_anon(folio))
7161
entry = make_readable_exclusive_migration_entry(
7162
swp_offset(entry));
7163
else
7164
entry = make_readable_migration_entry(
7165
swp_offset(entry));
7166
newpte = swp_entry_to_pte(entry);
7167
pages++;
7168
}
7169
7170
if (uffd_wp)
7171
newpte = pte_swp_mkuffd_wp(newpte);
7172
else if (uffd_wp_resolve)
7173
newpte = pte_swp_clear_uffd_wp(newpte);
7174
if (!pte_same(pte, newpte))
7175
set_huge_pte_at(mm, address, ptep, newpte, psize);
7176
} else if (unlikely(is_pte_marker(pte))) {
7177
/*
7178
* Do nothing on a poison marker; page is
7179
* corrupted, permissions do not apply. Here
7180
* pte_marker_uffd_wp()==true implies !poison
7181
* because they're mutually exclusive.
7182
*/
7183
if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
7184
/* Safe to modify directly (non-present->none). */
7185
huge_pte_clear(mm, address, ptep, psize);
7186
} else if (!huge_pte_none(pte)) {
7187
pte_t old_pte;
7188
unsigned int shift = huge_page_shift(hstate_vma(vma));
7189
7190
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
7191
pte = huge_pte_modify(old_pte, newprot);
7192
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
7193
if (uffd_wp)
7194
pte = huge_pte_mkuffd_wp(pte);
7195
else if (uffd_wp_resolve)
7196
pte = huge_pte_clear_uffd_wp(pte);
7197
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
7198
pages++;
7199
} else {
7200
/* None pte */
7201
if (unlikely(uffd_wp))
7202
/* Safe to modify directly (none->non-present). */
7203
set_huge_pte_at(mm, address, ptep,
7204
make_pte_marker(PTE_MARKER_UFFD_WP),
7205
psize);
7206
}
7207
spin_unlock(ptl);
7208
}
7209
/*
7210
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
7211
* may have cleared our pud entry and done put_page on the page table:
7212
* once we release i_mmap_rwsem, another task can do the final put_page
7213
* and that page table be reused and filled with junk. If we actually
7214
* did unshare a page of pmds, flush the range corresponding to the pud.
7215
*/
7216
if (shared_pmd)
7217
flush_hugetlb_tlb_range(vma, range.start, range.end);
7218
else
7219
flush_hugetlb_tlb_range(vma, start, end);
7220
/*
7221
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
7222
* downgrading page table protection not changing it to point to a new
7223
* page.
7224
*
7225
* See Documentation/mm/mmu_notifier.rst
7226
*/
7227
i_mmap_unlock_write(vma->vm_file->f_mapping);
7228
hugetlb_vma_unlock_write(vma);
7229
mmu_notifier_invalidate_range_end(&range);
7230
7231
return pages > 0 ? (pages << h->order) : pages;
7232
}
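
/*
 * Worked example for the return value above (illustrative, with assumed
 * sizes): 'pages' counts huge PTEs that were changed, and 'pages << h->order'
 * rescales that to base pages so the result is comparable with the counts
 * produced by the non-hugetlb mprotect path.  With 2MB huge pages on a 4K
 * base-page kernel (order 9), changing 3 huge PTEs reports 3 << 9 = 1536
 * base pages.
 */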

/*
 * Update the reservation map for the range [from, to].
 *
 * Returns the number of entries that would be added to the reservation map
 * associated with the range [from, to].  This number is greater than or
 * equal to zero.  -EINVAL or -ENOMEM is returned in case of any errors.
 */

long hugetlb_reserve_pages(struct inode *inode,
			   long from, long to,
			   struct vm_area_struct *vma,
			   vm_flags_t vm_flags)
{
	long chg = -1, add = -1, spool_resv, gbl_resv;
	struct hstate *h = hstate_inode(inode);
	struct hugepage_subpool *spool = subpool_inode(inode);
	struct resv_map *resv_map;
	struct hugetlb_cgroup *h_cg = NULL;
	long gbl_reserve, regions_needed = 0;

	/* This should never happen */
	if (from > to) {
		VM_WARN(1, "%s called with a negative range\n", __func__);
		return -EINVAL;
	}

	/*
	 * vma specific semaphore used for pmd sharing and fault/truncation
	 * synchronization
	 */
	hugetlb_vma_lock_alloc(vma);

	/*
	 * Only apply hugepage reservation if asked.  At fault time, an
	 * attempt will be made for VM_NORESERVE to allocate a page
	 * without using reserves.
	 */
	if (vm_flags & VM_NORESERVE)
		return 0;

	/*
	 * Shared mappings base their reservation on the number of pages that
	 * are already allocated on behalf of the file.  Private mappings need
	 * to reserve the full area even if read-only as mprotect() may be
	 * called to make the mapping read-write.  Assume !vma is a shm mapping.
	 */
	if (!vma || vma->vm_flags & VM_MAYSHARE) {
		/*
		 * resv_map can not be NULL as hugetlb_reserve_pages is only
		 * called for inodes for which resv_maps were created (see
		 * hugetlbfs_get_inode).
		 */
		resv_map = inode_resv_map(inode);

		chg = region_chg(resv_map, from, to, &regions_needed);
	} else {
		/* Private mapping. */
		resv_map = resv_map_alloc();
		if (!resv_map)
			goto out_err;

		chg = to - from;

		set_vma_resv_map(vma, resv_map);
		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
	}

	if (chg < 0)
		goto out_err;

	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
				chg * pages_per_huge_page(h), &h_cg) < 0)
		goto out_err;

	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
		/* For private mappings, the hugetlb_cgroup uncharge info hangs
		 * off the resv_map.
		 */
		resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
	}

	/*
	 * There must be enough pages in the subpool for the mapping.  If
	 * the subpool has a minimum size, there may be some global
	 * reservations already in place (gbl_reserve).
	 */
	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
	if (gbl_reserve < 0)
		goto out_uncharge_cgroup;

	/*
	 * Check that enough hugepages are available for the reservation.
	 * Hand the pages back to the subpool if there are not.
	 */
	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
		goto out_put_pages;

	/*
	 * Account for the reservations made.  Shared mappings record regions
	 * that have reservations as they are shared by multiple VMAs.
	 * When the last VMA disappears, the region map says how much
	 * the reservation was and the page cache tells how much of
	 * the reservation was consumed.  Private mappings are per-VMA and
	 * only the consumed reservations are tracked.  When the VMA
	 * disappears, the original reservation is the VMA size and the
	 * consumed reservations are stored in the map.  Hence, nothing
	 * else has to be done for private mappings here.
	 */
	if (!vma || vma->vm_flags & VM_MAYSHARE) {
		add = region_add(resv_map, from, to, regions_needed, h, h_cg);

		if (unlikely(add < 0)) {
			hugetlb_acct_memory(h, -gbl_reserve);
			goto out_put_pages;
		} else if (unlikely(chg > add)) {
			/*
			 * pages in this range were added to the reserve
			 * map between region_chg and region_add.  This
			 * indicates a race with alloc_hugetlb_folio.  Adjust
			 * the subpool and reserve counts modified above
			 * based on the difference.
			 */
			long rsv_adjust;

			/*
			 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
			 * reference to h_cg->css.  See comment below for detail.
			 */
			hugetlb_cgroup_uncharge_cgroup_rsvd(
				hstate_index(h),
				(chg - add) * pages_per_huge_page(h), h_cg);

			rsv_adjust = hugepage_subpool_put_pages(spool,
								chg - add);
			hugetlb_acct_memory(h, -rsv_adjust);
		} else if (h_cg) {
			/*
			 * The file_regions will hold their own reference to
			 * h_cg->css.  So we should release the reference held
			 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
			 * done.
			 */
			hugetlb_cgroup_put_rsvd_cgroup(h_cg);
		}
	}
	return chg;

out_put_pages:
	spool_resv = chg - gbl_reserve;
	if (spool_resv) {
		/* put sub pool's reservation back, chg - gbl_reserve */
		gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
		/*
		 * subpool's reserved pages can not be put back due to race,
		 * return to hstate.
		 */
		hugetlb_acct_memory(h, -gbl_resv);
	}
out_uncharge_cgroup:
	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
					    chg * pages_per_huge_page(h), h_cg);
out_err:
	hugetlb_vma_lock_free(vma);
	if (!vma || vma->vm_flags & VM_MAYSHARE)
		/* Only call region_abort if the region_chg succeeded but the
		 * region_add failed or didn't run.
		 */
		if (chg >= 0 && add < 0)
			region_abort(resv_map, from, to, regions_needed);
	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		kref_put(&resv_map->refs, resv_map_release);
		set_vma_resv_map(vma, NULL);
	}
	return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}

long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
			     long freed)
{
	struct hstate *h = hstate_inode(inode);
	struct resv_map *resv_map = inode_resv_map(inode);
	long chg = 0;
	struct hugepage_subpool *spool = subpool_inode(inode);
	long gbl_reserve;

	/*
	 * Since this routine can be called in the evict inode path for all
	 * hugetlbfs inodes, resv_map could be NULL.
	 */
	if (resv_map) {
		chg = region_del(resv_map, start, end);
		/*
		 * region_del() can fail in the rare case where a region
		 * must be split and another region descriptor can not be
		 * allocated.  If end == LONG_MAX, it will not fail.
		 */
		if (chg < 0)
			return chg;
	}

	spin_lock(&inode->i_lock);
	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
	spin_unlock(&inode->i_lock);

	/*
	 * If the subpool has a minimum size, the number of global
	 * reservations to be released may be adjusted.
	 *
	 * Note that !resv_map implies freed == 0.  So (chg - freed)
	 * won't go negative.
	 */
	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
	hugetlb_acct_memory(h, -gbl_reserve);

	return 0;
}
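
/*
 * Illustrative sketch (the helper name and byte-based interface are made up
 * for illustration): how a hugetlbfs-style caller would use the two routines
 * above.  'from' and 'to' are in units of huge pages, not bytes, so a byte
 * offset/length is scaled by the hstate's huge page size before calling
 * hugetlb_reserve_pages(); the matching hugetlb_unreserve_pages() call would
 * happen later, when the file range is truncated or the inode is evicted.
 */
static long __maybe_unused example_reserve_file_range(struct inode *inode,
						      struct vm_area_struct *vma,
						      loff_t off, loff_t len)
{
	struct hstate *h = hstate_inode(inode);
	long from = off >> huge_page_shift(h);
	long to = (off + len + huge_page_size(h) - 1) >> huge_page_shift(h);

	/* Returns the count described in the comment above, or a -errno. */
	return hugetlb_reserve_pages(inode, from, to, vma, vma->vm_flags);
}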

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static unsigned long page_table_shareable(struct vm_area_struct *svma,
				struct vm_area_struct *vma,
				unsigned long addr, pgoff_t idx)
{
	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				svma->vm_start;
	unsigned long sbase = saddr & PUD_MASK;
	unsigned long s_end = sbase + PUD_SIZE;

	/* Allow segments to share if only one is marked locked */
	vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
	vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;

	/*
	 * Match the virtual addresses, permissions and the alignment of the
	 * page table page.
	 *
	 * Also, vma_lock (vm_private_data) is required for sharing.
	 */
	if (pmd_index(addr) != pmd_index(saddr) ||
	    vm_flags != svm_flags ||
	    !range_in_vma(svma, sbase, s_end) ||
	    !svma->vm_private_data)
		return 0;

	return saddr;
}

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long start = addr & PUD_MASK;
	unsigned long end = start + PUD_SIZE;

#ifdef CONFIG_USERFAULTFD
	if (uffd_disable_huge_pmd_share(vma))
		return false;
#endif
	/*
	 * Check for proper vm_flags and page table alignment.
	 */
	if (!(vma->vm_flags & VM_MAYSHARE))
		return false;
	if (!vma->vm_private_data)	/* vma lock required for sharing */
		return false;
	if (!range_in_vma(vma, start, end))
		return false;
	return true;
}

/*
 * Determine if the start,end range within vma could be mapped by shared pmd.
 * If yes, adjust start and end to cover the range associated with possible
 * shared pmd mappings.
 */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				unsigned long *start, unsigned long *end)
{
	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
		v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

	/*
	 * vma needs to span at least one aligned PUD size, and the range
	 * must be at least partially within it.
	 */
	if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
		(*end <= v_start) || (*start >= v_end))
		return;

	/* Extend the range to be PUD aligned for a worst case scenario */
	if (*start > v_start)
		*start = ALIGN_DOWN(*start, PUD_SIZE);

	if (*end < v_end)
		*end = ALIGN(*end, PUD_SIZE);
}
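
/*
 * Illustrative sketch (the helper name and addresses are made up): how a
 * caller that only cares about a sub-range of a VM_MAYSHARE hugetlb VMA
 * widens that range before flushing or registering an mmu notifier, since a
 * shared PMD table covers a whole PUD.  For example, on x86-64 with 4K base
 * pages (PUD_SIZE == 1GB), the 2MB range [0x40200000, 0x40400000) inside a
 * shared VMA spanning [0x40000000, 0x80000000) widens to the full PUD window
 * [0x40000000, 0x80000000).
 */
static unsigned long __maybe_unused example_pud_widened_span(struct vm_area_struct *vma,
							      unsigned long start,
							      unsigned long end)
{
	adjust_range_if_pmd_sharing_possible(vma, &start, &end);

	/* For the example above this returns PUD_SIZE (1GB). */
	return end - start;
}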

/*
 * Search for a shareable pmd page for hugetlb.  In any case, calls pmd_alloc()
 * and returns the corresponding pte.  While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner.  pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = hugetlb_walk(svma, saddr,
					    vma_mmu_pagesize(svma));
			if (spte) {
				ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
				break;
			}
		}
	}

	if (!spte)
		goto out;

	spin_lock(&mm->page_table_lock);
	if (pud_none(*pud)) {
		pud_populate(mm, pud,
				(pmd_t *)((unsigned long)spte & PAGE_MASK));
		mm_inc_nr_pmds(mm);
	} else {
		ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
	}
	spin_unlock(&mm->page_table_lock);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	i_mmap_unlock_read(mapping);
	return pte;
}

/*
 * Unmap a huge page backed by a shared pte.
 *
 * Called with the page table lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{
	unsigned long sz = huge_page_size(hstate_vma(vma));
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d = p4d_offset(pgd, addr);
	pud_t *pud = pud_offset(p4d, addr);

	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
	hugetlb_vma_assert_locked(vma);
	if (sz != PMD_SIZE)
		return 0;
	if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep)))
		return 0;

	pud_clear(pud);
	/*
	 * Once our caller drops the rmap lock, some other process might be
	 * using this page table as a normal, non-hugetlb page table.
	 * Wait for pending gup_fast() in other threads to finish before letting
	 * that happen.
	 */
	tlb_remove_table_sync_one();
	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
	mm_dec_nr_pmds(mm);
	return 1;
}

#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	return NULL;
}

int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
				unsigned long addr, pte_t *ptep)
{
	return 0;
}

void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				unsigned long *start, unsigned long *end)
{
}

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
	return false;
}
#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */

#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (pud) {
		if (sz == PUD_SIZE) {
			pte = (pte_t *)pud;
		} else {
			BUG_ON(sz != PMD_SIZE);
			if (want_pmd_share(vma, addr) && pud_none(*pud))
				pte = huge_pmd_share(mm, vma, addr, pud);
			else
				pte = (pte_t *)pmd_alloc(mm, pud, addr);
		}
	}

	if (pte) {
		pte_t pteval = ptep_get_lockless(pte);

		BUG_ON(pte_present(pteval) && !pte_huge(pteval));
	}

	return pte;
}

/*
 * huge_pte_offset() - Walk the page table to resolve the hugepage
 * entry at address @addr
 *
 * Return: Pointer to page table entry (PUD or PMD) for
 * address @addr, or NULL if a !p*d_present() entry is encountered and the
 * size @sz doesn't match the hugepage size at this level of the page
 * table.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (sz == PUD_SIZE)
		/* must be pud huge, non-present or none */
		return (pte_t *)pud;
	if (!pud_present(*pud))
		return NULL;
	/* must have a valid entry and size to go further */

	pmd = pmd_offset(pud, addr);
	/* must be pmd huge, non-present or none */
	return (pte_t *)pmd;
}

/*
 * Return a mask that can be used to update an address to the last huge
 * page in a page table page mapping size.  Used to skip non-present
 * page table entries when linearly scanning address ranges.  Architectures
 * with unique huge page to page table relationships can define their own
 * version of this routine.
 */
unsigned long hugetlb_mask_last_page(struct hstate *h)
{
	unsigned long hp_size = huge_page_size(h);

	if (hp_size == PUD_SIZE)
		return P4D_SIZE - PUD_SIZE;
	else if (hp_size == PMD_SIZE)
		return PUD_SIZE - PMD_SIZE;
	else
		return 0UL;
}
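
/*
 * Illustrative sketch of how hugetlb_mask_last_page() is meant to be used
 * (the helper name and this standalone loop are made up for illustration;
 * the real users are the scan loops in this file, e.g. in
 * hugetlb_change_protection()).  When hugetlb_walk() finds no page table
 * entry, OR-ing in the mask jumps 'address' to the last huge page covered by
 * the missing page table page, so the loop increment steps over the whole
 * non-present table instead of probing every huge page in it.  The caller is
 * assumed to hold the hugetlb VMA lock, as those loops do.
 */
static void __maybe_unused example_skip_nonpresent_tables(struct vm_area_struct *vma,
							   unsigned long address,
							   unsigned long end)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long psize = huge_page_size(h);
	unsigned long last_addr_mask = hugetlb_mask_last_page(h);
	pte_t *ptep;

	for (; address < end; address += psize) {
		ptep = hugetlb_walk(vma, address, psize);
		if (!ptep) {
			/* Nothing mapped by this page table page: skip it. */
			address |= last_addr_mask;
			continue;
		}
		/* ... operate on *ptep under huge_pte_lock(h, vma->vm_mm, ptep) ... */
	}
}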

#else

/* See description above.  Architectures can provide their own version. */
__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
{
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
	if (huge_page_size(h) == PMD_SIZE)
		return PUD_SIZE - PMD_SIZE;
#endif
	return 0UL;
}

#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */

/**
 * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
 * @folio: the folio to isolate
 * @list: the list to add the folio to on success
 *
 * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
 * isolated/non-migratable, and moving it from the active list to the
 * given list.
 *
 * Isolation will fail if @folio is not an allocated hugetlb folio, or if
 * it is already isolated/non-migratable.
 *
 * On success, an additional folio reference is taken that must be dropped
 * using folio_putback_hugetlb() to undo the isolation.
 *
 * Return: True if isolation worked, otherwise False.
 */
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
	bool ret = true;

	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(folio) ||
	    !folio_test_hugetlb_migratable(folio) ||
	    !folio_try_get(folio)) {
		ret = false;
		goto unlock;
	}
	folio_clear_hugetlb_migratable(folio);
	list_move_tail(&folio->lru, list);
unlock:
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
	int ret = 0;

	*hugetlb = false;
	spin_lock_irq(&hugetlb_lock);
	if (folio_test_hugetlb(folio)) {
		*hugetlb = true;
		if (folio_test_hugetlb_freed(folio))
			ret = 0;
		else if (folio_test_hugetlb_migratable(folio) || unpoison)
			ret = folio_try_get(folio);
		else
			ret = -EBUSY;
	}
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
				bool *migratable_cleared)
{
	int ret;

	spin_lock_irq(&hugetlb_lock);
	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}

/**
 * folio_putback_hugetlb - unisolate a hugetlb folio
 * @folio: the isolated hugetlb folio
 *
 * Putback/un-isolate the hugetlb folio that was previously isolated using
 * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
 * back onto the active list.
 *
 * Will drop the additional folio reference obtained through
 * folio_isolate_hugetlb().
 */
void folio_putback_hugetlb(struct folio *folio)
{
	spin_lock_irq(&hugetlb_lock);
	folio_set_hugetlb_migratable(folio);
	list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
	folio_put(folio);
}
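
/*
 * Illustrative sketch of the expected pairing of the two helpers above (the
 * helper name is made up for illustration).  A caller isolates a candidate
 * folio onto a private list, does its work, and then puts the folio back;
 * folio_putback_hugetlb() also drops the reference that
 * folio_isolate_hugetlb() took.
 */
static void __maybe_unused example_isolate_then_putback(struct folio *folio)
{
	LIST_HEAD(isolated);

	if (!folio_isolate_hugetlb(folio, &isolated))
		return;		/* not an allocated, migratable hugetlb folio */

	/* ... try to migrate or otherwise process the isolated folio ... */

	/* Undo the isolation: back on the active list, reference dropped. */
	folio_putback_hugetlb(folio);
}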

void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
	struct hstate *h = folio_hstate(old_folio);

	hugetlb_cgroup_migrate(old_folio, new_folio);
	folio_set_owner_migrate_reason(new_folio, reason);

	/*
	 * Transfer the temporary state of the new hugetlb folio.  This is
	 * the reverse of other transitions because the new folio is going to
	 * be final while the old one will be freed, so it takes over the
	 * temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well, otherwise the global surplus count will not match
	 * the per-node counts.
	 */
	if (folio_test_hugetlb_temporary(new_folio)) {
		int old_nid = folio_nid(old_folio);
		int new_nid = folio_nid(new_folio);

		folio_set_hugetlb_temporary(old_folio);
		folio_clear_hugetlb_temporary(new_folio);

		/*
		 * There is no need to transfer the per-node surplus state
		 * when we do not cross the node.
		 */
		if (new_nid == old_nid)
			return;
		spin_lock_irq(&hugetlb_lock);
		if (h->surplus_huge_pages_node[old_nid]) {
			h->surplus_huge_pages_node[old_nid]--;
			h->surplus_huge_pages_node[new_nid]++;
		}
		spin_unlock_irq(&hugetlb_lock);
	}

	/*
	 * Our old folio is isolated and has "migratable" cleared until it
	 * is put back.  As migration succeeded, set the new folio "migratable"
	 * and add it to the active list.
	 */
	spin_lock_irq(&hugetlb_lock);
	folio_set_hugetlb_migratable(new_folio);
	list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
}

/*
 * If @take_locks is false, the caller must ensure that no concurrent page table
 * access can happen (except for gup_fast() and hardware page walks).
 * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
 * concurrent page fault handling) and the file rmap lock.
 */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
				 unsigned long start,
				 unsigned long end,
				 bool take_locks)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	unsigned long address;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & VM_MAYSHARE))
		return;

	if (start >= end)
		return;

	flush_cache_range(vma, start, end);
	/*
	 * No need to call adjust_range_if_pmd_sharing_possible(), because
	 * we have already done the PUD_SIZE alignment.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	if (take_locks) {
		hugetlb_vma_lock_write(vma);
		i_mmap_lock_write(vma->vm_file->f_mapping);
	} else {
		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
	}
	for (address = start; address < end; address += PUD_SIZE) {
		ptep = hugetlb_walk(vma, address, sz);
		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
		huge_pmd_unshare(mm, vma, address, ptep);
		spin_unlock(ptl);
	}
	flush_hugetlb_tlb_range(vma, start, end);
	if (take_locks) {
		i_mmap_unlock_write(vma->vm_file->f_mapping);
		hugetlb_vma_unlock_write(vma);
	}
	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
	 * Documentation/mm/mmu_notifier.rst.
	 */
	mmu_notifier_invalidate_range_end(&range);
}

/*
 * This function will unconditionally remove all the shared pmd pgtable entries
 * within the specific vma for a hugetlbfs memory range.
 */
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
			/* take_locks = */ true);
}

/*
 * For hugetlb, mremap() is an odd edge case - while the VMA copying is
 * performed, we permit both the old and new VMAs to reference the same
 * reservation.
 *
 * We fix this up after the operation succeeds, or if a newly allocated VMA
 * is closed as a result of a failure to allocate memory.
 */
void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
	if (is_vm_hugetlb_page(vma))
		clear_vma_resv_huge_pages(vma);
}
