GitHub Repository: torvalds/linux
Path: blob/master/mm/hugetlb_vmemmap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <[email protected]>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <linux/pgalloc.h>

#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte: called for each lowest-level entry (PTE).
 * @nr_walked: the number of PTEs walked.
 * @reuse_page: the page which is reused for the tail vmemmap pages.
 * @reuse_addr: the virtual address of the @reuse_page page.
 * @vmemmap_pages: the list head of the vmemmap pages that can be freed,
 *		or that the range is remapped from.
 * @flags: used to modify behavior in vmemmap page table walking
 *		operations.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long flags;
};

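/*
 * Split a vmemmap PMD leaf mapping into a PTE-level table mapping the same
 * underlying pages, so that individual vmemmap pages can be remapped later.
 */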
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

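/*
 * pmd_entry callback for the vmemmap walk: bail out on self-hosted vmemmap
 * pages (memory_hotplug.memmap_on_memory) and split a leaf PMD so that the
 * PTE-level callback can operate on individual vmemmap pages.
 */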
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements, and since the vmemmap pages
	 * are at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the first
	 * vmemmap page is self-hosted is sufficient.
	 *
	 * [         hotplugged memory         ]
	 * [  section  ][...][  section  ]
	 * [ vmemmap ][       usable memory      ]
	 *   ^  |      ^                        |
	 *   +--+      |                        |
	 *             +------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

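/*
 * pte_entry callback: the first PTE visited supplies the reuse page; every
 * subsequent PTE is handed to walk->remap_pte().
 */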
static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in page table walking before
	 * starting remapping.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

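/*
 * Walk the vmemmap virtual range [start, end) under init_mm and, when PTEs
 * were remapped and VMEMMAP_REMAP_NO_TLB_FLUSH is not set, flush the kernel
 * TLB for that range afterwards.
 */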
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
					   NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

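/*
 * remap_pte callback for the free path: point this PTE at the shared reuse
 * page and queue the page it previously mapped on walk->vmemmap_pages so
 * that the caller can free it.
 */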
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(). To avoid the "corrupted mapping in
 * tail page" message, we need to reset at least 4 struct page structs (one
 * head struct page struct and three tail struct page structs).
 */
#define NR_RESET_STRUCT_PAGE	4

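/*
 * Re-initialize the first NR_RESET_STRUCT_PAGE struct pages of a restored
 * vmemmap page by copying over them from the tail struct pages that follow.
 */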
static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

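/*
 * remap_pte callback for the restore path: take a page off
 * walk->vmemmap_pages, fill it from the reuse page, reset its leading
 * struct pages and map it writable at @addr.
 */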
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *			 range [@start, @end) into PTE-level mappings
 * @start: start address of the vmemmap virtual address range that we want
 *	   to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *	 remap.
 * @reuse: reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start: start address of the vmemmap virtual address range that we want
 *	   to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *	 remap.
 * @reuse: reuse address.
 * @vmemmap_pages: list to deposit the vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags: modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This keeps the likely
	 * contiguous struct page backing memory contiguous, allowing for
	 * more huge page allocations. Fall back to the currently mapped
	 * head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * To make the remapping routine most efficient for the huge pages,
	 * the vmemmap page table walking routine obeys the following rules
	 * (see more details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be continuous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

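/*
 * Allocate (end - start) / PAGE_SIZE pages on the same node as the memmap
 * being restored and put them on @list; on failure, free whatever was
 * allocated and return -ENOMEM.
 */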
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to pages taken from @vmemmap_pages.
 * @start: start address of the vmemmap virtual address range that we want
 *	   to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *	 remap.
 * @reuse: reuse address.
 * @flags: modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
	return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

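/*
 * Reallocate and remap the vmemmap pages of an HVO-optimized folio.
 * @flags may carry VMEMMAP_REMAP_NO_TLB_FLUSH and VMEMMAP_SYNCHRONIZE_RCU so
 * that batch callers can defer the TLB flush and synchronize RCU only once.
 */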
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h: struct hstate.
 * @folio: the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h: hstate.
 * @folio_list: list of folios.
 * @non_hvo_folios: Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *	   if an error was encountered restoring vmemmap for a folio.
 *	   Folios that have vmemmap are moved to the non_hvo_folios
 *	   list. Processing of entries stops when the first error is
 *	   encountered. The folio that experienced the error and all
 *	   non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			/* only need to synchronize_rcu() once for each batch */
			flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff the HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

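/*
 * Remap the folio's tail vmemmap pages to a single head vmemmap page and
 * collect the pages that are no longer needed on @vmemmap_pages; the caller
 * frees them once the TLB has been flushed.
 */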
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h: struct hstate.
 * @folio: the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
	free_vmemmap_page_list(&vmemmap_pages);
}

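/*
 * Pre-split the vmemmap PMDs covering @folio. The batched optimize path
 * splits every folio first so that a single flush_tlb_all() can cover all
 * the splits before the PTEs are remapped.
 */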
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

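/*
 * Optimize every folio on @folio_list: split their vmemmap PMDs first, issue
 * one TLB flush for all the splits, then remap the PTEs with the flush
 * deferred, and free the released vmemmap pages after a final flush.
 * @boot selects the bootmem path, where folios already optimized by pre-HVO
 * only need their tail struct pages write-protected and registered.
 */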
static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
					      struct list_head *folio_list,
					      bool boot)
{
	struct folio *folio;
	int nr_to_optimize;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	nr_to_optimize = 0;
	list_for_each_entry(folio, folio_list, lru) {
		int ret;
		unsigned long spfn, epfn;

		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
			/*
			 * Already optimized by pre-HVO, just map the
			 * mirrored tail page structs RO.
			 */
			spfn = (unsigned long)&folio->page;
			epfn = spfn + pages_per_huge_page(h);
			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
					      HUGETLB_VMEMMAP_RESERVE_SIZE);
			register_page_bootmem_memmap(pfn_to_section_nr(spfn),
						     &folio->page,
						     HUGETLB_VMEMMAP_RESERVE_SIZE);
			static_branch_inc(&hugetlb_optimize_vmemmap_key);
			continue;
		}

		nr_to_optimize++;

		ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, thus let's fail
		 * early once we encounter the first OOM. No point in retrying
		 * as it can be dynamically done on remap with the memory
		 * we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	if (!nr_to_optimize)
		/*
		 * All pre-HVO folios, nothing left to do. It's ok if
		 * there is a mix of pre-HVO and not yet HVO-ed folios
		 * here, as __hugetlb_vmemmap_optimize_folio() will
		 * skip any folios that already have the optimized flag
		 * set, see vmemmap_should_optimize_folio().
		 */
		goto out;

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		/* only need to synchronize_rcu() once for each batch */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

out:
	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
	unsigned long section_size, psize, pmd_vmemmap_size;
	phys_addr_t paddr;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(m->hstate))
		return false;

	psize = huge_page_size(m->hstate);
	paddr = virt_to_phys(m);

	/*
	 * Pre-HVO only works if the bootmem huge page
	 * is aligned to the section size.
	 */
	section_size = (1UL << PA_SECTION_SHIFT);
	if (!IS_ALIGNED(paddr, section_size) ||
	    !IS_ALIGNED(psize, section_size))
		return false;

	/*
	 * The pre-HVO code does not deal with splitting PMDS,
	 * so the bootmem page must be aligned to the number
	 * of base pages that can be mapped with one vmemmap PMD.
	 */
	pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
	if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
	    !IS_ALIGNED(psize, pmd_vmemmap_size))
		return false;

	return true;
}

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
	unsigned long psize, paddr, section_size;
	unsigned long ns, i, pnum, pfn, nr_pages;
	unsigned long start, end;
	struct huge_bootmem_page *m = NULL;
	void *map;

	/*
	 * Nothing to do if bootmem pages were not allocated
	 * early in boot, or if HVO wasn't enabled in the
	 * first place.
	 */
	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	section_size = (1UL << PA_SECTION_SHIFT);

	list_for_each_entry(m, &huge_boot_pages[nid], list) {
		if (!vmemmap_should_optimize_bootmem_page(m))
			continue;

		nr_pages = pages_per_huge_page(m->hstate);
		psize = nr_pages << PAGE_SHIFT;
		paddr = virt_to_phys(m);
		pfn = PHYS_PFN(paddr);
		map = pfn_to_page(pfn);
		start = (unsigned long)map;
		end = start + nr_pages * sizeof(struct page);

		if (vmemmap_populate_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
			continue;

		memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

		pnum = pfn_to_section_nr(pfn);
		ns = psize / section_size;

		for (i = 0; i < ns; i++) {
			sparse_init_early_section(nid, map, pnum,
						  SECTION_IS_VMEMMAP_PREINIT);
			map += section_map_size();
			pnum++;
		}

		m->flags |= HUGE_BOOTMEM_HVO;
	}
}

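/*
 * Once zone information is available, validate the pre-HVO bootmem pages.
 * A page that turns out to span multiple zones has its HVO undone and its
 * memory handed back to memblock.
 */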
void __init hugetlb_vmemmap_init_late(int nid)
{
	struct huge_bootmem_page *m, *tm;
	unsigned long phys, nr_pages, start, end;
	unsigned long pfn, nr_mmap;
	struct hstate *h;
	void *map;

	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
		if (!(m->flags & HUGE_BOOTMEM_HVO))
			continue;

		phys = virt_to_phys(m);
		h = m->hstate;
		pfn = PHYS_PFN(phys);
		nr_pages = pages_per_huge_page(h);

		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
			/*
			 * Oops, the hugetlb page spans multiple zones.
			 * Remove it from the list, and undo HVO.
			 */
			list_del(&m->list);

			map = pfn_to_page(pfn);

			start = (unsigned long)map;
			end = start + nr_pages * sizeof(struct page);

			vmemmap_undo_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE);
			nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
			memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

			memblock_phys_free(phys, huge_page_size(h));
			continue;
		} else
			m->flags |= HUGE_BOOTMEM_ZONES_VALID;
	}
}
#endif

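/* Runtime knob: /proc/sys/vm/hugetlb_optimize_vmemmap */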
static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);