GitHub repository torvalds/linux, path: mm/hugetlb_vmemmap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <[email protected]>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked so far.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that new mappings are taken from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
	unsigned long		flags;
};

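/*
 * Split the huge PMD mapping @head at @start: pre-build a PTE table that
 * mirrors the existing PMD-sized mapping, then install it in place of the
 * leaf PMD entry.
 */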
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

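	/*
	 * Re-check under init_mm's page table lock that the PMD is still a
	 * leaf entry: a concurrent split may have already installed a PTE
	 * table, in which case the table built above is simply freed.
	 */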
	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap
	 * pages being at the start of the hotplugged memory
	 * region in the memory_hotplug.memmap_on_memory case,
	 * checking whether the vmemmap page associated with the
	 * first vmemmap page is self-hosted is sufficient.
	 *
	 * [                hotplugged memory                ]
	 * [    section    ][...][    section    ]
	 * [ vmemmap ][             usable memory            ]
	 *   ^  |       ^                                    |
	 *   +--+       |                                    |
	 *              +------------------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

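/*
 * Walk the kernel page tables for [@start, @end) with the split/remap
 * callbacks in @walk, then flush the TLB for the range unless the caller
 * asked to defer the flush.
 */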
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
					   NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it was
 * allocated from the memblock allocator; free it via free_bootmem_page().
 * Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page)) {
		memmap_boot_pages_add(-1);
		free_bootmem_page(page);
	} else {
		memmap_pages_add(-1);
		__free_page(page);
	}
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Make sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be checked in free_tail_page_prepare(). To avoid a "corrupted mapping in
 * tail page" report, we need to reset at least 4 struct pages (one head
 * struct page and three tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	4

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

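	/*
	 * Overwrite the first NR_RESET_STRUCT_PAGE entries with the plain
	 * tail struct pages that immediately follow them in the same copied
	 * vmemmap page; the BUILD_BUG_ON guarantees both ranges fit within
	 * one page.
	 */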
	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

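/*
 * Give the PTE at @addr its own vmemmap page again: take a page from
 * walk->vmemmap_pages, fill it with a copy of the shared reuse page,
 * overwrite the copied head-page metadata with clean tail struct pages
 * (reset_struct_pages()) and map the page read/write.
 */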
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Make sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                       range [@start, @end) into PTE-level mappings
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit the vmemmap pages to be freed. It is
 *			the caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This will allow the likely
	 * contiguous struct page backing memory to be kept contiguous and
	 * allow for more allocations of hugepages. Fall back to the currently
	 * mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
		memmap_pages_add(1);
	}

	/*
	 * To make the remapping routine most efficient for huge pages, the
	 * vmemmap page table walk follows these rules (see more details in
	 * vmemmap_pte_range()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;
	int i;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}
	memmap_pages_add(nr_pages);

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to pages which are taken from @vmemmap_pages.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

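/*
 * HVO is enabled by default when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
 * is set and can be overridden at boot with the "hugetlb_free_vmemmap="
 * parameter handled below.
 */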
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
	return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();

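	/*
	 * The first HUGETLB_VMEMMAP_RESERVE_SIZE bytes of vmemmap (the still
	 * present head page) stay in place and act as the reuse page; the
	 * remainder of the range is what has to be re-populated.
	 */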
	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
			/* only need to synchronize_rcu() once for each batch */
			flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true if this HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
		synchronize_rcu();
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to the vmemmap_pages list so that they can be freed
	 * by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
	free_vmemmap_page_list(&vmemmap_pages);
}

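/*
 * Split the vmemmap PMDs covering @folio's struct pages ahead of the remap
 * pass. Doing all of the splits first lets __hugetlb_vmemmap_optimize_folios()
 * batch the TLB flush for the splits into a single flush_tlb_all().
 */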
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
					      struct list_head *folio_list,
					      bool boot)
{
	struct folio *folio;
	int nr_to_optimize;
	LIST_HEAD(vmemmap_pages);
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	nr_to_optimize = 0;
	list_for_each_entry(folio, folio_list, lru) {
		int ret;
		unsigned long spfn, epfn;

		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
			/*
			 * Already optimized by pre-HVO, just map the
			 * mirrored tail page structs RO.
			 */
			spfn = (unsigned long)&folio->page;
			epfn = spfn + pages_per_huge_page(h);
			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
					      HUGETLB_VMEMMAP_RESERVE_SIZE);
			register_page_bootmem_memmap(pfn_to_section_nr(spfn),
						     &folio->page,
						     HUGETLB_VMEMMAP_RESERVE_SIZE);
			static_branch_inc(&hugetlb_optimize_vmemmap_key);
			continue;
		}

		nr_to_optimize++;

		ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so fail
		 * early once we encounter the first OOM. There is no point in
		 * retrying, as it can be done dynamically on remap with the
		 * memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	if (!nr_to_optimize)
		/*
		 * All pre-HVO folios, nothing left to do. It's ok if
		 * there is a mix of pre-HVO and not yet HVO-ed folios
		 * here, as __hugetlb_vmemmap_optimize_folio() will
		 * skip any folios that already have the optimized flag
		 * set, see vmemmap_should_optimize_folio().
		 */
		goto out;

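	/*
	 * The split pass above skipped its per-PMD TLB flushes
	 * (VMEMMAP_SPLIT_NO_TLB_FLUSH), so do one global flush for all of
	 * the splits before remapping starts.
	 */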
	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		/* only need to synchronize_rcu() once for each batch */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
		}
	}

out:
	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
	unsigned long section_size, psize, pmd_vmemmap_size;
	phys_addr_t paddr;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(m->hstate))
		return false;

	psize = huge_page_size(m->hstate);
	paddr = virt_to_phys(m);

	/*
	 * Pre-HVO only works if the bootmem huge page
	 * is aligned to the section size.
	 */
	section_size = (1UL << PA_SECTION_SHIFT);
	if (!IS_ALIGNED(paddr, section_size) ||
	    !IS_ALIGNED(psize, section_size))
		return false;

	/*
	 * The pre-HVO code does not deal with splitting PMDs,
	 * so the bootmem page must be aligned to the number
	 * of base pages that can be mapped with one vmemmap PMD.
	 */
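	/*
	 * One vmemmap PMD maps PMD_SIZE / sizeof(struct page) struct pages,
	 * which in turn describe that many base pages of memory, hence the
	 * << PAGE_SHIFT below.
	 */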
	pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
	if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
	    !IS_ALIGNED(psize, pmd_vmemmap_size))
		return false;

	return true;
}

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
	unsigned long psize, paddr, section_size;
	unsigned long ns, i, pnum, pfn, nr_pages;
	unsigned long start, end;
	struct huge_bootmem_page *m = NULL;
	void *map;

	/*
	 * Nothing to do if bootmem pages were not allocated
	 * early in boot, or if HVO wasn't enabled in the
	 * first place.
	 */
	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	section_size = (1UL << PA_SECTION_SHIFT);

	list_for_each_entry(m, &huge_boot_pages[nid], list) {
		if (!vmemmap_should_optimize_bootmem_page(m))
			continue;

		nr_pages = pages_per_huge_page(m->hstate);
		psize = nr_pages << PAGE_SHIFT;
		paddr = virt_to_phys(m);
		pfn = PHYS_PFN(paddr);
		map = pfn_to_page(pfn);
		start = (unsigned long)map;
		end = start + nr_pages * sizeof(struct page);

		if (vmemmap_populate_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
			continue;

		memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

		pnum = pfn_to_section_nr(pfn);
		ns = psize / section_size;

		for (i = 0; i < ns; i++) {
			sparse_init_early_section(nid, map, pnum,
						  SECTION_IS_VMEMMAP_PREINIT);
			map += section_map_size();
			pnum++;
		}

		m->flags |= HUGE_BOOTMEM_HVO;
	}
}

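/*
 * Late pre-HVO fixup: for each bootmem huge page that was pre-HVO-ed in
 * hugetlb_vmemmap_init_early(), check that it does not straddle zones; if it
 * does, undo the HVO mapping, re-account the restored vmemmap pages and hand
 * the memory back to memblock.
 */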
void __init hugetlb_vmemmap_init_late(int nid)
{
	struct huge_bootmem_page *m, *tm;
	unsigned long phys, nr_pages, start, end;
	unsigned long pfn, nr_mmap;
	struct hstate *h;
	void *map;

	if (!hugetlb_bootmem_allocated())
		return;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return;

	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
		if (!(m->flags & HUGE_BOOTMEM_HVO))
			continue;

		phys = virt_to_phys(m);
		h = m->hstate;
		pfn = PHYS_PFN(phys);
		nr_pages = pages_per_huge_page(h);

		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
			/*
			 * Oops, the hugetlb page spans multiple zones.
			 * Remove it from the list, and undo HVO.
			 */
			list_del(&m->list);

			map = pfn_to_page(pfn);

			start = (unsigned long)map;
			end = start + nr_pages * sizeof(struct page);

			vmemmap_undo_hvo(start, end, nid,
					 HUGETLB_VMEMMAP_RESERVE_SIZE);
			nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
			memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

			memblock_phys_free(phys, huge_page_size(h));
			continue;
		} else
			m->flags |= HUGE_BOOTMEM_ZONES_VALID;
	}
}
#endif

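/*
 * Runtime toggle for HVO, exposed as /proc/sys/vm/hugetlb_optimize_vmemmap.
 * It is only registered below if at least one hstate can actually be
 * optimized.
 */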
static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);