GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/tile/mm/pgtable.c
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 */
void show_mem(unsigned int filter)
{
        struct zone *zone;

        pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
               " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
               " pagecache:%lu swap:%lu\n",
               (global_page_state(NR_ACTIVE_ANON) +
                global_page_state(NR_ACTIVE_FILE)),
               (global_page_state(NR_INACTIVE_ANON) +
                global_page_state(NR_INACTIVE_FILE)),
               global_page_state(NR_FILE_DIRTY),
               global_page_state(NR_WRITEBACK),
               global_page_state(NR_UNSTABLE_NFS),
               global_page_state(NR_FREE_PAGES),
               (global_page_state(NR_SLAB_RECLAIMABLE) +
                global_page_state(NR_SLAB_UNRECLAIMABLE)),
               global_page_state(NR_FILE_MAPPED),
               global_page_state(NR_PAGETABLE),
               global_page_state(NR_BOUNCE),
               global_page_state(NR_FILE_PAGES),
               nr_swap_pages);

        for_each_zone(zone) {
                unsigned long flags, order, total = 0, largest_order = -1;

                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        int nr = zone->free_area[order].nr_free;
                        total += nr << order;
                        if (nr)
                                largest_order = order;
                }
                spin_unlock_irqrestore(&zone->lock, flags);
                pr_err("Node %d %7s: %lukB (largest %luKb)\n",
                       zone_to_nid(zone), zone->name,
                       K(total), largest_order ? K(1UL) << largest_order : 0);
        }
}
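
/*
 * Illustrative, standalone userspace sketch (not part of this file): it
 * mirrors the per-zone arithmetic in show_mem() above -- summing the
 * free_area[] counts into a page total and tracking the largest
 * non-empty order -- using made-up numbers.  The PAGE_SHIFT and
 * MAX_ORDER values below are assumptions for the sketch, not values
 * taken from this tree; compile it separately as a plain C program.
 */
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 16                          /* assumed 64 KB pages */
#define SKETCH_MAX_ORDER  11                          /* assumed buddy orders */
#define SKETCH_K(x) ((x) << (SKETCH_PAGE_SHIFT - 10)) /* pages -> kB */

int main(void)
{
        /* Hypothetical nr_free counts for orders 0..10. */
        unsigned long nr_free[SKETCH_MAX_ORDER] = { 5, 3, 0, 2, 0, 1, 0, 0, 0, 0, 0 };
        unsigned long order, total = 0, largest_order = 0;

        for (order = 0; order < SKETCH_MAX_ORDER; order++) {
                total += nr_free[order] << order;     /* pages at this order */
                if (nr_free[order])
                        largest_order = order;
        }
        printf("free: %lukB (largest block %lukB)\n",
               SKETCH_K(total), SKETCH_K(1UL) << largest_order);
        return 0;
}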

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <pfn,flags> stored as-is, to permit clearing entries */
        set_pte(pte, pfn_pte(pfn, flags));

        /*
         * It's enough to flush this one mapping.
         * This appears conservative since it is only called
         * from __set_fixmap.
         */
        local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
}

void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}

#if defined(CONFIG_HIGHPTE)
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
{
        pte_t *pte = kmap_atomic(pmd_page(*dir)) +
                (pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
        return &pte[pte_index(address)];
}
#endif

/**
 * shatter_huge_page() - ensure a given address is mapped by a small page.
 *
 * This function converts a huge PTE mapping kernel LOWMEM into a bunch
 * of small PTEs with the same caching. No cache flush required, but we
 * must do a global TLB flush.
 *
 * Any caller that wishes to modify a kernel mapping that might
 * have been made with a huge page should call this function,
 * since doing so properly avoids race conditions with installing the
 * newly-shattered page and then flushing all the TLB entries.
 *
 * @addr: Address at which to shatter any existing huge page.
 */
void shatter_huge_page(unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        unsigned long flags = 0;  /* happy compiler */
#ifdef __PAGETABLE_PMD_FOLDED
        struct list_head *pos;
#endif

        /* Get a pointer to the pmd entry that we need to change. */
        addr &= HPAGE_MASK;
        BUG_ON(pgd_addr_invalid(addr));
        BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
        pgd = swapper_pg_dir + pgd_index(addr);
        pud = pud_offset(pgd, addr);
        BUG_ON(!pud_present(*pud));
        pmd = pmd_offset(pud, addr);
        BUG_ON(!pmd_present(*pmd));
        if (!pmd_huge_page(*pmd))
                return;

        /*
         * Grab the pgd_lock, since we may need it to walk the pgd_list,
         * and since we need some kind of lock here to avoid races.
         */
        spin_lock_irqsave(&pgd_lock, flags);
        if (!pmd_huge_page(*pmd)) {
                /* Lost the race to convert the huge page. */
                spin_unlock_irqrestore(&pgd_lock, flags);
                return;
        }

        /* Shatter the huge page into the preallocated L2 page table. */
        pmd_populate_kernel(&init_mm, pmd,
                            get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));

#ifdef __PAGETABLE_PMD_FOLDED
        /* Walk every pgd on the system and update the pmd there. */
        list_for_each(pos, &pgd_list) {
                pmd_t *copy_pmd;
                pgd = list_to_pgd(pos) + pgd_index(addr);
                pud = pud_offset(pgd, addr);
                copy_pmd = pmd_offset(pud, addr);
                __set_pmd(copy_pmd, *pmd);
        }
#endif

        /* Tell every cpu to notice the change. */
        flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
                     cpu_possible_mask, NULL, 0);

        /* Hold the lock until the TLB flush is finished to avoid races. */
        spin_unlock_irqrestore(&pgd_lock, flags);
}
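
/*
 * Illustrative, standalone userspace sketch (not part of this file):
 * shows the "addr &= HPAGE_MASK" rounding done at the top of
 * shatter_huge_page() -- any address inside a huge page maps to the
 * base of that huge page.  The 16 MB huge-page size is an assumption
 * for the sketch; compile it separately as a plain C program.
 */
#include <stdio.h>

#define SKETCH_HPAGE_SHIFT 24                          /* assumed 16 MB */
#define SKETCH_HPAGE_SIZE  (1UL << SKETCH_HPAGE_SHIFT)
#define SKETCH_HPAGE_MASK  (~(SKETCH_HPAGE_SIZE - 1))

int main(void)
{
        unsigned long addr = 0xfd234567UL;             /* arbitrary example VA */

        printf("addr 0x%lx -> huge page base 0x%lx\n",
               addr, addr & SKETCH_HPAGE_MASK);
        return 0;
}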

/*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

static inline void pgd_list_add(pgd_t *pgd)
{
        list_add(pgd_to_list(pgd), &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        list_del(pgd_to_list(pgd));
}

#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

static void pgd_ctor(pgd_t *pgd)
{
        unsigned long flags;

        memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
        spin_lock_irqsave(&pgd_lock, flags);

#ifndef __tilegx__
        /*
         * Check that the user interrupt vector has no L2.
         * It never should for the swapper, and new page tables
         * should always start with an empty user interrupt vector.
         */
        BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

        memcpy(pgd + KERNEL_PGD_INDEX_START,
               swapper_pg_dir + KERNEL_PGD_INDEX_START,
               KERNEL_PGD_PTRS * sizeof(pgd_t));

        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

static void pgd_dtor(pgd_t *pgd)
{
        unsigned long flags; /* can be called from interrupt context */

        spin_lock_irqsave(&pgd_lock, flags);
        pgd_list_del(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
        if (pgd)
                pgd_ctor(pgd);
        return pgd;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_dtor(pgd);
        kmem_cache_free(pgd_cache, pgd);
}
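
/*
 * Illustrative, standalone userspace sketch (not part of this file): it
 * mimics what pgd_ctor() above does with KERNEL_PGD_INDEX_START and
 * KERNEL_PGD_PTRS -- zero the user portion of a new top-level table and
 * copy the kernel portion from the master table.  The table size and
 * split index are made-up values, and plain unsigned longs stand in for
 * pgd_t entries; compile it separately as a plain C program.
 */
#include <stdio.h>
#include <string.h>

#define SKETCH_PTRS_PER_PGD     16   /* assumed number of top-level entries */
#define SKETCH_KERNEL_PGD_START 12   /* assumed index of first kernel entry */
#define SKETCH_KERNEL_PGD_PTRS  (SKETCH_PTRS_PER_PGD - SKETCH_KERNEL_PGD_START)

int main(void)
{
        unsigned long master[SKETCH_PTRS_PER_PGD];
        unsigned long pgd[SKETCH_PTRS_PER_PGD];
        int i;

        for (i = 0; i < SKETCH_PTRS_PER_PGD; i++)
                master[i] = 0x1000UL * (i + 1);        /* fake kernel mappings */

        /* User part starts empty; kernel part is copied from the master. */
        memset(pgd, 0, SKETCH_KERNEL_PGD_START * sizeof(pgd[0]));
        memcpy(pgd + SKETCH_KERNEL_PGD_START,
               master + SKETCH_KERNEL_PGD_START,
               SKETCH_KERNEL_PGD_PTRS * sizeof(pgd[0]));

        for (i = 0; i < SKETCH_PTRS_PER_PGD; i++)
                printf("pgd[%2d] = 0x%lx\n", i, pgd[i]);
        return 0;
}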


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
        struct page *p;
#if L2_USER_PGTABLE_ORDER > 0
        int i;
#endif

#ifdef CONFIG_HIGHPTE
        flags |= __GFP_HIGHMEM;
#endif

        p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
        if (p == NULL)
                return NULL;

#if L2_USER_PGTABLE_ORDER > 0
        /*
         * Make every page have a page_count() of one, not just the first.
         * We don't use __GFP_COMP since it doesn't look like it works
         * correctly with tlb_remove_page().
         */
        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
                init_page_count(p+i);
                inc_zone_page_state(p+i, NR_PAGETABLE);
        }
#endif

        pgtable_page_ctor(p);
        return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process). We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
        int i;

        pgtable_page_dtor(p);
        __free_page(p);

        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
                __free_page(p+i);
                dec_zone_page_state(p+i, NR_PAGETABLE);
        }
}

void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
                    unsigned long address)
{
        int i;

        pgtable_page_dtor(pte);
        tlb_remove_page(tlb, pte);

        for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
                tlb_remove_page(tlb, pte + i);
                dec_zone_page_state(pte + i, NR_PAGETABLE);
        }
}

#ifndef __tilegx__

/*
 * FIXME: needs to be atomic vs hypervisor writes. For now we make the
 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
        u8 *tmp = (u8 *)ptep;
        u8 second_byte = tmp[1];
        if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
                return 0;
        tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
        return 1;
}
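
/*
 * Illustrative, standalone userspace sketch (not part of this file):
 * shows the unlocked 8-bit update used by ptep_test_and_clear_young()
 * above -- test and clear a flag that lives in the second byte of a
 * 64-bit PTE word, touching only that byte.  The bit position here is an
 * assumption for the sketch (the real HV_PTE_INDEX_ACCESSED is defined
 * elsewhere in the tree); compile it separately as a plain C program.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_ACCESSED_BIT 10           /* assumed: somewhere in bits 8..15 */

static int test_and_clear_accessed(uint64_t *pteval)
{
        uint8_t *tmp = (uint8_t *)pteval;         /* little-endian layout assumed */
        uint8_t second_byte = tmp[1];

        if (!(second_byte & (1 << (SKETCH_ACCESSED_BIT - 8))))
                return 0;
        tmp[1] = second_byte & ~(1 << (SKETCH_ACCESSED_BIT - 8));
        return 1;
}

int main(void)
{
        uint64_t pte = 1ULL << SKETCH_ACCESSED_BIT;
        int first = test_and_clear_accessed(&pte);
        int second = test_and_clear_accessed(&pte);

        printf("first clear: %d, second clear: %d, pte now 0x%llx\n",
               first, second, (unsigned long long)pte);
        return 0;
}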

/*
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
                        unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
        u32 *tmp = (u32 *)ptep;
        tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}
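
/*
 * Illustrative, standalone userspace sketch (not part of this file):
 * mirrors ptep_set_wrprotect() above -- clear a flag that lives in the
 * high 32-bit word of a 64-bit PTE by storing only that word, so a
 * concurrent update of the low word (as the hypervisor does) is never
 * overwritten.  The bit position is an assumption for the sketch;
 * compile it separately as a plain C program.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_WRITABLE_BIT 35           /* assumed: somewhere in bits 32..63 */

static void clear_writable(uint64_t *pteval)
{
        uint32_t *tmp = (uint32_t *)pteval;       /* little-endian layout assumed */

        tmp[1] = tmp[1] & ~(1U << (SKETCH_WRITABLE_BIT - 32));
}

int main(void)
{
        uint64_t pte = (1ULL << SKETCH_WRITABLE_BIT) | 0xabcdULL;

        clear_writable(&pte);
        printf("pte now 0x%llx (low word untouched)\n",
               (unsigned long long)pte);
        return 0;
}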

#endif

pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_addr_invalid(addr))
                return NULL;

        pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
        pud = pud_offset(pgd, addr);
        if (!pud_present(*pud))
                return NULL;
        pmd = pmd_offset(pud, addr);
        if (pmd_huge_page(*pmd))
                return (pte_t *)pmd;
        if (!pmd_present(*pmd))
                return NULL;
        return pte_offset_kernel(pmd, addr);
}

pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
        unsigned int width = smp_width;
        int x = cpu % width;
        int y = cpu / width;
        BUG_ON(y >= smp_height);
        BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
        BUG_ON(cpu < 0 || cpu >= NR_CPUS);
        BUG_ON(!cpu_is_valid_lotar(cpu));
        return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
}

int get_remote_cache_cpu(pgprot_t prot)
{
        HV_LOTAR lotar = hv_pte_get_lotar(prot);
        int x = HV_LOTAR_X(lotar);
        int y = HV_LOTAR_Y(lotar);
        BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
        return x + y * smp_width;
}
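
/*
 * Illustrative, standalone userspace sketch (not part of this file):
 * shows the cpu <-> (x, y) mesh-coordinate arithmetic that
 * set_remote_cache_cpu() and get_remote_cache_cpu() above perform around
 * the hypervisor LOTAR encoding.  The 8x8 mesh size is an assumption for
 * the sketch; compile it separately as a plain C program.
 */
#include <stdio.h>

#define SKETCH_SMP_WIDTH  8              /* assumed mesh width */
#define SKETCH_SMP_HEIGHT 8              /* assumed mesh height */

int main(void)
{
        int cpu = 27;                    /* arbitrary example cpu */
        int x = cpu % SKETCH_SMP_WIDTH;
        int y = cpu / SKETCH_SMP_WIDTH;
        int back = x + y * SKETCH_SMP_WIDTH;

        if (y >= SKETCH_SMP_HEIGHT)
                return 1;                /* mirrors the BUG_ON() bound check */

        printf("cpu %d -> tile (%d, %d) -> cpu %d\n", cpu, x, y, back);
        return 0;
}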

/*
 * Convert a kernel VA to a PA and homing information.
 */
int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
{
        struct page *page = virt_to_page(va);
        pte_t null_pte = { 0 };

        *cpa = __pa(va);

        /* Note that this is not writing a page table, just returning a pte. */
        *pte = pte_set_home(null_pte, page_home(page));

        return 0; /* return non-zero if not hfh? */
}
EXPORT_SYMBOL(va_to_cpa_and_pte);

void __set_pte(pte_t *ptep, pte_t pte)
{
#ifdef __tilegx__
        *ptep = pte;
#else
# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
# error Must write the present and migrating bits last
# endif
        if (pte_present(pte)) {
                ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
                barrier();
                ((u32 *)ptep)[0] = (u32)(pte_val(pte));
        } else {
                ((u32 *)ptep)[0] = (u32)(pte_val(pte));
                barrier();
                ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
        }
#endif /* __tilegx__ */
}
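
/*
 * Illustrative, standalone userspace sketch (not part of this file):
 * demonstrates the ordering idea behind the non-tilegx branch of
 * __set_pte() above -- the 64-bit PTE is stored as two 32-bit halves,
 * and the low word (which, per the #error check, holds the "present"
 * and "migrating" bits) is always written last when installing a
 * present entry, so a concurrent walker never sees a present PTE with a
 * stale other half.  The barrier below is a GCC-style compiler barrier
 * standing in for the kernel's barrier(); compile separately.
 */
#include <stdio.h>
#include <stdint.h>

#define sketch_barrier() __asm__ __volatile__("" ::: "memory")

static void sketch_set_pte(uint64_t *ptep, uint64_t pte, int present)
{
        uint32_t *w = (uint32_t *)ptep;           /* little-endian layout assumed */

        if (present) {
                w[1] = (uint32_t)(pte >> 32);     /* harmless half first */
                sketch_barrier();
                w[0] = (uint32_t)pte;             /* low word (present bit) last */
        } else {
                w[0] = (uint32_t)pte;             /* clear the present bit first */
                sketch_barrier();
                w[1] = (uint32_t)(pte >> 32);
        }
}

int main(void)
{
        uint64_t pte = 0;

        sketch_set_pte(&pte, 0x1234567800000001ULL, 1);
        printf("installed pte 0x%llx\n", (unsigned long long)pte);
        return 0;
}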

void set_pte(pte_t *ptep, pte_t pte)
{
        struct page *page = pfn_to_page(pte_pfn(pte));

        /* Update the home of a PTE if necessary */
        pte = pte_set_home(pte, page_home(page));

        __set_pte(ptep, pte);
}

/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
        return mm->context.priority_cached;
}

/*
 * Add a priority mapping to an mm_context and
 * notify the hypervisor if this is the first one.
 */
void start_mm_caching(struct mm_struct *mm)
{
        if (!mm_is_priority_cached(mm)) {
                mm->context.priority_cached = -1U;
                hv_set_caching(-1U);
        }
}

/*
 * Validate and return the priority_cached flag. We know if it's zero
 * that we don't need to scan, since we immediately set it non-zero
 * when we first consider a MAP_CACHE_PRIORITY mapping.
 *
 * We only _try_ to acquire the mmap_sem semaphore; if we can't acquire it,
 * since we're in an interrupt context (servicing switch_mm) we don't
 * worry about it and don't unset the "priority_cached" field.
 * Presumably we'll come back later and have more luck and clear
 * the value then; for now we'll just keep the cache marked for priority.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
        if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
                struct vm_area_struct *vm;
                for (vm = mm->mmap; vm; vm = vm->vm_next) {
                        if (hv_pte_get_cached_priority(vm->vm_page_prot))
                                break;
                }
                if (vm == NULL)
                        mm->context.priority_cached = 0;
                up_write(&mm->mmap_sem);
        }
        return mm->context.priority_cached;
}

/* Set caching correctly for an mm that we are switching to. */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
        if (!mm_is_priority_cached(next)) {
                /*
                 * If the new mm doesn't use priority caching, just see if we
                 * need the hv_set_caching(), or can assume it's already zero.
                 */
                if (mm_is_priority_cached(prev))
                        hv_set_caching(0);
        } else {
                hv_set_caching(update_priority_cached(next));
        }
}

#if CHIP_HAS_MMIO()

/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
                           pgprot_t home)
{
        void *addr;
        struct vm_struct *area;
        unsigned long offset, last_addr;
        pgprot_t pgprot;

        /* Don't allow wraparound or zero size */
        last_addr = phys_addr + size - 1;
        if (!size || last_addr < phys_addr)
                return NULL;

        /* Create a read/write, MMIO VA mapping homed at the requested shim. */
        pgprot = PAGE_KERNEL;
        pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
        pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

        /*
         * Mappings have to be page-aligned
         */
        offset = phys_addr & ~PAGE_MASK;
        phys_addr &= PAGE_MASK;
        size = PAGE_ALIGN(last_addr+1) - phys_addr;

        /*
         * Ok, go for it..
         */
        area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
        if (!area)
                return NULL;
        area->phys_addr = phys_addr;
        addr = area->addr;
        if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
                               phys_addr, pgprot)) {
                remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
                return NULL;
        }
        return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);
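
/*
 * Illustrative, standalone userspace sketch (not part of this file):
 * works through the page-alignment arithmetic in ioremap_prot() above --
 * split the physical address into a page-aligned base plus an offset,
 * round the mapping size up to whole pages, and add the offset back to
 * the base VA the mapping provides.  A 64 KB page size is an assumption
 * for the sketch; compile it separately as a plain C program.
 */
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 16                           /* assumed */
#define SKETCH_PAGE_SIZE  (1UL << SKETCH_PAGE_SHIFT)
#define SKETCH_PAGE_MASK  (~(SKETCH_PAGE_SIZE - 1))
#define SKETCH_PAGE_ALIGN(x) (((x) + SKETCH_PAGE_SIZE - 1) & SKETCH_PAGE_MASK)

int main(void)
{
        unsigned long phys_addr = 0x30001234UL;        /* arbitrary MMIO address */
        unsigned long size = 0x300UL;
        unsigned long last_addr = phys_addr + size - 1;
        unsigned long offset = phys_addr & ~SKETCH_PAGE_MASK;

        phys_addr &= SKETCH_PAGE_MASK;
        size = SKETCH_PAGE_ALIGN(last_addr + 1) - phys_addr;

        printf("map base 0x%lx, %lu bytes; caller sees base VA + 0x%lx\n",
               phys_addr, size, offset);
        return 0;
}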

/* Map a PCI MMIO bus address into VA space. */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
        panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
        volatile void __iomem *addr = (volatile void __iomem *)
                (PAGE_MASK & (unsigned long __force)addr_in);
#if 1
        vunmap((void * __force)addr);
#else
        /* x86 uses this complicated flow instead of vunmap(). Is
         * there any particular reason we should do the same? */
        struct vm_struct *p, *o;

        /* Use the vm area unlocked, assuming the caller
           ensures there isn't another iounmap for the same address
           in parallel. Reuse of the virtual address is prevented by
           leaving it in the global lists until we're done with it.
           cpa takes care of the direct mappings. */
        read_lock(&vmlist_lock);
        for (p = vmlist; p; p = p->next) {
                if (p->addr == addr)
                        break;
        }
        read_unlock(&vmlist_lock);

        if (!p) {
                pr_err("iounmap: bad address %p\n", addr);
                dump_stack();
                return;
        }

        /* Finally remove it */
        o = remove_vm_area((void *)addr);
        BUG_ON(p != o || o == NULL);
        kfree(p);
#endif
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */