/*1* PowerPC version2* Copyright (C) 1995-1996 Gary Thomas ([email protected])3*4* Derived from "arch/i386/mm/fault.c"5* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds6*7* Modified by Cort Dougan and Paul Mackerras.8*9* Modified for PPC64 by Dave Engebretsen ([email protected])10*11* This program is free software; you can redistribute it and/or12* modify it under the terms of the GNU General Public License13* as published by the Free Software Foundation; either version14* 2 of the License, or (at your option) any later version.15*/1617#include <linux/signal.h>18#include <linux/sched.h>19#include <linux/kernel.h>20#include <linux/errno.h>21#include <linux/string.h>22#include <linux/types.h>23#include <linux/ptrace.h>24#include <linux/mman.h>25#include <linux/mm.h>26#include <linux/interrupt.h>27#include <linux/highmem.h>28#include <linux/module.h>29#include <linux/kprobes.h>30#include <linux/kdebug.h>31#include <linux/perf_event.h>32#include <linux/magic.h>33#include <linux/ratelimit.h>3435#include <asm/firmware.h>36#include <asm/page.h>37#include <asm/pgtable.h>38#include <asm/mmu.h>39#include <asm/mmu_context.h>40#include <asm/system.h>41#include <asm/uaccess.h>42#include <asm/tlbflush.h>43#include <asm/siginfo.h>44#include <mm/mmu_decl.h>4546#ifdef CONFIG_KPROBES47static inline int notify_page_fault(struct pt_regs *regs)48{49int ret = 0;5051/* kprobe_running() needs smp_processor_id() */52if (!user_mode(regs)) {53preempt_disable();54if (kprobe_running() && kprobe_fault_handler(regs, 11))55ret = 1;56preempt_enable();57}5859return ret;60}61#else62static inline int notify_page_fault(struct pt_regs *regs)63{64return 0;65}66#endif6768/*69* Check whether the instruction at regs->nip is a store using70* an update addressing form which will update r1.71*/72static int store_updates_sp(struct pt_regs *regs)73{74unsigned int inst;7576if (get_user(inst, (unsigned int __user *)regs->nip))77return 0;78/* check for 1 in the rA field */79if (((inst >> 16) & 0x1f) != 1)80return 0;81/* check major opcode */82switch (inst >> 26) {83case 37: /* stwu */84case 39: /* stbu */85case 45: /* sthu */86case 53: /* stfsu */87case 55: /* stfdu */88return 1;89case 62: /* std or stdu */90return (inst & 3) == 1;91case 31:92/* check minor opcode */93switch ((inst >> 1) & 0x3ff) {94case 181: /* stdux */95case 183: /* stwux */96case 247: /* stbux */97case 439: /* sthux */98case 695: /* stfsux */99case 759: /* stfdux */100return 1;101}102}103return 0;104}105106/*107* For 600- and 800-family processors, the error_code parameter is DSISR108* for a data fault, SRR1 for an instruction fault. For 400-family processors109* the error_code parameter is ESR for a data fault, 0 for an instruction110* fault.111* For 64-bit processors, the error_code parameter is112* - DSISR for a non-SLB data access fault,113* - SRR1 & 0x08000000 for a non-SLB instruction access fault114* - 0 any SLB fault.115*116* The return value is 0 if the fault was handled, or the signal117* number if this is a kernel fault that can't be handled here.118*/119int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,120unsigned long error_code)121{122struct vm_area_struct * vma;123struct mm_struct *mm = current->mm;124siginfo_t info;125int code = SEGV_MAPERR;126int is_write = 0, ret;127int trap = TRAP(regs);128int is_exec = trap == 0x400;129130#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))131/*132* Fortunately the bit assignments in SRR1 for an instruction133* fault and DSISR for a data fault are mostly the same for the134* bits we are interested in. But there are some bits which135* indicate errors in DSISR but can validly be set in SRR1.136*/137if (trap == 0x400)138error_code &= 0x48200000;139else140is_write = error_code & DSISR_ISSTORE;141#else142is_write = error_code & ESR_DST;143#endif /* CONFIG_4xx || CONFIG_BOOKE */144145if (notify_page_fault(regs))146return 0;147148if (unlikely(debugger_fault_handler(regs)))149return 0;150151/* On a kernel SLB miss we can only check for a valid exception entry */152if (!user_mode(regs) && (address >= TASK_SIZE))153return SIGSEGV;154155#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \156defined(CONFIG_PPC_BOOK3S_64))157if (error_code & DSISR_DABRMATCH) {158/* DABR match */159do_dabr(regs, address, error_code);160return 0;161}162#endif163164if (in_atomic() || mm == NULL) {165if (!user_mode(regs))166return SIGSEGV;167/* in_atomic() in user mode is really bad,168as is current->mm == NULL. */169printk(KERN_EMERG "Page fault in user mode with "170"in_atomic() = %d mm = %p\n", in_atomic(), mm);171printk(KERN_EMERG "NIP = %lx MSR = %lx\n",172regs->nip, regs->msr);173die("Weird page fault", regs, SIGSEGV);174}175176perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);177178/* When running in the kernel we expect faults to occur only to179* addresses in user space. All other faults represent errors in the180* kernel and should generate an OOPS. Unfortunately, in the case of an181* erroneous fault occurring in a code path which already holds mmap_sem182* we will deadlock attempting to validate the fault against the183* address space. Luckily the kernel only validly references user184* space from well defined areas of code, which are listed in the185* exceptions table.186*187* As the vast majority of faults will be valid we will only perform188* the source reference check when there is a possibility of a deadlock.189* Attempt to lock the address space, if we cannot we then validate the190* source. If this is invalid we can skip the address space check,191* thus avoiding the deadlock.192*/193if (!down_read_trylock(&mm->mmap_sem)) {194if (!user_mode(regs) && !search_exception_tables(regs->nip))195goto bad_area_nosemaphore;196197down_read(&mm->mmap_sem);198}199200vma = find_vma(mm, address);201if (!vma)202goto bad_area;203if (vma->vm_start <= address)204goto good_area;205if (!(vma->vm_flags & VM_GROWSDOWN))206goto bad_area;207208/*209* N.B. The POWER/Open ABI allows programs to access up to210* 288 bytes below the stack pointer.211* The kernel signal delivery code writes up to about 1.5kB212* below the stack pointer (r1) before decrementing it.213* The exec code can write slightly over 640kB to the stack214* before setting the user r1. Thus we allow the stack to215* expand to 1MB without further checks.216*/217if (address + 0x100000 < vma->vm_end) {218/* get user regs even if this fault is in kernel mode */219struct pt_regs *uregs = current->thread.regs;220if (uregs == NULL)221goto bad_area;222223/*224* A user-mode access to an address a long way below225* the stack pointer is only valid if the instruction226* is one which would update the stack pointer to the227* address accessed if the instruction completed,228* i.e. either stwu rs,n(r1) or stwux rs,r1,rb229* (or the byte, halfword, float or double forms).230*231* If we don't check this then any write to the area232* between the last mapped region and the stack will233* expand the stack rather than segfaulting.234*/235if (address + 2048 < uregs->gpr[1]236&& (!user_mode(regs) || !store_updates_sp(regs)))237goto bad_area;238}239if (expand_stack(vma, address))240goto bad_area;241242good_area:243code = SEGV_ACCERR;244#if defined(CONFIG_6xx)245if (error_code & 0x95700000)246/* an error such as lwarx to I/O controller space,247address matching DABR, eciwx, etc. */248goto bad_area;249#endif /* CONFIG_6xx */250#if defined(CONFIG_8xx)251/* 8xx sometimes need to load a invalid/non-present TLBs.252* These must be invalidated separately as linux mm don't.253*/254if (error_code & 0x40000000) /* no translation? */255_tlbil_va(address, 0, 0, 0);256257/* The MPC8xx seems to always set 0x80000000, which is258* "undefined". Of those that can be set, this is the only259* one which seems bad.260*/261if (error_code & 0x10000000)262/* Guarded storage error. */263goto bad_area;264#endif /* CONFIG_8xx */265266if (is_exec) {267#ifdef CONFIG_PPC_STD_MMU268/* Protection fault on exec go straight to failure on269* Hash based MMUs as they either don't support per-page270* execute permission, or if they do, it's handled already271* at the hash level. This test would probably have to272* be removed if we change the way this works to make hash273* processors use the same I/D cache coherency mechanism274* as embedded.275*/276if (error_code & DSISR_PROTFAULT)277goto bad_area;278#endif /* CONFIG_PPC_STD_MMU */279280/*281* Allow execution from readable areas if the MMU does not282* provide separate controls over reading and executing.283*284* Note: That code used to not be enabled for 4xx/BookE.285* It is now as I/D cache coherency for these is done at286* set_pte_at() time and I see no reason why the test287* below wouldn't be valid on those processors. This -may-288* break programs compiled with a really old ABI though.289*/290if (!(vma->vm_flags & VM_EXEC) &&291(cpu_has_feature(CPU_FTR_NOEXECUTE) ||292!(vma->vm_flags & (VM_READ | VM_WRITE))))293goto bad_area;294/* a write */295} else if (is_write) {296if (!(vma->vm_flags & VM_WRITE))297goto bad_area;298/* a read */299} else {300/* protection fault */301if (error_code & 0x08000000)302goto bad_area;303if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))304goto bad_area;305}306307/*308* If for any reason at all we couldn't handle the fault,309* make sure we exit gracefully rather than endlessly redo310* the fault.311*/312ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);313if (unlikely(ret & VM_FAULT_ERROR)) {314if (ret & VM_FAULT_OOM)315goto out_of_memory;316else if (ret & VM_FAULT_SIGBUS)317goto do_sigbus;318BUG();319}320if (ret & VM_FAULT_MAJOR) {321current->maj_flt++;322perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,323regs, address);324#ifdef CONFIG_PPC_SMLPAR325if (firmware_has_feature(FW_FEATURE_CMO)) {326preempt_disable();327get_lppaca()->page_ins += (1 << PAGE_FACTOR);328preempt_enable();329}330#endif331} else {332current->min_flt++;333perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,334regs, address);335}336up_read(&mm->mmap_sem);337return 0;338339bad_area:340up_read(&mm->mmap_sem);341342bad_area_nosemaphore:343/* User mode accesses cause a SIGSEGV */344if (user_mode(regs)) {345_exception(SIGSEGV, regs, code, address);346return 0;347}348349if (is_exec && (error_code & DSISR_PROTFAULT))350printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"351" page (%lx) - exploit attempt? (uid: %d)\n",352address, current_uid());353354return SIGSEGV;355356/*357* We ran out of memory, or some other thing happened to us that made358* us unable to handle the page fault gracefully.359*/360out_of_memory:361up_read(&mm->mmap_sem);362if (!user_mode(regs))363return SIGKILL;364pagefault_out_of_memory();365return 0;366367do_sigbus:368up_read(&mm->mmap_sem);369if (user_mode(regs)) {370info.si_signo = SIGBUS;371info.si_errno = 0;372info.si_code = BUS_ADRERR;373info.si_addr = (void __user *)address;374force_sig_info(SIGBUS, &info, current);375return 0;376}377return SIGBUS;378}379380/*381* bad_page_fault is called when we have a bad access from the kernel.382* It is called from the DSI and ISI handlers in head.S and from some383* of the procedures in traps.c.384*/385void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)386{387const struct exception_table_entry *entry;388unsigned long *stackend;389390/* Are we prepared to handle this fault? */391if ((entry = search_exception_tables(regs->nip)) != NULL) {392regs->nip = entry->fixup;393return;394}395396/* kernel has accessed a bad area */397398switch (regs->trap) {399case 0x300:400case 0x380:401printk(KERN_ALERT "Unable to handle kernel paging request for "402"data at address 0x%08lx\n", regs->dar);403break;404case 0x400:405case 0x480:406printk(KERN_ALERT "Unable to handle kernel paging request for "407"instruction fetch\n");408break;409default:410printk(KERN_ALERT "Unable to handle kernel paging request for "411"unknown fault\n");412break;413}414printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",415regs->nip);416417stackend = end_of_stack(current);418if (current != &init_task && *stackend != STACK_END_MAGIC)419printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");420421die("Kernel access of bad area", regs, sig);422}423424425