/*-1* Copyright (c) 2004 Tim J. Robbins2* Copyright (c) 2002 Doug Rabson3* Copyright (c) 2000 Marcel Moolenaar4* Copyright (c) 1994-1995 Søren Schmidt5* All rights reserved.6*7* Redistribution and use in source and binary forms, with or without8* modification, are permitted provided that the following conditions9* are met:10* 1. Redistributions of source code must retain the above copyright11* notice, this list of conditions and the following disclaimer12* in this position and unchanged.13* 2. Redistributions in binary form must reproduce the above copyright14* notice, this list of conditions and the following disclaimer in the15* documentation and/or other materials provided with the distribution.16* 3. The name of the author may not be used to endorse or promote products17* derived from this software without specific prior written permission.18*19* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR20* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES21* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.22* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,23* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT24* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,25* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY26* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT27* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF28* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.29*/3031#include <sys/fcntl.h>32#include <sys/file.h>33#include <sys/ktr.h>34#include <sys/lock.h>35#include <sys/malloc.h>36#include <sys/mman.h>37#include <sys/proc.h>38#include <sys/resourcevar.h>39#include <sys/rwlock.h>40#include <sys/syscallsubr.h>41#include <sys/sysent.h>42#include <sys/sysproto.h>4344#include <vm/pmap.h>45#include <vm/vm_extern.h>46#include <vm/vm_map.h>47#include <vm/vm_object.h>4849#include <compat/linux/linux_emul.h>50#include <compat/linux/linux_mmap.h>51#include <compat/linux/linux_persona.h>52#include <compat/linux/linux_util.h>5354#define STACK_SIZE (2 * 1024 * 1024)55#define GUARD_SIZE (4 * PAGE_SIZE)5657#if defined(__amd64__)58static void linux_fixup_prot(struct thread *td, int *prot);59#endif6061static int62linux_mmap_check_fp(struct file *fp, int flags, int prot, int maxprot)63{6465/* Linux mmap() just fails for O_WRONLY files */66if ((fp->f_flag & FREAD) == 0)67return (EACCES);6869return (0);70}7172int73linux_mmap_common(struct thread *td, uintptr_t addr, size_t len, int prot,74int flags, int fd, off_t pos)75{76struct mmap_req mr, mr_fixed;77struct proc *p = td->td_proc;78struct vmspace *vms = td->td_proc->p_vmspace;79int bsd_flags, error;8081LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx",82addr, len, prot, flags, fd, pos);8384error = 0;85bsd_flags = 0;8687/*88* Linux mmap(2):89* You must specify exactly one of MAP_SHARED and MAP_PRIVATE90*/91if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))92return (EINVAL);9394if (flags & LINUX_MAP_SHARED)95bsd_flags |= MAP_SHARED;96if (flags & LINUX_MAP_PRIVATE)97bsd_flags |= MAP_PRIVATE;98if (flags & LINUX_MAP_FIXED)99bsd_flags |= MAP_FIXED;100if (flags & LINUX_MAP_ANON) {101/* Enforce pos to be on page boundary, then ignore. */102if ((pos & PAGE_MASK) != 0)103return (EINVAL);104pos = 0;105bsd_flags |= MAP_ANON;106} else107bsd_flags |= MAP_NOSYNC;108if (flags & LINUX_MAP_GROWSDOWN)109bsd_flags |= MAP_STACK;110111#if defined(__amd64__)112/*113* According to the Linux mmap(2) man page, "MAP_32BIT flag114* is ignored when MAP_FIXED is set."115*/116if ((flags & LINUX_MAP_32BIT) && (flags & LINUX_MAP_FIXED) == 0)117bsd_flags |= MAP_32BIT;118119/*120* PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC121* on Linux/i386 if the binary requires executable stack.122* We do this only for IA32 emulation as on native i386 this is does not123* make sense without PAE.124*125* XXX. Linux checks that the file system is not mounted with noexec.126*/127linux_fixup_prot(td, &prot);128#endif129130/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */131fd = (bsd_flags & MAP_ANON) ? -1 : fd;132if (flags & LINUX_MAP_GROWSDOWN) {133/*134* The Linux MAP_GROWSDOWN option does not limit auto135* growth of the region. Linux mmap with this option136* takes as addr the initial BOS, and as len, the initial137* region size. It can then grow down from addr without138* limit. However, Linux threads has an implicit internal139* limit to stack size of STACK_SIZE. Its just not140* enforced explicitly in Linux. But, here we impose141* a limit of (STACK_SIZE - GUARD_SIZE) on the stack142* region, since we can do this with our mmap.143*144* Our mmap with MAP_STACK takes addr as the maximum145* downsize limit on BOS, and as len the max size of146* the region. It then maps the top SGROWSIZ bytes,147* and auto grows the region down, up to the limit148* in addr.149*150* If we don't use the MAP_STACK option, the effect151* of this code is to allocate a stack region of a152* fixed size of (STACK_SIZE - GUARD_SIZE).153*/154155if ((caddr_t)addr + len > vms->vm_maxsaddr) {156/*157* Some Linux apps will attempt to mmap158* thread stacks near the top of their159* address space. If their TOS is greater160* than vm_maxsaddr, vm_map_growstack()161* will confuse the thread stack with the162* process stack and deliver a SEGV if they163* attempt to grow the thread stack past their164* current stacksize rlimit. To avoid this,165* adjust vm_maxsaddr upwards to reflect166* the current stacksize rlimit rather167* than the maximum possible stacksize.168* It would be better to adjust the169* mmap'ed region, but some apps do not check170* mmap's return value.171*/172PROC_LOCK(p);173vms->vm_maxsaddr = (char *)round_page(vms->vm_stacktop) -174lim_cur_proc(p, RLIMIT_STACK);175PROC_UNLOCK(p);176}177178/*179* This gives us our maximum stack size and a new BOS.180* If we're using VM_STACK, then mmap will just map181* the top SGROWSIZ bytes, and let the stack grow down182* to the limit at BOS. If we're not using VM_STACK183* we map the full stack, since we don't have a way184* to autogrow it.185*/186if (len <= STACK_SIZE - GUARD_SIZE) {187addr = addr - (STACK_SIZE - GUARD_SIZE - len);188len = STACK_SIZE - GUARD_SIZE;189}190}191192/*193* FreeBSD is free to ignore the address hint if MAP_FIXED wasn't194* passed. However, some Linux applications, like the ART runtime,195* depend on the hint. If the MAP_FIXED wasn't passed, but the196* address is not zero, try with MAP_FIXED and MAP_EXCL first,197* and fall back to the normal behaviour if that fails.198*/199mr = (struct mmap_req) {200.mr_hint = addr,201.mr_len = len,202.mr_prot = prot,203.mr_flags = bsd_flags,204.mr_fd = fd,205.mr_pos = pos,206.mr_check_fp_fn = linux_mmap_check_fp,207};208if (addr != 0 && (bsd_flags & MAP_FIXED) == 0 &&209(bsd_flags & MAP_EXCL) == 0) {210mr_fixed = mr;211mr_fixed.mr_flags |= MAP_FIXED | MAP_EXCL;212error = kern_mmap(td, &mr_fixed);213if (error == 0)214goto out;215}216217error = kern_mmap(td, &mr);218out:219LINUX_CTR2(mmap2, "return: %d (%p)", error, td->td_retval[0]);220221return (error);222}223224int225linux_mprotect_common(struct thread *td, uintptr_t addr, size_t len, int prot)226{227int flags = 0;228229/* XXX Ignore PROT_GROWSUP for now. */230prot &= ~LINUX_PROT_GROWSUP;231if ((prot & ~(LINUX_PROT_GROWSDOWN | PROT_READ | PROT_WRITE |232PROT_EXEC)) != 0)233return (EINVAL);234if ((prot & LINUX_PROT_GROWSDOWN) != 0) {235prot &= ~LINUX_PROT_GROWSDOWN;236flags |= VM_MAP_PROTECT_GROWSDOWN;237}238239#if defined(__amd64__)240linux_fixup_prot(td, &prot);241#endif242return (kern_mprotect(td, addr, len, prot, flags));243}244245/*246* Implement Linux madvise(MADV_DONTNEED), which has unusual semantics: for247* anonymous memory, pages in the range are immediately discarded.248*/249static int250linux_madvise_dontneed(struct thread *td, vm_offset_t start, vm_offset_t end)251{252vm_map_t map;253vm_map_entry_t entry;254vm_object_t backing_object, object;255vm_offset_t estart, eend;256vm_pindex_t pstart, pend;257int error;258259map = &td->td_proc->p_vmspace->vm_map;260261if (!vm_map_range_valid(map, start, end))262return (EINVAL);263start = trunc_page(start);264end = round_page(end);265266error = 0;267vm_map_lock_read(map);268if (!vm_map_lookup_entry(map, start, &entry))269entry = vm_map_entry_succ(entry);270for (; entry->start < end; entry = vm_map_entry_succ(entry)) {271if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)272continue;273274if (entry->wired_count != 0) {275error = EINVAL;276break;277}278279object = entry->object.vm_object;280if (object == NULL)281continue;282if ((object->flags & (OBJ_UNMANAGED | OBJ_FICTITIOUS)) != 0)283continue;284285pstart = OFF_TO_IDX(entry->offset);286if (start > entry->start) {287pstart += atop(start - entry->start);288estart = start;289} else {290estart = entry->start;291}292pend = OFF_TO_IDX(entry->offset) +293atop(entry->end - entry->start);294if (entry->end > end) {295pend -= atop(entry->end - end);296eend = end;297} else {298eend = entry->end;299}300301if ((object->flags & (OBJ_ANON | OBJ_ONEMAPPING)) ==302(OBJ_ANON | OBJ_ONEMAPPING)) {303/*304* Singly-mapped anonymous memory is discarded. This305* does not match Linux's semantics when the object306* belongs to a shadow chain of length > 1, since307* subsequent faults may retrieve pages from an308* intermediate anonymous object. However, handling309* this case correctly introduces a fair bit of310* complexity.311*/312VM_OBJECT_WLOCK(object);313if ((object->flags & OBJ_ONEMAPPING) != 0) {314vm_object_collapse(object);315vm_object_page_remove(object, pstart, pend, 0);316backing_object = object->backing_object;317if (backing_object != NULL &&318(backing_object->flags & OBJ_ANON) != 0)319linux_msg(td,320"possibly incorrect MADV_DONTNEED");321VM_OBJECT_WUNLOCK(object);322continue;323}324VM_OBJECT_WUNLOCK(object);325}326327/*328* Handle shared mappings. Remove them outright instead of329* calling pmap_advise(), for consistency with Linux.330*/331pmap_remove(map->pmap, estart, eend);332vm_object_madvise(object, pstart, pend, MADV_DONTNEED);333}334vm_map_unlock_read(map);335336return (error);337}338339int340linux_madvise_common(struct thread *td, uintptr_t addr, size_t len, int behav)341{342343switch (behav) {344case LINUX_MADV_NORMAL:345return (kern_madvise(td, addr, len, MADV_NORMAL));346case LINUX_MADV_RANDOM:347return (kern_madvise(td, addr, len, MADV_RANDOM));348case LINUX_MADV_SEQUENTIAL:349return (kern_madvise(td, addr, len, MADV_SEQUENTIAL));350case LINUX_MADV_WILLNEED:351return (kern_madvise(td, addr, len, MADV_WILLNEED));352case LINUX_MADV_DONTNEED:353return (linux_madvise_dontneed(td, addr, addr + len));354case LINUX_MADV_FREE:355return (kern_madvise(td, addr, len, MADV_FREE));356case LINUX_MADV_REMOVE:357linux_msg(curthread, "unsupported madvise MADV_REMOVE");358return (EINVAL);359case LINUX_MADV_DONTFORK:360return (kern_minherit(td, addr, len, INHERIT_NONE));361case LINUX_MADV_DOFORK:362return (kern_minherit(td, addr, len, INHERIT_COPY));363case LINUX_MADV_MERGEABLE:364linux_msg(curthread, "unsupported madvise MADV_MERGEABLE");365return (EINVAL);366case LINUX_MADV_UNMERGEABLE:367/* We don't merge anyway. */368return (0);369case LINUX_MADV_HUGEPAGE:370/* Ignored; on FreeBSD huge pages are always on. */371return (0);372case LINUX_MADV_NOHUGEPAGE:373#if 0374/*375* Don't warn - Firefox uses it a lot, and in real Linux it's376* an optional feature.377*/378linux_msg(curthread, "unsupported madvise MADV_NOHUGEPAGE");379#endif380return (EINVAL);381case LINUX_MADV_DONTDUMP:382return (kern_madvise(td, addr, len, MADV_NOCORE));383case LINUX_MADV_DODUMP:384return (kern_madvise(td, addr, len, MADV_CORE));385case LINUX_MADV_WIPEONFORK:386return (kern_minherit(td, addr, len, INHERIT_ZERO));387case LINUX_MADV_KEEPONFORK:388return (kern_minherit(td, addr, len, INHERIT_COPY));389case LINUX_MADV_HWPOISON:390linux_msg(curthread, "unsupported madvise MADV_HWPOISON");391return (EINVAL);392case LINUX_MADV_SOFT_OFFLINE:393linux_msg(curthread, "unsupported madvise MADV_SOFT_OFFLINE");394return (EINVAL);395case -1:396/*397* -1 is sometimes used as a dummy value to detect simplistic398* madvise(2) stub implementations. This safeguard is used by399* BoringSSL, for example, before assuming MADV_WIPEONFORK is400* safe to use. Don't produce an "unsupported" error message401* for this special dummy value, which is unlikely to be used402* by any new advisory behavior feature.403*/404return (EINVAL);405default:406linux_msg(curthread, "unsupported madvise behav %d", behav);407return (EINVAL);408}409}410411#if defined(__amd64__)412static void413linux_fixup_prot(struct thread *td, int *prot)414{415struct linux_pemuldata *pem;416417if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && *prot & PROT_READ) {418pem = pem_find(td->td_proc);419if (pem->persona & LINUX_READ_IMPLIES_EXEC)420*prot |= PROT_EXEC;421}422423}424#endif425426427