/**************************************************************************
 *
 * Copyright (c) 2006-2007 Tungsten Graphics, Inc., Cedar Park, TX., USA
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
 */

#include <linux/cc_platform.h>
#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/ioport.h>
#include <linux/iosys-map.h>
#include <xen/xen.h>

#include <drm/drm_cache.h>

/* A small bounce buffer that fits on the stack. */
#define MEMCPY_BOUNCE_SIZE 128

#if defined(CONFIG_X86)
#include <asm/smp.h>

/*
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues.
 * For drm_clflush_page this fencing happens in the caller.
 */
static void
drm_clflush_page(struct page *page)
{
	uint8_t *page_virtual;
	unsigned int i;
	const int size = boot_cpu_data.x86_clflush_size;

	if (unlikely(page == NULL))
		return;

	page_virtual = kmap_atomic(page);
	for (i = 0; i < PAGE_SIZE; i += size)
		clflushopt(page_virtual + i);
	kunmap_atomic(page_virtual);
}

static void drm_cache_flush_clflush(struct page *pages[],
				    unsigned long num_pages)
{
	unsigned long i;

	mb(); /* Full memory barrier used before so that CLFLUSH is ordered */
	for (i = 0; i < num_pages; i++)
		drm_clflush_page(*pages++);
	mb(); /* Also used after CLFLUSH so that all cache is flushed */
}
#endif

/**
 * drm_clflush_pages - Flush dcache lines of a set of pages.
 * @pages: List of pages to be flushed.
 * @num_pages: Number of pages in the array.
 *
 * Flush every data cache line entry that points to an address belonging
 * to a page in the array.
 */
void
drm_clflush_pages(struct page *pages[], unsigned long num_pages)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		drm_cache_flush_clflush(pages, num_pages);
		return;
	}

	wbinvd_on_all_cpus();

#elif defined(__powerpc__)
	unsigned long i;

	for (i = 0; i < num_pages; i++) {
		struct page *page = pages[i];
		void *page_virtual;

		if (unlikely(page == NULL))
			continue;

		page_virtual = kmap_atomic(page);
		flush_dcache_range((unsigned long)page_virtual,
				   (unsigned long)page_virtual + PAGE_SIZE);
		kunmap_atomic(page_virtual);
	}
#else
	WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_pages);

/**
 * drm_clflush_sg - Flush dcache lines pointing to a scatter-gather table.
 * @st: struct sg_table.
 *
 * Flush every data cache line entry that points to an address in the
 * sg.
 */
void
drm_clflush_sg(struct sg_table *st)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		struct sg_page_iter sg_iter;

		mb(); /* CLFLUSH is ordered only by using memory barriers */
		for_each_sgtable_page(st, &sg_iter, 0)
			drm_clflush_page(sg_page_iter_page(&sg_iter));
		mb(); /* Make sure that every cache line entry is flushed */

		return;
	}

	wbinvd_on_all_cpus();
#else
	WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_sg);

/**
 * drm_clflush_virt_range - Flush dcache lines of a region
 * @addr: Initial kernel memory address.
 * @length: Region size.
 *
 * Flush every data cache line entry that points to an address in the
 * region requested.
 */
void
drm_clflush_virt_range(void *addr, unsigned long length)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		const int size = boot_cpu_data.x86_clflush_size;
		void *end = addr + length;

		addr = (void *)(((unsigned long)addr) & -size);
		mb(); /* CLFLUSH is only ordered with a full memory barrier */
		for (; addr < end; addr += size)
			clflushopt(addr);
		clflushopt(end - 1); /* force serialisation */
		mb(); /* Ensure that every data cache line entry is flushed */
		return;
	}

	wbinvd_on_all_cpus();
#else
	WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_virt_range);

bool drm_need_swiotlb(int dma_bits)
{
	struct resource *tmp;
	resource_size_t max_iomem = 0;

	/*
	 * Xen paravirtual hosts require swiotlb regardless of requested dma
	 * transfer size.
	 *
	 * NOTE: Really, what it requires is use of the
	 * dma_alloc_coherent allocator used in ttm_dma_populate() instead of
	 * ttm_populate_and_map_pages(), which bounce buffers so much in
	 * Xen it leads to swiotlb buffer exhaustion.
	 */
	if (xen_pv_domain())
		return true;

	/*
	 * Enforce dma_alloc_coherent when memory encryption is active as well
	 * for the same reasons as for Xen paravirtual hosts.
	 */
	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
		return true;

	for (tmp = iomem_resource.child; tmp; tmp = tmp->sibling)
		max_iomem = max(max_iomem, tmp->end);

	return max_iomem > ((u64)1 << dma_bits);
}
EXPORT_SYMBOL(drm_need_swiotlb);

static void memcpy_fallback(struct iosys_map *dst,
			    const struct iosys_map *src,
			    unsigned long len)
{
	if (!dst->is_iomem && !src->is_iomem) {
		memcpy(dst->vaddr, src->vaddr, len);
	} else if (!src->is_iomem) {
		iosys_map_memcpy_to(dst, 0, src->vaddr, len);
	} else if (!dst->is_iomem) {
		memcpy_fromio(dst->vaddr, src->vaddr_iomem, len);
	} else {
		/*
		 * Bounce size is not performance tuned, but using a
		 * bounce buffer like this is significantly faster than
		 * resorting to ioreadxx() + iowritexx().
		 */
		char bounce[MEMCPY_BOUNCE_SIZE];
		void __iomem *_src = src->vaddr_iomem;
		void __iomem *_dst = dst->vaddr_iomem;

		while (len >= MEMCPY_BOUNCE_SIZE) {
			memcpy_fromio(bounce, _src, MEMCPY_BOUNCE_SIZE);
			memcpy_toio(_dst, bounce, MEMCPY_BOUNCE_SIZE);
			_src += MEMCPY_BOUNCE_SIZE;
			_dst += MEMCPY_BOUNCE_SIZE;
			len -= MEMCPY_BOUNCE_SIZE;
		}
		if (len) {
			/* Only copy the remaining tail, not a full bounce buffer. */
			memcpy_fromio(bounce, _src, len);
			memcpy_toio(_dst, bounce, len);
		}
	}
}

#ifdef CONFIG_X86

static DEFINE_STATIC_KEY_FALSE(has_movntdqa);

static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
{
	kernel_fpu_begin();

	while (len >= 4) {
		asm("movntdqa   (%0), %%xmm0\n"
		    "movntdqa 16(%0), %%xmm1\n"
		    "movntdqa 32(%0), %%xmm2\n"
		    "movntdqa 48(%0), %%xmm3\n"
		    "movaps %%xmm0,   (%1)\n"
		    "movaps %%xmm1, 16(%1)\n"
		    "movaps %%xmm2, 32(%1)\n"
		    "movaps %%xmm3, 48(%1)\n"
		    :: "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
		len -= 4;
	}
	while (len--) {
		asm("movntdqa (%0), %%xmm0\n"
		    "movaps %%xmm0, (%1)\n"
		    :: "r" (src), "r" (dst) : "memory");
		src += 16;
		dst += 16;
	}

	kernel_fpu_end();
}

/*
 * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
 * non-temporal instructions where available.
 * Note that all arguments (@src, @dst) must be aligned to 16 bytes and
 * @len must be a multiple of 16 for the non-temporal path to be used;
 * otherwise the copy falls back to a plain memcpy().
 */
static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
		memcpy(dst, src, len);
	else if (likely(len))
		__memcpy_ntdqa(dst, src, len >> 4);
}

/**
 * drm_memcpy_from_wc - Perform the fastest available memcpy from a source
 * that may be WC.
 * @dst: The destination pointer
 * @src: The source pointer
 * @len: The size of the area to transfer in bytes
 *
 * Tries an arch-optimized memcpy suited for reading out of a WC region,
 * and if no such beast is available, falls back to a normal memcpy.
 */
void drm_memcpy_from_wc(struct iosys_map *dst,
			const struct iosys_map *src,
			unsigned long len)
{
	if (WARN_ON(in_interrupt())) {
		memcpy_fallback(dst, src, len);
		return;
	}

	if (static_branch_likely(&has_movntdqa)) {
		__drm_memcpy_from_wc(dst->is_iomem ?
				     (void __force *)dst->vaddr_iomem :
				     dst->vaddr,
				     src->is_iomem ?
				     (void const __force *)src->vaddr_iomem :
				     src->vaddr,
				     len);
		return;
	}

	memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

/*
 * drm_memcpy_init_early - One time initialization of the WC memcpy code
 */
void drm_memcpy_init_early(void)
{
	/*
	 * Some hypervisors (e.g. KVM) don't support VEX-prefix instruction
	 * emulation, so don't enable movntdqa in hypervisor guests.
	 */
	if (static_cpu_has(X86_FEATURE_XMM4_1) &&
	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
		static_branch_enable(&has_movntdqa);
}
#else
void drm_memcpy_from_wc(struct iosys_map *dst,
			const struct iosys_map *src,
			unsigned long len)
{
	WARN_ON(in_interrupt());

	memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

void drm_memcpy_init_early(void)
{
}
#endif /* CONFIG_X86 */
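
/*
 * Usage sketch (illustrative only, not part of this file): how a driver might
 * flush CPU caches after writing through a cacheable kernel mapping and before
 * handing the memory to a non-coherent device. The buffer-object type and
 * fields (struct my_bo, ->pages, ->vmap, ->size) are hypothetical; only
 * drm_clflush_pages() and drm_clflush_virt_range() are the real helpers above.
 *
 *	static void my_bo_finish_cpu_write(struct my_bo *bo)
 *	{
 *		// Flush by struct page when only the page array is at hand.
 *		drm_clflush_pages(bo->pages, bo->size >> PAGE_SHIFT);
 *	}
 *
 *	static void my_bo_finish_partial_write(struct my_bo *bo,
 *					       unsigned long offset,
 *					       unsigned long len)
 *	{
 *		// Flush just the dirtied byte range of a kernel vmap.
 *		drm_clflush_virt_range(bo->vmap + offset, len);
 *	}
 *
 * On x86 this becomes clflushopt loops bracketed by memory barriers (or the
 * wbinvd_on_all_cpus() fallback); on unsupported architectures it only warns.
 */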
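
/*
 * Usage sketch (illustrative only): reading back from a write-combined BAR
 * mapping with drm_memcpy_from_wc(). drm_memcpy_init_early() must already
 * have run once (the DRM core calls it during module init) so the movntdqa
 * static key reflects CPU support. The names vram_base, buf and read_back()
 * are hypothetical.
 *
 *	static void read_back(void __iomem *vram_base, void *buf, size_t size)
 *	{
 *		struct iosys_map src, dst;
 *
 *		iosys_map_set_vaddr_iomem(&src, vram_base);
 *		iosys_map_set_vaddr(&dst, buf);
 *
 *		// Uses non-temporal loads on x86 when available, otherwise the
 *		// bounce-buffered memcpy_fromio()/memcpy() fallback.
 *		drm_memcpy_from_wc(&dst, &src, size);
 *	}
 */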