Path: blob/21.2-virgl/src/broadcom/common/v3d_cpu_tiling.h
/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4. The utile
 * layout stayed the same, though the way utiles get laid out has changed.
 */

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination. (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination. (st1 can only store one 8-byte lane
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Fallback: a utile is always 64 bytes, so copy it one
         * gpu_stride-sized row at a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
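
/* Editor's usage sketch, not part of the upstream header: a utile is
 * always 64 bytes of GPU memory, arranged as 64 / gpu_stride rows of
 * gpu_stride bytes each.  Assuming a linear-tile layout in which the
 * utiles of one row sit back to back in GPU memory, a caller could
 * untile a whole row like this.  The function name and the `num_utiles`
 * parameter are hypothetical; the real callers live in the v3d/vc4
 * resource code and derive the utile geometry from the format's cpp.
 */
static inline void
example_load_utile_row(void *cpu, uint32_t cpu_stride,
                       void *gpu, uint32_t gpu_stride,
                       uint32_t num_utiles)
{
        for (uint32_t i = 0; i < num_utiles; i++) {
                /* Each utile advances 64 bytes through the tiled source
                 * and gpu_stride bytes across the linear destination.
                 */
                v3d_load_utile(cpu + i * gpu_stride, cpu_stride,
                               gpu + i * 64, gpu_stride);
        }
}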
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source. (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source. (ld1 can only load one 8-byte lane
                         * at a time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Fallback: copy the 64-byte utile one gpu_stride-sized row at
         * a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
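
/* Editor's sketch of the store direction, with hypothetical names:
 * writing a CPU-rendered 4x4 block of 32 bpp pixels into one utile,
 * assuming the 4x4-pixel utile shape that 32 bpp formats use in the
 * vc4/v3d scheme.  At cpp == 4 a utile row is 4 pixels * 4 bytes =
 * 16 bytes, so the utile is 4 rows of 16 bytes and takes the
 * gpu_stride == 16 fast path on NEON/AArch64 builds; any other stride
 * falls through to the memcpy loop above.
 */
static inline void
example_store_one_utile_32bpp(void *gpu, const uint32_t pixels[4][4])
{
        /* Linear source rows are 16 bytes apart, matching the utile row. */
        v3d_store_utile(gpu, 16, (void *)pixels, 16);
}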