Path: blob/21.2-virgl/src/broadcom/common/v3d_cpu_tiling.h
/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4. The utile
 * layout stayed the same, though the way utiles get laid out has changed.
 */

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination. (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination. (st1 can only store one 8-byte lane
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Fallback: a utile is always 64 bytes, so copy it one
         * gpu_stride-sized row at a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
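
/* Editor's usage sketch, not part of the upstream header: a utile is
 * always 64 bytes of GPU memory, arranged as 64 / gpu_stride rows of
 * gpu_stride bytes each.  Assuming a linear-tile layout in which the
 * utiles of one row sit back to back in GPU memory, a caller could
 * untile a whole row like this.  The function name and the `num_utiles`
 * parameter are hypothetical; the real callers live in the v3d/vc4
 * resource code and derive the utile geometry from the format's cpp.
 */
static inline void
example_load_utile_row(void *cpu, uint32_t cpu_stride,
                       void *gpu, uint32_t gpu_stride,
                       uint32_t num_utiles)
{
        for (uint32_t i = 0; i < num_utiles; i++) {
                /* Each utile advances 64 bytes through the tiled source
                 * and gpu_stride bytes across the linear destination.
                 */
                v3d_load_utile(cpu + i * gpu_stride, cpu_stride,
                               gpu + i * 64, gpu_stride);
        }
}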
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source. (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source. (ld1 can only load one 8-byte lane
                         * at a time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Fallback: copy the 64-byte utile one gpu_stride-sized row at
         * a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
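
/* Editor's sketch of the store direction, with hypothetical names:
 * writing a CPU-rendered 4x4 block of 32 bpp pixels into one utile,
 * assuming the 4x4-pixel utile shape that 32 bpp formats use in the
 * vc4/v3d scheme.  At cpp == 4 a utile row is 4 pixels * 4 bytes =
 * 16 bytes, so the utile is 4 rows of 16 bytes and takes the
 * gpu_stride == 16 fast path on NEON/AArch64 builds; any other stride
 * falls through to the memcpy loop above.
 */
static inline void
example_store_one_utile_32bpp(void *gpu, const uint32_t pixels[4][4])
{
        /* Linear source rows are 16 bytes apart, matching the utile row. */
        v3d_store_utile(gpu, 16, (void *)pixels, 16);
}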