GitHub Repository: PojavLauncherTeam/mesa
Path: blob/21.2-virgl/src/broadcom/common/v3d_cpu_tiling.h
/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains utile load/store functions common to both v3d and vc4.  The byte
 * layout within a utile stayed the same from vc4 to v3d, though the way
 * utiles are arranged within a surface has changed.
 */

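/* Note (inferred from the code below rather than stated explicitly): a utile
 * is 64 bytes of GPU memory.  gpu_stride is the length in bytes of one utile
 * row (the NEON fast paths handle 8 and 16, and the generic fallback copies
 * gpu_stride bytes per row until 64 bytes have been moved), while cpu_stride
 * is the row pitch of the linear CPU-side surface being copied to or from.
 */
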
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination (each vst1 here stores a single
                         * d-register).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination (each st1 here stores a single 64-bit
                         * lane).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Generic fallback: copy the 64-byte utile one gpu_stride-sized row
         * at a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
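
/* Hypothetical usage sketch (this function, its parameters, and the stride of
 * 16 are assumptions for illustration only): copy one utile out of GPU memory
 * into its place in a linear CPU-side surface.  With a 16-byte utile row, the
 * utile covers 4 rows of 16 bytes (4 * 16 == 64).
 */
static inline void
v3d_load_utile_example(void *linear, uint32_t linear_stride,
                       uint32_t x_bytes, uint32_t y, void *gpu_utile)
{
        v3d_load_utile((char *)linear + y * linear_stride + x_bytes,
                       linear_stride, gpu_utile, 16);
}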

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source (each vld1 here loads a single d-register).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source (each ld1 here loads a single 64-bit lane).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Generic fallback: copy the 64-byte utile one gpu_stride-sized row
         * at a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
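
/* Hypothetical usage sketch (the function name and the stride of 8 are
 * assumptions for illustration only): round-trip one utile through a packed
 * staging buffer.  v3d_load_utile() and v3d_store_utile() move the same
 * 64 bytes in opposite directions, so the store writes back exactly what
 * the load read.
 */
static inline void
v3d_copy_utile_example(void *dst_gpu, void *src_gpu)
{
        uint8_t linear[64];

        /* De-tile the source utile into a packed 64-byte buffer... */
        v3d_load_utile(linear, 8, src_gpu, 8);
        /* ...then re-tile it into the destination utile. */
        v3d_store_utile(dst_gpu, 8, linear, 8);
}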