CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
hrydgard

CoCalc provides the best real-time collaborative environment for Jupyter Notebooks, LaTeX documents, and SageMath, scalable from individual users to large groups and classes!

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/IndexGenerator.cpp
Views: 1401
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include <cstring>
19
20
#include "ppsspp_config.h"
21
22
#include "Common/CPUDetect.h"
23
#include "Common/Common.h"
24
#include "Common/Log.h"
25
26
#ifdef _M_SSE
27
#include <emmintrin.h>
28
#endif
29
#if PPSSPP_ARCH(ARM_NEON)
30
31
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
32
#include <arm64_neon.h>
33
#else
34
#include <arm_neon.h>
35
#endif
36
#endif
37
38
#include "GPU/Common/IndexGenerator.h"
39
40
// Points don't need indexing...
41
const u8 IndexGenerator::indexedPrimitiveType[7] = {
42
GE_PRIM_POINTS,
43
GE_PRIM_LINES,
44
GE_PRIM_LINES,
45
GE_PRIM_TRIANGLES,
46
GE_PRIM_TRIANGLES,
47
GE_PRIM_TRIANGLES,
48
GE_PRIM_RECTANGLES,
49
};
50
51
// Points the generator at a new output buffer and then calls Reset().
// inds: destination buffer that generated 16-bit indices are written into.
void IndexGenerator::Setup(u16 *inds) {
	indsBase_ = inds;
	// Reset() is defined elsewhere; presumably it rewinds the write cursor to indsBase_.
	Reset();
}
55
56
void IndexGenerator::AddPrim(int prim, int vertexCount, int indexOffset, bool clockwise) {
57
switch (prim) {
58
case GE_PRIM_POINTS: AddPoints(vertexCount, indexOffset); break;
59
case GE_PRIM_LINES: AddLineList(vertexCount, indexOffset); break;
60
case GE_PRIM_LINE_STRIP: AddLineStrip(vertexCount, indexOffset); break;
61
case GE_PRIM_TRIANGLES: AddList(vertexCount, indexOffset, clockwise); break;
62
case GE_PRIM_TRIANGLE_STRIP: AddStrip(vertexCount, indexOffset, clockwise); break;
63
case GE_PRIM_TRIANGLE_FAN: AddFan(vertexCount, indexOffset, clockwise); break;
64
case GE_PRIM_RECTANGLES: AddRectangles(vertexCount, indexOffset); break; // Same
65
}
66
}
67
68
void IndexGenerator::AddPoints(int numVerts, int indexOffset) {
69
u16 *outInds = inds_;
70
for (int i = 0; i < numVerts; i++)
71
*outInds++ = indexOffset + i;
72
inds_ = outInds;
73
}
74
75
void IndexGenerator::AddList(int numVerts, int indexOffset, bool clockwise) {
76
u16 *outInds = inds_;
77
const int v1 = clockwise ? 1 : 2;
78
const int v2 = clockwise ? 2 : 1;
79
for (int i = 0; i < numVerts; i += 3) {
80
*outInds++ = indexOffset + i;
81
*outInds++ = indexOffset + i + v1;
82
*outInds++ = indexOffset + i + v2;
83
}
84
inds_ = outInds;
85
}
86
87
// Index pattern for the first 8 triangles of a clockwise triangle strip, relative to the
// first vertex. Even triangle i is (i, i+1, i+2); odd triangle i is (i+1, i, i+2).
// 16-byte aligned because AddStrip reads it with aligned 128-bit loads.
alignas(16) static const u16 offsets_clockwise[24] = {
	0, 1, 2,
	2, 1, 3,
	2, 3, 4,
	4, 3, 5,
	4, 5, 6,
	6, 5, 7,
	6, 7, 8,
	8, 7, 9,
};
97
98
// Index pattern for the first 8 triangles of a counter-clockwise triangle strip, relative
// to the first vertex. Even triangle i is (i, i+2, i+1); odd triangle i is (i, i+1, i+2).
// 16-byte aligned because AddStrip reads it with aligned 128-bit loads.
alignas(16) static const uint16_t offsets_counter_clockwise[24] = {
	0, 2, 1,
	1, 2, 3,
	2, 4, 3,
	3, 4, 5,
	4, 6, 5,
	5, 6, 7,
	6, 8, 7,
	7, 8, 9,
};
108
109
void IndexGenerator::AddStrip(int numVerts, int indexOffset, bool clockwise) {
110
int numTris = numVerts - 2;
111
if (numTris <= 0) {
112
return;
113
}
114
#ifdef _M_SSE
115
// In an SSE2 register we can fit 8 16-bit integers.
116
// However, we need to output a multiple of 3 indices.
117
// The first such multiple is 24, which means we'll generate 24 indices per cycle,
118
// which corresponds to 8 triangles. That's pretty cool.
119
120
// We allow ourselves to write some extra indices to avoid the fallback loop.
121
// That's alright as we're appending to a buffer - they will get overwritten anyway.
122
__m128i ibase8 = _mm_set1_epi16(indexOffset);
123
const __m128i *offsets = (const __m128i *)(clockwise ? offsets_clockwise : offsets_counter_clockwise);
124
__m128i *dst = (__m128i *)inds_;
125
__m128i offsets0 = _mm_add_epi16(ibase8, _mm_load_si128(offsets));
126
// A single store is always enough for two triangles, which is a very common case.
127
_mm_storeu_si128(dst, offsets0);
128
if (numTris > 2) {
129
__m128i offsets1 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 1));
130
_mm_storeu_si128(dst + 1, offsets1);
131
if (numTris > 5) {
132
__m128i offsets2 = _mm_add_epi16(ibase8, _mm_load_si128(offsets + 2));
133
_mm_storeu_si128(dst + 2, offsets2);
134
__m128i increment = _mm_set1_epi16(8);
135
int numChunks = (numTris + 7) >> 3;
136
for (int i = 1; i < numChunks; i++) {
137
dst += 3;
138
offsets0 = _mm_add_epi16(offsets0, increment);
139
offsets1 = _mm_add_epi16(offsets1, increment);
140
offsets2 = _mm_add_epi16(offsets2, increment);
141
_mm_storeu_si128(dst, offsets0);
142
_mm_storeu_si128(dst + 1, offsets1);
143
_mm_storeu_si128(dst + 2, offsets2);
144
}
145
}
146
}
147
inds_ += numTris * 3;
148
// wind doesn't need to be updated, an even number of triangles have been drawn.
149
#elif PPSSPP_ARCH(ARM_NEON)
150
uint16x8_t ibase8 = vdupq_n_u16(indexOffset);
151
const u16 *offsets = clockwise ? offsets_clockwise : offsets_counter_clockwise;
152
u16 *dst = inds_;
153
uint16x8_t offsets0 = vaddq_u16(ibase8, vld1q_u16(offsets));
154
vst1q_u16(dst, offsets0);
155
if (numTris > 2) {
156
uint16x8_t offsets1 = vaddq_u16(ibase8, vld1q_u16(offsets + 8));
157
vst1q_u16(dst + 8, offsets1);
158
if (numTris > 5) {
159
uint16x8_t offsets2 = vaddq_u16(ibase8, vld1q_u16(offsets + 16));
160
vst1q_u16(dst + 16, offsets2);
161
uint16x8_t increment = vdupq_n_u16(8);
162
int numChunks = (numTris + 7) >> 3;
163
for (int i = 1; i < numChunks; i++) {
164
dst += 3 * 8;
165
offsets0 = vaddq_u16(offsets0, increment);
166
offsets1 = vaddq_u16(offsets1, increment);
167
offsets2 = vaddq_u16(offsets2, increment);
168
vst1q_u16(dst, offsets0);
169
vst1q_u16(dst + 8, offsets1);
170
vst1q_u16(dst + 16, offsets2);
171
}
172
}
173
}
174
inds_ += numTris * 3;
175
#else
176
// Slow fallback loop.
177
int wind = clockwise ? 1 : 2;
178
int ibase = indexOffset;
179
size_t numPairs = numTris / 2;
180
u16 *outInds = inds_;
181
while (numPairs > 0) {
182
*outInds++ = ibase;
183
*outInds++ = ibase + wind;
184
*outInds++ = ibase + (wind ^ 3);
185
*outInds++ = ibase + 1;
186
*outInds++ = ibase + 1 + (wind ^ 3);
187
*outInds++ = ibase + 1 + wind;
188
ibase += 2;
189
numPairs--;
190
}
191
if (numTris & 1) {
192
*outInds++ = ibase;
193
*outInds++ = ibase + wind;
194
wind ^= 3; // toggle between 1 and 2
195
*outInds++ = ibase + wind;
196
}
197
inds_ = outInds;
198
#endif
199
}
200
201
void IndexGenerator::AddFan(int numVerts, int indexOffset, bool clockwise) {
202
const int numTris = numVerts - 2;
203
u16 *outInds = inds_;
204
const int v1 = clockwise ? 1 : 2;
205
const int v2 = clockwise ? 2 : 1;
206
for (int i = 0; i < numTris; i++) {
207
*outInds++ = indexOffset;
208
*outInds++ = indexOffset + i + v1;
209
*outInds++ = indexOffset + i + v2;
210
}
211
inds_ = outInds;
212
}
213
214
//Lines
215
void IndexGenerator::AddLineList(int numVerts, int indexOffset) {
216
u16 *outInds = inds_;
217
numVerts &= ~1;
218
for (int i = 0; i < numVerts; i += 2) {
219
*outInds++ = indexOffset + i;
220
*outInds++ = indexOffset + i + 1;
221
}
222
inds_ = outInds;
223
}
224
225
void IndexGenerator::AddLineStrip(int numVerts, int indexOffset) {
226
const int numLines = numVerts - 1;
227
u16 *outInds = inds_;
228
for (int i = 0; i < numLines; i++) {
229
*outInds++ = indexOffset + i;
230
*outInds++ = indexOffset + i + 1;
231
}
232
inds_ = outInds;
233
}
234
235
void IndexGenerator::AddRectangles(int numVerts, int indexOffset) {
236
u16 *outInds = inds_;
237
//rectangles always need 2 vertices, disregard the last one if there's an odd number
238
numVerts = numVerts & ~1;
239
for (int i = 0; i < numVerts; i += 2) {
240
*outInds++ = indexOffset + i;
241
*outInds++ = indexOffset + i + 1;
242
}
243
inds_ = outInds;
244
}
245
246
template <class ITypeLE>
247
void IndexGenerator::TranslatePoints(int numInds, const ITypeLE *inds, int indexOffset) {
248
u16 *outInds = inds_;
249
for (int i = 0; i < numInds; i++)
250
*outInds++ = indexOffset + inds[i];
251
inds_ = outInds;
252
}
253
254
template <class ITypeLE>
255
void IndexGenerator::TranslateLineList(int numInds, const ITypeLE *inds, int indexOffset) {
256
u16 *outInds = inds_;
257
numInds = numInds & ~1;
258
for (int i = 0; i < numInds; i += 2) {
259
*outInds++ = indexOffset + inds[i];
260
*outInds++ = indexOffset + inds[i + 1];
261
}
262
inds_ = outInds;
263
}
264
265
template <class ITypeLE>
266
void IndexGenerator::TranslateLineStrip(int numInds, const ITypeLE *inds, int indexOffset) {
267
int numLines = numInds - 1;
268
u16 *outInds = inds_;
269
for (int i = 0; i < numLines; i++) {
270
*outInds++ = indexOffset + inds[i];
271
*outInds++ = indexOffset + inds[i + 1];
272
}
273
inds_ = outInds;
274
}
275
276
template <class ITypeLE>
277
void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
278
// We only bother doing this minor optimization in triangle list, since it's by far the most
279
// common operation that can benefit.
280
if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) {
281
memcpy(inds_, inds, numInds * sizeof(ITypeLE));
282
inds_ += numInds;
283
} else {
284
u16 *outInds = inds_;
285
int numTris = numInds / 3; // Round to whole triangles
286
numInds = numTris * 3;
287
const int v1 = clockwise ? 1 : 2;
288
const int v2 = clockwise ? 2 : 1;
289
// TODO: This can actually be SIMD-d, although will need complex shuffles if clockwise.
290
for (int i = 0; i < numInds; i += 3) {
291
*outInds++ = indexOffset + inds[i];
292
*outInds++ = indexOffset + inds[i + v1];
293
*outInds++ = indexOffset + inds[i + v2];
294
}
295
inds_ = outInds;
296
}
297
}
298
299
template <class ITypeLE>
300
void IndexGenerator::TranslateStrip(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
301
int wind = clockwise ? 1 : 2;
302
int numTris = numInds - 2;
303
u16 *outInds = inds_;
304
for (int i = 0; i < numTris; i++) {
305
*outInds++ = indexOffset + inds[i];
306
*outInds++ = indexOffset + inds[i + wind];
307
wind ^= 3; // Toggle between 1 and 2
308
*outInds++ = indexOffset + inds[i + wind];
309
}
310
inds_ = outInds;
311
}
312
313
template <class ITypeLE>
314
void IndexGenerator::TranslateFan(int numInds, const ITypeLE *inds, int indexOffset, bool clockwise) {
315
if (numInds <= 0) return;
316
int numTris = numInds - 2;
317
u16 *outInds = inds_;
318
const int v1 = clockwise ? 1 : 2;
319
const int v2 = clockwise ? 2 : 1;
320
for (int i = 0; i < numTris; i++) {
321
*outInds++ = indexOffset + inds[0];
322
*outInds++ = indexOffset + inds[i + v1];
323
*outInds++ = indexOffset + inds[i + v2];
324
}
325
inds_ = outInds;
326
}
327
328
template <class ITypeLE>
329
inline void IndexGenerator::TranslateRectangles(int numInds, const ITypeLE *inds, int indexOffset) {
330
u16 *outInds = inds_;
331
//rectangles always need 2 vertices, disregard the last one if there's an odd number
332
numInds = numInds & ~1;
333
for (int i = 0; i < numInds; i += 2) {
334
*outInds++ = indexOffset + inds[i];
335
*outInds++ = indexOffset + inds[i+1];
336
}
337
inds_ = outInds;
338
}
339
340
// Could template this too, but would have to define in header.
341
void IndexGenerator::TranslatePrim(int prim, int numInds, const u8 *inds, int indexOffset, bool clockwise) {
342
switch (prim) {
343
case GE_PRIM_POINTS: TranslatePoints<u8>(numInds, inds, indexOffset); break;
344
case GE_PRIM_LINES: TranslateLineList<u8>(numInds, inds, indexOffset); break;
345
case GE_PRIM_LINE_STRIP: TranslateLineStrip<u8>(numInds, inds, indexOffset); break;
346
case GE_PRIM_TRIANGLES: TranslateList<u8>(numInds, inds, indexOffset, clockwise); break;
347
case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u8>(numInds, inds, indexOffset, clockwise); break;
348
case GE_PRIM_TRIANGLE_FAN: TranslateFan<u8>(numInds, inds, indexOffset, clockwise); break;
349
case GE_PRIM_RECTANGLES: TranslateRectangles<u8>(numInds, inds, indexOffset); break; // Same
350
}
351
}
352
353
void IndexGenerator::TranslatePrim(int prim, int numInds, const u16_le *inds, int indexOffset, bool clockwise) {
354
switch (prim) {
355
case GE_PRIM_POINTS: TranslatePoints<u16_le>(numInds, inds, indexOffset); break;
356
case GE_PRIM_LINES: TranslateLineList<u16_le>(numInds, inds, indexOffset); break;
357
case GE_PRIM_LINE_STRIP: TranslateLineStrip<u16_le>(numInds, inds, indexOffset); break;
358
case GE_PRIM_TRIANGLES: TranslateList<u16_le>(numInds, inds, indexOffset, clockwise); break;
359
case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u16_le>(numInds, inds, indexOffset, clockwise); break;
360
case GE_PRIM_TRIANGLE_FAN: TranslateFan<u16_le>(numInds, inds, indexOffset, clockwise); break;
361
case GE_PRIM_RECTANGLES: TranslateRectangles<u16_le>(numInds, inds, indexOffset); break; // Same
362
}
363
}
364
365
void IndexGenerator::TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise) {
366
switch (prim) {
367
case GE_PRIM_POINTS: TranslatePoints<u32_le>(numInds, inds, indexOffset); break;
368
case GE_PRIM_LINES: TranslateLineList<u32_le>(numInds, inds, indexOffset); break;
369
case GE_PRIM_LINE_STRIP: TranslateLineStrip<u32_le>(numInds, inds, indexOffset); break;
370
case GE_PRIM_TRIANGLES: TranslateList<u32_le>(numInds, inds, indexOffset, clockwise); break;
371
case GE_PRIM_TRIANGLE_STRIP: TranslateStrip<u32_le>(numInds, inds, indexOffset, clockwise); break;
372
case GE_PRIM_TRIANGLE_FAN: TranslateFan<u32_le>(numInds, inds, indexOffset, clockwise); break;
373
case GE_PRIM_RECTANGLES: TranslateRectangles<u32_le>(numInds, inds, indexOffset); break; // Same
374
}
375
}
376
377