Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
godotengine
GitHub Repository: godotengine/godot
Path: blob/master/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
9903 views
1
// basisu_kernels_imp.h - Do not directly include
2
// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
//
8
// http://www.apache.org/licenses/LICENSE-2.0
9
//
10
// Unless required by applicable law or agreed to in writing, software
11
// distributed under the License is distributed on an "AS IS" BASIS,
12
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
// See the License for the specific language governing permissions and
14
// limitations under the License.
15
16
using namespace CPPSPMD;
17
18
namespace CPPSPMD_NAME(basisu_kernels_namespace)
19
{
20
struct perceptual_distance_rgb_4_N : spmd_kernel
21
{
22
void _call(int64_t* pDistance,
23
const uint8_t* pSelectors,
24
const color_rgba* pBlock_colors,
25
const color_rgba* pSrc_pixels, uint32_t n,
26
int64_t early_out_err)
27
{
28
assert(early_out_err >= 0);
29
30
*pDistance = 0;
31
32
__m128i block_colors[4];
33
vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
34
for (uint32_t i = 0; i < 4; i++)
35
{
36
block_colors[i] = load_rgba32(&pBlock_colors[i]);
37
store_all(block_colors_r[i], (int)pBlock_colors[i].r);
38
store_all(block_colors_g[i], (int)pBlock_colors[i].g);
39
store_all(block_colors_b[i], (int)pBlock_colors[i].b);
40
}
41
42
uint32_t i;
43
for (i = 0; (i + 4) <= n; i += 4)
44
{
45
__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
46
47
vint r, g, b, a;
48
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
49
50
int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
51
52
vint base_r, base_g, base_b, base_a;
53
if ((s0 == s1) && (s0 == s2) && (s0 == s3))
54
{
55
store_all(base_r, block_colors_r[s0]);
56
store_all(base_g, block_colors_g[s0]);
57
store_all(base_b, block_colors_b[s0]);
58
}
59
else
60
{
61
__m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
62
transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
63
}
64
65
vint dr = base_r - r;
66
vint dg = base_g - g;
67
vint db = base_b - b;
68
69
vint delta_l = dr * 27 + dg * 92 + db * 9;
70
vint delta_cr = dr * 128 - delta_l;
71
vint delta_cb = db * 128 - delta_l;
72
73
vint id = ((delta_l * delta_l) >> 7) +
74
((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
75
((((delta_cb * delta_cb) >> 7) * 3) >> 7);
76
77
*pDistance += reduce_add(id);
78
if (*pDistance >= early_out_err)
79
return;
80
}
81
82
for (; i < n; i++)
83
{
84
int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
85
86
int sel = pSelectors[i];
87
int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
88
89
int dr = base_r - r;
90
int dg = base_g - g;
91
int db = base_b - b;
92
93
int delta_l = dr * 27 + dg * 92 + db * 9;
94
int delta_cr = dr * 128 - delta_l;
95
int delta_cb = db * 128 - delta_l;
96
97
int id = ((delta_l * delta_l) >> 7) +
98
((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
99
((((delta_cb * delta_cb) >> 7) * 3) >> 7);
100
101
*pDistance += id;
102
if (*pDistance >= early_out_err)
103
return;
104
}
105
}
106
};
107
108
struct linear_distance_rgb_4_N : spmd_kernel
109
{
110
void _call(int64_t* pDistance,
111
const uint8_t* pSelectors,
112
const color_rgba* pBlock_colors,
113
const color_rgba* pSrc_pixels, uint32_t n,
114
int64_t early_out_err)
115
{
116
assert(early_out_err >= 0);
117
118
*pDistance = 0;
119
120
__m128i block_colors[4];
121
vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
122
for (uint32_t i = 0; i < 4; i++)
123
{
124
block_colors[i] = load_rgba32(&pBlock_colors[i]);
125
store_all(block_colors_r[i], (int)pBlock_colors[i].r);
126
store_all(block_colors_g[i], (int)pBlock_colors[i].g);
127
store_all(block_colors_b[i], (int)pBlock_colors[i].b);
128
}
129
130
uint32_t i;
131
for (i = 0; (i + 4) <= n; i += 4)
132
{
133
__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
134
135
vint r, g, b, a;
136
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
137
138
int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
139
140
vint base_r, base_g, base_b, base_a;
141
if ((s0 == s1) && (s0 == s2) && (s0 == s3))
142
{
143
store_all(base_r, block_colors_r[s0]);
144
store_all(base_g, block_colors_g[s0]);
145
store_all(base_b, block_colors_b[s0]);
146
}
147
else
148
{
149
__m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
150
transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
151
}
152
153
vint dr = base_r - r;
154
vint dg = base_g - g;
155
vint db = base_b - b;
156
157
vint id = dr * dr + dg * dg + db * db;
158
159
*pDistance += reduce_add(id);
160
if (*pDistance >= early_out_err)
161
return;
162
}
163
164
for (; i < n; i++)
165
{
166
int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
167
168
int sel = pSelectors[i];
169
int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
170
171
int dr = base_r - r;
172
int dg = base_g - g;
173
int db = base_b - b;
174
175
int id = dr * dr + dg * dg + db * db;
176
177
*pDistance += id;
178
if (*pDistance >= early_out_err)
179
return;
180
}
181
}
182
};
183
184
struct find_selectors_perceptual_rgb_4_N : spmd_kernel
185
{
186
inline vint compute_dist(
187
const vint& base_r, const vint& base_g, const vint& base_b,
188
const vint& r, const vint& g, const vint& b)
189
{
190
vint dr = base_r - r;
191
vint dg = base_g - g;
192
vint db = base_b - b;
193
194
vint delta_l = dr * 27 + dg * 92 + db * 9;
195
vint delta_cr = dr * 128 - delta_l;
196
vint delta_cb = db * 128 - delta_l;
197
198
vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
199
VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
200
VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
201
202
return id;
203
}
204
205
void _call(int64_t* pDistance,
206
uint8_t* pSelectors,
207
const color_rgba* pBlock_colors,
208
const color_rgba* pSrc_pixels, uint32_t n,
209
int64_t early_out_err)
210
{
211
assert(early_out_err >= 0);
212
213
*pDistance = 0;
214
215
vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
216
for (uint32_t i = 0; i < 4; i++)
217
{
218
store_all(block_colors_r[i], (int)pBlock_colors[i].r);
219
store_all(block_colors_g[i], (int)pBlock_colors[i].g);
220
store_all(block_colors_b[i], (int)pBlock_colors[i].b);
221
}
222
223
const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
224
225
uint32_t i;
226
227
for (i = 0; (i + 4) <= n; i += 4)
228
{
229
__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
230
231
vint r, g, b, a;
232
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
233
234
vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
235
vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
236
vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
237
vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
238
239
vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
240
241
vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
242
243
__m128i vsels = shuffle_epi8(sels.m_value, shuf);
244
storeu_si32((void *)(pSelectors + i), vsels);
245
246
*pDistance += reduce_add(min_dist);
247
if (*pDistance >= early_out_err)
248
return;
249
}
250
251
for (; i < n; i++)
252
{
253
int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
254
255
int best_err = INT_MAX, best_sel = 0;
256
for (int sel = 0; sel < 4; sel++)
257
{
258
int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
259
260
int dr = base_r - r;
261
int dg = base_g - g;
262
int db = base_b - b;
263
264
int delta_l = dr * 27 + dg * 92 + db * 9;
265
int delta_cr = dr * 128 - delta_l;
266
int delta_cb = db * 128 - delta_l;
267
268
int id = ((delta_l * delta_l) >> 7) +
269
((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
270
((((delta_cb * delta_cb) >> 7) * 3) >> 7);
271
if (id < best_err)
272
{
273
best_err = id;
274
best_sel = sel;
275
}
276
}
277
278
pSelectors[i] = (uint8_t)best_sel;
279
280
*pDistance += best_err;
281
if (*pDistance >= early_out_err)
282
return;
283
}
284
}
285
};
286
287
struct find_selectors_linear_rgb_4_N : spmd_kernel
288
{
289
inline vint compute_dist(
290
const vint& base_r, const vint& base_g, const vint& base_b,
291
const vint& r, const vint& g, const vint& b)
292
{
293
vint dr = base_r - r;
294
vint dg = base_g - g;
295
vint db = base_b - b;
296
297
vint id = dr * dr + dg * dg + db * db;
298
return id;
299
}
300
301
void _call(int64_t* pDistance,
302
uint8_t* pSelectors,
303
const color_rgba* pBlock_colors,
304
const color_rgba* pSrc_pixels, uint32_t n,
305
int64_t early_out_err)
306
{
307
assert(early_out_err >= 0);
308
309
*pDistance = 0;
310
311
vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
312
for (uint32_t i = 0; i < 4; i++)
313
{
314
store_all(block_colors_r[i], (int)pBlock_colors[i].r);
315
store_all(block_colors_g[i], (int)pBlock_colors[i].g);
316
store_all(block_colors_b[i], (int)pBlock_colors[i].b);
317
}
318
319
const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
320
321
uint32_t i;
322
323
for (i = 0; (i + 4) <= n; i += 4)
324
{
325
__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
326
327
vint r, g, b, a;
328
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
329
330
vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
331
vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
332
vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
333
vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
334
335
vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
336
337
vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
338
339
__m128i vsels = shuffle_epi8(sels.m_value, shuf);
340
storeu_si32((void *)(pSelectors + i), vsels);
341
342
*pDistance += reduce_add(min_dist);
343
if (*pDistance >= early_out_err)
344
return;
345
}
346
347
for (; i < n; i++)
348
{
349
int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
350
351
int best_err = INT_MAX, best_sel = 0;
352
for (int sel = 0; sel < 4; sel++)
353
{
354
int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
355
356
int dr = base_r - r;
357
int dg = base_g - g;
358
int db = base_b - b;
359
360
int id = dr * dr + dg * dg + db * db;
361
if (id < best_err)
362
{
363
best_err = id;
364
best_sel = sel;
365
}
366
}
367
368
pSelectors[i] = (uint8_t)best_sel;
369
370
*pDistance += best_err;
371
if (*pDistance >= early_out_err)
372
return;
373
}
374
}
375
};
376
377
struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel
378
{
379
inline vint compute_dist(
380
const vint& base_r, const vint& base_g, const vint& base_b,
381
const vint& r, const vint& g, const vint& b)
382
{
383
vint dr = base_r - r;
384
vint dg = base_g - g;
385
vint db = base_b - b;
386
387
vint delta_l = dr * 27 + dg * 92 + db * 9;
388
vint delta_cr = dr * 128 - delta_l;
389
vint delta_cb = db * 128 - delta_l;
390
391
vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
392
VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
393
VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
394
395
return id;
396
}
397
398
void _call(int64_t* pDistance,
399
const color_rgba* pBlock_colors,
400
const color_rgba* pSrc_pixels, uint32_t n,
401
int64_t early_out_error)
402
{
403
assert(early_out_error >= 0);
404
405
*pDistance = 0;
406
407
vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
408
for (uint32_t i = 0; i < 4; i++)
409
{
410
store_all(block_colors_r[i], (int)pBlock_colors[i].r);
411
store_all(block_colors_g[i], (int)pBlock_colors[i].g);
412
store_all(block_colors_b[i], (int)pBlock_colors[i].b);
413
}
414
415
uint32_t i;
416
417
for (i = 0; (i + 4) <= n; i += 4)
418
{
419
__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
420
421
vint r, g, b, a;
422
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
423
424
vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
425
vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
426
vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
427
vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
428
429
vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
430
431
*pDistance += reduce_add(min_dist);
432
if (*pDistance > early_out_error)
433
return;
434
}
435
436
for (; i < n; i++)
437
{
438
int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
439
440
int best_err = INT_MAX;
441
for (int sel = 0; sel < 4; sel++)
442
{
443
int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
444
445
int dr = base_r - r;
446
int dg = base_g - g;
447
int db = base_b - b;
448
449
int delta_l = dr * 27 + dg * 92 + db * 9;
450
int delta_cr = dr * 128 - delta_l;
451
int delta_cb = db * 128 - delta_l;
452
453
int id = ((delta_l * delta_l) >> 7) +
454
((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
455
((((delta_cb * delta_cb) >> 7) * 3) >> 7);
456
457
if (id < best_err)
458
{
459
best_err = id;
460
}
461
}
462
463
*pDistance += best_err;
464
if (*pDistance > early_out_error)
465
return;
466
}
467
}
468
};
469
470
struct find_lowest_error_linear_rgb_4_N : spmd_kernel
471
{
472
inline vint compute_dist(
473
const vint& base_r, const vint& base_g, const vint& base_b,
474
const vint& r, const vint& g, const vint& b)
475
{
476
vint dr = base_r - r;
477
vint dg = base_g - g;
478
vint db = base_b - b;
479
480
vint id = dr * dr + dg * dg + db * db;
481
482
return id;
483
}
484
485
void _call(int64_t* pDistance,
486
const color_rgba* pBlock_colors,
487
const color_rgba* pSrc_pixels, uint32_t n,
488
int64_t early_out_error)
489
{
490
assert(early_out_error >= 0);
491
492
*pDistance = 0;
493
494
vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
495
for (uint32_t i = 0; i < 4; i++)
496
{
497
store_all(block_colors_r[i], (int)pBlock_colors[i].r);
498
store_all(block_colors_g[i], (int)pBlock_colors[i].g);
499
store_all(block_colors_b[i], (int)pBlock_colors[i].b);
500
}
501
502
uint32_t i;
503
504
for (i = 0; (i + 4) <= n; i += 4)
505
{
506
__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
507
508
vint r, g, b, a;
509
transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
510
511
vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
512
vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
513
vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
514
vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
515
516
vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
517
518
*pDistance += reduce_add(min_dist);
519
if (*pDistance > early_out_error)
520
return;
521
}
522
523
for (; i < n; i++)
524
{
525
int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
526
527
int best_err = INT_MAX;
528
for (int sel = 0; sel < 4; sel++)
529
{
530
int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
531
532
int dr = base_r - r;
533
int dg = base_g - g;
534
int db = base_b - b;
535
536
int id = dr * dr + dg * dg + db * db;
537
538
if (id < best_err)
539
{
540
best_err = id;
541
}
542
}
543
544
*pDistance += best_err;
545
if (*pDistance > early_out_error)
546
return;
547
}
548
}
549
};
550
551
struct update_covar_matrix_16x16 : spmd_kernel
552
{
553
void _call(
554
uint32_t num_vecs, const void* pWeighted_vecs_void, const void* pOrigin_void, const uint32_t* pVec_indices, void* pMatrix16x16_void)
555
{
556
const std::pair<vec16F, uint64_t>* pWeighted_vecs = static_cast< const std::pair<vec16F, uint64_t> *>(pWeighted_vecs_void);
557
558
const float* pOrigin = static_cast<const float*>(pOrigin_void);
559
vfloat org0 = loadu_linear_all(pOrigin), org1 = loadu_linear_all(pOrigin + 4), org2 = loadu_linear_all(pOrigin + 8), org3 = loadu_linear_all(pOrigin + 12);
560
561
vfloat mat[16][4];
562
vfloat vzero(zero_vfloat());
563
564
for (uint32_t i = 0; i < 16; i++)
565
{
566
store_all(mat[i][0], vzero);
567
store_all(mat[i][1], vzero);
568
store_all(mat[i][2], vzero);
569
store_all(mat[i][3], vzero);
570
}
571
572
for (uint32_t k = 0; k < num_vecs; k++)
573
{
574
const uint32_t vec_index = pVec_indices[k];
575
576
const float* pW = pWeighted_vecs[vec_index].first.get_ptr();
577
vfloat weight((float)pWeighted_vecs[vec_index].second);
578
579
vfloat vec[4] = { loadu_linear_all(pW) - org0, loadu_linear_all(pW + 4) - org1, loadu_linear_all(pW + 8) - org2, loadu_linear_all(pW + 12) - org3 };
580
581
vfloat wvec0 = vec[0] * weight, wvec1 = vec[1] * weight, wvec2 = vec[2] * weight, wvec3 = vec[3] * weight;
582
583
for (uint32_t j = 0; j < 16; j++)
584
{
585
vfloat vx = ((const float*)vec)[j];
586
587
store_all(mat[j][0], mat[j][0] + vx * wvec0);
588
store_all(mat[j][1], mat[j][1] + vx * wvec1);
589
store_all(mat[j][2], mat[j][2] + vx * wvec2);
590
store_all(mat[j][3], mat[j][3] + vx * wvec3);
591
592
} // j
593
594
} // k
595
596
float* pMatrix = static_cast<float*>(pMatrix16x16_void);
597
598
float* pDst = pMatrix;
599
for (uint32_t i = 0; i < 16; i++)
600
{
601
storeu_linear_all(pDst, mat[i][0]);
602
storeu_linear_all(pDst + 4, mat[i][1]);
603
storeu_linear_all(pDst + 8, mat[i][2]);
604
storeu_linear_all(pDst + 12, mat[i][3]);
605
pDst += 16;
606
}
607
}
608
};
609
610
} // namespace
611
612
using namespace CPPSPMD_NAME(basisu_kernels_namespace);
613
614
// Free-function entry point: runs the perceptual_distance_rgb_4_N kernel through
// spmd_call, forwarding every argument unchanged.
void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(
	int64_t* pDistance,
	const uint8_t* pSelectors,
	const color_rgba* pBlock_colors,
	const color_rgba* pSrc_pixels,
	uint32_t n,
	int64_t early_out_err)
{
	spmd_call<perceptual_distance_rgb_4_N>(
		pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
}
618
619
// Free-function entry point: runs the linear_distance_rgb_4_N kernel through
// spmd_call, forwarding every argument unchanged.
void CPPSPMD_NAME(linear_distance_rgb_4_N)(
	int64_t* pDistance,
	const uint8_t* pSelectors,
	const color_rgba* pBlock_colors,
	const color_rgba* pSrc_pixels,
	uint32_t n,
	int64_t early_out_err)
{
	spmd_call<linear_distance_rgb_4_N>(
		pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
}
623
624
// Free-function entry point: runs the find_selectors_perceptual_rgb_4_N kernel through
// spmd_call, forwarding every argument unchanged.
void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(
	int64_t* pDistance,
	uint8_t* pSelectors,
	const color_rgba* pBlock_colors,
	const color_rgba* pSrc_pixels,
	uint32_t n,
	int64_t early_out_err)
{
	spmd_call<find_selectors_perceptual_rgb_4_N>(
		pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
}
628
629
// Free-function entry point: runs the find_selectors_linear_rgb_4_N kernel through
// spmd_call, forwarding every argument unchanged.
void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(
	int64_t* pDistance,
	uint8_t* pSelectors,
	const color_rgba* pBlock_colors,
	const color_rgba* pSrc_pixels,
	uint32_t n,
	int64_t early_out_err)
{
	spmd_call<find_selectors_linear_rgb_4_N>(
		pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
}
633
634
// Free-function entry point: runs the find_lowest_error_perceptual_rgb_4_N kernel
// through spmd_call, forwarding every argument unchanged.
void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(
	int64_t* pDistance,
	const color_rgba* pBlock_colors,
	const color_rgba* pSrc_pixels,
	uint32_t n,
	int64_t early_out_error)
{
	spmd_call<find_lowest_error_perceptual_rgb_4_N>(
		pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
}
638
639
// Free-function entry point: runs the find_lowest_error_linear_rgb_4_N kernel through
// spmd_call, forwarding every argument unchanged.
void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(
	int64_t* pDistance,
	const color_rgba* pBlock_colors,
	const color_rgba* pSrc_pixels,
	uint32_t n,
	int64_t early_out_error)
{
	spmd_call<find_lowest_error_linear_rgb_4_N>(
		pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
}
643
644
// Free-function entry point: runs the update_covar_matrix_16x16 kernel through
// spmd_call, forwarding every argument unchanged. The pointers are untyped here;
// the kernel casts them to their concrete types.
void CPPSPMD_NAME(update_covar_matrix_16x16)(
	uint32_t num_vecs,
	const void* pWeighted_vecs,
	const void* pOrigin,
	const uint32_t* pVec_indices,
	void* pMatrix16x16)
{
	spmd_call<update_covar_matrix_16x16>(
		num_vecs, pWeighted_vecs, pOrigin, pVec_indices, pMatrix16x16);
}
648
649