/* FAudio - XAudio Reimplementation for FNA
 *
 * Copyright (c) 2011-2024 Ethan Lee, Luigi Auriemma, and the MonoGame Team
 *
 * This software is provided 'as-is', without any express or implied warranty.
 * In no event will the authors be held liable for any damages arising from
 * the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 * claim that you wrote the original software. If you use this software in a
 * product, an acknowledgment in the product documentation would be
 * appreciated but is not required.
 *
 * 2. Altered source versions must be plainly marked as such, and must not be
 * misrepresented as being the original software.
 *
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * Ethan "flibitijibibo" Lee <[email protected]>
 *
 */

#include "FAudio_internal.h"

/* SECTION 0: SSE/NEON Detection */

/* The SSE/NEON detection comes from MojoAL:
 * https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
 */

#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
/* Some platforms fail to define this... */
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif

/* AArch64 guarantees NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__x86_64__) || defined(_M_X64)
/* Some platforms fail to define this... */
#ifndef __SSE2__
#define __SSE2__ 1
#endif

/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__ && !defined(__POWERPC__)
/* Some build systems may need to specify this. */
#if !defined(__SSE2__) && !defined(__ARM_NEON__)
#error macOS does not have SSE2/NEON? Bad compiler?
#endif

/* Mac OS X/Intel guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
/* Need plain C implementations to support all other hardware */
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif

/* Our NEON paths require AArch64, don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif


#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif

/* SECTION 1: Type Converters */

/* The SSE/NEON converters are based on SDL_audiotypecvt:
 * https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
 */

#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f

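/* These constants are the reciprocals that normalize integer samples
 * to [-1.0, 1.0]:
 *   DIVBY128     = 1/128         (u8 samples are biased by 128)
 *   DIVBY32768   = 1/32768       (s16 full scale)
 *   DIVBY8388607 = 1/(2^23 - 1)  (s24 full scale, after s32 >> 8)
 * Worked u8 example: 0 -> 0/128 - 1 = -1.0f, 128 -> 0.0f,
 * 255 -> 255/128 - 1 = 0.9921875f.
 */
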
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        uint32_t i;
        for (i = 0; i < len; i += 1)
        {
                *dst++ = (*src++ * DIVBY128) - 1.0f;
        }
}

void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        uint32_t i;
        for (i = 0; i < len; i += 1)
        {
                *dst++ = *src++ * DIVBY32768;
        }
}

void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        uint32_t i;
        for (i = 0; i < len; i += 1)
        {
                *dst++ = (*src++ >> 8) * DIVBY8388607;
        }
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

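/* Every S32_To_F32 path (scalar above, SIMD below) shifts the low 8 bits
 * out first: a 32-bit sample right-shifted by 8 is a 24-bit value that
 * fits in a float's 24-bit significand without rounding, and
 * 1/(2^23 - 1) then scales it to roughly [-1.0, 1.0]. Worked example:
 * 0x7FFFFFFF >> 8 = 8388607, and 8388607 * DIVBY8388607 ~= 1.0f.
 */
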
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
        }

        src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
                const __m128i *mmsrc = (const __m128i *) src;
                const __m128i zero = _mm_setzero_si128();
                const __m128 divby128 = _mm_set1_ps(DIVBY128);
                const __m128 minus1 = _mm_set1_ps(-1.0f);
                while (i >= 16) { /* 16 * 8-bit */
                        const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
                        /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
                        const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
                        /* right-shift-zero-extend gets us uint16 with the other set of values. */
                        const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
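                        /* Worked example for one 16-bit lane holding bytes
                         * [lo = 0xAA, hi = 0xBB], i.e. 0xBBAA:
                         *   slli(8) -> 0xAA00, then srli(8) -> 0x00AA (even byte);
                         *   srli(8) alone -> 0x00BB (odd byte).
                         * Each lane is now a zero-extended uint16.
                         */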
                        /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
                        /* Note that FMA (AVX2-era hardware) can do the floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
                        const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
                        const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
                        const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
                        const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
                        /* Interleave back into correct order, store. */
                        _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
                        _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
                        _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
                        _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
                        i -= 16; mmsrc--; dst -= 16;
                }

                src = (const uint8_t *) mmsrc;
        }

        src += 15; dst += 15; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
                *dst = ((float) *src) * DIVBY32768;
        }

        src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
                const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
                while (i >= 8) { /* 8 * 16-bit */
                        const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
                        /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
                        const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
                        /* right-shift-sign-extend gets us sint32 with the other set of values. */
                        const __m128i b = _mm_srai_epi32(ints, 16);
                        /* Interleave these back into the right order, convert to float, multiply, store. */
                        _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
                        _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
                        i -= 8; src -= 8; dst -= 8;
                }
        }

        src += 7; dst += 7; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) *src) * DIVBY32768;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;

        /* Get dst aligned to 16 bytes */
        for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
        }

        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
                const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
                const __m128i *mmsrc = (const __m128i *) src;
                while (i >= 4) { /* 4 * sint32 */
                        /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
                        _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
                        i -= 4; mmsrc++; dst += 4;
                }
                src = (const int32_t *) mmsrc;
        }

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
                i--; src++; dst++;
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
        }

        src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
                const uint8_t *mmsrc = (const uint8_t *) src;
                const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
                const float32x4_t negone = vdupq_n_f32(-1.0f);
                while (i >= 16) { /* 16 * 8-bit */
                        const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
                        const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
                        const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
                        /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
                        vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
                        vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
                        vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
                        vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
                        i -= 16; mmsrc -= 16; dst -= 16;
                }

                src = (const uint8_t *) mmsrc;
        }

        src += 15; dst += 15; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
                *dst = ((float) *src) * DIVBY32768;
        }

        src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
                const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
                while (i >= 8) { /* 8 * 16-bit */
                        const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
                        /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
                        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
                        vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
                        i -= 8; src -= 8; dst -= 8;
                }
        }

        src += 7; dst += 7; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) *src) * DIVBY32768;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;

        /* Get dst aligned to 16 bytes */
        for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
        }

        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
                const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
                const int32_t *mmsrc = (const int32_t *) src;
                while (i >= 4) { /* 4 * sint32 */
                        /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
                        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
                        i -= 4; mmsrc += 4; dst += 4;
                }
                src = (const int32_t *) mmsrc;
        }

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
                i--; src++; dst++;
        }
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 2: Linear Resamplers */

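/* The resamplers below step through the source in unsigned fixed point:
 * the high bits of *resampleOffset are the whole sample index and the low
 * FIXED_PRECISION bits are the fraction used as the lerp weight (the
 * comments below note the fraction is a 32-bit value). Worked example for
 * a 2:1 upsample, i.e. resampleStep = one half in fixed point: cur
 * alternates 0.0, 0.5, 0.0, 0.5, ... and dCache advances by
 * (cur >> FIXED_PRECISION), i.e. by one source sample on every second
 * output sample.
 */
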
void FAudio_INTERNAL_ResampleGeneric(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t channels
) {
        uint32_t i, j;
        uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < toResample; i += 1)
        {
                for (j = 0; j < channels; j += 1)
                {
                        /* lerp, then convert to float value */
                        *resampleCache++ = (float) (
                                dCache[j] +
                                (dCache[j + channels] - dCache[j]) *
                                FIXED_TO_DOUBLE(cur)
                        );
                }

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur >> FIXED_PRECISION) * channels;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur &= FIXED_FRACTION_MASK;
        }
}

#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_ResampleMono_Scalar(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i;
        uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < toResample; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_DOUBLE(cur)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur &= FIXED_FRACTION_MASK;
        }
}

void FAudio_INTERNAL_ResampleStereo_Scalar(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i;
        uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < toResample; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_DOUBLE(cur)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_DOUBLE(cur)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur &= FIXED_FRACTION_MASK;
        }
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

/* The SSE2 versions of the resamplers come from @8thMage! */

#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_ResampleMono_SSE2(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
        float *dCache_1, *dCache_2, *dCache_3;
        uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        __m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
                current, next, sub, cur_fixed, mul, res;
        __m128i cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 4;
        if (header == 4)
        {
                header = 0;
        }
        for (i = 0; i < header; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac is the fractional part of cur for 4 samples. Since the
         * fractional part is a 32-bit unsigned value, additions simply wrap,
         * so the modulo that keeps only the fractional part is implicit.
         * 0.5 is subtracted here so the value survives the signed
         * int-to-float convert (there is no unsigned convert); the 0.5 is
         * added back later.
         */
        cur_frac = _mm_set1_epi32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
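        /* Worked equation for the bias trick above: for a 32-bit fraction f,
         * f - DOUBLE_TO_FIXED(0.5) reinterpreted as signed fits the
         * signed-only _mm_cvtepi32_ps, and
         *   (f - FIXED_ONE/2) * (1/FIXED_ONE) + 0.5 = f / FIXED_ONE,
         * which is exactly the lerp weight. E.g. f = FIXED_ONE/2 (one half)
         * gives 0 * (1/FIXED_ONE) + 0.5 = 0.5.
         */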
        adder_frac = _mm_setr_epi32(
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
        );
        cur_frac = _mm_add_epi32(cur_frac, adder_frac);

        /* The various cur_scalar values are for the different samples
         * (1, 2, 3 compared to original cur_scalar = 0)
         */
        cur_scalar_1 = cur_scalar + resampleStep;
        cur_scalar_2 = cur_scalar + resampleStep * 2;
        cur_scalar_3 = cur_scalar + resampleStep * 3;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
        dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
        dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
        cur_scalar &= FIXED_FRACTION_MASK;
        cur_scalar_1 &= FIXED_FRACTION_MASK;
        cur_scalar_2 &= FIXED_FRACTION_MASK;
        cur_scalar_3 &= FIXED_FRACTION_MASK;

        /* FIXME: These should be _mm_undefined_ps! */
        current_next_0_1 = _mm_setzero_ps();
        current_next_2_3 = _mm_setzero_ps();

        /* Constants */
        one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
        half = _mm_set1_ps(0.5f);
        adder_frac_loop = _mm_set1_epi32(
                (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 4;
        for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
        {
                /* current_next_0_1 and current_next_2_3 hold (sample, sample + 1)
                 * pairs that are separated into current and next below.
                 */

                current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
                current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
                current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
                current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);

                /* Unpack them to have separate current and next in 2 vectors. */
                current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b10001000 */
                next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b11011101 */

                sub = _mm_sub_ps(next, current);

                /* Convert the fractional part to float, then multiply to get
                 * the fractions out. Then add back the 0.5 we subtracted before.
                 */
                cur_fixed = _mm_add_ps(
                        _mm_mul_ps(
                                _mm_cvtepi32_ps(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = _mm_mul_ps(sub, cur_fixed);
                res = _mm_add_ps(current, mul);

                /* Store back */
                _mm_store_ps(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 4;
                cur_scalar_1 += resampleStep * 4;
                cur_scalar_2 += resampleStep * 4;
                cur_scalar_3 += resampleStep * 4;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION);
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
                dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
                dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;
                cur_scalar_2 &= FIXED_FRACTION_MASK;
                cur_scalar_3 &= FIXED_FRACTION_MASK;

                cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}

void FAudio_INTERNAL_ResampleStereo_SSE2(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar, cur_scalar_1;
        float *dCache_1;
        __m128 one_over_fixed_one, half, current_next_1, current_next_2,
                current, next, sub, cur_fixed, mul, res;
        __m128i cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 8;
        if (header == 2)
        {
                header = 0;
        }
        cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < header; i += 2)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac holds the fractional part of cur.
         * To avoid duplication, see the mono version for a thorough
         * explanation.
         */
        cur_frac = _mm_set1_epi32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        adder_frac = _mm_setr_epi32(
                0,
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
        );
        cur_frac = _mm_add_epi32(cur_frac, adder_frac);

        /* dCache_1 is the pointer for dcache in the next resample pos. */
        cur_scalar_1 = cur_scalar + resampleStep;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
        cur_scalar_1 &= FIXED_FRACTION_MASK;

        one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
        half = _mm_set1_ps(0.5f);
        adder_frac_loop = _mm_set1_epi32(
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 2;
        for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
        {
                /* current_next_1 and current_next_2 each hold 4 source sample
                 * points, yielding 4 dest samples at the end. current_next_1
                 * holds (current_ch_1, current_ch_2, next_ch_1, next_ch_2) for
                 * the first resample position; current_next_2 holds the same
                 * for the second resample position.
                 */
                current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
                current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */

                /* Unpack them to get the current and the next in separate vectors. */
                current = _mm_castpd_ps(
                        _mm_unpacklo_pd(
                                _mm_castps_pd(current_next_1),
                                _mm_castps_pd(current_next_2)
                        )
                );
                next = _mm_castpd_ps(
                        _mm_unpackhi_pd(
                                _mm_castps_pd(current_next_1),
                                _mm_castps_pd(current_next_2)
                        )
                );

                sub = _mm_sub_ps(next, current);

                /* Add the 0.5 back; see the mono version for the full explanation. */
                cur_fixed = _mm_add_ps(
                        _mm_mul_ps(
                                _mm_cvtepi32_ps(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = _mm_mul_ps(sub, cur_fixed);
                res = _mm_add_ps(current, mul);

                /* Store the results */
                _mm_store_ps(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 2;
                cur_scalar_1 += resampleStep * 2;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;

                cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_ResampleMono_NEON(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
        float *dCache_1, *dCache_2, *dCache_3;
        uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
                current, next, sub, cur_fixed, mul, res;
        int32x4_t cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 4;
        if (header == 4)
        {
                header = 0;
        }
        for (i = 0; i < header; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac is the fractional part of cur for 4 samples. Since the
         * fractional part is a 32-bit unsigned value, additions simply wrap,
         * so the modulo that keeps only the fractional part is implicit.
         * 0.5 is subtracted here so the value survives the signed
         * int-to-float convert (mirroring the SSE2 path); the 0.5 is
         * added back later.
         */
        cur_frac = vdupq_n_s32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        ALIGN(int32_t, 16) data[4] =
        {
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
        };
        adder_frac = vld1q_s32(data);
        cur_frac = vaddq_s32(cur_frac, adder_frac);

        /* The various cur_scalar values are for the different samples
         * (1, 2, 3 compared to original cur_scalar = 0)
         */
        cur_scalar_1 = cur_scalar + resampleStep;
        cur_scalar_2 = cur_scalar + resampleStep * 2;
        cur_scalar_3 = cur_scalar + resampleStep * 3;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
        dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
        dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
        cur_scalar &= FIXED_FRACTION_MASK;
        cur_scalar_1 &= FIXED_FRACTION_MASK;
        cur_scalar_2 &= FIXED_FRACTION_MASK;
        cur_scalar_3 &= FIXED_FRACTION_MASK;

        /* Constants */
        one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
        half = vdupq_n_f32(0.5f);
        adder_frac_loop = vdupq_n_s32(
                (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 4;
        for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
        {
                /* current_next_0_1 and current_next_2_3 hold (sample, sample + 1)
                 * pairs that are separated into current and next below.
                 */
                current_next_0_1 = vcombine_f32(
                        vld1_f32(dCache),
                        vld1_f32(dCache_1)
                );
                current_next_2_3 = vcombine_f32(
                        vld1_f32(dCache_2),
                        vld1_f32(dCache_3)
                );

                /* Unpack them to have separate current and next in 2 vectors. */
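                /* vuzp1q takes the even-indexed lanes and vuzp2q the odd:
                 * with current_next_0_1 = [c0, n0, c1, n1] and
                 * current_next_2_3 = [c2, n2, c3, n3], this yields
                 * current = [c0, c1, c2, c3] and next = [n0, n1, n2, n3].
                 */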
                current = vuzp1q_f32(current_next_0_1, current_next_2_3);
                next = vuzp2q_f32(current_next_0_1, current_next_2_3);

                sub = vsubq_f32(next, current);

                /* Convert the fractional part to float, then multiply to get
                 * the fractions out. Then add back the 0.5 we subtracted before.
                 */
                cur_fixed = vaddq_f32(
                        vmulq_f32(
                                vcvtq_f32_s32(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = vmulq_f32(sub, cur_fixed);
                res = vaddq_f32(current, mul);

                /* Store back */
                vst1q_f32(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 4;
                cur_scalar_1 += resampleStep * 4;
                cur_scalar_2 += resampleStep * 4;
                cur_scalar_3 += resampleStep * 4;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION);
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
                dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
                dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;
                cur_scalar_2 &= FIXED_FRACTION_MASK;
                cur_scalar_3 &= FIXED_FRACTION_MASK;

                cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}

void FAudio_INTERNAL_ResampleStereo_NEON(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t channels
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar, cur_scalar_1;
        float *dCache_1;
        float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
        int32x4_t cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 8;
        if (header == 2)
        {
                header = 0;
        }
        cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < header; i += 2)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac holds the fractional part of cur.
         * To avoid duplication, see the mono version for a thorough
         * explanation.
         */
        cur_frac = vdupq_n_s32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        ALIGN(int32_t, 16) data[4] =
        {
                0,
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
        };
        adder_frac = vld1q_s32(data);
        cur_frac = vaddq_s32(cur_frac, adder_frac);

        /* dCache_1 is the pointer for dcache in the next resample pos. */
        cur_scalar_1 = cur_scalar + resampleStep;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
        cur_scalar_1 &= FIXED_FRACTION_MASK;

        one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
        half = vdupq_n_f32(0.5f);
        adder_frac_loop = vdupq_n_s32(
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 2;
        for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
        {
                /* current holds the current frame for both resample positions:
                 * (A1, B1) from dCache and (A3, B3) from dCache_1. next holds
                 * the following frame for each: (A2, B2) and (A4, B4).
                 * Together they yield 4 dest samples (2 frames) per iteration.
                 */
                current = vcombine_f32(
                        vld1_f32(dCache), /* A1B1 */
                        vld1_f32(dCache_1) /* A3B3 */
                );
                next = vcombine_f32(
                        vld1_f32(dCache + 2), /* A2B2 */
                        vld1_f32(dCache_1 + 2) /* A4B4 */
                );

                sub = vsubq_f32(next, current);

                /* Add the 0.5 back; see the mono version for the full explanation. */
                cur_fixed = vaddq_f32(
                        vmulq_f32(
                                vcvtq_f32_s32(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = vmulq_f32(sub, cur_fixed);
                res = vaddq_f32(current, mul);

                /* Store the results */
                vst1q_f32(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 2;
                cur_scalar_1 += resampleStep * 2;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;

                cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 3: Amplifiers */

#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Amplify_Scalar(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i;
        for (i = 0; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

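/* The SIMD amplifiers below split the buffer into a scalar "header" up to
 * the first 16-byte boundary, an aligned SIMD middle, and a scalar "tail".
 * Worked example for the header math: if output % 16 == 8, then
 * header = (16 - 8) / 4 = 2 floats; if output is already aligned,
 * (16 - 0) / 4 = 4, which the header == 4 check resets to 0.
 */
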
/* The SSE2 version of the amplifier comes from @8thMage! */

#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Amplify_SSE2(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i;
        uint32_t header = (16 - (((size_t) output) % 16)) / 4;
        uint32_t tail = (totalSamples - header) % 4;
        __m128 volumeVec, outVec;
        if (header == 4)
        {
                header = 0;
        }
        if (tail == 4)
        {
                tail = 0;
        }

        for (i = 0; i < header; i += 1)
        {
                output[i] *= volume;
        }

        volumeVec = _mm_set1_ps(volume);
        for (i = header; i < totalSamples - tail; i += 4)
        {
                outVec = _mm_load_ps(output + i);
                outVec = _mm_mul_ps(outVec, volumeVec);
                _mm_store_ps(output + i, outVec);
        }

        for (i = totalSamples - tail; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Amplify_NEON(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i;
        uint32_t header = (16 - (((size_t) output) % 16)) / 4;
        uint32_t tail = (totalSamples - header) % 4;
        float32x4_t volumeVec, outVec;
        if (header == 4)
        {
                header = 0;
        }
        if (tail == 4)
        {
                tail = 0;
        }

        for (i = 0; i < header; i += 1)
        {
                output[i] *= volume;
        }

        volumeVec = vdupq_n_f32(volume);
        for (i = header; i < totalSamples - tail; i += 4)
        {
                outVec = vld1q_f32(output + i);
                outVec = vmulq_f32(outVec, volumeVec);
                vst1q_f32(output + i, outVec);
        }

        for (i = totalSamples - tail; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 4: Mixer Functions */

void FAudio_INTERNAL_Mix_Generic_Scalar(
        uint32_t toMix,
        uint32_t srcChans,
        uint32_t dstChans,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i, co, ci;
        for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
        for (co = 0; co < dstChans; co += 1)
        {
                for (ci = 0; ci < srcChans; ci += 1)
                {
                        dst[co] += (
                                src[ci] *
                                coefficients[co * srcChans + ci]
                        );
                }
        }
}

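/* The coefficient matrix is row-major, one row per output channel:
 * coefficients[co * srcChans + ci] scales input channel ci into output
 * channel co. Worked 2-in/2-out example:
 *   dst[0] += src[0] * c[0] + src[1] * c[1];
 *   dst[1] += src[0] * c[2] + src[1] * c[3];
 * which matches the unrolled FAudio_INTERNAL_Mix_2in_2out_Scalar below.
 */
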
#if HAVE_SSE2_INTRINSICS
/* SSE horizontal add by Peter Cordes, CC-BY-SA.
 * From https://stackoverflow.com/a/35270026 */
static inline float FAudio_simd_hadd(__m128 v)
{
        __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
        __m128 sums = _mm_add_ps(v, shuf);
        shuf = _mm_movehl_ps(shuf, sums);
        sums = _mm_add_ss(sums, shuf);
        return _mm_cvtss_f32(sums);
}

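/* Lane walkthrough for the horizontal add above, with v = [a, b, c, d]:
 *   shuf = [b, a, d, c]      (_MM_SHUFFLE(2, 3, 0, 1) swaps within pairs)
 *   sums = [a+b, a+b, c+d, c+d]
 *   shuf = [c+d, c+d, d, c]  (movehl moves sums' high half into the low half)
 *   sums = [(a+b)+(c+d), ...] (add_ss only touches lane 0)
 * so _mm_cvtss_f32 returns a + b + c + d.
 */
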
void FAudio_INTERNAL_Mix_Generic_SSE2(
        uint32_t toMix,
        uint32_t srcChans,
        uint32_t dstChans,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i, co, ci;
        for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
        for (co = 0; co < dstChans; co += 1)
        {
                for (ci = 0; srcChans - ci >= 4; ci += 4)
                {
                        /* do SIMD */
                        const __m128 vols = _mm_loadu_ps(&coefficients[co * srcChans + ci]);
                        const __m128 dat = _mm_loadu_ps(&src[ci]);
                        dst[co] += FAudio_simd_hadd(_mm_mul_ps(dat, vols));
                }

                for (; ci < srcChans; ci += 1)
                {
                        /* do scalar */
                        dst[co] += (
                                src[ci] *
                                coefficients[co * srcChans + ci]
                        );
                }
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

void FAudio_INTERNAL_Mix_1in_1out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 1)
        {
                /* Base source data, combined with the coefficients */
                dst[0] += src[0] * coefficients[0];
        }
}

void FAudio_INTERNAL_Mix_1in_2out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 2)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];
        }
}

void FAudio_INTERNAL_Mix_1in_6out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 6)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];
                dst[2] += src[0] * coefficients[2];
                dst[3] += src[0] * coefficients[3];
                dst[4] += src[0] * coefficients[4];
                dst[5] += src[0] * coefficients[5];
        }
}

void FAudio_INTERNAL_Mix_1in_8out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 8)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];
                dst[2] += src[0] * coefficients[2];
                dst[3] += src[0] * coefficients[3];
                dst[4] += src[0] * coefficients[4];
                dst[5] += src[0] * coefficients[5];
                dst[6] += src[0] * coefficients[6];
                dst[7] += src[0] * coefficients[7];
        }
}

void FAudio_INTERNAL_Mix_2in_1out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 1)
        {
                /* Base source data, combined with the coefficients */
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
        }
}

void FAudio_INTERNAL_Mix_2in_2out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 2)
        {
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
                dst[1] += (
                        (src[0] * coefficients[2]) +
                        (src[1] * coefficients[3])
                );
        }
}

void FAudio_INTERNAL_Mix_2in_6out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 6)
        {
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
                dst[1] += (
                        (src[0] * coefficients[2]) +
                        (src[1] * coefficients[3])
                );
                dst[2] += (
                        (src[0] * coefficients[4]) +
                        (src[1] * coefficients[5])
                );
                dst[3] += (
                        (src[0] * coefficients[6]) +
                        (src[1] * coefficients[7])
                );
                dst[4] += (
                        (src[0] * coefficients[8]) +
                        (src[1] * coefficients[9])
                );
                dst[5] += (
                        (src[0] * coefficients[10]) +
                        (src[1] * coefficients[11])
                );
        }
}

void FAudio_INTERNAL_Mix_2in_8out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 8)
        {
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
                dst[1] += (
                        (src[0] * coefficients[2]) +
                        (src[1] * coefficients[3])
                );
                dst[2] += (
                        (src[0] * coefficients[4]) +
                        (src[1] * coefficients[5])
                );
                dst[3] += (
                        (src[0] * coefficients[6]) +
                        (src[1] * coefficients[7])
                );
                dst[4] += (
                        (src[0] * coefficients[8]) +
                        (src[1] * coefficients[9])
                );
                dst[5] += (
                        (src[0] * coefficients[10]) +
                        (src[1] * coefficients[11])
                );
                dst[6] += (
                        (src[0] * coefficients[12]) +
                        (src[1] * coefficients[13])
                );
                dst[7] += (
                        (src[0] * coefficients[14]) +
                        (src[1] * coefficients[15])
                );
        }
}

/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */

void (*FAudio_INTERNAL_Convert_U8_To_F32)(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
);

FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;

void (*FAudio_INTERNAL_Amplify)(
        float *output,
        uint32_t totalSamples,
        float volume
);

FAudioMixCallback FAudio_INTERNAL_Mix_Generic;

void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
        if (hasSSE2)
        {
                FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
                FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
                FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
                FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
                FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
                FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
                FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_SSE2;
                return;
        }
#endif
#if HAVE_NEON_INTRINSICS
        if (hasNEON)
        {
                FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
                FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
                FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
                FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
                FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
                FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
                FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
                return;
        }
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
        FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
        FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
        FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
        FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
        FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
        FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
        FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
#else
        FAudio_assert(0 && "Need converter functions!");
#endif
}

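#if 0
/* Illustrative sketch, not part of FAudio: one plausible way a host could
 * drive this dispatcher, assuming SDL2's CPU feature queries are available.
 * FAudio's platform layer is what actually supplies these flags.
 */
#include <SDL_cpuinfo.h>

static void Example_InitDispatch(void)
{
        FAudio_INTERNAL_InitSIMDFunctions(
                (uint8_t) SDL_HasSSE2(), /* hasSSE2 */
                (uint8_t) SDL_HasNEON()  /* hasNEON */
        );
}
#endif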

/* vim: set noexpandtab shiftwidth=8 tabstop=8: */