/* FAudio - XAudio Reimplementation for FNA
 *
 * Copyright (c) 2011-2024 Ethan Lee, Luigi Auriemma, and the MonoGame Team
 *
 * This software is provided 'as-is', without any express or implied warranty.
 * In no event will the authors be held liable for any damages arising from
 * the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 * claim that you wrote the original software. If you use this software in a
 * product, an acknowledgment in the product documentation would be
 * appreciated but is not required.
 *
 * 2. Altered source versions must be plainly marked as such, and must not be
 * misrepresented as being the original software.
 *
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * Ethan "flibitijibibo" Lee <[email protected]>
 *
 */

#include "FAudio_internal.h"

/* SECTION 0: SSE/NEON Detection */

/* The SSE/NEON detection comes from MojoAL:
 * https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
 */

#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
/* Some platforms fail to define this... */
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif

/* AArch64 guarantees NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__x86_64__) || defined(_M_X64)
/* Some platforms fail to define this... */
#ifndef __SSE2__
#define __SSE2__ 1
#endif

/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__ && !defined(__POWERPC__)
/* Some build systems may need to specify this. */
#if !defined(__SSE2__) && !defined(__ARM_NEON__)
#error macOS does not have SSE2/NEON? Bad compiler?
#endif

/* Mac OS X/Intel guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
/* Need plain C implementations to support all other hardware */
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif

/* Our NEON paths require AArch64, don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif


#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif

/* SECTION 1: Type Converters */

/* The SSE/NEON converters are based on SDL_audiotypecvt:
 * https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
 */

#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f

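/* These constants are the reciprocals that normalize integer samples
 * to [-1.0, 1.0]:
 *   DIVBY128     = 1/128         (u8 samples are biased by 128)
 *   DIVBY32768   = 1/32768       (s16 full scale)
 *   DIVBY8388607 = 1/(2^23 - 1)  (s24 full scale, after s32 >> 8)
 * Worked u8 example: 0 -> 0/128 - 1 = -1.0f, 128 -> 0.0f,
 * 255 -> 255/128 - 1 = 0.9921875f.
 */
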
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        uint32_t i;
        for (i = 0; i < len; i += 1)
        {
                *dst++ = (*src++ * DIVBY128) - 1.0f;
        }
}

void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        uint32_t i;
        for (i = 0; i < len; i += 1)
        {
                *dst++ = *src++ * DIVBY32768;
        }
}

void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        uint32_t i;
        for (i = 0; i < len; i += 1)
        {
                *dst++ = (*src++ >> 8) * DIVBY8388607;
        }
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

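/* Every S32_To_F32 path (scalar above, SIMD below) shifts the low 8 bits
 * out first: a 32-bit sample right-shifted by 8 is a 24-bit value that
 * fits in a float's 24-bit significand without rounding, and
 * 1/(2^23 - 1) then scales it to roughly [-1.0, 1.0]. Worked example:
 * 0x7FFFFFFF >> 8 = 8388607, and 8388607 * DIVBY8388607 ~= 1.0f.
 */
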
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
        }

        src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
                const __m128i *mmsrc = (const __m128i *) src;
                const __m128i zero = _mm_setzero_si128();
                const __m128 divby128 = _mm_set1_ps(DIVBY128);
                const __m128 minus1 = _mm_set1_ps(-1.0f);
                while (i >= 16) { /* 16 * 8-bit */
                        const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
                        /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
                        const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
                        /* right-shift-zero-extend gets us uint16 with the other set of values. */
                        const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
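                        /* Worked example for one 16-bit lane holding bytes
                         * [lo = 0xAA, hi = 0xBB], i.e. 0xBBAA:
                         *   slli(8) -> 0xAA00, then srli(8) -> 0x00AA (even byte);
                         *   srli(8) alone -> 0x00BB (odd byte).
                         * Each lane is now a zero-extended uint16.
                         */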
                        /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
                        /* Note that FMA (AVX2-era hardware) can do the floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
                        const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
                        const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
                        const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
                        const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
                        /* Interleave back into correct order, store. */
                        _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
                        _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
                        _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
                        _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
                        i -= 16; mmsrc--; dst -= 16;
                }

                src = (const uint8_t *) mmsrc;
        }

        src += 15; dst += 15; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
                *dst = ((float) *src) * DIVBY32768;
        }

        src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
                const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
                while (i >= 8) { /* 8 * 16-bit */
                        const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
                        /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
                        const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
                        /* right-shift-sign-extend gets us sint32 with the other set of values. */
                        const __m128i b = _mm_srai_epi32(ints, 16);
                        /* Interleave these back into the right order, convert to float, multiply, store. */
                        _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
                        _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
                        i -= 8; src -= 8; dst -= 8;
                }
        }

        src += 7; dst += 7; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) *src) * DIVBY32768;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;

        /* Get dst aligned to 16 bytes */
        for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
        }

        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
                const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
                const __m128i *mmsrc = (const __m128i *) src;
                while (i >= 4) { /* 4 * sint32 */
                        /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
                        _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
                        i -= 4; mmsrc++; dst += 4;
                }
                src = (const int32_t *) mmsrc;
        }

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
                i--; src++; dst++;
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
        }

        src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
                const uint8_t *mmsrc = (const uint8_t *) src;
                const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
                const float32x4_t negone = vdupq_n_f32(-1.0f);
                while (i >= 16) { /* 16 * 8-bit */
                        const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
                        const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
                        const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
                        /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
                        vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
                        vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
                        vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
                        vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
                        i -= 16; mmsrc -= 16; dst -= 16;
                }

                src = (const uint8_t *) mmsrc;
        }

        src += 15; dst += 15; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = (((float) *src) * DIVBY128) - 1.0f;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;
        src += len - 1;
        dst += len - 1;

        /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
        for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
                *dst = ((float) *src) * DIVBY32768;
        }

        src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
                const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
                while (i >= 8) { /* 8 * 16-bit */
                        const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
                        /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
                        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
                        vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
                        i -= 8; src -= 8; dst -= 8;
                }
        }

        src += 7; dst += 7; /* adjust for any scalar finishing. */

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) *src) * DIVBY32768;
                i--; src--; dst--;
        }
}

void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
) {
        int i;

        /* Get dst aligned to 16 bytes */
        for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
        }

        FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

        /* Make sure src is aligned too. */
        if ((((size_t) src) & 15) == 0) {
                /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
                const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
                const int32_t *mmsrc = (const int32_t *) src;
                while (i >= 4) { /* 4 * sint32 */
                        /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
                        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
                        i -= 4; mmsrc += 4; dst += 4;
                }
                src = (const int32_t *) mmsrc;
        }

        /* Finish off any leftovers with scalar operations. */
        while (i) {
                *dst = ((float) (*src>>8)) * DIVBY8388607;
                i--; src++; dst++;
        }
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 2: Linear Resamplers */

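/* The resamplers below step through the source in unsigned fixed point:
 * the high bits of *resampleOffset are the whole sample index and the low
 * FIXED_PRECISION bits are the fraction used as the lerp weight (the
 * comments below note the fraction is a 32-bit value). Worked example for
 * a 2:1 upsample, i.e. resampleStep = one half in fixed point: cur
 * alternates 0.0, 0.5, 0.0, 0.5, ... and dCache advances by
 * (cur >> FIXED_PRECISION), i.e. by one source sample on every second
 * output sample.
 */
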
void FAudio_INTERNAL_ResampleGeneric(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t channels
) {
        uint32_t i, j;
        uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < toResample; i += 1)
        {
                for (j = 0; j < channels; j += 1)
                {
                        /* lerp, then convert to float value */
                        *resampleCache++ = (float) (
                                dCache[j] +
                                (dCache[j + channels] - dCache[j]) *
                                FIXED_TO_DOUBLE(cur)
                        );
                }

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur >> FIXED_PRECISION) * channels;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur &= FIXED_FRACTION_MASK;
        }
}

#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_ResampleMono_Scalar(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i;
        uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < toResample; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_DOUBLE(cur)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur &= FIXED_FRACTION_MASK;
        }
}

void FAudio_INTERNAL_ResampleStereo_Scalar(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i;
        uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < toResample; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_DOUBLE(cur)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_DOUBLE(cur)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur &= FIXED_FRACTION_MASK;
        }
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

/* The SSE2 versions of the resamplers come from @8thMage! */

#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_ResampleMono_SSE2(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
        float *dCache_1, *dCache_2, *dCache_3;
        uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        __m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
                current, next, sub, cur_fixed, mul, res;
        __m128i cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 4;
        if (header == 4)
        {
                header = 0;
        }
        for (i = 0; i < header; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac is the fractional part of cur for 4 samples. Since the
         * fractional part is a 32-bit unsigned value, additions simply wrap,
         * so the modulo that keeps only the fractional part is implicit.
         * 0.5 is subtracted here so the value survives the signed
         * int-to-float convert (there is no unsigned convert); the 0.5 is
         * added back later.
         */
        cur_frac = _mm_set1_epi32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
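        /* Worked equation for the bias trick above: for a 32-bit fraction f,
         * f - DOUBLE_TO_FIXED(0.5) reinterpreted as signed fits the
         * signed-only _mm_cvtepi32_ps, and
         *   (f - FIXED_ONE/2) * (1/FIXED_ONE) + 0.5 = f / FIXED_ONE,
         * which is exactly the lerp weight. E.g. f = FIXED_ONE/2 (one half)
         * gives 0 * (1/FIXED_ONE) + 0.5 = 0.5.
         */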
        adder_frac = _mm_setr_epi32(
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
        );
        cur_frac = _mm_add_epi32(cur_frac, adder_frac);

        /* The various cur_scalar values are for the different samples
         * (1, 2, 3 compared to original cur_scalar = 0)
         */
        cur_scalar_1 = cur_scalar + resampleStep;
        cur_scalar_2 = cur_scalar + resampleStep * 2;
        cur_scalar_3 = cur_scalar + resampleStep * 3;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
        dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
        dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
        cur_scalar &= FIXED_FRACTION_MASK;
        cur_scalar_1 &= FIXED_FRACTION_MASK;
        cur_scalar_2 &= FIXED_FRACTION_MASK;
        cur_scalar_3 &= FIXED_FRACTION_MASK;

        /* FIXME: These should be _mm_undefined_ps! */
        current_next_0_1 = _mm_setzero_ps();
        current_next_2_3 = _mm_setzero_ps();

        /* Constants */
        one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
        half = _mm_set1_ps(0.5f);
        adder_frac_loop = _mm_set1_epi32(
                (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 4;
        for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
        {
                /* current_next_0_1 and current_next_2_3 hold (sample, sample + 1)
                 * pairs that are separated into current and next below.
                 */

                current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
                current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
                current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
                current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);

                /* Unpack them to have separate current and next in 2 vectors. */
                current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b10001000 */
                next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b11011101 */

                sub = _mm_sub_ps(next, current);

                /* Convert the fractional part to float, then multiply to get
                 * the fractions out. Then add back the 0.5 we subtracted before.
                 */
                cur_fixed = _mm_add_ps(
                        _mm_mul_ps(
                                _mm_cvtepi32_ps(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = _mm_mul_ps(sub, cur_fixed);
                res = _mm_add_ps(current, mul);

                /* Store back */
                _mm_store_ps(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 4;
                cur_scalar_1 += resampleStep * 4;
                cur_scalar_2 += resampleStep * 4;
                cur_scalar_3 += resampleStep * 4;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION);
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
                dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
                dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;
                cur_scalar_2 &= FIXED_FRACTION_MASK;
                cur_scalar_3 &= FIXED_FRACTION_MASK;

                cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}

void FAudio_INTERNAL_ResampleStereo_SSE2(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar, cur_scalar_1;
        float *dCache_1;
        __m128 one_over_fixed_one, half, current_next_1, current_next_2,
                current, next, sub, cur_fixed, mul, res;
        __m128i cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 8;
        if (header == 2)
        {
                header = 0;
        }
        cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < header; i += 2)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac holds the fractional part of cur.
         * To avoid duplication, see the mono version for a thorough
         * explanation.
         */
        cur_frac = _mm_set1_epi32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        adder_frac = _mm_setr_epi32(
                0,
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
        );
        cur_frac = _mm_add_epi32(cur_frac, adder_frac);

        /* dCache_1 is the pointer for dcache in the next resample pos. */
        cur_scalar_1 = cur_scalar + resampleStep;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
        cur_scalar_1 &= FIXED_FRACTION_MASK;

        one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
        half = _mm_set1_ps(0.5f);
        adder_frac_loop = _mm_set1_epi32(
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 2;
        for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
        {
                /* current_next_1 and current_next_2 each hold 4 source sample
                 * points, yielding 4 dest samples at the end. current_next_1
                 * holds (current_ch_1, current_ch_2, next_ch_1, next_ch_2) for
                 * the first resample position; current_next_2 holds the same
                 * for the second resample position.
                 */
                current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
                current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */

                /* Unpack them to get the current and the next in separate vectors. */
                current = _mm_castpd_ps(
                        _mm_unpacklo_pd(
                                _mm_castps_pd(current_next_1),
                                _mm_castps_pd(current_next_2)
                        )
                );
                next = _mm_castpd_ps(
                        _mm_unpackhi_pd(
                                _mm_castps_pd(current_next_1),
                                _mm_castps_pd(current_next_2)
                        )
                );

                sub = _mm_sub_ps(next, current);

                /* Add the 0.5 back; see the mono version for the full explanation. */
                cur_fixed = _mm_add_ps(
                        _mm_mul_ps(
                                _mm_cvtepi32_ps(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = _mm_mul_ps(sub, cur_fixed);
                res = _mm_add_ps(current, mul);

                /* Store the results */
                _mm_store_ps(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 2;
                cur_scalar_1 += resampleStep * 2;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;

                cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_ResampleMono_NEON(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t UNUSED
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
        float *dCache_1, *dCache_2, *dCache_3;
        uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
                current, next, sub, cur_fixed, mul, res;
        int32x4_t cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 4;
        if (header == 4)
        {
                header = 0;
        }
        for (i = 0; i < header; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac is the fractional part of cur for 4 samples. Since the
         * fractional part is a 32-bit unsigned value, additions simply wrap,
         * so the modulo that keeps only the fractional part is implicit.
         * 0.5 is subtracted here so the value survives the signed
         * int-to-float convert (mirroring the SSE2 path); the 0.5 is
         * added back later.
         */
        cur_frac = vdupq_n_s32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        ALIGN(int32_t, 16) data[4] =
        {
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
                (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
        };
        adder_frac = vld1q_s32(data);
        cur_frac = vaddq_s32(cur_frac, adder_frac);

        /* The various cur_scalar values are for the different samples
         * (1, 2, 3 compared to original cur_scalar = 0)
         */
        cur_scalar_1 = cur_scalar + resampleStep;
        cur_scalar_2 = cur_scalar + resampleStep * 2;
        cur_scalar_3 = cur_scalar + resampleStep * 3;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
        dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
        dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
        cur_scalar &= FIXED_FRACTION_MASK;
        cur_scalar_1 &= FIXED_FRACTION_MASK;
        cur_scalar_2 &= FIXED_FRACTION_MASK;
        cur_scalar_3 &= FIXED_FRACTION_MASK;

        /* Constants */
        one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
        half = vdupq_n_f32(0.5f);
        adder_frac_loop = vdupq_n_s32(
                (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 4;
        for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
        {
                /* current_next_0_1 and current_next_2_3 hold (sample, sample + 1)
                 * pairs that are separated into current and next below.
                 */
                current_next_0_1 = vcombine_f32(
                        vld1_f32(dCache),
                        vld1_f32(dCache_1)
                );
                current_next_2_3 = vcombine_f32(
                        vld1_f32(dCache_2),
                        vld1_f32(dCache_3)
                );

                /* Unpack them to have separate current and next in 2 vectors. */
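                /* vuzp1q takes the even-indexed lanes and vuzp2q the odd:
                 * with current_next_0_1 = [c0, n0, c1, n1] and
                 * current_next_2_3 = [c2, n2, c3, n3], this yields
                 * current = [c0, c1, c2, c3] and next = [n0, n1, n2, n3].
                 */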
                current = vuzp1q_f32(current_next_0_1, current_next_2_3);
                next = vuzp2q_f32(current_next_0_1, current_next_2_3);

                sub = vsubq_f32(next, current);

                /* Convert the fractional part to float, then multiply to get
                 * the fractions out. Then add back the 0.5 we subtracted before.
                 */
                cur_fixed = vaddq_f32(
                        vmulq_f32(
                                vcvtq_f32_s32(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = vmulq_f32(sub, cur_fixed);
                res = vaddq_f32(current, mul);

                /* Store back */
                vst1q_f32(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 4;
                cur_scalar_1 += resampleStep * 4;
                cur_scalar_2 += resampleStep * 4;
                cur_scalar_3 += resampleStep * 4;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION);
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
                dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
                dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;
                cur_scalar_2 &= FIXED_FRACTION_MASK;
                cur_scalar_3 &= FIXED_FRACTION_MASK;

                cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[1] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION);

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}

void FAudio_INTERNAL_ResampleStereo_NEON(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t channels
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar, cur_scalar_1;
        float *dCache_1;
        float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
        int32x4_t cur_frac, adder_frac, adder_frac_loop;

        /* This is the header; the destination (resampleCache) needs to be 16-byte aligned */
        header = (16 - ((size_t) resampleCache) % 16) / 8;
        if (header == 2)
        {
                header = 0;
        }
        cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < header; i += 2)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialize the various cur values.
         * cur_frac holds the fractional part of cur.
         * To avoid duplication, see the mono version for a thorough
         * explanation.
         */
        cur_frac = vdupq_n_s32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        ALIGN(int32_t, 16) data[4] =
        {
                0,
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
        };
        adder_frac = vld1q_s32(data);
        cur_frac = vaddq_s32(cur_frac, adder_frac);

        /* dCache_1 is the pointer for dcache in the next resample pos. */
        cur_scalar_1 = cur_scalar + resampleStep;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
        cur_scalar_1 &= FIXED_FRACTION_MASK;

        one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
        half = vdupq_n_f32(0.5f);
        adder_frac_loop = vdupq_n_s32(
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 2;
        for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
        {
                /* current holds the current frame for both resample positions:
                 * (A1, B1) from dCache and (A3, B3) from dCache_1. next holds
                 * the following frame for each: (A2, B2) and (A4, B4).
                 * Together they yield 4 dest samples (2 frames) per iteration.
                 */
                current = vcombine_f32(
                        vld1_f32(dCache), /* A1B1 */
                        vld1_f32(dCache_1) /* A3B3 */
                );
                next = vcombine_f32(
                        vld1_f32(dCache + 2), /* A2B2 */
                        vld1_f32(dCache_1 + 2) /* A4B4 */
                );

                sub = vsubq_f32(next, current);

                /* Add the 0.5 back; see the mono version for the full explanation. */
                cur_fixed = vaddq_f32(
                        vmulq_f32(
                                vcvtq_f32_s32(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = vmulq_f32(sub, cur_fixed);
                res = vaddq_f32(current, mul);

                /* Store the results */
                vst1q_f32(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 2;
                cur_scalar_1 += resampleStep * 2;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;

                cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
        }
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 3: Amplifiers */

#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Amplify_Scalar(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i;
        for (i = 0; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

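/* The SIMD amplifiers below split the buffer into a scalar "header" up to
 * the first 16-byte boundary, an aligned SIMD middle, and a scalar "tail".
 * Worked example for the header math: if output % 16 == 8, then
 * header = (16 - 8) / 4 = 2 floats; if output is already aligned,
 * (16 - 0) / 4 = 4, which the header == 4 check resets to 0.
 */
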
/* The SSE2 version of the amplifier comes from @8thMage! */

#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Amplify_SSE2(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i;
        uint32_t header = (16 - (((size_t) output) % 16)) / 4;
        uint32_t tail = (totalSamples - header) % 4;
        __m128 volumeVec, outVec;
        if (header == 4)
        {
                header = 0;
        }
        if (tail == 4)
        {
                tail = 0;
        }

        for (i = 0; i < header; i += 1)
        {
                output[i] *= volume;
        }

        volumeVec = _mm_set1_ps(volume);
        for (i = header; i < totalSamples - tail; i += 4)
        {
                outVec = _mm_load_ps(output + i);
                outVec = _mm_mul_ps(outVec, volumeVec);
                _mm_store_ps(output + i, outVec);
        }

        for (i = totalSamples - tail; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Amplify_NEON(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i;
        uint32_t header = (16 - (((size_t) output) % 16)) / 4;
        uint32_t tail = (totalSamples - header) % 4;
        float32x4_t volumeVec, outVec;
        if (header == 4)
        {
                header = 0;
        }
        if (tail == 4)
        {
                tail = 0;
        }

        for (i = 0; i < header; i += 1)
        {
                output[i] *= volume;
        }

        volumeVec = vdupq_n_f32(volume);
        for (i = header; i < totalSamples - tail; i += 4)
        {
                outVec = vld1q_f32(output + i);
                outVec = vmulq_f32(outVec, volumeVec);
                vst1q_f32(output + i, outVec);
        }

        for (i = totalSamples - tail; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 4: Mixer Functions */

void FAudio_INTERNAL_Mix_Generic_Scalar(
        uint32_t toMix,
        uint32_t srcChans,
        uint32_t dstChans,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i, co, ci;
        for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
        for (co = 0; co < dstChans; co += 1)
        {
                for (ci = 0; ci < srcChans; ci += 1)
                {
                        dst[co] += (
                                src[ci] *
                                coefficients[co * srcChans + ci]
                        );
                }
        }
}

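/* The coefficient matrix is row-major, one row per output channel:
 * coefficients[co * srcChans + ci] scales input channel ci into output
 * channel co. Worked 2-in/2-out example:
 *   dst[0] += src[0] * c[0] + src[1] * c[1];
 *   dst[1] += src[0] * c[2] + src[1] * c[3];
 * which matches the unrolled FAudio_INTERNAL_Mix_2in_2out_Scalar below.
 */
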
#if HAVE_SSE2_INTRINSICS
/* SSE horizontal add by Peter Cordes, CC-BY-SA.
 * From https://stackoverflow.com/a/35270026 */
static inline float FAudio_simd_hadd(__m128 v)
{
        __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
        __m128 sums = _mm_add_ps(v, shuf);
        shuf = _mm_movehl_ps(shuf, sums);
        sums = _mm_add_ss(sums, shuf);
        return _mm_cvtss_f32(sums);
}

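/* Lane walkthrough for the horizontal add above, with v = [a, b, c, d]:
 *   shuf = [b, a, d, c]      (_MM_SHUFFLE(2, 3, 0, 1) swaps within pairs)
 *   sums = [a+b, a+b, c+d, c+d]
 *   shuf = [c+d, c+d, d, c]  (movehl moves sums' high half into the low half)
 *   sums = [(a+b)+(c+d), ...] (add_ss only touches lane 0)
 * so _mm_cvtss_f32 returns a + b + c + d.
 */
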
void FAudio_INTERNAL_Mix_Generic_SSE2(
        uint32_t toMix,
        uint32_t srcChans,
        uint32_t dstChans,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i, co, ci;
        for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
        for (co = 0; co < dstChans; co += 1)
        {
                for (ci = 0; srcChans - ci >= 4; ci += 4)
                {
                        /* do SIMD */
                        const __m128 vols = _mm_loadu_ps(&coefficients[co * srcChans + ci]);
                        const __m128 dat = _mm_loadu_ps(&src[ci]);
                        dst[co] += FAudio_simd_hadd(_mm_mul_ps(dat, vols));
                }

                for (; ci < srcChans; ci += 1)
                {
                        /* do scalar */
                        dst[co] += (
                                src[ci] *
                                coefficients[co * srcChans + ci]
                        );
                }
        }
}
#endif /* HAVE_SSE2_INTRINSICS */

void FAudio_INTERNAL_Mix_1in_1out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 1)
        {
                /* Base source data, combined with the coefficients */
                dst[0] += src[0] * coefficients[0];
        }
}

void FAudio_INTERNAL_Mix_1in_2out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 2)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];
        }
}

void FAudio_INTERNAL_Mix_1in_6out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 6)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];
                dst[2] += src[0] * coefficients[2];
                dst[3] += src[0] * coefficients[3];
                dst[4] += src[0] * coefficients[4];
                dst[5] += src[0] * coefficients[5];
        }
}

void FAudio_INTERNAL_Mix_1in_8out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 1, dst += 8)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];
                dst[2] += src[0] * coefficients[2];
                dst[3] += src[0] * coefficients[3];
                dst[4] += src[0] * coefficients[4];
                dst[5] += src[0] * coefficients[5];
                dst[6] += src[0] * coefficients[6];
                dst[7] += src[0] * coefficients[7];
        }
}

void FAudio_INTERNAL_Mix_2in_1out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 1)
        {
                /* Base source data, combined with the coefficients */
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
        }
}

void FAudio_INTERNAL_Mix_2in_2out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 2)
        {
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
                dst[1] += (
                        (src[0] * coefficients[2]) +
                        (src[1] * coefficients[3])
                );
        }
}

void FAudio_INTERNAL_Mix_2in_6out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 6)
        {
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
                dst[1] += (
                        (src[0] * coefficients[2]) +
                        (src[1] * coefficients[3])
                );
                dst[2] += (
                        (src[0] * coefficients[4]) +
                        (src[1] * coefficients[5])
                );
                dst[3] += (
                        (src[0] * coefficients[6]) +
                        (src[1] * coefficients[7])
                );
                dst[4] += (
                        (src[0] * coefficients[8]) +
                        (src[1] * coefficients[9])
                );
                dst[5] += (
                        (src[0] * coefficients[10]) +
                        (src[1] * coefficients[11])
                );
        }
}

void FAudio_INTERNAL_Mix_2in_8out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t i;
        for (i = 0; i < toMix; i += 1, src += 2, dst += 8)
        {
                dst[0] += (
                        (src[0] * coefficients[0]) +
                        (src[1] * coefficients[1])
                );
                dst[1] += (
                        (src[0] * coefficients[2]) +
                        (src[1] * coefficients[3])
                );
                dst[2] += (
                        (src[0] * coefficients[4]) +
                        (src[1] * coefficients[5])
                );
                dst[3] += (
                        (src[0] * coefficients[6]) +
                        (src[1] * coefficients[7])
                );
                dst[4] += (
                        (src[0] * coefficients[8]) +
                        (src[1] * coefficients[9])
                );
                dst[5] += (
                        (src[0] * coefficients[10]) +
                        (src[1] * coefficients[11])
                );
                dst[6] += (
                        (src[0] * coefficients[12]) +
                        (src[1] * coefficients[13])
                );
                dst[7] += (
                        (src[0] * coefficients[14]) +
                        (src[1] * coefficients[15])
                );
        }
}

/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */

void (*FAudio_INTERNAL_Convert_U8_To_F32)(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
);

FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;

void (*FAudio_INTERNAL_Amplify)(
        float *output,
        uint32_t totalSamples,
        float volume
);

FAudioMixCallback FAudio_INTERNAL_Mix_Generic;

void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
        if (hasSSE2)
        {
                FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
                FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
                FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
                FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
                FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
                FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
                FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_SSE2;
                return;
        }
#endif
#if HAVE_NEON_INTRINSICS
        if (hasNEON)
        {
                FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
                FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
                FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
                FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
                FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
                FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
                FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
                return;
        }
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
        FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
        FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
        FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
        FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
        FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
        FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
        FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
#else
        FAudio_assert(0 && "Need converter functions!");
#endif
}

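#if 0
/* Illustrative sketch, not part of FAudio: one plausible way a host could
 * drive this dispatcher, assuming SDL2's CPU feature queries are available.
 * FAudio's platform layer is what actually supplies these flags.
 */
#include <SDL_cpuinfo.h>

static void Example_InitDispatch(void)
{
        FAudio_INTERNAL_InitSIMDFunctions(
                (uint8_t) SDL_HasSSE2(), /* hasSSE2 */
                (uint8_t) SDL_HasNEON()  /* hasNEON */
        );
}
#endif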

/* vim: set noexpandtab shiftwidth=8 tabstop=8: */