/* Path: blob/master/libs/faudio/src/FAudio_internal_simd.c */
/* FAudio - XAudio Reimplementation for FNA
 *
 * Copyright (c) 2011-2024 Ethan Lee, Luigi Auriemma, and the MonoGame Team
 *
 * This software is provided 'as-is', without any express or implied warranty.
 * In no event will the authors be held liable for any damages arising from
 * the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 * claim that you wrote the original software. If you use this software in a
 * product, an acknowledgment in the product documentation would be
 * appreciated but is not required.
 *
 * 2. Altered source versions must be plainly marked as such, and must not be
 * misrepresented as being the original software.
 *
 * 3. This notice may not be removed or altered from any source distribution.
 *
 * Ethan "flibitijibibo" Lee <[email protected]>
 *
 */

#include "FAudio_internal.h"

/* SECTION 0: SSE/NEON Detection */

/* The SSE/NEON detection comes from MojoAL:
 * https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
 */

#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
/* Some platforms fail to define this... */
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif

/* AArch64 guarantees NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__x86_64__) || defined(_M_X64)
/* Some platforms fail to define this... */
#ifndef __SSE2__
#define __SSE2__ 1
#endif

/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__ && !defined(__POWERPC__)
/* Some build systems may need to specify this. */
#if !defined(__SSE2__) && !defined(__ARM_NEON__)
#error macOS does not have SSE2/NEON? Bad compiler?
#endif

/* Mac OS X/Intel guarantees SSE2.
 */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
/* Need plain C implementations to support all other hardware */
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif

/* Our NEON paths require AArch64, don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64) || defined(__arm64ec__) || defined(_M_ARM64EC)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif


#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif

/* SECTION 1: Type Converters */

/* The SSE/NEON converters are based on SDL_audiotypecvt:
 * https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
 */

/* Reciprocal scale factors for normalizing integer PCM to float:
 * 1/128, 1/32768, and ~1/8388607 respectively.
 */
#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f

#if NEED_SCALAR_CONVERTER_FALLBACKS
/* Convert len unsigned 8-bit PCM samples to 32-bit float, one at a time.
 * Scales to [0, ~2) then subtracts 1.0 to center the signal around zero.
 */
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	uint32_t i;
	for (i = 0; i < len; i += 1)
	{
		*dst++ = (*src++ * DIVBY128) - 1.0f;
	}
}

/* Convert len signed 16-bit PCM samples to 32-bit float, one at a time. */
void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	uint32_t i;
	for (i = 0; i < len; i += 1)
	{
		*dst++ = *src++ * DIVBY32768;
	}
}

/* Convert len signed 32-bit PCM samples to 32-bit float, one at a time.
 * The low 8 bits are shifted out first so the value fits a float mantissa.
 */
void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	uint32_t i;
	for (i = 0; i < len; i += 1)
	{
		*dst++ = (*src++ >> 8) * DIVBY8388607;
	}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

#if HAVE_SSE2_INTRINSICS
/* SSE2 U8->F32 conversion. Processes the buffer back-to-front because the
 * float output is larger than the byte input, so in-place conversion must
 * not overwrite unread source bytes.
 */
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
		*dst = (((float) *src) * DIVBY128) - 1.0f;
	}

	src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
		const __m128i *mmsrc = (const __m128i *) src;
		const __m128i zero = _mm_setzero_si128();
		const __m128 divby128 = _mm_set1_ps(DIVBY128);
		const __m128 minus1 = _mm_set1_ps(-1.0f);
		while (i >= 16) { /* 16 * 8-bit */
			const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
			/* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
			const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
			/* right-shift-zero-extend gets us uint16 with the other set of values. */
			const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
			/* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
			/* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
			const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
			const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
			const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
			const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
			/* Interleave back into correct order, store. */
			_mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
			_mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
			_mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
			_mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
			i -= 16; mmsrc--; dst -= 16;
		}

		src = (const uint8_t *) mmsrc;
	}

	src += 15; dst += 15; /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = (((float) *src) * DIVBY128) - 1.0f;
		i--; src--; dst--;
	}
}

/* SSE2 S16->F32 conversion, also processed back-to-front (output grows). */
void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
		*dst = ((float) *src) * DIVBY32768;
	}

	src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
		const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
		while (i >= 8) { /* 8 * 16-bit */
			const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
			/* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
			const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
			/* right-shift-sign-extend gets us sint32 with the other set of values. */
			const __m128i b = _mm_srai_epi32(ints, 16);
			/* Interleave these back into the right order, convert to float, multiply, store. */
			_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
			_mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
			i -= 8; src -= 8; dst -= 8;
		}
	}

	src += 7; dst += 7; /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = ((float) *src) * DIVBY32768;
		i--; src--; dst--;
	}
}

/* SSE2 S32->F32 conversion. Input and output are the same size, so this
 * one runs front-to-back.
 */
void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;

	/* Get dst aligned to 16 bytes */
	for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
		*dst = ((float) (*src>>8)) * DIVBY8388607;
	}

	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
		const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
		const __m128i *mmsrc = (const __m128i *) src;
		while (i >= 4) { /* 4 * sint32 */
			/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
			_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
			i -= 4; mmsrc++; dst += 4;
		}
		src = (const int32_t *) mmsrc;
	}

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = ((float) (*src>>8)) * DIVBY8388607;
		i--; src++; dst++;
	}
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
/* NEON U8->F32 conversion; back-to-front like the SSE2 version because the
 * float output is larger than the byte input.
 */
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
		*dst = (((float) *src) * DIVBY128) - 1.0f;
	}

	src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const uint8_t *mmsrc = (const uint8_t *) src;
		const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
		const float32x4_t negone = vdupq_n_f32(-1.0f);
		while (i >= 16) { /* 16 * 8-bit */
			const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
			const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
			const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
			/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
			vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
			vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
			vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
			vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
			i -= 16; mmsrc -= 16; dst -= 16;
		}

		src = (const uint8_t *) mmsrc;
	}

	src += 15; dst += 15; /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = (((float) *src) * DIVBY128) - 1.0f;
		i--; src--; dst--;
	}
}

/* NEON S16->F32 conversion, back-to-front (output grows). */
void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
		*dst = ((float) *src) * DIVBY32768;
	}

	src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
		while (i >= 8) { /* 8 * 16-bit */
			const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
			/* split int16 to two int32, then convert to float, then multiply to normalize, store. */
			vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
			vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
			i -= 8; src -= 8; dst -= 8;
		}
	}

	src += 7; dst += 7; /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = ((float) *src) * DIVBY32768;
		i--; src--; dst--;
	}
}

/* NEON S32->F32 conversion; same-size in/out, so runs front-to-back. */
void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;

	/* Get dst aligned to 16 bytes */
	for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
		*dst = ((float) (*src>>8)) * DIVBY8388607;
	}

	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
		const int32_t *mmsrc = (const int32_t *) src;
		while (i >= 4) { /* 4 * sint32 */
			/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
			vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
			i -= 4; mmsrc += 4; dst += 4;
		}
		src = (const int32_t *) mmsrc;
	}

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = ((float) (*src>>8)) * DIVBY8388607;
		i--; src++; dst++;
	}
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 2: Linear Resamplers */

/* Linear-interpolation resampler for any channel count.
 * dCache: source samples (interleaved by channel)
 * resampleCache: output buffer, toResample * channels floats
 * resampleOffset: running fixed-point read position, advanced by this call
 * resampleStep: fixed-point increment per output frame
 * toResample: number of output frames to produce
 * channels: interleaved channel count
 */
void FAudio_INTERNAL_ResampleGeneric(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t channels
) {
	uint32_t i, j;
	uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < toResample; i += 1)
	{
		for (j = 0; j < channels; j += 1)
		{
			/* lerp, then convert to float value */
			*resampleCache++ = (float) (
				dCache[j] +
				(dCache[j + channels] - dCache[j]) *
				FIXED_TO_DOUBLE(cur)
			);
		}

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur >> FIXED_PRECISION) * channels;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur &= FIXED_FRACTION_MASK;
	}
}

#if NEED_SCALAR_CONVERTER_FALLBACKS
/* Scalar mono specialization of the linear resampler. */
void FAudio_INTERNAL_ResampleMono_Scalar(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	uint32_t i;
	uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < toResample; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_DOUBLE(cur)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur &= FIXED_FRACTION_MASK;
	}
}

/* Scalar stereo specialization of the linear resampler. */
void FAudio_INTERNAL_ResampleStereo_Scalar(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	uint32_t i;
	uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < toResample; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_DOUBLE(cur)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_DOUBLE(cur)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur &= FIXED_FRACTION_MASK;
	}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

/* The SSE2 versions of the resamplers come from @8thMage!
 */

#if HAVE_SSE2_INTRINSICS
/* SSE2 mono linear resampler: computes 4 output samples per iteration.
 * Scalar header/tail loops handle the unaligned edges of resampleCache.
 */
void FAudio_INTERNAL_ResampleMono_SSE2(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
	float *dCache_1, *dCache_2, *dCache_3;
	uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	__m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
		current, next, sub, cur_fixed, mul, res;
	__m128i cur_frac, adder_frac, adder_frac_loop;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 4;
	if (header == 4)
	{
		header = 0;
	}
	for (i = 0; i < header; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur.
	 * cur_frac is the fractional part of cur with 4 samples. As the
	 * fractional part is a 32 bit unsigned value, it can be just added
	 * and the modulo operation for keeping the fractional part will be implicit.
	 * The 0.5 is for converting signed values to float (no unsigned convert),
	 * the 0.5 is added back later.
	 */
	cur_frac = _mm_set1_epi32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	adder_frac = _mm_setr_epi32(
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
	);
	cur_frac = _mm_add_epi32(cur_frac, adder_frac);

	/* The various cur_scalar is for the different samples
	 * (1, 2, 3 compared to original cur_scalar = 0)
	 */
	cur_scalar_1 = cur_scalar + resampleStep;
	cur_scalar_2 = cur_scalar + resampleStep * 2;
	cur_scalar_3 = cur_scalar + resampleStep * 3;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
	dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
	dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
	cur_scalar &= FIXED_FRACTION_MASK;
	cur_scalar_1 &= FIXED_FRACTION_MASK;
	cur_scalar_2 &= FIXED_FRACTION_MASK;
	cur_scalar_3 &= FIXED_FRACTION_MASK;

	/* FIXME: These should be _mm_undefined_ps! */
	current_next_0_1 = _mm_setzero_ps();
	current_next_2_3 = _mm_setzero_ps();

	/* Constants */
	one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
	half = _mm_set1_ps(0.5f);
	adder_frac_loop = _mm_set1_epi32(
		(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
	);

	tail = toResample % 4;
	for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
	{
		/* current next holds 2 pairs of the sample and the sample + 1;
		 * after that we need to separate them.
		 */
		current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
		current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
		current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
		current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);

		/* Unpack them to have separate current and next in 2 vectors. */
		current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */
		next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */

		sub = _mm_sub_ps(next, current);

		/* Convert the fractional part to float and then mul to get the fractions out.
		 * Then add back the 0.5 we subtracted before.
		 */
		cur_fixed = _mm_add_ps(
			_mm_mul_ps(
				_mm_cvtepi32_ps(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = _mm_mul_ps(sub, cur_fixed);
		res = _mm_add_ps(current, mul);

		/* Store back */
		_mm_store_ps(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 4;
		cur_scalar_1 += resampleStep * 4;
		cur_scalar_2 += resampleStep * 4;
		cur_scalar_3 += resampleStep * 4;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION);
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
		dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
		dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;
		cur_scalar_2 &= FIXED_FRACTION_MASK;
		cur_scalar_3 &= FIXED_FRACTION_MASK;

		cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}

/* SSE2 stereo linear resampler: computes 2 interleaved output frames
 * (4 floats) per iteration.
 */
void FAudio_INTERNAL_ResampleStereo_SSE2(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar, cur_scalar_1;
	float *dCache_1;
	__m128 one_over_fixed_one, half, current_next_1, current_next_2,
		current, next, sub, cur_fixed, mul, res;
	__m128i cur_frac, adder_frac, adder_frac_loop;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 8;
	if (header == 2)
	{
		header = 0;
	}
	cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < header; i += 2)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur.
	 * cur_frac holds the fractional part of cur.
	 * To avoid duplication please see the mono part for a thorough
	 * explanation.
	 */
	cur_frac = _mm_set1_epi32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	adder_frac = _mm_setr_epi32(
		0,
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
	);
	cur_frac = _mm_add_epi32(cur_frac, adder_frac);

	/* dCache_1 is the pointer for dcache in the next resample pos. */
	cur_scalar_1 = cur_scalar + resampleStep;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
	cur_scalar_1 &= FIXED_FRACTION_MASK;

	one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
	half = _mm_set1_ps(0.5f);
	adder_frac_loop = _mm_set1_epi32(
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
	);

	tail = toResample % 2;
	for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
	{
		/* current_next_1 and current_next_2 each holds 4 src
		 * sample points for getting 4 dest resample points at the end.
		 * current_next_1 holds:
		 * (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
		 * for the first resample position, while current_next_2 holds
		 * the same for the 2nd resample position
		 */
		current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
		current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */

		/* Unpack them to get the current and the next in separate vectors. */
		current = _mm_castpd_ps(
			_mm_unpacklo_pd(
				_mm_castps_pd(current_next_1),
				_mm_castps_pd(current_next_2)
			)
		);
		next = _mm_castpd_ps(
			_mm_unpackhi_pd(
				_mm_castps_pd(current_next_1),
				_mm_castps_pd(current_next_2)
			)
		);

		sub = _mm_sub_ps(next, current);

		/* Adding the 0.5 back.
		 * See mono explanation for more elaborate explanation.
		 */
		cur_fixed = _mm_add_ps(
			_mm_mul_ps(
				_mm_cvtepi32_ps(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = _mm_mul_ps(sub, cur_fixed);
		res = _mm_add_ps(current, mul);

		/* Store the results */
		_mm_store_ps(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 2;
		cur_scalar_1 += resampleStep * 2;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;

		cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}
#endif /* HAVE_SSE2_INTRINSICS */

#if HAVE_NEON_INTRINSICS
/* NEON (AArch64) mono linear resampler: 4 output samples per iteration. */
void FAudio_INTERNAL_ResampleMono_NEON(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
	float *dCache_1, *dCache_2, *dCache_3;
	uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
		current, next, sub, cur_fixed, mul, res;
	int32x4_t cur_frac, adder_frac, adder_frac_loop;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 4;
	if (header == 4)
	{
		header = 0;
	}
	for (i = 0; i < header; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur.
	 * cur_frac is the fractional part of cur with 4 samples. As the
	 * fractional part is a 32 bit unsigned value, it can be just added
	 * and the modulo operation for keeping the fractional part will be implicit.
	 * The 0.5 is for converting signed values to float (no unsigned convert),
	 * the 0.5 is added back later.
	 */
	cur_frac = vdupq_n_s32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	ALIGN(int32_t, 16) data[4] =
	{
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
	};
	adder_frac = vld1q_s32(data);
	cur_frac = vaddq_s32(cur_frac, adder_frac);

	/* The various cur_scalar is for the different samples
	 * (1, 2, 3 compared to original cur_scalar = 0)
	 */
	cur_scalar_1 = cur_scalar + resampleStep;
	cur_scalar_2 = cur_scalar + resampleStep * 2;
	cur_scalar_3 = cur_scalar + resampleStep * 3;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
	dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
	dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
	cur_scalar &= FIXED_FRACTION_MASK;
	cur_scalar_1 &= FIXED_FRACTION_MASK;
	cur_scalar_2 &= FIXED_FRACTION_MASK;
	cur_scalar_3 &= FIXED_FRACTION_MASK;

	/* Constants */
	one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
	half = vdupq_n_f32(0.5f);
	adder_frac_loop = vdupq_n_s32(
		(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
	);

	tail = toResample % 4;
	for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
	{
		/* current next holds 2 pairs of the sample and the sample + 1;
		 * after that we need to separate them.
		 */
		current_next_0_1 = vcombine_f32(
			vld1_f32(dCache),
			vld1_f32(dCache_1)
		);
		current_next_2_3 = vcombine_f32(
			vld1_f32(dCache_2),
			vld1_f32(dCache_3)
		);

		/* Unpack them to have separate current and next in 2 vectors. */
		current = vuzp1q_f32(current_next_0_1, current_next_2_3);
		next = vuzp2q_f32(current_next_0_1, current_next_2_3);

		sub = vsubq_f32(next, current);

		/* Convert the fractional part to float and then mul to get the fractions out.
		 * Then add back the 0.5 we subtracted before.
		 */
		cur_fixed = vaddq_f32(
			vmulq_f32(
				vcvtq_f32_s32(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = vmulq_f32(sub, cur_fixed);
		res = vaddq_f32(current, mul);

		/* Store back */
		vst1q_f32(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 4;
		cur_scalar_1 += resampleStep * 4;
		cur_scalar_2 += resampleStep * 4;
		cur_scalar_3 += resampleStep * 4;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION);
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
		dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
		dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;
		cur_scalar_2 &= FIXED_FRACTION_MASK;
		cur_scalar_3 &= FIXED_FRACTION_MASK;

		cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}

/* NEON (AArch64) stereo linear resampler: 2 interleaved output frames
 * (4 floats) per iteration.
 */
void FAudio_INTERNAL_ResampleStereo_NEON(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t channels
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar, cur_scalar_1;
	float *dCache_1;
	float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
	int32x4_t cur_frac, adder_frac, adder_frac_loop;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 8;
	if (header == 2)
	{
		header = 0;
	}
	cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < header; i += 2)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur.
	 * cur_frac holds the fractional part of cur.
	 * To avoid duplication please see the mono part for a thorough
	 * explanation.
	 */
	cur_frac = vdupq_n_s32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	ALIGN(int32_t, 16) data[4] =
	{
		0,
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
	};
	adder_frac = vld1q_s32(data);
	cur_frac = vaddq_s32(cur_frac, adder_frac);

	/* dCache_1 is the pointer for dcache in the next resample pos. */
	cur_scalar_1 = cur_scalar + resampleStep;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
	cur_scalar_1 &= FIXED_FRACTION_MASK;

	one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
	half = vdupq_n_f32(0.5f);
	adder_frac_loop = vdupq_n_s32(
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
	);

	tail = toResample % 2;
	for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
	{
		/* current and next each hold 4 src sample points for getting
		 * 4 dest resample points at the end.
		 * current holds (current_ch_1, current_ch_2) for both resample
		 * positions, while next holds the corresponding sample + 1 pairs.
		 */
		current = vcombine_f32(
			vld1_f32(dCache), /* A1B1 */
			vld1_f32(dCache_1) /* A3B3 */
		);
		next = vcombine_f32(
			vld1_f32(dCache + 2), /* A2B2 */
			vld1_f32(dCache_1 + 2) /* A4B4 */
		);

		sub = vsubq_f32(next, current);

		/* Adding the 0.5 back.
		 * See mono explanation for more elaborate explanation.
		 */
		cur_fixed = vaddq_f32(
			vmulq_f32(
				vcvtq_f32_s32(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = vmulq_f32(sub, cur_fixed);
		res = vaddq_f32(current, mul);

		/* Store the results */
		vst1q_f32(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 2;
		cur_scalar_1 += resampleStep * 2;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;

		cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}
#endif /* HAVE_NEON_INTRINSICS */

/* SECTION 3: Amplifiers */

#if NEED_SCALAR_CONVERTER_FALLBACKS
/* Multiplies totalSamples floats in-place by a single volume factor. */
void FAudio_INTERNAL_Amplify_Scalar(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	uint32_t i;
	for (i = 0; i < totalSamples; i += 1)
	{
		output[i] *= volume;
	}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */

/* The SSE2 version of the amplifier comes from @8thMage!
*/11991200#if HAVE_SSE2_INTRINSICS1201void FAudio_INTERNAL_Amplify_SSE2(1202float* output,1203uint32_t totalSamples,1204float volume1205) {1206uint32_t i;1207uint32_t header = (16 - (((size_t) output) % 16)) / 4;1208uint32_t tail = (totalSamples - header) % 4;1209__m128 volumeVec, outVec;1210if (header == 4)1211{1212header = 0;1213}1214if (tail == 4)1215{1216tail = 0;1217}12181219for (i = 0; i < header; i += 1)1220{1221output[i] *= volume;1222}12231224volumeVec = _mm_set1_ps(volume);1225for (i = header; i < totalSamples - tail; i += 4)1226{1227outVec = _mm_load_ps(output + i);1228outVec = _mm_mul_ps(outVec, volumeVec);1229_mm_store_ps(output + i, outVec);1230}12311232for (i = totalSamples - tail; i < totalSamples; i += 1)1233{1234output[i] *= volume;1235}1236}1237#endif /* HAVE_SSE2_INTRINSICS */12381239#if HAVE_NEON_INTRINSICS1240void FAudio_INTERNAL_Amplify_NEON(1241float* output,1242uint32_t totalSamples,1243float volume1244) {1245uint32_t i;1246uint32_t header = (16 - (((size_t) output) % 16)) / 4;1247uint32_t tail = (totalSamples - header) % 4;1248float32x4_t volumeVec, outVec;1249if (header == 4)1250{1251header = 0;1252}1253if (tail == 4)1254{1255tail = 0;1256}12571258for (i = 0; i < header; i += 1)1259{1260output[i] *= volume;1261}12621263volumeVec = vdupq_n_f32(volume);1264for (i = header; i < totalSamples - tail; i += 4)1265{1266outVec = vld1q_f32(output + i);1267outVec = vmulq_f32(outVec, volumeVec);1268vst1q_f32(output + i, outVec);1269}12701271for (i = totalSamples - tail; i < totalSamples; i += 1)1272{1273output[i] *= volume;1274}1275}1276#endif /* HAVE_NEON_INTRINSICS */12771278/* SECTION 4: Mixer Functions */12791280void FAudio_INTERNAL_Mix_Generic_Scalar(1281uint32_t toMix,1282uint32_t srcChans,1283uint32_t dstChans,1284float *restrict src,1285float *restrict dst,1286float *restrict coefficients1287) {1288uint32_t i, co, ci;1289for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)1290for (co = 0; co < dstChans; co += 1)1291{1292for (ci = 
0; ci < srcChans; ci += 1)1293{1294dst[co] += (1295src[ci] *1296coefficients[co * srcChans + ci]1297);1298}1299}1300}13011302#if HAVE_SSE2_INTRINSICS1303/* SSE horizontal add by Peter Cordes, CC-BY-SA.1304* From https://stackoverflow.com/a/35270026 */1305static inline float FAudio_simd_hadd(__m128 v)1306{1307__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));1308__m128 sums = _mm_add_ps(v, shuf);1309shuf = _mm_movehl_ps(shuf, sums);1310sums = _mm_add_ss(sums, shuf);1311return _mm_cvtss_f32(sums);1312}13131314void FAudio_INTERNAL_Mix_Generic_SSE2(1315uint32_t toMix,1316uint32_t srcChans,1317uint32_t dstChans,1318float *restrict src,1319float *restrict dst,1320float *restrict coefficients1321) {1322uint32_t i, co, ci;1323for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)1324for (co = 0; co < dstChans; co += 1)1325{1326for (ci = 0; srcChans - ci >= 4; ci += 4)1327{1328/* do SIMD */1329const __m128 vols = _mm_loadu_ps(&coefficients[co * srcChans + ci]);1330const __m128 dat = _mm_loadu_ps(&src[ci]);1331dst[co] += FAudio_simd_hadd(_mm_mul_ps(dat, vols));1332}13331334for (; ci < srcChans; ci += 1)1335{1336/* do scalar */1337dst[co] += (1338src[ci] *1339coefficients[co * srcChans + ci]1340);1341}1342}1343}1344#endif /* HAVE_SSE2_INTRINSICS */13451346void FAudio_INTERNAL_Mix_1in_1out_Scalar(1347uint32_t toMix,1348uint32_t UNUSED1,1349uint32_t UNUSED2,1350float *restrict src,1351float *restrict dst,1352float *restrict coefficients1353) {1354uint32_t i;1355for (i = 0; i < toMix; i += 1, src += 1, dst += 1)1356{1357/* Base source data, combined with the coefficients */1358dst[0] += src[0] * coefficients[0];1359}1360}13611362void FAudio_INTERNAL_Mix_1in_2out_Scalar(1363uint32_t toMix,1364uint32_t UNUSED1,1365uint32_t UNUSED2,1366float *restrict src,1367float *restrict dst,1368float *restrict coefficients1369) {1370uint32_t i;1371for (i = 0; i < toMix; i += 1, src += 1, dst += 2)1372{1373dst[0] += src[0] * coefficients[0];1374dst[1] += src[0] * 
coefficients[1];1375}1376}13771378void FAudio_INTERNAL_Mix_1in_6out_Scalar(1379uint32_t toMix,1380uint32_t UNUSED1,1381uint32_t UNUSED2,1382float *restrict src,1383float *restrict dst,1384float *restrict coefficients1385) {1386uint32_t i;1387for (i = 0; i < toMix; i += 1, src += 1, dst += 6)1388{1389dst[0] += src[0] * coefficients[0];1390dst[1] += src[0] * coefficients[1];1391dst[2] += src[0] * coefficients[2];1392dst[3] += src[0] * coefficients[3];1393dst[4] += src[0] * coefficients[4];1394dst[5] += src[0] * coefficients[5];1395}1396}13971398void FAudio_INTERNAL_Mix_1in_8out_Scalar(1399uint32_t toMix,1400uint32_t UNUSED1,1401uint32_t UNUSED2,1402float *restrict src,1403float *restrict dst,1404float *restrict coefficients1405) {1406uint32_t i;1407for (i = 0; i < toMix; i += 1, src += 1, dst += 8)1408{1409dst[0] += src[0] * coefficients[0];1410dst[1] += src[0] * coefficients[1];1411dst[2] += src[0] * coefficients[2];1412dst[3] += src[0] * coefficients[3];1413dst[4] += src[0] * coefficients[4];1414dst[5] += src[0] * coefficients[5];1415dst[6] += src[0] * coefficients[6];1416dst[7] += src[0] * coefficients[7];1417}1418}14191420void FAudio_INTERNAL_Mix_2in_1out_Scalar(1421uint32_t toMix,1422uint32_t UNUSED1,1423uint32_t UNUSED2,1424float *restrict src,1425float *restrict dst,1426float *restrict coefficients1427) {1428uint32_t i;1429for (i = 0; i < toMix; i += 1, src += 2, dst += 1)1430{1431/* Base source data, combined with the coefficients */1432dst[0] += (1433(src[0] * coefficients[0]) +1434(src[1] * coefficients[1])1435);1436}1437}14381439void FAudio_INTERNAL_Mix_2in_2out_Scalar(1440uint32_t toMix,1441uint32_t UNUSED1,1442uint32_t UNUSED2,1443float *restrict src,1444float *restrict dst,1445float *restrict coefficients1446) {1447uint32_t i;1448for (i = 0; i < toMix; i += 1, src += 2, dst += 2)1449{1450dst[0] += (1451(src[0] * coefficients[0]) +1452(src[1] * coefficients[1])1453);1454dst[1] += (1455(src[0] * coefficients[2]) +1456(src[1] * 
coefficients[3])1457);1458}1459}14601461void FAudio_INTERNAL_Mix_2in_6out_Scalar(1462uint32_t toMix,1463uint32_t UNUSED1,1464uint32_t UNUSED2,1465float *restrict src,1466float *restrict dst,1467float *restrict coefficients1468) {1469uint32_t i;1470for (i = 0; i < toMix; i += 1, src += 2, dst += 6)1471{1472dst[0] += (1473(src[0] * coefficients[0]) +1474(src[1] * coefficients[1])1475);1476dst[1] += (1477(src[0] * coefficients[2]) +1478(src[1] * coefficients[3])1479);1480dst[2] += (1481(src[0] * coefficients[4]) +1482(src[1] * coefficients[5])1483);1484dst[3] += (1485(src[0] * coefficients[6]) +1486(src[1] * coefficients[7])1487);1488dst[4] += (1489(src[0] * coefficients[8]) +1490(src[1] * coefficients[9])1491);1492dst[5] += (1493(src[0] * coefficients[10]) +1494(src[1] * coefficients[11])1495);1496}1497}14981499void FAudio_INTERNAL_Mix_2in_8out_Scalar(1500uint32_t toMix,1501uint32_t UNUSED1,1502uint32_t UNUSED2,1503float *restrict src,1504float *restrict dst,1505float *restrict coefficients1506) {1507uint32_t i;1508for (i = 0; i < toMix; i += 1, src += 2, dst += 8)1509{1510dst[0] += (1511(src[0] * coefficients[0]) +1512(src[1] * coefficients[1])1513);1514dst[1] += (1515(src[0] * coefficients[2]) +1516(src[1] * coefficients[3])1517);1518dst[2] += (1519(src[0] * coefficients[4]) +1520(src[1] * coefficients[5])1521);1522dst[3] += (1523(src[0] * coefficients[6]) +1524(src[1] * coefficients[7])1525);1526dst[4] += (1527(src[0] * coefficients[8]) +1528(src[1] * coefficients[9])1529);1530dst[5] += (1531(src[0] * coefficients[10]) +1532(src[1] * coefficients[11])1533);1534dst[6] += (1535(src[0] * coefficients[12]) +1536(src[1] * coefficients[13])1537);1538dst[7] += (1539(src[0] * coefficients[14]) +1540(src[1] * coefficients[15])1541);1542}1543}15441545/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. 
*/

/* Dispatch pointers assigned by FAudio_INTERNAL_InitSIMDFunctions
 * below; callers invoke these rather than a specific implementation.
 */

void (*FAudio_INTERNAL_Convert_U8_To_F32)(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
);

FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;

void (*FAudio_INTERNAL_Amplify)(
	float *output,
	uint32_t totalSamples,
	float volume
);

FAudioMixCallback FAudio_INTERNAL_Mix_Generic;

/* Points the dispatch table at the best implementation available.
 *
 * hasSSE2/hasNEON are the runtime CPU-detection results; each branch is
 * additionally gated on the matching intrinsics being compiled in.
 * Priority: SSE2, then NEON, then the scalar fallbacks.
 */
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
	if (hasSSE2)
	{
		FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
		FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
		FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
		FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
		FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
		FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
		FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_SSE2;
		return;
	}
#endif
#if HAVE_NEON_INTRINSICS
	if (hasNEON)
	{
		FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
		FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
		FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
		FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
		FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
		FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
		/* No NEON generic mixer in this file; use the scalar one */
		FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
		return;
	}
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
	FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
	FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
	FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
	FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
	FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
	FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
	FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
#else
	/* Platforms that guarantee SSE2/NEON compile out the scalar
	 * fallbacks entirely; reaching this point means detection failed.
	 */
	FAudio_assert(0 && "Need converter functions!");
#endif
}

/* vim: set noexpandtab shiftwidth=8 tabstop=8: */