Path: blob/master/libsnes/bsnes/nall/dsp/resample/lib/sinc.hpp
2 views
// If these types are changed to anything other than "float", you should comment out the SSE detection directives below1// so that the SSE code is not used.23typedef float resample_coeff_t; // note: sizeof(resample_coeff_t) must be == to a power of 2, and not larger than 164typedef float resample_samp_t;56#include <string>7#include <vector>89// ...but don't comment this single RESAMPLE_SSEREGPARM define out when disabling SSE.10#define RESAMPLE_SSEREGPARM1112#if defined(__SSE__)13#define SINCRESAMPLE_USE_SSE 114#ifndef __x86_64__15#undef RESAMPLE_SSEREGPARM16#define RESAMPLE_SSEREGPARM __attribute__((sseregparm))17#endif18#else19// TODO: altivec here20#endif2122namespace ResampleUtility23{24inline void kaiser_window(double* io, int count, double beta);25inline void gen_sinc(double* out, int size, double cutoff, double kaiser);26inline void gen_sinc_os(double* out, int size, double cutoff, double kaiser);27inline void normalize(double* io, int size, double gain = 1.0);2829inline void* make_aligned(void* ptr, unsigned boundary); // boundary must be a power of 230}3132class SincResampleHR33{34private:3536inline void Init(unsigned ratio_arg, double desired_bandwidth, double beta, double d);3738inline void write(resample_samp_t sample) RESAMPLE_SSEREGPARM;39inline resample_samp_t read(void) RESAMPLE_SSEREGPARM;40inline bool output_avail(void);4142private:4344inline resample_samp_t mac(const resample_samp_t *wave, const resample_coeff_t *coeff, unsigned count);4546unsigned ratio;47unsigned num_convolutions;4849resample_coeff_t *coeffs;50std::vector<unsigned char> coeffs_mem;5152// second half of ringbuffer should be copy of first half.53resample_samp_t *rb;54std::vector<unsigned char> rb_mem;5556signed rb_readpos;57signed rb_writepos;58signed rb_in;59signed rb_eff_size;6061friend class SincResample;62};6364class SincResample65{66public:6768enum69{70QUALITY_LOW = 0,71QUALITY_MEDIUM = 2,72QUALITY_HIGH = 473};7475inline SincResample(double input_rate, double output_rate, double desired_bandwidth, unsigned quality = QUALITY_HIGH);7677inline void write(resample_samp_t sample) RESAMPLE_SSEREGPARM;78inline resample_samp_t read(void) RESAMPLE_SSEREGPARM;79inline bool output_avail(void);8081private:8283inline void Init(double input_rate, double output_rate, double desired_bandwidth, double beta, double d, unsigned pn_nume, unsigned phases_min);8485inline resample_samp_t mac(const resample_samp_t *wave, const resample_coeff_t *coeffs_a, const resample_coeff_t *coeffs_b, const double ffract, unsigned count) RESAMPLE_SSEREGPARM;8687unsigned num_convolutions;88unsigned num_phases;8990unsigned step_int;91double step_fract;9293double input_pos_fract;949596std::vector<resample_coeff_t *> coeffs; // Pointers into coeff_mem.97std::vector<unsigned char> coeff_mem;9899100std::vector<resample_samp_t> rb; // second half should be copy of first half.101signed rb_readpos;102signed rb_writepos;103signed rb_in;104105bool hr_used;106SincResampleHR hr;107};108109110//111// Code:112//113//#include "resample.hpp"114115#if 0116namespace bit117{118inline unsigned round(unsigned x) {119if((x & (x - 1)) == 0) return x;120while(x & (x - 1)) x &= x - 1;121return x << 1;122}123}124#endif125126void SincResampleHR::Init(unsigned ratio_arg, double desired_bandwidth, double beta, double d)127{128const unsigned align_boundary = 16;129std::vector<double> coeffs_tmp;130double cutoff; // 1.0 = f/2131132ratio = ratio_arg;133134//num_convolutions = ((unsigned)ceil(d / ((1.0 - desired_bandwidth) / ratio)) + 1) &~ 1; // round up to be even135num_convolutions = ((unsigned)ceil(d / ((1.0 - desired_bandwidth) / ratio)) | 1);136137cutoff = (1.0 / ratio) - (d / num_convolutions);138139//printf("%d %d %.20f\n", ratio, num_convolutions, cutoff);140assert(num_convolutions > ratio);141142143// Generate windowed sinc of POWER144coeffs_tmp.resize(num_convolutions);145//ResampleUtility::gen_sinc(&coeffs_tmp[0], num_convolutions, cutoff, beta);146ResampleUtility::gen_sinc_os(&coeffs_tmp[0], num_convolutions, cutoff, beta);147ResampleUtility::normalize(&coeffs_tmp[0], num_convolutions);148149// Copy from coeffs_tmp to coeffs~150// We multiply many coefficients at a time in the mac loop, so make sure the last few that don't really151// exist are allocated, zero'd mem.152153coeffs_mem.resize(((num_convolutions + 7) &~ 7) * sizeof(resample_coeff_t) + (align_boundary - 1));154coeffs = (resample_coeff_t *)ResampleUtility::make_aligned(&coeffs_mem[0], align_boundary);155156157for(unsigned i = 0; i < num_convolutions; i++)158coeffs[i] = coeffs_tmp[i];159160rb_eff_size = nall::bit::round(num_convolutions * 2) >> 1;161rb_readpos = 0;162rb_writepos = 0;163rb_in = 0;164165rb_mem.resize(rb_eff_size * 2 * sizeof(resample_samp_t) + (align_boundary - 1));166rb = (resample_samp_t *)ResampleUtility::make_aligned(&rb_mem[0], align_boundary);167}168169170inline bool SincResampleHR::output_avail(void)171{172return(rb_in >= (signed)num_convolutions);173}174175inline void SincResampleHR::write(resample_samp_t sample)176{177assert(!output_avail());178179rb[rb_writepos] = sample;180rb[rb_writepos + rb_eff_size] = sample;181rb_writepos = (rb_writepos + 1) & (rb_eff_size - 1);182rb_in++;183}184185resample_samp_t SincResampleHR::mac(const resample_samp_t *wave, const resample_coeff_t *coeff, unsigned count)186{187#if SINCRESAMPLE_USE_SSE188__m128 accum_veca[2] = { _mm_set1_ps(0), _mm_set1_ps(0) };189190resample_samp_t accum;191192for(unsigned c = 0; c < count; c += 8)193{194for(unsigned i = 0; i < 2; i++)195{196__m128 co[2];197__m128 w[2];198199co[i] = _mm_load_ps(&coeff[c + i * 4]);200w[i] = _mm_load_ps(&wave[c + i * 4]);201202w[i] = _mm_mul_ps(w[i], co[i]);203204accum_veca[i] = _mm_add_ps(w[i], accum_veca[i]);205}206}207208__m128 accum_vec = _mm_add_ps(accum_veca[0], accum_veca[1]); //_mm_add_ps(_mm_add_ps(accum_veca[0], accum_veca[1]), _mm_add_ps(accum_veca[2], accum_veca[3]));209210accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));211accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (1 << 0) | (0 << 2) | (1 << 4) | (0 << 6)));212213_mm_store_ss(&accum, accum_vec);214215return accum;216#else217resample_samp_t accum[4] = { 0, 0, 0, 0 };218219for(unsigned c = 0; c < count; c+= 4)220{221accum[0] += wave[c + 0] * coeff[c + 0];222accum[1] += wave[c + 1] * coeff[c + 1];223accum[2] += wave[c + 2] * coeff[c + 2];224accum[3] += wave[c + 3] * coeff[c + 3];225}226227return (accum[0] + accum[1]) + (accum[2] + accum[3]); // don't mess with parentheses(assuming compiler doesn't already, which it may...228229#endif230}231232233resample_samp_t SincResampleHR::read(void)234{235assert(output_avail());236resample_samp_t ret;237238ret = mac(&rb[rb_readpos], &coeffs[0], num_convolutions);239240rb_readpos = (rb_readpos + ratio) & (rb_eff_size - 1);241rb_in -= ratio;242243return ret;244}245246247SincResample::SincResample(double input_rate, double output_rate, double desired_bandwidth, unsigned quality)248{249const struct250{251double beta;252double d;253unsigned pn_nume;254unsigned phases_min;255} qtab[5] =256{257{ 5.658, 3.62, 4096, 4 },258{ 6.764, 4.32, 8192, 4 },259{ 7.865, 5.0, 16384, 8 },260{ 8.960, 5.7, 32768, 16 },261{ 10.056, 6.4, 65536, 32 }262};263264// Sanity checks265assert(ceil(input_rate) > 0);266assert(ceil(output_rate) > 0);267assert(ceil(input_rate / output_rate) <= 1024);268assert(ceil(output_rate / input_rate) <= 1024);269270// The simplistic number-of-phases calculation code doesn't work well enough for when desired_bandwidth is close to 1.0 and when271// upsampling.272assert(desired_bandwidth >= 0.25 && desired_bandwidth < 0.96);273assert(quality >= 0 && quality <= 4);274275hr_used = false;276277#if 1278// Round down to the nearest multiple of 4(so wave buffer remains aligned)279// It also adjusts the effective intermediate sampling rate up slightly, so that the upper frequencies below f/2280// aren't overly attenuated so much. In the future, we might want to do an FFT or something to choose the intermediate rate more accurately281// to virtually eliminate over-attenuation.282unsigned ioratio_rd = (unsigned)floor(input_rate / (output_rate * (1.0 + (1.0 - desired_bandwidth) / 2) )) & ~3;283284if(ioratio_rd >= 8)285{286hr.Init(ioratio_rd, desired_bandwidth, qtab[quality].beta, qtab[quality].d); //10.056, 6.4);287hr_used = true;288289input_rate /= ioratio_rd;290}291#endif292293Init(input_rate, output_rate, desired_bandwidth, qtab[quality].beta, qtab[quality].d, qtab[quality].pn_nume, qtab[quality].phases_min);294}295296void SincResample::Init(double input_rate, double output_rate, double desired_bandwidth, double beta, double d, unsigned pn_nume, unsigned phases_min)297{298const unsigned max_mult_atatime = 8; // multiply "granularity". must be power of 2.299const unsigned max_mult_minus1 = (max_mult_atatime - 1);300const unsigned conv_alignment_bytes = 16; // must be power of 2301const double input_to_output_ratio = input_rate / output_rate;302const double output_to_input_ratio = output_rate / input_rate;303double cutoff; // 1.0 = input_rate / 2304std::vector<double> coeff_init_buffer;305306// Round up num_convolutions to be even.307if(output_rate > input_rate)308num_convolutions = ((unsigned)ceil(d / (1.0 - desired_bandwidth)) + 1) & ~1;309else310num_convolutions = ((unsigned)ceil(d / (output_to_input_ratio * (1.0 - desired_bandwidth))) + 1) & ~1;311312if(output_rate > input_rate) // Upsampling313cutoff = desired_bandwidth;314else // Downsampling315cutoff = output_to_input_ratio * desired_bandwidth;316317// Round up to be even.318num_phases = (std::max<unsigned>(pn_nume / num_convolutions, phases_min) + 1) &~1;319320// Adjust cutoff to account for the multiple phases.321cutoff = cutoff / num_phases;322323assert((num_convolutions & 1) == 0);324assert((num_phases & 1) == 0);325326// fprintf(stderr, "num_convolutions=%u, num_phases=%u, total expected coeff byte size=%lu\n", num_convolutions, num_phases,327// (long)((num_phases + 2) * ((num_convolutions + max_mult_minus1) & ~max_mult_minus1) * sizeof(float) + conv_alignment_bytes));328329coeff_init_buffer.resize(num_phases * num_convolutions);330331coeffs.resize(num_phases + 1 + 1);332333coeff_mem.resize((num_phases + 1 + 1) * ((num_convolutions + max_mult_minus1) &~ max_mult_minus1) * sizeof(resample_coeff_t) + conv_alignment_bytes);334335// Assign aligned pointers into coeff_mem336{337resample_coeff_t *base_ptr = (resample_coeff_t *)ResampleUtility::make_aligned(&coeff_mem[0], conv_alignment_bytes);338339for(unsigned phase = 0; phase < (num_phases + 1 + 1); phase++)340{341coeffs[phase] = base_ptr + (((num_convolutions + max_mult_minus1) & ~max_mult_minus1) * phase);342}343}344345ResampleUtility::gen_sinc(&coeff_init_buffer[0], num_phases * num_convolutions, cutoff, beta);346ResampleUtility::normalize(&coeff_init_buffer[0], num_phases * num_convolutions, num_phases);347348// Reorder coefficients to allow for more efficient convolution.349for(int phase = -1; phase < ((int)num_phases + 1); phase++)350{351for(int conv = 0; conv < (int)num_convolutions; conv++)352{353double coeff;354355if(phase == -1 && conv == 0)356coeff = 0;357else if(phase == (int)num_phases && conv == ((int)num_convolutions - 1))358coeff = 0;359else360coeff = coeff_init_buffer[conv * num_phases + phase];361362coeffs[phase + 1][conv] = coeff;363}364}365366// Free a bit of mem367coeff_init_buffer.resize(0);368369step_int = floor(input_to_output_ratio);370step_fract = input_to_output_ratio - step_int;371372input_pos_fract = 0;373374// Do NOT use rb.size() later in the code, since it'll include the padding.375// We should only need one "max_mult_minus1" here, not two, since it won't matter if it over-reads(due to doing "max_mult_atatime" multiplications at a time376// rather than just 1, in which case this over-read wouldn't happen), from the first half into the duplicated half,377// since those corresponding coefficients will be zero anyway; this is just to handle the case of reading off the end of the duplicated half to378// prevent illegal memory accesses.379rb.resize(num_convolutions * 2 + max_mult_minus1);380381rb_readpos = 0;382rb_writepos = 0;383rb_in = 0;384}385386resample_samp_t SincResample::mac(const resample_samp_t *wave, const resample_coeff_t *coeffs_a, const resample_coeff_t *coeffs_b, const double ffract, unsigned count)387{388resample_samp_t accum = 0;389#if SINCRESAMPLE_USE_SSE390__m128 accum_vec_a[2] = { _mm_set1_ps(0), _mm_set1_ps(0) };391__m128 accum_vec_b[2] = { _mm_set1_ps(0), _mm_set1_ps(0) };392393for(unsigned c = 0; c < count; c += 8) //8) //4)394{395__m128 coeff_a[2];396__m128 coeff_b[2];397__m128 w[2];398__m128 result_a[2], result_b[2];399400for(unsigned i = 0; i < 2; i++)401{402coeff_a[i] = _mm_load_ps(&coeffs_a[c + (i * 4)]);403coeff_b[i] = _mm_load_ps(&coeffs_b[c + (i * 4)]);404w[i] = _mm_loadu_ps(&wave[c + (i * 4)]);405406result_a[i] = _mm_mul_ps(coeff_a[i], w[i]);407result_b[i] = _mm_mul_ps(coeff_b[i], w[i]);408409accum_vec_a[i] = _mm_add_ps(result_a[i], accum_vec_a[i]);410accum_vec_b[i] = _mm_add_ps(result_b[i], accum_vec_b[i]);411}412}413414__m128 accum_vec, av_a, av_b;415__m128 mult_a_vec = _mm_set1_ps(1.0 - ffract);416__m128 mult_b_vec = _mm_set1_ps(ffract);417418av_a = _mm_mul_ps(mult_a_vec, /*accum_vec_a[0]);*/ _mm_add_ps(accum_vec_a[0], accum_vec_a[1]));419av_b = _mm_mul_ps(mult_b_vec, /*accum_vec_b[0]);*/ _mm_add_ps(accum_vec_b[0], accum_vec_b[1]));420421accum_vec = _mm_add_ps(av_a, av_b);422423accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));424accum_vec = _mm_add_ps(accum_vec, _mm_shuffle_ps(accum_vec, accum_vec, (1 << 0) | (0 << 2) | (1 << 4) | (0 << 6)));425426_mm_store_ss(&accum, accum_vec);427#else428resample_coeff_t mult_a = 1.0 - ffract;429resample_coeff_t mult_b = ffract;430431for(unsigned c = 0; c < count; c += 4)432{433accum += wave[c + 0] * (coeffs_a[c + 0] * mult_a + coeffs_b[c + 0] * mult_b);434accum += wave[c + 1] * (coeffs_a[c + 1] * mult_a + coeffs_b[c + 1] * mult_b);435accum += wave[c + 2] * (coeffs_a[c + 2] * mult_a + coeffs_b[c + 2] * mult_b);436accum += wave[c + 3] * (coeffs_a[c + 3] * mult_a + coeffs_b[c + 3] * mult_b);437}438#endif439440return accum;441}442443inline bool SincResample::output_avail(void)444{445return(rb_in >= (int)num_convolutions);446}447448resample_samp_t SincResample::read(void)449{450assert(output_avail());451double phase = input_pos_fract * num_phases - 0.5;452signed phase_int = (signed)floor(phase);453double phase_fract = phase - phase_int;454unsigned phase_a = num_phases - 1 - phase_int;455unsigned phase_b = phase_a - 1;456resample_samp_t ret;457458ret = mac(&rb[rb_readpos], &coeffs[phase_a + 1][0], &coeffs[phase_b + 1][0], phase_fract, num_convolutions);459460unsigned int_increment = step_int;461462input_pos_fract += step_fract;463int_increment += floor(input_pos_fract);464input_pos_fract -= floor(input_pos_fract);465466rb_readpos = (rb_readpos + int_increment) % num_convolutions;467rb_in -= int_increment;468469return ret;470}471472inline void SincResample::write(resample_samp_t sample)473{474assert(!output_avail());475476if(hr_used)477{478hr.write(sample);479480if(hr.output_avail())481{482sample = hr.read();483}484else485{486return;487}488}489490rb[rb_writepos + 0 * num_convolutions] = sample;491rb[rb_writepos + 1 * num_convolutions] = sample;492rb_writepos = (rb_writepos + 1) % num_convolutions;493rb_in++;494}495496void ResampleUtility::kaiser_window( double* io, int count, double beta)497{498int const accuracy = 24; //16; //12;499500double* end = io + count;501502double beta2 = beta * beta * (double) -0.25;503double to_fract = beta2 / ((double) count * count);504double i = 0;505double rescale = 0; // Doesn't need an initializer, to shut up gcc506507for ( ; io < end; ++io, i += 1 )508{509double x = i * i * to_fract - beta2;510double u = x;511double k = x + 1;512513double n = 2;514do515{516u *= x / (n * n);517n += 1;518k += u;519}520while ( k <= u * (1 << accuracy) );521522if ( !i )523rescale = 1 / k; // otherwise values get large524525*io *= k * rescale;526}527}528529void ResampleUtility::gen_sinc(double* out, int size, double cutoff, double kaiser)530{531assert( size % 2 == 0 ); // size must be even532533int const half_size = size / 2;534double* const mid = &out [half_size];535536// Generate right half of sinc537for ( int i = 0; i < half_size; i++ )538{539double angle = (i * 2 + 1) * (M_PI / 2);540mid [i] = sin( angle * cutoff ) / angle;541}542543kaiser_window( mid, half_size, kaiser );544545// Mirror for left half546for ( int i = 0; i < half_size; i++ )547out [i] = mid [half_size - 1 - i];548}549550void ResampleUtility::gen_sinc_os(double* out, int size, double cutoff, double kaiser)551{552assert( size % 2 == 1); // size must be odd553554for(int i = 0; i < size; i++)555{556if(i == (size / 2))557out[i] = 2 * M_PI * (cutoff / 2); //0.078478; //1.0; //sin(2 * M_PI * (cutoff / 2) * (i - size / 2)) / (i - (size / 2));558else559out[i] = sin(2 * M_PI * (cutoff / 2) * (i - size / 2)) / (i - (size / 2));560561// out[i] *= 0.3635819 - 0.4891775 * cos(2 * M_PI * i / (size - 1)) + 0.1365995 * cos(4 * M_PI * i / (size - 1)) - 0.0106411 * cos(6 * M_PI * i / (size - 1));562//0.42 - 0.5 * cos(2 * M_PI * i / (size - 1)) + 0.08 * cos(4 * M_PI * i / (size - 1));563564// printf("%d %f\n", i, out[i]);565}566567kaiser_window(&out[size / 2], size / 2 + 1, kaiser);568569// Mirror for left half570for ( int i = 0; i < size / 2; i++ )571out [i] = out [size - 1 - i];572573}574575void ResampleUtility::normalize(double* io, int size, double gain)576{577double sum = 0;578for ( int i = 0; i < size; i++ )579sum += io [i];580581double scale = gain / sum;582for ( int i = 0; i < size; i++ )583io [i] *= scale;584}585586void* ResampleUtility::make_aligned(void* ptr, unsigned boundary)587{588unsigned char* null_ptr = (unsigned char *)NULL;589unsigned char* uc_ptr = (unsigned char *)ptr;590591uc_ptr += (boundary - ((uc_ptr - null_ptr) & (boundary - 1))) & (boundary - 1);592593//while((uc_ptr - null_ptr) & (boundary - 1))594// uc_ptr++;595596//printf("%16llx %16llx\n", (unsigned long long)ptr, (unsigned long long)uc_ptr);597598assert((uc_ptr - (unsigned char *)ptr) < boundary && (uc_ptr >= (unsigned char *)ptr));599600return uc_ptr;601}602603604