// Path: blob/master/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
// 9903 views
// basisu_kernels_imp.h - Do not directly include1// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.2//3// Licensed under the Apache License, Version 2.0 (the "License");4// you may not use this file except in compliance with the License.5// You may obtain a copy of the License at6//7// http://www.apache.org/licenses/LICENSE-2.08//9// Unless required by applicable law or agreed to in writing, software10// distributed under the License is distributed on an "AS IS" BASIS,11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.12// See the License for the specific language governing permissions and13// limitations under the License.1415using namespace CPPSPMD;1617namespace CPPSPMD_NAME(basisu_kernels_namespace)18{19struct perceptual_distance_rgb_4_N : spmd_kernel20{21void _call(int64_t* pDistance,22const uint8_t* pSelectors,23const color_rgba* pBlock_colors,24const color_rgba* pSrc_pixels, uint32_t n,25int64_t early_out_err)26{27assert(early_out_err >= 0);2829*pDistance = 0;3031__m128i block_colors[4];32vint block_colors_r[4], block_colors_g[4], block_colors_b[4];33for (uint32_t i = 0; i < 4; i++)34{35block_colors[i] = load_rgba32(&pBlock_colors[i]);36store_all(block_colors_r[i], (int)pBlock_colors[i].r);37store_all(block_colors_g[i], (int)pBlock_colors[i].g);38store_all(block_colors_b[i], (int)pBlock_colors[i].b);39}4041uint32_t i;42for (i = 0; (i + 4) <= n; i += 4)43{44__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);4546vint r, g, b, a;47transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);4849int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];5051vint base_r, base_g, base_b, base_a;52if ((s0 == s1) && (s0 == s2) && (s0 == s3))53{54store_all(base_r, block_colors_r[s0]);55store_all(base_g, block_colors_g[s0]);56store_all(base_b, 
block_colors_b[s0]);57}58else59{60__m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];61transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);62}6364vint dr = base_r - r;65vint dg = base_g - g;66vint db = base_b - b;6768vint delta_l = dr * 27 + dg * 92 + db * 9;69vint delta_cr = dr * 128 - delta_l;70vint delta_cb = db * 128 - delta_l;7172vint id = ((delta_l * delta_l) >> 7) +73((((delta_cr * delta_cr) >> 7) * 26) >> 7) +74((((delta_cb * delta_cb) >> 7) * 3) >> 7);7576*pDistance += reduce_add(id);77if (*pDistance >= early_out_err)78return;79}8081for (; i < n; i++)82{83int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;8485int sel = pSelectors[i];86int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;8788int dr = base_r - r;89int dg = base_g - g;90int db = base_b - b;9192int delta_l = dr * 27 + dg * 92 + db * 9;93int delta_cr = dr * 128 - delta_l;94int delta_cb = db * 128 - delta_l;9596int id = ((delta_l * delta_l) >> 7) +97((((delta_cr * delta_cr) >> 7) * 26) >> 7) +98((((delta_cb * delta_cb) >> 7) * 3) >> 7);99100*pDistance += id;101if (*pDistance >= early_out_err)102return;103}104}105};106107struct linear_distance_rgb_4_N : spmd_kernel108{109void _call(int64_t* pDistance,110const uint8_t* pSelectors,111const color_rgba* pBlock_colors,112const color_rgba* pSrc_pixels, uint32_t n,113int64_t early_out_err)114{115assert(early_out_err >= 0);116117*pDistance = 0;118119__m128i block_colors[4];120vint block_colors_r[4], block_colors_g[4], block_colors_b[4];121for (uint32_t i = 0; i < 4; i++)122{123block_colors[i] = load_rgba32(&pBlock_colors[i]);124store_all(block_colors_r[i], (int)pBlock_colors[i].r);125store_all(block_colors_g[i], (int)pBlock_colors[i].g);126store_all(block_colors_b[i], (int)pBlock_colors[i].b);127}128129uint32_t i;130for (i = 0; (i + 4) <= n; i += 4)131{132__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = 
load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);133134vint r, g, b, a;135transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);136137int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];138139vint base_r, base_g, base_b, base_a;140if ((s0 == s1) && (s0 == s2) && (s0 == s3))141{142store_all(base_r, block_colors_r[s0]);143store_all(base_g, block_colors_g[s0]);144store_all(base_b, block_colors_b[s0]);145}146else147{148__m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];149transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);150}151152vint dr = base_r - r;153vint dg = base_g - g;154vint db = base_b - b;155156vint id = dr * dr + dg * dg + db * db;157158*pDistance += reduce_add(id);159if (*pDistance >= early_out_err)160return;161}162163for (; i < n; i++)164{165int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;166167int sel = pSelectors[i];168int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;169170int dr = base_r - r;171int dg = base_g - g;172int db = base_b - b;173174int id = dr * dr + dg * dg + db * db;175176*pDistance += id;177if (*pDistance >= early_out_err)178return;179}180}181};182183struct find_selectors_perceptual_rgb_4_N : spmd_kernel184{185inline vint compute_dist(186const vint& base_r, const vint& base_g, const vint& base_b,187const vint& r, const vint& g, const vint& b)188{189vint dr = base_r - r;190vint dg = base_g - g;191vint db = base_b - b;192193vint delta_l = dr * 27 + dg * 92 + db * 9;194vint delta_cr = dr * 128 - delta_l;195vint delta_cb = db * 128 - delta_l;196197vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +198VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +199VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);200201return id;202}203204void 
_call(int64_t* pDistance,205uint8_t* pSelectors,206const color_rgba* pBlock_colors,207const color_rgba* pSrc_pixels, uint32_t n,208int64_t early_out_err)209{210assert(early_out_err >= 0);211212*pDistance = 0;213214vint block_colors_r[4], block_colors_g[4], block_colors_b[4];215for (uint32_t i = 0; i < 4; i++)216{217store_all(block_colors_r[i], (int)pBlock_colors[i].r);218store_all(block_colors_g[i], (int)pBlock_colors[i].g);219store_all(block_colors_b[i], (int)pBlock_colors[i].b);220}221222const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);223224uint32_t i;225226for (i = 0; (i + 4) <= n; i += 4)227{228__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);229230vint r, g, b, a;231transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);232233vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);234vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);235vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);236vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);237238vint min_dist = min(min(min(dist0, dist1), dist2), dist3);239240vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));241242__m128i vsels = shuffle_epi8(sels.m_value, shuf);243storeu_si32((void *)(pSelectors + i), vsels);244245*pDistance += reduce_add(min_dist);246if (*pDistance >= early_out_err)247return;248}249250for (; i < n; i++)251{252int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;253254int best_err = INT_MAX, best_sel = 0;255for (int sel = 0; sel < 4; sel++)256{257int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = 
pBlock_colors[sel].b;258259int dr = base_r - r;260int dg = base_g - g;261int db = base_b - b;262263int delta_l = dr * 27 + dg * 92 + db * 9;264int delta_cr = dr * 128 - delta_l;265int delta_cb = db * 128 - delta_l;266267int id = ((delta_l * delta_l) >> 7) +268((((delta_cr * delta_cr) >> 7) * 26) >> 7) +269((((delta_cb * delta_cb) >> 7) * 3) >> 7);270if (id < best_err)271{272best_err = id;273best_sel = sel;274}275}276277pSelectors[i] = (uint8_t)best_sel;278279*pDistance += best_err;280if (*pDistance >= early_out_err)281return;282}283}284};285286struct find_selectors_linear_rgb_4_N : spmd_kernel287{288inline vint compute_dist(289const vint& base_r, const vint& base_g, const vint& base_b,290const vint& r, const vint& g, const vint& b)291{292vint dr = base_r - r;293vint dg = base_g - g;294vint db = base_b - b;295296vint id = dr * dr + dg * dg + db * db;297return id;298}299300void _call(int64_t* pDistance,301uint8_t* pSelectors,302const color_rgba* pBlock_colors,303const color_rgba* pSrc_pixels, uint32_t n,304int64_t early_out_err)305{306assert(early_out_err >= 0);307308*pDistance = 0;309310vint block_colors_r[4], block_colors_g[4], block_colors_b[4];311for (uint32_t i = 0; i < 4; i++)312{313store_all(block_colors_r[i], (int)pBlock_colors[i].r);314store_all(block_colors_g[i], (int)pBlock_colors[i].g);315store_all(block_colors_b[i], (int)pBlock_colors[i].b);316}317318const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);319320uint32_t i;321322for (i = 0; (i + 4) <= n; i += 4)323{324__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);325326vint r, g, b, a;327transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);328329vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);330vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], 
block_colors_b[1], r, g, b);331vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);332vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);333334vint min_dist = min(min(min(dist0, dist1), dist2), dist3);335336vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));337338__m128i vsels = shuffle_epi8(sels.m_value, shuf);339storeu_si32((void *)(pSelectors + i), vsels);340341*pDistance += reduce_add(min_dist);342if (*pDistance >= early_out_err)343return;344}345346for (; i < n; i++)347{348int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;349350int best_err = INT_MAX, best_sel = 0;351for (int sel = 0; sel < 4; sel++)352{353int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;354355int dr = base_r - r;356int dg = base_g - g;357int db = base_b - b;358359int id = dr * dr + dg * dg + db * db;360if (id < best_err)361{362best_err = id;363best_sel = sel;364}365}366367pSelectors[i] = (uint8_t)best_sel;368369*pDistance += best_err;370if (*pDistance >= early_out_err)371return;372}373}374};375376struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel377{378inline vint compute_dist(379const vint& base_r, const vint& base_g, const vint& base_b,380const vint& r, const vint& g, const vint& b)381{382vint dr = base_r - r;383vint dg = base_g - g;384vint db = base_b - b;385386vint delta_l = dr * 27 + dg * 92 + db * 9;387vint delta_cr = dr * 128 - delta_l;388vint delta_cb = db * 128 - delta_l;389390vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +391VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +392VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);393394return id;395}396397void _call(int64_t* pDistance,398const color_rgba* pBlock_colors,399const color_rgba* pSrc_pixels, uint32_t n,400int64_t early_out_error)401{402assert(early_out_error >= 
0);403404*pDistance = 0;405406vint block_colors_r[4], block_colors_g[4], block_colors_b[4];407for (uint32_t i = 0; i < 4; i++)408{409store_all(block_colors_r[i], (int)pBlock_colors[i].r);410store_all(block_colors_g[i], (int)pBlock_colors[i].g);411store_all(block_colors_b[i], (int)pBlock_colors[i].b);412}413414uint32_t i;415416for (i = 0; (i + 4) <= n; i += 4)417{418__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);419420vint r, g, b, a;421transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);422423vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);424vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);425vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);426vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);427428vint min_dist = min(min(min(dist0, dist1), dist2), dist3);429430*pDistance += reduce_add(min_dist);431if (*pDistance > early_out_error)432return;433}434435for (; i < n; i++)436{437int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;438439int best_err = INT_MAX;440for (int sel = 0; sel < 4; sel++)441{442int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;443444int dr = base_r - r;445int dg = base_g - g;446int db = base_b - b;447448int delta_l = dr * 27 + dg * 92 + db * 9;449int delta_cr = dr * 128 - delta_l;450int delta_cb = db * 128 - delta_l;451452int id = ((delta_l * delta_l) >> 7) +453((((delta_cr * delta_cr) >> 7) * 26) >> 7) +454((((delta_cb * delta_cb) >> 7) * 3) >> 7);455456if (id < best_err)457{458best_err = id;459}460}461462*pDistance += best_err;463if (*pDistance > early_out_error)464return;465}466}467};468469struct find_lowest_error_linear_rgb_4_N : spmd_kernel470{471inline vint 
compute_dist(472const vint& base_r, const vint& base_g, const vint& base_b,473const vint& r, const vint& g, const vint& b)474{475vint dr = base_r - r;476vint dg = base_g - g;477vint db = base_b - b;478479vint id = dr * dr + dg * dg + db * db;480481return id;482}483484void _call(int64_t* pDistance,485const color_rgba* pBlock_colors,486const color_rgba* pSrc_pixels, uint32_t n,487int64_t early_out_error)488{489assert(early_out_error >= 0);490491*pDistance = 0;492493vint block_colors_r[4], block_colors_g[4], block_colors_b[4];494for (uint32_t i = 0; i < 4; i++)495{496store_all(block_colors_r[i], (int)pBlock_colors[i].r);497store_all(block_colors_g[i], (int)pBlock_colors[i].g);498store_all(block_colors_b[i], (int)pBlock_colors[i].b);499}500501uint32_t i;502503for (i = 0; (i + 4) <= n; i += 4)504{505__m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);506507vint r, g, b, a;508transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);509510vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);511vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);512vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);513vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);514515vint min_dist = min(min(min(dist0, dist1), dist2), dist3);516517*pDistance += reduce_add(min_dist);518if (*pDistance > early_out_error)519return;520}521522for (; i < n; i++)523{524int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;525526int best_err = INT_MAX;527for (int sel = 0; sel < 4; sel++)528{529int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;530531int dr = base_r - r;532int dg = base_g - g;533int db = base_b - b;534535int id = dr * dr + dg * dg + db * db;536537if 
(id < best_err)538{539best_err = id;540}541}542543*pDistance += best_err;544if (*pDistance > early_out_error)545return;546}547}548};549550struct update_covar_matrix_16x16 : spmd_kernel551{552void _call(553uint32_t num_vecs, const void* pWeighted_vecs_void, const void* pOrigin_void, const uint32_t* pVec_indices, void* pMatrix16x16_void)554{555const std::pair<vec16F, uint64_t>* pWeighted_vecs = static_cast< const std::pair<vec16F, uint64_t> *>(pWeighted_vecs_void);556557const float* pOrigin = static_cast<const float*>(pOrigin_void);558vfloat org0 = loadu_linear_all(pOrigin), org1 = loadu_linear_all(pOrigin + 4), org2 = loadu_linear_all(pOrigin + 8), org3 = loadu_linear_all(pOrigin + 12);559560vfloat mat[16][4];561vfloat vzero(zero_vfloat());562563for (uint32_t i = 0; i < 16; i++)564{565store_all(mat[i][0], vzero);566store_all(mat[i][1], vzero);567store_all(mat[i][2], vzero);568store_all(mat[i][3], vzero);569}570571for (uint32_t k = 0; k < num_vecs; k++)572{573const uint32_t vec_index = pVec_indices[k];574575const float* pW = pWeighted_vecs[vec_index].first.get_ptr();576vfloat weight((float)pWeighted_vecs[vec_index].second);577578vfloat vec[4] = { loadu_linear_all(pW) - org0, loadu_linear_all(pW + 4) - org1, loadu_linear_all(pW + 8) - org2, loadu_linear_all(pW + 12) - org3 };579580vfloat wvec0 = vec[0] * weight, wvec1 = vec[1] * weight, wvec2 = vec[2] * weight, wvec3 = vec[3] * weight;581582for (uint32_t j = 0; j < 16; j++)583{584vfloat vx = ((const float*)vec)[j];585586store_all(mat[j][0], mat[j][0] + vx * wvec0);587store_all(mat[j][1], mat[j][1] + vx * wvec1);588store_all(mat[j][2], mat[j][2] + vx * wvec2);589store_all(mat[j][3], mat[j][3] + vx * wvec3);590591} // j592593} // k594595float* pMatrix = static_cast<float*>(pMatrix16x16_void);596597float* pDst = pMatrix;598for (uint32_t i = 0; i < 16; i++)599{600storeu_linear_all(pDst, mat[i][0]);601storeu_linear_all(pDst + 4, mat[i][1]);602storeu_linear_all(pDst + 8, mat[i][2]);603storeu_linear_all(pDst + 12, 
mat[i][3]);604pDst += 16;605}606}607};608609} // namespace610611using namespace CPPSPMD_NAME(basisu_kernels_namespace);612613void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)614{615spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);616}617618void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)619{620spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);621}622623void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)624{625spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);626}627628void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)629{630spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);631}632633void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)634{635spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);636}637638void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)639{640spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);641}642643void 
CPPSPMD_NAME(update_covar_matrix_16x16)(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16)644{645spmd_call < update_covar_matrix_16x16 >(num_vecs, pWeighted_vecs, pOrigin, pVec_indices, pMatrix16x16);646}647648649