/* Path: blob/main/system/include/compat/avx2intrin.h (6171 views) */
/*1* Copyright 2024 The Emscripten Authors. All rights reserved.2* Emscripten is available under two separate licenses, the MIT license and the3* University of Illinois/NCSA Open Source License. Both these licenses can be4* found in the LICENSE file.5*/67#ifndef __emscripten_immintrin_h__8#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."9#endif1011#ifndef __emscripten_avx2intrin_h__12#define __emscripten_avx2intrin_h__1314#ifndef __AVX2__15#error "AVX2 instruction set not enabled"16#endif1718#define _mm256_mpsadbw_epu8(__A, __B, __imm) \19__extension__({ \20__m256i __a = (__A); \21__m256i __b = (__B); \22_mm256_set_m128i(_mm_mpsadbw_epu8(__a.v1, __b.v1, (__imm) >> 3), \23_mm_mpsadbw_epu8(__a.v0, __b.v0, (__imm))); \24})2526static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))27_mm256_abs_epi8(__m256i __a) {28__m256i ret;29ret.v0 = _mm_abs_epi8(__a.v0);30ret.v1 = _mm_abs_epi8(__a.v1);31return ret;32}3334static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))35_mm256_abs_epi16(__m256i __a) {36__m256i ret;37ret.v0 = _mm_abs_epi16(__a.v0);38ret.v1 = _mm_abs_epi16(__a.v1);39return ret;40}4142static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))43_mm256_abs_epi32(__m256i __a) {44__m256i ret;45ret.v0 = _mm_abs_epi32(__a.v0);46ret.v1 = _mm_abs_epi32(__a.v1);47return ret;48}4950static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))51_mm256_packs_epi16(__m256i __a, __m256i __b) {52__m256i ret;53ret.v0 = _mm_packs_epi16(__a.v0, __b.v0);54ret.v1 = _mm_packs_epi16(__a.v1, __b.v1);55return ret;56}5758static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))59_mm256_packs_epi32(__m256i __a, __m256i __b) {60__m256i ret;61ret.v0 = _mm_packs_epi32(__a.v0, __b.v0);62ret.v1 = _mm_packs_epi32(__a.v1, __b.v1);63return ret;64}6566static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))67_mm256_packus_epi16(__m256i __a, __m256i __b) {68__m256i ret;69ret.v0 
= _mm_packus_epi16(__a.v0, __b.v0);70ret.v1 = _mm_packus_epi16(__a.v1, __b.v1);71return ret;72}7374static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))75_mm256_packus_epi32(__m256i __a, __m256i __b) {76__m256i ret;77ret.v0 = _mm_packus_epi32(__a.v0, __b.v0);78ret.v1 = _mm_packus_epi32(__a.v1, __b.v1);79return ret;80}8182static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))83_mm256_add_epi8(__m256i __a, __m256i __b) {84__m256i ret;85ret.v0 = _mm_add_epi8(__a.v0, __b.v0);86ret.v1 = _mm_add_epi8(__a.v1, __b.v1);87return ret;88}8990static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))91_mm256_add_epi16(__m256i __a, __m256i __b) {92__m256i ret;93ret.v0 = _mm_add_epi16(__a.v0, __b.v0);94ret.v1 = _mm_add_epi16(__a.v1, __b.v1);95return ret;96}9798static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))99_mm256_add_epi32(__m256i __a, __m256i __b) {100__m256i ret;101ret.v0 = _mm_add_epi32(__a.v0, __b.v0);102ret.v1 = _mm_add_epi32(__a.v1, __b.v1);103return ret;104}105106static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))107_mm256_add_epi64(__m256i __a, __m256i __b) {108__m256i ret;109ret.v0 = _mm_add_epi64(__a.v0, __b.v0);110ret.v1 = _mm_add_epi64(__a.v1, __b.v1);111return ret;112}113114static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))115_mm256_adds_epi8(__m256i __a, __m256i __b) {116__m256i ret;117ret.v0 = _mm_adds_epi8(__a.v0, __b.v0);118ret.v1 = _mm_adds_epi8(__a.v1, __b.v1);119return ret;120}121122static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))123_mm256_adds_epi16(__m256i __a, __m256i __b) {124__m256i ret;125ret.v0 = _mm_adds_epi16(__a.v0, __b.v0);126ret.v1 = _mm_adds_epi16(__a.v1, __b.v1);127return ret;128}129130static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))131_mm256_adds_epu8(__m256i __a, __m256i __b) {132__m256i ret;133ret.v0 = _mm_adds_epu8(__a.v0, __b.v0);134ret.v1 = _mm_adds_epu8(__a.v1, 
__b.v1);135return ret;136}137138static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))139_mm256_adds_epu16(__m256i __a, __m256i __b) {140__m256i ret;141ret.v0 = _mm_adds_epu16(__a.v0, __b.v0);142ret.v1 = _mm_adds_epu16(__a.v1, __b.v1);143return ret;144}145146#define _mm256_alignr_epi8(__A, __B, __imm) \147__extension__({ \148__m256i __a = (__A); \149__m256i __b = (__B); \150_mm256_set_m128i(_mm_alignr_epi8(__a.v1, __b.v1, (__imm)), \151_mm_alignr_epi8(__a.v0, __b.v0, (__imm))); \152})153154static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))155_mm256_and_si256(__m256i __a, __m256i __b) {156__m256i ret;157ret.v0 = _mm_and_si128(__a.v0, __b.v0);158ret.v1 = _mm_and_si128(__a.v1, __b.v1);159return ret;160}161162static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))163_mm256_andnot_si256(__m256i __a, __m256i __b) {164__m256i ret;165ret.v0 = _mm_andnot_si128(__a.v0, __b.v0);166ret.v1 = _mm_andnot_si128(__a.v1, __b.v1);167return ret;168}169170static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))171_mm256_avg_epu8(__m256i __a, __m256i __b) {172__m256i ret;173ret.v0 = _mm_avg_epu8(__a.v0, __b.v0);174ret.v1 = _mm_avg_epu8(__a.v1, __b.v1);175return ret;176}177178static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))179_mm256_avg_epu16(__m256i __a, __m256i __b) {180__m256i ret;181ret.v0 = _mm_avg_epu16(__a.v0, __b.v0);182ret.v1 = _mm_avg_epu16(__a.v1, __b.v1);183return ret;184}185186static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))187_mm256_blendv_epi8(__m256i __a, __m256i __b, __m256i __mask) {188__m256i ret;189ret.v0 = _mm_blendv_epi8(__a.v0, __b.v0, __mask.v0);190ret.v1 = _mm_blendv_epi8(__a.v1, __b.v1, __mask.v1);191return ret;192}193194#define _mm256_blend_epi16(__A, __B, __imm) \195__extension__({ \196__m256i __a = (__A); \197__m256i __b = (__B); \198_mm256_set_m128i(_mm_blend_epi16(__a.v1, __b.v1, (__imm)), \199_mm_blend_epi16(__a.v0, __b.v0, 
(__imm))); \200})201202static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))203_mm256_cmpeq_epi8(__m256i __a, __m256i __b) {204__m256i ret;205ret.v0 = _mm_cmpeq_epi8(__a.v0, __b.v0);206ret.v1 = _mm_cmpeq_epi8(__a.v1, __b.v1);207return ret;208}209210static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))211_mm256_cmpeq_epi16(__m256i __a, __m256i __b) {212__m256i ret;213ret.v0 = _mm_cmpeq_epi16(__a.v0, __b.v0);214ret.v1 = _mm_cmpeq_epi16(__a.v1, __b.v1);215return ret;216}217218static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))219_mm256_cmpeq_epi32(__m256i __a, __m256i __b) {220__m256i ret;221ret.v0 = _mm_cmpeq_epi32(__a.v0, __b.v0);222ret.v1 = _mm_cmpeq_epi32(__a.v1, __b.v1);223return ret;224}225226static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))227_mm256_cmpeq_epi64(__m256i __a, __m256i __b) {228__m256i ret;229ret.v0 = _mm_cmpeq_epi64(__a.v0, __b.v0);230ret.v1 = _mm_cmpeq_epi64(__a.v1, __b.v1);231return ret;232}233234static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))235_mm256_cmpgt_epi8(__m256i __a, __m256i __b) {236__m256i ret;237ret.v0 = _mm_cmpgt_epi8(__a.v0, __b.v0);238ret.v1 = _mm_cmpgt_epi8(__a.v1, __b.v1);239return ret;240}241242static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))243_mm256_cmpgt_epi16(__m256i __a, __m256i __b) {244__m256i ret;245ret.v0 = _mm_cmpgt_epi16(__a.v0, __b.v0);246ret.v1 = _mm_cmpgt_epi16(__a.v1, __b.v1);247return ret;248}249250static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))251_mm256_cmpgt_epi32(__m256i __a, __m256i __b) {252__m256i ret;253ret.v0 = _mm_cmpgt_epi32(__a.v0, __b.v0);254ret.v1 = _mm_cmpgt_epi32(__a.v1, __b.v1);255return ret;256}257258static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))259_mm256_cmpgt_epi64(__m256i __a, __m256i __b) {260__m256i ret;261ret.v0 = _mm_cmpgt_epi64(__a.v0, __b.v0);262ret.v1 = _mm_cmpgt_epi64(__a.v1, __b.v1);263return 
ret;264}265266static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))267_mm256_hadd_epi16(__m256i __a, __m256i __b) {268__m256i ret;269ret.v0 = _mm_hadd_epi16(__a.v0, __b.v0);270ret.v1 = _mm_hadd_epi16(__a.v1, __b.v1);271return ret;272}273274static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))275_mm256_hadd_epi32(__m256i __a, __m256i __b) {276__m256i ret;277ret.v0 = _mm_hadd_epi32(__a.v0, __b.v0);278ret.v1 = _mm_hadd_epi32(__a.v1, __b.v1);279return ret;280}281282static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))283_mm256_hadds_epi16(__m256i __a, __m256i __b) {284__m256i ret;285ret.v0 = _mm_hadds_epi16(__a.v0, __b.v0);286ret.v1 = _mm_hadds_epi16(__a.v1, __b.v1);287return ret;288}289290static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))291_mm256_hsub_epi16(__m256i __a, __m256i __b) {292__m256i ret;293ret.v0 = _mm_hsub_epi16(__a.v0, __b.v0);294ret.v1 = _mm_hsub_epi16(__a.v1, __b.v1);295return ret;296}297298static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))299_mm256_hsub_epi32(__m256i __a, __m256i __b) {300__m256i ret;301ret.v0 = _mm_hsub_epi32(__a.v0, __b.v0);302ret.v1 = _mm_hsub_epi32(__a.v1, __b.v1);303return ret;304}305306static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))307_mm256_hsubs_epi16(__m256i __a, __m256i __b) {308__m256i ret;309ret.v0 = _mm_hsubs_epi16(__a.v0, __b.v0);310ret.v1 = _mm_hsubs_epi16(__a.v1, __b.v1);311return ret;312}313314static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))315_mm256_maddubs_epi16(__m256i __a, __m256i __b) {316__m256i ret;317ret.v0 = _mm_maddubs_epi16(__a.v0, __b.v0);318ret.v1 = _mm_maddubs_epi16(__a.v1, __b.v1);319return ret;320}321322static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))323_mm256_madd_epi16(__m256i __a, __m256i __b) {324__m256i ret;325ret.v0 = _mm_madd_epi16(__a.v0, __b.v0);326ret.v1 = _mm_madd_epi16(__a.v1, __b.v1);327return 
ret;328}329330static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))331_mm256_max_epi8(__m256i __a, __m256i __b) {332__m256i ret;333ret.v0 = _mm_max_epi8(__a.v0, __b.v0);334ret.v1 = _mm_max_epi8(__a.v1, __b.v1);335return ret;336}337338static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))339_mm256_max_epi16(__m256i __a, __m256i __b) {340__m256i ret;341ret.v0 = _mm_max_epi16(__a.v0, __b.v0);342ret.v1 = _mm_max_epi16(__a.v1, __b.v1);343return ret;344}345346static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))347_mm256_max_epi32(__m256i __a, __m256i __b) {348__m256i ret;349ret.v0 = _mm_max_epi32(__a.v0, __b.v0);350ret.v1 = _mm_max_epi32(__a.v1, __b.v1);351return ret;352}353354static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))355_mm256_max_epu8(__m256i __a, __m256i __b) {356__m256i ret;357ret.v0 = _mm_max_epu8(__a.v0, __b.v0);358ret.v1 = _mm_max_epu8(__a.v1, __b.v1);359return ret;360}361362static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))363_mm256_max_epu16(__m256i __a, __m256i __b) {364__m256i ret;365ret.v0 = _mm_max_epu16(__a.v0, __b.v0);366ret.v1 = _mm_max_epu16(__a.v1, __b.v1);367return ret;368}369370static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))371_mm256_max_epu32(__m256i __a, __m256i __b) {372__m256i ret;373ret.v0 = _mm_max_epu32(__a.v0, __b.v0);374ret.v1 = _mm_max_epu32(__a.v1, __b.v1);375return ret;376}377378static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))379_mm256_min_epi8(__m256i __a, __m256i __b) {380__m256i ret;381ret.v0 = _mm_min_epi8(__a.v0, __b.v0);382ret.v1 = _mm_min_epi8(__a.v1, __b.v1);383return ret;384}385386static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))387_mm256_min_epi16(__m256i __a, __m256i __b) {388__m256i ret;389ret.v0 = _mm_min_epi16(__a.v0, __b.v0);390ret.v1 = _mm_min_epi16(__a.v1, __b.v1);391return ret;392}393394static __inline__ __m256i 
__attribute__((__always_inline__, __nodebug__))395_mm256_min_epi32(__m256i __a, __m256i __b) {396__m256i ret;397ret.v0 = _mm_min_epi32(__a.v0, __b.v0);398ret.v1 = _mm_min_epi32(__a.v1, __b.v1);399return ret;400}401402static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))403_mm256_min_epu8(__m256i __a, __m256i __b) {404__m256i ret;405ret.v0 = _mm_min_epu8(__a.v0, __b.v0);406ret.v1 = _mm_min_epu8(__a.v1, __b.v1);407return ret;408}409410static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))411_mm256_min_epu16(__m256i __a, __m256i __b) {412__m256i ret;413ret.v0 = _mm_min_epu16(__a.v0, __b.v0);414ret.v1 = _mm_min_epu16(__a.v1, __b.v1);415return ret;416}417418static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))419_mm256_min_epu32(__m256i __a, __m256i __b) {420__m256i ret;421ret.v0 = _mm_min_epu32(__a.v0, __b.v0);422ret.v1 = _mm_min_epu32(__a.v1, __b.v1);423return ret;424}425426static __inline__ int __attribute__((__always_inline__, __nodebug__))427_mm256_movemask_epi8(__m256i __a) {428return (_mm_movemask_epi8(__a.v1) << 16) | _mm_movemask_epi8(__a.v0);429}430431static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))432_mm256_cvtepi8_epi16(__m128i __a) {433__m256i ret;434ret.v0 = _mm_cvtepi8_epi16(__a);435ret.v1 = _mm_cvtepi8_epi16(_mm_shuffle_epi32(__a, 0x4E));436return ret;437}438439static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))440_mm256_cvtepi8_epi32(__m128i __a) {441__m256i ret;442ret.v0 = _mm_cvtepi8_epi32(__a);443ret.v1 = _mm_cvtepi8_epi32(_mm_shuffle_epi32(__a, 0xE1));444return ret;445}446447static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))448_mm256_cvtepi8_epi64(__m128i __a) {449__m256i ret;450ret.v0 = _mm_cvtepi8_epi64(__a);451ret.v1 = _mm_cvtepi8_epi64(_mm_srli_epi32(__a, 16));452return ret;453}454455static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))456_mm256_cvtepi16_epi32(__m128i __a) {457__m256i ret;458ret.v0 
= _mm_cvtepi16_epi32(__a);459ret.v1 = _mm_cvtepi16_epi32(_mm_shuffle_epi32(__a, 0x4E));460return ret;461}462463static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))464_mm256_cvtepi16_epi64(__m128i __a) {465__m256i ret;466ret.v0 = _mm_cvtepi16_epi64(__a);467ret.v1 = _mm_cvtepi16_epi64(_mm_shuffle_epi32(__a, 0xE1));468return ret;469}470471static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))472_mm256_cvtepi32_epi64(__m128i __a) {473__m256i ret;474ret.v0 = _mm_cvtepi32_epi64(__a);475ret.v1 = _mm_cvtepi32_epi64(_mm_shuffle_epi32(__a, 0x4E));476return ret;477}478479static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))480_mm256_cvtepu8_epi16(__m128i __a) {481__m256i ret;482ret.v0 = _mm_cvtepu8_epi16(__a);483ret.v1 = _mm_cvtepu8_epi16(_mm_shuffle_epi32(__a, 0x4E));484return ret;485}486487static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))488_mm256_cvtepu8_epi32(__m128i __a) {489__m256i ret;490ret.v0 = _mm_cvtepu8_epi32(__a);491ret.v1 = _mm_cvtepu8_epi32(_mm_shuffle_epi32(__a, 0xE1));492return ret;493}494495static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))496_mm256_cvtepu8_epi64(__m128i __a) {497__m256i ret;498ret.v0 = _mm_cvtepu8_epi64(__a);499ret.v1 = _mm_cvtepu8_epi64(_mm_srli_epi32(__a, 16));500return ret;501}502503static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))504_mm256_cvtepu16_epi32(__m128i __a) {505__m256i ret;506ret.v0 = _mm_cvtepu16_epi32(__a);507ret.v1 = _mm_cvtepu16_epi32(_mm_shuffle_epi32(__a, 0x4E));508return ret;509}510511static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))512_mm256_cvtepu16_epi64(__m128i __a) {513__m256i ret;514ret.v0 = _mm_cvtepu16_epi64(__a);515ret.v1 = _mm_cvtepu16_epi64(_mm_shuffle_epi32(__a, 0xE1));516return ret;517}518519static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))520_mm256_cvtepu32_epi64(__m128i __a) {521__m256i ret;522ret.v0 = 
_mm_cvtepu32_epi64(__a);523ret.v1 = _mm_cvtepu32_epi64(_mm_shuffle_epi32(__a, 0x4E));524return ret;525}526527static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))528_mm256_mul_epi32(__m256i __a, __m256i __b) {529__m256i ret;530ret.v0 = _mm_mul_epi32(__a.v0, __b.v0);531ret.v1 = _mm_mul_epi32(__a.v1, __b.v1);532return ret;533}534535static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))536_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {537__m256i ret;538ret.v0 = _mm_mulhrs_epi16(__a.v0, __b.v0);539ret.v1 = _mm_mulhrs_epi16(__a.v1, __b.v1);540return ret;541}542543static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))544_mm256_mulhi_epu16(__m256i __a, __m256i __b) {545__m256i ret;546ret.v0 = _mm_mulhi_epu16(__a.v0, __b.v0);547ret.v1 = _mm_mulhi_epu16(__a.v1, __b.v1);548return ret;549}550551static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))552_mm256_mulhi_epi16(__m256i __a, __m256i __b) {553__m256i ret;554ret.v0 = _mm_mulhi_epi16(__a.v0, __b.v0);555ret.v1 = _mm_mulhi_epi16(__a.v1, __b.v1);556return ret;557}558559static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))560_mm256_mullo_epi16(__m256i __a, __m256i __b) {561__m256i ret;562ret.v0 = _mm_mullo_epi16(__a.v0, __b.v0);563ret.v1 = _mm_mullo_epi16(__a.v1, __b.v1);564return ret;565}566567static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))568_mm256_mullo_epi32(__m256i __a, __m256i __b) {569__m256i ret;570ret.v0 = _mm_mullo_epi32(__a.v0, __b.v0);571ret.v1 = _mm_mullo_epi32(__a.v1, __b.v1);572return ret;573}574575static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))576_mm256_mul_epu32(__m256i __a, __m256i __b) {577__m256i ret;578ret.v0 = _mm_mul_epu32(__a.v0, __b.v0);579ret.v1 = _mm_mul_epu32(__a.v1, __b.v1);580return ret;581}582583static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))584_mm256_or_si256(__m256i __a, __m256i __b) {585__m256i ret;586ret.v0 = 
_mm_or_si128(__a.v0, __b.v0);587ret.v1 = _mm_or_si128(__a.v1, __b.v1);588return ret;589}590591static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))592_mm256_sad_epu8(__m256i __a, __m256i __b) {593__m256i ret;594ret.v0 = _mm_sad_epu8(__a.v0, __b.v0);595ret.v1 = _mm_sad_epu8(__a.v1, __b.v1);596return ret;597}598599static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))600_mm256_shuffle_epi8(__m256i __a, __m256i __b) {601__m256i ret;602ret.v0 = _mm_shuffle_epi8(__a.v0, __b.v0);603ret.v1 = _mm_shuffle_epi8(__a.v1, __b.v1);604return ret;605}606607#define _mm256_shuffle_epi32(__A, __imm) \608__extension__({ \609__m256i __a = (__A); \610_mm256_set_m128i(_mm_shuffle_epi32(__a.v1, (__imm)), \611_mm_shuffle_epi32(__a.v0, (__imm))); \612})613614#define _mm256_shufflehi_epi16(__A, __imm) \615__extension__({ \616__m256i __a = (__A); \617_mm256_set_m128i(_mm_shufflehi_epi16(__a.v1, (__imm)), \618_mm_shufflehi_epi16(__a.v0, (__imm))); \619})620621#define _mm256_shufflelo_epi16(__A, __imm) \622__extension__({ \623__m256i __a = (__A); \624_mm256_set_m128i(_mm_shufflelo_epi16(__a.v1, (__imm)), \625_mm_shufflelo_epi16(__a.v0, (__imm))); \626})627628static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))629_mm256_sign_epi8(__m256i __a, __m256i __b) {630__m256i ret;631ret.v0 = _mm_sign_epi8(__a.v0, __b.v0);632ret.v1 = _mm_sign_epi8(__a.v1, __b.v1);633return ret;634}635636static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))637_mm256_sign_epi16(__m256i __a, __m256i __b) {638__m256i ret;639ret.v0 = _mm_sign_epi16(__a.v0, __b.v0);640ret.v1 = _mm_sign_epi16(__a.v1, __b.v1);641return ret;642}643644static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))645_mm256_sign_epi32(__m256i __a, __m256i __b) {646__m256i ret;647ret.v0 = _mm_sign_epi32(__a.v0, __b.v0);648ret.v1 = _mm_sign_epi32(__a.v1, __b.v1);649return ret;650}651652#define _mm256_slli_si256(__A, __imm) \653__extension__({ \654__m256i __a = 
(__A); \655_mm256_set_m128i(_mm_slli_si128(__a.v1, (__imm)), \656_mm_slli_si128(__a.v0, (__imm))); \657})658659#define _mm256_bslli_epi128(__A, __imm) _mm256_slli_si256(__A, __imm)660661static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))662_mm256_slli_epi16(__m256i __a, int __count) {663__m256i ret;664ret.v0 = _mm_slli_epi16(__a.v0, __count);665ret.v1 = _mm_slli_epi16(__a.v1, __count);666return ret;667}668669static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))670_mm256_sll_epi16(__m256i __a, __m128i __count) {671__m256i ret;672ret.v0 = _mm_sll_epi16(__a.v0, __count);673ret.v1 = _mm_sll_epi16(__a.v1, __count);674return ret;675}676677static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))678_mm256_slli_epi32(__m256i __a, int __count) {679__m256i ret;680ret.v0 = _mm_slli_epi32(__a.v0, __count);681ret.v1 = _mm_slli_epi32(__a.v1, __count);682return ret;683}684685static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))686_mm256_sll_epi32(__m256i __a, __m128i __count) {687__m256i ret;688ret.v0 = _mm_sll_epi32(__a.v0, __count);689ret.v1 = _mm_sll_epi32(__a.v1, __count);690return ret;691}692693static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))694_mm256_slli_epi64(__m256i __a, int __count) {695__m256i ret;696ret.v0 = _mm_slli_epi64(__a.v0, __count);697ret.v1 = _mm_slli_epi64(__a.v1, __count);698return ret;699}700701static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))702_mm256_sll_epi64(__m256i __a, __m128i __count) {703__m256i ret;704ret.v0 = _mm_sll_epi64(__a.v0, __count);705ret.v1 = _mm_sll_epi64(__a.v1, __count);706return ret;707}708709static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))710_mm256_srai_epi16(__m256i __a, int __count) {711__m256i ret;712ret.v0 = _mm_srai_epi16(__a.v0, __count);713ret.v1 = _mm_srai_epi16(__a.v1, __count);714return ret;715}716717static __inline__ __m256i __attribute__((__always_inline__, 
__nodebug__))718_mm256_sra_epi16(__m256i __a, __m128i __count) {719__m256i ret;720ret.v0 = _mm_sra_epi16(__a.v0, __count);721ret.v1 = _mm_sra_epi16(__a.v1, __count);722return ret;723}724725static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))726_mm256_srai_epi32(__m256i __a, int __count) {727__m256i ret;728ret.v0 = _mm_srai_epi32(__a.v0, __count);729ret.v1 = _mm_srai_epi32(__a.v1, __count);730return ret;731}732733static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))734_mm256_sra_epi32(__m256i __a, __m128i __count) {735__m256i ret;736ret.v0 = _mm_sra_epi32(__a.v0, __count);737ret.v1 = _mm_sra_epi32(__a.v1, __count);738return ret;739}740741#define _mm256_srli_si256(__A, __imm) \742__extension__({ \743__m256i __a = (__A); \744_mm256_set_m128i(_mm_srli_si128(__a.v1, (__imm)), \745_mm_srli_si128(__a.v0, (__imm))); \746})747748#define _mm256_bsrli_epi128(a, imm) _mm256_srli_si256(a, imm)749750static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))751_mm256_srli_epi16(__m256i __a, int __count) {752__m256i ret;753ret.v0 = _mm_srli_epi16(__a.v0, __count);754ret.v1 = _mm_srli_epi16(__a.v1, __count);755return ret;756}757758static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))759_mm256_srl_epi16(__m256i __a, __m128i __count) {760__m256i ret;761ret.v0 = _mm_srl_epi16(__a.v0, __count);762ret.v1 = _mm_srl_epi16(__a.v1, __count);763return ret;764}765766static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))767_mm256_srli_epi32(__m256i __a, int __count) {768__m256i ret;769ret.v0 = _mm_srli_epi32(__a.v0, __count);770ret.v1 = _mm_srli_epi32(__a.v1, __count);771return ret;772}773774static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))775_mm256_srl_epi32(__m256i __a, __m128i __count) {776__m256i ret;777ret.v0 = _mm_srl_epi32(__a.v0, __count);778ret.v1 = _mm_srl_epi32(__a.v1, __count);779return ret;780}781782static __inline__ __m256i __attribute__((__always_inline__, 
__nodebug__))783_mm256_srli_epi64(__m256i __a, int __count) {784__m256i ret;785ret.v0 = _mm_srli_epi64(__a.v0, __count);786ret.v1 = _mm_srli_epi64(__a.v1, __count);787return ret;788}789790static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))791_mm256_srl_epi64(__m256i __a, __m128i __count) {792__m256i ret;793ret.v0 = _mm_srl_epi64(__a.v0, __count);794ret.v1 = _mm_srl_epi64(__a.v1, __count);795return ret;796}797798static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))799_mm256_sub_epi8(__m256i __a, __m256i __b) {800__m256i ret;801ret.v0 = _mm_sub_epi8(__a.v0, __b.v0);802ret.v1 = _mm_sub_epi8(__a.v1, __b.v1);803return ret;804}805806static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))807_mm256_sub_epi16(__m256i __a, __m256i __b) {808__m256i ret;809ret.v0 = _mm_sub_epi16(__a.v0, __b.v0);810ret.v1 = _mm_sub_epi16(__a.v1, __b.v1);811return ret;812}813814static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))815_mm256_sub_epi32(__m256i __a, __m256i __b) {816__m256i ret;817ret.v0 = _mm_sub_epi32(__a.v0, __b.v0);818ret.v1 = _mm_sub_epi32(__a.v1, __b.v1);819return ret;820}821822static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))823_mm256_sub_epi64(__m256i __a, __m256i __b) {824__m256i ret;825ret.v0 = _mm_sub_epi64(__a.v0, __b.v0);826ret.v1 = _mm_sub_epi64(__a.v1, __b.v1);827return ret;828}829830static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))831_mm256_subs_epi8(__m256i __a, __m256i __b) {832__m256i ret;833ret.v0 = _mm_subs_epi8(__a.v0, __b.v0);834ret.v1 = _mm_subs_epi8(__a.v1, __b.v1);835return ret;836}837838static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))839_mm256_subs_epi16(__m256i __a, __m256i __b) {840__m256i ret;841ret.v0 = _mm_subs_epi16(__a.v0, __b.v0);842ret.v1 = _mm_subs_epi16(__a.v1, __b.v1);843return ret;844}845846static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))847_mm256_subs_epu8(__m256i 
__a, __m256i __b) {848__m256i ret;849ret.v0 = _mm_subs_epu8(__a.v0, __b.v0);850ret.v1 = _mm_subs_epu8(__a.v1, __b.v1);851return ret;852}853854static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))855_mm256_subs_epu16(__m256i __a, __m256i __b) {856__m256i ret;857ret.v0 = _mm_subs_epu16(__a.v0, __b.v0);858ret.v1 = _mm_subs_epu16(__a.v1, __b.v1);859return ret;860}861862static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))863_mm256_unpackhi_epi8(__m256i __a, __m256i __b) {864__m256i ret;865ret.v0 = _mm_unpackhi_epi8(__a.v0, __b.v0);866ret.v1 = _mm_unpackhi_epi8(__a.v1, __b.v1);867return ret;868}869870static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))871_mm256_unpackhi_epi16(__m256i __a, __m256i __b) {872__m256i ret;873ret.v0 = _mm_unpackhi_epi16(__a.v0, __b.v0);874ret.v1 = _mm_unpackhi_epi16(__a.v1, __b.v1);875return ret;876}877878static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))879_mm256_unpackhi_epi32(__m256i __a, __m256i __b) {880__m256i ret;881ret.v0 = _mm_unpackhi_epi32(__a.v0, __b.v0);882ret.v1 = _mm_unpackhi_epi32(__a.v1, __b.v1);883return ret;884}885886static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))887_mm256_unpackhi_epi64(__m256i __a, __m256i __b) {888__m256i ret;889ret.v0 = _mm_unpackhi_epi64(__a.v0, __b.v0);890ret.v1 = _mm_unpackhi_epi64(__a.v1, __b.v1);891return ret;892}893894static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))895_mm256_unpacklo_epi8(__m256i __a, __m256i __b) {896__m256i ret;897ret.v0 = _mm_unpacklo_epi8(__a.v0, __b.v0);898ret.v1 = _mm_unpacklo_epi8(__a.v1, __b.v1);899return ret;900}901902static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))903_mm256_unpacklo_epi16(__m256i __a, __m256i __b) {904__m256i ret;905ret.v0 = _mm_unpacklo_epi16(__a.v0, __b.v0);906ret.v1 = _mm_unpacklo_epi16(__a.v1, __b.v1);907return ret;908}909910static __inline__ __m256i __attribute__((__always_inline__, 
__nodebug__))911_mm256_unpacklo_epi32(__m256i __a, __m256i __b) {912__m256i ret;913ret.v0 = _mm_unpacklo_epi32(__a.v0, __b.v0);914ret.v1 = _mm_unpacklo_epi32(__a.v1, __b.v1);915return ret;916}917918static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))919_mm256_unpacklo_epi64(__m256i __a, __m256i __b) {920__m256i ret;921ret.v0 = _mm_unpacklo_epi64(__a.v0, __b.v0);922ret.v1 = _mm_unpacklo_epi64(__a.v1, __b.v1);923return ret;924}925926static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))927_mm256_xor_si256(__m256i __a, __m256i __b) {928__m256i ret;929ret.v0 = _mm_xor_si128(__a.v0, __b.v0);930ret.v1 = _mm_xor_si128(__a.v1, __b.v1);931return ret;932}933934static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))935_mm256_stream_load_si256(const void* __V) {936__m256i ret;937ret.v0 = _mm_stream_load_si128((const __m128i*)__V);938ret.v1 = _mm_stream_load_si128((const __m128i*)(((const uint8_t*)__V) + 16));939return ret;940}941942static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))943_mm_broadcastss_ps(__m128 __a) {944return (__m128)wasm_i32x4_shuffle(__a, __a, 0, 0, 0, 0);945}946947static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))948_mm_broadcastsd_pd(__m128d __a) {949return (__m128d)wasm_i64x2_shuffle(__a, __a, 0, 0);950}951952static __inline__ __m256 __attribute__((__always_inline__, __nodebug__))953_mm256_broadcastss_ps(__m128 __a) {954__m256 ret;955ret.v1 = ret.v0 = _mm_broadcastss_ps(__a);956return ret;957}958959static __inline__ __m256d __attribute__((__always_inline__, __nodebug__))960_mm256_broadcastsd_pd(__m128d __a) {961__m256d ret;962ret.v1 = ret.v0 = _mm_broadcastsd_pd(__a);963return ret;964}965966static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))967_mm256_broadcastsi128_si256(__m128i __a) {968__m256i ret;969ret.v1 = ret.v0 = __a;970return ret;971}972973#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)974975#define 
_mm_blend_epi32(__a, __b, __imm8) \976__extension__({ \977(__m128i) __builtin_shufflevector((__i32x4)(__m128i)(__a), \978(__i32x4)(__m128i)(__b), \979(((__imm8) & 0x01) ? 4 : 0), \980(((__imm8) & 0x02) ? 5 : 1), \981(((__imm8) & 0x04) ? 6 : 2), \982(((__imm8) & 0x08) ? 7 : 3)); \983})984985#define _mm256_blend_epi32(__A, __B, __imm) \986__extension__({ \987__m256i __a = (__A); \988__m256i __b = (__B); \989_mm256_set_m128i(_mm_blend_epi32(__a.v1, __b.v1, (__imm) >> 4), \990_mm_blend_epi32(__a.v0, __b.v0, (__imm))); \991})992993static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))994_mm_broadcastb_epi8(__m128i __a) {995return (__m128i)wasm_i8x16_shuffle(996__a, __a, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);997}998999static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1000_mm_broadcastw_epi16(__m128i __a) {1001return (__m128i)wasm_i16x8_shuffle(__a, __a, 0, 0, 0, 0, 0, 0, 0, 0);1002}10031004static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1005_mm_broadcastd_epi32(__m128i __a) {1006return (__m128i)wasm_i32x4_shuffle(__a, __a, 0, 0, 0, 0);1007}10081009static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1010_mm_broadcastq_epi64(__m128i __a) {1011return (__m128i)wasm_i64x2_shuffle(__a, __a, 0, 0);1012}10131014static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1015_mm256_broadcastb_epi8(__m128i __a) {1016__m256i ret;1017ret.v1 = ret.v0 = _mm_broadcastb_epi8(__a);1018return ret;1019}10201021static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1022_mm256_broadcastw_epi16(__m128i __a) {1023__m256i ret;1024ret.v1 = ret.v0 = _mm_broadcastw_epi16(__a);1025return ret;1026}10271028static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1029_mm256_broadcastd_epi32(__m128i __a) {1030__m256i ret;1031ret.v1 = ret.v0 = _mm_broadcastd_epi32(__a);1032return ret;1033}10341035static __inline__ __m256i __attribute__((__always_inline__, 
__nodebug__))1036_mm256_broadcastq_epi64(__m128i __a) {1037__m256i ret;1038ret.v1 = ret.v0 = _mm_broadcastq_epi64(__a);1039return ret;1040}10411042static __inline__ __m256i1043__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1044_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {1045__m256i ret;1046int index[8];1047int lane[8];1048for (int i = 0; i < 4; i++) {1049index[i] = ((__i32x4)__b.v0)[i] & 7;1050index[i + 4] = ((__i32x4)__b.v1)[i] & 7;1051}10521053for (int j = 0; j < 8; j++) {1054lane[j] = index[j] < 4 ? ((__i32x4)(__a.v0))[index[j]]1055: ((__i32x4)(__a.v1))[index[j] - 4];1056}10571058ret.v0 = (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1059ret.v1 = (__m128i)wasm_i32x4_make(lane[4], lane[5], lane[6], lane[7]);1060return ret;1061}10621063#define _mm256_permute4x64_pd(__A, __imm) \1064__extension__({ \1065__m256d __a = (__A); \1066_mm256_set_m128d( \1067(__m128d)wasm_i64x2_shuffle( \1068__a.v0, __a.v1, (((__imm) >> 4) & 3), (((__imm) >> 6) & 3)), \1069(__m128d)wasm_i64x2_shuffle( \1070__a.v0, __a.v1, ((__imm) & 3), (((__imm) >> 2) & 3))); \1071})10721073static __inline__ __m2561074__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1075_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {1076__m256 ret;1077int index[8];1078float lane[8];1079for (int i = 0; i < 4; i++) {1080index[i] = ((__i32x4)__b.v0)[i] & 7;1081index[i + 4] = ((__i32x4)__b.v1)[i] & 7;1082}1083for (int j = 0; j < 8; j++) {1084lane[j] = index[j] < 4 ? 
((__f32x4)(__a.v0))[index[j]]1085: ((__f32x4)(__a.v1))[index[j] - 4];1086}1087ret.v0 = (__m128)wasm_f32x4_make(lane[0], lane[1], lane[2], lane[3]);1088ret.v1 = (__m128)wasm_f32x4_make(lane[4], lane[5], lane[6], lane[7]);1089return ret;1090}10911092#define _mm256_permute4x64_epi64(__A, __imm) \1093__extension__({ \1094__m256i __a = (__A); \1095_mm256_set_m128i( \1096wasm_i64x2_shuffle( \1097__a.v0, __a.v1, (((__imm) >> 4) & 3), (((__imm) >> 6) & 3)), \1098wasm_i64x2_shuffle( \1099__a.v0, __a.v1, ((__imm) & 3), (((__imm) >> 2) & 3))); \1100})11011102static __inline__ __m256i1103__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1104_mm256_permute2x128_si256(__m256i __a, __m256i __b, const int imm8) {1105__m256i ret;1106ret.v0 = __avx_select4i(__a, __b, imm8);1107ret.v1 = __avx_select4i(__a, __b, imm8 >> 4);1108return ret;1109}11101111static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1112_mm256_extracti128_si256(__m256i __a, const int imm8) {1113if (imm8 & 0x1) {1114return __a.v1;1115} else {1116return __a.v0;1117}1118}11191120static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1121_mm256_inserti128_si256(__m256i __a, __m128i __b, const int imm8) {1122__m256i ret = __a;1123if (imm8 & 0x1) {1124ret.v1 = __b;1125} else {1126ret.v0 = __b;1127}1128return ret;1129}11301131static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1132_mm_maskload_epi32(int32_t const* __p, __m128i __m) {1133int32_t lane[4];1134for (size_t i = 0; i < 4; i++) {1135uint32_t mask = ((__i32x4)__m)[i];1136lane[i] = ((mask >> 31) & 0x1) ? __p[i] : 0;1137}1138return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1139}11401141static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))1142_mm_maskload_epi64(int64_t const* __p, __m128i __m) {1143int64_t lane[2];1144for (size_t i = 0; i < 2; i++) {1145uint64_t mask = ((__i64x2)__m)[i];1146lane[i] = ((mask >> 63) & 0x1) ? 
__p[i] : 0;1147}1148return (__m128i)wasm_i64x2_make(lane[0], lane[1]);1149}11501151static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1152_mm256_maskload_epi32(int const* __p, __m256i __m) {1153__m256i ret;1154ret.v0 = _mm_maskload_epi32(__p, __m.v0);1155ret.v1 = _mm_maskload_epi32(((int32_t*)__p) + 4, __m.v1);1156return ret;1157}11581159static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))1160_mm256_maskload_epi64(long long const* __p, __m256i __m) {1161__m256i ret;1162ret.v0 = _mm_maskload_epi64(__p, __m.v0);1163ret.v1 = _mm_maskload_epi64(((int64_t*)__p) + 2, __m.v1);1164return ret;1165}11661167static __inline__ void1168__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1169_mm_maskstore_epi32(int* __p, __m128i __m, __m128i __a) {1170if ((wasm_i32x4_extract_lane(__m, 0) & 0x80000000ull) != 0)1171__p[0] = wasm_i32x4_extract_lane((v128_t)__a, 0);1172if ((wasm_i32x4_extract_lane(__m, 1) & 0x80000000ull) != 0)1173__p[1] = wasm_i32x4_extract_lane((v128_t)__a, 1);1174if ((wasm_i32x4_extract_lane(__m, 2) & 0x80000000ull) != 0)1175__p[2] = wasm_i32x4_extract_lane((v128_t)__a, 2);1176if ((wasm_i32x4_extract_lane(__m, 3) & 0x80000000ull) != 0)1177__p[3] = wasm_i32x4_extract_lane((v128_t)__a, 3);1178}11791180static __inline__ void1181__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1182_mm_maskstore_epi64(long long* __p, __m128i __m, __m128i __a) {1183if ((wasm_i64x2_extract_lane(__m, 0) & 0x8000000000000000ull) != 0)1184__p[0] = wasm_i64x2_extract_lane((v128_t)__a, 0);1185if ((wasm_i64x2_extract_lane(__m, 1) & 0x8000000000000000ull) != 0)1186__p[1] = wasm_i64x2_extract_lane((v128_t)__a, 1);1187}11881189static __inline__ void __attribute__((__always_inline__, __nodebug__))1190_mm256_maskstore_epi32(int* __p, __m256i __m, __m256i __a) {1191_mm_maskstore_epi32(__p, __m.v0, __a.v0);1192_mm_maskstore_epi32(((int32_t*)__p) + 4, __m.v1, __a.v1);1193}11941195static __inline__ void __attribute__((__always_inline__, 
__nodebug__))1196_mm256_maskstore_epi64(long long* __p, __m256i __m, __m256i __a) {1197_mm_maskstore_epi64(__p, __m.v0, __a.v0);1198_mm_maskstore_epi64(((int64_t*)__p) + 2, __m.v1, __a.v1);1199}12001201static __inline__ __m128i1202__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1203_mm_sllv_epi32(__m128i __a, __m128i __count) {1204int32_t lane[4];1205for (size_t i = 0; i < 4; i++) {1206uint32_t shift = ((__u32x4)__count)[i];1207lane[i] = shift < 32 ? ((__u32x4)__a)[i] << shift : 0;1208}1209return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1210}12111212static __inline__ __m256i1213__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1214_mm256_sllv_epi32(__m256i __a, __m256i __count) {1215__m256i ret;1216ret.v0 = _mm_sllv_epi32(__a.v0, __count.v0);1217ret.v1 = _mm_sllv_epi32(__a.v1, __count.v1);1218return ret;1219}12201221static __inline__ __m128i1222__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1223_mm_sllv_epi64(__m128i __a, __m128i __count) {12241225int64_t lane[2];1226for (size_t i = 0; i < 2; i++) {1227uint64_t shift = (uint64_t)((__u64x2)__count)[i];1228lane[i] = shift < 64 ? ((__u64x2)__a)[i] << shift : 0;1229}1230return (__m128i)wasm_i64x2_make(lane[0], lane[1]);1231}12321233static __inline__ __m256i1234__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1235_mm256_sllv_epi64(__m256i __a, __m256i __count) {1236__m256i ret;1237ret.v0 = _mm_sllv_epi64(__a.v0, __count.v0);1238ret.v1 = _mm_sllv_epi64(__a.v1, __count.v1);1239return ret;1240}12411242static __inline__ __m128i1243__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1244_mm_srav_epi32(__m128i __a, __m128i __count) {1245int32_t lane[4];1246for (size_t i = 0; i < 4; i++) {1247uint32_t shift = ((__u32x4)__count)[i];1248shift = shift < 31 ? 
shift : 31;1249lane[i] = ((__i32x4)__a)[i] >> shift;1250}1251return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1252}12531254static __inline__ __m256i1255__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1256_mm256_srav_epi32(__m256i __a, __m256i __count) {1257__m256i ret;1258ret.v0 = _mm_srav_epi32(__a.v0, __count.v0);1259ret.v1 = _mm_srav_epi32(__a.v1, __count.v1);1260return ret;1261}12621263static __inline__ __m128i1264__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1265_mm_srlv_epi32(__m128i __a, __m128i __count) {1266int32_t lane[4];1267for (size_t i = 0; i < 4; i++) {1268uint32_t shift = ((__u32x4)__count)[i];1269lane[i] = shift < 32 ? ((__u32x4)__a)[i] >> shift : 0;1270}1271return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1272}12731274static __inline__ __m256i1275__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1276_mm256_srlv_epi32(__m256i __a, __m256i __count) {1277__m256i ret;1278ret.v0 = _mm_srlv_epi32(__a.v0, __count.v0);1279ret.v1 = _mm_srlv_epi32(__a.v1, __count.v1);1280return ret;1281}12821283static __inline__ __m128i1284__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1285_mm_srlv_epi64(__m128i __a, __m128i __count) {1286int64_t lane[2];1287for (size_t i = 0; i < 2; i++) {1288uint64_t shift = ((__u64x2)__count)[i];1289lane[i] = shift < 64 ? 
((__u64x2)__a)[i] >> shift : 0;1290}1291return (__m128i)wasm_i64x2_make(lane[0], lane[1]);1292}12931294static __inline__ __m256i1295__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1296_mm256_srlv_epi64(__m256i __a, __m256i __count) {1297__m256i ret;1298ret.v0 = _mm_srlv_epi64(__a.v0, __count.v0);1299ret.v1 = _mm_srlv_epi64(__a.v1, __count.v1);1300return ret;1301}13021303static __inline__ __m128d1304__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1305_mm_mask_i32gather_pd(__m128d src,1306const double* base_addr,1307__m128i vindex,1308__m128d mask,1309const int scale) {1310double lane[2];1311for (size_t i = 0; i < 2; i++) {1312if ((((__i64x2)mask)[i] >> 63) & 0x1) {1313double* addr =1314(double*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1315(uint64_t)((uint32_t)scale));1316lane[i] = *addr;1317} else {1318lane[i] = ((__f64x2)src)[i];1319}1320}1321return (__m128d)wasm_f64x2_make(lane[0], lane[1]);1322}13231324static __inline__ __m256d1325__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1326_mm256_mask_i32gather_pd(__m256d src,1327const double* base_addr,1328__m128i vindex,1329__m256d mask,1330const int scale) {1331__m256d ret;1332ret.v0 = _mm_mask_i32gather_pd(src.v0, base_addr, vindex, mask.v0, scale);1333__m128i vindex1 = (__m128i)wasm_i32x4_shuffle(vindex, vindex, 2, 3, 0, 1);1334ret.v1 = _mm_mask_i32gather_pd(src.v1, base_addr, vindex1, mask.v1, scale);1335return ret;1336}13371338static __inline__ __m128d1339__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1340_mm_mask_i64gather_pd(__m128d src,1341const double* base_addr,1342__m128i vindex,1343__m128d mask,1344const int scale) {1345double lane[2];1346for (size_t i = 0; i < 2; i++) {1347if ((((__i64x2)mask)[i] >> 63) & 0x1) {1348double* addr =1349(double*)((uint8_t*)base_addr +1350((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1351lane[i] = *addr;1352} else {1353lane[i] = ((__f64x2)src)[i];1354}1355}1356return (__m128d)wasm_f64x2_make(lane[0], 
lane[1]);1357}13581359static __inline__ __m256d1360__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1361_mm256_mask_i64gather_pd(__m256d src,1362const double* base_addr,1363__m256i vindex,1364__m256d mask,1365const int scale) {1366__m256d ret;1367ret.v0 = _mm_mask_i64gather_pd(src.v0, base_addr, vindex.v0, mask.v0, scale);1368ret.v1 = _mm_mask_i64gather_pd(src.v1, base_addr, vindex.v1, mask.v1, scale);1369return ret;1370}13711372static __inline__ __m1281373__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1374_mm_mask_i32gather_ps(__m128 src,1375const float* base_addr,1376__m128i vindex,1377__m128 mask,1378const int scale) {1379float lane[4];1380for (size_t i = 0; i < 4; i++) {1381if ((((__i32x4)mask)[i] >> 31) & 0x1) {1382float* addr =1383(float*)((uint8_t*)base_addr +1384(int64_t)(((__i32x4)vindex)[i]) * (uint64_t)((uint32_t)scale));1385lane[i] = *addr;1386} else {1387lane[i] = ((__f32x4)src)[i];1388}1389}1390return (__m128)wasm_f32x4_make(lane[0], lane[1], lane[2], lane[3]);1391}13921393static __inline__ __m2561394__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1395_mm256_mask_i32gather_ps(__m256 src,1396const float* base_addr,1397__m256i vindex,1398__m256 mask,1399const int scale) {1400__m256 ret;1401ret.v0 = _mm_mask_i32gather_ps(src.v0, base_addr, vindex.v0, mask.v0, scale);1402ret.v1 = _mm_mask_i32gather_ps(src.v1, base_addr, vindex.v1, mask.v1, scale);1403return ret;1404}14051406static __inline__ __m1281407__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1408_mm_mask_i64gather_ps(__m128 src,1409const float* base_addr,1410__m128i vindex,1411__m128 mask,1412const int scale) {1413float lane[2];1414for (size_t i = 0; i < 2; i++) {1415if ((((__i32x4)mask)[i] >> 31) & 0x1) {1416float* addr =1417(float*)((uint8_t*)base_addr +1418((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1419lane[i] = *addr;1420} else {1421lane[i] = ((__f32x4)src)[i];1422}1423}1424return (__m128)wasm_f32x4_make(lane[0], lane[1], 0, 
0);1425}14261427static __inline__ __m1281428__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1429_mm256_mask_i64gather_ps(__m128 src,1430const float* base_addr,1431__m256i vindex,1432__m128 mask,1433const int scale) {1434float lane[4];1435__m128i current_vindex;1436for (size_t i = 0; i < 4; i++) {1437current_vindex = i < 2 ? vindex.v0 : vindex.v1;1438if ((((__i32x4)mask)[i] >> 31) & 0x1) {1439float* addr =1440(float*)((uint8_t*)base_addr + ((__i64x2)current_vindex)[i & 1] *1441(uint64_t)((uint32_t)scale));1442lane[i] = *addr;1443} else {1444lane[i] = ((__f32x4)src)[i];1445}1446}1447return (__m128)wasm_f32x4_make(lane[0], lane[1], lane[2], lane[3]);1448}14491450static __inline__ __m128i1451__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1452_mm_mask_i32gather_epi32(__m128i src,1453const int* base_addr,1454__m128i vindex,1455__m128i mask,1456const int scale) {1457int32_t lane[4];1458for (size_t i = 0; i < 4; i++) {1459if ((((__i32x4)mask)[i] >> 31) & 0x1) {1460int32_t* addr =1461(int32_t*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1462(uint64_t)((uint32_t)scale));1463lane[i] = *addr;1464} else {1465lane[i] = ((__i32x4)src)[i];1466}1467}1468return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1469}14701471static __inline__ __m256i1472__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1473_mm256_mask_i32gather_epi32(__m256i src,1474const int* base_addr,1475__m256i vindex,1476__m256i mask,1477const int scale) {1478__m256i ret;1479ret.v0 =1480_mm_mask_i32gather_epi32(src.v0, base_addr, vindex.v0, mask.v0, scale);1481ret.v1 =1482_mm_mask_i32gather_epi32(src.v1, base_addr, vindex.v1, mask.v1, scale);1483return ret;1484}14851486static __inline__ __m128i1487__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1488_mm_mask_i64gather_epi32(__m128i src,1489const int* base_addr,1490__m128i vindex,1491__m128i mask,1492const int scale) {1493int32_t lane[2];1494for (size_t i = 0; i < 2; i++) {1495if 
((((__i32x4)mask)[i] >> 31) & 0x1) {1496int32_t* addr =1497(int32_t*)((uint8_t*)base_addr +1498((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1499lane[i] = *addr;1500} else {1501lane[i] = ((__i32x4)src)[i];1502}1503}1504return (__m128i)wasm_i32x4_make(lane[0], lane[1], 0, 0);1505}15061507static __inline__ __m128i1508__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1509_mm256_mask_i64gather_epi32(__m128i src,1510const int* base_addr,1511__m256i vindex,1512__m128i mask,1513const int scale) {1514int32_t lane[4];1515__m128i current_vindex;1516for (size_t i = 0; i < 4; i++) {1517current_vindex = i < 2 ? vindex.v0 : vindex.v1;1518if ((((__i32x4)mask)[i] >> 31) & 0x1) {1519int32_t* addr =1520(int32_t*)((uint8_t*)base_addr + ((__i64x2)current_vindex)[i & 1] *1521(uint64_t)((uint32_t)scale));1522lane[i] = *addr;1523} else {1524lane[i] = ((__i32x4)src)[i];1525}1526}1527return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);1528}15291530static __inline__ __m128i1531__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1532_mm_mask_i32gather_epi64(__m128i src,1533const long long* base_addr,1534__m128i vindex,1535__m128i mask,1536const int scale) {1537int64_t lane[2];1538for (size_t i = 0; i < 2; i++) {1539if ((((__i64x2)mask)[i] >> 63) & 0x1) {1540int64_t* addr =1541(int64_t*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1542(uint64_t)((uint32_t)scale));1543lane[i] = *addr;1544} else {1545lane[i] = ((__i64x2)src)[i];1546}1547}1548return (__m128i)wasm_i64x2_make(lane[0], lane[1]);1549}15501551static __inline__ __m256i1552__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1553_mm256_mask_i32gather_epi64(__m256i src,1554const long long* base_addr,1555__m128i vindex,1556__m256i mask,1557const int scale) {1558__m256i ret;1559ret.v0 = _mm_mask_i32gather_epi64(src.v0, base_addr, vindex, mask.v0, scale);1560__m128i vindex1 = (__m128i)wasm_i32x4_shuffle(vindex, vindex, 2, 3, 0, 1);1561ret.v1 = _mm_mask_i32gather_epi64(src.v1, 
base_addr, vindex1, mask.v1, scale);1562return ret;1563}15641565static __inline__ __m128i1566__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1567_mm_mask_i64gather_epi64(__m128i src,1568const long long* base_addr,1569__m128i vindex,1570__m128i mask,1571const int scale) {1572int64_t lane[2];1573for (size_t i = 0; i < 2; i++) {1574if ((((__i64x2)mask)[i] >> 63) & 0x1) {1575int64_t* addr =1576(int64_t*)((uint8_t*)base_addr +1577((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1578lane[i] = *addr;1579} else {1580lane[i] = ((__i64x2)src)[i];1581}1582}1583return (__m128i)wasm_i64x2_make(lane[0], lane[1]);1584}15851586static __inline__ __m256i1587__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1588_mm256_mask_i64gather_epi64(__m256i src,1589const long long* base_addr,1590__m256i vindex,1591__m256i mask,1592const int scale) {1593__m256i ret;1594ret.v0 =1595_mm_mask_i64gather_epi64(src.v0, base_addr, vindex.v0, mask.v0, scale);1596ret.v1 =1597_mm_mask_i64gather_epi64(src.v1, base_addr, vindex.v1, mask.v1, scale);1598return ret;1599}16001601static __inline__ __m128d1602__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1603_mm_i32gather_pd(const double* base_addr, __m128i vindex, const int scale) {1604double* lane[2];1605for (size_t i = 0; i < 2; i++) {1606lane[i] = (double*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1607(uint64_t)((uint32_t)scale));1608}1609return (__m128d)wasm_f64x2_make(*lane[0], *lane[1]);1610}16111612static __inline__ __m256d1613__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1614_mm256_i32gather_pd(const double* base_addr,1615__m128i vindex,1616const int scale) {1617__m256d ret;1618double* lane[4];1619for (size_t i = 0; i < 4; i++) {1620lane[i] = (double*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1621(uint64_t)((uint32_t)scale));1622}1623ret.v0 = (__m128d)wasm_f64x2_make(*lane[0], *lane[1]);1624ret.v1 = (__m128d)wasm_f64x2_make(*lane[2], *lane[3]);1625return 
ret;1626}16271628static __inline__ __m128d1629__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1630_mm_i64gather_pd(const double* base_addr, __m128i vindex, const int scale) {1631double* lane[2];1632for (size_t i = 0; i < 2; i++) {1633lane[i] = (double*)((uint8_t*)base_addr +1634((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1635}1636return (__m128d)wasm_f64x2_make(*lane[0], *lane[1]);1637}16381639static __inline__ __m256d1640__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1641_mm256_i64gather_pd(const double* base_addr,1642__m256i vindex,1643const int scale) {1644__m256d ret;1645ret.v0 = _mm_i64gather_pd(base_addr, vindex.v0, scale);1646ret.v1 = _mm_i64gather_pd(base_addr, vindex.v1, scale);1647return ret;1648}16491650static __inline__ __m1281651__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1652_mm_i32gather_ps(const float* base_addr, __m128i vindex, const int scale) {1653float* lane[4];1654for (size_t i = 0; i < 4; i++) {1655lane[i] = (float*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1656(uint64_t)((uint32_t)scale));1657}1658return (__m128)wasm_f32x4_make(*lane[0], *lane[1], *lane[2], *lane[3]);1659}16601661static __inline__ __m2561662__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1663_mm256_i32gather_ps(const float* base_addr, __m256i vindex, const int scale) {1664__m256 ret;1665ret.v0 = _mm_i32gather_ps(base_addr, vindex.v0, scale);1666ret.v1 = _mm_i32gather_ps(base_addr, vindex.v1, scale);1667return ret;1668}16691670static __inline__ __m1281671__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1672_mm_i64gather_ps(const float* base_addr, __m128i vindex, const int scale) {1673float* lane[2];1674for (size_t i = 0; i < 2; i++) {1675lane[i] = (float*)((uint8_t*)base_addr +1676((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1677}1678return (__m128)wasm_f32x4_make(*lane[0], *lane[1], 0, 0);1679}16801681static __inline__ __m1281682__attribute__((__always_inline__, __nodebug__, 
DIAGNOSE_SLOW))1683_mm256_i64gather_ps(const float* base_addr, __m256i vindex, const int scale) {1684float* lane[4];1685__m128i current_vindex;1686for (size_t i = 0; i < 4; i++) {1687current_vindex = i < 2 ? vindex.v0 : vindex.v1;1688lane[i] = (float*)((uint8_t*)base_addr + ((__i64x2)current_vindex)[i & 1] *1689(uint64_t)((uint32_t)scale));1690}1691return (__m128)wasm_f32x4_make(*lane[0], *lane[1], *lane[2], *lane[3]);1692}16931694static __inline__ __m128i1695__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1696_mm_i32gather_epi32(const int* base_addr, __m128i vindex, const int scale) {1697int32_t* lane[4];1698for (size_t i = 0; i < 4; i++) {1699lane[i] = (int32_t*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1700(uint64_t)((uint32_t)scale));1701}1702return (__m128i)wasm_i32x4_make(*lane[0], *lane[1], *lane[2], *lane[3]);1703}17041705static __inline__ __m256i1706__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1707_mm256_i32gather_epi32(const int* base_addr,1708__m256i vindex,1709const int scale) {1710__m256i ret;1711ret.v0 = _mm_i32gather_epi32(base_addr, vindex.v0, scale);1712ret.v1 = _mm_i32gather_epi32(base_addr, vindex.v1, scale);1713return ret;1714}17151716static __inline__ __m128i1717__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1718_mm_i64gather_epi32(const int* base_addr, __m128i vindex, const int scale) {1719int32_t* lane[2];1720for (size_t i = 0; i < 2; i++) {1721lane[i] = (int32_t*)((uint8_t*)base_addr +1722((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1723}1724return (__m128i)wasm_i32x4_make(*lane[0], *lane[1], 0, 0);1725}17261727static __inline__ __m128i1728__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1729_mm256_i64gather_epi32(const int* base_addr,1730__m256i vindex,1731const int scale) {1732int32_t* lane[4];1733__m128i current_vindex;1734for (size_t i = 0; i < 4; i++) {1735current_vindex = i < 2 ? 
vindex.v0 : vindex.v1;1736lane[i] =1737(int32_t*)((uint8_t*)base_addr + ((__i64x2)current_vindex)[i & 1] *1738(uint64_t)((uint32_t)scale));1739}1740return (__m128i)wasm_i32x4_make(*lane[0], *lane[1], *lane[2], *lane[3]);1741}17421743static __inline__ __m128i1744__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1745_mm_i32gather_epi64(const long long* base_addr,1746__m128i vindex,1747const int scale) {1748int64_t* lane[2];1749for (size_t i = 0; i < 2; i++) {1750lane[i] = (int64_t*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1751(uint64_t)((uint32_t)scale));1752}1753return (__m128i)wasm_i64x2_make(*lane[0], *lane[1]);1754}17551756static __inline__ __m256i1757__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1758_mm256_i32gather_epi64(const long long* base_addr,1759__m128i vindex,1760const int scale) {17611762__m256i ret;1763int64_t* lane[4];1764for (size_t i = 0; i < 4; i++) {1765lane[i] = (int64_t*)((uint8_t*)base_addr + (int64_t)(((__i32x4)vindex)[i]) *1766(uint64_t)((uint32_t)scale));1767}1768ret.v0 = (__m128i)wasm_i64x2_make(*lane[0], *lane[1]);1769ret.v1 = (__m128i)wasm_i64x2_make(*lane[2], *lane[3]);1770return ret;1771}17721773static __inline__ __m128i1774__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1775_mm_i64gather_epi64(const long long* base_addr,1776__m128i vindex,1777const int scale) {1778int64_t* lane[2];1779for (size_t i = 0; i < 2; i++) {1780lane[i] = (int64_t*)((uint8_t*)base_addr +1781((__i64x2)vindex)[i] * (uint64_t)((uint32_t)scale));1782}1783return (__m128i)wasm_i64x2_make(*lane[0], *lane[1]);1784}17851786static __inline__ __m256i1787__attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))1788_mm256_i64gather_epi64(const long long* base_addr,1789__m256i vindex,1790const int scale) {1791__m256i ret;1792ret.v0 = _mm_i64gather_epi64(base_addr, vindex.v0, scale);1793ret.v1 = _mm_i64gather_epi64(base_addr, vindex.v1, scale);1794return ret;1795}17961797#endif /* __emscripten_avx2intrin_h__ 
*/179817991800