Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
emscripten-core
GitHub Repository: emscripten-core/emscripten
Path: blob/main/test/benchmark/benchmark_sse2.cpp
6174 views
1
/*
2
* Copyright 2020 The Emscripten Authors. All rights reserved.
3
* Emscripten is available under two separate licenses, the MIT license and the
4
* University of Illinois/NCSA Open Source License. Both these licenses can be
5
* found in the LICENSE file.
6
*/
7
#include <emmintrin.h>
8
#include "benchmark_sse.h"
9
10
int main() {
11
printf ("{ \"workload\": %u, \"results\": [\n", N);
12
13
float *src_flt = alloc_float_buffer();
14
float *src2_flt = alloc_float_buffer();
15
float *dst_flt = alloc_float_buffer();
16
for(int i = 0; i < N; ++i) src_flt[i] = (float)(1.0 + (double)rand() / RAND_MAX);
17
for(int i = 0; i < N; ++i) src2_flt[i] = (float)(1.0 + (double)rand() / RAND_MAX);
18
19
double *src_dbl = alloc_double_buffer();
20
double *src2_dbl = alloc_double_buffer();
21
double *dst_dbl = alloc_double_buffer();
22
for(int i = 0; i < N; ++i) src_dbl[i] = 1.0 + (double)rand() / RAND_MAX;
23
for(int i = 0; i < N; ++i) src2_dbl[i] = 1.0 + (double)rand() / RAND_MAX;
24
25
int *src_int = alloc_int_buffer();
26
int *src2_int = alloc_int_buffer();
27
int *dst_int = alloc_int_buffer();
28
for(int i = 0; i < N; ++i) src_int[i] = rand();
29
for(int i = 0; i < N; ++i) src2_int[i] = rand();
30
31
float scalarTime = 0.f;
32
33
// Benchmarks start:
34
SETCHART("load");
35
START();
36
for(int i = 0; i < N; ++i)
37
dst_dbl[i] = src_dbl[i];
38
ENDSCALAR(checksum_dst(dst_dbl), "scalar");
39
40
LOAD_STORE_D("_mm_load_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
41
LOAD_STORE_D("_mm_load_pd1", _mm_load_pd1, 1, _mm_store_pd, double*, 0, 2);
42
LOAD_STORE_D("_mm_load_sd", _mm_load_sd, 1, _mm_store_pd, double*, 0, 2);
43
// _mm_load_si128
44
LOAD_STORE_D("_mm_load1_pd", _mm_load1_pd, 1, _mm_store_pd, double*, 0, 2);
45
46
__m128d tempReg = _mm_set_pd(1.0, 2.0);
47
LOAD_STORE_M64("_mm_loadh_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);
48
// _mm_loadl_epi64
49
LOAD_STORE_M64("_mm_loadl_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);
50
51
LOAD_STORE_D("_mm_loadr_pd", _mm_loadr_pd, 0, _mm_store_pd, double*, 0, 2);
52
LOAD_STORE_D("_mm_loadu_pd", _mm_loadu_pd, 1, _mm_store_pd, double*, 0, 2);
53
// _mm_loadu_si128
54
55
SETCHART("set");
56
/* _mm_set_epi16
57
_mm_set_epi32
58
_mm_set_epi64
59
_mm_set_epi64x
60
_mm_set_epi8 */
61
SET_STORE_D("_mm_set_pd", _mm_set_pd(src_dbl[i+2], src_dbl[i+0]));
62
//SET_STORE_D("_mm_set_pd1", _mm_set_pd1(src_dbl[i]));
63
SET_STORE_D("_mm_set_sd", _mm_set_sd(src_dbl[i]));
64
/* _mm_set1_epi16
65
_mm_set1_epi32
66
_mm_set1_epi64
67
_mm_set1_epi64x
68
_mm_set1_epi8 */
69
SET_STORE_D("_mm_set1_pd", _mm_set1_pd(src_dbl[i]));
70
/* _mm_setr_epi16
71
_mm_setr_epi32
72
_mm_setr_epi64
73
_mm_setr_epi8 */
74
SET_STORE_D("_mm_setr_pd", _mm_set_pd(src_dbl[i+2], src_dbl[i+0]));
75
SET_STORE_D("_mm_setzero_pd", _mm_setzero_pd());
76
// _mm_setzero_si128
77
78
SETCHART("move");
79
// _mm_move_epi64
80
SET_STORE_D("_mm_move_sd", _mm_move_sd(_mm_load_pd(src_dbl+i), _mm_load_pd(src2_dbl+i)));
81
82
SETCHART("store");
83
// _mm_maskmoveu_si128
84
LOAD_STORE_D("_mm_store_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
85
// LOAD_STORE_D("_mm_store_pd1", _mm_load_pd, 0, _mm_store_pd1, double*, 0);
86
LOAD_STORE_D("_mm_store_sd", _mm_load_pd, 0, _mm_store_sd, double*, 1, 2);
87
// _mm_store_si128
88
// _mm_store1_pd
89
LOAD_STORE_64_D("_mm_storeh_pi", _mm_load_pd, 0, _mm_storeh_pi, 1, 2);
90
// _mm_storel_epi64
91
LOAD_STORE_64_D("_mm_storel_pi", _mm_load_pd, 0, _mm_storel_pi, 1, 2);
92
LOAD_STORE_D("_mm_storer_pd", _mm_load_pd, 0, _mm_storer_pd, double*, 0, 2);
93
LOAD_STORE_D("_mm_storeu_pd", _mm_load_pd, 0, _mm_storeu_pd, double*, 1, 2);
94
// _mm_storeu_si128
95
LOAD_STORE_D("_mm_stream_pd", _mm_load_pd, 0, _mm_stream_pd, double*, 0, 2);
96
// _mm_stream_si128
97
// _mm_stream_si32
98
// _mm_stream_si64
99
100
SETCHART("arithmetic");
101
// _mm_add_epi16
102
// _mm_add_epi32
103
// _mm_add_epi64
104
// _mm_add_epi8
105
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] += src2_dbl[0]; dst_dbl[1] += src2_dbl[1]; dst_dbl[2] += src2_dbl[2]; dst_dbl[3] += src2_dbl[3]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar add");
106
BINARYOP_D_DD("_mm_add_pd", _mm_add_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
107
BINARYOP_D_DD("_mm_add_sd", _mm_add_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
108
// _mm_adds_epi16
109
// _mm_adds_epi8
110
// _mm_adds_epu16
111
// _mm_adds_epu8
112
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] /= src2_dbl[0]; dst_dbl[1] /= src2_dbl[1]; dst_dbl[2] /= src2_dbl[2]; dst_dbl[3] /= src2_dbl[3]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar div");
113
BINARYOP_D_DD("_mm_div_pd", _mm_div_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
114
BINARYOP_D_DD("_mm_div_sd", _mm_div_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
115
// _mm_madd_epi16
116
// _mm_mul_epu32
117
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] *= src2_dbl[0]; dst_dbl[1] *= src2_dbl[1]; dst_dbl[2] *= src2_dbl[2]; dst_dbl[3] *= src2_dbl[3]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar mul");
118
BINARYOP_D_DD("_mm_mul_pd", _mm_mul_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
119
BINARYOP_D_DD("_mm_mul_sd", _mm_mul_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
120
// _mm_mulhi_epi16
121
// _mm_mulhi_epu16
122
// _mm_mullo_epi16
123
// _mm_sad_epu8
124
// _mm_sub_epi16
125
// _mm_sub_epi32
126
// _mm_sub_epi64
127
// _mm_sub_epi8
128
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] -= src2_dbl[0]; dst_dbl[1] -= src2_dbl[1]; dst_dbl[2] -= src2_dbl[2]; dst_dbl[3] -= src2_dbl[3]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar sub");
129
BINARYOP_D_DD("_mm_sub_pd", _mm_sub_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
130
BINARYOP_D_DD("_mm_sub_sd", _mm_sub_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
131
// _mm_subs_epi16
132
// _mm_subs_epi8
133
// _mm_subs_epu16
134
// _mm_subs_epu8
135
136
SETCHART("roots");
137
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = sqrt(dst_dbl[0]); dst_dbl[1] = sqrt(dst_dbl[1]); dst_dbl[2] = sqrt(dst_dbl[2]); dst_dbl[3] = sqrt(dst_dbl[3]); } ENDSCALAR(checksum_dst(dst_dbl), "scalar sqrt");
138
UNARYOP_D_D("_mm_sqrt_pd", _mm_sqrt_pd, _mm_load_pd(src_dbl));
139
// UNARYOP_D_D("_mm_sqrt_sd", _mm_sqrt_sd, _mm_load_pd(src_dbl));
140
141
SETCHART("logical");
142
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = ucastd(dcastu(dst_dbl[0]) & dcastu(src2_dbl[0])); dst_dbl[1] = ucastd(dcastu(dst_dbl[1]) & dcastu(src2_dbl[1])); dst_dbl[2] = ucastd(dcastu(dst_dbl[2]) & dcastu(src2_dbl[2])); dst_dbl[3] = ucastd(dcastu(dst_dbl[3]) & dcastu(src2_dbl[3])); } ENDSCALAR(checksum_dst(dst_dbl), "scalar and");
143
BINARYOP_D_DD("_mm_and_pd", _mm_and_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
144
// _mm_and_si128
145
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = ucastd((~dcastu(dst_dbl[0])) & dcastu(src2_dbl[0])); dst_dbl[1] = ucastd((~dcastu(dst_dbl[1])) & dcastu(src2_dbl[1])); dst_dbl[2] = ucastd((~dcastu(dst_dbl[2])) & dcastu(src2_dbl[2])); dst_dbl[3] = ucastd((~dcastu(dst_dbl[3])) & dcastu(src2_dbl[3])); } ENDSCALAR(checksum_dst(dst_dbl), "scalar andnot");
146
BINARYOP_D_DD("_mm_andnot_pd", _mm_andnot_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
147
// _mm_andnot_si128
148
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = ucastd(dcastu(dst_dbl[0]) | dcastu(src2_dbl[0])); dst_dbl[1] = ucastd(dcastu(dst_dbl[1]) | dcastu(src2_dbl[1])); dst_dbl[2] = ucastd(dcastu(dst_dbl[2]) | dcastu(src2_dbl[2])); dst_dbl[3] = ucastd(dcastu(dst_dbl[3]) | dcastu(src2_dbl[3])); } ENDSCALAR(checksum_dst(dst_dbl), "scalar or");
149
BINARYOP_D_DD("_mm_or_pd", _mm_or_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
150
// _mm_or_si128
151
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = ucastd(dcastu(dst_dbl[0]) ^ dcastu(src2_dbl[0])); dst_dbl[1] = ucastd(dcastu(dst_dbl[1]) ^ dcastu(src2_dbl[1])); dst_dbl[2] = ucastd(dcastu(dst_dbl[2]) ^ dcastu(src2_dbl[2])); dst_dbl[3] = ucastd(dcastu(dst_dbl[3]) ^ dcastu(src2_dbl[3])); } ENDSCALAR(checksum_dst(dst_dbl), "scalar xor");
152
BINARYOP_D_DD("_mm_xor_pd", _mm_xor_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
153
// _mm_xor_si128
154
155
SETCHART("cmp");
156
// _mm_cmpeq_epi16
157
// _mm_cmpeq_epi32
158
// _mm_cmpeq_epi8
159
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (dst_dbl[0] == src2_dbl[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (dst_dbl[1] == src2_dbl[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (dst_dbl[2] == src2_dbl[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (dst_dbl[3] == src2_dbl[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmp==");
160
BINARYOP_D_DD("_mm_cmpeq_pd", _mm_cmpeq_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
161
BINARYOP_D_DD("_mm_cmpeq_sd", _mm_cmpeq_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
162
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (dst_dbl[0] >= src2_dbl[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (dst_dbl[1] >= src2_dbl[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (dst_dbl[2] >= src2_dbl[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (dst_dbl[3] >= src2_dbl[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmp>=");
163
BINARYOP_D_DD("_mm_cmpge_pd", _mm_cmpge_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
164
BINARYOP_D_DD("_mm_cmpge_sd", _mm_cmpge_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
165
// _mm_cmpgt_epi16
166
// _mm_cmpgt_epi32
167
// _mm_cmpgt_epi8
168
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (dst_dbl[0] > src2_dbl[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (dst_dbl[1] > src2_dbl[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (dst_dbl[2] > src2_dbl[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (dst_dbl[3] > src2_dbl[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmp>");
169
BINARYOP_D_DD("_mm_cmpgt_pd", _mm_cmpgt_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
170
BINARYOP_D_DD("_mm_cmpgt_sd", _mm_cmpgt_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
171
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (dst_dbl[0] <= src2_dbl[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (dst_dbl[1] <= src2_dbl[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (dst_dbl[2] <= src2_dbl[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (dst_dbl[3] <= src2_dbl[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmp<=");
172
BINARYOP_D_DD("_mm_cmple_pd", _mm_cmple_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
173
BINARYOP_D_DD("_mm_cmple_sd", _mm_cmple_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
174
// _mm_cmplt_epi16
175
// _mm_cmplt_epi32
176
// _mm_cmplt_epi8
177
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (dst_dbl[0] < src2_dbl[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (dst_dbl[1] < src2_dbl[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (dst_dbl[2] < src2_dbl[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (dst_dbl[3] < src2_dbl[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmp<");
178
BINARYOP_D_DD("_mm_cmplt_pd", _mm_cmplt_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
179
BINARYOP_D_DD("_mm_cmplt_sd", _mm_cmplt_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
180
181
/*_mm_cmpneq_pd
182
_mm_cmpneq_sd
183
_mm_cmpnge_pd
184
_mm_cmpnge_sd
185
_mm_cmpngt_pd
186
_mm_cmpngt_sd
187
_mm_cmpnle_pd
188
_mm_cmpnle_sd
189
_mm_cmpnlt_pd
190
_mm_cmpnlt_sd*/
191
192
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (!Isnan(dst_dbl[0]) && !Isnan(src2_dbl[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (!Isnan(dst_dbl[1]) && !Isnan(src2_dbl[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (!Isnan(dst_dbl[2]) && !Isnan(src2_dbl[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (!Isnan(dst_dbl[3]) && !Isnan(src2_dbl[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmpord");
193
BINARYOP_D_DD("_mm_cmpord_pd", _mm_cmpord_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
194
BINARYOP_D_DD("_mm_cmpord_sd", _mm_cmpord_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
195
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = (Isnan(dst_dbl[0]) || Isnan(src2_dbl[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[1] = (Isnan(dst_dbl[1]) || Isnan(src2_dbl[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[2] = (Isnan(dst_dbl[2]) || Isnan(src2_dbl[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst_dbl[3] = (Isnan(dst_dbl[3]) || Isnan(src2_dbl[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst_dbl), "scalar cmpunord");
196
BINARYOP_D_DD("_mm_cmpunord_pd", _mm_cmpunord_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
197
BINARYOP_D_DD("_mm_cmpunord_sd", _mm_cmpunord_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
198
199
SETCHART("max");
200
// _mm_max_epi16
201
// _mm_max_epu8
202
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = Max(dst_dbl[0], src2_dbl[0]); dst_dbl[1] = Max(dst_dbl[1], src2_dbl[1]); dst_dbl[2] = Max(dst_dbl[2], src2_dbl[2]); dst_dbl[3] = Max(dst_dbl[3], src2_dbl[3]); } ENDSCALAR(checksum_dst(dst_dbl), "scalar max");
203
BINARYOP_D_DD("_mm_max_pd", _mm_max_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
204
BINARYOP_D_DD("_mm_max_sd", _mm_max_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
205
// _mm_min_epi16
206
// _mm_min_epu8
207
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = Min(dst_dbl[0], src2_dbl[0]); dst_dbl[1] = Min(dst_dbl[1], src2_dbl[1]); dst_dbl[2] = Min(dst_dbl[2], src2_dbl[2]); dst_dbl[3] = Min(dst_dbl[3], src2_dbl[3]); } ENDSCALAR(checksum_dst(dst_dbl), "scalar min");
208
BINARYOP_D_DD("_mm_min_pd", _mm_min_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
209
BINARYOP_D_DD("_mm_min_sd", _mm_min_sd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
210
211
SETCHART("shuffle");
212
// _mm_extract_epi16
213
// _mm_insert_epi16
214
// _mm_shuffle_epi32
215
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[3] = dst_dbl[1]; dst_dbl[2] = dst_dbl[0]; dst_dbl[1] = src2_dbl[3]; dst_dbl[0] = src2_dbl[2]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar shuffle");
216
// BINARYOP_D_DD("_mm_shuffle_pd", _mm_shuffle_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
217
START();
218
__m128 o0 = _mm_load_pd(src_dbl);
219
__m128 o1 = _mm_load_pd(src2_dbl);
220
for(int i = 0; i < N; i += 4)
221
o0 = _mm_shuffle_pd(o0, o1, _MM_SHUFFLE2(1, 0));
222
_mm_store_pd(dst_dbl, o0);
223
END(checksum_dst(dst_dbl), "_mm_shuffle_pd");
224
225
// _mm_shufflehi_epi16
226
// _mm_shufflelo_epi16
227
// _mm_unpackhi_epi16
228
// _mm_unpackhi_epi32
229
// _mm_unpackhi_epi64
230
// _mm_unpackhi_epi8
231
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[0] = dst_dbl[2]; dst_dbl[1] = src2_dbl[2]; dst_dbl[2] = dst_dbl[3]; dst_dbl[3] = src2_dbl[3]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar unpackhi_pd");
232
BINARYOP_D_DD("_mm_unpackhi_pd", _mm_unpackhi_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
233
// _mm_unpacklo_epi16
234
// _mm_unpacklo_epi32
235
// _mm_unpacklo_epi64
236
// _mm_unpacklo_epi8
237
START(); dst_dbl[0] = src_dbl[0]; dst_dbl[1] = src_dbl[1]; dst_dbl[2] = src_dbl[2]; dst_dbl[3] = src_dbl[3]; for(int i = 0; i < N; ++i) { dst_dbl[2] = dst_dbl[1]; dst_dbl[1] = dst_dbl[0]; dst_dbl[0] = src2_dbl[0]; dst_dbl[3] = src2_dbl[1]; } ENDSCALAR(checksum_dst(dst_dbl), "scalar unpacklo_pd");
238
BINARYOP_D_DD("_mm_unpacklo_pd", _mm_unpacklo_pd, _mm_load_pd(src_dbl), _mm_load_pd(src2_dbl));
239
240
// Benchmarks end:
241
printf("]}\n");
242
}
243
244