Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
emscripten-core
GitHub Repository: emscripten-core/emscripten
Path: blob/main/test/benchmark/benchmark_sse.h
6174 views
1
#pragma once
2
3
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <inttypes.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#ifdef __EMSCRIPTEN__
#include <emscripten/emscripten.h>
#endif

#if defined(__unix__) && !defined(__EMSCRIPTEN__) // Native build without Emscripten.
#include <time.h>
#include <errno.h>
#include <string.h>
#endif
19
20
// Platform shims that map aligned_alloc(align, size) onto an allocator the
// toolchain provides.
#if defined(__APPLE__)
// The alignment argument is dropped here; plain malloc() is called instead.
#define aligned_alloc(align, size) malloc((size))
#endif

#if defined(WIN32)
#include <Windows.h>
// _aligned_malloc takes (size, alignment), i.e. the arguments are swapped.
#define aligned_alloc(align, size) _aligned_malloc((size), (align))
#endif
28
29
// Scalar horizontal max across four lanes.
30
float hmax(__m128 m) {
31
float f[4];
32
_mm_storeu_ps(f, m);
33
return fmax(fmax(f[0], f[1]), fmax(f[2], f[3]));
34
}
35
36
#include "tick.h"
37
38
const int N = 8*1024*1024;
39
40
tick_t scalarTotalTicks = 0;
41
tick_t simdTotalTicks = 0;
42
tick_t scalarTicks = 0;
43
const char *chartName = "";
44
#define SETCHART(x) chartName = (x);
45
46
// Opens a timed region: starts a `do {` scope and records the current tick in
// the local `start`. The scope is deliberately left open; it is closed by the
// `} while(0)` at the end of END() or ENDSCALAR().
#define START() \
  do { \
    tick_t start = tick();
49
50
// Set once the first result line has been printed, so that every later line
// is prefixed with a comma.
bool comma = false;

// Closes a region opened by START() for a SIMD measurement.
//   result - expression; "Error!" is printed when it compares unequal to 0.
//   name   - C string printed as the "category" field.
// Adds the elapsed ticks to simdTotalTicks (and the last scalarTicks to
// scalarTotalTicks), converts the elapsed ticks to nanoseconds per element
// (divided by N) and prints one JSON-like line. `scalarTime` is read here but
// is not declared in the visible part of this header, so it has to be in
// scope at the expansion site.
#define END(result, name) \
    tick_t end = tick(); \
    tick_t ticks = end - start; \
    scalarTotalTicks += scalarTicks; \
    simdTotalTicks += ticks; \
    double nsecs = (double)ticks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
    printf("%s{ \"chart\": \"%s\", \"category\": \"%s\", \"scalar\": %f, \"simd\": %f }\n", comma ? "," : "", chartName, name, scalarTime, nsecs); \
    comma = true; \
    printf("%s", (result) != 0 ? "Error!" : ""); \
  } while(0)
61
62
// Closes a region opened by START() for a scalar measurement.
//   result - expression; "Error!" is printed when it compares unequal to 0.
//   name   - accepted for symmetry with END() but not used in the expansion.
// Stores the elapsed ticks in scalarTicks and the per-element nanoseconds in
// `scalarTime`, which must be in scope at the expansion site (it is not
// declared in the visible part of this header).
#define ENDSCALAR(result, name) \
    tick_t end = tick(); \
    scalarTicks = end - start; \
    scalarTime = (double)scalarTicks * 1000.0 * 1000.0 * 1000.0 / ticks_per_sec() / N; \
    printf("%s", (result) != 0 ? "Error!" : ""); \
  } while(0)
68
69
// Writes the four float lanes of m to stderr, highest-indexed lane first.
void Print(__m128 m)
{
  float lanes[4];
  _mm_storeu_ps(lanes, m);
  fprintf(stderr, "[%g, %g, %g, %g]\n", lanes[3], lanes[2], lanes[1], lanes[0]);
}
75
76
// Evaluates to true in practice, but the answer is derived from time() so the
// compiler cannot fold it into a constant.
bool always_true() {
  const time_t now = time(NULL);
  return now != 0;
}
77
78
// Compiler-specific spellings of the "never inline" and "inline" markers.
#if defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#define INLINE __forceinline
#else
#define NOINLINE __attribute__((noinline))
#define INLINE __inline__
#endif
85
86
// Slightly awkward way to allocate so that compiler will definitely not see this memory area as compile-time optimizable:
87
int NOINLINE *alloc_int_buffer() { return always_true() ? (int*)aligned_alloc(16, (N+16)*sizeof(int)) : 0; }
88
float NOINLINE *alloc_float_buffer() { return always_true() ? (float*)aligned_alloc(16, (N+16)*sizeof(float)) : 0; }
89
double NOINLINE *alloc_double_buffer() { return always_true() ? (double*)aligned_alloc(16, (N+16)*sizeof(double)) : 0; }
90
91
template<typename T>
92
T checksum_dst(T *dst) {
93
if (always_true()) {
94
return 0.f;
95
} else {
96
T s = 0.f; for(int i = 0; i < N; ++i) s += dst[i];
97
return s;
98
}
99
}
100
101
// Bit-pattern reinterpretation helpers between floating-point values and
// same-sized unsigned integers. The bytes are copied with memcpy rather than
// read through a casted pointer, which would access the object through an
// incompatible type (a strict-aliasing violation).

// Returns the 32-bit pattern of f.
uint32_t fcastu(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof bits);
  return bits;
}

// Returns the 64-bit pattern of f.
uint64_t dcastu(double f) {
  uint64_t bits;
  memcpy(&bits, &f, sizeof bits);
  return bits;
}

// Returns the float whose bit pattern is t.
float ucastf(uint32_t t) {
  float value;
  memcpy(&value, &t, sizeof value);
  return value;
}

// Returns the double whose bit pattern is t.
double ucastd(uint64_t t) {
  double value;
  memcpy(&value, &t, sizeof value);
  return value;
}
105
106
// Times a loop that reads from src_flt with load_instr and writes to dst_flt
// with store_instr, advancing num_elems_stride elements per iteration.
// The destination pointer is cast to store_ptr_type before the offsets are
// added. src_flt and dst_flt must be in scope at the expansion site; msg is
// the category name passed on to END().
#define LOAD_STORE_F(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
  START(); \
    for(int i = 0; i < N; i += num_elems_stride) { \
      store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(src_flt+load_offset+i)); \
    } \
  END(checksum_dst(dst_flt), msg);
111
112
// Double-precision counterpart of the float load/store benchmark: reads from
// src_dbl with load_instr and writes to dst_dbl with store_instr, advancing
// num_elems_stride elements per iteration. The destination pointer is cast to
// store_ptr_type before the offsets are added. src_dbl and dst_dbl must be in
// scope at the expansion site.
#define LOAD_STORE_D(msg, load_instr, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
  START(); \
    for(int i = 0; i < N; i += num_elems_stride) { \
      store_instr((store_ptr_type)dst_dbl+store_offset+i, load_instr(src_dbl+load_offset+i)); \
    } \
  END(checksum_dst(dst_dbl), msg);
117
118
// Integer load/store benchmark: both the source (src_int) and destination
// (dst_int) addresses are cast to __m128i* after the offsets are applied.
// Advances num_elems_stride ints per iteration. src_int and dst_int must be in
// scope at the expansion site.
#define LOAD_STORE_I(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
  START(); \
    for(int i = 0; i < N; i += num_elems_stride) { \
      store_instr((__m128i*)(dst_int+store_offset+i), load_instr((__m128i*)(src_int+load_offset+i))); \
    } \
  END(checksum_dst(dst_int), msg);
123
124
// load M64*, store M128
// load_instr is called with two arguments: the vector `reg` and the source
// address cast to load_ptr_type. Its result is stored to dst_flt (cast to
// store_ptr_type before the offsets are added). src_flt and dst_flt must be in
// scope at the expansion site.
#define LOAD_STORE_M64(msg, reg, load_instr, load_ptr_type, load_offset, store_instr, store_ptr_type, store_offset, num_elems_stride) \
  START(); \
    for(int i = 0; i < N; i += num_elems_stride) { \
      store_instr((store_ptr_type)dst_flt+store_offset+i, load_instr(reg, (load_ptr_type)(src_flt+load_offset+i))); \
    } \
  END(checksum_dst(dst_flt), msg);
130
131
// Load/store benchmark whose destination address (dst_flt plus offsets) is
// cast to __m64*; the source is read from src_flt with load_instr. Advances
// num_elems_stride floats per iteration. src_flt and dst_flt must be in scope
// at the expansion site.
#define LOAD_STORE_64_F(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
  START(); \
    for(int i = 0; i < N; i += num_elems_stride) { \
      store_instr((__m64*)(dst_flt+store_offset+i), load_instr(src_flt+load_offset+i)); \
    } \
  END(checksum_dst(dst_flt), msg);
136
137
// Load/store benchmark whose destination address (dst_dbl plus offsets) is
// cast to __m64*; the source is read from src_dbl with load_instr. Advances
// num_elems_stride doubles per iteration. src_dbl and dst_dbl must be in scope
// at the expansion site.
#define LOAD_STORE_64_D(msg, load_instr, load_offset, store_instr, store_offset, num_elems_stride) \
  START(); \
    for(int i = 0; i < N; i += num_elems_stride) { \
      store_instr((__m64*)(dst_dbl+store_offset+i), load_instr(src_dbl+load_offset+i)); \
    } \
  END(checksum_dst(dst_dbl), msg);
142
143
// Times a loop that evaluates the expression set_instr and writes it to
// dst_flt+i with _mm_store_ps, four floats per iteration. set_instr is
// re-evaluated on every iteration and is expanded inside the loop, so it can
// see the loop index `i`. dst_flt must be in scope at the expansion site.
#define SET_STORE_F(msg, set_instr) \
  START(); \
    for(int i = 0; i < N; i += 4) { \
      _mm_store_ps(dst_flt+i, set_instr); \
    } \
  END(checksum_dst(dst_flt), msg);
148
149
// Times a loop that evaluates the expression set_instr and writes it to
// dst_dbl+i with _mm_store_pd. set_instr is re-evaluated on every iteration
// and is expanded inside the loop, so it can see the loop index `i`.
// dst_dbl must be in scope at the expansion site.
// NOTE(review): the loop steps by 4 elements although _mm_store_pd writes two
// doubles, so only every other pair of dst_dbl is written -- confirm that this
// stride is intended before changing it, since it affects the measured time.
#define SET_STORE_D(msg, set_instr) \
  START(); \
    for(int i = 0; i < N; i += 4) { \
      _mm_store_pd(dst_dbl+i, set_instr); \
    } \
  END(checksum_dst(dst_dbl), msg);
154
155
// Times N/4 chained applications of the unary float-vector operation instr,
// starting from op0; each iteration feeds the previous result back in. The
// final value is stored to the start of dst_flt, which must be in scope at the
// expansion site.
#define UNARYOP_F_F(msg, instr, op0) \
  START(); \
    __m128 o = op0; \
    for(int i = 0; i < N; i += 4) { \
      o = instr(o); \
    } \
    _mm_store_ps(dst_flt, o); \
  END(checksum_dst(dst_flt), msg);
162
163
// Times N/4 chained applications of the unary integer-vector operation instr,
// starting from op0; each iteration feeds the previous result back in. The
// final value is stored to the start of dst_int, which must be in scope at the
// expansion site.
// The accumulator is declared __m128i so that it matches the operand type of
// _mm_store_si128 and no implicit float-vector/integer-vector conversion is
// needed.
#define UNARYOP_I_I(msg, instr, op0) \
  START(); \
    __m128i o = op0; \
    for(int i = 0; i < N; i += 4) { \
      o = instr(o); \
    } \
    _mm_store_si128((__m128i*)dst_int, o); \
  END(checksum_dst(dst_int), msg);
170
171
// Times N/4 evaluations of the expression instr, adding each result to
// dst_int_scalar. instr is expanded inside the loop, so it can see the loop
// index `i`. dst_int_scalar must be in scope at the expansion site; its final
// value is also what END() checks against zero.
#define UNARYOP_i_F(msg, instr) \
  START(); \
    for(int i = 0; i < N; i += 4) { \
      dst_int_scalar += instr; \
    } \
  END(dst_int_scalar, msg);
176
177
// Times N/2 chained applications of the unary double-vector operation instr,
// starting from op0; each iteration feeds the previous result back in. The
// final value is stored to the start of dst_dbl, which must be in scope at the
// expansion site.
#define UNARYOP_D_D(msg, instr, op0) \
  START(); \
    __m128d o = op0; \
    for(int i = 0; i < N; i += 2) { \
      o = instr(o); \
    } \
    _mm_store_pd(dst_dbl, o); \
  END(checksum_dst(dst_dbl), msg);
184
185
// Times N/4 chained applications of the binary float-vector operation instr.
// o0 starts as op0 and is overwritten with instr(o0, o1) on every iteration;
// o1 stays fixed at op1. The final o0 is stored to the start of dst_flt, which
// must be in scope at the expansion site.
#define BINARYOP_F_FF(msg, instr, op0, op1) \
  START(); \
    __m128 o0 = op0; \
    __m128 o1 = op1; \
    for(int i = 0; i < N; i += 4) { \
      o0 = instr(o0, o1); \
    } \
    _mm_store_ps(dst_flt, o0); \
  END(checksum_dst(dst_flt), msg);
193
194
// Times N/4 chained applications of the binary integer-vector operation instr.
// o0 starts as op0 and is overwritten with instr(o0, o1) on every iteration;
// o1 stays fixed at op1. The final o0 is stored to the start of dst_int, which
// must be in scope at the expansion site.
// Both operands are declared __m128i so that they match the operand type of
// _mm_store_si128 and no implicit float-vector/integer-vector conversion is
// needed.
#define BINARYOP_I_II(msg, instr, op0, op1) \
  START(); \
    __m128i o0 = op0; \
    __m128i o1 = op1; \
    for(int i = 0; i < N; i += 4) { \
      o0 = instr(o0, o1); \
    } \
    _mm_store_si128((__m128i*)dst_int, o0); \
  END(checksum_dst(dst_int), msg);
202
203
// Times N/2 chained applications of the binary double-vector operation instr.
// o0 starts as op0 and is overwritten with instr(o0, o1) on every iteration;
// o1 stays fixed at op1. The final o0 is stored to the start of dst_dbl, which
// must be in scope at the expansion site.
#define BINARYOP_D_DD(msg, instr, op0, op1) \
  START(); \
    __m128d o0 = op0; \
    __m128d o1 = op1; \
    for(int i = 0; i < N; i += 2) { \
      o0 = instr(o0, o1); \
    } \
    _mm_store_pd(dst_dbl, o0); \
  END(checksum_dst(dst_dbl), msg);
211
212
// Two-argument maximum / minimum. Each argument may be evaluated twice, so do
// not pass expressions with side effects. When the comparison is false
// (including when either operand is NaN) the second argument is the result.
#define Max(a,b) ((a) >= (b) ? (a) : (b))
#define Min(a,b) ((a) <= (b) ? (a) : (b))
214
215
// Returns nonzero when __f is a NaN, decided from its bit pattern: shifting
// the sign bit out leaves exponent and mantissa in the top 31 bits, and that
// value exceeds 0xFF000000 only when the exponent is all ones and the
// mantissa is nonzero.
// The bits are copied out with memcpy rather than read through a casted
// pointer, which would access the float through an incompatible type.
static INLINE int Isnan(float __f) {
  uint32_t bits;
  memcpy(&bits, &__f, sizeof bits);
  return (bits << 1) > 0xFF000000u;
}
218
219