/* Source: GitHub repository freebsd/freebsd-src,
   path blob/main/contrib/arm-optimized-routines/math/test/mathbench.c.  */
/*
 * Microbenchmark for math functions.
 *
 * Copyright (c) 2018-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
7
8
#if WANT_SVE_TESTS
9
# if __aarch64__ && __linux__
10
# ifdef __clang__
11
# pragma clang attribute push(__attribute__((target("sve"))), \
12
apply_to = any(function))
13
# else
14
# pragma GCC target("+sve")
15
# endif
16
# else
17
# error "SVE not supported - please disable WANT_SVE_TESTS"
18
# endif
19
#endif
20
21
#undef _GNU_SOURCE
#define _GNU_SOURCE 1
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include "mathlib.h"
30
31
/* Default number of timed measurements per benchmark; the best one is
   reported.  */
#define MEASURE 60
/* Number of elements in each input array.  */
#define N 8000
/* Default number of passes over the input array per measurement.  */
#define ITER 125

/* Input arrays read by the benchmark loops (double and float variants).  */
static double A[N];
static float Af[N];

/* Trace samples filled in by readtrace, and how many entries are stored.  */
static double *Trace;
static size_t trace_size;

/* Effective measurement and iteration counts; main overrides them for the
   -m and -c options.  */
static long measurecount = MEASURE;
static long itercount = ITER;
44
45
/* Returns its argument unchanged.  Registered in funtab as the no-op
   double-precision entry "dummy".  */
static double
dummy (double x)
{
  double unchanged = x;
  return unchanged;
}
50
51
/* Returns its argument unchanged.  Registered in funtab as the no-op
   single-precision entry "dummyf".  */
static float
dummyf (float x)
{
  float unchanged = x;
  return unchanged;
}
56
#if __aarch64__ && __linux__
57
__vpcs static float64x2_t
58
__vn_dummy (float64x2_t x)
59
{
60
return x;
61
}
62
63
__vpcs static float32x4_t
64
__vn_dummyf (float32x4_t x)
65
{
66
return x;
67
}
68
#endif
69
#if WANT_SVE_TESTS
70
static svfloat64_t
71
__sv_dummy (svfloat64_t x, svbool_t pg)
72
{
73
return x;
74
}
75
76
static svfloat32_t
77
__sv_dummyf (svfloat32_t x, svbool_t pg)
78
{
79
return x;
80
}
81
82
#endif
83
84
#include "test/mathbench_wrappers.h"
85
86
static const struct fun
87
{
88
const char *name;
89
int prec;
90
int vec;
91
double lo;
92
double hi;
93
union
94
{
95
double (*d) (double);
96
float (*f) (float);
97
#if __aarch64__ && __linux__
98
__vpcs float64x2_t (*vnd) (float64x2_t);
99
__vpcs float32x4_t (*vnf) (float32x4_t);
100
#endif
101
#if WANT_SVE_TESTS
102
svfloat64_t (*svd) (svfloat64_t, svbool_t);
103
svfloat32_t (*svf) (svfloat32_t, svbool_t);
104
#endif
105
} fun;
106
} funtab[] = {
107
// clang-format off
108
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
109
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
110
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
111
#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
112
#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
113
#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
114
D (dummy, 1.0, 2.0)
115
F (dummyf, 1.0, 2.0)
116
#if __aarch64__ && __linux__
117
VND (__vn_dummy, 1.0, 2.0)
118
VNF (__vn_dummyf, 1.0, 2.0)
119
#endif
120
#if WANT_SVE_TESTS
121
SVD (__sv_dummy, 1.0, 2.0)
122
SVF (__sv_dummyf, 1.0, 2.0)
123
#endif
124
#include "test/mathbench_funcs.h"
125
{0},
126
#undef F
127
#undef D
128
#undef VNF
129
#undef VND
130
#undef SVF
131
#undef SVD
132
// clang-format on
133
};
134
135
static void
136
gen_linear (double lo, double hi)
137
{
138
for (int i = 0; i < N; i++)
139
A[i] = (lo * (N - i) + hi * i) / N;
140
}
141
142
static void
143
genf_linear (double lo, double hi)
144
{
145
for (int i = 0; i < N; i++)
146
Af[i] = (float)(lo * (N - i) + hi * i) / N;
147
}
148
149
/* Reinterpret the 64 bits of I as a double (bit copy, no numeric
   conversion).  */
static inline double
asdouble (uint64_t i)
{
  double d;
  memcpy (&d, &i, sizeof d);
  return d;
}
159
160
/* State of the pseudo-random generator used by frand.  */
static uint64_t seed = 0x0123456789abcdef;

/* Advance the generator state and return a pseudo-random double: a value in
   [0, 1) is scaled by (hi - lo) and offset by lo.  */
static double
frand (double lo, double hi)
{
  seed = 6364136223846793005ULL * seed + 1;

  /* The top 52 bits of the state form the mantissa of a double whose
     exponent field is 0x3ff, i.e. a value in [1, 2).  */
  uint64_t bits = (seed >> 12) | (0x3ffULL << 52);
  double unit = asdouble (bits) - 1.0;

  return lo + (hi - lo) * unit;
}
168
169
static void
170
gen_rand (double lo, double hi)
171
{
172
for (int i = 0; i < N; i++)
173
A[i] = frand (lo, hi);
174
}
175
176
static void
177
genf_rand (double lo, double hi)
178
{
179
for (int i = 0; i < N; i++)
180
Af[i] = (float)frand (lo, hi);
181
}
182
183
static void
184
gen_trace (int index)
185
{
186
for (int i = 0; i < N; i++)
187
A[i] = Trace[index + i];
188
}
189
190
static void
191
genf_trace (int index)
192
{
193
for (int i = 0; i < N; i++)
194
Af[i] = (float)Trace[index + i];
195
}
196
197
static void
198
run_thruput (double f (double))
199
{
200
for (int i = 0; i < N; i++)
201
f (A[i]);
202
}
203
204
static void
205
runf_thruput (float f (float))
206
{
207
for (int i = 0; i < N; i++)
208
f (Af[i]);
209
}
210
211
volatile double zero = 0;
212
213
static void
214
run_latency (double f (double))
215
{
216
double z = zero;
217
double prev = z;
218
for (int i = 0; i < N; i++)
219
prev = f (A[i] + prev * z);
220
}
221
222
static void
223
runf_latency (float f (float))
224
{
225
float z = (float)zero;
226
float prev = z;
227
for (int i = 0; i < N; i++)
228
prev = f (Af[i] + prev * z);
229
}
230
231
#if __aarch64__ && __linux__
232
static void
233
run_vn_thruput (__vpcs float64x2_t f (float64x2_t))
234
{
235
for (int i = 0; i < N; i += 2)
236
f (vld1q_f64 (A + i));
237
}
238
239
static void
240
runf_vn_thruput (__vpcs float32x4_t f (float32x4_t))
241
{
242
for (int i = 0; i < N; i += 4)
243
f (vld1q_f32 (Af + i));
244
}
245
246
static void
247
run_vn_latency (__vpcs float64x2_t f (float64x2_t))
248
{
249
volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
250
uint64x2_t sel = vsel;
251
float64x2_t prev = vdupq_n_f64 (0);
252
for (int i = 0; i < N; i += 2)
253
prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i)));
254
}
255
256
static void
257
runf_vn_latency (__vpcs float32x4_t f (float32x4_t))
258
{
259
volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
260
uint32x4_t sel = vsel;
261
float32x4_t prev = vdupq_n_f32 (0);
262
for (int i = 0; i < N; i += 4)
263
prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i)));
264
}
265
#endif
266
267
#if WANT_SVE_TESTS
268
static void
269
run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t))
270
{
271
for (int i = 0; i < N; i += svcntd ())
272
f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ());
273
}
274
275
static void
276
runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t))
277
{
278
for (int i = 0; i < N; i += svcntw ())
279
f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ());
280
}
281
282
static void
283
run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t))
284
{
285
volatile svbool_t vsel = svptrue_b64 ();
286
svbool_t sel = vsel;
287
svfloat64_t prev = svdup_f64 (0);
288
for (int i = 0; i < N; i += svcntd ())
289
prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev),
290
svptrue_b64 ());
291
}
292
293
static void
294
runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t))
295
{
296
volatile svbool_t vsel = svptrue_b32 ();
297
svbool_t sel = vsel;
298
svfloat32_t prev = svdup_f32 (0);
299
for (int i = 0; i < N; i += svcntw ())
300
prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev),
301
svptrue_b32 ());
302
}
303
#endif
304
305
static uint64_t
306
tic (void)
307
{
308
struct timespec ts;
309
#if defined(_MSC_VER)
310
if (!timespec_get (&ts, TIME_UTC))
311
#else
312
if (clock_gettime (CLOCK_REALTIME, &ts))
313
#endif
314
abort ();
315
return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
316
}
317
318
/* Time RUN (F) and store the best result in `dt`.

   The expansion site must provide a uint64_t variable named `dt`.  After one
   untimed warm-up call, `measurecount` measurements are taken; each one
   times `itercount` back-to-back calls of RUN (F) with tic, and the smallest
   elapsed time in nanoseconds is left in `dt`.  If no measurement is taken,
   `dt` keeps the sentinel UINT64_MAX.

   The loop counters are long to match the type of measurecount and
   itercount, so large counts cannot overflow them.  */
#define TIMEIT(run, f) do { \
  dt = UINT64_MAX; \
  run (f); /* Warm up. */ \
  for (long j = 0; j < measurecount; j++) \
    { \
      uint64_t t0 = tic (); \
      for (long i = 0; i < itercount; i++) \
	run (f); \
      uint64_t t1 = tic (); \
      if (t1 - t0 < dt) \
	dt = t1 - t0; \
    } \
} while (0)
331
332
static void
333
bench1 (const struct fun *f, int type, double lo, double hi)
334
{
335
uint64_t dt = 0;
336
uint64_t ns100;
337
const char *s = type == 't' ? "rthruput" : "latency";
338
int vlen = 1;
339
340
if (f->vec == 'n')
341
vlen = f->prec == 'd' ? 2 : 4;
342
#if WANT_SVE_TESTS
343
else if (f->vec == 's')
344
vlen = f->prec == 'd' ? svcntd () : svcntw ();
345
#endif
346
347
if (f->prec == 'd' && type == 't' && f->vec == 0)
348
TIMEIT (run_thruput, f->fun.d);
349
else if (f->prec == 'd' && type == 'l' && f->vec == 0)
350
TIMEIT (run_latency, f->fun.d);
351
else if (f->prec == 'f' && type == 't' && f->vec == 0)
352
TIMEIT (runf_thruput, f->fun.f);
353
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
354
TIMEIT (runf_latency, f->fun.f);
355
#if __aarch64__ && __linux__
356
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
357
TIMEIT (run_vn_thruput, f->fun.vnd);
358
else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
359
TIMEIT (run_vn_latency, f->fun.vnd);
360
else if (f->prec == 'f' && type == 't' && f->vec == 'n')
361
TIMEIT (runf_vn_thruput, f->fun.vnf);
362
else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
363
TIMEIT (runf_vn_latency, f->fun.vnf);
364
#endif
365
#if WANT_SVE_TESTS
366
else if (f->prec == 'd' && type == 't' && f->vec == 's')
367
TIMEIT (run_sv_thruput, f->fun.svd);
368
else if (f->prec == 'd' && type == 'l' && f->vec == 's')
369
TIMEIT (run_sv_latency, f->fun.svd);
370
else if (f->prec == 'f' && type == 't' && f->vec == 's')
371
TIMEIT (runf_sv_thruput, f->fun.svf);
372
else if (f->prec == 'f' && type == 'l' && f->vec == 's')
373
TIMEIT (runf_sv_latency, f->fun.svf);
374
#endif
375
376
if (type == 't')
377
{
378
ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
379
printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
380
f->name, s,
381
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
382
(unsigned long long) dt, lo, hi, vlen);
383
}
384
else if (type == 'l')
385
{
386
ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
387
printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
388
f->name, s,
389
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
390
(unsigned long long) dt, lo, hi, vlen);
391
}
392
fflush (stdout);
393
}
394
395
static void
396
bench (const struct fun *f, double lo, double hi, int type, int gen)
397
{
398
if (f->prec == 'd' && gen == 'r')
399
gen_rand (lo, hi);
400
else if (f->prec == 'd' && gen == 'l')
401
gen_linear (lo, hi);
402
else if (f->prec == 'd' && gen == 't')
403
gen_trace (0);
404
else if (f->prec == 'f' && gen == 'r')
405
genf_rand (lo, hi);
406
else if (f->prec == 'f' && gen == 'l')
407
genf_linear (lo, hi);
408
else if (f->prec == 'f' && gen == 't')
409
genf_trace (0);
410
411
if (gen == 't')
412
hi = trace_size / N;
413
414
if (type == 'b' || type == 't')
415
bench1 (f, 't', lo, hi);
416
417
if (type == 'b' || type == 'l')
418
bench1 (f, 'l', lo, hi);
419
420
for (int i = N; i < trace_size; i += N)
421
{
422
if (f->prec == 'd')
423
gen_trace (i);
424
else
425
genf_trace (i);
426
427
lo = i / N;
428
if (type == 'b' || type == 't')
429
bench1 (f, 't', lo, hi);
430
431
if (type == 'b' || type == 'l')
432
bench1 (f, 'l', lo, hi);
433
}
434
}
435
436
static void
437
readtrace (const char *name)
438
{
439
int n = 0;
440
FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
441
if (!f)
442
{
443
printf ("openning \"%s\" failed: %m\n", name);
444
exit (1);
445
}
446
for (;;)
447
{
448
if (n >= trace_size)
449
{
450
trace_size += N;
451
Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
452
if (Trace == NULL)
453
{
454
printf ("out of memory\n");
455
exit (1);
456
}
457
}
458
if (fscanf (f, "%lf", Trace + n) != 1)
459
break;
460
n++;
461
}
462
if (ferror (f) || n == 0)
463
{
464
printf ("reading \"%s\" failed: %m\n", name);
465
exit (1);
466
}
467
fclose (f);
468
if (n % N == 0)
469
trace_size = n;
470
for (int i = 0; n < trace_size; n++, i++)
471
Trace[n] = Trace[i];
472
}
473
474
static void
475
usage (void)
476
{
477
printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
478
"[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
479
"[func2 ..]\n");
480
printf ("func:\n");
481
printf ("%7s [run all benchmarks]\n", "all");
482
for (const struct fun *f = funtab; f->name; f++)
483
printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
484
exit (1);
485
}
486
487
int
488
main (int argc, char *argv[])
489
{
490
int usergen = 0, gen = 'r', type = 'b', all = 0;
491
double lo = 0, hi = 0;
492
const char *tracefile = "-";
493
494
argv++;
495
argc--;
496
for (;;)
497
{
498
if (argc <= 0)
499
usage ();
500
if (argv[0][0] != '-')
501
break;
502
else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
503
{
504
usergen = 1;
505
lo = strtod (argv[1], 0);
506
hi = strtod (argv[2], 0);
507
argv += 3;
508
argc -= 3;
509
}
510
else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
511
{
512
measurecount = strtol (argv[1], 0, 0);
513
argv += 2;
514
argc -= 2;
515
}
516
else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
517
{
518
itercount = strtol (argv[1], 0, 0);
519
argv += 2;
520
argc -= 2;
521
}
522
else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
523
{
524
gen = argv[1][0];
525
if (strchr ("rlt", gen) == 0)
526
usage ();
527
argv += 2;
528
argc -= 2;
529
}
530
else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
531
{
532
gen = 't'; /* -f implies -g trace. */
533
tracefile = argv[1];
534
argv += 2;
535
argc -= 2;
536
}
537
else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
538
{
539
type = argv[1][0];
540
if (strchr ("ltb", type) == 0)
541
usage ();
542
argv += 2;
543
argc -= 2;
544
}
545
else
546
usage ();
547
}
548
if (gen == 't')
549
{
550
readtrace (tracefile);
551
lo = hi = 0;
552
usergen = 1;
553
}
554
while (argc > 0)
555
{
556
int found = 0;
557
all = strcmp (argv[0], "all") == 0;
558
for (const struct fun *f = funtab; f->name; f++)
559
if (all || strcmp (argv[0], f->name) == 0)
560
{
561
found = 1;
562
if (!usergen)
563
{
564
lo = f->lo;
565
hi = f->hi;
566
}
567
bench (f, lo, hi, type, gen);
568
if (usergen && !all)
569
break;
570
}
571
if (!found)
572
printf ("unknown function: %s\n", argv[0]);
573
argv++;
574
argc--;
575
}
576
return 0;
577
}
578
579
#if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__)
580
# pragma clang attribute pop
581
#endif
582
583