Path: blob/main/contrib/arm-optimized-routines/math/test/mathbench.c
48254 views
/*1* Microbenchmark for math functions.2*3* Copyright (c) 2018-2024, Arm Limited.4* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception5*/67#if WANT_SVE_TESTS8# if __aarch64__ && __linux__9# ifdef __clang__10# pragma clang attribute push(__attribute__((target("sve"))), \11apply_to = any(function))12# else13# pragma GCC target("+sve")14# endif15# else16# error "SVE not supported - please disable WANT_SVE_TESTS"17# endif18#endif1920#undef _GNU_SOURCE21#define _GNU_SOURCE 122#include <stdint.h>23#include <stdlib.h>24#include <stdio.h>25#include <string.h>26#include <time.h>27#include <math.h>28#include "mathlib.h"2930/* Number of measurements, best result is reported. */31#define MEASURE 6032/* Array size. */33#define N 800034/* Iterations over the array. */35#define ITER 1253637static double *Trace;38static size_t trace_size;39static double A[N];40static float Af[N];41static long measurecount = MEASURE;42static long itercount = ITER;4344static double45dummy (double x)46{47return x;48}4950static float51dummyf (float x)52{53return x;54}55#if __aarch64__ && __linux__56__vpcs static float64x2_t57__vn_dummy (float64x2_t x)58{59return x;60}6162__vpcs static float32x4_t63__vn_dummyf (float32x4_t x)64{65return x;66}67#endif68#if WANT_SVE_TESTS69static svfloat64_t70__sv_dummy (svfloat64_t x, svbool_t pg)71{72return x;73}7475static svfloat32_t76__sv_dummyf (svfloat32_t x, svbool_t pg)77{78return x;79}8081#endif8283#include "test/mathbench_wrappers.h"8485static const struct fun86{87const char *name;88int prec;89int vec;90double lo;91double hi;92union93{94double (*d) (double);95float (*f) (float);96#if __aarch64__ && __linux__97__vpcs float64x2_t (*vnd) (float64x2_t);98__vpcs float32x4_t (*vnf) (float32x4_t);99#endif100#if WANT_SVE_TESTS101svfloat64_t (*svd) (svfloat64_t, svbool_t);102svfloat32_t (*svf) (svfloat32_t, svbool_t);103#endif104} fun;105} funtab[] = {106// clang-format off107#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},108#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},109#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},110#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},111#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},112#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},113D (dummy, 1.0, 2.0)114F (dummyf, 1.0, 2.0)115#if __aarch64__ && __linux__116VND (__vn_dummy, 1.0, 2.0)117VNF (__vn_dummyf, 1.0, 2.0)118#endif119#if WANT_SVE_TESTS120SVD (__sv_dummy, 1.0, 2.0)121SVF (__sv_dummyf, 1.0, 2.0)122#endif123#include "test/mathbench_funcs.h"124{0},125#undef F126#undef D127#undef VNF128#undef VND129#undef SVF130#undef SVD131// clang-format on132};133134static void135gen_linear (double lo, double hi)136{137for (int i = 0; i < N; i++)138A[i] = (lo * (N - i) + hi * i) / N;139}140141static void142genf_linear (double lo, double hi)143{144for (int i = 0; i < N; i++)145Af[i] = (float)(lo * (N - i) + hi * i) / N;146}147148static inline double149asdouble (uint64_t i)150{151union152{153uint64_t i;154double f;155} u = {i};156return u.f;157}158159static uint64_t seed = 0x0123456789abcdef;160161static double162frand (double lo, double hi)163{164seed = 6364136223846793005ULL * seed + 1;165return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0);166}167168static void169gen_rand (double lo, double hi)170{171for (int i = 0; i < N; i++)172A[i] = frand (lo, hi);173}174175static void176genf_rand (double lo, double hi)177{178for (int i = 0; i < N; i++)179Af[i] = (float)frand (lo, hi);180}181182static void183gen_trace (int index)184{185for (int i = 0; i < N; i++)186A[i] = Trace[index + i];187}188189static void190genf_trace (int index)191{192for (int i = 0; i < N; i++)193Af[i] = (float)Trace[index + i];194}195196static void197run_thruput (double f (double))198{199for (int i = 0; i < N; i++)200f (A[i]);201}202203static void204runf_thruput (float f (float))205{206for (int i = 0; i < N; i++)207f (Af[i]);208}209210volatile double zero = 0;211212static void213run_latency (double f (double))214{215double z = zero;216double prev = z;217for (int i = 0; i < N; i++)218prev = f (A[i] + prev * z);219}220221static void222runf_latency (float f (float))223{224float z = (float)zero;225float prev = z;226for (int i = 0; i < N; i++)227prev = f (Af[i] + prev * z);228}229230#if __aarch64__ && __linux__231static void232run_vn_thruput (__vpcs float64x2_t f (float64x2_t))233{234for (int i = 0; i < N; i += 2)235f (vld1q_f64 (A + i));236}237238static void239runf_vn_thruput (__vpcs float32x4_t f (float32x4_t))240{241for (int i = 0; i < N; i += 4)242f (vld1q_f32 (Af + i));243}244245static void246run_vn_latency (__vpcs float64x2_t f (float64x2_t))247{248volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };249uint64x2_t sel = vsel;250float64x2_t prev = vdupq_n_f64 (0);251for (int i = 0; i < N; i += 2)252prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i)));253}254255static void256runf_vn_latency (__vpcs float32x4_t f (float32x4_t))257{258volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };259uint32x4_t sel = vsel;260float32x4_t prev = vdupq_n_f32 (0);261for (int i = 0; i < N; i += 4)262prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i)));263}264#endif265266#if WANT_SVE_TESTS267static void268run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t))269{270for (int i = 0; i < N; i += svcntd ())271f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ());272}273274static void275runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t))276{277for (int i = 0; i < N; i += svcntw ())278f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ());279}280281static void282run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t))283{284volatile svbool_t vsel = svptrue_b64 ();285svbool_t sel = vsel;286svfloat64_t prev = svdup_f64 (0);287for (int i = 0; i < N; i += svcntd ())288prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev),289svptrue_b64 ());290}291292static void293runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t))294{295volatile svbool_t vsel = svptrue_b32 ();296svbool_t sel = vsel;297svfloat32_t prev = svdup_f32 (0);298for (int i = 0; i < N; i += svcntw ())299prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev),300svptrue_b32 ());301}302#endif303304static uint64_t305tic (void)306{307struct timespec ts;308#if defined(_MSC_VER)309if (!timespec_get (&ts, TIME_UTC))310#else311if (clock_gettime (CLOCK_REALTIME, &ts))312#endif313abort ();314return ts.tv_sec * 1000000000ULL + ts.tv_nsec;315}316317#define TIMEIT(run, f) do { \318dt = -1; \319run (f); /* Warm up. */ \320for (int j = 0; j < measurecount; j++) \321{ \322uint64_t t0 = tic (); \323for (int i = 0; i < itercount; i++) \324run (f); \325uint64_t t1 = tic (); \326if (t1 - t0 < dt) \327dt = t1 - t0; \328} \329} while (0)330331static void332bench1 (const struct fun *f, int type, double lo, double hi)333{334uint64_t dt = 0;335uint64_t ns100;336const char *s = type == 't' ? "rthruput" : "latency";337int vlen = 1;338339if (f->vec == 'n')340vlen = f->prec == 'd' ? 2 : 4;341#if WANT_SVE_TESTS342else if (f->vec == 's')343vlen = f->prec == 'd' ? svcntd () : svcntw ();344#endif345346if (f->prec == 'd' && type == 't' && f->vec == 0)347TIMEIT (run_thruput, f->fun.d);348else if (f->prec == 'd' && type == 'l' && f->vec == 0)349TIMEIT (run_latency, f->fun.d);350else if (f->prec == 'f' && type == 't' && f->vec == 0)351TIMEIT (runf_thruput, f->fun.f);352else if (f->prec == 'f' && type == 'l' && f->vec == 0)353TIMEIT (runf_latency, f->fun.f);354#if __aarch64__ && __linux__355else if (f->prec == 'd' && type == 't' && f->vec == 'n')356TIMEIT (run_vn_thruput, f->fun.vnd);357else if (f->prec == 'd' && type == 'l' && f->vec == 'n')358TIMEIT (run_vn_latency, f->fun.vnd);359else if (f->prec == 'f' && type == 't' && f->vec == 'n')360TIMEIT (runf_vn_thruput, f->fun.vnf);361else if (f->prec == 'f' && type == 'l' && f->vec == 'n')362TIMEIT (runf_vn_latency, f->fun.vnf);363#endif364#if WANT_SVE_TESTS365else if (f->prec == 'd' && type == 't' && f->vec == 's')366TIMEIT (run_sv_thruput, f->fun.svd);367else if (f->prec == 'd' && type == 'l' && f->vec == 's')368TIMEIT (run_sv_latency, f->fun.svd);369else if (f->prec == 'f' && type == 't' && f->vec == 's')370TIMEIT (runf_sv_thruput, f->fun.svf);371else if (f->prec == 'f' && type == 'l' && f->vec == 's')372TIMEIT (runf_sv_latency, f->fun.svf);373#endif374375if (type == 't')376{377ns100 = (100 * dt + itercount * N / 2) / (itercount * N);378printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",379f->name, s,380(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),381(unsigned long long) dt, lo, hi, vlen);382}383else if (type == 'l')384{385ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);386printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",387f->name, s,388(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),389(unsigned long long) dt, lo, hi, vlen);390}391fflush (stdout);392}393394static void395bench (const struct fun *f, double lo, double hi, int type, int gen)396{397if (f->prec == 'd' && gen == 'r')398gen_rand (lo, hi);399else if (f->prec == 'd' && gen == 'l')400gen_linear (lo, hi);401else if (f->prec == 'd' && gen == 't')402gen_trace (0);403else if (f->prec == 'f' && gen == 'r')404genf_rand (lo, hi);405else if (f->prec == 'f' && gen == 'l')406genf_linear (lo, hi);407else if (f->prec == 'f' && gen == 't')408genf_trace (0);409410if (gen == 't')411hi = trace_size / N;412413if (type == 'b' || type == 't')414bench1 (f, 't', lo, hi);415416if (type == 'b' || type == 'l')417bench1 (f, 'l', lo, hi);418419for (int i = N; i < trace_size; i += N)420{421if (f->prec == 'd')422gen_trace (i);423else424genf_trace (i);425426lo = i / N;427if (type == 'b' || type == 't')428bench1 (f, 't', lo, hi);429430if (type == 'b' || type == 'l')431bench1 (f, 'l', lo, hi);432}433}434435static void436readtrace (const char *name)437{438int n = 0;439FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");440if (!f)441{442printf ("openning \"%s\" failed: %m\n", name);443exit (1);444}445for (;;)446{447if (n >= trace_size)448{449trace_size += N;450Trace = realloc (Trace, trace_size * sizeof (Trace[0]));451if (Trace == NULL)452{453printf ("out of memory\n");454exit (1);455}456}457if (fscanf (f, "%lf", Trace + n) != 1)458break;459n++;460}461if (ferror (f) || n == 0)462{463printf ("reading \"%s\" failed: %m\n", name);464exit (1);465}466fclose (f);467if (n % N == 0)468trace_size = n;469for (int i = 0; n < trace_size; n++, i++)470Trace[n] = Trace[i];471}472473static void474usage (void)475{476printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "477"[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "478"[func2 ..]\n");479printf ("func:\n");480printf ("%7s [run all benchmarks]\n", "all");481for (const struct fun *f = funtab; f->name; f++)482printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);483exit (1);484}485486int487main (int argc, char *argv[])488{489int usergen = 0, gen = 'r', type = 'b', all = 0;490double lo = 0, hi = 0;491const char *tracefile = "-";492493argv++;494argc--;495for (;;)496{497if (argc <= 0)498usage ();499if (argv[0][0] != '-')500break;501else if (argc >= 3 && strcmp (argv[0], "-i") == 0)502{503usergen = 1;504lo = strtod (argv[1], 0);505hi = strtod (argv[2], 0);506argv += 3;507argc -= 3;508}509else if (argc >= 2 && strcmp (argv[0], "-m") == 0)510{511measurecount = strtol (argv[1], 0, 0);512argv += 2;513argc -= 2;514}515else if (argc >= 2 && strcmp (argv[0], "-c") == 0)516{517itercount = strtol (argv[1], 0, 0);518argv += 2;519argc -= 2;520}521else if (argc >= 2 && strcmp (argv[0], "-g") == 0)522{523gen = argv[1][0];524if (strchr ("rlt", gen) == 0)525usage ();526argv += 2;527argc -= 2;528}529else if (argc >= 2 && strcmp (argv[0], "-f") == 0)530{531gen = 't'; /* -f implies -g trace. */532tracefile = argv[1];533argv += 2;534argc -= 2;535}536else if (argc >= 2 && strcmp (argv[0], "-t") == 0)537{538type = argv[1][0];539if (strchr ("ltb", type) == 0)540usage ();541argv += 2;542argc -= 2;543}544else545usage ();546}547if (gen == 't')548{549readtrace (tracefile);550lo = hi = 0;551usergen = 1;552}553while (argc > 0)554{555int found = 0;556all = strcmp (argv[0], "all") == 0;557for (const struct fun *f = funtab; f->name; f++)558if (all || strcmp (argv[0], f->name) == 0)559{560found = 1;561if (!usergen)562{563lo = f->lo;564hi = f->hi;565}566bench (f, lo, hi, type, gen);567if (usergen && !all)568break;569}570if (!found)571printf ("unknown function: %s\n", argv[0]);572argv++;573argc--;574}575return 0;576}577578#if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__)579# pragma clang attribute pop580#endif581582583