Path: blob/main/cddl/contrib/opensolaris/cmd/lockstat/lockstat.c
39488 views
/*1* CDDL HEADER START2*3* The contents of this file are subject to the terms of the4* Common Development and Distribution License (the "License").5* You may not use this file except in compliance with the License.6*7* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE8* or http://www.opensolaris.org/os/licensing.9* See the License for the specific language governing permissions10* and limitations under the License.11*12* When distributing Covered Code, include this CDDL HEADER in each13* file and include the License file at usr/src/OPENSOLARIS.LICENSE.14* If applicable, add the following below this CDDL HEADER, with the15* fields enclosed by brackets "[]" replaced with your own identifying16* information: Portions Copyright [yyyy] [name of copyright owner]17*18* CDDL HEADER END19*/20/*21* Copyright 2008 Sun Microsystems, Inc. All rights reserved.22* Use is subject to license terms.23*/2425#pragma ident "%Z%%M% %I% %E% SMI"2627#include <stdio.h>28#include <stddef.h>29#include <stdlib.h>30#include <stdarg.h>31#include <string.h>32#include <strings.h>33#include <ctype.h>34#include <fcntl.h>35#include <unistd.h>36#include <errno.h>37#include <limits.h>38#include <sys/types.h>39#include <sys/modctl.h>40#include <sys/stat.h>41#include <sys/wait.h>42#include <dtrace.h>43#include <sys/lockstat.h>44#include <alloca.h>45#include <signal.h>46#include <assert.h>4748#ifdef illumos49#define GETOPT_EOF EOF50#else51#include <sys/time.h>52#include <sys/resource.h>5354#define mergesort(a, b, c, d) lsmergesort(a, b, c, d)55#define GETOPT_EOF (-1)5657typedef uintptr_t pc_t;58#endif5960#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"6162#define LS_MAX_STACK_DEPTH 5063#define LS_MAX_EVENTS 646465typedef struct lsrec {66struct lsrec *ls_next; /* next in hash chain */67#ifdef illumos68uintptr_t ls_lock; /* lock address */69#else70char *ls_lock; /* lock name */71#endif72uintptr_t ls_caller; /* caller address */73uint32_t ls_count; /* cumulative event count */74uint32_t ls_event; /* type of event */75uintptr_t ls_refcnt; /* cumulative reference count */76uint64_t ls_time; /* cumulative event duration */77uint32_t ls_hist[64]; /* log2(duration) histogram */78uintptr_t ls_stack[LS_MAX_STACK_DEPTH];79} lsrec_t;8081typedef struct lsdata {82struct lsrec *lsd_next; /* next available */83int lsd_count; /* number of records */84} lsdata_t;8586/*87* Definitions for the types of experiments which can be run. They are88* listed in increasing order of memory cost and processing time cost.89* The numerical value of each type is the number of bytes needed per record.90*/91#define LS_BASIC offsetof(lsrec_t, ls_time)92#define LS_TIME offsetof(lsrec_t, ls_hist[0])93#define LS_HIST offsetof(lsrec_t, ls_stack[0])94#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth])9596static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);97static void report_trace(FILE *, lsrec_t **);9899extern int symtab_init(void);100extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);101extern uintptr_t sym_to_addr(char *name);102extern size_t sym_size(char *name);103extern char *strtok_r(char *, const char *, char **);104105#define DEFAULT_NRECS 10000106#define DEFAULT_HZ 97107#define MAX_HZ 1000108#define MIN_AGGSIZE (16 * 1024)109#define MAX_AGGSIZE (32 * 1024 * 1024)110111static int g_stkdepth;112static int g_topn = INT_MAX;113static hrtime_t g_elapsed;114static int g_rates = 0;115static int g_pflag = 0;116static int g_Pflag = 0;117static int g_wflag = 0;118static int g_Wflag = 0;119static int g_cflag = 0;120static int g_kflag = 0;121static int g_gflag = 0;122static int g_Vflag = 0;123static int g_tracing = 0;124static size_t g_recsize;125static size_t g_nrecs;126static int g_nrecs_used;127static uchar_t g_enabled[LS_MAX_EVENTS];128static hrtime_t g_min_duration[LS_MAX_EVENTS];129static dtrace_hdl_t *g_dtp;130static char *g_predicate;131static char *g_ipredicate;132static char *g_prog;133static int g_proglen;134static int g_dropped;135136typedef struct ls_event_info {137char ev_type;138char ev_lhdr[20];139char ev_desc[80];140char ev_units[10];141char ev_name[DTRACE_NAMELEN];142char *ev_predicate;143char *ev_acquire;144} ls_event_info_t;145146static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {147{ 'C', "Lock", "Adaptive mutex spin", "nsec",148"lockstat:::adaptive-spin" },149{ 'C', "Lock", "Adaptive mutex block", "nsec",150"lockstat:::adaptive-block" },151{ 'C', "Lock", "Spin lock spin", "nsec",152"lockstat:::spin-spin" },153{ 'C', "Lock", "Thread lock spin", "nsec",154"lockstat:::thread-spin" },155{ 'C', "Lock", "R/W writer blocked by writer", "nsec",156"lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },157{ 'C', "Lock", "R/W writer blocked by readers", "nsec",158"lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },159{ 'C', "Lock", "R/W reader blocked by writer", "nsec",160"lockstat:::rw-block", "arg2 == 1 && arg3 == 1" },161{ 'C', "Lock", "R/W reader blocked by write wanted", "nsec",162"lockstat:::rw-block", "arg2 == 1 && arg3 == 0 && arg4" },163{ 'C', "Lock", "R/W writer spin on writer", "nsec",164"lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" },165{ 'C', "Lock", "R/W writer spin on readers", "nsec",166"lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" },167{ 'C', "Lock", "R/W reader spin on writer", "nsec",168"lockstat:::rw-spin", "arg2 == 1 && arg3 == 1" },169{ 'C', "Lock", "R/W reader spin on write wanted", "nsec",170"lockstat:::rw-spin", "arg2 == 1 && arg3 == 0 && arg4" },171{ 'C', "Lock", "SX exclusive block", "nsec",172"lockstat:::sx-block", "arg2 == 0" },173{ 'C', "Lock", "SX shared block", "nsec",174"lockstat:::sx-block", "arg2 == 1" },175{ 'C', "Lock", "SX exclusive spin", "nsec",176"lockstat:::sx-spin", "arg2 == 0" },177{ 'C', "Lock", "SX shared spin", "nsec",178"lockstat:::sx-spin", "arg2 == 1" },179{ 'C', "Lock", "lockmgr writer blocked by writer", "nsec",180"lockstat:::lockmgr-block", "arg2 == 0 && arg3 == 1" },181{ 'C', "Lock", "lockmgr writer blocked by readers", "nsec",182"lockstat:::lockmgr-block", "arg2 == 0 && arg3 == 0 && arg4" },183{ 'C', "Lock", "lockmgr reader blocked by writer", "nsec",184"lockstat:::lockmgr-block", "arg2 == 1 && arg3 == 1" },185{ 'C', "Lock", "lockmgr reader blocked by write wanted", "nsec",186"lockstat:::lockmgr-block", "arg2 == 1 && arg3 == 0 && arg4" },187{ 'C', "Lock", "Unknown event (type 20)", "units" },188{ 'C', "Lock", "Unknown event (type 21)", "units" },189{ 'C', "Lock", "Unknown event (type 22)", "units" },190{ 'C', "Lock", "Unknown event (type 23)", "units" },191{ 'C', "Lock", "Unknown event (type 24)", "units" },192{ 'C', "Lock", "Unknown event (type 25)", "units" },193{ 'C', "Lock", "Unknown event (type 26)", "units" },194{ 'C', "Lock", "Unknown event (type 27)", "units" },195{ 'C', "Lock", "Unknown event (type 28)", "units" },196{ 'C', "Lock", "Unknown event (type 29)", "units" },197{ 'C', "Lock", "Unknown event (type 30)", "units" },198{ 'C', "Lock", "Unknown event (type 31)", "units" },199{ 'H', "Lock", "Adaptive mutex hold", "nsec",200"lockstat:::adaptive-release", NULL,201"lockstat:::adaptive-acquire" },202{ 'H', "Lock", "Spin lock hold", "nsec",203"lockstat:::spin-release", NULL,204"lockstat:::spin-acquire" },205{ 'H', "Lock", "R/W writer hold", "nsec",206"lockstat:::rw-release", "arg1 == 0",207"lockstat:::rw-acquire" },208{ 'H', "Lock", "R/W reader hold", "nsec",209"lockstat:::rw-release", "arg1 == 1",210"lockstat:::rw-acquire" },211{ 'H', "Lock", "SX shared hold", "nsec",212"lockstat:::sx-release", "arg1 == 1",213"lockstat:::sx-acquire" },214{ 'H', "Lock", "SX exclusive hold", "nsec",215"lockstat:::sx-release", "arg1 == 0",216"lockstat:::sx-acquire" },217{ 'H', "Lock", "lockmgr shared hold", "nsec",218"lockstat:::lockmgr-release", "arg1 == 1",219"lockstat:::lockmgr-acquire" },220{ 'H', "Lock", "lockmgr exclusive hold", "nsec",221"lockstat:::lockmgr-release,lockstat:::lockmgr-disown", "arg1 == 0",222"lockstat:::lockmgr-acquire" },223{ 'H', "Lock", "Unknown event (type 40)", "units" },224{ 'H', "Lock", "Unknown event (type 41)", "units" },225{ 'H', "Lock", "Unknown event (type 42)", "units" },226{ 'H', "Lock", "Unknown event (type 43)", "units" },227{ 'H', "Lock", "Unknown event (type 44)", "units" },228{ 'H', "Lock", "Unknown event (type 45)", "units" },229{ 'H', "Lock", "Unknown event (type 46)", "units" },230{ 'H', "Lock", "Unknown event (type 47)", "units" },231{ 'H', "Lock", "Unknown event (type 48)", "units" },232{ 'H', "Lock", "Unknown event (type 49)", "units" },233{ 'H', "Lock", "Unknown event (type 50)", "units" },234{ 'H', "Lock", "Unknown event (type 51)", "units" },235{ 'H', "Lock", "Unknown event (type 52)", "units" },236{ 'H', "Lock", "Unknown event (type 53)", "units" },237{ 'H', "Lock", "Unknown event (type 54)", "units" },238{ 'H', "Lock", "Unknown event (type 55)", "units" },239#ifdef illumos240{ 'I', "CPU+PIL", "Profiling interrupt", "nsec",241#else242{ 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec",243#endif244"profile:::profile-97", NULL },245{ 'I', "Lock", "Unknown event (type 57)", "units" },246{ 'I', "Lock", "Unknown event (type 58)", "units" },247{ 'I', "Lock", "Unknown event (type 59)", "units" },248{ 'E', "Lock", "Recursive lock entry detected", "(N/A)",249"lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },250{ 'E', "Lock", "Lockstat enter failure", "(N/A)" },251{ 'E', "Lock", "Lockstat exit failure", "nsec" },252{ 'E', "Lock", "Lockstat record failure", "(N/A)" },253};254255#ifndef illumos256static char *g_pri_class[] = {257"",258"Intr",259"RealT",260"TShar",261"Idle"262};263#endif264265static void266fail(int do_perror, const char *message, ...)267{268va_list args;269int save_errno = errno;270271va_start(args, message);272(void) fprintf(stderr, "lockstat: ");273(void) vfprintf(stderr, message, args);274va_end(args);275if (do_perror)276(void) fprintf(stderr, ": %s", strerror(save_errno));277(void) fprintf(stderr, "\n");278exit(2);279}280281static void282dfail(const char *message, ...)283{284va_list args;285286va_start(args, message);287(void) fprintf(stderr, "lockstat: ");288(void) vfprintf(stderr, message, args);289va_end(args);290(void) fprintf(stderr, ": %s\n",291dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));292293exit(2);294}295296static void297show_events(char event_type, char *desc)298{299int i, first = -1, last;300301for (i = 0; i < LS_MAX_EVENTS; i++) {302ls_event_info_t *evp = &g_event_info[i];303if (evp->ev_type != event_type ||304strncmp(evp->ev_desc, "Unknown event", 13) == 0)305continue;306if (first == -1)307first = i;308last = i;309}310311(void) fprintf(stderr,312"\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",313desc, event_type, first, last);314315for (i = first; i <= last; i++)316(void) fprintf(stderr,317"%4d = %s\n", i, g_event_info[i].ev_desc);318}319320static void321usage(void)322{323(void) fprintf(stderr,324"Usage: lockstat [options] command [args]\n"325"\nGeneral options:\n\n"326" -V print the corresponding D program\n"327"\nEvent selection options:\n\n"328" -C watch contention events [on by default]\n"329" -E watch error events [off by default]\n"330" -H watch hold events [off by default]\n"331" -I watch interrupt events [off by default]\n"332" -A watch all lock events [equivalent to -CH]\n"333" -e event_list only watch the specified events (shown below);\n"334" <event_list> is a comma-separated list of\n"335" events or ranges of events, e.g. 1,4-7,35\n"336" -i rate interrupt rate for -I [default: %d Hz]\n"337"\nData gathering options:\n\n"338" -b basic statistics (lock, caller, event count)\n"339" -t timing for all events [default]\n"340" -h histograms for event times\n"341" -s depth stack traces <depth> deep\n"342" -x opt[=val] enable or modify DTrace options\n"343"\nData filtering options:\n\n"344" -n nrecords maximum number of data records [default: %d]\n"345" -l lock[,size] only watch <lock>, which can be specified as a\n"346" symbolic name or hex address; <size> defaults\n"347" to the ELF symbol size if available, 1 if not\n"348" -f func[,size] only watch events generated by <func>\n"349" -d duration only watch events longer than <duration>\n"350" -T trace (rather than sample) events\n"351"\nData reporting options:\n\n"352#ifdef illumos353" -c coalesce lock data for arrays like pse_mutex[]\n"354#endif355" -k coalesce PCs within functions\n"356" -g show total events generated by function\n"357" -w wherever: don't distinguish events by caller\n"358" -W whichever: don't distinguish events by lock\n"359" -R display rates rather than counts\n"360" -p parsable output format (awk(1)-friendly)\n"361" -P sort lock data by (count * avg_time) product\n"362" -D n only display top <n> events of each type\n"363" -o filename send output to <filename>\n",364DEFAULT_HZ, DEFAULT_NRECS);365366show_events('C', "Contention");367show_events('H', "Hold-time");368show_events('I', "Interrupt");369show_events('E', "Error");370(void) fprintf(stderr, "\n");371372exit(1);373}374375static int376lockcmp(lsrec_t *a, lsrec_t *b)377{378int i;379380if (a->ls_event < b->ls_event)381return (-1);382if (a->ls_event > b->ls_event)383return (1);384385for (i = g_stkdepth - 1; i >= 0; i--) {386if (a->ls_stack[i] < b->ls_stack[i])387return (-1);388if (a->ls_stack[i] > b->ls_stack[i])389return (1);390}391392if (a->ls_caller < b->ls_caller)393return (-1);394if (a->ls_caller > b->ls_caller)395return (1);396397#ifdef illumos398if (a->ls_lock < b->ls_lock)399return (-1);400if (a->ls_lock > b->ls_lock)401return (1);402403return (0);404#else405return (strcmp(a->ls_lock, b->ls_lock));406#endif407}408409static int410countcmp(lsrec_t *a, lsrec_t *b)411{412if (a->ls_event < b->ls_event)413return (-1);414if (a->ls_event > b->ls_event)415return (1);416417return (b->ls_count - a->ls_count);418}419420static int421timecmp(lsrec_t *a, lsrec_t *b)422{423if (a->ls_event < b->ls_event)424return (-1);425if (a->ls_event > b->ls_event)426return (1);427428if (a->ls_time < b->ls_time)429return (1);430if (a->ls_time > b->ls_time)431return (-1);432433return (0);434}435436static int437lockcmp_anywhere(lsrec_t *a, lsrec_t *b)438{439if (a->ls_event < b->ls_event)440return (-1);441if (a->ls_event > b->ls_event)442return (1);443444#ifdef illumos445if (a->ls_lock < b->ls_lock)446return (-1);447if (a->ls_lock > b->ls_lock)448return (1);449450return (0);451#else452return (strcmp(a->ls_lock, b->ls_lock));453#endif454}455456static int457lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)458{459#ifndef illumos460int cmp;461#endif462463if (a->ls_event < b->ls_event)464return (-1);465if (a->ls_event > b->ls_event)466return (1);467468#ifdef illumos469if (a->ls_lock < b->ls_lock)470return (-1);471if (a->ls_lock > b->ls_lock)472return (1);473#else474cmp = strcmp(a->ls_lock, b->ls_lock);475if (cmp != 0)476return (cmp);477#endif478479return (b->ls_count - a->ls_count);480}481482static int483sitecmp_anylock(lsrec_t *a, lsrec_t *b)484{485int i;486487if (a->ls_event < b->ls_event)488return (-1);489if (a->ls_event > b->ls_event)490return (1);491492for (i = g_stkdepth - 1; i >= 0; i--) {493if (a->ls_stack[i] < b->ls_stack[i])494return (-1);495if (a->ls_stack[i] > b->ls_stack[i])496return (1);497}498499if (a->ls_caller < b->ls_caller)500return (-1);501if (a->ls_caller > b->ls_caller)502return (1);503504return (0);505}506507static int508site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)509{510int i;511512if (a->ls_event < b->ls_event)513return (-1);514if (a->ls_event > b->ls_event)515return (1);516517for (i = g_stkdepth - 1; i >= 0; i--) {518if (a->ls_stack[i] < b->ls_stack[i])519return (-1);520if (a->ls_stack[i] > b->ls_stack[i])521return (1);522}523524if (a->ls_caller < b->ls_caller)525return (-1);526if (a->ls_caller > b->ls_caller)527return (1);528529return (b->ls_count - a->ls_count);530}531532static void533lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)534{535int m = n / 2;536int i, j;537538if (m > 1)539lsmergesort(cmp, a, b, m);540if (n - m > 1)541lsmergesort(cmp, a + m, b + m, n - m);542for (i = m; i > 0; i--)543b[i - 1] = a[i - 1];544for (j = m - 1; j < n - 1; j++)545b[n + m - j - 2] = a[j + 1];546while (i < j)547*a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];548*a = b[i];549}550551static void552coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)553{554int i, j;555lsrec_t *target, *current;556557target = lock[0];558559for (i = 1; i < n; i++) {560current = lock[i];561if (cmp(current, target) != 0) {562target = current;563continue;564}565current->ls_event = LS_MAX_EVENTS;566target->ls_count += current->ls_count;567target->ls_refcnt += current->ls_refcnt;568if (g_recsize < LS_TIME)569continue;570target->ls_time += current->ls_time;571if (g_recsize < LS_HIST)572continue;573for (j = 0; j < 64; j++)574target->ls_hist[j] += current->ls_hist[j];575}576}577578static void579coalesce_symbol(uintptr_t *addrp)580{581uintptr_t symoff;582size_t symsize;583584if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)585*addrp -= symoff;586}587588static void589predicate_add(char **pred, char *what, char *cmp, uintptr_t value)590{591char *new;592int len, newlen;593594if (what == NULL)595return;596597if (*pred == NULL) {598*pred = malloc(1);599*pred[0] = '\0';600}601602len = strlen(*pred);603newlen = len + strlen(what) + 32 + strlen("( && )");604new = malloc(newlen);605606if (*pred[0] != '\0') {607if (cmp != NULL) {608(void) sprintf(new, "(%s) && (%s %s %p)",609*pred, what, cmp, (void *)value);610} else {611(void) sprintf(new, "(%s) && (%s)", *pred, what);612}613} else {614if (cmp != NULL) {615(void) sprintf(new, "%s %s %p",616what, cmp, (void *)value);617} else {618(void) sprintf(new, "%s", what);619}620}621622free(*pred);623*pred = new;624}625626static void627predicate_destroy(char **pred)628{629free(*pred);630*pred = NULL;631}632633static void634filter_add(char **filt, char *what, uintptr_t base, size_t size)635{636char buf[256], *c = buf, *new;637int len, newlen;638639if (*filt == NULL) {640*filt = malloc(1);641*filt[0] = '\0';642}643644#ifdef illumos645(void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?646" || " : "", what, (void *)base, what, (void *)(base + size));647#else648(void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ?649" || " : "", what, (void *)base, what, (void *)(base + size));650#endif651652newlen = (len = strlen(*filt) + 1) + strlen(c);653new = malloc(newlen);654bcopy(*filt, new, len);655(void) strcat(new, c);656free(*filt);657*filt = new;658}659660static void661filter_destroy(char **filt)662{663free(*filt);664*filt = NULL;665}666667static void668dprog_add(const char *fmt, ...)669{670va_list args;671int size, offs;672char c;673674va_start(args, fmt);675size = vsnprintf(&c, 1, fmt, args) + 1;676va_end(args);677678if (g_proglen == 0) {679offs = 0;680} else {681offs = g_proglen - 1;682}683684g_proglen = offs + size;685686if ((g_prog = realloc(g_prog, g_proglen)) == NULL)687fail(1, "failed to reallocate program text");688689va_start(args, fmt);690(void) vsnprintf(&g_prog[offs], size, fmt, args);691va_end(args);692}693694/*695* This function may read like an open sewer, but keep in mind that programs696* that generate other programs are rarely pretty. If one has the unenviable697* task of maintaining or -- worse -- extending this code, use the -V option698* to examine the D program as generated by this function.699*/700static void701dprog_addevent(int event)702{703ls_event_info_t *info = &g_event_info[event];704char *pred = NULL;705char stack[20];706const char *arg0, *caller;707char *arg1 = "arg1";708char buf[80];709hrtime_t dur;710int depth;711712if (info->ev_name[0] == '\0')713return;714715if (info->ev_type == 'I') {716/*717* For interrupt events, arg0 (normally the lock pointer) is718* the CPU address plus the current pil, and arg1 (normally719* the number of nanoseconds) is the number of nanoseconds720* late -- and it's stored in arg2.721*/722#ifdef illumos723arg0 = "(uintptr_t)curthread->t_cpu + \n"724"\t curthread->t_cpu->cpu_profile_pil";725#else726arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"727"\t 0x01000000 + curthread->td_pri_class";728#endif729caller = "(uintptr_t)arg0";730arg1 = "arg2";731} else {732#ifdef illumos733arg0 = "(uintptr_t)arg0";734#else735arg0 = "stringof(args[0]->lock_object.lo_name)";736#endif737caller = "caller";738}739740if (g_recsize > LS_HIST) {741for (depth = 0; g_recsize > LS_STACK(depth); depth++)742continue;743744if (g_tracing) {745(void) sprintf(stack, "\tstack(%d);\n", depth);746} else {747(void) sprintf(stack, ", stack(%d)", depth);748}749} else {750(void) sprintf(stack, "");751}752753if (info->ev_acquire != NULL) {754/*755* If this is a hold event, we need to generate an additional756* clause for the acquire; the clause for the release will be757* generated with the aggregating statement, below.758*/759dprog_add("%s\n", info->ev_acquire);760predicate_add(&pred, info->ev_predicate, NULL, 0);761predicate_add(&pred, g_predicate, NULL, 0);762if (pred != NULL)763dprog_add("/%s/\n", pred);764765dprog_add("{\n");766(void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);767768if (info->ev_type == 'H') {769dprog_add("\t%s = timestamp;\n", buf);770} else {771/*772* If this isn't a hold event, it's the recursive773* error event. For this, we simply bump the774* thread-local, per-lock count.775*/776dprog_add("\t%s++;\n", buf);777}778779dprog_add("}\n\n");780predicate_destroy(&pred);781pred = NULL;782783if (info->ev_type == 'E') {784/*785* If this is the recursive lock error event, we need786* to generate an additional clause to decrement the787* thread-local, per-lock count. This assures that we788* only execute the aggregating clause if we have789* recursive entry.790*/791dprog_add("%s\n", info->ev_name);792dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);793}794795predicate_add(&pred, buf, NULL, 0);796797if (info->ev_type == 'H') {798(void) sprintf(buf, "timestamp -\n\t "799"self->ev%d[(uintptr_t)arg0]", event);800}801802arg1 = buf;803} else {804predicate_add(&pred, info->ev_predicate, NULL, 0);805if (info->ev_type != 'I')806predicate_add(&pred, g_predicate, NULL, 0);807else808predicate_add(&pred, g_ipredicate, NULL, 0);809}810811if ((dur = g_min_duration[event]) != 0)812predicate_add(&pred, arg1, ">=", dur);813814dprog_add("%s\n", info->ev_name);815816if (pred != NULL)817dprog_add("/%s/\n", pred);818predicate_destroy(&pred);819820dprog_add("{\n");821822if (g_tracing) {823dprog_add("\ttrace(%dULL);\n", event);824dprog_add("\ttrace(%s);\n", arg0);825dprog_add("\ttrace(%s);\n", caller);826dprog_add(stack);827} else {828/*829* The ordering here is important: when we process the830* aggregate, we count on the fact that @avg appears before831* @hist in program order to assure that @avg is assigned the832* first aggregation variable ID and @hist assigned the833* second; see the comment in process_aggregate() for details.834*/835dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",836event, arg0, caller, stack, arg1);837838if (g_recsize >= LS_HIST) {839dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"840"(%s);\n", event, arg0, caller, stack, arg1);841}842}843844if (info->ev_acquire != NULL)845dprog_add("\tself->ev%d[arg0] = 0;\n", event);846847dprog_add("}\n\n");848}849850static void851dprog_compile()852{853dtrace_prog_t *prog;854dtrace_proginfo_t info;855856if (g_Vflag) {857(void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");858(void) fputs(g_prog, stderr);859(void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");860}861862if ((prog = dtrace_program_strcompile(g_dtp, g_prog,863DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)864dfail("failed to compile program");865866if (dtrace_program_exec(g_dtp, prog, &info) == -1)867dfail("failed to enable probes");868869if (dtrace_go(g_dtp) != 0)870dfail("couldn't start tracing");871}872873static void874#ifdef illumos875status_fire(void)876#else877status_fire(int i)878#endif879{}880881static void882status_init(void)883{884dtrace_optval_t val, status, agg;885struct sigaction act;886struct itimerspec ts;887struct sigevent ev;888timer_t tid;889890if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)891dfail("failed to get 'statusrate'");892893if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)894dfail("failed to get 'statusrate'");895896/*897* We would want to awaken at a rate that is the GCD of the statusrate898* and the aggrate -- but that seems a bit absurd. Instead, we'll899* simply awaken at a rate that is the more frequent of the two, which900* assures that we're never later than the interval implied by the901* more frequent rate.902*/903val = status < agg ? status : agg;904905(void) sigemptyset(&act.sa_mask);906act.sa_flags = 0;907act.sa_handler = status_fire;908(void) sigaction(SIGUSR1, &act, NULL);909910ev.sigev_notify = SIGEV_SIGNAL;911ev.sigev_signo = SIGUSR1;912913if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)914dfail("cannot create CLOCK_REALTIME timer");915916ts.it_value.tv_sec = val / NANOSEC;917ts.it_value.tv_nsec = val % NANOSEC;918ts.it_interval = ts.it_value;919920if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)921dfail("cannot set time on CLOCK_REALTIME timer");922}923924static void925status_check(void)926{927if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)928dfail("failed to snap aggregate");929930if (dtrace_status(g_dtp) == -1)931dfail("dtrace_status()");932}933934static void935lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)936{937bzero(lsrec, g_recsize);938lsrec->ls_count = 1;939940if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))941fail(0, "truncated DTrace record");942943if (rec->dtrd_size != sizeof (uint64_t))944fail(0, "bad event size in first record");945946/* LINTED - alignment */947lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));948rec++;949950#ifdef illumos951if (rec->dtrd_size != sizeof (uintptr_t))952fail(0, "bad lock address size in second record");953954/* LINTED - alignment */955lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));956rec++;957#else958lsrec->ls_lock = strdup((const char *)(data + rec->dtrd_offset));959rec++;960#endif961962if (rec->dtrd_size != sizeof (uintptr_t))963fail(0, "bad caller size in third record");964965/* LINTED - alignment */966lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));967rec++;968969if (g_recsize > LS_HIST) {970int frames, i;971pc_t *stack;972973frames = rec->dtrd_size / sizeof (pc_t);974/* LINTED - alignment */975stack = (pc_t *)(data + rec->dtrd_offset);976977for (i = 1; i < frames; i++)978lsrec->ls_stack[i - 1] = stack[i];979}980}981982/*ARGSUSED*/983static int984count_aggregate(const dtrace_aggdata_t *agg, void *arg)985{986*((size_t *)arg) += 1;987988return (DTRACE_AGGWALK_NEXT);989}990991static int992process_aggregate(const dtrace_aggdata_t *agg, void *arg)993{994const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;995caddr_t data = agg->dtada_data;996lsdata_t *lsdata = arg;997lsrec_t *lsrec = lsdata->lsd_next;998const dtrace_recdesc_t *rec;999uint64_t *avg, *quantized;1000int i, j;10011002assert(lsdata->lsd_count < g_nrecs);10031004/*1005* Aggregation variable IDs are guaranteed to be generated in program1006* order, and they are guaranteed to start from DTRACE_AGGVARIDNONE1007* plus one. As "avg" appears before "hist" in program order, we know1008* that "avg" will be allocated the first aggregation variable ID, and1009* "hist" will be allocated the second aggregation variable ID -- and1010* we therefore use the aggregation variable ID to differentiate the1011* cases.1012*/1013if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {1014/*1015* If this is the histogram entry. We'll copy the quantized1016* data into lc_hist, and jump over the rest.1017*/1018rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];10191020if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)1021fail(0, "bad variable ID in aggregation record");10221023if (rec->dtrd_size !=1024DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))1025fail(0, "bad quantize size in aggregation record");10261027/* LINTED - alignment */1028quantized = (uint64_t *)(data + rec->dtrd_offset);10291030for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;1031i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)1032lsrec->ls_hist[j] = quantized[i];10331034goto out;1035}10361037lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],1038aggdesc->dtagd_nrecs - 1, data);10391040rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];10411042if (rec->dtrd_size != 2 * sizeof (uint64_t))1043fail(0, "bad avg size in aggregation record");10441045/* LINTED - alignment */1046avg = (uint64_t *)(data + rec->dtrd_offset);1047lsrec->ls_count = (uint32_t)avg[0];1048lsrec->ls_time = (uintptr_t)avg[1];10491050if (g_recsize >= LS_HIST)1051return (DTRACE_AGGWALK_NEXT);10521053out:1054lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);1055lsdata->lsd_count++;10561057return (DTRACE_AGGWALK_NEXT);1058}10591060static int1061process_trace(const dtrace_probedata_t *pdata, void *arg)1062{1063lsdata_t *lsdata = arg;1064lsrec_t *lsrec = lsdata->lsd_next;1065dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;1066caddr_t data = pdata->dtpda_data;10671068if (lsdata->lsd_count >= g_nrecs)1069return (DTRACE_CONSUME_NEXT);10701071lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);10721073lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);1074lsdata->lsd_count++;10751076return (DTRACE_CONSUME_NEXT);1077}10781079static int1080process_data(FILE *out, char *data)1081{1082lsdata_t lsdata;10831084/* LINTED - alignment */1085lsdata.lsd_next = (lsrec_t *)data;1086lsdata.lsd_count = 0;10871088if (g_tracing) {1089if (dtrace_consume(g_dtp, out,1090process_trace, NULL, &lsdata) != 0)1091dfail("failed to consume buffer");10921093return (lsdata.lsd_count);1094}10951096if (dtrace_aggregate_walk_keyvarsorted(g_dtp,1097process_aggregate, &lsdata) != 0)1098dfail("failed to walk aggregate");10991100return (lsdata.lsd_count);1101}11021103/*ARGSUSED*/1104static int1105drophandler(const dtrace_dropdata_t *data, void *arg)1106{1107g_dropped++;1108(void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);1109return (DTRACE_HANDLE_OK);1110}11111112int1113main(int argc, char **argv)1114{1115char *data_buf;1116lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;1117FILE *out = stdout;1118int c;1119pid_t child;1120int status;1121int i, j;1122hrtime_t duration;1123char *addrp, *offp, *sizep, *evp, *lastp, *p;1124uintptr_t addr;1125size_t size, off;1126int events_specified = 0;1127int exec_errno = 0;1128uint32_t event;1129char *filt = NULL, *ifilt = NULL;1130static uint64_t ev_count[LS_MAX_EVENTS + 1];1131static uint64_t ev_time[LS_MAX_EVENTS + 1];1132dtrace_optval_t aggsize;1133char aggstr[10];1134long ncpus;1135int dynvar = 0;1136int err;11371138if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {1139fail(0, "cannot open dtrace library: %s",1140dtrace_errmsg(NULL, err));1141}11421143if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)1144dfail("couldn't establish drop handler");11451146if (symtab_init() == -1)1147fail(1, "can't load kernel symbols");11481149g_nrecs = DEFAULT_NRECS;11501151while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {1152switch (c) {1153case 'b':1154g_recsize = LS_BASIC;1155break;11561157case 't':1158g_recsize = LS_TIME;1159break;11601161case 'h':1162g_recsize = LS_HIST;1163break;11641165case 's':1166if (!isdigit(optarg[0]))1167usage();1168g_stkdepth = atoi(optarg);1169if (g_stkdepth > LS_MAX_STACK_DEPTH)1170fail(0, "max stack depth is %d",1171LS_MAX_STACK_DEPTH);1172g_recsize = LS_STACK(g_stkdepth);1173break;11741175case 'n':1176if (!isdigit(optarg[0]))1177usage();1178g_nrecs = atoi(optarg);1179break;11801181case 'd':1182if (!isdigit(optarg[0]))1183usage();1184duration = atoll(optarg);11851186/*1187* XXX -- durations really should be per event1188* since the units are different, but it's hard1189* to express this nicely in the interface.1190* Not clear yet what the cleanest solution is.1191*/1192for (i = 0; i < LS_MAX_EVENTS; i++)1193if (g_event_info[i].ev_type != 'E')1194g_min_duration[i] = duration;11951196break;11971198case 'i':1199if (!isdigit(optarg[0]))1200usage();1201i = atoi(optarg);1202if (i <= 0)1203usage();1204if (i > MAX_HZ)1205fail(0, "max interrupt rate is %d Hz", MAX_HZ);12061207for (j = 0; j < LS_MAX_EVENTS; j++)1208if (strcmp(g_event_info[j].ev_desc,1209"Profiling interrupt") == 0)1210break;12111212(void) sprintf(g_event_info[j].ev_name,1213"profile:::profile-%d", i);1214break;12151216case 'l':1217case 'f':1218addrp = strtok(optarg, ",");1219sizep = strtok(NULL, ",");1220addrp = strtok(optarg, ",+");1221offp = strtok(NULL, ",");12221223size = sizep ? strtoul(sizep, NULL, 0) : 1;1224off = offp ? strtoul(offp, NULL, 0) : 0;12251226if (addrp[0] == '0') {1227addr = strtoul(addrp, NULL, 16) + off;1228} else {1229addr = sym_to_addr(addrp) + off;1230if (sizep == NULL)1231size = sym_size(addrp) - off;1232if (addr - off == 0)1233fail(0, "symbol '%s' not found", addrp);1234if (size == 0)1235size = 1;1236}123712381239if (c == 'l') {1240filter_add(&filt, "arg0", addr, size);1241} else {1242filter_add(&filt, "caller", addr, size);1243filter_add(&ifilt, "arg0", addr, size);1244}1245break;12461247case 'e':1248evp = strtok_r(optarg, ",", &lastp);1249while (evp) {1250int ev1, ev2;1251char *evp2;12521253(void) strtok(evp, "-");1254evp2 = strtok(NULL, "-");1255ev1 = atoi(evp);1256ev2 = evp2 ? atoi(evp2) : ev1;1257if ((uint_t)ev1 >= LS_MAX_EVENTS ||1258(uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)1259fail(0, "-e events out of range");1260for (i = ev1; i <= ev2; i++)1261g_enabled[i] = 1;1262evp = strtok_r(NULL, ",", &lastp);1263}1264events_specified = 1;1265break;12661267#ifdef illumos1268case 'c':1269g_cflag = 1;1270break;1271#endif12721273case 'k':1274g_kflag = 1;1275break;12761277case 'w':1278g_wflag = 1;1279break;12801281case 'W':1282g_Wflag = 1;1283break;12841285case 'g':1286g_gflag = 1;1287break;12881289case 'C':1290case 'E':1291case 'H':1292case 'I':1293for (i = 0; i < LS_MAX_EVENTS; i++)1294if (g_event_info[i].ev_type == c)1295g_enabled[i] = 1;1296events_specified = 1;1297break;12981299case 'A':1300for (i = 0; i < LS_MAX_EVENTS; i++)1301if (strchr("CH", g_event_info[i].ev_type))1302g_enabled[i] = 1;1303events_specified = 1;1304break;13051306case 'T':1307g_tracing = 1;1308break;13091310case 'D':1311if (!isdigit(optarg[0]))1312usage();1313g_topn = atoi(optarg);1314break;13151316case 'R':1317g_rates = 1;1318break;13191320case 'p':1321g_pflag = 1;1322break;13231324case 'P':1325g_Pflag = 1;1326break;13271328case 'o':1329if ((out = fopen(optarg, "w")) == NULL)1330fail(1, "error opening file");1331break;13321333case 'V':1334g_Vflag = 1;1335break;13361337default:1338if (strchr(LOCKSTAT_OPTSTR, c) == NULL)1339usage();1340}1341}13421343if (filt != NULL) {1344predicate_add(&g_predicate, filt, NULL, 0);1345filter_destroy(&filt);1346}13471348if (ifilt != NULL) {1349predicate_add(&g_ipredicate, ifilt, NULL, 0);1350filter_destroy(&ifilt);1351}13521353if (g_recsize == 0) {1354if (g_gflag) {1355g_stkdepth = LS_MAX_STACK_DEPTH;1356g_recsize = LS_STACK(g_stkdepth);1357} else {1358g_recsize = LS_TIME;1359}1360}13611362if (g_gflag && g_recsize <= LS_STACK(0))1363fail(0, "'-g' requires at least '-s 1' data gathering");13641365/*1366* Make sure the alignment is reasonable1367*/1368g_recsize = -(-g_recsize & -sizeof (uint64_t));13691370for (i = 0; i < LS_MAX_EVENTS; i++) {1371/*1372* If no events were specified, enable -C.1373*/1374if (!events_specified && g_event_info[i].ev_type == 'C')1375g_enabled[i] = 1;1376}13771378for (i = 0; i < LS_MAX_EVENTS; i++) {1379if (!g_enabled[i])1380continue;13811382if (g_event_info[i].ev_acquire != NULL) {1383/*1384* If we've enabled a hold event, we must explicitly1385* allocate dynamic variable space.1386*/1387dynvar = 1;1388}13891390dprog_addevent(i);1391}13921393/*1394* Make sure there are remaining arguments to specify a child command1395* to execute.1396*/1397if (argc <= optind)1398usage();13991400if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)1401dfail("couldn't determine number of online CPUs");14021403/*1404* By default, we set our data buffer size to be the number of records1405* multiplied by the size of the record, doubled to account for some1406* DTrace slop and divided by the number of CPUs. We silently clamp1407* the aggregation size at both a minimum and a maximum to prevent1408* absurdly low or high values.1409*/1410if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)1411aggsize = MIN_AGGSIZE;14121413if (aggsize > MAX_AGGSIZE)1414aggsize = MAX_AGGSIZE;14151416(void) sprintf(aggstr, "%lld", (long long)aggsize);14171418if (!g_tracing) {1419if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)1420dfail("failed to set 'bufsize'");14211422if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)1423dfail("failed to set 'aggsize'");14241425if (dynvar) {1426/*1427* If we're using dynamic variables, we set our1428* dynamic variable size to be one megabyte per CPU,1429* with a hard-limit of 32 megabytes. This may still1430* be too small in some cases, but it can be tuned1431* manually via -x if need be.1432*/1433(void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);14341435if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)1436dfail("failed to set 'dynvarsize'");1437}1438} else {1439if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)1440dfail("failed to set 'bufsize'");1441}14421443if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)1444dfail("failed to set 'statusrate'");14451446optind = 1;1447while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {1448switch (c) {1449case 'x':1450if ((p = strchr(optarg, '=')) != NULL)1451*p++ = '\0';14521453if (dtrace_setopt(g_dtp, optarg, p) != 0)1454dfail("failed to set -x %s", optarg);1455break;1456}1457}14581459argc -= optind;1460argv += optind;14611462dprog_compile();1463status_init();14641465g_elapsed = -gethrtime();14661467/*1468* Spawn the specified command and wait for it to complete.1469*/1470child = fork();1471if (child == -1)1472fail(1, "cannot fork");1473if (child == 0) {1474(void) dtrace_close(g_dtp);1475(void) execvp(argv[0], &argv[0]);1476exec_errno = errno;1477exit(127);1478}14791480#ifdef illumos1481while (waitpid(child, &status, WEXITED) != child)1482#else1483while (waitpid(child, &status, 0) != child)1484#endif1485status_check();14861487g_elapsed += gethrtime();14881489if (WIFEXITED(status)) {1490if (WEXITSTATUS(status) != 0) {1491if (exec_errno != 0) {1492errno = exec_errno;1493fail(1, "could not execute %s", argv[0]);1494}1495(void) fprintf(stderr,1496"lockstat: warning: %s exited with code %d\n",1497argv[0], WEXITSTATUS(status));1498}1499} else {1500(void) fprintf(stderr,1501"lockstat: warning: %s died on signal %d\n",1502argv[0], WTERMSIG(status));1503}15041505if (dtrace_stop(g_dtp) == -1)1506dfail("failed to stop dtrace");15071508/*1509* Before we read out the results, we need to allocate our buffer.1510* If we're tracing, then we'll just use the precalculated size. If1511* we're not, then we'll take a snapshot of the aggregate, and walk1512* it to count the number of records.1513*/1514if (!g_tracing) {1515if (dtrace_aggregate_snap(g_dtp) != 0)1516dfail("failed to snap aggregate");15171518g_nrecs = 0;15191520if (dtrace_aggregate_walk(g_dtp,1521count_aggregate, &g_nrecs) != 0)1522dfail("failed to walk aggregate");1523}15241525#ifdef illumos1526if ((data_buf = memalign(sizeof (uint64_t),1527(g_nrecs + 1) * g_recsize)) == NULL)1528#else1529if (posix_memalign((void **)&data_buf, sizeof (uint64_t),1530(g_nrecs + 1) * g_recsize) )1531#endif1532fail(1, "Memory allocation failed");15331534/*1535* Read out the DTrace data.1536*/1537g_nrecs_used = process_data(out, data_buf);15381539if (g_nrecs_used > g_nrecs || g_dropped)1540(void) fprintf(stderr, "lockstat: warning: "1541"ran out of data records (use -n for more)\n");15421543/* LINTED - alignment */1544for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,1545/* LINTED - alignment */1546lsp = (lsrec_t *)((char *)lsp + g_recsize)) {1547ev_count[lsp->ls_event] += lsp->ls_count;1548ev_time[lsp->ls_event] += lsp->ls_time;1549}15501551/*1552* If -g was specified, convert stacks into individual records.1553*/1554if (g_gflag) {1555lsrec_t *newlsp, *oldlsp;15561557#ifdef illumos1558newlsp = memalign(sizeof (uint64_t),1559g_nrecs_used * LS_TIME * (g_stkdepth + 1));1560#else1561posix_memalign((void **)&newlsp, sizeof (uint64_t),1562g_nrecs_used * LS_TIME * (g_stkdepth + 1));1563#endif1564if (newlsp == NULL)1565fail(1, "Cannot allocate space for -g processing");1566lsp = newlsp;1567/* LINTED - alignment */1568for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,1569/* LINTED - alignment */1570oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {1571int fr;1572int caller_in_stack = 0;15731574if (oldlsp->ls_count == 0)1575continue;15761577for (fr = 0; fr < g_stkdepth; fr++) {1578if (oldlsp->ls_stack[fr] == 0)1579break;1580if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)1581caller_in_stack = 1;1582bcopy(oldlsp, lsp, LS_TIME);1583lsp->ls_caller = oldlsp->ls_stack[fr];1584#ifndef illumos1585lsp->ls_lock = strdup(oldlsp->ls_lock);1586#endif1587/* LINTED - alignment */1588lsp = (lsrec_t *)((char *)lsp + LS_TIME);1589}1590if (!caller_in_stack) {1591bcopy(oldlsp, lsp, LS_TIME);1592/* LINTED - alignment */1593lsp = (lsrec_t *)((char *)lsp + LS_TIME);1594}1595#ifndef illumos1596free(oldlsp->ls_lock);1597#endif1598}1599g_nrecs = g_nrecs_used =1600((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;1601g_recsize = LS_TIME;1602g_stkdepth = 0;1603free(data_buf);1604data_buf = (char *)newlsp;1605}16061607if ((sort_buf = calloc(2 * (g_nrecs + 1),1608sizeof (void *))) == NULL)1609fail(1, "Sort buffer allocation failed");1610merge_buf = sort_buf + (g_nrecs + 1);16111612/*1613* Build the sort buffer, discarding zero-count records along the way.1614*/1615/* LINTED - alignment */1616for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,1617/* LINTED - alignment */1618lsp = (lsrec_t *)((char *)lsp + g_recsize)) {1619if (lsp->ls_count == 0)1620lsp->ls_event = LS_MAX_EVENTS;1621sort_buf[i] = lsp;1622}16231624if (g_nrecs_used == 0)1625exit(0);16261627/*1628* Add a sentinel after the last record1629*/1630sort_buf[i] = lsp;1631lsp->ls_event = LS_MAX_EVENTS;16321633if (g_tracing) {1634report_trace(out, sort_buf);1635return (0);1636}16371638/*1639* Application of -g may have resulted in multiple records1640* with the same signature; coalesce them.1641*/1642if (g_gflag) {1643mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);1644coalesce(lockcmp, sort_buf, g_nrecs_used);1645}16461647/*1648* Coalesce locks within the same symbol if -c option specified.1649* Coalesce PCs within the same function if -k option specified.1650*/1651if (g_cflag || g_kflag) {1652for (i = 0; i < g_nrecs_used; i++) {1653int fr;1654lsp = sort_buf[i];1655#ifdef illumos1656if (g_cflag)1657coalesce_symbol(&lsp->ls_lock);1658#endif1659if (g_kflag) {1660for (fr = 0; fr < g_stkdepth; fr++)1661coalesce_symbol(&lsp->ls_stack[fr]);1662coalesce_symbol(&lsp->ls_caller);1663}1664}1665mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);1666coalesce(lockcmp, sort_buf, g_nrecs_used);1667}16681669/*1670* Coalesce callers if -w option specified1671*/1672if (g_wflag) {1673mergesort(lock_and_count_cmp_anywhere,1674sort_buf, merge_buf, g_nrecs_used);1675coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);1676}16771678/*1679* Coalesce locks if -W option specified1680*/1681if (g_Wflag) {1682mergesort(site_and_count_cmp_anylock,1683sort_buf, merge_buf, g_nrecs_used);1684coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);1685}16861687/*1688* Sort data by contention count (ls_count) or total time (ls_time),1689* depending on g_Pflag. Override g_Pflag if time wasn't measured.1690*/1691if (g_recsize < LS_TIME)1692g_Pflag = 0;16931694if (g_Pflag)1695mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);1696else1697mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);16981699/*1700* Display data by event type1701*/1702first = &sort_buf[0];1703while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {1704current = first;1705while ((lsp = *current)->ls_event == event)1706current++;1707report_stats(out, first, current - first, ev_count[event],1708ev_time[event]);1709first = current;1710}17111712#ifndef illumos1713/*1714* Free lock name buffers1715*/1716for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,1717lsp = (lsrec_t *)((char *)lsp + g_recsize))1718free(lsp->ls_lock);1719#endif17201721return (0);1722}17231724static char *1725format_symbol(char *buf, uintptr_t addr, int show_size)1726{1727uintptr_t symoff;1728char *symname;1729size_t symsize;17301731symname = addr_to_sym(addr, &symoff, &symsize);17321733if (show_size && symoff == 0)1734(void) sprintf(buf, "%s[%ld]", symname, (long)symsize);1735else if (symoff == 0)1736(void) sprintf(buf, "%s", symname);1737else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */1738#ifdef illumos1739(void) sprintf(buf, "%s+%ld", symname, (long)symoff);1740#else1741(void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);1742#endif1743else if (symoff <= symsize || (symoff < 256 && addr != symoff))1744(void) sprintf(buf, "%s+0x%llx", symname,1745(unsigned long long)symoff);1746else1747(void) sprintf(buf, "0x%llx", (unsigned long long)addr);1748return (buf);1749}17501751static void1752report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,1753uint64_t total_time)1754{1755uint32_t event = sort_buf[0]->ls_event;1756lsrec_t *lsp;1757double ptotal = 0.0;1758double percent;1759int i, j, fr;1760int displayed;1761int first_bin, last_bin, max_bin_count, total_bin_count;1762int rectype;1763char buf[256];1764char lhdr[80], chdr[80];17651766rectype = g_recsize;17671768if (g_topn == 0) {1769(void) fprintf(out, "%20llu %s\n",1770g_rates == 0 ? total_count :1771((unsigned long long)total_count * NANOSEC) / g_elapsed,1772g_event_info[event].ev_desc);1773return;1774}17751776(void) sprintf(lhdr, "%s%s",1777g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);1778(void) sprintf(chdr, "%s%s",1779g_wflag ? "Hottest " : "", "Caller");17801781if (!g_pflag)1782(void) fprintf(out,1783"\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",1784g_event_info[event].ev_desc, (double)total_count,1785(double)g_elapsed / NANOSEC,1786(double)total_count * NANOSEC / g_elapsed);17871788if (!g_pflag && rectype < LS_HIST) {1789(void) sprintf(buf, "%s", g_event_info[event].ev_units);1790(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",1791g_rates ? "ops/s" : "Count",1792g_gflag ? "genr" : "indv",1793"cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);1794(void) fprintf(out, "---------------------------------"1795"----------------------------------------------\n");1796}17971798displayed = 0;1799for (i = 0; i < nrecs; i++) {1800lsp = sort_buf[i];18011802if (displayed++ >= g_topn)1803break;18041805if (g_pflag) {1806int j;18071808(void) fprintf(out, "%u %u",1809lsp->ls_event, lsp->ls_count);1810#ifdef illumos1811(void) fprintf(out, " %s",1812format_symbol(buf, lsp->ls_lock, g_cflag));1813#else1814(void) fprintf(out, " %s", lsp->ls_lock);1815#endif1816(void) fprintf(out, " %s",1817format_symbol(buf, lsp->ls_caller, 0));1818(void) fprintf(out, " %f",1819(double)lsp->ls_refcnt / lsp->ls_count);1820if (rectype >= LS_TIME)1821(void) fprintf(out, " %llu",1822(unsigned long long)lsp->ls_time);1823if (rectype >= LS_HIST) {1824for (j = 0; j < 64; j++)1825(void) fprintf(out, " %u",1826lsp->ls_hist[j]);1827}1828for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {1829if (rectype <= LS_STACK(j) ||1830lsp->ls_stack[j] == 0)1831break;1832(void) fprintf(out, " %s",1833format_symbol(buf, lsp->ls_stack[j], 0));1834}1835(void) fprintf(out, "\n");1836continue;1837}18381839if (rectype >= LS_HIST) {1840(void) fprintf(out, "---------------------------------"1841"----------------------------------------------\n");1842(void) sprintf(buf, "%s",1843g_event_info[event].ev_units);1844(void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",1845g_rates ? "ops/s" : "Count",1846g_gflag ? "genr" : "indv",1847"cuml", "rcnt", buf, lhdr, chdr);1848}18491850if (g_Pflag && total_time != 0)1851percent = (lsp->ls_time * 100.00) / total_time;1852else1853percent = (lsp->ls_count * 100.00) / total_count;18541855ptotal += percent;18561857if (rectype >= LS_TIME)1858(void) sprintf(buf, "%llu",1859(unsigned long long)(lsp->ls_time / lsp->ls_count));1860else1861buf[0] = '\0';18621863(void) fprintf(out, "%5llu ",1864g_rates == 0 ? lsp->ls_count :1865((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);18661867(void) fprintf(out, "%3.0f%% ", percent);18681869if (g_gflag)1870(void) fprintf(out, "---- ");1871else1872(void) fprintf(out, "%3.0f%% ", ptotal);18731874(void) fprintf(out, "%4.2f %8s ",1875(double)lsp->ls_refcnt / lsp->ls_count, buf);18761877#ifdef illumos1878(void) fprintf(out, "%-22s ",1879format_symbol(buf, lsp->ls_lock, g_cflag));1880#else1881(void) fprintf(out, "%-22s ", lsp->ls_lock);1882#endif18831884(void) fprintf(out, "%-24s\n",1885format_symbol(buf, lsp->ls_caller, 0));18861887if (rectype < LS_HIST)1888continue;18891890(void) fprintf(out, "\n");1891(void) fprintf(out, "%10s %31s %-9s %-24s\n",1892g_event_info[event].ev_units,1893"------ Time Distribution ------",1894g_rates ? "ops/s" : "count",1895rectype > LS_STACK(0) ? "Stack" : "");18961897first_bin = 0;1898while (lsp->ls_hist[first_bin] == 0)1899first_bin++;19001901last_bin = 63;1902while (lsp->ls_hist[last_bin] == 0)1903last_bin--;19041905max_bin_count = 0;1906total_bin_count = 0;1907for (j = first_bin; j <= last_bin; j++) {1908total_bin_count += lsp->ls_hist[j];1909if (lsp->ls_hist[j] > max_bin_count)1910max_bin_count = lsp->ls_hist[j];1911}19121913/*1914* If we went a few frames below the caller, ignore them1915*/1916for (fr = 3; fr > 0; fr--)1917if (lsp->ls_stack[fr] == lsp->ls_caller)1918break;19191920for (j = first_bin; j <= last_bin; j++) {1921uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;1922(void) fprintf(out, "%10llu |%s%s %-9u ",19231ULL << j,1924"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,1925" " + depth,1926g_rates == 0 ? lsp->ls_hist[j] :1927(uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /1928g_elapsed));1929if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {1930(void) fprintf(out, "\n");1931continue;1932}1933(void) fprintf(out, "%-24s\n",1934format_symbol(buf, lsp->ls_stack[fr], 0));1935fr++;1936}1937while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {1938(void) fprintf(out, "%15s %-36s %-24s\n", "", "",1939format_symbol(buf, lsp->ls_stack[fr], 0));1940fr++;1941}1942}19431944if (!g_pflag)1945(void) fprintf(out, "---------------------------------"1946"----------------------------------------------\n");19471948(void) fflush(out);1949}19501951static void1952report_trace(FILE *out, lsrec_t **sort_buf)1953{1954lsrec_t *lsp;1955int i, fr;1956int rectype;1957char buf[256], buf2[256];19581959rectype = g_recsize;19601961if (!g_pflag) {1962(void) fprintf(out, "%5s %7s %11s %-24s %-24s\n",1963"Event", "Time", "Owner", "Lock", "Caller");1964(void) fprintf(out, "---------------------------------"1965"----------------------------------------------\n");1966}19671968for (i = 0; i < g_nrecs_used; i++) {19691970lsp = sort_buf[i];19711972if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)1973continue;19741975(void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n",1976lsp->ls_event, (unsigned long long)lsp->ls_time,1977(void *)lsp->ls_next,1978#ifdef illumos1979format_symbol(buf, lsp->ls_lock, 0),1980#else1981lsp->ls_lock,1982#endif1983format_symbol(buf2, lsp->ls_caller, 0));19841985if (rectype <= LS_STACK(0))1986continue;19871988/*1989* If we went a few frames below the caller, ignore them1990*/1991for (fr = 3; fr > 0; fr--)1992if (lsp->ls_stack[fr] == lsp->ls_caller)1993break;19941995while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {1996(void) fprintf(out, "%53s %-24s\n", "",1997format_symbol(buf, lsp->ls_stack[fr], 0));1998fr++;1999}2000(void) fprintf(out, "\n");2001}20022003(void) fflush(out);2004}200520062007