/* Path: sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c */
/*1* CDDL HEADER START2*3* The contents of this file are subject to the terms of the4* Common Development and Distribution License (the "License").5* You may not use this file except in compliance with the License.6*7* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE8* or http://www.opensolaris.org/os/licensing.9* See the License for the specific language governing permissions10* and limitations under the License.11*12* When distributing Covered Code, include this CDDL HEADER in each13* file and include the License file at usr/src/OPENSOLARIS.LICENSE.14* If applicable, add the following below this CDDL HEADER, with the15* fields enclosed by brackets "[]" replaced with your own identifying16* information: Portions Copyright [yyyy] [name of copyright owner]17*18* CDDL HEADER END19*/2021/*22* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.23* Copyright (c) 2016, Joyent, Inc. All rights reserved.24* Copyright (c) 2012, 2014 by Delphix. All rights reserved.25*/2627/*28* DTrace - Dynamic Tracing for Solaris29*30* This is the implementation of the Solaris Dynamic Tracing framework31* (DTrace). The user-visible interface to DTrace is described at length in32* the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace33* library, the in-kernel DTrace framework, and the DTrace providers are34* described in the block comments in the <sys/dtrace.h> header file. The35* internal architecture of DTrace is described in the block comments in the36* <sys/dtrace_impl.h> header file. 
The comments contained within the DTrace37* implementation very much assume mastery of all of these sources; if one has38* an unanswered question about the implementation, one should consult them39* first.40*41* The functions here are ordered roughly as follows:42*43* - Probe context functions44* - Probe hashing functions45* - Non-probe context utility functions46* - Matching functions47* - Provider-to-Framework API functions48* - Probe management functions49* - DIF object functions50* - Format functions51* - Predicate functions52* - ECB functions53* - Buffer functions54* - Enabling functions55* - DOF functions56* - Anonymous enabling functions57* - Consumer state functions58* - Helper functions59* - Hook functions60* - Driver cookbook functions61*62* Each group of functions begins with a block comment labelled the "DTrace63* [Group] Functions", allowing one to find each block by searching forward64* on capital-f functions.65*/66#include <sys/errno.h>67#include <sys/param.h>68#include <sys/types.h>69#ifndef illumos70#include <sys/time.h>71#endif72#include <sys/stat.h>73#include <sys/conf.h>74#include <sys/systm.h>75#include <sys/endian.h>76#ifdef illumos77#include <sys/ddi.h>78#include <sys/sunddi.h>79#endif80#include <sys/cpuvar.h>81#include <sys/kmem.h>82#ifdef illumos83#include <sys/strsubr.h>84#endif85#include <sys/sysmacros.h>86#include <sys/dtrace_impl.h>87#include <sys/atomic.h>88#include <sys/cmn_err.h>89#ifdef illumos90#include <sys/mutex_impl.h>91#include <sys/rwlock_impl.h>92#endif93#include <sys/ctf_api.h>94#ifdef illumos95#include <sys/panic.h>96#include <sys/priv_impl.h>97#endif98#ifdef illumos99#include <sys/cred_impl.h>100#include <sys/procfs_isa.h>101#endif102#include <sys/taskq.h>103#ifdef illumos104#include <sys/mkdev.h>105#include <sys/kdi.h>106#endif107#include <sys/zone.h>108#include <sys/socket.h>109#include <netinet/in.h>110#include "strtolctype.h"111112/* FreeBSD includes: */113#ifndef illumos114#include <sys/callout.h>115#include 
<sys/ctype.h>116#include <sys/eventhandler.h>117#include <sys/limits.h>118#include <sys/linker.h>119#include <sys/kdb.h>120#include <sys/jail.h>121#include <sys/kernel.h>122#include <sys/malloc.h>123#include <sys/lock.h>124#include <sys/mutex.h>125#include <sys/ptrace.h>126#include <sys/random.h>127#include <sys/rwlock.h>128#include <sys/sx.h>129#include <sys/sysctl.h>130131132#include <sys/mount.h>133#undef AT_UID134#undef AT_GID135#include <sys/vnode.h>136#include <sys/cred.h>137138#include <sys/dtrace_bsd.h>139140#include <netinet/in.h>141142#include "dtrace_cddl.h"143#include "dtrace_debug.c"144#endif145146#include "dtrace_xoroshiro128_plus.h"147148/*149* DTrace Tunable Variables150*151* The following variables may be tuned by adding a line to /etc/system that152* includes both the name of the DTrace module ("dtrace") and the name of the153* variable. For example:154*155* set dtrace:dtrace_destructive_disallow = 1156*157* In general, the only variables that one should be tuning this way are those158* that affect system-wide DTrace behavior, and for which the default behavior159* is undesirable. Most of these variables are tunable on a per-consumer160* basis using DTrace options, and need not be tuned on a system-wide basis.161* When tuning these variables, avoid pathological values; while some attempt162* is made to verify the integrity of these variables, they are not considered163* part of the supported interface to DTrace, and they are therefore not164* checked comprehensively. 
Further, these variables should not be tuned165* dynamically via "mdb -kw" or other means; they should only be tuned via166* /etc/system.167*/168int dtrace_destructive_disallow = 0;169#ifndef illumos170/* Positive logic version of dtrace_destructive_disallow for loader tunable */171int dtrace_allow_destructive = 1;172#endif173dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);174size_t dtrace_difo_maxsize = (256 * 1024);175dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);176size_t dtrace_statvar_maxsize = (16 * 1024);177size_t dtrace_actions_max = (16 * 1024);178size_t dtrace_retain_max = 1024;179dtrace_optval_t dtrace_helper_actions_max = 128;180dtrace_optval_t dtrace_helper_providers_max = 32;181dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);182size_t dtrace_strsize_default = 256;183dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */184dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */185dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */186dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */187dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */188dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */189dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */190dtrace_optval_t dtrace_nspec_default = 1;191dtrace_optval_t dtrace_specsize_default = 32 * 1024;192dtrace_optval_t dtrace_stackframes_default = 20;193dtrace_optval_t dtrace_ustackframes_default = 20;194dtrace_optval_t dtrace_jstackframes_default = 50;195dtrace_optval_t dtrace_jstackstrsize_default = 512;196int dtrace_msgdsize_max = 128;197hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */198hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */199int dtrace_devdepth_max = 32;200int dtrace_err_verbose;201hrtime_t dtrace_deadman_interval = NANOSEC;202hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;203hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;204hrtime_t 
dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;205#ifndef illumos206int dtrace_memstr_max = 4096;207int dtrace_bufsize_max_frac = 128;208#endif209210/*211* DTrace External Variables212*213* As dtrace(7D) is a kernel module, any DTrace variables are obviously214* available to DTrace consumers via the backtick (`) syntax. One of these,215* dtrace_zero, is made deliberately so: it is provided as a source of216* well-known, zero-filled memory. While this variable is not documented,217* it is used by some translators as an implementation detail.218*/219const char dtrace_zero[256] = { 0 }; /* zero-filled memory */220221/*222* DTrace Internal Variables223*/224#ifdef illumos225static dev_info_t *dtrace_devi; /* device info */226#endif227#ifdef illumos228static vmem_t *dtrace_arena; /* probe ID arena */229static vmem_t *dtrace_minor; /* minor number arena */230#else231static taskq_t *dtrace_taskq; /* task queue */232static struct unrhdr *dtrace_arena; /* Probe ID number. */233#endif234static dtrace_probe_t **dtrace_probes; /* array of all probes */235static int dtrace_nprobes; /* number of probes */236static dtrace_provider_t *dtrace_provider; /* provider list */237static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */238static int dtrace_opens; /* number of opens */239static int dtrace_helpers; /* number of helpers */240static int dtrace_getf; /* number of unpriv getf()s */241#ifdef illumos242static void *dtrace_softstate; /* softstate pointer */243#endif244static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */245static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */246static dtrace_hash_t *dtrace_byname; /* probes hashed by name */247static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */248static int dtrace_toxranges; /* number of toxic ranges */249static int dtrace_toxranges_max; /* size of toxic range array */250static dtrace_anon_t dtrace_anon; /* anonymous enabling */251static kmem_cache_t 
*dtrace_state_cache; /* cache for dynamic state */252static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */253static kthread_t *dtrace_panicked; /* panicking thread */254static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */255static dtrace_genid_t dtrace_probegen; /* current probe generation */256static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */257static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */258static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */259static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */260static int dtrace_dynvar_failclean; /* dynvars failed to clean */261#ifndef illumos262static struct mtx dtrace_unr_mtx;263MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);264static eventhandler_tag dtrace_kld_load_tag;265static eventhandler_tag dtrace_kld_unload_try_tag;266#endif267268/*269* DTrace Locking270* DTrace is protected by three (relatively coarse-grained) locks:271*272* (1) dtrace_lock is required to manipulate essentially any DTrace state,273* including enabling state, probes, ECBs, consumer state, helper state,274* etc. Importantly, dtrace_lock is _not_ required when in probe context;275* probe context is lock-free -- synchronization is handled via the276* dtrace_sync() cross call mechanism.277*278* (2) dtrace_provider_lock is required when manipulating provider state, or279* when provider state must be held constant.280*281* (3) dtrace_meta_lock is required when manipulating meta provider state, or282* when meta provider state must be held constant.283*284* The lock ordering between these three locks is dtrace_meta_lock before285* dtrace_provider_lock before dtrace_lock. 
(In particular, there are286* several places where dtrace_provider_lock is held by the framework as it287* calls into the providers -- which then call back into the framework,288* grabbing dtrace_lock.)289*290* There are two other locks in the mix: mod_lock and cpu_lock. With respect291* to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical292* role as a coarse-grained lock; it is acquired before both of these locks.293* With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must294* be acquired _between_ dtrace_meta_lock and any other DTrace locks.295* mod_lock is similar with respect to dtrace_provider_lock in that it must be296* acquired _between_ dtrace_provider_lock and dtrace_lock.297*/298static kmutex_t dtrace_lock; /* probe state lock */299static kmutex_t dtrace_provider_lock; /* provider state lock */300static kmutex_t dtrace_meta_lock; /* meta-provider state lock */301302#ifndef illumos303/* XXX FreeBSD hacks. */304#define cr_suid cr_svuid305#define cr_sgid cr_svgid306#define ipaddr_t in_addr_t307#define mod_modname pathname308#define vuprintf vprintf309#ifndef crgetzoneid310#define crgetzoneid(_a) 0311#endif312#define ttoproc(_a) ((_a)->td_proc)313#define SNOCD 0314#define CPU_ON_INTR(_a) 0315316#define PRIV_EFFECTIVE (1 << 0)317#define PRIV_DTRACE_KERNEL (1 << 1)318#define PRIV_DTRACE_PROC (1 << 2)319#define PRIV_DTRACE_USER (1 << 3)320#define PRIV_PROC_OWNER (1 << 4)321#define PRIV_PROC_ZONE (1 << 5)322#define PRIV_ALL ~0323324SYSCTL_DECL(_debug_dtrace);325SYSCTL_DECL(_kern_dtrace);326#endif327328#ifdef illumos329#define curcpu CPU->cpu_id330#endif331332333/*334* DTrace Provider Variables335*336* These are the variables relating to DTrace as a provider (that is, the337* provider of the BEGIN, END, and ERROR probes).338*/339static dtrace_pattr_t dtrace_provider_attr = {340{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },341{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, 
DTRACE_CLASS_UNKNOWN },342{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },343{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },344{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },345};346347static void348dtrace_nullop(void)349{}350351static dtrace_pops_t dtrace_provider_ops = {352.dtps_provide = (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,353.dtps_provide_module = (void (*)(void *, modctl_t *))dtrace_nullop,354.dtps_enable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,355.dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,356.dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,357.dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,358.dtps_getargdesc = NULL,359.dtps_getargval = NULL,360.dtps_usermode = NULL,361.dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,362};363364static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */365static dtrace_id_t dtrace_probeid_end; /* special END probe */366dtrace_id_t dtrace_probeid_error; /* special ERROR probe */367368/*369* DTrace Helper Tracing Variables370*371* These variables should be set dynamically to enable helper tracing. The372* only variables that should be set are dtrace_helptrace_enable (which should373* be set to a non-zero value to allocate helper tracing buffers on the next374* open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a375* non-zero value to deallocate helper tracing buffers on the next close of376* /dev/dtrace). 
When (and only when) helper tracing is disabled, the377* buffer size may also be set via dtrace_helptrace_bufsize.378*/379int dtrace_helptrace_enable = 0;380int dtrace_helptrace_disable = 0;381int dtrace_helptrace_bufsize = 16 * 1024 * 1024;382uint32_t dtrace_helptrace_nlocals;383static dtrace_helptrace_t *dtrace_helptrace_buffer;384static uint32_t dtrace_helptrace_next = 0;385static int dtrace_helptrace_wrapped = 0;386387/*388* DTrace Error Hashing389*390* On DEBUG kernels, DTrace will track the errors that has seen in a hash391* table. This is very useful for checking coverage of tests that are392* expected to induce DIF or DOF processing errors, and may be useful for393* debugging problems in the DIF code generator or in DOF generation . The394* error hash may be examined with the ::dtrace_errhash MDB dcmd.395*/396#ifdef DEBUG397static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];398static const char *dtrace_errlast;399static kthread_t *dtrace_errthread;400static kmutex_t dtrace_errlock;401#endif402403/*404* DTrace Macros and Constants405*406* These are various macros that are useful in various spots in the407* implementation, along with a few random constants that have no meaning408* outside of the implementation. 
There is no real structure to this cpp409* mishmash -- but is there ever?410*/411#define DTRACE_HASHSTR(hash, probe) \412dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))413414#define DTRACE_HASHNEXT(hash, probe) \415(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)416417#define DTRACE_HASHPREV(hash, probe) \418(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)419420#define DTRACE_HASHEQ(hash, lhs, rhs) \421(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \422*((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)423424#define DTRACE_AGGHASHSIZE_SLEW 17425426#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)427428/*429* The key for a thread-local variable consists of the lower 61 bits of the430* t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.431* We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never432* equal to a variable identifier. This is necessary (but not sufficient) to433* assure that global associative arrays never collide with thread-local434* variables. To guarantee that they cannot collide, we must also define the435* order for keying dynamic variables. That order is:436*437* [ key0 ] ... 
[ keyn ] [ variable-key ] [ tls-key ]438*439* Because the variable-key and the tls-key are in orthogonal spaces, there is440* no way for a global variable key signature to match a thread-local key441* signature.442*/443#ifdef illumos444#define DTRACE_TLS_THRKEY(where) { \445uint_t intr = 0; \446uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \447for (; actv; actv >>= 1) \448intr++; \449ASSERT(intr < (1 << 3)); \450(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \451(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \452}453#else454#define DTRACE_TLS_THRKEY(where) { \455solaris_cpu_t *_c = &solaris_cpu[curcpu]; \456uint_t intr = 0; \457uint_t actv = _c->cpu_intr_actv; \458for (; actv; actv >>= 1) \459intr++; \460ASSERT(intr < (1 << 3)); \461(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \462(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \463}464#endif465466#define DT_BSWAP_8(x) ((x) & 0xff)467#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))468#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))469#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))470471#define DT_MASK_LO 0x00000000FFFFFFFFULL472473#define DTRACE_STORE(type, tomax, offset, what) \474*((type *)((uintptr_t)(tomax) + (size_t)offset)) = (type)(what);475476#if !defined(__x86) && !defined(__aarch64__)477#define DTRACE_ALIGNCHECK(addr, size, flags) \478if (addr & (size - 1)) { \479*flags |= CPU_DTRACE_BADALIGN; \480cpu_core[curcpu].cpuc_dtrace_illval = addr; \481return (0); \482}483#else484#define DTRACE_ALIGNCHECK(addr, size, flags)485#endif486487/*488* Test whether a range of memory starting at testaddr of size testsz falls489* within the range of memory described by addr, sz. We take care to avoid490* problems with overflow and underflow of the unsigned quantities, and491* disallow all negative sizes. 
Ranges of size 0 are allowed.492*/493#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \494((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \495(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \496(testaddr) + (testsz) >= (testaddr))497498#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \499do { \500if ((remp) != NULL) { \501*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \502} \503} while (0)504505506/*507* Test whether alloc_sz bytes will fit in the scratch region. We isolate508* alloc_sz on the righthand side of the comparison in order to avoid overflow509* or underflow in the comparison with it. This is simpler than the INRANGE510* check above, because we know that the dtms_scratch_ptr is valid in the511* range. Allocations of size zero are allowed.512*/513#define DTRACE_INSCRATCH(mstate, alloc_sz) \514((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \515(mstate)->dtms_scratch_ptr >= (alloc_sz))516517#define DTRACE_INSCRATCHPTR(mstate, ptr, howmany) \518((ptr) >= (mstate)->dtms_scratch_base && \519(ptr) <= \520((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - (howmany)))521522#define DTRACE_LOADFUNC(bits) \523/*CSTYLED*/ \524uint##bits##_t \525dtrace_load##bits(uintptr_t addr) \526{ \527size_t size = bits / NBBY; \528/*CSTYLED*/ \529uint##bits##_t rval; \530int i; \531volatile uint16_t *flags = (volatile uint16_t *) \532&cpu_core[curcpu].cpuc_dtrace_flags; \533\534DTRACE_ALIGNCHECK(addr, size, flags); \535\536for (i = 0; i < dtrace_toxranges; i++) { \537if (addr >= dtrace_toxrange[i].dtt_limit) \538continue; \539\540if (addr + size <= dtrace_toxrange[i].dtt_base) \541continue; \542\543/* \544* This address falls within a toxic region; return 0. 
\545*/ \546*flags |= CPU_DTRACE_BADADDR; \547cpu_core[curcpu].cpuc_dtrace_illval = addr; \548return (0); \549} \550\551__compiler_membar(); \552*flags |= CPU_DTRACE_NOFAULT; \553/*CSTYLED*/ \554rval = *((volatile uint##bits##_t *)addr); \555*flags &= ~CPU_DTRACE_NOFAULT; \556__compiler_membar(); \557\558return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \559}560561#ifdef _LP64562#define dtrace_loadptr dtrace_load64563#else564#define dtrace_loadptr dtrace_load32565#endif566567#define DTRACE_DYNHASH_FREE 0568#define DTRACE_DYNHASH_SINK 1569#define DTRACE_DYNHASH_VALID 2570571#define DTRACE_MATCH_NEXT 0572#define DTRACE_MATCH_DONE 1573#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')574#define DTRACE_STATE_ALIGN 64575576#define DTRACE_FLAGS2FLT(flags) \577(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \578((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \579((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \580((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \581((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \582((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \583((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \584((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \585((flags) & CPU_DTRACE_BADSTACK) ? 
DTRACEFLT_BADSTACK : \586DTRACEFLT_UNKNOWN)587588#define DTRACEACT_ISSTRING(act) \589((act)->dta_kind == DTRACEACT_DIFEXPR && \590(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)591592/* Function prototype definitions: */593static size_t dtrace_strlen(const char *, size_t);594static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);595static void dtrace_enabling_provide(dtrace_provider_t *);596static int dtrace_enabling_match(dtrace_enabling_t *, int *);597static void dtrace_enabling_matchall(void);598static void dtrace_enabling_matchall_task(void *);599static void dtrace_enabling_reap(void *);600static dtrace_state_t *dtrace_anon_grab(void);601static uint64_t dtrace_helper(int, dtrace_mstate_t *,602dtrace_state_t *, uint64_t, uint64_t);603static dtrace_helpers_t *dtrace_helpers_create(proc_t *);604static void dtrace_buffer_drop(dtrace_buffer_t *);605static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);606static ssize_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,607dtrace_state_t *, dtrace_mstate_t *);608static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,609dtrace_optval_t);610static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);611static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);612uint16_t dtrace_load16(uintptr_t);613uint32_t dtrace_load32(uintptr_t);614uint64_t dtrace_load64(uintptr_t);615uint8_t dtrace_load8(uintptr_t);616void dtrace_dynvar_clean(dtrace_dstate_t *);617dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,618size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);619uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);620static int dtrace_priv_proc(dtrace_state_t *);621static void dtrace_getf_barrier(void);622static int dtrace_canload_remains(uint64_t, size_t, size_t *,623dtrace_mstate_t *, dtrace_vstate_t *);624static int dtrace_canstore_remains(uint64_t, size_t, size_t *,625dtrace_mstate_t *, dtrace_vstate_t 
*);626627/*628* DTrace Probe Context Functions629*630* These functions are called from probe context. Because probe context is631* any context in which C may be called, arbitrarily locks may be held,632* interrupts may be disabled, we may be in arbitrary dispatched state, etc.633* As a result, functions called from probe context may only call other DTrace634* support functions -- they may not interact at all with the system at large.635* (Note that the ASSERT macro is made probe-context safe by redefining it in636* terms of dtrace_assfail(), a probe-context safe function.) If arbitrary637* loads are to be performed from probe context, they _must_ be in terms of638* the safe dtrace_load*() variants.639*640* Some functions in this block are not actually called from probe context;641* for these functions, there will be a comment above the function reading642* "Note: not called from probe context."643*/644void645dtrace_panic(const char *format, ...)646{647va_list alist;648649va_start(alist, format);650#ifdef __FreeBSD__651vpanic(format, alist);652#else653dtrace_vpanic(format, alist);654#endif655va_end(alist);656}657658int659dtrace_assfail(const char *a, const char *f, int l)660{661dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);662663/*664* We just need something here that even the most clever compiler665* cannot optimize away.666*/667return (a[(uintptr_t)f]);668}669670/*671* Atomically increment a specified error counter from probe context.672*/673static void674dtrace_error(uint32_t *counter)675{676/*677* Most counters stored to in probe context are per-CPU counters.678* However, there are some error conditions that are sufficiently679* arcane that they don't merit per-CPU storage. 
If these counters680* are incremented concurrently on different CPUs, scalability will be681* adversely affected -- but we don't expect them to be white-hot in a682* correctly constructed enabling...683*/684uint32_t oval, nval;685686do {687oval = *counter;688689if ((nval = oval + 1) == 0) {690/*691* If the counter would wrap, set it to 1 -- assuring692* that the counter is never zero when we have seen693* errors. (The counter must be 32-bits because we694* aren't guaranteed a 64-bit compare&swap operation.)695* To save this code both the infamy of being fingered696* by a priggish news story and the indignity of being697* the target of a neo-puritan witch trial, we're698* carefully avoiding any colorful description of the699* likelihood of this condition -- but suffice it to700* say that it is only slightly more likely than the701* overflow of predicate cache IDs, as discussed in702* dtrace_predicate_create().703*/704nval = 1;705}706} while (dtrace_cas32(counter, oval, nval) != oval);707}708709void710dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg)711{712cpuset_t cpus;713714if (cpu == DTRACE_CPUALL)715cpus = all_cpus;716else717CPU_SETOF(cpu, &cpus);718719smp_rendezvous_cpus(cpus, smp_no_rendezvous_barrier, func,720smp_no_rendezvous_barrier, arg);721}722723static void724dtrace_sync_func(void)725{726}727728void729dtrace_sync(void)730{731dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);732}733734/*735* Use the DTRACE_LOADFUNC macro to define functions for each of loading a736* uint8_t, a uint16_t, a uint32_t and a uint64_t.737*/738/* BEGIN CSTYLED */739DTRACE_LOADFUNC(8)740DTRACE_LOADFUNC(16)741DTRACE_LOADFUNC(32)742DTRACE_LOADFUNC(64)743/* END CSTYLED */744745static int746dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)747{748if (dest < mstate->dtms_scratch_base)749return (0);750751if (dest + size < dest)752return (0);753754if (dest + size > mstate->dtms_scratch_ptr)755return (0);756757return (1);758}759760static 
int761dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,762dtrace_statvar_t **svars, int nsvars)763{764int i;765size_t maxglobalsize, maxlocalsize;766767if (nsvars == 0)768return (0);769770maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);771maxlocalsize = maxglobalsize * (mp_maxid + 1);772773for (i = 0; i < nsvars; i++) {774dtrace_statvar_t *svar = svars[i];775uint8_t scope;776size_t size;777778if (svar == NULL || (size = svar->dtsv_size) == 0)779continue;780781scope = svar->dtsv_var.dtdv_scope;782783/*784* We verify that our size is valid in the spirit of providing785* defense in depth: we want to prevent attackers from using786* DTrace to escalate an orthogonal kernel heap corruption bug787* into the ability to store to arbitrary locations in memory.788*/789VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||790(scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));791792if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,793svar->dtsv_size)) {794DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,795svar->dtsv_size);796return (1);797}798}799800return (0);801}802803/*804* Check to see if the address is within a memory region to which a store may805* be issued. This includes the DTrace scratch areas, and any DTrace variable806* region. 
The caller of dtrace_canstore() is responsible for performing any807* alignment checks that are needed before stores are actually executed.808*/809static int810dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,811dtrace_vstate_t *vstate)812{813return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));814}815816/*817* Implementation of dtrace_canstore which communicates the upper bound of the818* allowed memory region.819*/820static int821dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,822dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)823{824/*825* First, check to see if the address is in scratch space...826*/827if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,828mstate->dtms_scratch_size)) {829DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,830mstate->dtms_scratch_size);831return (1);832}833834/*835* Now check to see if it's a dynamic variable. This check will pick836* up both thread-local variables and any global dynamically-allocated837* variables.838*/839if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,840vstate->dtvs_dynvars.dtds_size)) {841dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;842uintptr_t base = (uintptr_t)dstate->dtds_base +843(dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));844uintptr_t chunkoffs;845dtrace_dynvar_t *dvar;846847/*848* Before we assume that we can store here, we need to make849* sure that it isn't in our metadata -- storing to our850* dynamic variable metadata would corrupt our state. 
For851* the range to not include any dynamic variable metadata,852* it must:853*854* (1) Start above the hash table that is at the base of855* the dynamic variable space856*857* (2) Have a starting chunk offset that is beyond the858* dtrace_dynvar_t that is at the base of every chunk859*860* (3) Not span a chunk boundary861*862* (4) Not be in the tuple space of a dynamic variable863*864*/865if (addr < base)866return (0);867868chunkoffs = (addr - base) % dstate->dtds_chunksize;869870if (chunkoffs < sizeof (dtrace_dynvar_t))871return (0);872873if (chunkoffs + sz > dstate->dtds_chunksize)874return (0);875876dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);877878if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)879return (0);880881if (chunkoffs < sizeof (dtrace_dynvar_t) +882((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))883return (0);884885DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);886return (1);887}888889/*890* Finally, check the static local and global variables. 
These checks
 * take the longest, so we perform them last.
 */
	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	/* Delegate to the full check, discarding the remaining-range output. */
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region via 'remain' (may be NULL if the caller is not
 * interested in the remaining valid length).
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
	file_t *fp;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen)) {
		DTRACE_RANGE_REMAIN(remain, addr,
		    mstate->dtms_difo->dtdo_strtab,
		    mstate->dtms_difo->dtdo_strlen);
		return (1);
	}

	if (vstate->dtvs_state != NULL &&
	    dtrace_priv_proc(vstate->dtvs_state)) {
		proc_t *p;

		/*
		 * When we have privileges to the current process, there are
		 * several context-related kernel structures that are safe to
		 * read, even absent the privilege to read from kernel memory.
		 * These reads are safe because these structures contain only
		 * state that (1) we're permitted to read, (2) is harmless or
		 * (3) contains pointers to additional kernel state that we're
		 * not permitted to read (and as such, do not present an
		 * opportunity for privilege escalation).  Finally (and
		 * critically), because of the nature of their relation with
		 * the current thread context, the memory associated with
		 * these structures cannot change over the duration of probe
		 * context, and it is therefore impossible for this memory to
		 * be deallocated and reallocated as something else while it's
		 * being operated upon.
		 */
		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread,
			    sizeof (kthread_t));
			return (1);
		}

		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
		    sz, curthread->t_procp, sizeof (proc_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
			    sizeof (proc_t));
			return (1);
		}

		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cred, sizeof (cred_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
			    sizeof (cred_t));
			return (1);
		}

#ifdef illumos
		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
			    sizeof (pid_t));
			return (1);
		}

		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
			    offsetof(cpu_t, cpu_pause_thread));
			return (1);
		}
#endif
	}

	if ((fp = mstate->dtms_getf) != NULL) {
		uintptr_t psz = sizeof (void *);
		vnode_t *vp;
		vnodeops_t *op;

		/*
		 * When getf() returns a file_t, the enabling is implicitly
		 * granted the (transient) right to read the returned file_t
		 * as well as the v_path and v_op->vnop_name of the underlying
		 * vnode.  These accesses are allowed after a successful
		 * getf() because the members that they refer to cannot change
		 * once set -- and the barrier logic in the kernel's closef()
		 * path assures that the file_t and its referenced vnode_t
		 * cannot themselves be stale (that is, it is impossible for
		 * either dtms_getf itself or its f_vnode member to reference
		 * freed memory).
		 */
		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
			return (1);
		}

		if ((vp = fp->f_vnode) != NULL) {
			size_t slen;
#ifdef illumos
			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
				    psz);
				return (1);
			}
			slen = strlen(vp->v_path) + 1;
			if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
				DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
				    slen);
				return (1);
			}
#endif

			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
				    psz);
				return (1);
			}

#ifdef illumos
			if ((op = vp->v_op) != NULL &&
			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
				DTRACE_RANGE_REMAIN(remain, addr,
				    &op->vnop_name, psz);
				return (1);
			}

			if (op != NULL && op->vnop_name != NULL &&
			    DTRACE_INRANGE(addr, sz, op->vnop_name,
			    (slen = strlen(op->vnop_name) + 1))) {
				DTRACE_RANGE_REMAIN(remain, addr,
				    op->vnop_name, slen);
				return (1);
			}
#endif
		}
	}

	/* Access denied: flag the privilege fault and record the address. */
	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that
the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t rsize;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * Even if the caller is uninterested in querying the remaining valid
	 * range, it is required to ensure that the access is allowed.
	 */
	if (remain == NULL) {
		remain = &rsize;
	}
	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
		size_t strsz;
		/*
		 * Perform the strlen after determining the length of the
		 * memory region which is accessible.  This prevents timing
		 * information from being used to find NULs in memory which is
		 * not accessible to the caller.
		 */
		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
		    MIN(sz, *remain));
		if (strsz <= *remain) {
			return (1);
		}
	}

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * Calculate the max size before performing any checks since even
	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
	 * return the max length via 'remain'.
	 */
	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_state_t *state = vstate->dtvs_state;

		if (state != NULL) {
			sz = state->dts_options[DTRACEOPT_STRSIZE];
		} else {
			/*
			 * In helper context, we have a NULL state; fall back
			 * to using the system-wide default for the string size
			 * in this case.
			 */
			sz = dtrace_strsize_default;
		}
	} else {
		sz = type->dtdt_size;
	}

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
		return (1);
	}

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
		    vstate));
	}
	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
	    vstate));
}

/*
 * Convert a string to a signed integer using safe loads.
 *
 * NOTE: This function uses various macros from strtolctype.h to manipulate
 * digit values, etc -- these have all been checked to ensure they make
 * no additional function calls.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;

	/*
	 * Consume any whitespace preceding digits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	/* Apply the sign noted earlier, if any. */
	return (neg ?
-val : val);
}

/*
 * Compare two strings using safe loads.  A NULL string argument compares as
 * an empty string; comparison stops at the limit, a NUL byte, or a pending
 * CPU fault flag.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	/* Identical pointers or a zero limit trivially compare equal. */
	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		/*
		 * kaddr lies inside [taddr, taddr + tsize); the unsigned
		 * subtraction wraps for kaddr < taddr, so one compare covers
		 * both bounds.
		 */
		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		/* The toxic range begins inside [kaddr, kaddr + size). */
		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[curcpu].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be
 * unsafe memory specified by the DIF program.  The dst is assumed to be safe
 * memory that we can store to directly because it is managed by DTrace.
As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			/* dst precedes src: a forward copy is overlap-safe. */
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			/* dst follows src: copy backwards to handle overlap. */
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		/* The NUL byte, if found within len, is copied as well. */
		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
	} else {
		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.
The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 * Returns 0 if the regions are equal, non-zero otherwise.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		/* Bail out early if a fault is raised by the safe load. */
		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

/*
 * 128-bit addition: sum = addend1 + addend2.  Each operand is a pair of
 * uint64_t values with the low word at index 0.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	/* Carry into the high word if the low-word addition wrapped. */
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b.
If b is positive, shift left.1435* If b is negative, shift right.1436*/1437static void1438dtrace_shift_128(uint64_t *a, int b)1439{1440uint64_t mask;14411442if (b == 0)1443return;14441445if (b < 0) {1446b = -b;1447if (b >= 64) {1448a[0] = a[1] >> (b - 64);1449a[1] = 0;1450} else {1451a[0] >>= b;1452mask = 1LL << (64 - b);1453mask -= 1;1454a[0] |= ((a[1] & mask) << (64 - b));1455a[1] >>= b;1456}1457} else {1458if (b >= 64) {1459a[1] = a[0] << (b - 64);1460a[0] = 0;1461} else {1462a[1] <<= b;1463mask = a[0] >> (64 - b);1464a[1] |= mask;1465a[0] <<= b;1466}1467}1468}14691470/*1471* The basic idea is to break the 2 64-bit values into 4 32-bit values,1472* use native multiplication on those, and then re-combine into the1473* resulting 128-bit value.1474*1475* (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =1476* hi1 * hi2 << 64 +1477* hi1 * lo2 << 32 +1478* hi2 * lo1 << 32 +1479* lo1 * lo21480*/1481static void1482dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)1483{1484uint64_t hi1, hi2, lo1, lo2;1485uint64_t tmp[2];14861487hi1 = factor1 >> 32;1488hi2 = factor2 >> 32;14891490lo1 = factor1 & DT_MASK_LO;1491lo2 = factor2 & DT_MASK_LO;14921493product[0] = lo1 * lo2;1494product[1] = hi1 * hi2;14951496tmp[0] = hi1 * lo2;1497tmp[1] = 0;1498dtrace_shift_128(tmp, 32);1499dtrace_add_128(product, tmp, product);15001501tmp[0] = hi2 * lo1;1502tmp[1] = 0;1503dtrace_shift_128(tmp, 32);1504dtrace_add_128(product, tmp, product);1505}15061507/*1508* This privilege check should be used by actions and subroutines to1509* verify that the user credentials of the process that enabled the1510* invoking ECB match the target credentials1511*/1512static int1513dtrace_priv_proc_common_user(dtrace_state_t *state)1514{1515cred_t *cr, *s_cr = state->dts_cred.dcr_cred;15161517/*1518* We should always have a non-NULL state cred here, since if cred1519* is null (anonymous tracing), we fast-path bypass this routine.1520*/1521ASSERT(s_cr != NULL);15221523if ((cr = CRED()) != NULL 
 &&
	    s_cr->cr_uid == cr->cr_uid &&
	    s_cr->cr_uid == cr->cr_ruid &&
	    s_cr->cr_uid == cr->cr_suid &&
	    s_cr->cr_gid == cr->cr_gid &&
	    s_cr->cr_gid == cr->cr_rgid &&
	    s_cr->cr_gid == cr->cr_sgid)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
#ifdef illumos
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
		return (1);

	return (0);
#else
	/* No zone support outside of illumos; the check trivially passes. */
	return (1);
#endif
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
	proc_t *proc;

	if ((proc = ttoproc(curthread)) != NULL &&
	    !(proc->p_flag & SNOCD))
		return (1);

	return (0);
}

/*
 * Check whether destructive process-directed actions are permitted for this
 * state: each of the zone, user and no-cred-change checks applies unless the
 * corresponding DTRACE_CRA_PROC_DESTRUCTIVE_* bit waives it.  On failure the
 * user-privilege fault flag is raised on the current CPU.
 */
static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
	int action = state->dts_cred.dcr_action;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
	    dtrace_priv_proc_common_zone(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
	    dtrace_priv_proc_common_user(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
	    dtrace_priv_proc_common_nocd() == 0)
		goto bad;

	return (1);

bad:
	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

/*
 * Check for the DTRACE_CRA_PROC_CONTROL privilege, falling back to the
 * zone/user/no-cred-change checks; raises the user-privilege fault flag on
 * the current CPU if all checks fail.
 */
static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
		return
 (1);

	if (dtrace_priv_proc_common_zone(state) &&
	    dtrace_priv_proc_common_user(state) &&
	    dtrace_priv_proc_common_nocd())
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

/*
 * Check for the DTRACE_CRA_PROC privilege, raising the user-privilege fault
 * flag on the current CPU if it is absent.
 */
static int
dtrace_priv_proc(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

/*
 * Check for the DTRACE_CRA_KERNEL privilege, raising the kernel-privilege
 * fault flag on the current CPU if it is absent.
 */
static int
dtrace_priv_kernel(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

/*
 * Check for the DTRACE_CRA_KERNEL_DESTRUCTIVE privilege, raising the
 * kernel-privilege fault flag on the current CPU if it is absent.
 */
static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
		return (1);

	cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

/*
 * Determine if the dte_cond of the specified ECB allows for processing of
 * the current probe to continue.
Note that this routine may allow continued
 * processing, but with access(es) stripped from the mstate's dtms_access
 * field.
 */
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
    dtrace_ecb_t *ecb)
{
	dtrace_probe_t *probe = ecb->dte_probe;
	dtrace_provider_t *prov = probe->dtpr_provider;
	dtrace_pops_t *pops = &prov->dtpv_pops;
	int mode = DTRACE_MODE_NOPRIV_DROP;

	ASSERT(ecb->dte_cond);

#ifdef illumos
	if (pops->dtps_mode != NULL) {
		mode = pops->dtps_mode(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);

		ASSERT((mode & DTRACE_MODE_USER) ||
		    (mode & DTRACE_MODE_KERNEL));
		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
		    (mode & DTRACE_MODE_NOPRIV_DROP));
	}

	/*
	 * If the dte_cond bits indicate that this consumer is only allowed to
	 * see user-mode firings of this probe, call the provider's dtps_mode()
	 * entry point to check that the probe was fired while in a user
	 * context.  If that's not the case, use the policy specified by the
	 * provider to determine if we drop the probe or merely restrict
	 * operation.
	 */
	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);

		if (!(mode & DTRACE_MODE_USER)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
		}
	}
#endif

	/*
	 * This is more subtle than it looks.  We have to be absolutely
	 * certain that CRED() isn't going to change out from under us so
	 * it's only legit to examine that structure if we're in constrained
	 * situations.  Currently, the only times we'll do this check is if a
	 * non-super-user has enabled the profile or syscall providers --
	 * providers that allow visibility of all processes.  For the profile
	 * case, the check above will ensure that we're examining a user
	 * context.
	 */
	if (ecb->dte_cond & DTRACE_COND_OWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;
		proc_t *proc;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_uid != cr->cr_uid ||
		    s_cr->cr_uid != cr->cr_ruid ||
		    s_cr->cr_uid != cr->cr_suid ||
		    s_cr->cr_gid != cr->cr_gid ||
		    s_cr->cr_gid != cr->cr_rgid ||
		    s_cr->cr_gid != cr->cr_sgid ||
		    (proc = ttoproc(curthread)) == NULL ||
		    (proc->p_flag & SNOCD)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

#ifdef illumos
			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
#endif
		}
	}

#ifdef illumos
	/*
	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
	 * in our zone, check to see if our mode policy is to restrict rather
	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
	 * and DTRACE_ACCESS_ARGS
	 */
	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &=
			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
		}
	}
#endif

	return (1);
}

/*
 * Note: not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.
Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	dtrace_dynvar_t **rinsep;
	int i, j, work = 0;

	CPU_FOREACH(i) {
		dcpu = &dstate->dtds_percpu[i];
		rinsep = &dcpu->dtdsc_rinsing;

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		if (dcpu->dtdsc_rinsing != NULL) {
			/*
			 * If the rinsing list is non-NULL, then it is because
			 * this CPU was selected to accept another CPU's
			 * dirty list -- and since that time, dirty buffers
			 * have accumulated.  This is a highly unlikely
			 * condition, but we choose to ignore the dirty
			 * buffers -- they'll be picked up in a future
			 * cleanse.
			 */
			continue;
		}

		if (dcpu->dtdsc_clean != NULL) {
			/*
			 * If the clean list is non-NULL, then we're in a
			 * situation where a CPU has done deallocations (we
			 * have a non-NULL dirty list) but no allocations (we
			 * also have a non-NULL clean list).  We can't simply
			 * move the dirty list into the clean list on this
			 * CPU, yet we also don't want to allow this condition
			 * to persist, lest a short clean list prevent a
			 * massive dirty list from being cleaned (which in
			 * turn could lead to otherwise avoidable dynamic
			 * drops).  To deal with this, we look for some CPU
			 * with a NULL clean list, NULL dirty list, and NULL
			 * rinsing list -- and then we borrow this CPU to
			 * rinse our dirty list.
			 */
			CPU_FOREACH(j) {
				dtrace_dstate_percpu_t *rinser;

				rinser = &dstate->dtds_percpu[j];

				if (rinser->dtdsc_rinsing != NULL)
					continue;

				if (rinser->dtdsc_dirty != NULL)
					continue;

				if (rinser->dtdsc_clean != NULL)
					continue;

				rinsep = &rinser->dtdsc_rinsing;
				break;
			}

			if (j > mp_maxid) {
				/*
				 * We were unable to find another CPU that
				 * could accept this dirty list -- we are
				 * therefore unable to clean it now.
				 */
				dtrace_dynvar_failclean++;
				continue;
			}
		}

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar():  if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			*rinsep = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	/* Wait for any probe in flight to stop referencing rinsing chunks. */
	dtrace_sync();

	CPU_FOREACH(i) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}

/*
 * Depending on the value of the op parameter, this function looks-up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated.  If NULL is returned, the appropriate counter
 * will be incremented.
 */
dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	uint64_t hashval = DTRACE_DYNHASH_VALID;
	dtrace_dynhash_t *hash = dstate->dtds_hash;
	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
	processorid_t me = curcpu, cpu = me;
	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
	size_t bucket, ksize;
	size_t chunksize = dstate->dtds_chunksize;
	uintptr_t kdata, lock, nstate;
	uint_t i;

	ASSERT(nkeys != 0);

	/*
	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
	 * algorithm.  For the by-value portions, we perform the algorithm in
	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
	 * bit, and seems to have only a minute effect on distribution.  For
	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
	 * over each referenced byte.  It's painful to do this, but it's much
	 * better than pathological hash distribution.  The efficacy of the
The efficacy of the1931* hashing algorithm (and a comparison with other algorithms) may be1932* found by running the ::dtrace_dynstat MDB dcmd.1933*/1934for (i = 0; i < nkeys; i++) {1935if (key[i].dttk_size == 0) {1936uint64_t val = key[i].dttk_value;19371938hashval += (val >> 48) & 0xffff;1939hashval += (hashval << 10);1940hashval ^= (hashval >> 6);19411942hashval += (val >> 32) & 0xffff;1943hashval += (hashval << 10);1944hashval ^= (hashval >> 6);19451946hashval += (val >> 16) & 0xffff;1947hashval += (hashval << 10);1948hashval ^= (hashval >> 6);19491950hashval += val & 0xffff;1951hashval += (hashval << 10);1952hashval ^= (hashval >> 6);1953} else {1954/*1955* This is incredibly painful, but it beats the hell1956* out of the alternative.1957*/1958uint64_t j, size = key[i].dttk_size;1959uintptr_t base = (uintptr_t)key[i].dttk_value;19601961if (!dtrace_canload(base, size, mstate, vstate))1962break;19631964for (j = 0; j < size; j++) {1965hashval += dtrace_load8(base + j);1966hashval += (hashval << 10);1967hashval ^= (hashval >> 6);1968}1969}1970}19711972if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))1973return (NULL);19741975hashval += (hashval << 3);1976hashval ^= (hashval >> 11);1977hashval += (hashval << 15);19781979/*1980* There is a remote chance (ideally, 1 in 2^31) that our hashval1981* comes out to be one of our two sentinel hash values. If this1982* actually happens, we set the hashval to be a value known to be a1983* non-sentinel value.1984*/1985if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)1986hashval = DTRACE_DYNHASH_VALID;19871988/*1989* Yes, it's painful to do a divide here. If the cycle count becomes1990* important here, tricks can be pulled to reduce it. (However, it's1991* critical that hash collisions be kept to an absolute minimum;1992* they're much more painful than a divide.) 
It's better to have a1993* solution that generates few collisions and still keeps things1994* relatively simple.1995*/1996bucket = hashval % dstate->dtds_hashsize;19971998if (op == DTRACE_DYNVAR_DEALLOC) {1999volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;20002001for (;;) {2002while ((lock = *lockp) & 1)2003continue;20042005if (dtrace_casptr((volatile void *)lockp,2006(volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)2007break;2008}20092010dtrace_membar_producer();2011}20122013top:2014prev = NULL;2015lock = hash[bucket].dtdh_lock;20162017dtrace_membar_consumer();20182019start = hash[bucket].dtdh_chain;2020ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||2021start->dtdv_hashval != DTRACE_DYNHASH_FREE ||2022op != DTRACE_DYNVAR_DEALLOC));20232024for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {2025dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;2026dtrace_key_t *dkey = &dtuple->dtt_key[0];20272028if (dvar->dtdv_hashval != hashval) {2029if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {2030/*2031* We've reached the sink, and therefore the2032* end of the hash chain; we can kick out of2033* the loop knowing that we have seen a valid2034* snapshot of state.2035*/2036ASSERT(dvar->dtdv_next == NULL);2037ASSERT(dvar == &dtrace_dynhash_sink);2038break;2039}20402041if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {2042/*2043* We've gone off the rails: somewhere along2044* the line, one of the members of this hash2045* chain was deleted. Note that we could also2046* detect this by simply letting this loop run2047* to completion, as we would eventually hit2048* the end of the dirty list. However, we2049* want to avoid running the length of the2050* dirty list unnecessarily (it might be quite2051* long), so we catch this as early as2052* possible by detecting the hash marker. 
In2053* this case, we simply set dvar to NULL and2054* break; the conditional after the loop will2055* send us back to top.2056*/2057dvar = NULL;2058break;2059}20602061goto next;2062}20632064if (dtuple->dtt_nkeys != nkeys)2065goto next;20662067for (i = 0; i < nkeys; i++, dkey++) {2068if (dkey->dttk_size != key[i].dttk_size)2069goto next; /* size or type mismatch */20702071if (dkey->dttk_size != 0) {2072if (dtrace_bcmp(2073(void *)(uintptr_t)key[i].dttk_value,2074(void *)(uintptr_t)dkey->dttk_value,2075dkey->dttk_size))2076goto next;2077} else {2078if (dkey->dttk_value != key[i].dttk_value)2079goto next;2080}2081}20822083if (op != DTRACE_DYNVAR_DEALLOC)2084return (dvar);20852086ASSERT(dvar->dtdv_next == NULL ||2087dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);20882089if (prev != NULL) {2090ASSERT(hash[bucket].dtdh_chain != dvar);2091ASSERT(start != dvar);2092ASSERT(prev->dtdv_next == dvar);2093prev->dtdv_next = dvar->dtdv_next;2094} else {2095if (dtrace_casptr(&hash[bucket].dtdh_chain,2096start, dvar->dtdv_next) != start) {2097/*2098* We have failed to atomically swing the2099* hash table head pointer, presumably because2100* of a conflicting allocation on another CPU.2101* We need to reread the hash chain and try2102* again.2103*/2104goto top;2105}2106}21072108dtrace_membar_producer();21092110/*2111* Now set the hash value to indicate that it's free.2112*/2113ASSERT(hash[bucket].dtdh_chain != dvar);2114dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;21152116dtrace_membar_producer();21172118/*2119* Set the next pointer to point at the dirty list, and2120* atomically swing the dirty pointer to the newly freed dvar.2121*/2122do {2123next = dcpu->dtdsc_dirty;2124dvar->dtdv_next = next;2125} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);21262127/*2128* Finally, unlock this hash bucket.2129*/2130ASSERT(hash[bucket].dtdh_lock == lock);2131ASSERT(lock & 1);2132hash[bucket].dtdh_lock++;21332134return (NULL);2135next:2136prev = 
dvar;2137continue;2138}21392140if (dvar == NULL) {2141/*2142* If dvar is NULL, it is because we went off the rails:2143* one of the elements that we traversed in the hash chain2144* was deleted while we were traversing it. In this case,2145* we assert that we aren't doing a dealloc (deallocs lock2146* the hash bucket to prevent themselves from racing with2147* one another), and retry the hash chain traversal.2148*/2149ASSERT(op != DTRACE_DYNVAR_DEALLOC);2150goto top;2151}21522153if (op != DTRACE_DYNVAR_ALLOC) {2154/*2155* If we are not to allocate a new variable, we want to2156* return NULL now. Before we return, check that the value2157* of the lock word hasn't changed. If it has, we may have2158* seen an inconsistent snapshot.2159*/2160if (op == DTRACE_DYNVAR_NOALLOC) {2161if (hash[bucket].dtdh_lock != lock)2162goto top;2163} else {2164ASSERT(op == DTRACE_DYNVAR_DEALLOC);2165ASSERT(hash[bucket].dtdh_lock == lock);2166ASSERT(lock & 1);2167hash[bucket].dtdh_lock++;2168}21692170return (NULL);2171}21722173/*2174* We need to allocate a new dynamic variable. The size we need is the2175* size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the2176* size of any auxiliary key data (rounded up to 8-byte alignment) plus2177* the size of any referred-to data (dsize). We then round the final2178* size up to the chunksize for allocation.2179*/2180for (ksize = 0, i = 0; i < nkeys; i++)2181ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));21822183/*2184* This should be pretty much impossible, but could happen if, say,2185* strange DIF specified the tuple. Ideally, this should be an2186* assertion and not an error condition -- but that requires that the2187* chunksize calculation in dtrace_difo_chunksize() be absolutely2188* bullet-proof. (That is, it must not be able to be fooled by2189* malicious DIF.) 
Given the lack of backwards branches in DIF,2190* solving this would presumably not amount to solving the Halting2191* Problem -- but it still seems awfully hard.2192*/2193if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +2194ksize + dsize > chunksize) {2195dcpu->dtdsc_drops++;2196return (NULL);2197}21982199nstate = DTRACE_DSTATE_EMPTY;22002201do {2202retry:2203free = dcpu->dtdsc_free;22042205if (free == NULL) {2206dtrace_dynvar_t *clean = dcpu->dtdsc_clean;2207void *rval;22082209if (clean == NULL) {2210/*2211* We're out of dynamic variable space on2212* this CPU. Unless we have tried all CPUs,2213* we'll try to allocate from a different2214* CPU.2215*/2216switch (dstate->dtds_state) {2217case DTRACE_DSTATE_CLEAN: {2218void *sp = &dstate->dtds_state;22192220if (++cpu > mp_maxid)2221cpu = 0;22222223if (dcpu->dtdsc_dirty != NULL &&2224nstate == DTRACE_DSTATE_EMPTY)2225nstate = DTRACE_DSTATE_DIRTY;22262227if (dcpu->dtdsc_rinsing != NULL)2228nstate = DTRACE_DSTATE_RINSING;22292230dcpu = &dstate->dtds_percpu[cpu];22312232if (cpu != me)2233goto retry;22342235(void) dtrace_cas32(sp,2236DTRACE_DSTATE_CLEAN, nstate);22372238/*2239* To increment the correct bean2240* counter, take another lap.2241*/2242goto retry;2243}22442245case DTRACE_DSTATE_DIRTY:2246dcpu->dtdsc_dirty_drops++;2247break;22482249case DTRACE_DSTATE_RINSING:2250dcpu->dtdsc_rinsing_drops++;2251break;22522253case DTRACE_DSTATE_EMPTY:2254dcpu->dtdsc_drops++;2255break;2256}22572258DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);2259return (NULL);2260}22612262/*2263* The clean list appears to be non-empty. 
We want to2264* move the clean list to the free list; we start by2265* moving the clean pointer aside.2266*/2267if (dtrace_casptr(&dcpu->dtdsc_clean,2268clean, NULL) != clean) {2269/*2270* We are in one of two situations:2271*2272* (a) The clean list was switched to the2273* free list by another CPU.2274*2275* (b) The clean list was added to by the2276* cleansing cyclic.2277*2278* In either of these situations, we can2279* just reattempt the free list allocation.2280*/2281goto retry;2282}22832284ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);22852286/*2287* Now we'll move the clean list to our free list.2288* It's impossible for this to fail: the only way2289* the free list can be updated is through this2290* code path, and only one CPU can own the clean list.2291* Thus, it would only be possible for this to fail if2292* this code were racing with dtrace_dynvar_clean().2293* (That is, if dtrace_dynvar_clean() updated the clean2294* list, and we ended up racing to update the free2295* list.) This race is prevented by the dtrace_sync()2296* in dtrace_dynvar_clean() -- which flushes the2297* owners of the clean lists out before resetting2298* the clean lists.2299*/2300dcpu = &dstate->dtds_percpu[me];2301rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);2302ASSERT(rval == NULL);2303goto retry;2304}23052306dvar = free;2307new_free = dvar->dtdv_next;2308} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);23092310/*2311* We have now allocated a new chunk. We copy the tuple keys into the2312* tuple array and copy any referenced key data into the data space2313* following the tuple array. 
As we do this, we relocate dttk_value2314* in the final tuple to point to the key data address in the chunk.2315*/2316kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];2317dvar->dtdv_data = (void *)(kdata + ksize);2318dvar->dtdv_tuple.dtt_nkeys = nkeys;23192320for (i = 0; i < nkeys; i++) {2321dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];2322size_t kesize = key[i].dttk_size;23232324if (kesize != 0) {2325dtrace_bcopy(2326(const void *)(uintptr_t)key[i].dttk_value,2327(void *)kdata, kesize);2328dkey->dttk_value = kdata;2329kdata += P2ROUNDUP(kesize, sizeof (uint64_t));2330} else {2331dkey->dttk_value = key[i].dttk_value;2332}23332334dkey->dttk_size = kesize;2335}23362337ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);2338dvar->dtdv_hashval = hashval;2339dvar->dtdv_next = start;23402341if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)2342return (dvar);23432344/*2345* The cas has failed. Either another CPU is adding an element to2346* this hash chain, or another CPU is deleting an element from this2347* hash chain. The simplest way to deal with both of these cases2348* (though not necessarily the most efficient) is to free our2349* allocated block and re-attempt it all. Note that the free is2350* to the dirty list and _not_ to the free list. 
This is to prevent2351* races with allocators, above.2352*/2353dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;23542355dtrace_membar_producer();23562357do {2358free = dcpu->dtdsc_dirty;2359dvar->dtdv_next = free;2360} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);23612362goto top;2363}23642365/*ARGSUSED*/2366static void2367dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)2368{2369if ((int64_t)nval < (int64_t)*oval)2370*oval = nval;2371}23722373/*ARGSUSED*/2374static void2375dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)2376{2377if ((int64_t)nval > (int64_t)*oval)2378*oval = nval;2379}23802381static void2382dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)2383{2384int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;2385int64_t val = (int64_t)nval;23862387if (val < 0) {2388for (i = 0; i < zero; i++) {2389if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {2390quanta[i] += incr;2391return;2392}2393}2394} else {2395for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {2396if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {2397quanta[i - 1] += incr;2398return;2399}2400}24012402quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;2403return;2404}24052406ASSERT(0);2407}24082409static void2410dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)2411{2412uint64_t arg = *lquanta++;2413int32_t base = DTRACE_LQUANTIZE_BASE(arg);2414uint16_t step = DTRACE_LQUANTIZE_STEP(arg);2415uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);2416int32_t val = (int32_t)nval, level;24172418ASSERT(step != 0);2419ASSERT(levels != 0);24202421if (val < base) {2422/*2423* This is an underflow.2424*/2425lquanta[0] += incr;2426return;2427}24282429level = (val - base) / step;24302431if (level < levels) {2432lquanta[level + 1] += incr;2433return;2434}24352436/*2437* This is an overflow.2438*/2439lquanta[levels + 1] += incr;2440}24412442static int2443dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,2444uint16_t high, uint16_t nsteps, 
int64_t value)2445{2446int64_t this = 1, last, next;2447int base = 1, order;24482449ASSERT(factor <= nsteps);2450ASSERT(nsteps % factor == 0);24512452for (order = 0; order < low; order++)2453this *= factor;24542455/*2456* If our value is less than our factor taken to the power of the2457* low order of magnitude, it goes into the zeroth bucket.2458*/2459if (value < (last = this))2460return (0);24612462for (this *= factor; order <= high; order++) {2463int nbuckets = this > nsteps ? nsteps : this;24642465if ((next = this * factor) < this) {2466/*2467* We should not generally get log/linear quantizations2468* with a high magnitude that allows 64-bits to2469* overflow, but we nonetheless protect against this2470* by explicitly checking for overflow, and clamping2471* our value accordingly.2472*/2473value = this - 1;2474}24752476if (value < this) {2477/*2478* If our value lies within this order of magnitude,2479* determine its position by taking the offset within2480* the order of magnitude, dividing by the bucket2481* width, and adding to our (accumulated) base.2482*/2483return (base + (value - last) / (this / nbuckets));2484}24852486base += nbuckets - (nbuckets / factor);2487last = this;2488this = next;2489}24902491/*2492* Our value is greater than or equal to our factor taken to the2493* power of one plus the high magnitude -- return the top bucket.2494*/2495return (base);2496}24972498static void2499dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)2500{2501uint64_t arg = *llquanta++;2502uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);2503uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);2504uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);2505uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);25062507llquanta[dtrace_aggregate_llquantize_bucket(factor,2508low, high, nsteps, nval)] += incr;2509}25102511/*ARGSUSED*/2512static void2513dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)2514{2515data[0]++;2516data[1] += 
nval;2517}25182519/*ARGSUSED*/2520static void2521dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)2522{2523int64_t snval = (int64_t)nval;2524uint64_t tmp[2];25252526data[0]++;2527data[1] += nval;25282529/*2530* What we want to say here is:2531*2532* data[2] += nval * nval;2533*2534* But given that nval is 64-bit, we could easily overflow, so2535* we do this as 128-bit arithmetic.2536*/2537if (snval < 0)2538snval = -snval;25392540dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);2541dtrace_add_128(data + 2, tmp, data + 2);2542}25432544/*ARGSUSED*/2545static void2546dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)2547{2548*oval = *oval + 1;2549}25502551/*ARGSUSED*/2552static void2553dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)2554{2555*oval += nval;2556}25572558/*2559* Aggregate given the tuple in the principal data buffer, and the aggregating2560* action denoted by the specified dtrace_aggregation_t. The aggregation2561* buffer is specified as the buf parameter. This routine does not return2562* failure; if there is no space in the aggregation buffer, the data will be2563* dropped, and a corresponding counter incremented.2564*/2565static void2566dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,2567intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)2568{2569dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;2570uint32_t i, ndx, size, fsize;2571uint32_t align = sizeof (uint64_t) - 1;2572dtrace_aggbuffer_t *agb;2573dtrace_aggkey_t *key;2574uint32_t hashval = 0, limit, isstr;2575caddr_t tomax, data, kdata;2576dtrace_actkind_t action;2577dtrace_action_t *act;2578size_t offs;25792580if (buf == NULL)2581return;25822583if (!agg->dtag_hasarg) {2584/*2585* Currently, only quantize() and lquantize() take additional2586* arguments, and they have the same semantics: an increment2587* value that defaults to 1 when not present. 
If additional2588* aggregating actions take arguments, the setting of the2589* default argument value will presumably have to become more2590* sophisticated...2591*/2592arg = 1;2593}25942595action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;2596size = rec->dtrd_offset - agg->dtag_base;2597fsize = size + rec->dtrd_size;25982599ASSERT(dbuf->dtb_tomax != NULL);2600data = dbuf->dtb_tomax + offset + agg->dtag_base;26012602if ((tomax = buf->dtb_tomax) == NULL) {2603dtrace_buffer_drop(buf);2604return;2605}26062607/*2608* The metastructure is always at the bottom of the buffer.2609*/2610agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -2611sizeof (dtrace_aggbuffer_t));26122613if (buf->dtb_offset == 0) {2614/*2615* We just kludge up approximately 1/8th of the size to be2616* buckets. If this guess ends up being routinely2617* off-the-mark, we may need to dynamically readjust this2618* based on past performance.2619*/2620uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);26212622if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <2623(uintptr_t)tomax || hashsize == 0) {2624/*2625* We've been given a ludicrously small buffer;2626* increment our drop count and leave.2627*/2628dtrace_buffer_drop(buf);2629return;2630}26312632/*2633* And now, a pathetic attempt to try to get a an odd (or2634* perchance, a prime) hash size for better hash distribution.2635*/2636if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))2637hashsize -= DTRACE_AGGHASHSIZE_SLEW;26382639agb->dtagb_hashsize = hashsize;2640agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -2641agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));2642agb->dtagb_free = (uintptr_t)agb->dtagb_hash;26432644for (i = 0; i < agb->dtagb_hashsize; i++)2645agb->dtagb_hash[i] = NULL;2646}26472648ASSERT(agg->dtag_first != NULL);2649ASSERT(agg->dtag_first->dta_intuple);26502651/*2652* Calculate the hash value based on the key. 
Note that we _don't_2653* include the aggid in the hashing (but we will store it as part of2654* the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"2655* algorithm: a simple, quick algorithm that has no known funnels, and2656* gets good distribution in practice. The efficacy of the hashing2657* algorithm (and a comparison with other algorithms) may be found by2658* running the ::dtrace_aggstat MDB dcmd.2659*/2660for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {2661i = act->dta_rec.dtrd_offset - agg->dtag_base;2662limit = i + act->dta_rec.dtrd_size;2663ASSERT(limit <= size);2664isstr = DTRACEACT_ISSTRING(act);26652666for (; i < limit; i++) {2667hashval += data[i];2668hashval += (hashval << 10);2669hashval ^= (hashval >> 6);26702671if (isstr && data[i] == '\0')2672break;2673}2674}26752676hashval += (hashval << 3);2677hashval ^= (hashval >> 11);2678hashval += (hashval << 15);26792680/*2681* Yes, the divide here is expensive -- but it's generally the least2682* of the performance issues given the amount of data that we iterate2683* over to compute hash values, compare data, etc.2684*/2685ndx = hashval % agb->dtagb_hashsize;26862687for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {2688ASSERT((caddr_t)key >= tomax);2689ASSERT((caddr_t)key < tomax + buf->dtb_size);26902691if (hashval != key->dtak_hashval || key->dtak_size != size)2692continue;26932694kdata = key->dtak_data;2695ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);26962697for (act = agg->dtag_first; act->dta_intuple;2698act = act->dta_next) {2699i = act->dta_rec.dtrd_offset - agg->dtag_base;2700limit = i + act->dta_rec.dtrd_size;2701ASSERT(limit <= size);2702isstr = DTRACEACT_ISSTRING(act);27032704for (; i < limit; i++) {2705if (kdata[i] != data[i])2706goto next;27072708if (isstr && data[i] == '\0')2709break;2710}2711}27122713if (action != key->dtak_action) {2714/*2715* We are aggregating on the same value in the same2716* aggregation with two different 
aggregating actions.2717* (This should have been picked up in the compiler,2718* so we may be dealing with errant or devious DIF.)2719* This is an error condition; we indicate as much,2720* and return.2721*/2722DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);2723return;2724}27252726/*2727* This is a hit: we need to apply the aggregator to2728* the value at this key.2729*/2730agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);2731return;2732next:2733continue;2734}27352736/*2737* We didn't find it. We need to allocate some zero-filled space,2738* link it into the hash table appropriately, and apply the aggregator2739* to the (zero-filled) value.2740*/2741offs = buf->dtb_offset;2742while (offs & (align - 1))2743offs += sizeof (uint32_t);27442745/*2746* If we don't have enough room to both allocate a new key _and_2747* its associated data, increment the drop count and return.2748*/2749if ((uintptr_t)tomax + offs + fsize >2750agb->dtagb_free - sizeof (dtrace_aggkey_t)) {2751dtrace_buffer_drop(buf);2752return;2753}27542755/*CONSTCOND*/2756ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));2757key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));2758agb->dtagb_free -= sizeof (dtrace_aggkey_t);27592760key->dtak_data = kdata = tomax + offs;2761buf->dtb_offset = offs + fsize;27622763/*2764* Now copy the data across.2765*/2766*((dtrace_aggid_t *)kdata) = agg->dtag_id;27672768for (i = sizeof (dtrace_aggid_t); i < size; i++)2769kdata[i] = data[i];27702771/*2772* Because strings are not zeroed out by default, we need to iterate2773* looking for actions that store strings, and we need to explicitly2774* pad these strings out with zeroes.2775*/2776for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {2777int nul;27782779if (!DTRACEACT_ISSTRING(act))2780continue;27812782i = act->dta_rec.dtrd_offset - agg->dtag_base;2783limit = i + act->dta_rec.dtrd_size;2784ASSERT(limit <= size);27852786for (nul = 0; i < limit; i++) {2787if (nul) 
{2788kdata[i] = '\0';2789continue;2790}27912792if (data[i] != '\0')2793continue;27942795nul = 1;2796}2797}27982799for (i = size; i < fsize; i++)2800kdata[i] = 0;28012802key->dtak_hashval = hashval;2803key->dtak_size = size;2804key->dtak_action = action;2805key->dtak_next = agb->dtagb_hash[ndx];2806agb->dtagb_hash[ndx] = key;28072808/*2809* Finally, apply the aggregator.2810*/2811*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;2812agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);2813}28142815/*2816* Given consumer state, this routine finds a speculation in the INACTIVE2817* state and transitions it into the ACTIVE state. If there is no speculation2818* in the INACTIVE state, 0 is returned. In this case, no error counter is2819* incremented -- it is up to the caller to take appropriate action.2820*/2821static int2822dtrace_speculation(dtrace_state_t *state)2823{2824int i = 0;2825dtrace_speculation_state_t curstate;2826uint32_t *stat = &state->dts_speculations_unavail, count;28272828while (i < state->dts_nspeculations) {2829dtrace_speculation_t *spec = &state->dts_speculations[i];28302831curstate = spec->dtsp_state;28322833if (curstate != DTRACESPEC_INACTIVE) {2834if (curstate == DTRACESPEC_COMMITTINGMANY ||2835curstate == DTRACESPEC_COMMITTING ||2836curstate == DTRACESPEC_DISCARDING)2837stat = &state->dts_speculations_busy;2838i++;2839continue;2840}28412842if (dtrace_cas32((uint32_t *)&spec->dtsp_state,2843curstate, DTRACESPEC_ACTIVE) == curstate)2844return (i + 1);2845}28462847/*2848* We couldn't find a speculation. If we found as much as a single2849* busy speculation buffer, we'll attribute this failure as "busy"2850* instead of "unavail".2851*/2852do {2853count = *stat;2854} while (dtrace_cas32(stat, count, count + 1) != count);28552856return (0);2857}28582859/*2860* This routine commits an active speculation. If the specified speculation2861* is not in a valid state to perform a commit(), this routine will silently do2862* nothing. 
The state of the specified speculation is transitioned according2863* to the state transition diagram outlined in <sys/dtrace_impl.h>2864*/2865static void2866dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,2867dtrace_specid_t which)2868{2869dtrace_speculation_t *spec;2870dtrace_buffer_t *src, *dest;2871uintptr_t daddr, saddr, dlimit, slimit;2872dtrace_speculation_state_t curstate, new = 0;2873ssize_t offs;2874uint64_t timestamp;28752876if (which == 0)2877return;28782879if (which > state->dts_nspeculations) {2880cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;2881return;2882}28832884spec = &state->dts_speculations[which - 1];2885src = &spec->dtsp_buffer[cpu];2886dest = &state->dts_buffer[cpu];28872888do {2889curstate = spec->dtsp_state;28902891if (curstate == DTRACESPEC_COMMITTINGMANY)2892break;28932894switch (curstate) {2895case DTRACESPEC_INACTIVE:2896case DTRACESPEC_DISCARDING:2897return;28982899case DTRACESPEC_COMMITTING:2900/*2901* This is only possible if we are (a) commit()'ing2902* without having done a prior speculate() on this CPU2903* and (b) racing with another commit() on a different2904* CPU. There's nothing to do -- we just assert that2905* our offset is 0.2906*/2907ASSERT(src->dtb_offset == 0);2908return;29092910case DTRACESPEC_ACTIVE:2911new = DTRACESPEC_COMMITTING;2912break;29132914case DTRACESPEC_ACTIVEONE:2915/*2916* This speculation is active on one CPU. If our2917* buffer offset is non-zero, we know that the one CPU2918* must be us. 
Otherwise, we are committing on a2919* different CPU from the speculate(), and we must2920* rely on being asynchronously cleaned.2921*/2922if (src->dtb_offset != 0) {2923new = DTRACESPEC_COMMITTING;2924break;2925}2926/*FALLTHROUGH*/29272928case DTRACESPEC_ACTIVEMANY:2929new = DTRACESPEC_COMMITTINGMANY;2930break;29312932default:2933ASSERT(0);2934}2935} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,2936curstate, new) != curstate);29372938/*2939* We have set the state to indicate that we are committing this2940* speculation. Now reserve the necessary space in the destination2941* buffer.2942*/2943if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,2944sizeof (uint64_t), state, NULL)) < 0) {2945dtrace_buffer_drop(dest);2946goto out;2947}29482949/*2950* We have sufficient space to copy the speculative buffer into the2951* primary buffer. First, modify the speculative buffer, filling2952* in the timestamp of all entries with the curstate time. The data2953* must have the commit() time rather than the time it was traced,2954* so that all entries in the primary buffer are in timestamp order.2955*/2956timestamp = dtrace_gethrtime();2957saddr = (uintptr_t)src->dtb_tomax;2958slimit = saddr + src->dtb_offset;2959while (saddr < slimit) {2960size_t size;2961dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;29622963if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {2964saddr += sizeof (dtrace_epid_t);2965continue;2966}2967ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);2968size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;29692970ASSERT3U(saddr + size, <=, slimit);2971ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));2972ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);29732974DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);29752976saddr += size;2977}29782979/*2980* Copy the buffer across. 
(Note that this is a2981* highly subobtimal bcopy(); in the unlikely event that this becomes2982* a serious performance issue, a high-performance DTrace-specific2983* bcopy() should obviously be invented.)2984*/2985daddr = (uintptr_t)dest->dtb_tomax + offs;2986dlimit = daddr + src->dtb_offset;2987saddr = (uintptr_t)src->dtb_tomax;29882989/*2990* First, the aligned portion.2991*/2992while (dlimit - daddr >= sizeof (uint64_t)) {2993*((uint64_t *)daddr) = *((uint64_t *)saddr);29942995daddr += sizeof (uint64_t);2996saddr += sizeof (uint64_t);2997}29982999/*3000* Now any left-over bit...3001*/3002while (dlimit - daddr)3003*((uint8_t *)daddr++) = *((uint8_t *)saddr++);30043005/*3006* Finally, commit the reserved space in the destination buffer.3007*/3008dest->dtb_offset = offs + src->dtb_offset;30093010out:3011/*3012* If we're lucky enough to be the only active CPU on this speculation3013* buffer, we can just set the state back to DTRACESPEC_INACTIVE.3014*/3015if (curstate == DTRACESPEC_ACTIVE ||3016(curstate == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {3017uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,3018DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);30193020ASSERT(rval == DTRACESPEC_COMMITTING);3021}30223023src->dtb_offset = 0;3024src->dtb_xamot_drops += src->dtb_drops;3025src->dtb_drops = 0;3026}30273028/*3029* This routine discards an active speculation. If the specified speculation3030* is not in a valid state to perform a discard(), this routine will silently3031* do nothing. 
The state of the specified speculation is transitioned3032* according to the state transition diagram outlined in <sys/dtrace_impl.h>3033*/3034static void3035dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,3036dtrace_specid_t which)3037{3038dtrace_speculation_t *spec;3039dtrace_speculation_state_t curstate, new = 0;3040dtrace_buffer_t *buf;30413042if (which == 0)3043return;30443045if (which > state->dts_nspeculations) {3046cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;3047return;3048}30493050spec = &state->dts_speculations[which - 1];3051buf = &spec->dtsp_buffer[cpu];30523053do {3054curstate = spec->dtsp_state;30553056switch (curstate) {3057case DTRACESPEC_INACTIVE:3058case DTRACESPEC_COMMITTINGMANY:3059case DTRACESPEC_COMMITTING:3060case DTRACESPEC_DISCARDING:3061return;30623063case DTRACESPEC_ACTIVE:3064case DTRACESPEC_ACTIVEMANY:3065new = DTRACESPEC_DISCARDING;3066break;30673068case DTRACESPEC_ACTIVEONE:3069if (buf->dtb_offset != 0) {3070new = DTRACESPEC_INACTIVE;3071} else {3072new = DTRACESPEC_DISCARDING;3073}3074break;30753076default:3077ASSERT(0);3078}3079} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,3080curstate, new) != curstate);30813082buf->dtb_offset = 0;3083buf->dtb_drops = 0;3084}30853086/*3087* Note: not called from probe context. This function is called3088* asynchronously from cross call context to clean any speculations that are3089* in the COMMITTINGMANY or DISCARDING states. 
These speculations may not be3090* transitioned back to the INACTIVE state until all CPUs have cleaned the3091* speculation.3092*/3093static void3094dtrace_speculation_clean_here(dtrace_state_t *state)3095{3096dtrace_icookie_t cookie;3097processorid_t cpu = curcpu;3098dtrace_buffer_t *dest = &state->dts_buffer[cpu];3099dtrace_specid_t i;31003101cookie = dtrace_interrupt_disable();31023103if (dest->dtb_tomax == NULL) {3104dtrace_interrupt_enable(cookie);3105return;3106}31073108for (i = 0; i < state->dts_nspeculations; i++) {3109dtrace_speculation_t *spec = &state->dts_speculations[i];3110dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];31113112if (src->dtb_tomax == NULL)3113continue;31143115if (spec->dtsp_state == DTRACESPEC_DISCARDING) {3116src->dtb_offset = 0;3117continue;3118}31193120if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)3121continue;31223123if (src->dtb_offset == 0)3124continue;31253126dtrace_speculation_commit(state, cpu, i + 1);3127}31283129dtrace_interrupt_enable(cookie);3130}31313132/*3133* Note: not called from probe context. This function is called3134* asynchronously (and at a regular interval) to clean any speculations that3135* are in the COMMITTINGMANY or DISCARDING states. 
If it discovers that there3136* is work to be done, it cross calls all CPUs to perform that work;3137* COMMITMANY and DISCARDING speculations may not be transitioned back to the3138* INACTIVE state until they have been cleaned by all CPUs.3139*/3140static void3141dtrace_speculation_clean(dtrace_state_t *state)3142{3143int work = 0, rv;3144dtrace_specid_t i;31453146for (i = 0; i < state->dts_nspeculations; i++) {3147dtrace_speculation_t *spec = &state->dts_speculations[i];31483149ASSERT(!spec->dtsp_cleaning);31503151if (spec->dtsp_state != DTRACESPEC_DISCARDING &&3152spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)3153continue;31543155work++;3156spec->dtsp_cleaning = 1;3157}31583159if (!work)3160return;31613162dtrace_xcall(DTRACE_CPUALL,3163(dtrace_xcall_t)dtrace_speculation_clean_here, state);31643165/*3166* We now know that all CPUs have committed or discarded their3167* speculation buffers, as appropriate. We can now set the state3168* to inactive.3169*/3170for (i = 0; i < state->dts_nspeculations; i++) {3171dtrace_speculation_t *spec = &state->dts_speculations[i];3172dtrace_speculation_state_t curstate, new;31733174if (!spec->dtsp_cleaning)3175continue;31763177curstate = spec->dtsp_state;3178ASSERT(curstate == DTRACESPEC_DISCARDING ||3179curstate == DTRACESPEC_COMMITTINGMANY);31803181new = DTRACESPEC_INACTIVE;31823183rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, curstate, new);3184ASSERT(rv == curstate);3185spec->dtsp_cleaning = 0;3186}3187}31883189/*3190* Called as part of a speculate() to get the speculative buffer associated3191* with a given speculation. Returns NULL if the specified speculation is not3192* in an ACTIVE state. 
If the speculation is in the ACTIVEONE state -- and3193* the active CPU is not the specified CPU -- the speculation will be3194* atomically transitioned into the ACTIVEMANY state.3195*/3196static dtrace_buffer_t *3197dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,3198dtrace_specid_t which)3199{3200dtrace_speculation_t *spec;3201dtrace_speculation_state_t curstate, new = 0;3202dtrace_buffer_t *buf;32033204if (which == 0)3205return (NULL);32063207if (which > state->dts_nspeculations) {3208cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;3209return (NULL);3210}32113212spec = &state->dts_speculations[which - 1];3213buf = &spec->dtsp_buffer[cpuid];32143215do {3216curstate = spec->dtsp_state;32173218switch (curstate) {3219case DTRACESPEC_INACTIVE:3220case DTRACESPEC_COMMITTINGMANY:3221case DTRACESPEC_DISCARDING:3222return (NULL);32233224case DTRACESPEC_COMMITTING:3225ASSERT(buf->dtb_offset == 0);3226return (NULL);32273228case DTRACESPEC_ACTIVEONE:3229/*3230* This speculation is currently active on one CPU.3231* Check the offset in the buffer; if it's non-zero,3232* that CPU must be us (and we leave the state alone).3233* If it's zero, assume that we're starting on a new3234* CPU -- and change the state to indicate that the3235* speculation is active on more than one CPU.3236*/3237if (buf->dtb_offset != 0)3238return (buf);32393240new = DTRACESPEC_ACTIVEMANY;3241break;32423243case DTRACESPEC_ACTIVEMANY:3244return (buf);32453246case DTRACESPEC_ACTIVE:3247new = DTRACESPEC_ACTIVEONE;3248break;32493250default:3251ASSERT(0);3252}3253} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,3254curstate, new) != curstate);32553256ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);3257return (buf);3258}32593260/*3261* Return a string. 
In the event that the user lacks the privilege to access3262* arbitrary kernel memory, we copy the string out to scratch memory so that we3263* don't fail access checking.3264*3265* dtrace_dif_variable() uses this routine as a helper for various3266* builtin values such as 'execname' and 'probefunc.'3267*/3268uintptr_t3269dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,3270dtrace_mstate_t *mstate)3271{3272uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];3273uintptr_t ret;3274size_t strsz;32753276/*3277* The easy case: this probe is allowed to read all of memory, so3278* we can just return this as a vanilla pointer.3279*/3280if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)3281return (addr);32823283/*3284* This is the tougher case: we copy the string in question from3285* kernel memory into scratch memory and return it that way: this3286* ensures that we won't trip up when access checking tests the3287* BYREF return value.3288*/3289strsz = dtrace_strlen((char *)addr, size) + 1;32903291if (mstate->dtms_scratch_ptr + strsz >3292mstate->dtms_scratch_base + mstate->dtms_scratch_size) {3293DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);3294return (0);3295}32963297dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,3298strsz);3299ret = mstate->dtms_scratch_ptr;3300mstate->dtms_scratch_ptr += strsz;3301return (ret);3302}33033304/*3305* Return a string from a memoy address which is known to have one or3306* more concatenated, individually zero terminated, sub-strings.3307* In the event that the user lacks the privilege to access3308* arbitrary kernel memory, we copy the string out to scratch memory so that we3309* don't fail access checking.3310*3311* dtrace_dif_variable() uses this routine as a helper for various3312* builtin values such as 'execargs'.3313*/3314static uintptr_t3315dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,3316dtrace_mstate_t *mstate)3317{3318char *p;3319size_t i;3320uintptr_t ret;33213322if 
(mstate->dtms_scratch_ptr + strsz >3323mstate->dtms_scratch_base + mstate->dtms_scratch_size) {3324DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);3325return (0);3326}33273328dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,3329strsz);33303331/* Replace sub-string termination characters with a space. */3332for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;3333p++, i++)3334if (*p == '\0')3335*p = ' ';33363337ret = mstate->dtms_scratch_ptr;3338mstate->dtms_scratch_ptr += strsz;3339return (ret);3340}33413342/*3343* This function implements the DIF emulator's variable lookups. The emulator3344* passes a reserved variable identifier and optional built-in array index.3345*/3346static uint64_t3347dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,3348uint64_t ndx)3349{3350/*3351* If we're accessing one of the uncached arguments, we'll turn this3352* into a reference in the args array.3353*/3354if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {3355ndx = v - DIF_VAR_ARG0;3356v = DIF_VAR_ARGS;3357}33583359switch (v) {3360case DIF_VAR_ARGS:3361ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);3362if (ndx >= sizeof (mstate->dtms_arg) /3363sizeof (mstate->dtms_arg[0])) {3364int aframes = mstate->dtms_probe->dtpr_aframes + 2;3365dtrace_provider_t *pv;3366uint64_t val;33673368pv = mstate->dtms_probe->dtpr_provider;3369if (pv->dtpv_pops.dtps_getargval != NULL)3370val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,3371mstate->dtms_probe->dtpr_id,3372mstate->dtms_probe->dtpr_arg, ndx, aframes);3373else3374val = dtrace_getarg(ndx, aframes);33753376/*3377* This is regrettably required to keep the compiler3378* from tail-optimizing the call to dtrace_getarg().3379* The condition always evaluates to true, but the3380* compiler has no way of figuring that out a priori.3381* (None of this would be necessary if the compiler3382* could be relied upon to _always_ tail-optimize3383* the call to dtrace_getarg() -- but it can't.)3384*/3385if 
(mstate->dtms_probe != NULL)3386return (val);33873388ASSERT(0);3389}33903391return (mstate->dtms_arg[ndx]);33923393case DIF_VAR_REGS:3394case DIF_VAR_UREGS: {3395struct trapframe *tframe;33963397if (!dtrace_priv_proc(state))3398return (0);33993400if (v == DIF_VAR_REGS)3401tframe = curthread->t_dtrace_trapframe;3402else3403tframe = curthread->td_frame;34043405if (tframe == NULL) {3406DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);3407cpu_core[curcpu].cpuc_dtrace_illval = 0;3408return (0);3409}34103411return (dtrace_getreg(tframe, ndx));3412}34133414case DIF_VAR_CURTHREAD:3415if (!dtrace_priv_proc(state))3416return (0);3417return ((uint64_t)(uintptr_t)curthread);34183419case DIF_VAR_TIMESTAMP:3420if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {3421mstate->dtms_timestamp = dtrace_gethrtime();3422mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;3423}3424return (mstate->dtms_timestamp);34253426case DIF_VAR_VTIMESTAMP:3427ASSERT(dtrace_vtime_references != 0);3428return (curthread->t_dtrace_vtime);34293430case DIF_VAR_WALLTIMESTAMP:3431if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {3432mstate->dtms_walltimestamp = dtrace_gethrestime();3433mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;3434}3435return (mstate->dtms_walltimestamp);34363437#ifdef illumos3438case DIF_VAR_IPL:3439if (!dtrace_priv_kernel(state))3440return (0);3441if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {3442mstate->dtms_ipl = dtrace_getipl();3443mstate->dtms_present |= DTRACE_MSTATE_IPL;3444}3445return (mstate->dtms_ipl);3446#endif34473448case DIF_VAR_EPID:3449ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);3450return (mstate->dtms_epid);34513452case DIF_VAR_ID:3453ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);3454return (mstate->dtms_probe->dtpr_id);34553456case DIF_VAR_STACKDEPTH:3457if (!dtrace_priv_kernel(state))3458return (0);3459if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {3460int aframes = mstate->dtms_probe->dtpr_aframes + 2;34613462mstate->dtms_stackdepth = 
dtrace_getstackdepth(aframes);3463mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;3464}3465return (mstate->dtms_stackdepth);34663467case DIF_VAR_USTACKDEPTH:3468if (!dtrace_priv_proc(state))3469return (0);3470if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {3471/*3472* See comment in DIF_VAR_PID.3473*/3474if (DTRACE_ANCHORED(mstate->dtms_probe) &&3475CPU_ON_INTR(CPU)) {3476mstate->dtms_ustackdepth = 0;3477} else {3478DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);3479mstate->dtms_ustackdepth =3480dtrace_getustackdepth();3481DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);3482}3483mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;3484}3485return (mstate->dtms_ustackdepth);34863487case DIF_VAR_CALLER:3488if (!dtrace_priv_kernel(state))3489return (0);3490if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {3491int aframes = mstate->dtms_probe->dtpr_aframes + 2;34923493if (!DTRACE_ANCHORED(mstate->dtms_probe)) {3494/*3495* If this is an unanchored probe, we are3496* required to go through the slow path:3497* dtrace_caller() only guarantees correct3498* results for anchored probes.3499*/3500pc_t caller[2] = {0, 0};35013502dtrace_getpcstack(caller, 2, aframes,3503(uint32_t *)(uintptr_t)mstate->dtms_arg[0]);3504mstate->dtms_caller = caller[1];3505} else if ((mstate->dtms_caller =3506dtrace_caller(aframes)) == -1) {3507/*3508* We have failed to do this the quick way;3509* we must resort to the slower approach of3510* calling dtrace_getpcstack().3511*/3512pc_t caller = 0;35133514dtrace_getpcstack(&caller, 1, aframes, NULL);3515mstate->dtms_caller = caller;3516}35173518mstate->dtms_present |= DTRACE_MSTATE_CALLER;3519}3520return (mstate->dtms_caller);35213522case DIF_VAR_UCALLER:3523if (!dtrace_priv_proc(state))3524return (0);35253526if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {3527uint64_t ustack[3];35283529/*3530* dtrace_getupcstack() fills in the first uint64_t3531* with the current PID. The second uint64_t will3532* be the program counter at user-level. 
The third3533* uint64_t will contain the caller, which is what3534* we're after.3535*/3536ustack[2] = 0;3537DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);3538dtrace_getupcstack(ustack, 3);3539DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);3540mstate->dtms_ucaller = ustack[2];3541mstate->dtms_present |= DTRACE_MSTATE_UCALLER;3542}35433544return (mstate->dtms_ucaller);35453546case DIF_VAR_PROBEPROV:3547ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);3548return (dtrace_dif_varstr(3549(uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,3550state, mstate));35513552case DIF_VAR_PROBEMOD:3553ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);3554return (dtrace_dif_varstr(3555(uintptr_t)mstate->dtms_probe->dtpr_mod,3556state, mstate));35573558case DIF_VAR_PROBEFUNC:3559ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);3560return (dtrace_dif_varstr(3561(uintptr_t)mstate->dtms_probe->dtpr_func,3562state, mstate));35633564case DIF_VAR_PROBENAME:3565ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);3566return (dtrace_dif_varstr(3567(uintptr_t)mstate->dtms_probe->dtpr_name,3568state, mstate));35693570case DIF_VAR_PID:3571if (!dtrace_priv_proc(state))3572return (0);35733574#ifdef illumos3575/*3576* Note that we are assuming that an unanchored probe is3577* always due to a high-level interrupt. (And we're assuming3578* that there is only a single high level interrupt.)3579*/3580if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3581return (pid0.pid_id);35823583/*3584* It is always safe to dereference one's own t_procp pointer:3585* it always points to a valid, allocated proc structure.3586* Further, it is always safe to dereference the p_pidp member3587* of one's own proc structure. 
(These are truisms becuase3588* threads and processes don't clean up their own state --3589* they leave that task to whomever reaps them.)3590*/3591return ((uint64_t)curthread->t_procp->p_pidp->pid_id);3592#else3593return ((uint64_t)curproc->p_pid);3594#endif35953596case DIF_VAR_PPID:3597if (!dtrace_priv_proc(state))3598return (0);35993600#ifdef illumos3601/*3602* See comment in DIF_VAR_PID.3603*/3604if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3605return (pid0.pid_id);36063607/*3608* It is always safe to dereference one's own t_procp pointer:3609* it always points to a valid, allocated proc structure.3610* (This is true because threads don't clean up their own3611* state -- they leave that task to whomever reaps them.)3612*/3613return ((uint64_t)curthread->t_procp->p_ppid);3614#else3615if (curproc->p_pid == proc0.p_pid)3616return (curproc->p_pid);3617else3618return (curproc->p_pptr->p_pid);3619#endif36203621case DIF_VAR_TID:3622#ifdef illumos3623/*3624* See comment in DIF_VAR_PID.3625*/3626if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3627return (0);3628#endif36293630return ((uint64_t)curthread->t_tid);36313632case DIF_VAR_EXECARGS: {3633struct pargs *p_args = curthread->td_proc->p_args;36343635if (p_args == NULL)3636return(0);36373638return (dtrace_dif_varstrz(3639(uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));3640}36413642case DIF_VAR_EXECNAME:3643#ifdef illumos3644if (!dtrace_priv_proc(state))3645return (0);36463647/*3648* See comment in DIF_VAR_PID.3649*/3650if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3651return ((uint64_t)(uintptr_t)p0.p_user.u_comm);36523653/*3654* It is always safe to dereference one's own t_procp pointer:3655* it always points to a valid, allocated proc structure.3656* (This is true because threads don't clean up their own3657* state -- they leave that task to whomever reaps them.)3658*/3659return (dtrace_dif_varstr(3660(uintptr_t)curthread->t_procp->p_user.u_comm,3661state, 
mstate));3662#else3663return (dtrace_dif_varstr(3664(uintptr_t) curthread->td_proc->p_comm, state, mstate));3665#endif36663667case DIF_VAR_ZONENAME:3668#ifdef illumos3669if (!dtrace_priv_proc(state))3670return (0);36713672/*3673* See comment in DIF_VAR_PID.3674*/3675if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3676return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);36773678/*3679* It is always safe to dereference one's own t_procp pointer:3680* it always points to a valid, allocated proc structure.3681* (This is true because threads don't clean up their own3682* state -- they leave that task to whomever reaps them.)3683*/3684return (dtrace_dif_varstr(3685(uintptr_t)curthread->t_procp->p_zone->zone_name,3686state, mstate));3687#elif defined(__FreeBSD__)3688/*3689* On FreeBSD, we introduce compatibility to zonename by falling through3690* into jailname.3691*/3692case DIF_VAR_JAILNAME:3693if (!dtrace_priv_kernel(state))3694return (0);36953696return (dtrace_dif_varstr(3697(uintptr_t)curthread->td_ucred->cr_prison->pr_name,3698state, mstate));36993700case DIF_VAR_JID:3701if (!dtrace_priv_kernel(state))3702return (0);37033704return ((uint64_t)curthread->td_ucred->cr_prison->pr_id);3705#else3706return (0);3707#endif37083709case DIF_VAR_UID:3710if (!dtrace_priv_proc(state))3711return (0);37123713#ifdef illumos3714/*3715* See comment in DIF_VAR_PID.3716*/3717if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3718return ((uint64_t)p0.p_cred->cr_uid);37193720/*3721* It is always safe to dereference one's own t_procp pointer:3722* it always points to a valid, allocated proc structure.3723* (This is true because threads don't clean up their own3724* state -- they leave that task to whomever reaps them.)3725*3726* Additionally, it is safe to dereference one's own process3727* credential, since this is never NULL after process birth.3728*/3729return ((uint64_t)curthread->t_procp->p_cred->cr_uid);3730#else3731return 
((uint64_t)curthread->td_ucred->cr_uid);3732#endif37333734case DIF_VAR_GID:3735if (!dtrace_priv_proc(state))3736return (0);37373738#ifdef illumos3739/*3740* See comment in DIF_VAR_PID.3741*/3742if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3743return ((uint64_t)p0.p_cred->cr_gid);37443745/*3746* It is always safe to dereference one's own t_procp pointer:3747* it always points to a valid, allocated proc structure.3748* (This is true because threads don't clean up their own3749* state -- they leave that task to whomever reaps them.)3750*3751* Additionally, it is safe to dereference one's own process3752* credential, since this is never NULL after process birth.3753*/3754return ((uint64_t)curthread->t_procp->p_cred->cr_gid);3755#else3756return ((uint64_t)curthread->td_ucred->cr_gid);3757#endif37583759case DIF_VAR_ERRNO: {3760#ifdef illumos3761klwp_t *lwp;3762if (!dtrace_priv_proc(state))3763return (0);37643765/*3766* See comment in DIF_VAR_PID.3767*/3768if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))3769return (0);37703771/*3772* It is always safe to dereference one's own t_lwp pointer in3773* the event that this pointer is non-NULL. 
(This is true3774* because threads and lwps don't clean up their own state --3775* they leave that task to whomever reaps them.)3776*/3777if ((lwp = curthread->t_lwp) == NULL)3778return (0);37793780return ((uint64_t)lwp->lwp_errno);3781#else3782return (curthread->td_errno);3783#endif3784}3785#ifndef illumos3786case DIF_VAR_CPU: {3787return curcpu;3788}3789#endif3790default:3791DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);3792return (0);3793}3794}379537963797typedef enum dtrace_json_state {3798DTRACE_JSON_REST = 1,3799DTRACE_JSON_OBJECT,3800DTRACE_JSON_STRING,3801DTRACE_JSON_STRING_ESCAPE,3802DTRACE_JSON_STRING_ESCAPE_UNICODE,3803DTRACE_JSON_COLON,3804DTRACE_JSON_COMMA,3805DTRACE_JSON_VALUE,3806DTRACE_JSON_IDENTIFIER,3807DTRACE_JSON_NUMBER,3808DTRACE_JSON_NUMBER_FRAC,3809DTRACE_JSON_NUMBER_EXP,3810DTRACE_JSON_COLLECT_OBJECT3811} dtrace_json_state_t;38123813/*3814* This function possesses just enough knowledge about JSON to extract a single3815* value from a JSON string and store it in the scratch buffer. It is able3816* to extract nested object values, and members of arrays by index.3817*3818* elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to3819* be looked up as we descend into the object tree. e.g.3820*3821* foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL3822* with nelems = 5.3823*3824* The run time of this function must be bounded above by strsize to limit the3825* amount of work done in probe context. As such, it is implemented as a3826* simple state machine, reading one character at a time using safe loads3827* until we find the requested element, hit a parsing error or run off the3828* end of the object or string.3829*3830* As there is no way for a subroutine to return an error without interrupting3831* clause execution, we simply return NULL in the event of a missing key or any3832* other error condition. 
Each NULL return in this function is commented with3833* the error condition it represents -- parsing or otherwise.3834*3835* The set of states for the state machine closely matches the JSON3836* specification (http://json.org/). Briefly:3837*3838* DTRACE_JSON_REST:3839* Skip whitespace until we find either a top-level Object, moving3840* to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.3841*3842* DTRACE_JSON_OBJECT:3843* Locate the next key String in an Object. Sets a flag to denote3844* the next String as a key string and moves to DTRACE_JSON_STRING.3845*3846* DTRACE_JSON_COLON:3847* Skip whitespace until we find the colon that separates key Strings3848* from their values. Once found, move to DTRACE_JSON_VALUE.3849*3850* DTRACE_JSON_VALUE:3851* Detects the type of the next value (String, Number, Identifier, Object3852* or Array) and routes to the states that process that type. Here we also3853* deal with the element selector list if we are requested to traverse down3854* into the object tree.3855*3856* DTRACE_JSON_COMMA:3857* Skip whitespace until we find the comma that separates key-value pairs3858* in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays3859* (similarly DTRACE_JSON_VALUE). All following literal value processing3860* states return to this state at the end of their value, unless otherwise3861* noted.3862*3863* DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:3864* Processes a Number literal from the JSON, including any exponent3865* component that may be present. Numbers are returned as strings, which3866* may be passed to strtoll() if an integer is required.3867*3868* DTRACE_JSON_IDENTIFIER:3869* Processes a "true", "false" or "null" literal in the JSON.3870*3871* DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,3872* DTRACE_JSON_STRING_ESCAPE_UNICODE:3873* Processes a String literal from the JSON, whether the String denotes3874* a key, a value or part of a larger Object. 
Handles all escape sequences3875* present in the specification, including four-digit unicode characters,3876* but merely includes the escape sequence without converting it to the3877* actual escaped character. If the String is flagged as a key, we3878* move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.3879*3880* DTRACE_JSON_COLLECT_OBJECT:3881* This state collects an entire Object (or Array), correctly handling3882* embedded strings. If the full element selector list matches this nested3883* object, we return the Object in full as a string. If not, we use this3884* state to skip to the next value at this level and continue processing.3885*3886* NOTE: This function uses various macros from strtolctype.h to manipulate3887* digit values, etc -- these have all been checked to ensure they make3888* no additional function calls.3889*/3890static char *3891dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,3892char *dest)3893{3894dtrace_json_state_t state = DTRACE_JSON_REST;3895int64_t array_elem = INT64_MIN;3896int64_t array_pos = 0;3897uint8_t escape_unicount = 0;3898boolean_t string_is_key = B_FALSE;3899boolean_t collect_object = B_FALSE;3900boolean_t found_key = B_FALSE;3901boolean_t in_array = B_FALSE;3902uint32_t braces = 0, brackets = 0;3903char *elem = elemlist;3904char *dd = dest;3905uintptr_t cur;39063907for (cur = json; cur < json + size; cur++) {3908char cc = dtrace_load8(cur);3909if (cc == '\0')3910return (NULL);39113912switch (state) {3913case DTRACE_JSON_REST:3914if (isspace(cc))3915break;39163917if (cc == '{') {3918state = DTRACE_JSON_OBJECT;3919break;3920}39213922if (cc == '[') {3923in_array = B_TRUE;3924array_pos = 0;3925array_elem = dtrace_strtoll(elem, 10, size);3926found_key = array_elem == 0 ? 
B_TRUE : B_FALSE;3927state = DTRACE_JSON_VALUE;3928break;3929}39303931/*3932* ERROR: expected to find a top-level object or array.3933*/3934return (NULL);3935case DTRACE_JSON_OBJECT:3936if (isspace(cc))3937break;39383939if (cc == '"') {3940state = DTRACE_JSON_STRING;3941string_is_key = B_TRUE;3942break;3943}39443945/*3946* ERROR: either the object did not start with a key3947* string, or we've run off the end of the object3948* without finding the requested key.3949*/3950return (NULL);3951case DTRACE_JSON_STRING:3952if (cc == '\\') {3953*dd++ = '\\';3954state = DTRACE_JSON_STRING_ESCAPE;3955break;3956}39573958if (cc == '"') {3959if (collect_object) {3960/*3961* We don't reset the dest here, as3962* the string is part of a larger3963* object being collected.3964*/3965*dd++ = cc;3966collect_object = B_FALSE;3967state = DTRACE_JSON_COLLECT_OBJECT;3968break;3969}3970*dd = '\0';3971dd = dest; /* reset string buffer */3972if (string_is_key) {3973if (dtrace_strncmp(dest, elem,3974size) == 0)3975found_key = B_TRUE;3976} else if (found_key) {3977if (nelems > 1) {3978/*3979* We expected an object, not3980* this string.3981*/3982return (NULL);3983}3984return (dest);3985}3986state = string_is_key ? 
DTRACE_JSON_COLON :3987DTRACE_JSON_COMMA;3988string_is_key = B_FALSE;3989break;3990}39913992*dd++ = cc;3993break;3994case DTRACE_JSON_STRING_ESCAPE:3995*dd++ = cc;3996if (cc == 'u') {3997escape_unicount = 0;3998state = DTRACE_JSON_STRING_ESCAPE_UNICODE;3999} else {4000state = DTRACE_JSON_STRING;4001}4002break;4003case DTRACE_JSON_STRING_ESCAPE_UNICODE:4004if (!isxdigit(cc)) {4005/*4006* ERROR: invalid unicode escape, expected4007* four valid hexidecimal digits.4008*/4009return (NULL);4010}40114012*dd++ = cc;4013if (++escape_unicount == 4)4014state = DTRACE_JSON_STRING;4015break;4016case DTRACE_JSON_COLON:4017if (isspace(cc))4018break;40194020if (cc == ':') {4021state = DTRACE_JSON_VALUE;4022break;4023}40244025/*4026* ERROR: expected a colon.4027*/4028return (NULL);4029case DTRACE_JSON_COMMA:4030if (isspace(cc))4031break;40324033if (cc == ',') {4034if (in_array) {4035state = DTRACE_JSON_VALUE;4036if (++array_pos == array_elem)4037found_key = B_TRUE;4038} else {4039state = DTRACE_JSON_OBJECT;4040}4041break;4042}40434044/*4045* ERROR: either we hit an unexpected character, or4046* we reached the end of the object or array without4047* finding the requested key.4048*/4049return (NULL);4050case DTRACE_JSON_IDENTIFIER:4051if (islower(cc)) {4052*dd++ = cc;4053break;4054}40554056*dd = '\0';4057dd = dest; /* reset string buffer */40584059if (dtrace_strncmp(dest, "true", 5) == 0 ||4060dtrace_strncmp(dest, "false", 6) == 0 ||4061dtrace_strncmp(dest, "null", 5) == 0) {4062if (found_key) {4063if (nelems > 1) {4064/*4065* ERROR: We expected an object,4066* not this identifier.4067*/4068return (NULL);4069}4070return (dest);4071} else {4072cur--;4073state = DTRACE_JSON_COMMA;4074break;4075}4076}40774078/*4079* ERROR: we did not recognise the identifier as one4080* of those in the JSON specification.4081*/4082return (NULL);4083case DTRACE_JSON_NUMBER:4084if (cc == '.') {4085*dd++ = cc;4086state = DTRACE_JSON_NUMBER_FRAC;4087break;4088}40894090if (cc == 'x' || cc == 'X') 
{4091/*4092* ERROR: specification explicitly excludes4093* hexidecimal or octal numbers.4094*/4095return (NULL);4096}40974098/* FALLTHRU */4099case DTRACE_JSON_NUMBER_FRAC:4100if (cc == 'e' || cc == 'E') {4101*dd++ = cc;4102state = DTRACE_JSON_NUMBER_EXP;4103break;4104}41054106if (cc == '+' || cc == '-') {4107/*4108* ERROR: expect sign as part of exponent only.4109*/4110return (NULL);4111}4112/* FALLTHRU */4113case DTRACE_JSON_NUMBER_EXP:4114if (isdigit(cc) || cc == '+' || cc == '-') {4115*dd++ = cc;4116break;4117}41184119*dd = '\0';4120dd = dest; /* reset string buffer */4121if (found_key) {4122if (nelems > 1) {4123/*4124* ERROR: We expected an object, not4125* this number.4126*/4127return (NULL);4128}4129return (dest);4130}41314132cur--;4133state = DTRACE_JSON_COMMA;4134break;4135case DTRACE_JSON_VALUE:4136if (isspace(cc))4137break;41384139if (cc == '{' || cc == '[') {4140if (nelems > 1 && found_key) {4141in_array = cc == '[' ? B_TRUE : B_FALSE;4142/*4143* If our element selector directs us4144* to descend into this nested object,4145* then move to the next selector4146* element in the list and restart the4147* state machine.4148*/4149while (*elem != '\0')4150elem++;4151elem++; /* skip the inter-element NUL */4152nelems--;4153dd = dest;4154if (in_array) {4155state = DTRACE_JSON_VALUE;4156array_pos = 0;4157array_elem = dtrace_strtoll(4158elem, 10, size);4159found_key = array_elem == 0 ?4160B_TRUE : B_FALSE;4161} else {4162found_key = B_FALSE;4163state = DTRACE_JSON_OBJECT;4164}4165break;4166}41674168/*4169* Otherwise, we wish to either skip this4170* nested object or return it in full.4171*/4172if (cc == '[')4173brackets = 1;4174else4175braces = 1;4176*dd++ = cc;4177state = DTRACE_JSON_COLLECT_OBJECT;4178break;4179}41804181if (cc == '"') {4182state = DTRACE_JSON_STRING;4183break;4184}41854186if (islower(cc)) {4187/*4188* Here we deal with true, false and null.4189*/4190*dd++ = cc;4191state = DTRACE_JSON_IDENTIFIER;4192break;4193}41944195if (cc == '-' || 
isdigit(cc)) {4196*dd++ = cc;4197state = DTRACE_JSON_NUMBER;4198break;4199}42004201/*4202* ERROR: unexpected character at start of value.4203*/4204return (NULL);4205case DTRACE_JSON_COLLECT_OBJECT:4206if (cc == '\0')4207/*4208* ERROR: unexpected end of input.4209*/4210return (NULL);42114212*dd++ = cc;4213if (cc == '"') {4214collect_object = B_TRUE;4215state = DTRACE_JSON_STRING;4216break;4217}42184219if (cc == ']') {4220if (brackets-- == 0) {4221/*4222* ERROR: unbalanced brackets.4223*/4224return (NULL);4225}4226} else if (cc == '}') {4227if (braces-- == 0) {4228/*4229* ERROR: unbalanced braces.4230*/4231return (NULL);4232}4233} else if (cc == '{') {4234braces++;4235} else if (cc == '[') {4236brackets++;4237}42384239if (brackets == 0 && braces == 0) {4240if (found_key) {4241*dd = '\0';4242return (dest);4243}4244dd = dest; /* reset string buffer */4245state = DTRACE_JSON_COMMA;4246}4247break;4248}4249}4250return (NULL);4251}42524253/*4254* Emulate the execution of DTrace ID subroutines invoked by the call opcode.4255* Notice that we don't bother validating the proper number of arguments or4256* their types in the tuple stack. 
This isn't needed because all argument4257* interpretation is safe because of our load safety -- the worst that can4258* happen is that a bogus program can obtain bogus results.4259*/4260static void4261dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,4262dtrace_key_t *tupregs, int nargs,4263dtrace_mstate_t *mstate, dtrace_state_t *state)4264{4265volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;4266volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;4267dtrace_vstate_t *vstate = &state->dts_vstate;42684269#ifdef illumos4270union {4271mutex_impl_t mi;4272uint64_t mx;4273} m;42744275union {4276krwlock_t ri;4277uintptr_t rw;4278} r;4279#else4280struct thread *lowner;4281union {4282struct lock_object *li;4283uintptr_t lx;4284} l;4285#endif42864287switch (subr) {4288case DIF_SUBR_RAND:4289regs[rd] = dtrace_xoroshiro128_plus_next(4290state->dts_rstate[curcpu]);4291break;42924293#ifdef illumos4294case DIF_SUBR_MUTEX_OWNED:4295if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),4296mstate, vstate)) {4297regs[rd] = 0;4298break;4299}43004301m.mx = dtrace_load64(tupregs[0].dttk_value);4302if (MUTEX_TYPE_ADAPTIVE(&m.mi))4303regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;4304else4305regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);4306break;43074308case DIF_SUBR_MUTEX_OWNER:4309if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),4310mstate, vstate)) {4311regs[rd] = 0;4312break;4313}43144315m.mx = dtrace_load64(tupregs[0].dttk_value);4316if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&4317MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)4318regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);4319else4320regs[rd] = 0;4321break;43224323case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:4324if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),4325mstate, vstate)) {4326regs[rd] = 0;4327break;4328}43294330m.mx = dtrace_load64(tupregs[0].dttk_value);4331regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);4332break;43334334case DIF_SUBR_MUTEX_TYPE_SPIN:4335if 
(!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),4336mstate, vstate)) {4337regs[rd] = 0;4338break;4339}43404341m.mx = dtrace_load64(tupregs[0].dttk_value);4342regs[rd] = MUTEX_TYPE_SPIN(&m.mi);4343break;43444345case DIF_SUBR_RW_READ_HELD: {4346uintptr_t tmp;43474348if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),4349mstate, vstate)) {4350regs[rd] = 0;4351break;4352}43534354r.rw = dtrace_loadptr(tupregs[0].dttk_value);4355regs[rd] = _RW_READ_HELD(&r.ri, tmp);4356break;4357}43584359case DIF_SUBR_RW_WRITE_HELD:4360if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),4361mstate, vstate)) {4362regs[rd] = 0;4363break;4364}43654366r.rw = dtrace_loadptr(tupregs[0].dttk_value);4367regs[rd] = _RW_WRITE_HELD(&r.ri);4368break;43694370case DIF_SUBR_RW_ISWRITER:4371if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),4372mstate, vstate)) {4373regs[rd] = 0;4374break;4375}43764377r.rw = dtrace_loadptr(tupregs[0].dttk_value);4378regs[rd] = _RW_ISWRITER(&r.ri);4379break;43804381#else /* !illumos */4382case DIF_SUBR_MUTEX_OWNED:4383if (!dtrace_canload(tupregs[0].dttk_value,4384sizeof (struct lock_object), mstate, vstate)) {4385regs[rd] = 0;4386break;4387}4388l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);4389DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4390regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);4391DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4392break;43934394case DIF_SUBR_MUTEX_OWNER:4395if (!dtrace_canload(tupregs[0].dttk_value,4396sizeof (struct lock_object), mstate, vstate)) {4397regs[rd] = 0;4398break;4399}4400l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);4401DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4402LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);4403DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4404regs[rd] = (uintptr_t)lowner;4405break;44064407case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:4408if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),4409mstate, vstate)) {4410regs[rd] = 0;4411break;4412}4413l.lx = 
dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);4414DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4415regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0;4416DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4417break;44184419case DIF_SUBR_MUTEX_TYPE_SPIN:4420if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),4421mstate, vstate)) {4422regs[rd] = 0;4423break;4424}4425l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);4426DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4427regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;4428DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4429break;44304431case DIF_SUBR_RW_READ_HELD:4432case DIF_SUBR_SX_SHARED_HELD:4433if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),4434mstate, vstate)) {4435regs[rd] = 0;4436break;4437}4438l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);4439DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4440regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&4441lowner == NULL;4442DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4443break;44444445case DIF_SUBR_RW_WRITE_HELD:4446case DIF_SUBR_SX_EXCLUSIVE_HELD:4447if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),4448mstate, vstate)) {4449regs[rd] = 0;4450break;4451}4452l.lx = dtrace_loadptr(tupregs[0].dttk_value);4453DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4454regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&4455lowner != NULL;4456DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4457break;44584459case DIF_SUBR_RW_ISWRITER:4460case DIF_SUBR_SX_ISEXCLUSIVE:4461if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),4462mstate, vstate)) {4463regs[rd] = 0;4464break;4465}4466l.lx = dtrace_loadptr(tupregs[0].dttk_value);4467DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4468LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);4469DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4470regs[rd] = (lowner == curthread);4471break;4472#endif /* illumos */44734474case DIF_SUBR_BCOPY: {4475/*4476* We need to be sure that the destination is in the scratch4477* region -- no other 
region is allowed.4478*/4479uintptr_t src = tupregs[0].dttk_value;4480uintptr_t dest = tupregs[1].dttk_value;4481size_t size = tupregs[2].dttk_value;44824483if (!dtrace_inscratch(dest, size, mstate)) {4484*flags |= CPU_DTRACE_BADADDR;4485*illval = regs[rd];4486break;4487}44884489if (!dtrace_canload(src, size, mstate, vstate)) {4490regs[rd] = 0;4491break;4492}44934494dtrace_bcopy((void *)src, (void *)dest, size);4495break;4496}44974498case DIF_SUBR_ALLOCA:4499case DIF_SUBR_COPYIN: {4500uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);4501uint64_t size =4502tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;4503size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;45044505/*4506* This action doesn't require any credential checks since4507* probes will not activate in user contexts to which the4508* enabling user does not have permissions.4509*/45104511/*4512* Rounding up the user allocation size could have overflowed4513* a large, bogus allocation (like -1ULL) to 0.4514*/4515if (scratch_size < size ||4516!DTRACE_INSCRATCH(mstate, scratch_size)) {4517DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);4518regs[rd] = 0;4519break;4520}45214522if (subr == DIF_SUBR_COPYIN) {4523DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4524dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);4525DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4526}45274528mstate->dtms_scratch_ptr += scratch_size;4529regs[rd] = dest;4530break;4531}45324533case DIF_SUBR_COPYINTO: {4534uint64_t size = tupregs[1].dttk_value;4535uintptr_t dest = tupregs[2].dttk_value;45364537/*4538* This action doesn't require any credential checks since4539* probes will not activate in user contexts to which the4540* enabling user does not have permissions.4541*/4542if (!dtrace_inscratch(dest, size, mstate)) {4543*flags |= CPU_DTRACE_BADADDR;4544*illval = regs[rd];4545break;4546}45474548DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4549dtrace_copyin(tupregs[0].dttk_value, dest, size, 
flags);4550DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4551break;4552}45534554case DIF_SUBR_COPYINSTR: {4555uintptr_t dest = mstate->dtms_scratch_ptr;4556uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];45574558if (nargs > 1 && tupregs[1].dttk_value < size)4559size = tupregs[1].dttk_value + 1;45604561/*4562* This action doesn't require any credential checks since4563* probes will not activate in user contexts to which the4564* enabling user does not have permissions.4565*/4566if (!DTRACE_INSCRATCH(mstate, size)) {4567DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);4568regs[rd] = 0;4569break;4570}45714572DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4573dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);4574DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);45754576((char *)dest)[size - 1] = '\0';4577mstate->dtms_scratch_ptr += size;4578regs[rd] = dest;4579break;4580}45814582#ifdef illumos4583case DIF_SUBR_MSGSIZE:4584case DIF_SUBR_MSGDSIZE: {4585uintptr_t baddr = tupregs[0].dttk_value, daddr;4586uintptr_t wptr, rptr;4587size_t count = 0;4588int cont = 0;45894590while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {45914592if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,4593vstate)) {4594regs[rd] = 0;4595break;4596}45974598wptr = dtrace_loadptr(baddr +4599offsetof(mblk_t, b_wptr));46004601rptr = dtrace_loadptr(baddr +4602offsetof(mblk_t, b_rptr));46034604if (wptr < rptr) {4605*flags |= CPU_DTRACE_BADADDR;4606*illval = tupregs[0].dttk_value;4607break;4608}46094610daddr = dtrace_loadptr(baddr +4611offsetof(mblk_t, b_datap));46124613baddr = dtrace_loadptr(baddr +4614offsetof(mblk_t, b_cont));46154616/*4617* We want to prevent against denial-of-service here,4618* so we're only going to search the list for4619* dtrace_msgdsize_max mblks.4620*/4621if (cont++ > dtrace_msgdsize_max) {4622*flags |= CPU_DTRACE_ILLOP;4623break;4624}46254626if (subr == DIF_SUBR_MSGDSIZE) {4627if (dtrace_load8(daddr +4628offsetof(dblk_t, db_type)) != M_DATA)4629continue;4630}46314632count += wptr - 
rptr;4633}46344635if (!(*flags & CPU_DTRACE_FAULT))4636regs[rd] = count;46374638break;4639}4640#endif46414642case DIF_SUBR_PROGENYOF: {4643pid_t pid = tupregs[0].dttk_value;4644proc_t *p;4645int rval = 0;46464647DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);46484649for (p = curthread->t_procp; p != NULL; p = p->p_parent) {4650#ifdef illumos4651if (p->p_pidp->pid_id == pid) {4652#else4653if (p->p_pid == pid) {4654#endif4655rval = 1;4656break;4657}4658}46594660DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);46614662regs[rd] = rval;4663break;4664}46654666case DIF_SUBR_SPECULATION:4667regs[rd] = dtrace_speculation(state);4668break;46694670case DIF_SUBR_COPYOUT: {4671uintptr_t kaddr = tupregs[0].dttk_value;4672uintptr_t uaddr = tupregs[1].dttk_value;4673uint64_t size = tupregs[2].dttk_value;46744675if (!dtrace_destructive_disallow &&4676dtrace_priv_proc_control(state) &&4677!dtrace_istoxic(kaddr, size) &&4678dtrace_canload(kaddr, size, mstate, vstate)) {4679DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4680dtrace_copyout(kaddr, uaddr, size, flags);4681DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4682}4683break;4684}46854686case DIF_SUBR_COPYOUTSTR: {4687uintptr_t kaddr = tupregs[0].dttk_value;4688uintptr_t uaddr = tupregs[1].dttk_value;4689uint64_t size = tupregs[2].dttk_value;4690size_t lim;46914692if (!dtrace_destructive_disallow &&4693dtrace_priv_proc_control(state) &&4694!dtrace_istoxic(kaddr, size) &&4695dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {4696DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);4697dtrace_copyoutstr(kaddr, uaddr, lim, flags);4698DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);4699}4700break;4701}47024703case DIF_SUBR_STRLEN: {4704size_t size = state->dts_options[DTRACEOPT_STRSIZE];4705uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;4706size_t lim;47074708if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {4709regs[rd] = 0;4710break;4711}47124713regs[rd] = dtrace_strlen((char *)addr, lim);4714break;4715}47164717case DIF_SUBR_STRCHR:4718case DIF_SUBR_STRRCHR: 
{4719/*4720* We're going to iterate over the string looking for the4721* specified character. We will iterate until we have reached4722* the string length or we have found the character. If this4723* is DIF_SUBR_STRRCHR, we will look for the last occurrence4724* of the specified character instead of the first.4725*/4726uintptr_t addr = tupregs[0].dttk_value;4727uintptr_t addr_limit;4728uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];4729size_t lim;4730char c, target = (char)tupregs[1].dttk_value;47314732if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {4733regs[rd] = 0;4734break;4735}4736addr_limit = addr + lim;47374738for (regs[rd] = 0; addr < addr_limit; addr++) {4739if ((c = dtrace_load8(addr)) == target) {4740regs[rd] = addr;47414742if (subr == DIF_SUBR_STRCHR)4743break;4744}47454746if (c == '\0')4747break;4748}4749break;4750}47514752case DIF_SUBR_STRSTR:4753case DIF_SUBR_INDEX:4754case DIF_SUBR_RINDEX: {4755/*4756* We're going to iterate over the string looking for the4757* specified string. We will iterate until we have reached4758* the string length or we have found the string. (Yes, this4759* is done in the most naive way possible -- but considering4760* that the string we're searching for is likely to be4761* relatively short, the complexity of Rabin-Karp or similar4762* hardly seems merited.)4763*/4764char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;4765char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;4766uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];4767size_t len = dtrace_strlen(addr, size);4768size_t sublen = dtrace_strlen(substr, size);4769char *limit = addr + len, *orig = addr;4770int notfound = subr == DIF_SUBR_STRSTR ? 
0 : -1;4771int inc = 1;47724773regs[rd] = notfound;47744775if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {4776regs[rd] = 0;4777break;4778}47794780if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,4781vstate)) {4782regs[rd] = 0;4783break;4784}47854786/*4787* strstr() and index()/rindex() have similar semantics if4788* both strings are the empty string: strstr() returns a4789* pointer to the (empty) string, and index() and rindex()4790* both return index 0 (regardless of any position argument).4791*/4792if (sublen == 0 && len == 0) {4793if (subr == DIF_SUBR_STRSTR)4794regs[rd] = (uintptr_t)addr;4795else4796regs[rd] = 0;4797break;4798}47994800if (subr != DIF_SUBR_STRSTR) {4801if (subr == DIF_SUBR_RINDEX) {4802limit = orig - 1;4803addr += len;4804inc = -1;4805}48064807/*4808* Both index() and rindex() take an optional position4809* argument that denotes the starting position.4810*/4811if (nargs == 3) {4812int64_t pos = (int64_t)tupregs[2].dttk_value;48134814/*4815* If the position argument to index() is4816* negative, Perl implicitly clamps it at4817* zero. This semantic is a little surprising4818* given the special meaning of negative4819* positions to similar Perl functions like4820* substr(), but it appears to reflect a4821* notion that index() can start from a4822* negative index and increment its way up to4823* the string. Given this notion, Perl's4824* rindex() is at least self-consistent in4825* that it implicitly clamps positions greater4826* than the string length to be the string4827* length. Where Perl completely loses4828* coherence, however, is when the specified4829* substring is the empty string (""). In4830* this case, even if the position is4831* negative, rindex() returns 0 -- and even if4832* the position is greater than the length,4833* index() returns the string length. 
These4834* semantics violate the notion that index()4835* should never return a value less than the4836* specified position and that rindex() should4837* never return a value greater than the4838* specified position. (One assumes that4839* these semantics are artifacts of Perl's4840* implementation and not the results of4841* deliberate design -- it beggars belief that4842* even Larry Wall could desire such oddness.)4843* While in the abstract one would wish for4844* consistent position semantics across4845* substr(), index() and rindex() -- or at the4846* very least self-consistent position4847* semantics for index() and rindex() -- we4848* instead opt to keep with the extant Perl4849* semantics, in all their broken glory. (Do4850* we have more desire to maintain Perl's4851* semantics than Perl does? Probably.)4852*/4853if (subr == DIF_SUBR_RINDEX) {4854if (pos < 0) {4855if (sublen == 0)4856regs[rd] = 0;4857break;4858}48594860if (pos > len)4861pos = len;4862} else {4863if (pos < 0)4864pos = 0;48654866if (pos >= len) {4867if (sublen == 0)4868regs[rd] = len;4869break;4870}4871}48724873addr = orig + pos;4874}4875}48764877for (regs[rd] = notfound; addr != limit; addr += inc) {4878if (dtrace_strncmp(addr, substr, sublen) == 0) {4879if (subr != DIF_SUBR_STRSTR) {4880/*4881* As D index() and rindex() are4882* modeled on Perl (and not on awk),4883* we return a zero-based (and not a4884* one-based) index. 
(For you Perl4885* weenies: no, we're not going to add4886* $[ -- and shouldn't you be at a con4887* or something?)4888*/4889regs[rd] = (uintptr_t)(addr - orig);4890break;4891}48924893ASSERT(subr == DIF_SUBR_STRSTR);4894regs[rd] = (uintptr_t)addr;4895break;4896}4897}48984899break;4900}49014902case DIF_SUBR_STRTOK: {4903uintptr_t addr = tupregs[0].dttk_value;4904uintptr_t tokaddr = tupregs[1].dttk_value;4905uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];4906uintptr_t limit, toklimit;4907size_t clim;4908uint8_t c = 0, tokmap[32]; /* 256 / 8 */4909char *dest = (char *)mstate->dtms_scratch_ptr;4910int i;49114912/*4913* Check both the token buffer and (later) the input buffer,4914* since both could be non-scratch addresses.4915*/4916if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {4917regs[rd] = 0;4918break;4919}4920toklimit = tokaddr + clim;49214922if (!DTRACE_INSCRATCH(mstate, size)) {4923DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);4924regs[rd] = 0;4925break;4926}49274928if (addr == 0) {4929/*4930* If the address specified is NULL, we use our saved4931* strtok pointer from the mstate. Note that this4932* means that the saved strtok pointer is _only_4933* valid within multiple enablings of the same probe --4934* it behaves like an implicit clause-local variable.4935*/4936addr = mstate->dtms_strtok;4937limit = mstate->dtms_strtok_limit;4938} else {4939/*4940* If the user-specified address is non-NULL we must4941* access check it. 
This is the only time we have4942* a chance to do so, since this address may reside4943* in the string table of this clause-- future calls4944* (when we fetch addr from mstate->dtms_strtok)4945* would fail this access check.4946*/4947if (!dtrace_strcanload(addr, size, &clim, mstate,4948vstate)) {4949regs[rd] = 0;4950break;4951}4952limit = addr + clim;4953}49544955/*4956* First, zero the token map, and then process the token4957* string -- setting a bit in the map for every character4958* found in the token string.4959*/4960for (i = 0; i < sizeof (tokmap); i++)4961tokmap[i] = 0;49624963for (; tokaddr < toklimit; tokaddr++) {4964if ((c = dtrace_load8(tokaddr)) == '\0')4965break;49664967ASSERT((c >> 3) < sizeof (tokmap));4968tokmap[c >> 3] |= (1 << (c & 0x7));4969}49704971for (; addr < limit; addr++) {4972/*4973* We're looking for a character that is _not_4974* contained in the token string.4975*/4976if ((c = dtrace_load8(addr)) == '\0')4977break;49784979if (!(tokmap[c >> 3] & (1 << (c & 0x7))))4980break;4981}49824983if (c == '\0') {4984/*4985* We reached the end of the string without finding4986* any character that was not in the token string.4987* We return NULL in this case, and we set the saved4988* address to NULL as well.4989*/4990regs[rd] = 0;4991mstate->dtms_strtok = 0;4992mstate->dtms_strtok_limit = 0;4993break;4994}49954996/*4997* From here on, we're copying into the destination string.4998*/4999for (i = 0; addr < limit && i < size - 1; addr++) {5000if ((c = dtrace_load8(addr)) == '\0')5001break;50025003if (tokmap[c >> 3] & (1 << (c & 0x7)))5004break;50055006ASSERT(i < size);5007dest[i++] = c;5008}50095010ASSERT(i < size);5011dest[i] = '\0';5012regs[rd] = (uintptr_t)dest;5013mstate->dtms_scratch_ptr += size;5014mstate->dtms_strtok = addr;5015mstate->dtms_strtok_limit = limit;5016break;5017}50185019case DIF_SUBR_SUBSTR: {5020uintptr_t s = tupregs[0].dttk_value;5021uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5022char *d = (char 
*)mstate->dtms_scratch_ptr;5023int64_t index = (int64_t)tupregs[1].dttk_value;5024int64_t remaining = (int64_t)tupregs[2].dttk_value;5025size_t len = dtrace_strlen((char *)s, size);5026int64_t i;50275028if (!dtrace_canload(s, len + 1, mstate, vstate)) {5029regs[rd] = 0;5030break;5031}50325033if (!DTRACE_INSCRATCH(mstate, size)) {5034DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5035regs[rd] = 0;5036break;5037}50385039if (nargs <= 2)5040remaining = (int64_t)size;50415042if (index < 0) {5043index += len;50445045if (index < 0 && index + remaining > 0) {5046remaining += index;5047index = 0;5048}5049}50505051if (index >= len || index < 0) {5052remaining = 0;5053} else if (remaining < 0) {5054remaining += len - index;5055} else if (index + remaining > size) {5056remaining = size - index;5057}50585059for (i = 0; i < remaining; i++) {5060if ((d[i] = dtrace_load8(s + index + i)) == '\0')5061break;5062}50635064d[i] = '\0';50655066mstate->dtms_scratch_ptr += size;5067regs[rd] = (uintptr_t)d;5068break;5069}50705071case DIF_SUBR_JSON: {5072uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5073uintptr_t json = tupregs[0].dttk_value;5074size_t jsonlen = dtrace_strlen((char *)json, size);5075uintptr_t elem = tupregs[1].dttk_value;5076size_t elemlen = dtrace_strlen((char *)elem, size);50775078char *dest = (char *)mstate->dtms_scratch_ptr;5079char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;5080char *ee = elemlist;5081int nelems = 1;5082uintptr_t cur;50835084if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||5085!dtrace_canload(elem, elemlen + 1, mstate, vstate)) {5086regs[rd] = 0;5087break;5088}50895090if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {5091DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5092regs[rd] = 0;5093break;5094}50955096/*5097* Read the element selector and split it up into a packed list5098* of strings.5099*/5100for (cur = elem; cur < elem + elemlen; cur++) {5101char cc = dtrace_load8(cur);51025103if (cur == elem && cc == '[') 
{5104/*5105* If the first element selector key is5106* actually an array index then ignore the5107* bracket.5108*/5109continue;5110}51115112if (cc == ']')5113continue;51145115if (cc == '.' || cc == '[') {5116nelems++;5117cc = '\0';5118}51195120*ee++ = cc;5121}5122*ee++ = '\0';51235124if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,5125nelems, dest)) != 0)5126mstate->dtms_scratch_ptr += jsonlen + 1;5127break;5128}51295130case DIF_SUBR_TOUPPER:5131case DIF_SUBR_TOLOWER: {5132uintptr_t s = tupregs[0].dttk_value;5133uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5134char *dest = (char *)mstate->dtms_scratch_ptr, c;5135size_t len = dtrace_strlen((char *)s, size);5136char lower, upper, convert;5137int64_t i;51385139if (subr == DIF_SUBR_TOUPPER) {5140lower = 'a';5141upper = 'z';5142convert = 'A';5143} else {5144lower = 'A';5145upper = 'Z';5146convert = 'a';5147}51485149if (!dtrace_canload(s, len + 1, mstate, vstate)) {5150regs[rd] = 0;5151break;5152}51535154if (!DTRACE_INSCRATCH(mstate, size)) {5155DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5156regs[rd] = 0;5157break;5158}51595160for (i = 0; i < size - 1; i++) {5161if ((c = dtrace_load8(s + i)) == '\0')5162break;51635164if (c >= lower && c <= upper)5165c = convert + (c - lower);51665167dest[i] = c;5168}51695170ASSERT(i < size);5171dest[i] = '\0';5172regs[rd] = (uintptr_t)dest;5173mstate->dtms_scratch_ptr += size;5174break;5175}51765177#ifdef illumos5178case DIF_SUBR_GETMAJOR:5179#ifdef _LP645180regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;5181#else5182regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;5183#endif5184break;51855186case DIF_SUBR_GETMINOR:5187#ifdef _LP645188regs[rd] = tupregs[0].dttk_value & MAXMIN64;5189#else5190regs[rd] = tupregs[0].dttk_value & MAXMIN;5191#endif5192break;51935194case DIF_SUBR_DDI_PATHNAME: {5195/*5196* This one is a galactic mess. 
We are going to roughly5197* emulate ddi_pathname(), but it's made more complicated5198* by the fact that we (a) want to include the minor name and5199* (b) must proceed iteratively instead of recursively.5200*/5201uintptr_t dest = mstate->dtms_scratch_ptr;5202uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5203char *start = (char *)dest, *end = start + size - 1;5204uintptr_t daddr = tupregs[0].dttk_value;5205int64_t minor = (int64_t)tupregs[1].dttk_value;5206char *s;5207int i, len, depth = 0;52085209/*5210* Due to all the pointer jumping we do and context we must5211* rely upon, we just mandate that the user must have kernel5212* read privileges to use this routine.5213*/5214if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {5215*flags |= CPU_DTRACE_KPRIV;5216*illval = daddr;5217regs[rd] = 0;5218}52195220if (!DTRACE_INSCRATCH(mstate, size)) {5221DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5222regs[rd] = 0;5223break;5224}52255226*end = '\0';52275228/*5229* We want to have a name for the minor. In order to do this,5230* we need to walk the minor list from the devinfo. We want5231* to be sure that we don't infinitely walk a circular list,5232* so we check for circularity by sending a scout pointer5233* ahead two elements for every element that we iterate over;5234* if the list is circular, these will ultimately point to the5235* same element. You may recognize this little trick as the5236* answer to a stupid interview question -- one that always5237* seems to be asked by those who had to have it laboriously5238* explained to them, and who can't even concisely describe5239* the conditions under which one would be forced to resort to5240* this technique. Needless to say, those conditions are5241* found here -- and probably only here. Is this the only use5242* of this infamous trick in shipping, production code? 
If it5243* isn't, it probably should be...5244*/5245if (minor != -1) {5246uintptr_t maddr = dtrace_loadptr(daddr +5247offsetof(struct dev_info, devi_minor));52485249uintptr_t next = offsetof(struct ddi_minor_data, next);5250uintptr_t name = offsetof(struct ddi_minor_data,5251d_minor) + offsetof(struct ddi_minor, name);5252uintptr_t dev = offsetof(struct ddi_minor_data,5253d_minor) + offsetof(struct ddi_minor, dev);5254uintptr_t scout;52555256if (maddr != NULL)5257scout = dtrace_loadptr(maddr + next);52585259while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {5260uint64_t m;5261#ifdef _LP645262m = dtrace_load64(maddr + dev) & MAXMIN64;5263#else5264m = dtrace_load32(maddr + dev) & MAXMIN;5265#endif5266if (m != minor) {5267maddr = dtrace_loadptr(maddr + next);52685269if (scout == NULL)5270continue;52715272scout = dtrace_loadptr(scout + next);52735274if (scout == NULL)5275continue;52765277scout = dtrace_loadptr(scout + next);52785279if (scout == NULL)5280continue;52815282if (scout == maddr) {5283*flags |= CPU_DTRACE_ILLOP;5284break;5285}52865287continue;5288}52895290/*5291* We have the minor data. 
Now we need to5292* copy the minor's name into the end of the5293* pathname.5294*/5295s = (char *)dtrace_loadptr(maddr + name);5296len = dtrace_strlen(s, size);52975298if (*flags & CPU_DTRACE_FAULT)5299break;53005301if (len != 0) {5302if ((end -= (len + 1)) < start)5303break;53045305*end = ':';5306}53075308for (i = 1; i <= len; i++)5309end[i] = dtrace_load8((uintptr_t)s++);5310break;5311}5312}53135314while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {5315ddi_node_state_t devi_state;53165317devi_state = dtrace_load32(daddr +5318offsetof(struct dev_info, devi_node_state));53195320if (*flags & CPU_DTRACE_FAULT)5321break;53225323if (devi_state >= DS_INITIALIZED) {5324s = (char *)dtrace_loadptr(daddr +5325offsetof(struct dev_info, devi_addr));5326len = dtrace_strlen(s, size);53275328if (*flags & CPU_DTRACE_FAULT)5329break;53305331if (len != 0) {5332if ((end -= (len + 1)) < start)5333break;53345335*end = '@';5336}53375338for (i = 1; i <= len; i++)5339end[i] = dtrace_load8((uintptr_t)s++);5340}53415342/*5343* Now for the node name...5344*/5345s = (char *)dtrace_loadptr(daddr +5346offsetof(struct dev_info, devi_node_name));53475348daddr = dtrace_loadptr(daddr +5349offsetof(struct dev_info, devi_parent));53505351/*5352* If our parent is NULL (that is, if we're the root5353* node), we're going to use the special path5354* "devices".5355*/5356if (daddr == 0)5357s = "devices";53585359len = dtrace_strlen(s, size);5360if (*flags & CPU_DTRACE_FAULT)5361break;53625363if ((end -= (len + 1)) < start)5364break;53655366for (i = 1; i <= len; i++)5367end[i] = dtrace_load8((uintptr_t)s++);5368*end = '/';53695370if (depth++ > dtrace_devdepth_max) {5371*flags |= CPU_DTRACE_ILLOP;5372break;5373}5374}53755376if (end < start)5377DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);53785379if (daddr == 0) {5380regs[rd] = (uintptr_t)end;5381mstate->dtms_scratch_ptr += size;5382}53835384break;5385}5386#endif53875388case DIF_SUBR_STRJOIN: {5389char *d = (char *)mstate->dtms_scratch_ptr;5390uint64_t size 
= state->dts_options[DTRACEOPT_STRSIZE];5391uintptr_t s1 = tupregs[0].dttk_value;5392uintptr_t s2 = tupregs[1].dttk_value;5393int i = 0, j = 0;5394size_t lim1, lim2;5395char c;53965397if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||5398!dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {5399regs[rd] = 0;5400break;5401}54025403if (!DTRACE_INSCRATCH(mstate, size)) {5404DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5405regs[rd] = 0;5406break;5407}54085409for (;;) {5410if (i >= size) {5411DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5412regs[rd] = 0;5413break;5414}5415c = (i >= lim1) ? '\0' : dtrace_load8(s1++);5416if ((d[i++] = c) == '\0') {5417i--;5418break;5419}5420}54215422for (;;) {5423if (i >= size) {5424DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5425regs[rd] = 0;5426break;5427}54285429c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);5430if ((d[i++] = c) == '\0')5431break;5432}54335434if (i < size) {5435mstate->dtms_scratch_ptr += i;5436regs[rd] = (uintptr_t)d;5437}54385439break;5440}54415442case DIF_SUBR_STRTOLL: {5443uintptr_t s = tupregs[0].dttk_value;5444uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5445size_t lim;5446int base = 10;54475448if (nargs > 1) {5449if ((base = tupregs[1].dttk_value) <= 1 ||5450base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {5451*flags |= CPU_DTRACE_ILLOP;5452break;5453}5454}54555456if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {5457regs[rd] = INT64_MIN;5458break;5459}54605461regs[rd] = dtrace_strtoll((char *)s, base, lim);5462break;5463}54645465case DIF_SUBR_LLTOSTR: {5466int64_t i = (int64_t)tupregs[0].dttk_value;5467uint64_t val, digit;5468uint64_t size = 65; /* enough room for 2^64 in binary */5469char *end = (char *)mstate->dtms_scratch_ptr + size - 1;5470int base = 10;54715472if (nargs > 1) {5473if ((base = tupregs[1].dttk_value) <= 1 ||5474base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {5475*flags |= CPU_DTRACE_ILLOP;5476break;5477}5478}54795480val = (base == 10 && i < 0) ? 
i * -1 : i;54815482if (!DTRACE_INSCRATCH(mstate, size)) {5483DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5484regs[rd] = 0;5485break;5486}54875488for (*end-- = '\0'; val; val /= base) {5489if ((digit = val % base) <= '9' - '0') {5490*end-- = '0' + digit;5491} else {5492*end-- = 'a' + (digit - ('9' - '0') - 1);5493}5494}54955496if (i == 0 && base == 16)5497*end-- = '0';54985499if (base == 16)5500*end-- = 'x';55015502if (i == 0 || base == 8 || base == 16)5503*end-- = '0';55045505if (i < 0 && base == 10)5506*end-- = '-';55075508regs[rd] = (uintptr_t)end + 1;5509mstate->dtms_scratch_ptr += size;5510break;5511}55125513case DIF_SUBR_HTONS:5514case DIF_SUBR_NTOHS:5515#if BYTE_ORDER == BIG_ENDIAN5516regs[rd] = (uint16_t)tupregs[0].dttk_value;5517#else5518regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);5519#endif5520break;552155225523case DIF_SUBR_HTONL:5524case DIF_SUBR_NTOHL:5525#if BYTE_ORDER == BIG_ENDIAN5526regs[rd] = (uint32_t)tupregs[0].dttk_value;5527#else5528regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);5529#endif5530break;553155325533case DIF_SUBR_HTONLL:5534case DIF_SUBR_NTOHLL:5535#if BYTE_ORDER == BIG_ENDIAN5536regs[rd] = (uint64_t)tupregs[0].dttk_value;5537#else5538regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);5539#endif5540break;554155425543case DIF_SUBR_DIRNAME:5544case DIF_SUBR_BASENAME: {5545char *dest = (char *)mstate->dtms_scratch_ptr;5546uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5547uintptr_t src = tupregs[0].dttk_value;5548int i, j, len = dtrace_strlen((char *)src, size);5549int lastbase = -1, firstbase = -1, lastdir = -1;5550int start, end;55515552if (!dtrace_canload(src, len + 1, mstate, vstate)) {5553regs[rd] = 0;5554break;5555}55565557if (!DTRACE_INSCRATCH(mstate, size)) {5558DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5559regs[rd] = 0;5560break;5561}55625563/*5564* The basename and dirname for a zero-length string is5565* defined to be "."5566*/5567if (len == 0) {5568len = 1;5569src = 
(uintptr_t)".";5570}55715572/*5573* Start from the back of the string, moving back toward the5574* front until we see a character that isn't a slash. That5575* character is the last character in the basename.5576*/5577for (i = len - 1; i >= 0; i--) {5578if (dtrace_load8(src + i) != '/')5579break;5580}55815582if (i >= 0)5583lastbase = i;55845585/*5586* Starting from the last character in the basename, move5587* towards the front until we find a slash. The character5588* that we processed immediately before that is the first5589* character in the basename.5590*/5591for (; i >= 0; i--) {5592if (dtrace_load8(src + i) == '/')5593break;5594}55955596if (i >= 0)5597firstbase = i + 1;55985599/*5600* Now keep going until we find a non-slash character. That5601* character is the last character in the dirname.5602*/5603for (; i >= 0; i--) {5604if (dtrace_load8(src + i) != '/')5605break;5606}56075608if (i >= 0)5609lastdir = i;56105611ASSERT(!(lastbase == -1 && firstbase != -1));5612ASSERT(!(firstbase == -1 && lastdir != -1));56135614if (lastbase == -1) {5615/*5616* We didn't find a non-slash character. We know that5617* the length is non-zero, so the whole string must be5618* slashes. In either the dirname or the basename5619* case, we return '/'.5620*/5621ASSERT(firstbase == -1);5622firstbase = lastbase = lastdir = 0;5623}56245625if (firstbase == -1) {5626/*5627* The entire string consists only of a basename5628* component. If we're looking for dirname, we need5629* to change our string to be just "."; if we're5630* looking for a basename, we'll just set the first5631* character of the basename to be 0.5632*/5633if (subr == DIF_SUBR_DIRNAME) {5634ASSERT(lastdir == -1);5635src = (uintptr_t)".";5636lastdir = 0;5637} else {5638firstbase = 0;5639}5640}56415642if (subr == DIF_SUBR_DIRNAME) {5643if (lastdir == -1) {5644/*5645* We know that we have a slash in the name --5646* or lastdir would be set to 0, above. 
And5647* because lastdir is -1, we know that this5648* slash must be the first character. (That5649* is, the full string must be of the form5650* "/basename".) In this case, the last5651* character of the directory name is 0.5652*/5653lastdir = 0;5654}56555656start = 0;5657end = lastdir;5658} else {5659ASSERT(subr == DIF_SUBR_BASENAME);5660ASSERT(firstbase != -1 && lastbase != -1);5661start = firstbase;5662end = lastbase;5663}56645665for (i = start, j = 0; i <= end && j < size - 1; i++, j++)5666dest[j] = dtrace_load8(src + i);56675668dest[j] = '\0';5669regs[rd] = (uintptr_t)dest;5670mstate->dtms_scratch_ptr += size;5671break;5672}56735674case DIF_SUBR_GETF: {5675uintptr_t fd = tupregs[0].dttk_value;5676struct filedesc *fdp;5677file_t *fp;56785679if (!dtrace_priv_proc(state)) {5680regs[rd] = 0;5681break;5682}5683fdp = curproc->p_fd;5684FILEDESC_SLOCK(fdp);5685/*5686* XXXMJG this looks broken as no ref is taken.5687*/5688fp = fget_noref(fdp, fd);5689mstate->dtms_getf = fp;5690regs[rd] = (uintptr_t)fp;5691FILEDESC_SUNLOCK(fdp);5692break;5693}56945695case DIF_SUBR_CLEANPATH: {5696char *dest = (char *)mstate->dtms_scratch_ptr, c;5697uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];5698uintptr_t src = tupregs[0].dttk_value;5699size_t lim;5700int i = 0, j = 0;5701#ifdef illumos5702zone_t *z;5703#endif57045705if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {5706regs[rd] = 0;5707break;5708}57095710if (!DTRACE_INSCRATCH(mstate, size)) {5711DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5712regs[rd] = 0;5713break;5714}57155716/*5717* Move forward, loading each character.5718*/5719do {5720c = (i >= lim) ? '\0' : dtrace_load8(src + i++);5721next:5722if (j + 5 >= size) /* 5 = strlen("/..c\0") */5723break;57245725if (c != '/') {5726dest[j++] = c;5727continue;5728}57295730c = (i >= lim) ? 
'\0' : dtrace_load8(src + i++);57315732if (c == '/') {5733/*5734* We have two slashes -- we can just advance5735* to the next character.5736*/5737goto next;5738}57395740if (c != '.') {5741/*5742* This is not "." and it's not ".." -- we can5743* just store the "/" and this character and5744* drive on.5745*/5746dest[j++] = '/';5747dest[j++] = c;5748continue;5749}57505751c = (i >= lim) ? '\0' : dtrace_load8(src + i++);57525753if (c == '/') {5754/*5755* This is a "/./" component. We're not going5756* to store anything in the destination buffer;5757* we're just going to go to the next component.5758*/5759goto next;5760}57615762if (c != '.') {5763/*5764* This is not ".." -- we can just store the5765* "/." and this character and continue5766* processing.5767*/5768dest[j++] = '/';5769dest[j++] = '.';5770dest[j++] = c;5771continue;5772}57735774c = (i >= lim) ? '\0' : dtrace_load8(src + i++);57755776if (c != '/' && c != '\0') {5777/*5778* This is not ".." -- it's "..[mumble]".5779* We'll store the "/.." and this character5780* and continue processing.5781*/5782dest[j++] = '/';5783dest[j++] = '.';5784dest[j++] = '.';5785dest[j++] = c;5786continue;5787}57885789/*5790* This is "/../" or "/..\0". We need to back up5791* our destination pointer until we find a "/".5792*/5793i--;5794while (j != 0 && dest[--j] != '/')5795continue;57965797if (c == '\0')5798dest[++j] = '/';5799} while (c != '\0');58005801dest[j] = '\0';58025803#ifdef illumos5804if (mstate->dtms_getf != NULL &&5805!(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&5806(z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {5807/*5808* If we've done a getf() as a part of this ECB and we5809* don't have kernel access (and we're not in the global5810* zone), check if the path we cleaned up begins with5811* the zone's root path, and trim it off if so. 
Note5812* that this is an output cleanliness issue, not a5813* security issue: knowing one's zone root path does5814* not enable privilege escalation.5815*/5816if (strstr(dest, z->zone_rootpath) == dest)5817dest += strlen(z->zone_rootpath) - 1;5818}5819#endif58205821regs[rd] = (uintptr_t)dest;5822mstate->dtms_scratch_ptr += size;5823break;5824}58255826case DIF_SUBR_INET_NTOA:5827case DIF_SUBR_INET_NTOA6:5828case DIF_SUBR_INET_NTOP: {5829size_t size;5830int af, argi, i;5831char *base, *end;58325833if (subr == DIF_SUBR_INET_NTOP) {5834af = (int)tupregs[0].dttk_value;5835argi = 1;5836} else {5837af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;5838argi = 0;5839}58405841if (af == AF_INET) {5842ipaddr_t ip4;5843uint8_t *ptr8, val;58445845if (!dtrace_canload(tupregs[argi].dttk_value,5846sizeof (ipaddr_t), mstate, vstate)) {5847regs[rd] = 0;5848break;5849}58505851/*5852* Safely load the IPv4 address.5853*/5854ip4 = dtrace_load32(tupregs[argi].dttk_value);58555856/*5857* Check an IPv4 string will fit in scratch.5858*/5859size = INET_ADDRSTRLEN;5860if (!DTRACE_INSCRATCH(mstate, size)) {5861DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5862regs[rd] = 0;5863break;5864}5865base = (char *)mstate->dtms_scratch_ptr;5866end = (char *)mstate->dtms_scratch_ptr + size - 1;58675868/*5869* Stringify as a dotted decimal quad.5870*/5871*end-- = '\0';5872ptr8 = (uint8_t *)&ip4;5873for (i = 3; i >= 0; i--) {5874val = ptr8[i];58755876if (val == 0) {5877*end-- = '0';5878} else {5879for (; val; val /= 10) {5880*end-- = '0' + (val % 10);5881}5882}58835884if (i > 0)5885*end-- = '.';5886}5887ASSERT(end + 1 >= base);58885889} else if (af == AF_INET6) {5890struct in6_addr ip6;5891int firstzero, tryzero, numzero, v6end;5892uint16_t val;5893const char digits[] = "0123456789abcdef";58945895/*5896* Stringify using RFC 1884 convention 2 - 16 bit5897* hexadecimal values with a zero-run compression.5898* Lower case hexadecimal digits are used.5899* eg, fe80::214:4fff:fe0b:76c8.5900* The IPv4 embedded form 
is returned for inet_ntop,5901* just the IPv4 string is returned for inet_ntoa6.5902*/59035904if (!dtrace_canload(tupregs[argi].dttk_value,5905sizeof (struct in6_addr), mstate, vstate)) {5906regs[rd] = 0;5907break;5908}59095910/*5911* Safely load the IPv6 address.5912*/5913dtrace_bcopy(5914(void *)(uintptr_t)tupregs[argi].dttk_value,5915(void *)(uintptr_t)&ip6, sizeof (struct in6_addr));59165917/*5918* Check an IPv6 string will fit in scratch.5919*/5920size = INET6_ADDRSTRLEN;5921if (!DTRACE_INSCRATCH(mstate, size)) {5922DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);5923regs[rd] = 0;5924break;5925}5926base = (char *)mstate->dtms_scratch_ptr;5927end = (char *)mstate->dtms_scratch_ptr + size - 1;5928*end-- = '\0';59295930/*5931* Find the longest run of 16 bit zero values5932* for the single allowed zero compression - "::".5933*/5934firstzero = -1;5935tryzero = -1;5936numzero = 1;5937for (i = 0; i < sizeof (struct in6_addr); i++) {5938#ifdef illumos5939if (ip6._S6_un._S6_u8[i] == 0 &&5940#else5941if (ip6.__u6_addr.__u6_addr8[i] == 0 &&5942#endif5943tryzero == -1 && i % 2 == 0) {5944tryzero = i;5945continue;5946}59475948if (tryzero != -1 &&5949#ifdef illumos5950(ip6._S6_un._S6_u8[i] != 0 ||5951#else5952(ip6.__u6_addr.__u6_addr8[i] != 0 ||5953#endif5954i == sizeof (struct in6_addr) - 1)) {59555956if (i - tryzero <= numzero) {5957tryzero = -1;5958continue;5959}59605961firstzero = tryzero;5962numzero = i - i % 2 - tryzero;5963tryzero = -1;59645965#ifdef illumos5966if (ip6._S6_un._S6_u8[i] == 0 &&5967#else5968if (ip6.__u6_addr.__u6_addr8[i] == 0 &&5969#endif5970i == sizeof (struct in6_addr) - 1)5971numzero += 2;5972}5973}5974ASSERT(firstzero + numzero <= sizeof (struct in6_addr));59755976/*5977* Check for an IPv4 embedded address.5978*/5979v6end = sizeof (struct in6_addr) - 2;5980if (IN6_IS_ADDR_V4MAPPED(&ip6) ||5981IN6_IS_ADDR_V4COMPAT(&ip6)) {5982for (i = sizeof (struct in6_addr) - 1;5983i >= DTRACE_V4MAPPED_OFFSET; i--) {5984ASSERT(end >= base);59855986#ifdef illumos5987val 
= ip6._S6_un._S6_u8[i];5988#else5989val = ip6.__u6_addr.__u6_addr8[i];5990#endif59915992if (val == 0) {5993*end-- = '0';5994} else {5995for (; val; val /= 10) {5996*end-- = '0' + val % 10;5997}5998}59996000if (i > DTRACE_V4MAPPED_OFFSET)6001*end-- = '.';6002}60036004if (subr == DIF_SUBR_INET_NTOA6)6005goto inetout;60066007/*6008* Set v6end to skip the IPv4 address that6009* we have already stringified.6010*/6011v6end = 10;6012}60136014/*6015* Build the IPv6 string by working through the6016* address in reverse.6017*/6018for (i = v6end; i >= 0; i -= 2) {6019ASSERT(end >= base);60206021if (i == firstzero + numzero - 2) {6022*end-- = ':';6023*end-- = ':';6024i -= numzero - 2;6025continue;6026}60276028if (i < 14 && i != firstzero - 2)6029*end-- = ':';60306031#ifdef illumos6032val = (ip6._S6_un._S6_u8[i] << 8) +6033ip6._S6_un._S6_u8[i + 1];6034#else6035val = (ip6.__u6_addr.__u6_addr8[i] << 8) +6036ip6.__u6_addr.__u6_addr8[i + 1];6037#endif60386039if (val == 0) {6040*end-- = '0';6041} else {6042for (; val; val /= 16) {6043*end-- = digits[val % 16];6044}6045}6046}6047ASSERT(end + 1 >= base);60486049} else {6050/*6051* The user didn't use AH_INET or AH_INET6.6052*/6053DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);6054regs[rd] = 0;6055break;6056}60576058inetout: regs[rd] = (uintptr_t)end + 1;6059mstate->dtms_scratch_ptr += size;6060break;6061}60626063case DIF_SUBR_MEMREF: {6064uintptr_t size = 2 * sizeof(uintptr_t);6065uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));6066size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;60676068/* address and length */6069memref[0] = tupregs[0].dttk_value;6070memref[1] = tupregs[1].dttk_value;60716072regs[rd] = (uintptr_t) memref;6073mstate->dtms_scratch_ptr += scratch_size;6074break;6075}60766077#ifndef illumos6078case DIF_SUBR_MEMSTR: {6079char *str = (char *)mstate->dtms_scratch_ptr;6080uintptr_t mem = tupregs[0].dttk_value;6081char c = tupregs[1].dttk_value;6082size_t size = 
tupregs[2].dttk_value;6083uint8_t n;6084int i;60856086regs[rd] = 0;60876088if (size == 0)6089break;60906091if (!dtrace_canload(mem, size - 1, mstate, vstate))6092break;60936094if (!DTRACE_INSCRATCH(mstate, size)) {6095DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);6096break;6097}60986099if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {6100*flags |= CPU_DTRACE_ILLOP;6101break;6102}61036104for (i = 0; i < size - 1; i++) {6105n = dtrace_load8(mem++);6106str[i] = (n == 0) ? c : n;6107}6108str[size - 1] = 0;61096110regs[rd] = (uintptr_t)str;6111mstate->dtms_scratch_ptr += size;6112break;6113}6114#endif6115}6116}61176118/*6119* Emulate the execution of DTrace IR instructions specified by the given6120* DIF object. This function is deliberately void of assertions as all of6121* the necessary checks are handled by a call to dtrace_difo_validate().6122*/6123static uint64_t6124dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,6125dtrace_vstate_t *vstate, dtrace_state_t *state)6126{6127const dif_instr_t *text = difo->dtdo_buf;6128const uint_t textlen = difo->dtdo_len;6129const char *strtab = difo->dtdo_strtab;6130const uint64_t *inttab = difo->dtdo_inttab;61316132uint64_t rval = 0;6133dtrace_statvar_t *svar;6134dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;6135dtrace_difv_t *v;6136volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;6137volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;61386139dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */6140uint64_t regs[DIF_DIR_NREGS];6141uint64_t *tmp;61426143uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;6144int64_t cc_r;6145uint_t pc = 0, id, opc = 0;6146uint8_t ttop = 0;6147dif_instr_t instr;6148uint_t r1, r2, rd;61496150/*6151* We stash the current DIF object into the machine state: we need it6152* for subsequent access checking.6153*/6154mstate->dtms_difo = difo;61556156regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */61576158while (pc < textlen && !(*flags & 
CPU_DTRACE_FAULT)) {6159opc = pc;61606161instr = text[pc++];6162r1 = DIF_INSTR_R1(instr);6163r2 = DIF_INSTR_R2(instr);6164rd = DIF_INSTR_RD(instr);61656166switch (DIF_INSTR_OP(instr)) {6167case DIF_OP_OR:6168regs[rd] = regs[r1] | regs[r2];6169break;6170case DIF_OP_XOR:6171regs[rd] = regs[r1] ^ regs[r2];6172break;6173case DIF_OP_AND:6174regs[rd] = regs[r1] & regs[r2];6175break;6176case DIF_OP_SLL:6177regs[rd] = regs[r1] << regs[r2];6178break;6179case DIF_OP_SRL:6180regs[rd] = regs[r1] >> regs[r2];6181break;6182case DIF_OP_SUB:6183regs[rd] = regs[r1] - regs[r2];6184break;6185case DIF_OP_ADD:6186regs[rd] = regs[r1] + regs[r2];6187break;6188case DIF_OP_MUL:6189regs[rd] = regs[r1] * regs[r2];6190break;6191case DIF_OP_SDIV:6192if (regs[r2] == 0) {6193regs[rd] = 0;6194*flags |= CPU_DTRACE_DIVZERO;6195} else {6196regs[rd] = (int64_t)regs[r1] /6197(int64_t)regs[r2];6198}6199break;62006201case DIF_OP_UDIV:6202if (regs[r2] == 0) {6203regs[rd] = 0;6204*flags |= CPU_DTRACE_DIVZERO;6205} else {6206regs[rd] = regs[r1] / regs[r2];6207}6208break;62096210case DIF_OP_SREM:6211if (regs[r2] == 0) {6212regs[rd] = 0;6213*flags |= CPU_DTRACE_DIVZERO;6214} else {6215regs[rd] = (int64_t)regs[r1] %6216(int64_t)regs[r2];6217}6218break;62196220case DIF_OP_UREM:6221if (regs[r2] == 0) {6222regs[rd] = 0;6223*flags |= CPU_DTRACE_DIVZERO;6224} else {6225regs[rd] = regs[r1] % regs[r2];6226}6227break;62286229case DIF_OP_NOT:6230regs[rd] = ~regs[r1];6231break;6232case DIF_OP_MOV:6233regs[rd] = regs[r1];6234break;6235case DIF_OP_CMP:6236cc_r = regs[r1] - regs[r2];6237cc_n = cc_r < 0;6238cc_z = cc_r == 0;6239cc_v = 0;6240cc_c = regs[r1] < regs[r2];6241break;6242case DIF_OP_TST:6243cc_n = cc_v = cc_c = 0;6244cc_z = regs[r1] == 0;6245break;6246case DIF_OP_BA:6247pc = DIF_INSTR_LABEL(instr);6248break;6249case DIF_OP_BE:6250if (cc_z)6251pc = DIF_INSTR_LABEL(instr);6252break;6253case DIF_OP_BNE:6254if (cc_z == 0)6255pc = DIF_INSTR_LABEL(instr);6256break;6257case DIF_OP_BG:6258if ((cc_z | (cc_n ^ cc_v)) == 
0)6259pc = DIF_INSTR_LABEL(instr);6260break;6261case DIF_OP_BGU:6262if ((cc_c | cc_z) == 0)6263pc = DIF_INSTR_LABEL(instr);6264break;6265case DIF_OP_BGE:6266if ((cc_n ^ cc_v) == 0)6267pc = DIF_INSTR_LABEL(instr);6268break;6269case DIF_OP_BGEU:6270if (cc_c == 0)6271pc = DIF_INSTR_LABEL(instr);6272break;6273case DIF_OP_BL:6274if (cc_n ^ cc_v)6275pc = DIF_INSTR_LABEL(instr);6276break;6277case DIF_OP_BLU:6278if (cc_c)6279pc = DIF_INSTR_LABEL(instr);6280break;6281case DIF_OP_BLE:6282if (cc_z | (cc_n ^ cc_v))6283pc = DIF_INSTR_LABEL(instr);6284break;6285case DIF_OP_BLEU:6286if (cc_c | cc_z)6287pc = DIF_INSTR_LABEL(instr);6288break;6289case DIF_OP_RLDSB:6290if (!dtrace_canload(regs[r1], 1, mstate, vstate))6291break;6292/*FALLTHROUGH*/6293case DIF_OP_LDSB:6294regs[rd] = (int8_t)dtrace_load8(regs[r1]);6295break;6296case DIF_OP_RLDSH:6297if (!dtrace_canload(regs[r1], 2, mstate, vstate))6298break;6299/*FALLTHROUGH*/6300case DIF_OP_LDSH:6301regs[rd] = (int16_t)dtrace_load16(regs[r1]);6302break;6303case DIF_OP_RLDSW:6304if (!dtrace_canload(regs[r1], 4, mstate, vstate))6305break;6306/*FALLTHROUGH*/6307case DIF_OP_LDSW:6308regs[rd] = (int32_t)dtrace_load32(regs[r1]);6309break;6310case DIF_OP_RLDUB:6311if (!dtrace_canload(regs[r1], 1, mstate, vstate))6312break;6313/*FALLTHROUGH*/6314case DIF_OP_LDUB:6315regs[rd] = dtrace_load8(regs[r1]);6316break;6317case DIF_OP_RLDUH:6318if (!dtrace_canload(regs[r1], 2, mstate, vstate))6319break;6320/*FALLTHROUGH*/6321case DIF_OP_LDUH:6322regs[rd] = dtrace_load16(regs[r1]);6323break;6324case DIF_OP_RLDUW:6325if (!dtrace_canload(regs[r1], 4, mstate, vstate))6326break;6327/*FALLTHROUGH*/6328case DIF_OP_LDUW:6329regs[rd] = dtrace_load32(regs[r1]);6330break;6331case DIF_OP_RLDX:6332if (!dtrace_canload(regs[r1], 8, mstate, vstate))6333break;6334/*FALLTHROUGH*/6335case DIF_OP_LDX:6336regs[rd] = dtrace_load64(regs[r1]);6337break;6338case DIF_OP_ULDSB:6339DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6340regs[rd] = (int8_t)6341dtrace_fuword8((void 
*)(uintptr_t)regs[r1]);6342DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6343break;6344case DIF_OP_ULDSH:6345DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6346regs[rd] = (int16_t)6347dtrace_fuword16((void *)(uintptr_t)regs[r1]);6348DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6349break;6350case DIF_OP_ULDSW:6351DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6352regs[rd] = (int32_t)6353dtrace_fuword32((void *)(uintptr_t)regs[r1]);6354DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6355break;6356case DIF_OP_ULDUB:6357DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6358regs[rd] =6359dtrace_fuword8((void *)(uintptr_t)regs[r1]);6360DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6361break;6362case DIF_OP_ULDUH:6363DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6364regs[rd] =6365dtrace_fuword16((void *)(uintptr_t)regs[r1]);6366DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6367break;6368case DIF_OP_ULDUW:6369DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6370regs[rd] =6371dtrace_fuword32((void *)(uintptr_t)regs[r1]);6372DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6373break;6374case DIF_OP_ULDX:6375DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);6376regs[rd] =6377dtrace_fuword64((void *)(uintptr_t)regs[r1]);6378DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);6379break;6380case DIF_OP_RET:6381rval = regs[rd];6382pc = textlen;6383break;6384case DIF_OP_NOP:6385break;6386case DIF_OP_SETX:6387regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];6388break;6389case DIF_OP_SETS:6390regs[rd] = (uint64_t)(uintptr_t)6391(strtab + DIF_INSTR_STRING(instr));6392break;6393case DIF_OP_SCMP: {6394size_t sz = state->dts_options[DTRACEOPT_STRSIZE];6395uintptr_t s1 = regs[r1];6396uintptr_t s2 = regs[r2];6397size_t lim1, lim2;63986399/*6400* If one of the strings is NULL then the limit becomes6401* 0 which compares 0 characters in dtrace_strncmp()6402* resulting in a false positive. 
dtrace_strncmp()6403* treats a NULL as an empty 1-char string.6404*/6405lim1 = lim2 = 1;64066407if (s1 != 0 &&6408!dtrace_strcanload(s1, sz, &lim1, mstate, vstate))6409break;6410if (s2 != 0 &&6411!dtrace_strcanload(s2, sz, &lim2, mstate, vstate))6412break;64136414cc_r = dtrace_strncmp((char *)s1, (char *)s2,6415MIN(lim1, lim2));64166417cc_n = cc_r < 0;6418cc_z = cc_r == 0;6419cc_v = cc_c = 0;6420break;6421}6422case DIF_OP_LDGA:6423regs[rd] = dtrace_dif_variable(mstate, state,6424r1, regs[r2]);6425break;6426case DIF_OP_LDGS:6427id = DIF_INSTR_VAR(instr);64286429if (id >= DIF_VAR_OTHER_UBASE) {6430uintptr_t a;64316432id -= DIF_VAR_OTHER_UBASE;6433svar = vstate->dtvs_globals[id];6434ASSERT(svar != NULL);6435v = &svar->dtsv_var;64366437if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {6438regs[rd] = svar->dtsv_data;6439break;6440}64416442a = (uintptr_t)svar->dtsv_data;64436444if (*(uint8_t *)a == UINT8_MAX) {6445/*6446* If the 0th byte is set to UINT8_MAX6447* then this is to be treated as a6448* reference to a NULL variable.6449*/6450regs[rd] = 0;6451} else {6452regs[rd] = a + sizeof (uint64_t);6453}64546455break;6456}64576458regs[rd] = dtrace_dif_variable(mstate, state, id, 0);6459break;64606461case DIF_OP_STGS:6462id = DIF_INSTR_VAR(instr);64636464ASSERT(id >= DIF_VAR_OTHER_UBASE);6465id -= DIF_VAR_OTHER_UBASE;64666467VERIFY(id < vstate->dtvs_nglobals);6468svar = vstate->dtvs_globals[id];6469ASSERT(svar != NULL);6470v = &svar->dtsv_var;64716472if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6473uintptr_t a = (uintptr_t)svar->dtsv_data;6474size_t lim;64756476ASSERT(a != 0);6477ASSERT(svar->dtsv_size != 0);64786479if (regs[rd] == 0) {6480*(uint8_t *)a = UINT8_MAX;6481break;6482} else {6483*(uint8_t *)a = 0;6484a += sizeof (uint64_t);6485}6486if (!dtrace_vcanload(6487(void *)(uintptr_t)regs[rd], &v->dtdv_type,6488&lim, mstate, vstate))6489break;64906491dtrace_vcopy((void *)(uintptr_t)regs[rd],6492(void *)a, &v->dtdv_type, lim);6493break;6494}64956496svar->dtsv_data = 
regs[rd];6497break;64986499case DIF_OP_LDTA:6500/*6501* There are no DTrace built-in thread-local arrays at6502* present. This opcode is saved for future work.6503*/6504*flags |= CPU_DTRACE_ILLOP;6505regs[rd] = 0;6506break;65076508case DIF_OP_LDLS:6509id = DIF_INSTR_VAR(instr);65106511if (id < DIF_VAR_OTHER_UBASE) {6512/*6513* For now, this has no meaning.6514*/6515regs[rd] = 0;6516break;6517}65186519id -= DIF_VAR_OTHER_UBASE;65206521ASSERT(id < vstate->dtvs_nlocals);6522ASSERT(vstate->dtvs_locals != NULL);65236524svar = vstate->dtvs_locals[id];6525ASSERT(svar != NULL);6526v = &svar->dtsv_var;65276528if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6529uintptr_t a = (uintptr_t)svar->dtsv_data;6530size_t sz = v->dtdv_type.dtdt_size;6531size_t lim;65326533sz += sizeof (uint64_t);6534ASSERT(svar->dtsv_size == (mp_maxid + 1) * sz);6535a += curcpu * sz;65366537if (*(uint8_t *)a == UINT8_MAX) {6538/*6539* If the 0th byte is set to UINT8_MAX6540* then this is to be treated as a6541* reference to a NULL variable.6542*/6543regs[rd] = 0;6544} else {6545regs[rd] = a + sizeof (uint64_t);6546}65476548break;6549}65506551ASSERT(svar->dtsv_size ==6552(mp_maxid + 1) * sizeof (uint64_t));6553tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;6554regs[rd] = tmp[curcpu];6555break;65566557case DIF_OP_STLS:6558id = DIF_INSTR_VAR(instr);65596560ASSERT(id >= DIF_VAR_OTHER_UBASE);6561id -= DIF_VAR_OTHER_UBASE;6562VERIFY(id < vstate->dtvs_nlocals);65636564ASSERT(vstate->dtvs_locals != NULL);6565svar = vstate->dtvs_locals[id];6566ASSERT(svar != NULL);6567v = &svar->dtsv_var;65686569if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6570uintptr_t a = (uintptr_t)svar->dtsv_data;6571size_t sz = v->dtdv_type.dtdt_size;6572size_t lim;65736574sz += sizeof (uint64_t);6575ASSERT(svar->dtsv_size == (mp_maxid + 1) * sz);6576a += curcpu * sz;65776578if (regs[rd] == 0) {6579*(uint8_t *)a = UINT8_MAX;6580break;6581} else {6582*(uint8_t *)a = 0;6583a += sizeof (uint64_t);6584}65856586if (!dtrace_vcanload(6587(void 
*)(uintptr_t)regs[rd], &v->dtdv_type,6588&lim, mstate, vstate))6589break;65906591dtrace_vcopy((void *)(uintptr_t)regs[rd],6592(void *)a, &v->dtdv_type, lim);6593break;6594}65956596ASSERT(svar->dtsv_size ==6597(mp_maxid + 1) * sizeof (uint64_t));6598tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;6599tmp[curcpu] = regs[rd];6600break;66016602case DIF_OP_LDTS: {6603dtrace_dynvar_t *dvar;6604dtrace_key_t *key;66056606id = DIF_INSTR_VAR(instr);6607ASSERT(id >= DIF_VAR_OTHER_UBASE);6608id -= DIF_VAR_OTHER_UBASE;6609v = &vstate->dtvs_tlocals[id];66106611key = &tupregs[DIF_DTR_NREGS];6612key[0].dttk_value = (uint64_t)id;6613key[0].dttk_size = 0;6614DTRACE_TLS_THRKEY(key[1].dttk_value);6615key[1].dttk_size = 0;66166617dvar = dtrace_dynvar(dstate, 2, key,6618sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,6619mstate, vstate);66206621if (dvar == NULL) {6622regs[rd] = 0;6623break;6624}66256626if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6627regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;6628} else {6629regs[rd] = *((uint64_t *)dvar->dtdv_data);6630}66316632break;6633}66346635case DIF_OP_STTS: {6636dtrace_dynvar_t *dvar;6637dtrace_key_t *key;66386639id = DIF_INSTR_VAR(instr);6640ASSERT(id >= DIF_VAR_OTHER_UBASE);6641id -= DIF_VAR_OTHER_UBASE;6642VERIFY(id < vstate->dtvs_ntlocals);66436644key = &tupregs[DIF_DTR_NREGS];6645key[0].dttk_value = (uint64_t)id;6646key[0].dttk_size = 0;6647DTRACE_TLS_THRKEY(key[1].dttk_value);6648key[1].dttk_size = 0;6649v = &vstate->dtvs_tlocals[id];66506651dvar = dtrace_dynvar(dstate, 2, key,6652v->dtdv_type.dtdt_size > sizeof (uint64_t) ?6653v->dtdv_type.dtdt_size : sizeof (uint64_t),6654regs[rd] ? 
DTRACE_DYNVAR_ALLOC :6655DTRACE_DYNVAR_DEALLOC, mstate, vstate);66566657/*6658* Given that we're storing to thread-local data,6659* we need to flush our predicate cache.6660*/6661curthread->t_predcache = 0;66626663if (dvar == NULL)6664break;66656666if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6667size_t lim;66686669if (!dtrace_vcanload(6670(void *)(uintptr_t)regs[rd],6671&v->dtdv_type, &lim, mstate, vstate))6672break;66736674dtrace_vcopy((void *)(uintptr_t)regs[rd],6675dvar->dtdv_data, &v->dtdv_type, lim);6676} else {6677*((uint64_t *)dvar->dtdv_data) = regs[rd];6678}66796680break;6681}66826683case DIF_OP_SRA:6684regs[rd] = (int64_t)regs[r1] >> regs[r2];6685break;66866687case DIF_OP_CALL:6688dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,6689regs, tupregs, ttop, mstate, state);6690break;66916692case DIF_OP_PUSHTR:6693if (ttop == DIF_DTR_NREGS) {6694*flags |= CPU_DTRACE_TUPOFLOW;6695break;6696}66976698if (r1 == DIF_TYPE_STRING) {6699/*6700* If this is a string type and the size is 0,6701* we'll use the system-wide default string6702* size. Note that we are _not_ looking at6703* the value of the DTRACEOPT_STRSIZE option;6704* had this been set, we would expect to have6705* a non-zero size value in the "pushtr".6706*/6707tupregs[ttop].dttk_size =6708dtrace_strlen((char *)(uintptr_t)regs[rd],6709regs[r2] ? 
regs[r2] :6710dtrace_strsize_default) + 1;6711} else {6712if (regs[r2] > LONG_MAX) {6713*flags |= CPU_DTRACE_ILLOP;6714break;6715}67166717tupregs[ttop].dttk_size = regs[r2];6718}67196720tupregs[ttop++].dttk_value = regs[rd];6721break;67226723case DIF_OP_PUSHTV:6724if (ttop == DIF_DTR_NREGS) {6725*flags |= CPU_DTRACE_TUPOFLOW;6726break;6727}67286729tupregs[ttop].dttk_value = regs[rd];6730tupregs[ttop++].dttk_size = 0;6731break;67326733case DIF_OP_POPTS:6734if (ttop != 0)6735ttop--;6736break;67376738case DIF_OP_FLUSHTS:6739ttop = 0;6740break;67416742case DIF_OP_LDGAA:6743case DIF_OP_LDTAA: {6744dtrace_dynvar_t *dvar;6745dtrace_key_t *key = tupregs;6746uint_t nkeys = ttop;67476748id = DIF_INSTR_VAR(instr);6749ASSERT(id >= DIF_VAR_OTHER_UBASE);6750id -= DIF_VAR_OTHER_UBASE;67516752key[nkeys].dttk_value = (uint64_t)id;6753key[nkeys++].dttk_size = 0;67546755if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {6756DTRACE_TLS_THRKEY(key[nkeys].dttk_value);6757key[nkeys++].dttk_size = 0;6758VERIFY(id < vstate->dtvs_ntlocals);6759v = &vstate->dtvs_tlocals[id];6760} else {6761VERIFY(id < vstate->dtvs_nglobals);6762v = &vstate->dtvs_globals[id]->dtsv_var;6763}67646765dvar = dtrace_dynvar(dstate, nkeys, key,6766v->dtdv_type.dtdt_size > sizeof (uint64_t) ?6767v->dtdv_type.dtdt_size : sizeof (uint64_t),6768DTRACE_DYNVAR_NOALLOC, mstate, vstate);67696770if (dvar == NULL) {6771regs[rd] = 0;6772break;6773}67746775if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6776regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;6777} else {6778regs[rd] = *((uint64_t *)dvar->dtdv_data);6779}67806781break;6782}67836784case DIF_OP_STGAA:6785case DIF_OP_STTAA: {6786dtrace_dynvar_t *dvar;6787dtrace_key_t *key = tupregs;6788uint_t nkeys = ttop;67896790id = DIF_INSTR_VAR(instr);6791ASSERT(id >= DIF_VAR_OTHER_UBASE);6792id -= DIF_VAR_OTHER_UBASE;67936794key[nkeys].dttk_value = (uint64_t)id;6795key[nkeys++].dttk_size = 0;67966797if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) 
{6798DTRACE_TLS_THRKEY(key[nkeys].dttk_value);6799key[nkeys++].dttk_size = 0;6800VERIFY(id < vstate->dtvs_ntlocals);6801v = &vstate->dtvs_tlocals[id];6802} else {6803VERIFY(id < vstate->dtvs_nglobals);6804v = &vstate->dtvs_globals[id]->dtsv_var;6805}68066807dvar = dtrace_dynvar(dstate, nkeys, key,6808v->dtdv_type.dtdt_size > sizeof (uint64_t) ?6809v->dtdv_type.dtdt_size : sizeof (uint64_t),6810regs[rd] ? DTRACE_DYNVAR_ALLOC :6811DTRACE_DYNVAR_DEALLOC, mstate, vstate);68126813if (dvar == NULL)6814break;68156816if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {6817size_t lim;68186819if (!dtrace_vcanload(6820(void *)(uintptr_t)regs[rd], &v->dtdv_type,6821&lim, mstate, vstate))6822break;68236824dtrace_vcopy((void *)(uintptr_t)regs[rd],6825dvar->dtdv_data, &v->dtdv_type, lim);6826} else {6827*((uint64_t *)dvar->dtdv_data) = regs[rd];6828}68296830break;6831}68326833case DIF_OP_ALLOCS: {6834uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);6835size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];68366837/*6838* Rounding up the user allocation size could have6839* overflowed large, bogus allocations (like -1ULL) to6840* 0.6841*/6842if (size < regs[r1] ||6843!DTRACE_INSCRATCH(mstate, size)) {6844DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);6845regs[rd] = 0;6846break;6847}68486849dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);6850mstate->dtms_scratch_ptr += size;6851regs[rd] = ptr;6852break;6853}68546855case DIF_OP_COPYS:6856if (!dtrace_canstore(regs[rd], regs[r2],6857mstate, vstate)) {6858*flags |= CPU_DTRACE_BADADDR;6859*illval = regs[rd];6860break;6861}68626863if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))6864break;68656866dtrace_bcopy((void *)(uintptr_t)regs[r1],6867(void *)(uintptr_t)regs[rd], (size_t)regs[r2]);6868break;68696870case DIF_OP_STB:6871if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {6872*flags |= CPU_DTRACE_BADADDR;6873*illval = regs[rd];6874break;6875}6876*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];6877break;68786879case 
DIF_OP_STH:6880if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {6881*flags |= CPU_DTRACE_BADADDR;6882*illval = regs[rd];6883break;6884}6885if (regs[rd] & 1) {6886*flags |= CPU_DTRACE_BADALIGN;6887*illval = regs[rd];6888break;6889}6890*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];6891break;68926893case DIF_OP_STW:6894if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {6895*flags |= CPU_DTRACE_BADADDR;6896*illval = regs[rd];6897break;6898}6899if (regs[rd] & 3) {6900*flags |= CPU_DTRACE_BADALIGN;6901*illval = regs[rd];6902break;6903}6904*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];6905break;69066907case DIF_OP_STX:6908if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {6909*flags |= CPU_DTRACE_BADADDR;6910*illval = regs[rd];6911break;6912}6913if (regs[rd] & 7) {6914*flags |= CPU_DTRACE_BADALIGN;6915*illval = regs[rd];6916break;6917}6918*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];6919break;6920}6921}69226923if (!(*flags & CPU_DTRACE_FAULT))6924return (rval);69256926mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);6927mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;69286929return (0);6930}69316932static void6933dtrace_action_breakpoint(dtrace_ecb_t *ecb)6934{6935dtrace_probe_t *probe = ecb->dte_probe;6936dtrace_provider_t *prov = probe->dtpr_provider;6937char c[DTRACE_FULLNAMELEN + 80], *str;6938char *msg = "dtrace: breakpoint action at probe ";6939char *ecbmsg = " (ecb ";6940uintptr_t val = (uintptr_t)ecb;6941int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;69426943if (dtrace_destructive_disallow)6944return;69456946/*6947* It's impossible to be taking action on the NULL probe.6948*/6949ASSERT(probe != NULL);69506951/*6952* This is a poor man's (destitute man's?) 
sprintf(): we want to6953* print the provider name, module name, function name and name of6954* the probe, along with the hex address of the ECB with the breakpoint6955* action -- all of which we must place in the character buffer by6956* hand.6957*/6958while (*msg != '\0')6959c[i++] = *msg++;69606961for (str = prov->dtpv_name; *str != '\0'; str++)6962c[i++] = *str;6963c[i++] = ':';69646965for (str = probe->dtpr_mod; *str != '\0'; str++)6966c[i++] = *str;6967c[i++] = ':';69686969for (str = probe->dtpr_func; *str != '\0'; str++)6970c[i++] = *str;6971c[i++] = ':';69726973for (str = probe->dtpr_name; *str != '\0'; str++)6974c[i++] = *str;69756976while (*ecbmsg != '\0')6977c[i++] = *ecbmsg++;69786979while (shift >= 0) {6980size_t mask = (size_t)0xf << shift;69816982if (val >= ((size_t)1 << shift))6983c[i++] = "0123456789abcdef"[(val & mask) >> shift];6984shift -= 4;6985}69866987c[i++] = ')';6988c[i] = '\0';69896990#ifdef illumos6991debug_enter(c);6992#else6993kdb_enter(KDB_WHY_DTRACE, "breakpoint action");6994#endif6995}69966997static void6998dtrace_action_panic(dtrace_ecb_t *ecb)6999{7000dtrace_probe_t *probe = ecb->dte_probe;70017002/*7003* It's impossible to be taking action on the NULL probe.7004*/7005ASSERT(probe != NULL);70067007if (dtrace_destructive_disallow)7008return;70097010if (dtrace_panicked != NULL)7011return;70127013if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)7014return;70157016/*7017* We won the right to panic. 
(We want to be sure that only one7018* thread calls panic() from dtrace_probe(), and that panic() is7019* called exactly once.)7020*/7021dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",7022probe->dtpr_provider->dtpv_name, probe->dtpr_mod,7023probe->dtpr_func, probe->dtpr_name, (void *)ecb);7024}70257026static void7027dtrace_action_raise(uint64_t sig)7028{7029if (dtrace_destructive_disallow)7030return;70317032if (sig >= NSIG) {7033DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);7034return;7035}70367037#ifdef illumos7038/*7039* raise() has a queue depth of 1 -- we ignore all subsequent7040* invocations of the raise() action.7041*/7042if (curthread->t_dtrace_sig == 0)7043curthread->t_dtrace_sig = (uint8_t)sig;70447045curthread->t_sig_check = 1;7046aston(curthread);7047#else7048struct proc *p = curproc;7049PROC_LOCK(p);7050kern_psignal(p, sig);7051PROC_UNLOCK(p);7052#endif7053}70547055static void7056dtrace_action_stop(void)7057{7058if (dtrace_destructive_disallow)7059return;70607061#ifdef illumos7062if (!curthread->t_dtrace_stop) {7063curthread->t_dtrace_stop = 1;7064curthread->t_sig_check = 1;7065aston(curthread);7066}7067#else7068struct proc *p = curproc;7069PROC_LOCK(p);7070kern_psignal(p, SIGSTOP);7071PROC_UNLOCK(p);7072#endif7073}70747075static void7076dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)7077{7078hrtime_t now;7079volatile uint16_t *flags;7080#ifdef illumos7081cpu_t *cpu = CPU;7082#else7083cpu_t *cpu = &solaris_cpu[curcpu];7084#endif70857086if (dtrace_destructive_disallow)7087return;70887089flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;70907091now = dtrace_gethrtime();70927093if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {7094/*7095* We need to advance the mark to the current time.7096*/7097cpu->cpu_dtrace_chillmark = now;7098cpu->cpu_dtrace_chilled = 0;7099}71007101/*7102* Now check to see if the requested chill time would take us over7103* the maximum amount of time allowed in the chill interval. 
(Or7104* worse, if the calculation itself induces overflow.)7105*/7106if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||7107cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {7108*flags |= CPU_DTRACE_ILLOP;7109return;7110}71117112while (dtrace_gethrtime() - now < val)7113continue;71147115/*7116* Normally, we assure that the value of the variable "timestamp" does7117* not change within an ECB. The presence of chill() represents an7118* exception to this rule, however.7119*/7120mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;7121cpu->cpu_dtrace_chilled += val;7122}71237124static void7125dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,7126uint64_t *buf, uint64_t arg)7127{7128int nframes = DTRACE_USTACK_NFRAMES(arg);7129int strsize = DTRACE_USTACK_STRSIZE(arg);7130uint64_t *pcs = &buf[1], *fps;7131char *str = (char *)&pcs[nframes];7132int size, offs = 0, i, j;7133size_t rem;7134uintptr_t old = mstate->dtms_scratch_ptr, saved;7135uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;7136char *sym;71377138/*7139* Should be taking a faster path if string space has not been7140* allocated.7141*/7142ASSERT(strsize != 0);71437144/*7145* We will first allocate some temporary space for the frame pointers.7146*/7147fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);7148size = (uintptr_t)fps - mstate->dtms_scratch_ptr +7149(nframes * sizeof (uint64_t));71507151if (!DTRACE_INSCRATCH(mstate, size)) {7152/*7153* Not enough room for our frame pointers -- need to indicate7154* that we ran out of scratch space.7155*/7156DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);7157return;7158}71597160mstate->dtms_scratch_ptr += size;7161saved = mstate->dtms_scratch_ptr;71627163/*7164* Now get a stack with both program counters and frame pointers.7165*/7166DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);7167dtrace_getufpstack(buf, fps, nframes + 1);7168DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);71697170/*7171* If that faulted, we're cooked.7172*/7173if (*flags & 
CPU_DTRACE_FAULT)7174goto out;71757176/*7177* Now we want to walk up the stack, calling the USTACK helper. For7178* each iteration, we restore the scratch pointer.7179*/7180for (i = 0; i < nframes; i++) {7181mstate->dtms_scratch_ptr = saved;71827183if (offs >= strsize)7184break;71857186sym = (char *)(uintptr_t)dtrace_helper(7187DTRACE_HELPER_ACTION_USTACK,7188mstate, state, pcs[i], fps[i]);71897190/*7191* If we faulted while running the helper, we're going to7192* clear the fault and null out the corresponding string.7193*/7194if (*flags & CPU_DTRACE_FAULT) {7195*flags &= ~CPU_DTRACE_FAULT;7196str[offs++] = '\0';7197continue;7198}71997200if (sym == NULL) {7201str[offs++] = '\0';7202continue;7203}72047205if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate,7206&(state->dts_vstate))) {7207str[offs++] = '\0';7208continue;7209}72107211DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);72127213/*7214* Now copy in the string that the helper returned to us.7215*/7216for (j = 0; offs + j < strsize && j < rem; j++) {7217if ((str[offs + j] = sym[j]) == '\0')7218break;7219}72207221DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);72227223offs += j + 1;7224}72257226if (offs >= strsize) {7227/*7228* If we didn't have room for all of the strings, we don't7229* abort processing -- this needn't be a fatal error -- but we7230* still want to increment a counter (dts_stkstroverflows) to7231* allow this condition to be warned about. 
(If this is from7232* a jstack() action, it is easily tuned via jstackstrsize.)7233*/7234dtrace_error(&state->dts_stkstroverflows);7235}72367237while (offs < strsize)7238str[offs++] = '\0';72397240out:7241mstate->dtms_scratch_ptr = old;7242}72437244static void7245dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,7246size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)7247{7248volatile uint16_t *flags;7249uint64_t val = *valp;7250size_t valoffs = *valoffsp;72517252flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;7253ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);72547255/*7256* If this is a string, we're going to only load until we find the zero7257* byte -- after which we'll store zero bytes.7258*/7259if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {7260char c = '\0' + 1;7261size_t s;72627263for (s = 0; s < size; s++) {7264if (c != '\0' && dtkind == DIF_TF_BYREF) {7265c = dtrace_load8(val++);7266} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {7267DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);7268c = dtrace_fuword8((void *)(uintptr_t)val++);7269DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);7270if (*flags & CPU_DTRACE_FAULT)7271break;7272}72737274DTRACE_STORE(uint8_t, tomax, valoffs++, c);72757276if (c == '\0' && intuple)7277break;7278}7279} else {7280uint8_t c;7281while (valoffs < end) {7282if (dtkind == DIF_TF_BYREF) {7283c = dtrace_load8(val++);7284} else if (dtkind == DIF_TF_BYUREF) {7285DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);7286c = dtrace_fuword8((void *)(uintptr_t)val++);7287DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);7288if (*flags & CPU_DTRACE_FAULT)7289break;7290}72917292DTRACE_STORE(uint8_t, tomax,7293valoffs++, c);7294}7295}72967297*valp = val;7298*valoffsp = valoffs;7299}73007301/*7302* Disables interrupts and sets the per-thread inprobe flag. 
When DEBUG is7303* defined, we also assert that we are not recursing unless the probe ID is an7304* error probe.7305*/7306static dtrace_icookie_t7307dtrace_probe_enter(dtrace_id_t id)7308{7309dtrace_icookie_t cookie;73107311cookie = dtrace_interrupt_disable();73127313/*7314* Unless this is an ERROR probe, we are not allowed to recurse in7315* dtrace_probe(). Recursing into DTrace probe usually means that a7316* function is instrumented that should not have been instrumented or7317* that the ordering guarantee of the records will be violated,7318* resulting in unexpected output. If there is an exception to this7319* assertion, a new case should be added.7320*/7321ASSERT(curthread->t_dtrace_inprobe == 0 ||7322id == dtrace_probeid_error);7323curthread->t_dtrace_inprobe = 1;73247325return (cookie);7326}73277328/*7329* Clears the per-thread inprobe flag and enables interrupts.7330*/7331static void7332dtrace_probe_exit(dtrace_icookie_t cookie)7333{73347335curthread->t_dtrace_inprobe = 0;7336dtrace_interrupt_enable(cookie);7337}73387339/*7340* If you're looking for the epicenter of DTrace, you just found it. 
This7341* is the function called by the provider to fire a probe -- from which all7342* subsequent probe-context DTrace activity emanates.7343*/7344void7345dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,7346uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)7347{7348processorid_t cpuid;7349dtrace_icookie_t cookie;7350dtrace_probe_t *probe;7351dtrace_mstate_t mstate;7352dtrace_ecb_t *ecb;7353dtrace_action_t *act;7354intptr_t offs;7355size_t size;7356int vtime, onintr;7357volatile uint16_t *flags;7358hrtime_t now;73597360if (KERNEL_PANICKED())7361return;73627363#ifdef illumos7364/*7365* Kick out immediately if this CPU is still being born (in which case7366* curthread will be set to -1) or the current thread can't allow7367* probes in its current context.7368*/7369if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))7370return;7371#endif73727373cookie = dtrace_probe_enter(id);7374probe = dtrace_probes[id - 1];7375cpuid = curcpu;7376onintr = CPU_ON_INTR(CPU);73777378if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&7379probe->dtpr_predcache == curthread->t_predcache) {7380/*7381* We have hit in the predicate cache; we know that7382* this predicate would evaluate to be false.7383*/7384dtrace_probe_exit(cookie);7385return;7386}73877388#ifdef illumos7389if (panic_quiesce) {7390#else7391if (KERNEL_PANICKED()) {7392#endif7393/*7394* We don't trace anything if we're panicking.7395*/7396dtrace_probe_exit(cookie);7397return;7398}73997400now = mstate.dtms_timestamp = dtrace_gethrtime();7401mstate.dtms_present = DTRACE_MSTATE_TIMESTAMP;7402vtime = dtrace_vtime_references != 0;74037404if (vtime && curthread->t_dtrace_start)7405curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;74067407mstate.dtms_difo = NULL;7408mstate.dtms_probe = probe;7409mstate.dtms_strtok = 0;7410mstate.dtms_arg[0] = arg0;7411mstate.dtms_arg[1] = arg1;7412mstate.dtms_arg[2] = arg2;7413mstate.dtms_arg[3] = arg3;7414mstate.dtms_arg[4] = arg4;74157416flags = 
(volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;74177418for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {7419dtrace_predicate_t *pred = ecb->dte_predicate;7420dtrace_state_t *state = ecb->dte_state;7421dtrace_buffer_t *buf = &state->dts_buffer[cpuid];7422dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];7423dtrace_vstate_t *vstate = &state->dts_vstate;7424dtrace_provider_t *prov = probe->dtpr_provider;7425uint64_t tracememsize = 0;7426int committed = 0;7427caddr_t tomax;74287429/*7430* A little subtlety with the following (seemingly innocuous)7431* declaration of the automatic 'val': by looking at the7432* code, you might think that it could be declared in the7433* action processing loop, below. (That is, it's only used in7434* the action processing loop.) However, it must be declared7435* out of that scope because in the case of DIF expression7436* arguments to aggregating actions, one iteration of the7437* action loop will use the last iteration's value.7438*/7439uint64_t val = 0;74407441mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;7442mstate.dtms_getf = NULL;74437444*flags &= ~CPU_DTRACE_ERROR;74457446if (prov == dtrace_provider) {7447/*7448* If dtrace itself is the provider of this probe,7449* we're only going to continue processing the ECB if7450* arg0 (the dtrace_state_t) is equal to the ECB's7451* creating state. (This prevents disjoint consumers7452* from seeing one another's metaprobes.)7453*/7454if (arg0 != (uint64_t)(uintptr_t)state)7455continue;7456}74577458if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {7459/*7460* We're not currently active. If our provider isn't7461* the dtrace pseudo provider, we're not interested.7462*/7463if (prov != dtrace_provider)7464continue;74657466/*7467* Now we must further check if we are in the BEGIN7468* probe. 
If we are, we will only continue processing7469* if we're still in WARMUP -- if one BEGIN enabling7470* has invoked the exit() action, we don't want to7471* evaluate subsequent BEGIN enablings.7472*/7473if (probe->dtpr_id == dtrace_probeid_begin &&7474state->dts_activity != DTRACE_ACTIVITY_WARMUP) {7475ASSERT(state->dts_activity ==7476DTRACE_ACTIVITY_DRAINING);7477continue;7478}7479}74807481if (ecb->dte_cond) {7482/*7483* If the dte_cond bits indicate that this7484* consumer is only allowed to see user-mode firings7485* of this probe, call the provider's dtps_usermode()7486* entry point to check that the probe was fired7487* while in a user context. Skip this ECB if that's7488* not the case.7489*/7490if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&7491prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,7492probe->dtpr_id, probe->dtpr_arg) == 0)7493continue;74947495#ifdef illumos7496/*7497* This is more subtle than it looks. We have to be7498* absolutely certain that CRED() isn't going to7499* change out from under us so it's only legit to7500* examine that structure if we're in constrained7501* situations. Currently, the only times we'll this7502* check is if a non-super-user has enabled the7503* profile or syscall providers -- providers that7504* allow visibility of all processes. 
For the7505* profile case, the check above will ensure that7506* we're examining a user context.7507*/7508if (ecb->dte_cond & DTRACE_COND_OWNER) {7509cred_t *cr;7510cred_t *s_cr =7511ecb->dte_state->dts_cred.dcr_cred;7512proc_t *proc;75137514ASSERT(s_cr != NULL);75157516if ((cr = CRED()) == NULL ||7517s_cr->cr_uid != cr->cr_uid ||7518s_cr->cr_uid != cr->cr_ruid ||7519s_cr->cr_uid != cr->cr_suid ||7520s_cr->cr_gid != cr->cr_gid ||7521s_cr->cr_gid != cr->cr_rgid ||7522s_cr->cr_gid != cr->cr_sgid ||7523(proc = ttoproc(curthread)) == NULL ||7524(proc->p_flag & SNOCD))7525continue;7526}75277528if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {7529cred_t *cr;7530cred_t *s_cr =7531ecb->dte_state->dts_cred.dcr_cred;75327533ASSERT(s_cr != NULL);75347535if ((cr = CRED()) == NULL ||7536s_cr->cr_zone->zone_id !=7537cr->cr_zone->zone_id)7538continue;7539}7540#endif7541}75427543if (now - state->dts_alive > dtrace_deadman_timeout) {7544/*7545* We seem to be dead. Unless we (a) have kernel7546* destructive permissions (b) have explicitly enabled7547* destructive actions and (c) destructive actions have7548* not been disabled, we're going to transition into7549* the KILLED state, from which no further processing7550* on this state will be performed.7551*/7552if (!dtrace_priv_kernel_destructive(state) ||7553!state->dts_cred.dcr_destructive ||7554dtrace_destructive_disallow) {7555void *activity = &state->dts_activity;7556dtrace_activity_t curstate;75577558do {7559curstate = state->dts_activity;7560} while (dtrace_cas32(activity, curstate,7561DTRACE_ACTIVITY_KILLED) != curstate);75627563continue;7564}7565}75667567if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,7568ecb->dte_alignment, state, &mstate)) < 0)7569continue;75707571tomax = buf->dtb_tomax;7572ASSERT(tomax != NULL);75737574if (ecb->dte_size != 0) {7575dtrace_rechdr_t dtrh;7576if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {7577mstate.dtms_timestamp = dtrace_gethrtime();7578mstate.dtms_present |= 
DTRACE_MSTATE_TIMESTAMP;7579}7580ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));7581dtrh.dtrh_epid = ecb->dte_epid;7582DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,7583mstate.dtms_timestamp);7584*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;7585}75867587mstate.dtms_epid = ecb->dte_epid;7588mstate.dtms_present |= DTRACE_MSTATE_EPID;75897590if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)7591mstate.dtms_access = DTRACE_ACCESS_KERNEL;7592else7593mstate.dtms_access = 0;75947595if (pred != NULL) {7596dtrace_difo_t *dp = pred->dtp_difo;7597uint64_t rval;75987599rval = dtrace_dif_emulate(dp, &mstate, vstate, state);76007601if (!(*flags & CPU_DTRACE_ERROR) && !rval) {7602dtrace_cacheid_t cid = probe->dtpr_predcache;76037604if (cid != DTRACE_CACHEIDNONE && !onintr) {7605/*7606* Update the predicate cache...7607*/7608ASSERT(cid == pred->dtp_cacheid);7609curthread->t_predcache = cid;7610}76117612continue;7613}7614}76157616for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&7617act != NULL; act = act->dta_next) {7618size_t valoffs;7619dtrace_difo_t *dp;7620dtrace_recdesc_t *rec = &act->dta_rec;76217622size = rec->dtrd_size;7623valoffs = offs + rec->dtrd_offset;76247625if (DTRACEACT_ISAGG(act->dta_kind)) {7626uint64_t v = 0xbad;7627dtrace_aggregation_t *agg;76287629agg = (dtrace_aggregation_t *)act;76307631if ((dp = act->dta_difo) != NULL)7632v = dtrace_dif_emulate(dp,7633&mstate, vstate, state);76347635if (*flags & CPU_DTRACE_ERROR)7636continue;76377638/*7639* Note that we always pass the expression7640* value from the previous iteration of the7641* action loop. 
This value will only be used7642* if there is an expression argument to the7643* aggregating action, denoted by the7644* dtag_hasarg field.7645*/7646dtrace_aggregate(agg, buf,7647offs, aggbuf, v, val);7648continue;7649}76507651switch (act->dta_kind) {7652case DTRACEACT_STOP:7653if (dtrace_priv_proc_destructive(state))7654dtrace_action_stop();7655continue;76567657case DTRACEACT_BREAKPOINT:7658if (dtrace_priv_kernel_destructive(state))7659dtrace_action_breakpoint(ecb);7660continue;76617662case DTRACEACT_PANIC:7663if (dtrace_priv_kernel_destructive(state))7664dtrace_action_panic(ecb);7665continue;76667667case DTRACEACT_STACK:7668if (!dtrace_priv_kernel(state))7669continue;76707671dtrace_getpcstack((pc_t *)(tomax + valoffs),7672size / sizeof (pc_t), probe->dtpr_aframes,7673DTRACE_ANCHORED(probe) ? NULL :7674(uint32_t *)arg0);7675continue;76767677case DTRACEACT_JSTACK:7678case DTRACEACT_USTACK:7679if (!dtrace_priv_proc(state))7680continue;76817682/*7683* See comment in DIF_VAR_PID.7684*/7685if (DTRACE_ANCHORED(mstate.dtms_probe) &&7686CPU_ON_INTR(CPU)) {7687int depth = DTRACE_USTACK_NFRAMES(7688rec->dtrd_arg) + 1;76897690dtrace_bzero((void *)(tomax + valoffs),7691DTRACE_USTACK_STRSIZE(rec->dtrd_arg)7692+ depth * sizeof (uint64_t));76937694continue;7695}76967697if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&7698curproc->p_dtrace_helpers != NULL) {7699/*7700* This is the slow path -- we have7701* allocated string space, and we're7702* getting the stack of a process that7703* has helpers. 
Call into a separate7704* routine to perform this processing.7705*/7706dtrace_action_ustack(&mstate, state,7707(uint64_t *)(tomax + valoffs),7708rec->dtrd_arg);7709continue;7710}77117712DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);7713dtrace_getupcstack((uint64_t *)7714(tomax + valoffs),7715DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);7716DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);7717continue;77187719default:7720break;7721}77227723dp = act->dta_difo;7724ASSERT(dp != NULL);77257726val = dtrace_dif_emulate(dp, &mstate, vstate, state);77277728if (*flags & CPU_DTRACE_ERROR)7729continue;77307731switch (act->dta_kind) {7732case DTRACEACT_SPECULATE: {7733dtrace_rechdr_t *dtrh;77347735ASSERT(buf == &state->dts_buffer[cpuid]);7736buf = dtrace_speculation_buffer(state,7737cpuid, val);77387739if (buf == NULL) {7740*flags |= CPU_DTRACE_DROP;7741continue;7742}77437744offs = dtrace_buffer_reserve(buf,7745ecb->dte_needed, ecb->dte_alignment,7746state, NULL);77477748if (offs < 0) {7749*flags |= CPU_DTRACE_DROP;7750continue;7751}77527753tomax = buf->dtb_tomax;7754ASSERT(tomax != NULL);77557756if (ecb->dte_size == 0)7757continue;77587759ASSERT3U(ecb->dte_size, >=,7760sizeof (dtrace_rechdr_t));7761dtrh = ((void *)(tomax + offs));7762dtrh->dtrh_epid = ecb->dte_epid;7763/*7764* When the speculation is committed, all of7765* the records in the speculative buffer will7766* have their timestamps set to the commit7767* time. Until then, it is set to a sentinel7768* value, for debugability.7769*/7770DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);7771continue;7772}77737774case DTRACEACT_PRINTM: {7775/*7776* printm() assumes that the DIF returns a7777* pointer returned by memref(). 
memref() is a7778* subroutine that is used to get around the7779* single-valued returns of DIF and is assumed7780* to always be allocated in the scratch space.7781* Therefore, we need to validate that the7782* pointer given to printm() is in the scratch7783* space in order to avoid a potential panic.7784*/7785uintptr_t *memref = (uintptr_t *)(uintptr_t) val;77867787if (!DTRACE_INSCRATCHPTR(&mstate,7788(uintptr_t) memref,7789sizeof (uintptr_t) + sizeof (size_t))) {7790*flags |= CPU_DTRACE_BADADDR;7791continue;7792}77937794/* Get the size from the memref. */7795size = memref[1];77967797/*7798* Check if the size exceeds the allocated7799* buffer size.7800*/7801if (size + sizeof (size_t) >7802dp->dtdo_rtype.dtdt_size) {7803/* Flag a drop! */7804*flags |= CPU_DTRACE_DROP;7805continue;7806}78077808/* Store the size in the buffer first. */7809DTRACE_STORE(size_t, tomax, valoffs, size);78107811/*7812* Offset the buffer address to the start7813* of the data.7814*/7815valoffs += sizeof(size_t);78167817/*7818* Reset to the memory address rather than7819* the memref array, then let the BYREF7820* code below do the work to store the7821* memory data in the buffer.7822*/7823val = memref[0];7824break;7825}78267827case DTRACEACT_CHILL:7828if (dtrace_priv_kernel_destructive(state))7829dtrace_action_chill(&mstate, val);7830continue;78317832case DTRACEACT_RAISE:7833if (dtrace_priv_proc_destructive(state))7834dtrace_action_raise(val);7835continue;78367837case DTRACEACT_COMMIT:7838ASSERT(!committed);78397840/*7841* We need to commit our buffer state.7842*/7843if (ecb->dte_size)7844buf->dtb_offset = offs + ecb->dte_size;7845buf = &state->dts_buffer[cpuid];7846dtrace_speculation_commit(state, cpuid, val);7847committed = 1;7848continue;78497850case DTRACEACT_DISCARD:7851dtrace_speculation_discard(state, cpuid, val);7852continue;78537854case DTRACEACT_DIFEXPR:7855case DTRACEACT_LIBACT:7856case DTRACEACT_PRINTF:7857case DTRACEACT_PRINTA:7858case DTRACEACT_SYSTEM:7859case 
DTRACEACT_FREOPEN:7860case DTRACEACT_TRACEMEM:7861break;78627863case DTRACEACT_TRACEMEM_DYNSIZE:7864tracememsize = val;7865break;78667867case DTRACEACT_SYM:7868case DTRACEACT_MOD:7869if (!dtrace_priv_kernel(state))7870continue;7871break;78727873case DTRACEACT_USYM:7874case DTRACEACT_UMOD:7875case DTRACEACT_UADDR: {7876#ifdef illumos7877struct pid *pid = curthread->t_procp->p_pidp;7878#endif78797880if (!dtrace_priv_proc(state))7881continue;78827883DTRACE_STORE(uint64_t, tomax,7884#ifdef illumos7885valoffs, (uint64_t)pid->pid_id);7886#else7887valoffs, (uint64_t) curproc->p_pid);7888#endif7889DTRACE_STORE(uint64_t, tomax,7890valoffs + sizeof (uint64_t), val);78917892continue;7893}78947895case DTRACEACT_EXIT: {7896/*7897* For the exit action, we are going to attempt7898* to atomically set our activity to be7899* draining. If this fails (either because7900* another CPU has beat us to the exit action,7901* or because our current activity is something7902* other than ACTIVE or WARMUP), we will7903* continue. This assures that the exit action7904* can be successfully recorded at most once7905* when we're in the ACTIVE state. If we're7906* encountering the exit() action while in7907* COOLDOWN, however, we want to honor the new7908* status code. 
(We know that we're the only7909* thread in COOLDOWN, so there is no race.)7910*/7911void *activity = &state->dts_activity;7912dtrace_activity_t curstate = state->dts_activity;79137914if (curstate == DTRACE_ACTIVITY_COOLDOWN)7915break;79167917if (curstate != DTRACE_ACTIVITY_WARMUP)7918curstate = DTRACE_ACTIVITY_ACTIVE;79197920if (dtrace_cas32(activity, curstate,7921DTRACE_ACTIVITY_DRAINING) != curstate) {7922*flags |= CPU_DTRACE_DROP;7923continue;7924}79257926break;7927}79287929default:7930ASSERT(0);7931}79327933if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||7934dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {7935uintptr_t end = valoffs + size;79367937if (tracememsize != 0 &&7938valoffs + tracememsize < end) {7939end = valoffs + tracememsize;7940tracememsize = 0;7941}79427943if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&7944!dtrace_vcanload((void *)(uintptr_t)val,7945&dp->dtdo_rtype, NULL, &mstate, vstate))7946continue;79477948dtrace_store_by_ref(dp, tomax, size, &valoffs,7949&val, end, act->dta_intuple,7950dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?7951DIF_TF_BYREF: DIF_TF_BYUREF);7952continue;7953}79547955switch (size) {7956case 0:7957break;79587959case sizeof (uint8_t):7960DTRACE_STORE(uint8_t, tomax, valoffs, val);7961break;7962case sizeof (uint16_t):7963DTRACE_STORE(uint16_t, tomax, valoffs, val);7964break;7965case sizeof (uint32_t):7966DTRACE_STORE(uint32_t, tomax, valoffs, val);7967break;7968case sizeof (uint64_t):7969DTRACE_STORE(uint64_t, tomax, valoffs, val);7970break;7971default:7972/*7973* Any other size should have been returned by7974* reference, not by value.7975*/7976ASSERT(0);7977break;7978}7979}79807981if (*flags & CPU_DTRACE_DROP)7982continue;79837984if (*flags & CPU_DTRACE_FAULT) {7985int ndx;7986dtrace_action_t *err;79877988buf->dtb_errors++;79897990if (probe->dtpr_id == dtrace_probeid_error) {7991/*7992* There's nothing we can do -- we had an7993* error on the error probe. 
We bump an7994* error counter to at least indicate that7995* this condition happened.7996*/7997dtrace_error(&state->dts_dblerrors);7998continue;7999}80008001if (vtime) {8002/*8003* Before recursing on dtrace_probe(), we8004* need to explicitly clear out our start8005* time to prevent it from being accumulated8006* into t_dtrace_vtime.8007*/8008curthread->t_dtrace_start = 0;8009}80108011/*8012* Iterate over the actions to figure out which action8013* we were processing when we experienced the error.8014* Note that act points _past_ the faulting action; if8015* act is ecb->dte_action, the fault was in the8016* predicate, if it's ecb->dte_action->dta_next it's8017* in action #1, and so on.8018*/8019for (err = ecb->dte_action, ndx = 0;8020err != act; err = err->dta_next, ndx++)8021continue;80228023dtrace_probe_error(state, ecb->dte_epid, ndx,8024(mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?8025mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),8026cpu_core[cpuid].cpuc_dtrace_illval);80278028continue;8029}80308031if (!committed)8032buf->dtb_offset = offs + ecb->dte_size;8033}80348035if (vtime)8036curthread->t_dtrace_start = dtrace_gethrtime();80378038dtrace_probe_exit(cookie);8039}80408041/*8042* DTrace Probe Hashing Functions8043*8044* The functions in this section (and indeed, the functions in remaining8045* sections) are not _called_ from probe context. (Any exceptions to this are8046* marked with a "Note:".) Rather, they are called from elsewhere in the8047* DTrace framework to look-up probes in, add probes to and remove probes from8048* the DTrace probe hashes. 
(Each probe is hashed by each element of the8049* probe tuple -- allowing for fast lookups, regardless of what was8050* specified.)8051*/8052static uint_t8053dtrace_hash_str(const char *p)8054{8055unsigned int g;8056uint_t hval = 0;80578058while (*p) {8059hval = (hval << 4) + *p++;8060if ((g = (hval & 0xf0000000)) != 0)8061hval ^= g >> 24;8062hval &= ~g;8063}8064return (hval);8065}80668067static dtrace_hash_t *8068dtrace_hash_create(size_t stroffs, size_t nextoffs, size_t prevoffs)8069{8070dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);80718072hash->dth_stroffs = stroffs;8073hash->dth_nextoffs = nextoffs;8074hash->dth_prevoffs = prevoffs;80758076hash->dth_size = 1;8077hash->dth_mask = hash->dth_size - 1;80788079hash->dth_tab = kmem_zalloc(hash->dth_size *8080sizeof (dtrace_hashbucket_t *), KM_SLEEP);80818082return (hash);8083}80848085static void8086dtrace_hash_destroy(dtrace_hash_t *hash)8087{8088#ifdef DEBUG8089int i;80908091for (i = 0; i < hash->dth_size; i++)8092ASSERT(hash->dth_tab[i] == NULL);8093#endif80948095kmem_free(hash->dth_tab,8096hash->dth_size * sizeof (dtrace_hashbucket_t *));8097kmem_free(hash, sizeof (dtrace_hash_t));8098}80998100static void8101dtrace_hash_resize(dtrace_hash_t *hash)8102{8103int size = hash->dth_size, i, ndx;8104int new_size = hash->dth_size << 1;8105int new_mask = new_size - 1;8106dtrace_hashbucket_t **new_tab, *bucket, *next;81078108ASSERT((new_size & new_mask) == 0);81098110new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);81118112for (i = 0; i < size; i++) {8113for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {8114dtrace_probe_t *probe = bucket->dthb_chain;81158116ASSERT(probe != NULL);8117ndx = DTRACE_HASHSTR(hash, probe) & new_mask;81188119next = bucket->dthb_next;8120bucket->dthb_next = new_tab[ndx];8121new_tab[ndx] = bucket;8122}8123}81248125kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));8126hash->dth_tab = new_tab;8127hash->dth_size = new_size;8128hash->dth_mask = 
new_mask;8129}81308131static void8132dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)8133{8134int hashval = DTRACE_HASHSTR(hash, new);8135int ndx = hashval & hash->dth_mask;8136dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];8137dtrace_probe_t **nextp, **prevp;81388139for (; bucket != NULL; bucket = bucket->dthb_next) {8140if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))8141goto add;8142}81438144if ((hash->dth_nbuckets >> 1) > hash->dth_size) {8145dtrace_hash_resize(hash);8146dtrace_hash_add(hash, new);8147return;8148}81498150bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);8151bucket->dthb_next = hash->dth_tab[ndx];8152hash->dth_tab[ndx] = bucket;8153hash->dth_nbuckets++;81548155add:8156nextp = DTRACE_HASHNEXT(hash, new);8157ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);8158*nextp = bucket->dthb_chain;81598160if (bucket->dthb_chain != NULL) {8161prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);8162ASSERT(*prevp == NULL);8163*prevp = new;8164}81658166bucket->dthb_chain = new;8167bucket->dthb_len++;8168}81698170static dtrace_probe_t *8171dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)8172{8173int hashval = DTRACE_HASHSTR(hash, template);8174int ndx = hashval & hash->dth_mask;8175dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];81768177for (; bucket != NULL; bucket = bucket->dthb_next) {8178if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))8179return (bucket->dthb_chain);8180}81818182return (NULL);8183}81848185static int8186dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)8187{8188int hashval = DTRACE_HASHSTR(hash, template);8189int ndx = hashval & hash->dth_mask;8190dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];81918192for (; bucket != NULL; bucket = bucket->dthb_next) {8193if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))8194return (bucket->dthb_len);8195}81968197return (0);8198}81998200static void8201dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)8202{8203int 
ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;8204dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];82058206dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);8207dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);82088209/*8210* Find the bucket that we're removing this probe from.8211*/8212for (; bucket != NULL; bucket = bucket->dthb_next) {8213if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))8214break;8215}82168217ASSERT(bucket != NULL);82188219if (*prevp == NULL) {8220if (*nextp == NULL) {8221/*8222* The removed probe was the only probe on this8223* bucket; we need to remove the bucket.8224*/8225dtrace_hashbucket_t *b = hash->dth_tab[ndx];82268227ASSERT(bucket->dthb_chain == probe);8228ASSERT(b != NULL);82298230if (b == bucket) {8231hash->dth_tab[ndx] = bucket->dthb_next;8232} else {8233while (b->dthb_next != bucket)8234b = b->dthb_next;8235b->dthb_next = bucket->dthb_next;8236}82378238ASSERT(hash->dth_nbuckets > 0);8239hash->dth_nbuckets--;8240kmem_free(bucket, sizeof (dtrace_hashbucket_t));8241return;8242}82438244bucket->dthb_chain = *nextp;8245} else {8246*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;8247}82488249if (*nextp != NULL)8250*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;8251}82528253/*8254* DTrace Utility Functions8255*8256* These are random utility functions that are _not_ called from probe context.8257*/8258static int8259dtrace_badattr(const dtrace_attribute_t *a)8260{8261return (a->dtat_name > DTRACE_STABILITY_MAX ||8262a->dtat_data > DTRACE_STABILITY_MAX ||8263a->dtat_class > DTRACE_CLASS_MAX);8264}82658266/*8267* Return a duplicate copy of a string. If the specified string is NULL,8268* this function returns a zero-length string.8269*/8270static char *8271dtrace_strdup(const char *str)8272{8273char *new = kmem_zalloc((str != NULL ? 
strlen(str) : 0) + 1, KM_SLEEP);82748275if (str != NULL)8276(void) strcpy(new, str);82778278return (new);8279}82808281#define DTRACE_ISALPHA(c) \8282(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))82838284static int8285dtrace_badname(const char *s)8286{8287char c;82888289if (s == NULL || (c = *s++) == '\0')8290return (0);82918292if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')8293return (1);82948295while ((c = *s++) != '\0') {8296if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&8297c != '-' && c != '_' && c != '.' && c != '`')8298return (1);8299}83008301return (0);8302}83038304static void8305dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)8306{8307uint32_t priv;83088309#ifdef illumos8310if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {8311/*8312* For DTRACE_PRIV_ALL, the uid and zoneid don't matter.8313*/8314priv = DTRACE_PRIV_ALL;8315} else {8316*uidp = crgetuid(cr);8317*zoneidp = crgetzoneid(cr);83188319priv = 0;8320if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))8321priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;8322else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))8323priv |= DTRACE_PRIV_USER;8324if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))8325priv |= DTRACE_PRIV_PROC;8326if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))8327priv |= DTRACE_PRIV_OWNER;8328if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))8329priv |= DTRACE_PRIV_ZONEOWNER;8330}8331#else8332priv = DTRACE_PRIV_ALL;8333#endif83348335*privp = priv;8336}83378338#ifdef DTRACE_ERRDEBUG8339static void8340dtrace_errdebug(const char *str)8341{8342int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;8343int occupied = 0;83448345mutex_enter(&dtrace_errlock);8346dtrace_errlast = str;8347dtrace_errthread = curthread;83488349while (occupied++ < DTRACE_ERRHASHSZ) {8350if (dtrace_errhash[hval].dter_msg == str) {8351dtrace_errhash[hval].dter_count++;8352goto out;8353}83548355if (dtrace_errhash[hval].dter_msg != NULL) {8356hval = (hval 
+ 1) % DTRACE_ERRHASHSZ;8357continue;8358}83598360dtrace_errhash[hval].dter_msg = str;8361dtrace_errhash[hval].dter_count = 1;8362goto out;8363}83648365panic("dtrace: undersized error hash");8366out:8367mutex_exit(&dtrace_errlock);8368}8369#endif83708371/*8372* DTrace Matching Functions8373*8374* These functions are used to match groups of probes, given some elements of8375* a probe tuple, or some globbed expressions for elements of a probe tuple.8376*/8377static int8378dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,8379zoneid_t zoneid)8380{8381if (priv != DTRACE_PRIV_ALL) {8382uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;8383uint32_t match = priv & ppriv;83848385/*8386* No PRIV_DTRACE_* privileges...8387*/8388if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |8389DTRACE_PRIV_KERNEL)) == 0)8390return (0);83918392/*8393* No matching bits, but there were bits to match...8394*/8395if (match == 0 && ppriv != 0)8396return (0);83978398/*8399* Need to have permissions to the process, but don't...8400*/8401if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&8402uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {8403return (0);8404}84058406/*8407* Need to be in the same zone unless we possess the8408* privilege to examine all zones.8409*/8410if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&8411zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {8412return (0);8413}8414}84158416return (1);8417}84188419/*8420* dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which8421* consists of input pattern strings and an ops-vector to evaluate them.8422* This function returns >0 for match, 0 for no match, and <0 for error.8423*/8424static int8425dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,8426uint32_t priv, uid_t uid, zoneid_t zoneid)8427{8428dtrace_provider_t *pvp = prp->dtpr_provider;8429int rv;84308431if (pvp->dtpv_defunct)8432return (0);84338434if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, 
pkp->dtpk_prov, 0)) <= 0)8435return (rv);84368437if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)8438return (rv);84398440if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)8441return (rv);84428443if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)8444return (rv);84458446if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)8447return (0);84488449return (rv);8450}84518452/*8453* dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)8454* interface for matching a glob pattern 'p' to an input string 's'. Unlike8455* libc's version, the kernel version only applies to 8-bit ASCII strings.8456* In addition, all of the recursion cases except for '*' matching have been8457* unwound. For '*', we still implement recursive evaluation, but a depth8458* counter is maintained and matching is aborted if we recurse too deep.8459* The function returns 0 if no match, >0 if match, and <0 if recursion error.8460*/8461static int8462dtrace_match_glob(const char *s, const char *p, int depth)8463{8464const char *olds;8465char s1, c;8466int gs;84678468if (depth > DTRACE_PROBEKEY_MAXDEPTH)8469return (-1);84708471if (s == NULL)8472s = ""; /* treat NULL as empty string */84738474top:8475olds = s;8476s1 = *s++;84778478if (p == NULL)8479return (0);84808481if ((c = *p++) == '\0')8482return (s1 == '\0');84838484switch (c) {8485case '[': {8486int ok = 0, notflag = 0;8487char lc = '\0';84888489if (s1 == '\0')8490return (0);84918492if (*p == '!') {8493notflag = 1;8494p++;8495}84968497if ((c = *p++) == '\0')8498return (0);84998500do {8501if (c == '-' && lc != '\0' && *p != ']') {8502if ((c = *p++) == '\0')8503return (0);8504if (c == '\\' && (c = *p++) == '\0')8505return (0);85068507if (notflag) {8508if (s1 < lc || s1 > c)8509ok++;8510else8511return (0);8512} else if (lc <= s1 && s1 <= c)8513ok++;85148515} else if (c == '\\' && (c = *p++) == '\0')8516return (0);85178518lc = c; /* save left-hand 'c' for next iteration */85198520if 
(notflag) {8521if (s1 != c)8522ok++;8523else8524return (0);8525} else if (s1 == c)8526ok++;85278528if ((c = *p++) == '\0')8529return (0);85308531} while (c != ']');85328533if (ok)8534goto top;85358536return (0);8537}85388539case '\\':8540if ((c = *p++) == '\0')8541return (0);8542/*FALLTHRU*/85438544default:8545if (c != s1)8546return (0);8547/*FALLTHRU*/85488549case '?':8550if (s1 != '\0')8551goto top;8552return (0);85538554case '*':8555while (*p == '*')8556p++; /* consecutive *'s are identical to a single one */85578558if (*p == '\0')8559return (1);85608561for (s = olds; *s != '\0'; s++) {8562if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)8563return (gs);8564}85658566return (0);8567}8568}85698570/*ARGSUSED*/8571static int8572dtrace_match_string(const char *s, const char *p, int depth)8573{8574return (s != NULL && strcmp(s, p) == 0);8575}85768577/*ARGSUSED*/8578static int8579dtrace_match_nul(const char *s, const char *p, int depth)8580{8581return (1); /* always match the empty pattern */8582}85838584/*ARGSUSED*/8585static int8586dtrace_match_nonzero(const char *s, const char *p, int depth)8587{8588return (s != NULL && s[0] != '\0');8589}85908591static int8592dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,8593zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)8594{8595dtrace_probe_t template, *probe;8596dtrace_hash_t *hash = NULL;8597int len, best = INT_MAX, nmatched = 0;8598dtrace_id_t i;85998600ASSERT(MUTEX_HELD(&dtrace_lock));86018602/*8603* If the probe ID is specified in the key, just lookup by ID and8604* invoke the match callback once if a matching probe is found.8605*/8606if (pkp->dtpk_id != DTRACE_IDNONE) {8607if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&8608dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {8609(void) (*matched)(probe, arg);8610nmatched++;8611}8612return (nmatched);8613}86148615template.dtpr_mod = (char *)pkp->dtpk_mod;8616template.dtpr_func = (char 
*)pkp->dtpk_func;8617template.dtpr_name = (char *)pkp->dtpk_name;86188619/*8620* We want to find the most distinct of the module name, function8621* name, and name. So for each one that is not a glob pattern or8622* empty string, we perform a lookup in the corresponding hash and8623* use the hash table with the fewest collisions to do our search.8624*/8625if (pkp->dtpk_mmatch == &dtrace_match_string &&8626(len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {8627best = len;8628hash = dtrace_bymod;8629}86308631if (pkp->dtpk_fmatch == &dtrace_match_string &&8632(len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {8633best = len;8634hash = dtrace_byfunc;8635}86368637if (pkp->dtpk_nmatch == &dtrace_match_string &&8638(len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {8639best = len;8640hash = dtrace_byname;8641}86428643/*8644* If we did not select a hash table, iterate over every probe and8645* invoke our callback for each one that matches our input probe key.8646*/8647if (hash == NULL) {8648for (i = 0; i < dtrace_nprobes; i++) {8649if ((probe = dtrace_probes[i]) == NULL ||8650dtrace_match_probe(probe, pkp, priv, uid,8651zoneid) <= 0)8652continue;86538654nmatched++;86558656if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)8657break;8658}86598660return (nmatched);8661}86628663/*8664* If we selected a hash table, iterate over each probe of the same key8665* name and invoke the callback for every probe that matches the other8666* attributes of our input probe key.8667*/8668for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;8669probe = *(DTRACE_HASHNEXT(hash, probe))) {86708671if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)8672continue;86738674nmatched++;86758676if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)8677break;8678}86798680return (nmatched);8681}86828683/*8684* Return the function pointer dtrace_probecmp() should use to compare the8685* specified pattern with a string. 
For NULL or empty patterns, we select8686* dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().8687* For non-empty non-glob strings, we use dtrace_match_string().8688*/8689static dtrace_probekey_f *8690dtrace_probekey_func(const char *p)8691{8692char c;86938694if (p == NULL || *p == '\0')8695return (&dtrace_match_nul);86968697while ((c = *p++) != '\0') {8698if (c == '[' || c == '?' || c == '*' || c == '\\')8699return (&dtrace_match_glob);8700}87018702return (&dtrace_match_string);8703}87048705/*8706* Build a probe comparison key for use with dtrace_match_probe() from the8707* given probe description. By convention, a null key only matches anchored8708* probes: if each field is the empty string, reset dtpk_fmatch to8709* dtrace_match_nonzero().8710*/8711static void8712dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)8713{8714pkp->dtpk_prov = pdp->dtpd_provider;8715pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);87168717pkp->dtpk_mod = pdp->dtpd_mod;8718pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);87198720pkp->dtpk_func = pdp->dtpd_func;8721pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);87228723pkp->dtpk_name = pdp->dtpd_name;8724pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);87258726pkp->dtpk_id = pdp->dtpd_id;87278728if (pkp->dtpk_id == DTRACE_IDNONE &&8729pkp->dtpk_pmatch == &dtrace_match_nul &&8730pkp->dtpk_mmatch == &dtrace_match_nul &&8731pkp->dtpk_fmatch == &dtrace_match_nul &&8732pkp->dtpk_nmatch == &dtrace_match_nul)8733pkp->dtpk_fmatch = &dtrace_match_nonzero;8734}87358736/*8737* DTrace Provider-to-Framework API Functions8738*8739* These functions implement much of the Provider-to-Framework API, as8740* described in <sys/dtrace.h>. The parts of the API not in this section are8741* the functions in the API for probe management (found below), and8742* dtrace_probe() itself (found above).8743*/87448745/*8746* Register the calling provider with the DTrace framework. 
This should8747* generally be called by DTrace providers in their attach(9E) entry point.8748*/8749int8750dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,8751cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)8752{8753dtrace_provider_t *provider;87548755if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {8756cmn_err(CE_WARN, "failed to register provider '%s': invalid "8757"arguments", name ? name : "<NULL>");8758return (EINVAL);8759}87608761if (name[0] == '\0' || dtrace_badname(name)) {8762cmn_err(CE_WARN, "failed to register provider '%s': invalid "8763"provider name", name);8764return (EINVAL);8765}87668767if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||8768pops->dtps_enable == NULL || pops->dtps_disable == NULL ||8769pops->dtps_destroy == NULL ||8770((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {8771cmn_err(CE_WARN, "failed to register provider '%s': invalid "8772"provider ops", name);8773return (EINVAL);8774}87758776if (dtrace_badattr(&pap->dtpa_provider) ||8777dtrace_badattr(&pap->dtpa_mod) ||8778dtrace_badattr(&pap->dtpa_func) ||8779dtrace_badattr(&pap->dtpa_name) ||8780dtrace_badattr(&pap->dtpa_args)) {8781cmn_err(CE_WARN, "failed to register provider '%s': invalid "8782"provider attributes", name);8783return (EINVAL);8784}87858786if (priv & ~DTRACE_PRIV_ALL) {8787cmn_err(CE_WARN, "failed to register provider '%s': invalid "8788"privilege attributes", name);8789return (EINVAL);8790}87918792if ((priv & DTRACE_PRIV_KERNEL) &&8793(priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&8794pops->dtps_usermode == NULL) {8795cmn_err(CE_WARN, "failed to register provider '%s': need "8796"dtps_usermode() op for given privilege attributes", name);8797return (EINVAL);8798}87998800provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);8801provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);8802(void) strcpy(provider->dtpv_name, 
name);88038804provider->dtpv_attr = *pap;8805provider->dtpv_priv.dtpp_flags = priv;8806if (cr != NULL) {8807provider->dtpv_priv.dtpp_uid = crgetuid(cr);8808provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);8809}8810provider->dtpv_pops = *pops;88118812if (pops->dtps_provide == NULL) {8813ASSERT(pops->dtps_provide_module != NULL);8814provider->dtpv_pops.dtps_provide =8815(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;8816}88178818if (pops->dtps_provide_module == NULL) {8819ASSERT(pops->dtps_provide != NULL);8820provider->dtpv_pops.dtps_provide_module =8821(void (*)(void *, modctl_t *))dtrace_nullop;8822}88238824if (pops->dtps_suspend == NULL) {8825ASSERT(pops->dtps_resume == NULL);8826provider->dtpv_pops.dtps_suspend =8827(void (*)(void *, dtrace_id_t, void *))dtrace_nullop;8828provider->dtpv_pops.dtps_resume =8829(void (*)(void *, dtrace_id_t, void *))dtrace_nullop;8830}88318832provider->dtpv_arg = arg;8833*idp = (dtrace_provider_id_t)provider;88348835if (pops == &dtrace_provider_ops) {8836ASSERT(MUTEX_HELD(&dtrace_provider_lock));8837ASSERT(MUTEX_HELD(&dtrace_lock));8838ASSERT(dtrace_anon.dta_enabling == NULL);88398840/*8841* We make sure that the DTrace provider is at the head of8842* the provider chain.8843*/8844provider->dtpv_next = dtrace_provider;8845dtrace_provider = provider;8846return (0);8847}88488849mutex_enter(&dtrace_provider_lock);8850mutex_enter(&dtrace_lock);88518852/*8853* If there is at least one provider registered, we'll add this8854* provider after the first provider.8855*/8856if (dtrace_provider != NULL) {8857provider->dtpv_next = dtrace_provider->dtpv_next;8858dtrace_provider->dtpv_next = provider;8859} else {8860dtrace_provider = provider;8861}88628863if (dtrace_retained != NULL) {8864dtrace_enabling_provide(provider);88658866/*8867* Now we need to call dtrace_enabling_matchall() -- which8868* will acquire cpu_lock and dtrace_lock. 
We therefore need8869* to drop all of our locks before calling into it...8870*/8871mutex_exit(&dtrace_lock);8872mutex_exit(&dtrace_provider_lock);8873dtrace_enabling_matchall();88748875return (0);8876}88778878mutex_exit(&dtrace_lock);8879mutex_exit(&dtrace_provider_lock);88808881return (0);8882}88838884/*8885* Unregister the specified provider from the DTrace framework. This should8886* generally be called by DTrace providers in their detach(9E) entry point.8887*/8888int8889dtrace_unregister(dtrace_provider_id_t id)8890{8891dtrace_provider_t *old = (dtrace_provider_t *)id;8892dtrace_provider_t *prev = NULL;8893int i, self = 0, noreap = 0;8894dtrace_probe_t *probe, *first = NULL;88958896if (old->dtpv_pops.dtps_enable ==8897(void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {8898/*8899* If DTrace itself is the provider, we're called with locks8900* already held.8901*/8902ASSERT(old == dtrace_provider);8903#ifdef illumos8904ASSERT(dtrace_devi != NULL);8905#endif8906ASSERT(MUTEX_HELD(&dtrace_provider_lock));8907ASSERT(MUTEX_HELD(&dtrace_lock));8908self = 1;89098910if (dtrace_provider->dtpv_next != NULL) {8911/*8912* There's another provider here; return failure.8913*/8914return (EBUSY);8915}8916} else {8917mutex_enter(&dtrace_provider_lock);8918#ifdef illumos8919mutex_enter(&mod_lock);8920#endif8921mutex_enter(&dtrace_lock);8922}89238924/*8925* If anyone has /dev/dtrace open, or if there are anonymous enabled8926* probes, we refuse to let providers slither away, unless this8927* provider has already been explicitly invalidated.8928*/8929if (!old->dtpv_defunct &&8930(dtrace_opens || (dtrace_anon.dta_state != NULL &&8931dtrace_anon.dta_state->dts_necbs > 0))) {8932if (!self) {8933mutex_exit(&dtrace_lock);8934#ifdef illumos8935mutex_exit(&mod_lock);8936#endif8937mutex_exit(&dtrace_provider_lock);8938}8939return (EBUSY);8940}89418942/*8943* Attempt to destroy the probes associated with this provider.8944*/8945for (i = 0; i < dtrace_nprobes; i++) {8946if ((probe = 
dtrace_probes[i]) == NULL)8947continue;89488949if (probe->dtpr_provider != old)8950continue;89518952if (probe->dtpr_ecb == NULL)8953continue;89548955/*8956* If we are trying to unregister a defunct provider, and the8957* provider was made defunct within the interval dictated by8958* dtrace_unregister_defunct_reap, we'll (asynchronously)8959* attempt to reap our enablings. To denote that the provider8960* should reattempt to unregister itself at some point in the8961* future, we will return a differentiable error code (EAGAIN8962* instead of EBUSY) in this case.8963*/8964if (dtrace_gethrtime() - old->dtpv_defunct >8965dtrace_unregister_defunct_reap)8966noreap = 1;89678968if (!self) {8969mutex_exit(&dtrace_lock);8970#ifdef illumos8971mutex_exit(&mod_lock);8972#endif8973mutex_exit(&dtrace_provider_lock);8974}89758976if (noreap)8977return (EBUSY);89788979(void) taskq_dispatch(dtrace_taskq,8980(task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);89818982return (EAGAIN);8983}89848985/*8986* All of the probes for this provider are disabled; we can safely8987* remove all of them from their hash chains and from the probe array.8988*/8989for (i = 0; i < dtrace_nprobes; i++) {8990if ((probe = dtrace_probes[i]) == NULL)8991continue;89928993if (probe->dtpr_provider != old)8994continue;89958996dtrace_probes[i] = NULL;89978998dtrace_hash_remove(dtrace_bymod, probe);8999dtrace_hash_remove(dtrace_byfunc, probe);9000dtrace_hash_remove(dtrace_byname, probe);90019002if (first == NULL) {9003first = probe;9004probe->dtpr_nextmod = NULL;9005} else {9006probe->dtpr_nextmod = first;9007first = probe;9008}9009}90109011/*9012* The provider's probes have been removed from the hash chains and9013* from the probe array. 
Now issue a dtrace_sync() to be sure that9014* everyone has cleared out from any probe array processing.9015*/9016dtrace_sync();90179018for (probe = first; probe != NULL; probe = first) {9019first = probe->dtpr_nextmod;90209021old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,9022probe->dtpr_arg);9023kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);9024kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);9025kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);9026#ifdef illumos9027vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);9028#else9029free_unr(dtrace_arena, probe->dtpr_id);9030#endif9031kmem_free(probe, sizeof (dtrace_probe_t));9032}90339034if ((prev = dtrace_provider) == old) {9035#ifdef illumos9036ASSERT(self || dtrace_devi == NULL);9037ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);9038#endif9039dtrace_provider = old->dtpv_next;9040} else {9041while (prev != NULL && prev->dtpv_next != old)9042prev = prev->dtpv_next;90439044if (prev == NULL) {9045panic("attempt to unregister non-existent "9046"dtrace provider %p\n", (void *)id);9047}90489049prev->dtpv_next = old->dtpv_next;9050}90519052if (!self) {9053mutex_exit(&dtrace_lock);9054#ifdef illumos9055mutex_exit(&mod_lock);9056#endif9057mutex_exit(&dtrace_provider_lock);9058}90599060kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);9061kmem_free(old, sizeof (dtrace_provider_t));90629063return (0);9064}90659066/*9067* Invalidate the specified provider. 
All subsequent probe lookups for the9068* specified provider will fail, but its probes will not be removed.9069*/9070void9071dtrace_invalidate(dtrace_provider_id_t id)9072{9073dtrace_provider_t *pvp = (dtrace_provider_t *)id;90749075ASSERT(pvp->dtpv_pops.dtps_enable !=9076(void (*)(void *, dtrace_id_t, void *))dtrace_nullop);90779078mutex_enter(&dtrace_provider_lock);9079mutex_enter(&dtrace_lock);90809081pvp->dtpv_defunct = dtrace_gethrtime();90829083mutex_exit(&dtrace_lock);9084mutex_exit(&dtrace_provider_lock);9085}90869087/*9088* Indicate whether or not DTrace has attached.9089*/9090int9091dtrace_attached(void)9092{9093/*9094* dtrace_provider will be non-NULL iff the DTrace driver has9095* attached. (It's non-NULL because DTrace is always itself a9096* provider.)9097*/9098return (dtrace_provider != NULL);9099}91009101/*9102* Remove all the unenabled probes for the given provider. This function is9103* not unlike dtrace_unregister(), except that it doesn't remove the provider9104* -- just as many of its associated probes as it can.9105*/9106int9107dtrace_condense(dtrace_provider_id_t id)9108{9109dtrace_provider_t *prov = (dtrace_provider_t *)id;9110int i;9111dtrace_probe_t *probe;91129113/*9114* Make sure this isn't the dtrace provider itself.9115*/9116ASSERT(prov->dtpv_pops.dtps_enable !=9117(void (*)(void *, dtrace_id_t, void *))dtrace_nullop);91189119mutex_enter(&dtrace_provider_lock);9120mutex_enter(&dtrace_lock);91219122/*9123* Attempt to destroy the probes associated with this provider.9124*/9125for (i = 0; i < dtrace_nprobes; i++) {9126if ((probe = dtrace_probes[i]) == NULL)9127continue;91289129if (probe->dtpr_provider != prov)9130continue;91319132if (probe->dtpr_ecb != NULL)9133continue;91349135dtrace_probes[i] = NULL;91369137dtrace_hash_remove(dtrace_bymod, probe);9138dtrace_hash_remove(dtrace_byfunc, probe);9139dtrace_hash_remove(dtrace_byname, probe);91409141prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 
1,9142probe->dtpr_arg);9143kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);9144kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);9145kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);9146kmem_free(probe, sizeof (dtrace_probe_t));9147#ifdef illumos9148vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);9149#else9150free_unr(dtrace_arena, i + 1);9151#endif9152}91539154mutex_exit(&dtrace_lock);9155mutex_exit(&dtrace_provider_lock);91569157return (0);9158}91599160/*9161* DTrace Probe Management Functions9162*9163* The functions in this section perform the DTrace probe management,9164* including functions to create probes, look-up probes, and call into the9165* providers to request that probes be provided. Some of these functions are9166* in the Provider-to-Framework API; these functions can be identified by the9167* fact that they are not declared "static".9168*/91699170/*9171* Create a probe with the specified module name, function name, and name.9172*/9173dtrace_id_t9174dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,9175const char *func, const char *name, int aframes, void *arg)9176{9177dtrace_probe_t *probe, **probes;9178dtrace_provider_t *provider = (dtrace_provider_t *)prov;9179dtrace_id_t id;91809181if (provider == dtrace_provider) {9182ASSERT(MUTEX_HELD(&dtrace_lock));9183} else {9184mutex_enter(&dtrace_lock);9185}91869187#ifdef illumos9188id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,9189VM_BESTFIT | VM_SLEEP);9190#else9191id = alloc_unr(dtrace_arena);9192#endif9193probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);91949195probe->dtpr_id = id;9196probe->dtpr_gen = dtrace_probegen++;9197probe->dtpr_mod = dtrace_strdup(mod);9198probe->dtpr_func = dtrace_strdup(func);9199probe->dtpr_name = dtrace_strdup(name);9200probe->dtpr_arg = arg;9201probe->dtpr_aframes = aframes;9202probe->dtpr_provider = provider;92039204dtrace_hash_add(dtrace_bymod, probe);9205dtrace_hash_add(dtrace_byfunc, 
probe);9206dtrace_hash_add(dtrace_byname, probe);92079208if (id - 1 >= dtrace_nprobes) {9209size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);9210size_t nsize = osize << 1;92119212if (nsize == 0) {9213ASSERT(osize == 0);9214ASSERT(dtrace_probes == NULL);9215nsize = sizeof (dtrace_probe_t *);9216}92179218probes = kmem_zalloc(nsize, KM_SLEEP);92199220if (dtrace_probes == NULL) {9221ASSERT(osize == 0);9222dtrace_probes = probes;9223dtrace_nprobes = 1;9224} else {9225dtrace_probe_t **oprobes = dtrace_probes;92269227bcopy(oprobes, probes, osize);9228dtrace_membar_producer();9229dtrace_probes = probes;92309231dtrace_sync();92329233/*9234* All CPUs are now seeing the new probes array; we can9235* safely free the old array.9236*/9237kmem_free(oprobes, osize);9238dtrace_nprobes <<= 1;9239}92409241ASSERT(id - 1 < dtrace_nprobes);9242}92439244ASSERT(dtrace_probes[id - 1] == NULL);9245dtrace_probes[id - 1] = probe;92469247if (provider != dtrace_provider)9248mutex_exit(&dtrace_lock);92499250return (id);9251}92529253static dtrace_probe_t *9254dtrace_probe_lookup_id(dtrace_id_t id)9255{9256ASSERT(MUTEX_HELD(&dtrace_lock));92579258if (id == 0 || id > dtrace_nprobes)9259return (NULL);92609261return (dtrace_probes[id - 1]);9262}92639264static int9265dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)9266{9267*((dtrace_id_t *)arg) = probe->dtpr_id;92689269return (DTRACE_MATCH_DONE);9270}92719272/*9273* Look up a probe based on provider and one or more of module name, function9274* name and probe name.9275*/9276dtrace_id_t9277dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,9278char *func, char *name)9279{9280dtrace_probekey_t pkey;9281dtrace_id_t id;9282int match;92839284pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;9285pkey.dtpk_pmatch = &dtrace_match_string;9286pkey.dtpk_mod = mod;9287pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;9288pkey.dtpk_func = func;9289pkey.dtpk_fmatch = func ? 
&dtrace_match_string : &dtrace_match_nul;9290pkey.dtpk_name = name;9291pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;9292pkey.dtpk_id = DTRACE_IDNONE;92939294mutex_enter(&dtrace_lock);9295match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,9296dtrace_probe_lookup_match, &id);9297mutex_exit(&dtrace_lock);92989299ASSERT(match == 1 || match == 0);9300return (match ? id : 0);9301}93029303/*9304* Returns the probe argument associated with the specified probe.9305*/9306void *9307dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)9308{9309dtrace_probe_t *probe;9310void *rval = NULL;93119312mutex_enter(&dtrace_lock);93139314if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&9315probe->dtpr_provider == (dtrace_provider_t *)id)9316rval = probe->dtpr_arg;93179318mutex_exit(&dtrace_lock);93199320return (rval);9321}93229323/*9324* Copy a probe into a probe description.9325*/9326static void9327dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)9328{9329bzero(pdp, sizeof (dtrace_probedesc_t));9330pdp->dtpd_id = prp->dtpr_id;93319332(void) strncpy(pdp->dtpd_provider,9333prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);93349335(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);9336(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);9337(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);9338}93399340/*9341* Called to indicate that a probe -- or probes -- should be provided by a9342* specfied provider. If the specified description is NULL, the provider will9343* be told to provide all of its probes. (This is done whenever a new9344* consumer comes along, or whenever a retained enabling is to be matched.) If9345* the specified description is non-NULL, the provider is given the9346* opportunity to dynamically provide the specified probe, allowing providers9347* to support the creation of probes on-the-fly. (So-called _autocreated_9348* probes.) 
If the provider is NULL, the operations will be applied to all9349* providers; if the provider is non-NULL the operations will only be applied9350* to the specified provider. The dtrace_provider_lock must be held, and the9351* dtrace_lock must _not_ be held -- the provider's dtps_provide() operation9352* will need to grab the dtrace_lock when it reenters the framework through9353* dtrace_probe_lookup(), dtrace_probe_create(), etc.9354*/9355static void9356dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)9357{9358#ifdef illumos9359modctl_t *ctl;9360#endif9361int all = 0;93629363ASSERT(MUTEX_HELD(&dtrace_provider_lock));93649365if (prv == NULL) {9366all = 1;9367prv = dtrace_provider;9368}93699370do {9371/*9372* First, call the blanket provide operation.9373*/9374prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);93759376#ifdef illumos9377/*9378* Now call the per-module provide operation. We will grab9379* mod_lock to prevent the list from being modified. Note9380* that this also prevents the mod_busy bits from changing.9381* (mod_busy can only be changed with mod_lock held.)9382*/9383mutex_enter(&mod_lock);93849385ctl = &modules;9386do {9387if (ctl->mod_busy || ctl->mod_mp == NULL)9388continue;93899390prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);93919392} while ((ctl = ctl->mod_next) != &modules);93939394mutex_exit(&mod_lock);9395#endif9396} while (all && (prv = prv->dtpv_next) != NULL);9397}93989399#ifdef illumos9400/*9401* Iterate over each probe, and call the Framework-to-Provider API function9402* denoted by offs.9403*/9404static void9405dtrace_probe_foreach(uintptr_t offs)9406{9407dtrace_provider_t *prov;9408void (*func)(void *, dtrace_id_t, void *);9409dtrace_probe_t *probe;9410dtrace_icookie_t cookie;9411int i;94129413/*9414* We disable interrupts to walk through the probe array. 
This is9415* safe -- the dtrace_sync() in dtrace_unregister() assures that we9416* won't see stale data.9417*/9418cookie = dtrace_interrupt_disable();94199420for (i = 0; i < dtrace_nprobes; i++) {9421if ((probe = dtrace_probes[i]) == NULL)9422continue;94239424if (probe->dtpr_ecb == NULL) {9425/*9426* This probe isn't enabled -- don't call the function.9427*/9428continue;9429}94309431prov = probe->dtpr_provider;9432func = *((void(**)(void *, dtrace_id_t, void *))9433((uintptr_t)&prov->dtpv_pops + offs));94349435func(prov->dtpv_arg, i + 1, probe->dtpr_arg);9436}94379438dtrace_interrupt_enable(cookie);9439}9440#endif94419442static int9443dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)9444{9445dtrace_probekey_t pkey;9446uint32_t priv;9447uid_t uid;9448zoneid_t zoneid;94499450ASSERT(MUTEX_HELD(&dtrace_lock));9451dtrace_ecb_create_cache = NULL;94529453if (desc == NULL) {9454/*9455* If we're passed a NULL description, we're being asked to9456* create an ECB with a NULL probe.9457*/9458(void) dtrace_ecb_create_enable(NULL, enab);9459return (0);9460}94619462dtrace_probekey(desc, &pkey);9463dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,9464&priv, &uid, &zoneid);94659466return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,9467enab));9468}94699470/*9471* DTrace Helper Provider Functions9472*/9473static void9474dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)9475{9476attr->dtat_name = DOF_ATTR_NAME(dofattr);9477attr->dtat_data = DOF_ATTR_DATA(dofattr);9478attr->dtat_class = DOF_ATTR_CLASS(dofattr);9479}94809481static void9482dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,9483const dof_provider_t *dofprov, char *strtab)9484{9485hprov->dthpv_provname = strtab + 
dofprov->dofpv_name;9486dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,9487dofprov->dofpv_provattr);9488dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,9489dofprov->dofpv_modattr);9490dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,9491dofprov->dofpv_funcattr);9492dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,9493dofprov->dofpv_nameattr);9494dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,9495dofprov->dofpv_argsattr);9496}94979498static void9499dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)9500{9501uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;9502dof_hdr_t *dof = (dof_hdr_t *)daddr;9503dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;9504dof_provider_t *provider;9505dof_probe_t *probe;9506uint32_t *off, *enoff;9507uint8_t *arg;9508char *strtab;9509uint_t i, nprobes;9510dtrace_helper_provdesc_t dhpv;9511dtrace_helper_probedesc_t dhpb;9512dtrace_meta_t *meta = dtrace_meta_pid;9513dtrace_mops_t *mops = &meta->dtm_mops;9514void *parg;95159516provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);9517str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +9518provider->dofpv_strtab * dof->dofh_secsize);9519prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +9520provider->dofpv_probes * dof->dofh_secsize);9521arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +9522provider->dofpv_prargs * dof->dofh_secsize);9523off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +9524provider->dofpv_proffs * dof->dofh_secsize);95259526strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);9527off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);9528arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);9529enoff = NULL;95309531/*9532* See dtrace_helper_provider_validate().9533*/9534if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&9535provider->dofpv_prenoffs != DOF_SECT_NONE) {9536enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +9537provider->dofpv_prenoffs * 
dof->dofh_secsize);9538enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);9539}95409541nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;95429543/*9544* Create the provider.9545*/9546dtrace_dofprov2hprov(&dhpv, provider, strtab);95479548if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)9549return;95509551meta->dtm_count++;95529553/*9554* Create the probes.9555*/9556for (i = 0; i < nprobes; i++) {9557probe = (dof_probe_t *)(uintptr_t)(daddr +9558prb_sec->dofs_offset + i * prb_sec->dofs_entsize);95599560/* See the check in dtrace_helper_provider_validate(). */9561if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)9562continue;95639564dhpb.dthpb_mod = dhp->dofhp_mod;9565dhpb.dthpb_func = strtab + probe->dofpr_func;9566dhpb.dthpb_name = strtab + probe->dofpr_name;9567dhpb.dthpb_base = probe->dofpr_addr;9568dhpb.dthpb_offs = off + probe->dofpr_offidx;9569dhpb.dthpb_noffs = probe->dofpr_noffs;9570if (enoff != NULL) {9571dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;9572dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;9573} else {9574dhpb.dthpb_enoffs = NULL;9575dhpb.dthpb_nenoffs = 0;9576}9577dhpb.dthpb_args = arg + probe->dofpr_argidx;9578dhpb.dthpb_nargc = probe->dofpr_nargc;9579dhpb.dthpb_xargc = probe->dofpr_xargc;9580dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;9581dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;95829583mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);9584}9585}95869587static void9588dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)9589{9590uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;9591dof_hdr_t *dof = (dof_hdr_t *)daddr;9592int i;95939594ASSERT(MUTEX_HELD(&dtrace_meta_lock));95959596for (i = 0; i < dof->dofh_secnum; i++) {9597dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +9598dof->dofh_secoff + i * dof->dofh_secsize);95999600if (sec->dofs_type != DOF_SECT_PROVIDER)9601continue;96029603dtrace_helper_provide_one(dhp, sec, pid);9604}96059606/*9607* We may have just created probes, so we must now 
rematch against9608* any retained enablings. Note that this call will acquire both9609* cpu_lock and dtrace_lock; the fact that we are holding9610* dtrace_meta_lock now is what defines the ordering with respect to9611* these three locks.9612*/9613dtrace_enabling_matchall();9614}96159616static void9617dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)9618{9619uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;9620dof_hdr_t *dof = (dof_hdr_t *)daddr;9621dof_sec_t *str_sec;9622dof_provider_t *provider;9623char *strtab;9624dtrace_helper_provdesc_t dhpv;9625dtrace_meta_t *meta = dtrace_meta_pid;9626dtrace_mops_t *mops = &meta->dtm_mops;96279628provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);9629str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +9630provider->dofpv_strtab * dof->dofh_secsize);96319632strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);96339634/*9635* Create the provider.9636*/9637dtrace_dofprov2hprov(&dhpv, provider, strtab);96389639mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);96409641meta->dtm_count--;9642}96439644static void9645dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)9646{9647uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;9648dof_hdr_t *dof = (dof_hdr_t *)daddr;9649int i;96509651ASSERT(MUTEX_HELD(&dtrace_meta_lock));96529653for (i = 0; i < dof->dofh_secnum; i++) {9654dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +9655dof->dofh_secoff + i * dof->dofh_secsize);96569657if (sec->dofs_type != DOF_SECT_PROVIDER)9658continue;96599660dtrace_helper_provider_remove_one(dhp, sec, pid);9661}9662}96639664/*9665* DTrace Meta Provider-to-Framework API Functions9666*9667* These functions implement the Meta Provider-to-Framework API, as described9668* in <sys/dtrace.h>.9669*/9670int9671dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,9672dtrace_meta_provider_id_t *idp)9673{9674dtrace_meta_t *meta;9675dtrace_helpers_t *help, *next;9676int i;96779678*idp = 
DTRACE_METAPROVNONE;96799680/*9681* We strictly don't need the name, but we hold onto it for9682* debuggability. All hail error queues!9683*/9684if (name == NULL) {9685cmn_err(CE_WARN, "failed to register meta-provider: "9686"invalid name");9687return (EINVAL);9688}96899690if (mops == NULL ||9691mops->dtms_create_probe == NULL ||9692mops->dtms_provide_pid == NULL ||9693mops->dtms_remove_pid == NULL) {9694cmn_err(CE_WARN, "failed to register meta-register %s: "9695"invalid ops", name);9696return (EINVAL);9697}96989699meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);9700meta->dtm_mops = *mops;9701meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);9702(void) strcpy(meta->dtm_name, name);9703meta->dtm_arg = arg;97049705mutex_enter(&dtrace_meta_lock);9706mutex_enter(&dtrace_lock);97079708if (dtrace_meta_pid != NULL) {9709mutex_exit(&dtrace_lock);9710mutex_exit(&dtrace_meta_lock);9711cmn_err(CE_WARN, "failed to register meta-register %s: "9712"user-land meta-provider exists", name);9713kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);9714kmem_free(meta, sizeof (dtrace_meta_t));9715return (EINVAL);9716}97179718dtrace_meta_pid = meta;9719*idp = (dtrace_meta_provider_id_t)meta;97209721/*9722* If there are providers and probes ready to go, pass them9723* off to the new meta provider now.9724*/97259726help = dtrace_deferred_pid;9727dtrace_deferred_pid = NULL;97289729mutex_exit(&dtrace_lock);97309731while (help != NULL) {9732for (i = 0; i < help->dthps_nprovs; i++) {9733dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,9734help->dthps_pid);9735}97369737next = help->dthps_next;9738help->dthps_next = NULL;9739help->dthps_prev = NULL;9740help->dthps_deferred = 0;9741help = next;9742}97439744mutex_exit(&dtrace_meta_lock);97459746return (0);9747}97489749int9750dtrace_meta_unregister(dtrace_meta_provider_id_t id)9751{9752dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;97539754mutex_enter(&dtrace_meta_lock);9755mutex_enter(&dtrace_lock);97569757if (old == 
dtrace_meta_pid) {
		pp = &dtrace_meta_pid;
	} else {
		panic("attempt to unregister non-existent "
		    "dtrace meta-provider %p\n", (void *)old);
	}

	/* Refuse to unregister while helper providers are still attached. */
	if (old->dtm_count != 0) {
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_meta_lock);
		return (EBUSY);
	}

	*pp = NULL;

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_meta_lock);

	kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
	kmem_free(old, sizeof (dtrace_meta_t));

	return (0);
}


/*
 * DTrace DIF Object Functions
 */

/*
 * Report a DIF validation error for the instruction at 'pc' (emitted only
 * when dtrace_err_verbose is set) and return 1 so that callers can simply
 * accumulate the return value into an error count.
 */
static int
dtrace_difo_err(uint_t pc, const char *format, ...)
{
	if (dtrace_err_verbose) {
		va_list alist;

		(void) uprintf("dtrace DIF object error: [%u]: ", pc);
		va_start(alist, format);
		(void) vuprintf(format, alist);
		va_end(alist);
	}

#ifdef DTRACE_ERRDEBUG
	dtrace_errdebug(format);
#endif
	return (1);
}

/*
 * Validate a DTrace DIF object by checking the IR instructions.  The following
 * rules are currently enforced by dtrace_difo_validate():
 *
 * 1. Each instruction must have a valid opcode
 * 2. Each register, string, variable, or subroutine reference must be valid
 * 3. No instruction can modify register %r0 (must be zero)
 * 4. All instruction reserved bits must be set to zero
 * 5. The last instruction must be a "ret" instruction
 * 6. All branch targets must reference a valid instruction _after_ the branch
 */
static int
dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
    cred_t *cr)
{
	int err = 0, i;
	int (*efunc)(uint_t pc, const char *, ...) 
= dtrace_difo_err;9820int kcheckload;9821uint_t pc;9822int maxglobal = -1, maxlocal = -1, maxtlocal = -1;98239824kcheckload = cr == NULL ||9825(vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;98269827dp->dtdo_destructive = 0;98289829for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {9830dif_instr_t instr = dp->dtdo_buf[pc];98319832uint_t r1 = DIF_INSTR_R1(instr);9833uint_t r2 = DIF_INSTR_R2(instr);9834uint_t rd = DIF_INSTR_RD(instr);9835uint_t rs = DIF_INSTR_RS(instr);9836uint_t label = DIF_INSTR_LABEL(instr);9837uint_t v = DIF_INSTR_VAR(instr);9838uint_t subr = DIF_INSTR_SUBR(instr);9839uint_t type = DIF_INSTR_TYPE(instr);9840uint_t op = DIF_INSTR_OP(instr);98419842switch (op) {9843case DIF_OP_OR:9844case DIF_OP_XOR:9845case DIF_OP_AND:9846case DIF_OP_SLL:9847case DIF_OP_SRL:9848case DIF_OP_SRA:9849case DIF_OP_SUB:9850case DIF_OP_ADD:9851case DIF_OP_MUL:9852case DIF_OP_SDIV:9853case DIF_OP_UDIV:9854case DIF_OP_SREM:9855case DIF_OP_UREM:9856case DIF_OP_COPYS:9857if (r1 >= nregs)9858err += efunc(pc, "invalid register %u\n", r1);9859if (r2 >= nregs)9860err += efunc(pc, "invalid register %u\n", r2);9861if (rd >= nregs)9862err += efunc(pc, "invalid register %u\n", rd);9863if (rd == 0)9864err += efunc(pc, "cannot write to %%r0\n");9865break;9866case DIF_OP_NOT:9867case DIF_OP_MOV:9868case DIF_OP_ALLOCS:9869if (r1 >= nregs)9870err += efunc(pc, "invalid register %u\n", r1);9871if (r2 != 0)9872err += efunc(pc, "non-zero reserved bits\n");9873if (rd >= nregs)9874err += efunc(pc, "invalid register %u\n", rd);9875if (rd == 0)9876err += efunc(pc, "cannot write to %%r0\n");9877break;9878case DIF_OP_LDSB:9879case DIF_OP_LDSH:9880case DIF_OP_LDSW:9881case DIF_OP_LDUB:9882case DIF_OP_LDUH:9883case DIF_OP_LDUW:9884case DIF_OP_LDX:9885if (r1 >= nregs)9886err += efunc(pc, "invalid register %u\n", r1);9887if (r2 != 0)9888err += efunc(pc, "non-zero reserved bits\n");9889if (rd >= nregs)9890err += efunc(pc, "invalid register %u\n", rd);9891if (rd == 0)9892err += 
efunc(pc, "cannot write to %%r0\n");9893if (kcheckload)9894dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +9895DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);9896break;9897case DIF_OP_RLDSB:9898case DIF_OP_RLDSH:9899case DIF_OP_RLDSW:9900case DIF_OP_RLDUB:9901case DIF_OP_RLDUH:9902case DIF_OP_RLDUW:9903case DIF_OP_RLDX:9904if (r1 >= nregs)9905err += efunc(pc, "invalid register %u\n", r1);9906if (r2 != 0)9907err += efunc(pc, "non-zero reserved bits\n");9908if (rd >= nregs)9909err += efunc(pc, "invalid register %u\n", rd);9910if (rd == 0)9911err += efunc(pc, "cannot write to %%r0\n");9912break;9913case DIF_OP_ULDSB:9914case DIF_OP_ULDSH:9915case DIF_OP_ULDSW:9916case DIF_OP_ULDUB:9917case DIF_OP_ULDUH:9918case DIF_OP_ULDUW:9919case DIF_OP_ULDX:9920if (r1 >= nregs)9921err += efunc(pc, "invalid register %u\n", r1);9922if (r2 != 0)9923err += efunc(pc, "non-zero reserved bits\n");9924if (rd >= nregs)9925err += efunc(pc, "invalid register %u\n", rd);9926if (rd == 0)9927err += efunc(pc, "cannot write to %%r0\n");9928break;9929case DIF_OP_STB:9930case DIF_OP_STH:9931case DIF_OP_STW:9932case DIF_OP_STX:9933if (r1 >= nregs)9934err += efunc(pc, "invalid register %u\n", r1);9935if (r2 != 0)9936err += efunc(pc, "non-zero reserved bits\n");9937if (rd >= nregs)9938err += efunc(pc, "invalid register %u\n", rd);9939if (rd == 0)9940err += efunc(pc, "cannot write to 0 address\n");9941break;9942case DIF_OP_CMP:9943case DIF_OP_SCMP:9944if (r1 >= nregs)9945err += efunc(pc, "invalid register %u\n", r1);9946if (r2 >= nregs)9947err += efunc(pc, "invalid register %u\n", r2);9948if (rd != 0)9949err += efunc(pc, "non-zero reserved bits\n");9950break;9951case DIF_OP_TST:9952if (r1 >= nregs)9953err += efunc(pc, "invalid register %u\n", r1);9954if (r2 != 0 || rd != 0)9955err += efunc(pc, "non-zero reserved bits\n");9956break;9957case DIF_OP_BA:9958case DIF_OP_BE:9959case DIF_OP_BNE:9960case DIF_OP_BG:9961case DIF_OP_BGU:9962case DIF_OP_BGE:9963case DIF_OP_BGEU:9964case DIF_OP_BL:9965case DIF_OP_BLU:9966case 
DIF_OP_BLE:9967case DIF_OP_BLEU:9968if (label >= dp->dtdo_len) {9969err += efunc(pc, "invalid branch target %u\n",9970label);9971}9972if (label <= pc) {9973err += efunc(pc, "backward branch to %u\n",9974label);9975}9976break;9977case DIF_OP_RET:9978if (r1 != 0 || r2 != 0)9979err += efunc(pc, "non-zero reserved bits\n");9980if (rd >= nregs)9981err += efunc(pc, "invalid register %u\n", rd);9982break;9983case DIF_OP_NOP:9984case DIF_OP_POPTS:9985case DIF_OP_FLUSHTS:9986if (r1 != 0 || r2 != 0 || rd != 0)9987err += efunc(pc, "non-zero reserved bits\n");9988break;9989case DIF_OP_SETX:9990if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {9991err += efunc(pc, "invalid integer ref %u\n",9992DIF_INSTR_INTEGER(instr));9993}9994if (rd >= nregs)9995err += efunc(pc, "invalid register %u\n", rd);9996if (rd == 0)9997err += efunc(pc, "cannot write to %%r0\n");9998break;9999case DIF_OP_SETS:10000if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {10001err += efunc(pc, "invalid string ref %u\n",10002DIF_INSTR_STRING(instr));10003}10004if (rd >= nregs)10005err += efunc(pc, "invalid register %u\n", rd);10006if (rd == 0)10007err += efunc(pc, "cannot write to %%r0\n");10008break;10009case DIF_OP_LDGA:10010case DIF_OP_LDTA:10011if (r1 > DIF_VAR_ARRAY_MAX)10012err += efunc(pc, "invalid array %u\n", r1);10013if (r2 >= nregs)10014err += efunc(pc, "invalid register %u\n", r2);10015if (rd >= nregs)10016err += efunc(pc, "invalid register %u\n", rd);10017if (rd == 0)10018err += efunc(pc, "cannot write to %%r0\n");10019break;10020case DIF_OP_LDGS:10021case DIF_OP_LDTS:10022case DIF_OP_LDLS:10023case DIF_OP_LDGAA:10024case DIF_OP_LDTAA:10025if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)10026err += efunc(pc, "invalid variable %u\n", v);10027if (rd >= nregs)10028err += efunc(pc, "invalid register %u\n", rd);10029if (rd == 0)10030err += efunc(pc, "cannot write to %%r0\n");10031break;10032case DIF_OP_STGS:10033case DIF_OP_STTS:10034case DIF_OP_STLS:10035case DIF_OP_STGAA:10036case 
DIF_OP_STTAA:10037if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)10038err += efunc(pc, "invalid variable %u\n", v);10039if (rs >= nregs)10040err += efunc(pc, "invalid register %u\n", rd);10041break;10042case DIF_OP_CALL:10043if (subr > DIF_SUBR_MAX)10044err += efunc(pc, "invalid subr %u\n", subr);10045if (rd >= nregs)10046err += efunc(pc, "invalid register %u\n", rd);10047if (rd == 0)10048err += efunc(pc, "cannot write to %%r0\n");1004910050if (subr == DIF_SUBR_COPYOUT ||10051subr == DIF_SUBR_COPYOUTSTR) {10052dp->dtdo_destructive = 1;10053}1005410055if (subr == DIF_SUBR_GETF) {10056#ifdef __FreeBSD__10057err += efunc(pc, "getf() not supported");10058#else10059/*10060* If we have a getf() we need to record that10061* in our state. Note that our state can be10062* NULL if this is a helper -- but in that10063* case, the call to getf() is itself illegal,10064* and will be caught (slightly later) when10065* the helper is validated.10066*/10067if (vstate->dtvs_state != NULL)10068vstate->dtvs_state->dts_getf++;10069#endif10070}1007110072break;10073case DIF_OP_PUSHTR:10074if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)10075err += efunc(pc, "invalid ref type %u\n", type);10076if (r2 >= nregs)10077err += efunc(pc, "invalid register %u\n", r2);10078if (rs >= nregs)10079err += efunc(pc, "invalid register %u\n", rs);10080break;10081case DIF_OP_PUSHTV:10082if (type != DIF_TYPE_CTF)10083err += efunc(pc, "invalid val type %u\n", type);10084if (r2 >= nregs)10085err += efunc(pc, "invalid register %u\n", r2);10086if (rs >= nregs)10087err += efunc(pc, "invalid register %u\n", rs);10088break;10089default:10090err += efunc(pc, "invalid opcode %u\n",10091DIF_INSTR_OP(instr));10092}10093}1009410095if (dp->dtdo_len != 0 &&10096DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {10097err += efunc(dp->dtdo_len - 1,10098"expected 'ret' as last DIF instruction\n");10099}1010010101if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {10102/*10103* If we're not 
returning by reference, the size must be either10104* 0 or the size of one of the base types.10105*/10106switch (dp->dtdo_rtype.dtdt_size) {10107case 0:10108case sizeof (uint8_t):10109case sizeof (uint16_t):10110case sizeof (uint32_t):10111case sizeof (uint64_t):10112break;1011310114default:10115err += efunc(dp->dtdo_len - 1, "bad return size\n");10116}10117}1011810119for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {10120dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;10121dtrace_diftype_t *vt, *et;10122uint_t id, ndx;1012310124if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&10125v->dtdv_scope != DIFV_SCOPE_THREAD &&10126v->dtdv_scope != DIFV_SCOPE_LOCAL) {10127err += efunc(i, "unrecognized variable scope %d\n",10128v->dtdv_scope);10129break;10130}1013110132if (v->dtdv_kind != DIFV_KIND_ARRAY &&10133v->dtdv_kind != DIFV_KIND_SCALAR) {10134err += efunc(i, "unrecognized variable type %d\n",10135v->dtdv_kind);10136break;10137}1013810139if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {10140err += efunc(i, "%d exceeds variable id limit\n", id);10141break;10142}1014310144if (id < DIF_VAR_OTHER_UBASE)10145continue;1014610147/*10148* For user-defined variables, we need to check that this10149* definition is identical to any previous definition that we10150* encountered.10151*/10152ndx = id - DIF_VAR_OTHER_UBASE;1015310154switch (v->dtdv_scope) {10155case DIFV_SCOPE_GLOBAL:10156if (maxglobal == -1 || ndx > maxglobal)10157maxglobal = ndx;1015810159if (ndx < vstate->dtvs_nglobals) {10160dtrace_statvar_t *svar;1016110162if ((svar = vstate->dtvs_globals[ndx]) != NULL)10163existing = &svar->dtsv_var;10164}1016510166break;1016710168case DIFV_SCOPE_THREAD:10169if (maxtlocal == -1 || ndx > maxtlocal)10170maxtlocal = ndx;1017110172if (ndx < vstate->dtvs_ntlocals)10173existing = &vstate->dtvs_tlocals[ndx];10174break;1017510176case DIFV_SCOPE_LOCAL:10177if (maxlocal == -1 || ndx > maxlocal)10178maxlocal = ndx;1017910180if (ndx < vstate->dtvs_nlocals) {10181dtrace_statvar_t 
*svar;1018210183if ((svar = vstate->dtvs_locals[ndx]) != NULL)10184existing = &svar->dtsv_var;10185}1018610187break;10188}1018910190vt = &v->dtdv_type;1019110192if (vt->dtdt_flags & DIF_TF_BYREF) {10193if (vt->dtdt_size == 0) {10194err += efunc(i, "zero-sized variable\n");10195break;10196}1019710198if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||10199v->dtdv_scope == DIFV_SCOPE_LOCAL) &&10200vt->dtdt_size > dtrace_statvar_maxsize) {10201err += efunc(i, "oversized by-ref static\n");10202break;10203}10204}1020510206if (existing == NULL || existing->dtdv_id == 0)10207continue;1020810209ASSERT(existing->dtdv_id == v->dtdv_id);10210ASSERT(existing->dtdv_scope == v->dtdv_scope);1021110212if (existing->dtdv_kind != v->dtdv_kind)10213err += efunc(i, "%d changed variable kind\n", id);1021410215et = &existing->dtdv_type;1021610217if (vt->dtdt_flags != et->dtdt_flags) {10218err += efunc(i, "%d changed variable type flags\n", id);10219break;10220}1022110222if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {10223err += efunc(i, "%d changed variable type size\n", id);10224break;10225}10226}1022710228for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {10229dif_instr_t instr = dp->dtdo_buf[pc];1023010231uint_t v = DIF_INSTR_VAR(instr);10232uint_t op = DIF_INSTR_OP(instr);1023310234switch (op) {10235case DIF_OP_LDGS:10236case DIF_OP_LDGAA:10237case DIF_OP_STGS:10238case DIF_OP_STGAA:10239if (v > DIF_VAR_OTHER_UBASE + maxglobal)10240err += efunc(pc, "invalid variable %u\n", v);10241break;10242case DIF_OP_LDTS:10243case DIF_OP_LDTAA:10244case DIF_OP_STTS:10245case DIF_OP_STTAA:10246if (v > DIF_VAR_OTHER_UBASE + maxtlocal)10247err += efunc(pc, "invalid variable %u\n", v);10248break;10249case DIF_OP_LDLS:10250case DIF_OP_STLS:10251if (v > DIF_VAR_OTHER_UBASE + maxlocal)10252err += efunc(pc, "invalid variable %u\n", v);10253break;10254default:10255break;10256}10257}1025810259return (err);10260}1026110262/*10263* Validate a DTrace DIF object that it is to be used as a helper. 
Helpers10264* are much more constrained than normal DIFOs. Specifically, they may10265* not:10266*10267* 1. Make calls to subroutines other than copyin(), copyinstr() or10268* miscellaneous string routines10269* 2. Access DTrace variables other than the args[] array, and the10270* curthread, pid, ppid, tid, execname, zonename, uid and gid variables.10271* 3. Have thread-local variables.10272* 4. Have dynamic variables.10273*/10274static int10275dtrace_difo_validate_helper(dtrace_difo_t *dp)10276{10277int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;10278int err = 0;10279uint_t pc;1028010281for (pc = 0; pc < dp->dtdo_len; pc++) {10282dif_instr_t instr = dp->dtdo_buf[pc];1028310284uint_t v = DIF_INSTR_VAR(instr);10285uint_t subr = DIF_INSTR_SUBR(instr);10286uint_t op = DIF_INSTR_OP(instr);1028710288switch (op) {10289case DIF_OP_OR:10290case DIF_OP_XOR:10291case DIF_OP_AND:10292case DIF_OP_SLL:10293case DIF_OP_SRL:10294case DIF_OP_SRA:10295case DIF_OP_SUB:10296case DIF_OP_ADD:10297case DIF_OP_MUL:10298case DIF_OP_SDIV:10299case DIF_OP_UDIV:10300case DIF_OP_SREM:10301case DIF_OP_UREM:10302case DIF_OP_COPYS:10303case DIF_OP_NOT:10304case DIF_OP_MOV:10305case DIF_OP_RLDSB:10306case DIF_OP_RLDSH:10307case DIF_OP_RLDSW:10308case DIF_OP_RLDUB:10309case DIF_OP_RLDUH:10310case DIF_OP_RLDUW:10311case DIF_OP_RLDX:10312case DIF_OP_ULDSB:10313case DIF_OP_ULDSH:10314case DIF_OP_ULDSW:10315case DIF_OP_ULDUB:10316case DIF_OP_ULDUH:10317case DIF_OP_ULDUW:10318case DIF_OP_ULDX:10319case DIF_OP_STB:10320case DIF_OP_STH:10321case DIF_OP_STW:10322case DIF_OP_STX:10323case DIF_OP_ALLOCS:10324case DIF_OP_CMP:10325case DIF_OP_SCMP:10326case DIF_OP_TST:10327case DIF_OP_BA:10328case DIF_OP_BE:10329case DIF_OP_BNE:10330case DIF_OP_BG:10331case DIF_OP_BGU:10332case DIF_OP_BGE:10333case DIF_OP_BGEU:10334case DIF_OP_BL:10335case DIF_OP_BLU:10336case DIF_OP_BLE:10337case DIF_OP_BLEU:10338case DIF_OP_RET:10339case DIF_OP_NOP:10340case DIF_OP_POPTS:10341case DIF_OP_FLUSHTS:10342case 
DIF_OP_SETX:10343case DIF_OP_SETS:10344case DIF_OP_LDGA:10345case DIF_OP_LDLS:10346case DIF_OP_STGS:10347case DIF_OP_STLS:10348case DIF_OP_PUSHTR:10349case DIF_OP_PUSHTV:10350break;1035110352case DIF_OP_LDGS:10353if (v >= DIF_VAR_OTHER_UBASE)10354break;1035510356if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)10357break;1035810359if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||10360v == DIF_VAR_PPID || v == DIF_VAR_TID ||10361v == DIF_VAR_EXECARGS ||10362v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||10363v == DIF_VAR_UID || v == DIF_VAR_GID)10364break;1036510366err += efunc(pc, "illegal variable %u\n", v);10367break;1036810369case DIF_OP_LDTA:10370case DIF_OP_LDTS:10371case DIF_OP_LDGAA:10372case DIF_OP_LDTAA:10373err += efunc(pc, "illegal dynamic variable load\n");10374break;1037510376case DIF_OP_STTS:10377case DIF_OP_STGAA:10378case DIF_OP_STTAA:10379err += efunc(pc, "illegal dynamic variable store\n");10380break;1038110382case DIF_OP_CALL:10383if (subr == DIF_SUBR_ALLOCA ||10384subr == DIF_SUBR_BCOPY ||10385subr == DIF_SUBR_COPYIN ||10386subr == DIF_SUBR_COPYINTO ||10387subr == DIF_SUBR_COPYINSTR ||10388subr == DIF_SUBR_INDEX ||10389subr == DIF_SUBR_INET_NTOA ||10390subr == DIF_SUBR_INET_NTOA6 ||10391subr == DIF_SUBR_INET_NTOP ||10392subr == DIF_SUBR_JSON ||10393subr == DIF_SUBR_LLTOSTR ||10394subr == DIF_SUBR_STRTOLL ||10395subr == DIF_SUBR_RINDEX ||10396subr == DIF_SUBR_STRCHR ||10397subr == DIF_SUBR_STRJOIN ||10398subr == DIF_SUBR_STRRCHR ||10399subr == DIF_SUBR_STRSTR ||10400subr == DIF_SUBR_HTONS ||10401subr == DIF_SUBR_HTONL ||10402subr == DIF_SUBR_HTONLL ||10403subr == DIF_SUBR_NTOHS ||10404subr == DIF_SUBR_NTOHL ||10405subr == DIF_SUBR_NTOHLL ||10406subr == DIF_SUBR_MEMREF)10407break;10408#ifdef __FreeBSD__10409if (subr == DIF_SUBR_MEMSTR)10410break;10411#endif1041210413err += efunc(pc, "invalid subr %u\n", subr);10414break;1041510416default:10417err += efunc(pc, "invalid opcode %u\n",10418DIF_INSTR_OP(instr));10419}10420}1042110422return 
(err);10423}1042410425/*10426* Returns 1 if the expression in the DIF object can be cached on a per-thread10427* basis; 0 if not.10428*/10429static int10430dtrace_difo_cacheable(dtrace_difo_t *dp)10431{10432int i;1043310434if (dp == NULL)10435return (0);1043610437for (i = 0; i < dp->dtdo_varlen; i++) {10438dtrace_difv_t *v = &dp->dtdo_vartab[i];1043910440if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)10441continue;1044210443switch (v->dtdv_id) {10444case DIF_VAR_CURTHREAD:10445case DIF_VAR_PID:10446case DIF_VAR_TID:10447case DIF_VAR_EXECARGS:10448case DIF_VAR_EXECNAME:10449case DIF_VAR_ZONENAME:10450break;1045110452default:10453return (0);10454}10455}1045610457/*10458* This DIF object may be cacheable. Now we need to look for any10459* array loading instructions, any memory loading instructions, or10460* any stores to thread-local variables.10461*/10462for (i = 0; i < dp->dtdo_len; i++) {10463uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);1046410465if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||10466(op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||10467(op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||10468op == DIF_OP_LDGA || op == DIF_OP_STTS)10469return (0);10470}1047110472return (1);10473}1047410475static void10476dtrace_difo_hold(dtrace_difo_t *dp)10477{10478int i;1047910480ASSERT(MUTEX_HELD(&dtrace_lock));1048110482dp->dtdo_refcnt++;10483ASSERT(dp->dtdo_refcnt != 0);1048410485/*10486* We need to check this DIF object for references to the variable10487* DIF_VAR_VTIMESTAMP.10488*/10489for (i = 0; i < dp->dtdo_varlen; i++) {10490dtrace_difv_t *v = &dp->dtdo_vartab[i];1049110492if (v->dtdv_id != DIF_VAR_VTIMESTAMP)10493continue;1049410495if (dtrace_vtime_references++ == 0)10496dtrace_vtime_enable();10497}10498}1049910500/*10501* This routine calculates the dynamic variable chunksize for a given DIF10502* object. The calculation is not fool-proof, and can probably be tricked by10503* malicious DIF -- but it works for all compiler-generated DIF. 
Because this10504* calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail10505* if a dynamic variable size exceeds the chunksize.10506*/10507static void10508dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)10509{10510uint64_t sval = 0;10511dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */10512const dif_instr_t *text = dp->dtdo_buf;10513uint_t pc, srd = 0;10514uint_t ttop = 0;10515size_t size, ksize;10516uint_t id, i;1051710518for (pc = 0; pc < dp->dtdo_len; pc++) {10519dif_instr_t instr = text[pc];10520uint_t op = DIF_INSTR_OP(instr);10521uint_t rd = DIF_INSTR_RD(instr);10522uint_t r1 = DIF_INSTR_R1(instr);10523uint_t nkeys = 0;10524uchar_t scope = 0;1052510526dtrace_key_t *key = tupregs;1052710528switch (op) {10529case DIF_OP_SETX:10530sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];10531srd = rd;10532continue;1053310534case DIF_OP_STTS:10535key = &tupregs[DIF_DTR_NREGS];10536key[0].dttk_size = 0;10537key[1].dttk_size = 0;10538nkeys = 2;10539scope = DIFV_SCOPE_THREAD;10540break;1054110542case DIF_OP_STGAA:10543case DIF_OP_STTAA:10544nkeys = ttop;1054510546if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)10547key[nkeys++].dttk_size = 0;1054810549key[nkeys++].dttk_size = 0;1055010551if (op == DIF_OP_STTAA) {10552scope = DIFV_SCOPE_THREAD;10553} else {10554scope = DIFV_SCOPE_GLOBAL;10555}1055610557break;1055810559case DIF_OP_PUSHTR:10560if (ttop == DIF_DTR_NREGS)10561return;1056210563if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {10564/*10565* If the register for the size of the "pushtr"10566* is %r0 (or the value is 0) and the type is10567* a string, we'll use the system-wide default10568* string size.10569*/10570tupregs[ttop++].dttk_size =10571dtrace_strsize_default;10572} else {10573if (srd == 0)10574return;1057510576if (sval > LONG_MAX)10577return;1057810579tupregs[ttop++].dttk_size = sval;10580}1058110582break;1058310584case DIF_OP_PUSHTV:10585if (ttop == 
DIF_DTR_NREGS)10586return;1058710588tupregs[ttop++].dttk_size = 0;10589break;1059010591case DIF_OP_FLUSHTS:10592ttop = 0;10593break;1059410595case DIF_OP_POPTS:10596if (ttop != 0)10597ttop--;10598break;10599}1060010601sval = 0;10602srd = 0;1060310604if (nkeys == 0)10605continue;1060610607/*10608* We have a dynamic variable allocation; calculate its size.10609*/10610for (ksize = 0, i = 0; i < nkeys; i++)10611ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));1061210613size = sizeof (dtrace_dynvar_t);10614size += sizeof (dtrace_key_t) * (nkeys - 1);10615size += ksize;1061610617/*10618* Now we need to determine the size of the stored data.10619*/10620id = DIF_INSTR_VAR(instr);1062110622for (i = 0; i < dp->dtdo_varlen; i++) {10623dtrace_difv_t *v = &dp->dtdo_vartab[i];1062410625if (v->dtdv_id == id && v->dtdv_scope == scope) {10626size += v->dtdv_type.dtdt_size;10627break;10628}10629}1063010631if (i == dp->dtdo_varlen)10632return;1063310634/*10635* We have the size. If this is larger than the chunk size10636* for our dynamic variable state, reset the chunk size.10637*/10638size = P2ROUNDUP(size, sizeof (uint64_t));1063910640/*10641* Before setting the chunk size, check that we're not going10642* to set it to a negative value...10643*/10644if (size > LONG_MAX)10645return;1064610647/*10648* ...and make certain that we didn't badly overflow.10649*/10650if (size < ksize || size < sizeof (dtrace_dynvar_t))10651return;1065210653if (size > vstate->dtvs_dynvars.dtds_chunksize)10654vstate->dtvs_dynvars.dtds_chunksize = size;10655}10656}1065710658static void10659dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)10660{10661int i, oldsvars, osz, nsz, otlocals, ntlocals;10662uint_t id;1066310664ASSERT(MUTEX_HELD(&dtrace_lock));10665ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);1066610667for (i = 0; i < dp->dtdo_varlen; i++) {10668dtrace_difv_t *v = &dp->dtdo_vartab[i];10669dtrace_statvar_t *svar, ***svarp = NULL;10670size_t dsize = 0;10671uint8_t scope = 
v->dtdv_scope;10672int *np = NULL;1067310674if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)10675continue;1067610677id -= DIF_VAR_OTHER_UBASE;1067810679switch (scope) {10680case DIFV_SCOPE_THREAD:10681while (id >= (otlocals = vstate->dtvs_ntlocals)) {10682dtrace_difv_t *tlocals;1068310684if ((ntlocals = (otlocals << 1)) == 0)10685ntlocals = 1;1068610687osz = otlocals * sizeof (dtrace_difv_t);10688nsz = ntlocals * sizeof (dtrace_difv_t);1068910690tlocals = kmem_zalloc(nsz, KM_SLEEP);1069110692if (osz != 0) {10693bcopy(vstate->dtvs_tlocals,10694tlocals, osz);10695kmem_free(vstate->dtvs_tlocals, osz);10696}1069710698vstate->dtvs_tlocals = tlocals;10699vstate->dtvs_ntlocals = ntlocals;10700}1070110702vstate->dtvs_tlocals[id] = *v;10703continue;1070410705case DIFV_SCOPE_LOCAL:10706np = &vstate->dtvs_nlocals;10707svarp = &vstate->dtvs_locals;1070810709if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)10710dsize = (mp_maxid + 1) *10711(v->dtdv_type.dtdt_size +10712sizeof (uint64_t));10713else10714dsize = (mp_maxid + 1) * sizeof (uint64_t);1071510716break;1071710718case DIFV_SCOPE_GLOBAL:10719np = &vstate->dtvs_nglobals;10720svarp = &vstate->dtvs_globals;1072110722if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)10723dsize = v->dtdv_type.dtdt_size +10724sizeof (uint64_t);1072510726break;1072710728default:10729ASSERT(0);10730}1073110732while (id >= (oldsvars = *np)) {10733dtrace_statvar_t **statics;10734int newsvars, oldsize, newsize;1073510736if ((newsvars = (oldsvars << 1)) == 0)10737newsvars = 1;1073810739oldsize = oldsvars * sizeof (dtrace_statvar_t *);10740newsize = newsvars * sizeof (dtrace_statvar_t *);1074110742statics = kmem_zalloc(newsize, KM_SLEEP);1074310744if (oldsize != 0) {10745bcopy(*svarp, statics, oldsize);10746kmem_free(*svarp, oldsize);10747}1074810749*svarp = statics;10750*np = newsvars;10751}1075210753if ((svar = (*svarp)[id]) == NULL) {10754svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);10755svar->dtsv_var = *v;1075610757if ((svar->dtsv_size = dsize) != 0) 
{10758svar->dtsv_data = (uint64_t)(uintptr_t)10759kmem_zalloc(dsize, KM_SLEEP);10760}1076110762(*svarp)[id] = svar;10763}1076410765svar->dtsv_refcnt++;10766}1076710768dtrace_difo_chunksize(dp, vstate);10769dtrace_difo_hold(dp);10770}1077110772static dtrace_difo_t *10773dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)10774{10775dtrace_difo_t *new;10776size_t sz;1077710778ASSERT(dp->dtdo_buf != NULL);10779ASSERT(dp->dtdo_refcnt != 0);1078010781new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);1078210783ASSERT(dp->dtdo_buf != NULL);10784sz = dp->dtdo_len * sizeof (dif_instr_t);10785new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);10786bcopy(dp->dtdo_buf, new->dtdo_buf, sz);10787new->dtdo_len = dp->dtdo_len;1078810789if (dp->dtdo_strtab != NULL) {10790ASSERT(dp->dtdo_strlen != 0);10791new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);10792bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);10793new->dtdo_strlen = dp->dtdo_strlen;10794}1079510796if (dp->dtdo_inttab != NULL) {10797ASSERT(dp->dtdo_intlen != 0);10798sz = dp->dtdo_intlen * sizeof (uint64_t);10799new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);10800bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);10801new->dtdo_intlen = dp->dtdo_intlen;10802}1080310804if (dp->dtdo_vartab != NULL) {10805ASSERT(dp->dtdo_varlen != 0);10806sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);10807new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);10808bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);10809new->dtdo_varlen = dp->dtdo_varlen;10810}1081110812dtrace_difo_init(new, vstate);10813return (new);10814}1081510816static void10817dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)10818{10819int i;1082010821ASSERT(dp->dtdo_refcnt == 0);1082210823for (i = 0; i < dp->dtdo_varlen; i++) {10824dtrace_difv_t *v = &dp->dtdo_vartab[i];10825dtrace_statvar_t *svar, **svarp = NULL;10826uint_t id;10827uint8_t scope = v->dtdv_scope;10828int *np = NULL;1082910830switch (scope) {10831case 
DIFV_SCOPE_THREAD:10832continue;1083310834case DIFV_SCOPE_LOCAL:10835np = &vstate->dtvs_nlocals;10836svarp = vstate->dtvs_locals;10837break;1083810839case DIFV_SCOPE_GLOBAL:10840np = &vstate->dtvs_nglobals;10841svarp = vstate->dtvs_globals;10842break;1084310844default:10845ASSERT(0);10846}1084710848if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)10849continue;1085010851id -= DIF_VAR_OTHER_UBASE;10852ASSERT(id < *np);1085310854svar = svarp[id];10855ASSERT(svar != NULL);10856ASSERT(svar->dtsv_refcnt > 0);1085710858if (--svar->dtsv_refcnt > 0)10859continue;1086010861if (svar->dtsv_size != 0) {10862ASSERT(svar->dtsv_data != 0);10863kmem_free((void *)(uintptr_t)svar->dtsv_data,10864svar->dtsv_size);10865}1086610867kmem_free(svar, sizeof (dtrace_statvar_t));10868svarp[id] = NULL;10869}1087010871if (dp->dtdo_buf != NULL)10872kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));10873if (dp->dtdo_inttab != NULL)10874kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));10875if (dp->dtdo_strtab != NULL)10876kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);10877if (dp->dtdo_vartab != NULL)10878kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));1087910880kmem_free(dp, sizeof (dtrace_difo_t));10881}1088210883static void10884dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)10885{10886int i;1088710888ASSERT(MUTEX_HELD(&dtrace_lock));10889ASSERT(dp->dtdo_refcnt != 0);1089010891for (i = 0; i < dp->dtdo_varlen; i++) {10892dtrace_difv_t *v = &dp->dtdo_vartab[i];1089310894if (v->dtdv_id != DIF_VAR_VTIMESTAMP)10895continue;1089610897ASSERT(dtrace_vtime_references > 0);10898if (--dtrace_vtime_references == 0)10899dtrace_vtime_disable();10900}1090110902if (--dp->dtdo_refcnt == 0)10903dtrace_difo_destroy(dp, vstate);10904}1090510906/*10907* DTrace Format Functions10908*/10909static uint16_t10910dtrace_format_add(dtrace_state_t *state, char *str)10911{10912char *fmt, **new;10913uint16_t ndx, len = strlen(str) + 1;1091410915fmt = kmem_zalloc(len, 
KM_SLEEP);
	bcopy(str, fmt, len);

	/* Reuse a free slot in the formats array if one exists. */
	for (ndx = 0; ndx < state->dts_nformats; ndx++) {
		if (state->dts_formats[ndx] == NULL) {
			state->dts_formats[ndx] = fmt;
			/* Format handles are 1-based; 0 means failure. */
			return (ndx + 1);
		}
	}

	if (state->dts_nformats == USHRT_MAX) {
		/*
		 * This is only likely if a denial-of-service attack is being
		 * attempted.  As such, it's okay to fail silently here.
		 */
		kmem_free(fmt, len);
		return (0);
	}

	/*
	 * For simplicity, we always resize the formats array to be exactly the
	 * number of formats.
	 */
	ndx = state->dts_nformats++;
	new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);

	if (state->dts_formats != NULL) {
		ASSERT(ndx != 0);
		bcopy(state->dts_formats, new, ndx * sizeof (char *));
		kmem_free(state->dts_formats, ndx * sizeof (char *));
	}

	state->dts_formats = new;
	state->dts_formats[ndx] = fmt;

	return (ndx + 1);
}

/*
 * Free the format string associated with 1-based handle 'format' and mark
 * its slot as available for reuse.
 */
static void
dtrace_format_remove(dtrace_state_t *state, uint16_t format)
{
	char *fmt;

	ASSERT(state->dts_formats != NULL);
	ASSERT(format <= state->dts_nformats);
	ASSERT(state->dts_formats[format - 1] != NULL);

	fmt = state->dts_formats[format - 1];
	kmem_free(fmt, strlen(fmt) + 1);
	state->dts_formats[format - 1] = NULL;
}

/*
 * Free every registered format string and the formats array itself,
 * resetting the state's format bookkeeping.
 */
static void
dtrace_format_destroy(dtrace_state_t *state)
{
	int i;

	if (state->dts_nformats == 0) {
		ASSERT(state->dts_formats == NULL);
		return;
	}

	ASSERT(state->dts_formats != NULL);

	for (i = 0; i < state->dts_nformats; i++) {
		char *fmt = state->dts_formats[i];

		if (fmt == NULL)
			continue;

		kmem_free(fmt, strlen(fmt) + 1);
	}

	kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
	state->dts_nformats = 0;
	state->dts_formats = NULL;
}

/*
 * DTrace Predicate Functions
 */
static dtrace_predicate_t 
*
dtrace_predicate_create(dtrace_difo_t *dp)
{
	dtrace_predicate_t *pred;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp->dtdo_refcnt != 0);

	/* The new predicate starts with a single reference. */
	pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
	pred->dtp_difo = dp;
	pred->dtp_refcnt = 1;

	/* Non-cacheable DIFOs get no predicate cache ID. */
	if (!dtrace_difo_cacheable(dp))
		return (pred);

	if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
		/*
		 * This is only theoretically possible -- we have had 2^32
		 * cacheable predicates on this machine.  We cannot allow any
		 * more predicates to become cacheable:  as unlikely as it is,
		 * there may be a thread caching a (now stale) predicate cache
		 * ID. (N.B.: the temptation is being successfully resisted to
		 * have this cmn_err() "Holy shit -- we executed this code!")
		 */
		return (pred);
	}

	pred->dtp_cacheid = dtrace_predcache_id++;

	return (pred);
}

/*
 * Take a reference on a predicate.  Caller must hold dtrace_lock.
 */
static void
dtrace_predicate_hold(dtrace_predicate_t *pred)
{
	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
	ASSERT(pred->dtp_refcnt > 0);

	pred->dtp_refcnt++;
}

/*
 * Drop a reference on a predicate; on the last release, release the
 * underlying DIFO and free the predicate.  Caller must hold dtrace_lock.
 */
static void
dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
{
	dtrace_difo_t *dp = pred->dtp_difo;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
	ASSERT(pred->dtp_refcnt > 0);

	if (--pred->dtp_refcnt == 0) {
		dtrace_difo_release(pred->dtp_difo, vstate);
		kmem_free(pred, sizeof (dtrace_predicate_t));
	}
}

/*
 * DTrace Action Description Functions
 */
static dtrace_actdesc_t *
dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
    uint64_t uarg, uint64_t arg)
{
	dtrace_actdesc_t *act;

#ifdef illumos
	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
#endif

	act = 
kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);11068act->dtad_kind = kind;11069act->dtad_ntuple = ntuple;11070act->dtad_uarg = uarg;11071act->dtad_arg = arg;11072act->dtad_refcnt = 1;1107311074return (act);11075}1107611077static void11078dtrace_actdesc_hold(dtrace_actdesc_t *act)11079{11080ASSERT(act->dtad_refcnt >= 1);11081act->dtad_refcnt++;11082}1108311084static void11085dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)11086{11087dtrace_actkind_t kind = act->dtad_kind;11088dtrace_difo_t *dp;1108911090ASSERT(act->dtad_refcnt >= 1);1109111092if (--act->dtad_refcnt != 0)11093return;1109411095if ((dp = act->dtad_difo) != NULL)11096dtrace_difo_release(dp, vstate);1109711098if (DTRACEACT_ISPRINTFLIKE(kind)) {11099char *str = (char *)(uintptr_t)act->dtad_arg;1110011101#ifdef illumos11102ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||11103(str == NULL && act->dtad_kind == DTRACEACT_PRINTA));11104#endif1110511106if (str != NULL)11107kmem_free(str, strlen(str) + 1);11108}1110911110kmem_free(act, sizeof (dtrace_actdesc_t));11111}1111211113/*11114* DTrace ECB Functions11115*/11116static dtrace_ecb_t *11117dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)11118{11119dtrace_ecb_t *ecb;11120dtrace_epid_t epid;1112111122ASSERT(MUTEX_HELD(&dtrace_lock));1112311124ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);11125ecb->dte_predicate = NULL;11126ecb->dte_probe = probe;1112711128/*11129* The default size is the size of the default action: recording11130* the header.11131*/11132ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);11133ecb->dte_alignment = sizeof (dtrace_epid_t);1113411135epid = state->dts_epid++;1113611137if (epid - 1 >= state->dts_necbs) {11138dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;11139int necbs = state->dts_necbs << 1;1114011141ASSERT(epid == state->dts_necbs + 1);1114211143if (necbs == 0) {11144ASSERT(oecbs == NULL);11145necbs = 1;11146}1114711148ecbs = kmem_zalloc(necbs * sizeof (*ecbs), 
KM_SLEEP);1114911150if (oecbs != NULL)11151bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));1115211153dtrace_membar_producer();11154state->dts_ecbs = ecbs;1115511156if (oecbs != NULL) {11157/*11158* If this state is active, we must dtrace_sync()11159* before we can free the old dts_ecbs array: we're11160* coming in hot, and there may be active ring11161* buffer processing (which indexes into the dts_ecbs11162* array) on another CPU.11163*/11164if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)11165dtrace_sync();1116611167kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));11168}1116911170dtrace_membar_producer();11171state->dts_necbs = necbs;11172}1117311174ecb->dte_state = state;1117511176ASSERT(state->dts_ecbs[epid - 1] == NULL);11177dtrace_membar_producer();11178state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;1117911180return (ecb);11181}1118211183static void11184dtrace_ecb_enable(dtrace_ecb_t *ecb)11185{11186dtrace_probe_t *probe = ecb->dte_probe;1118711188ASSERT(MUTEX_HELD(&cpu_lock));11189ASSERT(MUTEX_HELD(&dtrace_lock));11190ASSERT(ecb->dte_next == NULL);1119111192if (probe == NULL) {11193/*11194* This is the NULL probe -- there's nothing to do.11195*/11196return;11197}1119811199if (probe->dtpr_ecb == NULL) {11200dtrace_provider_t *prov = probe->dtpr_provider;1120111202/*11203* We're the first ECB on this probe.11204*/11205probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;1120611207if (ecb->dte_predicate != NULL)11208probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;1120911210prov->dtpv_pops.dtps_enable(prov->dtpv_arg,11211probe->dtpr_id, probe->dtpr_arg);11212} else {11213/*11214* This probe is already active. 
Swing the last pointer to11215* point to the new ECB, and issue a dtrace_sync() to assure11216* that all CPUs have seen the change.11217*/11218ASSERT(probe->dtpr_ecb_last != NULL);11219probe->dtpr_ecb_last->dte_next = ecb;11220probe->dtpr_ecb_last = ecb;11221probe->dtpr_predcache = 0;1122211223dtrace_sync();11224}11225}1122611227static int11228dtrace_ecb_resize(dtrace_ecb_t *ecb)11229{11230dtrace_action_t *act;11231uint32_t curneeded = UINT32_MAX;11232uint32_t aggbase = UINT32_MAX;1123311234/*11235* If we record anything, we always record the dtrace_rechdr_t. (And11236* we always record it first.)11237*/11238ecb->dte_size = sizeof (dtrace_rechdr_t);11239ecb->dte_alignment = sizeof (dtrace_epid_t);1124011241for (act = ecb->dte_action; act != NULL; act = act->dta_next) {11242dtrace_recdesc_t *rec = &act->dta_rec;11243ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);1124411245ecb->dte_alignment = MAX(ecb->dte_alignment,11246rec->dtrd_alignment);1124711248if (DTRACEACT_ISAGG(act->dta_kind)) {11249dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;1125011251ASSERT(rec->dtrd_size != 0);11252ASSERT(agg->dtag_first != NULL);11253ASSERT(act->dta_prev->dta_intuple);11254ASSERT(aggbase != UINT32_MAX);11255ASSERT(curneeded != UINT32_MAX);1125611257agg->dtag_base = aggbase;1125811259curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);11260rec->dtrd_offset = curneeded;11261if (curneeded + rec->dtrd_size < curneeded)11262return (EINVAL);11263curneeded += rec->dtrd_size;11264ecb->dte_needed = MAX(ecb->dte_needed, curneeded);1126511266aggbase = UINT32_MAX;11267curneeded = UINT32_MAX;11268} else if (act->dta_intuple) {11269if (curneeded == UINT32_MAX) {11270/*11271* This is the first record in a tuple. 
Align11272* curneeded to be at offset 4 in an 8-byte11273* aligned block.11274*/11275ASSERT(act->dta_prev == NULL ||11276!act->dta_prev->dta_intuple);11277ASSERT3U(aggbase, ==, UINT32_MAX);11278curneeded = P2PHASEUP(ecb->dte_size,11279sizeof (uint64_t), sizeof (dtrace_aggid_t));1128011281aggbase = curneeded - sizeof (dtrace_aggid_t);11282ASSERT(IS_P2ALIGNED(aggbase,11283sizeof (uint64_t)));11284}11285curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);11286rec->dtrd_offset = curneeded;11287if (curneeded + rec->dtrd_size < curneeded)11288return (EINVAL);11289curneeded += rec->dtrd_size;11290} else {11291/* tuples must be followed by an aggregation */11292ASSERT(act->dta_prev == NULL ||11293!act->dta_prev->dta_intuple);1129411295ecb->dte_size = P2ROUNDUP(ecb->dte_size,11296rec->dtrd_alignment);11297rec->dtrd_offset = ecb->dte_size;11298if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)11299return (EINVAL);11300ecb->dte_size += rec->dtrd_size;11301ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);11302}11303}1130411305if ((act = ecb->dte_action) != NULL &&11306!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&11307ecb->dte_size == sizeof (dtrace_rechdr_t)) {11308/*11309* If the size is still sizeof (dtrace_rechdr_t), then all11310* actions store no data; set the size to 0.11311*/11312ecb->dte_size = 0;11313}1131411315ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));11316ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));11317ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,11318ecb->dte_needed);11319return (0);11320}1132111322static dtrace_action_t *11323dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)11324{11325dtrace_aggregation_t *agg;11326size_t size = sizeof (uint64_t);11327int ntuple = desc->dtad_ntuple;11328dtrace_action_t *act;11329dtrace_recdesc_t *frec;11330dtrace_aggid_t aggid;11331dtrace_state_t *state = ecb->dte_state;1133211333agg = kmem_zalloc(sizeof 
(dtrace_aggregation_t), KM_SLEEP);11334agg->dtag_ecb = ecb;1133511336ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));1133711338switch (desc->dtad_kind) {11339case DTRACEAGG_MIN:11340agg->dtag_initial = INT64_MAX;11341agg->dtag_aggregate = dtrace_aggregate_min;11342break;1134311344case DTRACEAGG_MAX:11345agg->dtag_initial = INT64_MIN;11346agg->dtag_aggregate = dtrace_aggregate_max;11347break;1134811349case DTRACEAGG_COUNT:11350agg->dtag_aggregate = dtrace_aggregate_count;11351break;1135211353case DTRACEAGG_QUANTIZE:11354agg->dtag_aggregate = dtrace_aggregate_quantize;11355size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *11356sizeof (uint64_t);11357break;1135811359case DTRACEAGG_LQUANTIZE: {11360uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);11361uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);1136211363agg->dtag_initial = desc->dtad_arg;11364agg->dtag_aggregate = dtrace_aggregate_lquantize;1136511366if (step == 0 || levels == 0)11367goto err;1136811369size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);11370break;11371}1137211373case DTRACEAGG_LLQUANTIZE: {11374uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);11375uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);11376uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);11377uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);11378int64_t v;1137911380agg->dtag_initial = desc->dtad_arg;11381agg->dtag_aggregate = dtrace_aggregate_llquantize;1138211383if (factor < 2 || low >= high || nsteps < factor)11384goto err;1138511386/*11387* Now check that the number of steps evenly divides a power11388* of the factor. 
(This assures both integer bucket size and11389* linearity within each magnitude.)11390*/11391for (v = factor; v < nsteps; v *= factor)11392continue;1139311394if ((v % nsteps) || (nsteps % factor))11395goto err;1139611397size = (dtrace_aggregate_llquantize_bucket(factor,11398low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);11399break;11400}1140111402case DTRACEAGG_AVG:11403agg->dtag_aggregate = dtrace_aggregate_avg;11404size = sizeof (uint64_t) * 2;11405break;1140611407case DTRACEAGG_STDDEV:11408agg->dtag_aggregate = dtrace_aggregate_stddev;11409size = sizeof (uint64_t) * 4;11410break;1141111412case DTRACEAGG_SUM:11413agg->dtag_aggregate = dtrace_aggregate_sum;11414break;1141511416default:11417goto err;11418}1141911420agg->dtag_action.dta_rec.dtrd_size = size;1142111422if (ntuple == 0)11423goto err;1142411425/*11426* We must make sure that we have enough actions for the n-tuple.11427*/11428for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {11429if (DTRACEACT_ISAGG(act->dta_kind))11430break;1143111432if (--ntuple == 0) {11433/*11434* This is the action with which our n-tuple begins.11435*/11436agg->dtag_first = act;11437goto success;11438}11439}1144011441/*11442* This n-tuple is short by ntuple elements. 
Return failure.11443*/11444ASSERT(ntuple != 0);11445err:11446kmem_free(agg, sizeof (dtrace_aggregation_t));11447return (NULL);1144811449success:11450/*11451* If the last action in the tuple has a size of zero, it's actually11452* an expression argument for the aggregating action.11453*/11454ASSERT(ecb->dte_action_last != NULL);11455act = ecb->dte_action_last;1145611457if (act->dta_kind == DTRACEACT_DIFEXPR) {11458ASSERT(act->dta_difo != NULL);1145911460if (act->dta_difo->dtdo_rtype.dtdt_size == 0)11461agg->dtag_hasarg = 1;11462}1146311464/*11465* We need to allocate an id for this aggregation.11466*/11467#ifdef illumos11468aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,11469VM_BESTFIT | VM_SLEEP);11470#else11471aggid = alloc_unr(state->dts_aggid_arena);11472#endif1147311474if (aggid - 1 >= state->dts_naggregations) {11475dtrace_aggregation_t **oaggs = state->dts_aggregations;11476dtrace_aggregation_t **aggs;11477int naggs = state->dts_naggregations << 1;11478int onaggs = state->dts_naggregations;1147911480ASSERT(aggid == state->dts_naggregations + 1);1148111482if (naggs == 0) {11483ASSERT(oaggs == NULL);11484naggs = 1;11485}1148611487aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);1148811489if (oaggs != NULL) {11490bcopy(oaggs, aggs, onaggs * sizeof (*aggs));11491kmem_free(oaggs, onaggs * sizeof (*aggs));11492}1149311494state->dts_aggregations = aggs;11495state->dts_naggregations = naggs;11496}1149711498ASSERT(state->dts_aggregations[aggid - 1] == NULL);11499state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;1150011501frec = &agg->dtag_first->dta_rec;11502if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))11503frec->dtrd_alignment = sizeof (dtrace_aggid_t);1150411505for (act = agg->dtag_first; act != NULL; act = act->dta_next) {11506ASSERT(!act->dta_intuple);11507act->dta_intuple = 1;11508}1150911510return (&agg->dtag_action);11511}1151211513static void11514dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t 
*act)11515{11516dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;11517dtrace_state_t *state = ecb->dte_state;11518dtrace_aggid_t aggid = agg->dtag_id;1151911520ASSERT(DTRACEACT_ISAGG(act->dta_kind));11521#ifdef illumos11522vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);11523#else11524free_unr(state->dts_aggid_arena, aggid);11525#endif1152611527ASSERT(state->dts_aggregations[aggid - 1] == agg);11528state->dts_aggregations[aggid - 1] = NULL;1152911530kmem_free(agg, sizeof (dtrace_aggregation_t));11531}1153211533static int11534dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)11535{11536dtrace_action_t *action, *last;11537dtrace_difo_t *dp = desc->dtad_difo;11538uint32_t size = 0, align = sizeof (uint8_t), mask;11539uint16_t format = 0;11540dtrace_recdesc_t *rec;11541dtrace_state_t *state = ecb->dte_state;11542dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;11543uint64_t arg = desc->dtad_arg;1154411545ASSERT(MUTEX_HELD(&dtrace_lock));11546ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);1154711548if (DTRACEACT_ISAGG(desc->dtad_kind)) {11549/*11550* If this is an aggregating action, there must be neither11551* a speculate nor a commit on the action chain.11552*/11553dtrace_action_t *act;1155411555for (act = ecb->dte_action; act != NULL; act = act->dta_next) {11556if (act->dta_kind == DTRACEACT_COMMIT)11557return (EINVAL);1155811559if (act->dta_kind == DTRACEACT_SPECULATE)11560return (EINVAL);11561}1156211563action = dtrace_ecb_aggregation_create(ecb, desc);1156411565if (action == NULL)11566return (EINVAL);11567} else {11568if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||11569(desc->dtad_kind == DTRACEACT_DIFEXPR &&11570dp != NULL && dp->dtdo_destructive)) {11571state->dts_destructive = 1;11572}1157311574switch (desc->dtad_kind) {11575case DTRACEACT_PRINTF:11576case DTRACEACT_PRINTA:11577case DTRACEACT_SYSTEM:11578case DTRACEACT_FREOPEN:11579case DTRACEACT_DIFEXPR:11580/*11581* We know that our arg 
is a string -- turn it into a11582* format.11583*/11584if (arg == 0) {11585ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||11586desc->dtad_kind == DTRACEACT_DIFEXPR);11587format = 0;11588} else {11589ASSERT(arg != 0);11590#ifdef illumos11591ASSERT(arg > KERNELBASE);11592#endif11593format = dtrace_format_add(state,11594(char *)(uintptr_t)arg);11595}1159611597/*FALLTHROUGH*/11598case DTRACEACT_LIBACT:11599case DTRACEACT_TRACEMEM:11600case DTRACEACT_TRACEMEM_DYNSIZE:11601if (dp == NULL)11602return (EINVAL);1160311604if ((size = dp->dtdo_rtype.dtdt_size) != 0)11605break;1160611607if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {11608if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))11609return (EINVAL);1161011611size = opt[DTRACEOPT_STRSIZE];11612}1161311614break;1161511616case DTRACEACT_STACK:11617if ((nframes = arg) == 0) {11618nframes = opt[DTRACEOPT_STACKFRAMES];11619ASSERT(nframes > 0);11620arg = nframes;11621}1162211623size = nframes * sizeof (pc_t);11624break;1162511626case DTRACEACT_JSTACK:11627if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)11628strsize = opt[DTRACEOPT_JSTACKSTRSIZE];1162911630if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)11631nframes = opt[DTRACEOPT_JSTACKFRAMES];1163211633arg = DTRACE_USTACK_ARG(nframes, strsize);1163411635/*FALLTHROUGH*/11636case DTRACEACT_USTACK:11637if (desc->dtad_kind != DTRACEACT_JSTACK &&11638(nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {11639strsize = DTRACE_USTACK_STRSIZE(arg);11640nframes = opt[DTRACEOPT_USTACKFRAMES];11641ASSERT(nframes > 0);11642arg = DTRACE_USTACK_ARG(nframes, strsize);11643}1164411645/*11646* Save a slot for the pid.11647*/11648size = (nframes + 1) * sizeof (uint64_t);11649size += DTRACE_USTACK_STRSIZE(arg);11650size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));1165111652break;1165311654case DTRACEACT_SYM:11655case DTRACEACT_MOD:11656if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=11657sizeof (uint64_t)) ||11658(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))11659return 
(EINVAL);11660break;1166111662case DTRACEACT_USYM:11663case DTRACEACT_UMOD:11664case DTRACEACT_UADDR:11665if (dp == NULL ||11666(dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||11667(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))11668return (EINVAL);1166911670/*11671* We have a slot for the pid, plus a slot for the11672* argument. To keep things simple (aligned with11673* bitness-neutral sizing), we store each as a 64-bit11674* quantity.11675*/11676size = 2 * sizeof (uint64_t);11677break;1167811679case DTRACEACT_STOP:11680case DTRACEACT_BREAKPOINT:11681case DTRACEACT_PANIC:11682break;1168311684case DTRACEACT_CHILL:11685case DTRACEACT_DISCARD:11686case DTRACEACT_RAISE:11687if (dp == NULL)11688return (EINVAL);11689break;1169011691case DTRACEACT_EXIT:11692if (dp == NULL ||11693(size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||11694(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))11695return (EINVAL);11696break;1169711698case DTRACEACT_SPECULATE:11699if (ecb->dte_size > sizeof (dtrace_rechdr_t))11700return (EINVAL);1170111702if (dp == NULL)11703return (EINVAL);1170411705state->dts_speculates = 1;11706break;1170711708case DTRACEACT_PRINTM:11709size = dp->dtdo_rtype.dtdt_size;11710break;1171111712case DTRACEACT_COMMIT: {11713dtrace_action_t *act = ecb->dte_action;1171411715for (; act != NULL; act = act->dta_next) {11716if (act->dta_kind == DTRACEACT_COMMIT)11717return (EINVAL);11718}1171911720if (dp == NULL)11721return (EINVAL);11722break;11723}1172411725default:11726return (EINVAL);11727}1172811729if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {11730/*11731* If this is a data-storing action or a speculate,11732* we must be sure that there isn't a commit on the11733* action chain.11734*/11735dtrace_action_t *act = ecb->dte_action;1173611737for (; act != NULL; act = act->dta_next) {11738if (act->dta_kind == DTRACEACT_COMMIT)11739return (EINVAL);11740}11741}1174211743action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);11744action->dta_rec.dtrd_size = 
size;11745}1174611747action->dta_refcnt = 1;11748rec = &action->dta_rec;11749size = rec->dtrd_size;1175011751for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {11752if (!(size & mask)) {11753align = mask + 1;11754break;11755}11756}1175711758action->dta_kind = desc->dtad_kind;1175911760if ((action->dta_difo = dp) != NULL)11761dtrace_difo_hold(dp);1176211763rec->dtrd_action = action->dta_kind;11764rec->dtrd_arg = arg;11765rec->dtrd_uarg = desc->dtad_uarg;11766rec->dtrd_alignment = (uint16_t)align;11767rec->dtrd_format = format;1176811769if ((last = ecb->dte_action_last) != NULL) {11770ASSERT(ecb->dte_action != NULL);11771action->dta_prev = last;11772last->dta_next = action;11773} else {11774ASSERT(ecb->dte_action == NULL);11775ecb->dte_action = action;11776}1177711778ecb->dte_action_last = action;1177911780return (0);11781}1178211783static void11784dtrace_ecb_action_remove(dtrace_ecb_t *ecb)11785{11786dtrace_action_t *act = ecb->dte_action, *next;11787dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;11788dtrace_difo_t *dp;11789uint16_t format;1179011791if (act != NULL && act->dta_refcnt > 1) {11792ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);11793act->dta_refcnt--;11794} else {11795for (; act != NULL; act = next) {11796next = act->dta_next;11797ASSERT(next != NULL || act == ecb->dte_action_last);11798ASSERT(act->dta_refcnt == 1);1179911800if ((format = act->dta_rec.dtrd_format) != 0)11801dtrace_format_remove(ecb->dte_state, format);1180211803if ((dp = act->dta_difo) != NULL)11804dtrace_difo_release(dp, vstate);1180511806if (DTRACEACT_ISAGG(act->dta_kind)) {11807dtrace_ecb_aggregation_destroy(ecb, act);11808} else {11809kmem_free(act, sizeof (dtrace_action_t));11810}11811}11812}1181311814ecb->dte_action = NULL;11815ecb->dte_action_last = NULL;11816ecb->dte_size = 0;11817}1181811819static void11820dtrace_ecb_disable(dtrace_ecb_t *ecb)11821{11822/*11823* We disable the ECB by removing it from its probe.11824*/11825dtrace_ecb_t 
*pecb, *prev = NULL;11826dtrace_probe_t *probe = ecb->dte_probe;1182711828ASSERT(MUTEX_HELD(&dtrace_lock));1182911830if (probe == NULL) {11831/*11832* This is the NULL probe; there is nothing to disable.11833*/11834return;11835}1183611837for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {11838if (pecb == ecb)11839break;11840prev = pecb;11841}1184211843ASSERT(pecb != NULL);1184411845if (prev == NULL) {11846probe->dtpr_ecb = ecb->dte_next;11847} else {11848prev->dte_next = ecb->dte_next;11849}1185011851if (ecb == probe->dtpr_ecb_last) {11852ASSERT(ecb->dte_next == NULL);11853probe->dtpr_ecb_last = prev;11854}1185511856/*11857* The ECB has been disconnected from the probe; now sync to assure11858* that all CPUs have seen the change before returning.11859*/11860dtrace_sync();1186111862if (probe->dtpr_ecb == NULL) {11863/*11864* That was the last ECB on the probe; clear the predicate11865* cache ID for the probe, disable it and sync one more time11866* to assure that we'll never hit it again.11867*/11868dtrace_provider_t *prov = probe->dtpr_provider;1186911870ASSERT(ecb->dte_next == NULL);11871ASSERT(probe->dtpr_ecb_last == NULL);11872probe->dtpr_predcache = DTRACE_CACHEIDNONE;11873prov->dtpv_pops.dtps_disable(prov->dtpv_arg,11874probe->dtpr_id, probe->dtpr_arg);11875dtrace_sync();11876} else {11877/*11878* There is at least one ECB remaining on the probe. 
If there11879* is _exactly_ one, set the probe's predicate cache ID to be11880* the predicate cache ID of the remaining ECB.11881*/11882ASSERT(probe->dtpr_ecb_last != NULL);11883ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);1188411885if (probe->dtpr_ecb == probe->dtpr_ecb_last) {11886dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;1188711888ASSERT(probe->dtpr_ecb->dte_next == NULL);1188911890if (p != NULL)11891probe->dtpr_predcache = p->dtp_cacheid;11892}1189311894ecb->dte_next = NULL;11895}11896}1189711898static void11899dtrace_ecb_destroy(dtrace_ecb_t *ecb)11900{11901dtrace_state_t *state = ecb->dte_state;11902dtrace_vstate_t *vstate = &state->dts_vstate;11903dtrace_predicate_t *pred;11904dtrace_epid_t epid = ecb->dte_epid;1190511906ASSERT(MUTEX_HELD(&dtrace_lock));11907ASSERT(ecb->dte_next == NULL);11908ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);1190911910if ((pred = ecb->dte_predicate) != NULL)11911dtrace_predicate_release(pred, vstate);1191211913dtrace_ecb_action_remove(ecb);1191411915ASSERT(state->dts_ecbs[epid - 1] == ecb);11916state->dts_ecbs[epid - 1] = NULL;1191711918kmem_free(ecb, sizeof (dtrace_ecb_t));11919}1192011921static dtrace_ecb_t *11922dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,11923dtrace_enabling_t *enab)11924{11925dtrace_ecb_t *ecb;11926dtrace_predicate_t *pred;11927dtrace_actdesc_t *act;11928dtrace_provider_t *prov;11929dtrace_ecbdesc_t *desc = enab->dten_current;1193011931ASSERT(MUTEX_HELD(&dtrace_lock));11932ASSERT(state != NULL);1193311934ecb = dtrace_ecb_add(state, probe);11935ecb->dte_uarg = desc->dted_uarg;1193611937if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {11938dtrace_predicate_hold(pred);11939ecb->dte_predicate = pred;11940}1194111942if (probe != NULL) {11943/*11944* If the provider shows more leg than the consumer is old11945* enough to see, we need to enable the appropriate implicit11946* predicate bits to prevent the ecb from activating at11947* revealing 
times.11948*11949* Providers specifying DTRACE_PRIV_USER at register time11950* are stating that they need the /proc-style privilege11951* model to be enforced, and this is what DTRACE_COND_OWNER11952* and DTRACE_COND_ZONEOWNER will then do at probe time.11953*/11954prov = probe->dtpr_provider;11955if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&11956(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))11957ecb->dte_cond |= DTRACE_COND_OWNER;1195811959if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&11960(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))11961ecb->dte_cond |= DTRACE_COND_ZONEOWNER;1196211963/*11964* If the provider shows us kernel innards and the user11965* is lacking sufficient privilege, enable the11966* DTRACE_COND_USERMODE implicit predicate.11967*/11968if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&11969(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))11970ecb->dte_cond |= DTRACE_COND_USERMODE;11971}1197211973if (dtrace_ecb_create_cache != NULL) {11974/*11975* If we have a cached ecb, we'll use its action list instead11976* of creating our own (saving both time and space).11977*/11978dtrace_ecb_t *cached = dtrace_ecb_create_cache;11979dtrace_action_t *act = cached->dte_action;1198011981if (act != NULL) {11982ASSERT(act->dta_refcnt > 0);11983act->dta_refcnt++;11984ecb->dte_action = act;11985ecb->dte_action_last = cached->dte_action_last;11986ecb->dte_needed = cached->dte_needed;11987ecb->dte_size = cached->dte_size;11988ecb->dte_alignment = cached->dte_alignment;11989}1199011991return (ecb);11992}1199311994for (act = desc->dted_action; act != NULL; act = act->dtad_next) {11995if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {11996dtrace_ecb_destroy(ecb);11997return (NULL);11998}11999}1200012001if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {12002dtrace_ecb_destroy(ecb);12003return (NULL);12004}1200512006return (dtrace_ecb_create_cache = ecb);12007}1200812009static 
int12010dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)12011{12012dtrace_ecb_t *ecb;12013dtrace_enabling_t *enab = arg;12014dtrace_state_t *state = enab->dten_vstate->dtvs_state;1201512016ASSERT(state != NULL);1201712018if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {12019/*12020* This probe was created in a generation for which this12021* enabling has previously created ECBs; we don't want to12022* enable it again, so just kick out.12023*/12024return (DTRACE_MATCH_NEXT);12025}1202612027if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)12028return (DTRACE_MATCH_DONE);1202912030dtrace_ecb_enable(ecb);12031return (DTRACE_MATCH_NEXT);12032}1203312034static dtrace_ecb_t *12035dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)12036{12037dtrace_ecb_t *ecb;1203812039ASSERT(MUTEX_HELD(&dtrace_lock));1204012041if (id == 0 || id > state->dts_necbs)12042return (NULL);1204312044ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);12045ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);1204612047return (state->dts_ecbs[id - 1]);12048}1204912050static dtrace_aggregation_t *12051dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)12052{12053dtrace_aggregation_t *agg;1205412055ASSERT(MUTEX_HELD(&dtrace_lock));1205612057if (id == 0 || id > state->dts_naggregations)12058return (NULL);1205912060ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);12061ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||12062agg->dtag_id == id);1206312064return (state->dts_aggregations[id - 1]);12065}1206612067/*12068* DTrace Buffer Functions12069*12070* The following functions manipulate DTrace buffers. Most of these functions12071* are called in the context of establishing or processing consumer state;12072* exceptions are explicitly noted.12073*/1207412075/*12076* Note: called from cross call context. This function switches the two12077* buffers on a given CPU. 
The atomicity of this operation is assured by12078* disabling interrupts while the actual switch takes place; the disabling of12079* interrupts serializes the execution with any execution of dtrace_probe() on12080* the same CPU.12081*/12082static void12083dtrace_buffer_switch(dtrace_buffer_t *buf)12084{12085caddr_t tomax = buf->dtb_tomax;12086caddr_t xamot = buf->dtb_xamot;12087dtrace_icookie_t cookie;12088hrtime_t now;1208912090ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));12091ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));1209212093cookie = dtrace_interrupt_disable();12094now = dtrace_gethrtime();12095buf->dtb_tomax = xamot;12096buf->dtb_xamot = tomax;12097buf->dtb_xamot_drops = buf->dtb_drops;12098buf->dtb_xamot_offset = buf->dtb_offset;12099buf->dtb_xamot_errors = buf->dtb_errors;12100buf->dtb_xamot_flags = buf->dtb_flags;12101buf->dtb_offset = 0;12102buf->dtb_drops = 0;12103buf->dtb_errors = 0;12104buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);12105buf->dtb_interval = now - buf->dtb_switched;12106buf->dtb_switched = now;12107dtrace_interrupt_enable(cookie);12108}1210912110/*12111* Note: called from cross call context. This function activates a buffer12112* on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation12113* is guaranteed by the disabling of interrupts.12114*/12115static void12116dtrace_buffer_activate(dtrace_state_t *state)12117{12118dtrace_buffer_t *buf;12119dtrace_icookie_t cookie = dtrace_interrupt_disable();1212012121buf = &state->dts_buffer[curcpu];1212212123if (buf->dtb_tomax != NULL) {12124/*12125* We might like to assert that the buffer is marked inactive,12126* but this isn't necessarily true: the buffer for the CPU12127* that processes the BEGIN probe has its buffer activated12128* manually. 
		 * In this case, we take the (harmless) action of
		 * re-clearing the INACTIVE bit.
		 */
		buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
	}

	dtrace_interrupt_enable(cookie);
}

#ifdef __FreeBSD__
/*
 * Activate the specified per-CPU buffer.  This is used instead of
 * dtrace_buffer_activate() when APs have not yet started, i.e. when
 * activating anonymous state.
 */
static void
dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu)
{

	if (state->dts_buffer[cpu].dtb_tomax != NULL)
		state->dts_buffer[cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
}
#endif

/*
 * Allocate a buffer of the given size (plus a switch buffer, unless
 * DTRACEBUF_NOSWITCH is set in "flags") for each CPU, or only for "cpu" if
 * it is not DTRACE_CPUALL.  On failure, everything allocated here is freed
 * and *factor is set to the ratio of buffers desired to buffers actually
 * allocated (at least 1), letting the caller retry with a smaller size.
 * Returns 0 on success, EFBIG or ENOMEM on failure.
 */
static int
dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
    processorid_t cpu, int *factor)
{
#ifdef illumos
	cpu_t *cp;
#endif
	dtrace_buffer_t *buf;
	int allocated = 0, desired = 0;

#ifdef illumos
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&dtrace_lock));

	*factor = 1;

	if (size > dtrace_nonroot_maxsize &&
	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
		return (EFBIG);

	cp = cpu_list;

	do {
		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
			continue;

		buf = &bufs[cp->cpu_id];

		/*
		 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			continue;
		}

		ASSERT(buf->dtb_xamot == NULL);

		if ((buf->dtb_tomax = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;

		buf->dtb_size = size;
		buf->dtb_flags = flags;
		buf->dtb_offset = 0;
		buf->dtb_drops = 0;

		if (flags & DTRACEBUF_NOSWITCH)
			continue;

		if ((buf->dtb_xamot = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;
	} while ((cp = cp->cpu_next) != cpu_list);

	return (0);

err:
	cp = cpu_list;

	do {
		if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
			continue;

		buf = &bufs[cp->cpu_id];
		desired += 2;

		if (buf->dtb_xamot != NULL) {
			ASSERT(buf->dtb_tomax != NULL);
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_xamot, size);
			allocated++;
		}

		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_tomax, size);
			allocated++;
		}

		buf->dtb_tomax = NULL;
		buf->dtb_xamot = NULL;
		buf->dtb_size = 0;
	} while ((cp = cp->cpu_next) != cpu_list);
#else
	int i;

	*factor = 1;
#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
    defined(__mips__) || defined(__powerpc__) || defined(__riscv)
	/*
	 * FreeBSD isn't good at limiting the amount of memory we
	 * ask to malloc, so let's place a limit here before trying
	 * to do something that might well end in tears at bedtime.
	 */
	int bufsize_percpu_frac = dtrace_bufsize_max_frac * mp_ncpus;
	if (size > physmem * PAGE_SIZE / bufsize_percpu_frac)
		return (ENOMEM);
#endif

	ASSERT(MUTEX_HELD(&dtrace_lock));
	CPU_FOREACH(i) {
		if (cpu != DTRACE_CPUALL && cpu != i)
			continue;

		buf = &bufs[i];

		/*
		 * If there is already a buffer allocated for this CPU, it
		 * is only possible that this is a DR event.  In this case,
		 * the buffer size must match our specified size.
		 */
		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			continue;
		}

		ASSERT(buf->dtb_xamot == NULL);

		if ((buf->dtb_tomax = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;

		buf->dtb_size = size;
		buf->dtb_flags = flags;
		buf->dtb_offset = 0;
		buf->dtb_drops = 0;

		if (flags & DTRACEBUF_NOSWITCH)
			continue;

		if ((buf->dtb_xamot = kmem_zalloc(size,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
			goto err;
	}

	return (0);

err:
	/*
	 * Error allocating memory, so free the buffers that were
	 * allocated before the failed allocation.
	 */
	CPU_FOREACH(i) {
		if (cpu != DTRACE_CPUALL && cpu != i)
			continue;

		buf = &bufs[i];
		desired += 2;

		if (buf->dtb_xamot != NULL) {
			ASSERT(buf->dtb_tomax != NULL);
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_xamot, size);
			allocated++;
		}

		if (buf->dtb_tomax != NULL) {
			ASSERT(buf->dtb_size == size);
			kmem_free(buf->dtb_tomax, size);
			allocated++;
		}

		buf->dtb_tomax = NULL;
		buf->dtb_xamot = NULL;
		buf->dtb_size = 0;

	}
#endif
	*factor = desired / (allocated > 0 ? allocated : 1);

	return (ENOMEM);
}

/*
 * Note:  called from probe context.  This function just increments the drop
 * count on a buffer.  It has been made a function to allow for the
 * possibility of understanding the source of mysterious drop counts.  (A
 * problem for which one may be particularly disappointed that DTrace cannot
 * be used to understand DTrace.)
 */
static void
dtrace_buffer_drop(dtrace_buffer_t *buf)
{
	buf->dtb_drops++;
}

/*
 * Note:  called from probe context.  This function is called to reserve space
 * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
 * mstate.
 * Returns the new offset in the buffer, or a negative value if an
 * error has occurred.
 */
static ssize_t
dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
    dtrace_state_t *state, dtrace_mstate_t *mstate)
{
	ssize_t offs = buf->dtb_offset, soffs;
	intptr_t woffs;
	caddr_t tomax;
	size_t total;

	/* An inactive buffer takes no data (and records no drop). */
	if (buf->dtb_flags & DTRACEBUF_INACTIVE)
		return (-1);

	if ((tomax = buf->dtb_tomax) == NULL) {
		dtrace_buffer_drop(buf);
		return (-1);
	}

	if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
		/*
		 * Principal "switch" buffer:  pad up to the requested
		 * alignment with EPID-none records, then take the space
		 * if it fits.
		 */
		while (offs & (align - 1)) {
			/*
			 * Assert that our alignment is off by a number which
			 * is itself sizeof (uint32_t) aligned.
			 */
			ASSERT(!((align - (offs & (align - 1))) &
			    (sizeof (uint32_t) - 1)));
			DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
			offs += sizeof (uint32_t);
		}

		if ((soffs = offs + needed) > buf->dtb_size) {
			dtrace_buffer_drop(buf);
			return (-1);
		}

		if (mstate == NULL)
			return (offs);

		mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
		mstate->dtms_scratch_size = buf->dtb_size - soffs;
		mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;

		return (offs);
	}

	if (buf->dtb_flags & DTRACEBUF_FILL) {
		if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
		    (buf->dtb_flags & DTRACEBUF_FULL))
			return (-1);
		goto out;
	}

	total = needed + (offs & (align - 1));

	/*
	 * For a ring buffer, life is quite a bit more complicated.  Before
	 * we can store any padding, we need to adjust our wrapping offset.
	 * (If we've never before wrapped or we're not about to, no adjustment
	 * is required.)
	 */
	if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
	    offs + total > buf->dtb_size) {
		woffs = buf->dtb_xamot_offset;

		if (offs + total > buf->dtb_size) {
			/*
			 * We can't fit in the end of the buffer.  First, a
			 * sanity check that we can fit in the buffer at all.
			 */
			if (total > buf->dtb_size) {
				dtrace_buffer_drop(buf);
				return (-1);
			}

			/*
			 * We're going to be storing at the top of the buffer,
			 * so now we need to deal with the wrapped offset.  We
			 * only reset our wrapped offset to 0 if it is
			 * currently greater than the current offset.  If it
			 * is less than the current offset, it is because a
			 * previous allocation induced a wrap -- but the
			 * allocation didn't subsequently take the space due
			 * to an error or false predicate evaluation.  In this
			 * case, we'll just leave the wrapped offset alone:  if
			 * the wrapped offset hasn't been advanced far enough
			 * for this allocation, it will be adjusted in the
			 * lower loop.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				if (woffs >= offs)
					woffs = 0;
			} else {
				woffs = 0;
			}

			/*
			 * Now we know that we're going to be storing to the
			 * top of the buffer and that there is room for us
			 * there.  We need to clear the buffer from the current
			 * offset to the end (there may be old gunk there).
			 */
			while (offs < buf->dtb_size)
				tomax[offs++] = 0;

			/*
			 * We need to set our offset to zero.  And because we
			 * are wrapping, we need to set the bit indicating as
			 * much.  We can also adjust our needed space back
			 * down to the space required by the ECB -- we know
			 * that the top of the buffer is aligned.
			 */
			offs = 0;
			total = needed;
			buf->dtb_flags |= DTRACEBUF_WRAPPED;
		} else {
			/*
			 * There is room for us in the buffer, so we simply
			 * need to check the wrapped offset.
			 */
			if (woffs < offs) {
				/*
				 * The wrapped offset is less than the offset.
				 * This can happen if we allocated buffer space
				 * that induced a wrap, but then we didn't
				 * subsequently take the space due to an error
				 * or false predicate evaluation.  This is
				 * okay; we know that _this_ allocation isn't
				 * going to induce a wrap.  We still can't
				 * reset the wrapped offset to be zero,
				 * however:  the space may have been trashed in
				 * the previous failed probe attempt.  But at
				 * least the wrapped offset doesn't need to
				 * be adjusted at all...
				 */
				goto out;
			}
		}

		/*
		 * Advance the wrapped offset past whole records until it
		 * clears the span we're about to consume.
		 */
		while (offs + total > woffs) {
			dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
			size_t size;

			if (epid == DTRACE_EPIDNONE) {
				size = sizeof (uint32_t);
			} else {
				ASSERT3U(epid, <=, state->dts_necbs);
				ASSERT(state->dts_ecbs[epid - 1] != NULL);

				size = state->dts_ecbs[epid - 1]->dte_size;
			}

			ASSERT(woffs + size <= buf->dtb_size);
			ASSERT(size != 0);

			if (woffs + size == buf->dtb_size) {
				/*
				 * We've reached the end of the buffer; we want
				 * to set the wrapped offset to 0 and break
				 * out.  However, if the offs is 0, then we're
				 * in a strange edge-condition:  the amount of
				 * space that we want to reserve plus the size
				 * of the record that we're overwriting is
				 * greater than the size of the buffer.  This
				 * is problematic because if we reserve the
				 * space but subsequently don't consume it (due
				 * to a failed predicate or error) the wrapped
				 * offset will be 0 -- yet the EPID at offset 0
				 * will not be committed.  This situation is
				 * relatively easy to deal with:  if we're in
				 * this case, the buffer is indistinguishable
				 * from one that hasn't wrapped; we need only
				 * finish the job by clearing the wrapped bit,
				 * explicitly setting the offset to be 0, and
				 * zero'ing out the old data in the buffer.
				 */
				if (offs == 0) {
					buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
					buf->dtb_offset = 0;
					woffs = total;

					while (woffs < buf->dtb_size)
						tomax[woffs++] = 0;
				}

				woffs = 0;
				break;
			}

			woffs += size;
		}

		/*
		 * We have a wrapped offset.  It may be that the wrapped offset
		 * has become zero -- that's okay.
		 */
		buf->dtb_xamot_offset = woffs;
	}

out:
	/*
	 * Now we can plow the buffer with any necessary padding.
	 */
	while (offs & (align - 1)) {
		/*
		 * Assert that our alignment is off by a number which
		 * is itself sizeof (uint32_t) aligned.
		 */
		ASSERT(!((align - (offs & (align - 1))) &
		    (sizeof (uint32_t) - 1)));
		DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
		offs += sizeof (uint32_t);
	}

	if (buf->dtb_flags & DTRACEBUF_FILL) {
		if (offs + needed > buf->dtb_size - state->dts_reserve) {
			buf->dtb_flags |= DTRACEBUF_FULL;
			return (-1);
		}
	}

	if (mstate == NULL)
		return (offs);

	/*
	 * For ring buffers and fill buffers, the scratch space is always
	 * the inactive buffer.
	 */
	mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
	mstate->dtms_scratch_size = buf->dtb_size;
	mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;

	return (offs);
}

static void
dtrace_buffer_polish(dtrace_buffer_t *buf)
{
	ASSERT(buf->dtb_flags & DTRACEBUF_RING);
	ASSERT(MUTEX_HELD(&dtrace_lock));

	if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
		return;

	/*
	 * We need to polish the ring buffer.  There are three cases:
	 *
	 * - The first (and presumably most common) is that there is no gap
	 *   between the buffer offset and the wrapped offset.  In this case,
	 *   there is nothing in the buffer that isn't valid data; we can
	 *   mark the buffer as polished and return.
	 *
	 * - The second (less common than the first but still more common
	 *   than the third) is that there is a gap between the buffer offset
	 *   and the wrapped offset, and the wrapped offset is larger than the
	 *   buffer offset.  This can happen because of an alignment issue, or
	 *   can happen because of a call to dtrace_buffer_reserve() that
	 *   didn't subsequently consume the buffer space.  In this case,
	 *   we need to zero the data from the buffer offset to the wrapped
	 *   offset.
	 *
	 * - The third (and least common) is that there is a gap between the
	 *   buffer offset and the wrapped offset, but the wrapped offset is
	 *   _less_ than the buffer offset.  This can only happen because a
	 *   call to dtrace_buffer_reserve() induced a wrap, but the space
	 *   was not subsequently consumed.  In this case, we need to zero the
	 *   space from the offset to the end of the buffer _and_ from the
	 *   top of the buffer to the wrapped offset.
	 */
	if (buf->dtb_offset < buf->dtb_xamot_offset) {
		bzero(buf->dtb_tomax + buf->dtb_offset,
		    buf->dtb_xamot_offset - buf->dtb_offset);
	}

	if (buf->dtb_offset > buf->dtb_xamot_offset) {
		bzero(buf->dtb_tomax + buf->dtb_offset,
		    buf->dtb_size - buf->dtb_offset);
		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
	}
}

/*
 * This routine determines if data generated at the specified time has likely
 * been entirely consumed at user-level.  This routine is called to determine
This routine is called to determine12628* if an ECB on a defunct probe (but for an active enabling) can be safely12629* disabled and destroyed.12630*/12631static int12632dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)12633{12634int i;1263512636CPU_FOREACH(i) {12637dtrace_buffer_t *buf = &bufs[i];1263812639if (buf->dtb_size == 0)12640continue;1264112642if (buf->dtb_flags & DTRACEBUF_RING)12643return (0);1264412645if (!buf->dtb_switched && buf->dtb_offset != 0)12646return (0);1264712648if (buf->dtb_switched - buf->dtb_interval < when)12649return (0);12650}1265112652return (1);12653}1265412655static void12656dtrace_buffer_free(dtrace_buffer_t *bufs)12657{12658int i;1265912660CPU_FOREACH(i) {12661dtrace_buffer_t *buf = &bufs[i];1266212663if (buf->dtb_tomax == NULL) {12664ASSERT(buf->dtb_xamot == NULL);12665ASSERT(buf->dtb_size == 0);12666continue;12667}1266812669if (buf->dtb_xamot != NULL) {12670ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));12671kmem_free(buf->dtb_xamot, buf->dtb_size);12672}1267312674kmem_free(buf->dtb_tomax, buf->dtb_size);12675buf->dtb_size = 0;12676buf->dtb_tomax = NULL;12677buf->dtb_xamot = NULL;12678}12679}1268012681/*12682* DTrace Enabling Functions12683*/12684static dtrace_enabling_t *12685dtrace_enabling_create(dtrace_vstate_t *vstate)12686{12687dtrace_enabling_t *enab;1268812689enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);12690enab->dten_vstate = vstate;1269112692return (enab);12693}1269412695static void12696dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)12697{12698dtrace_ecbdesc_t **ndesc;12699size_t osize, nsize;1270012701/*12702* We can't add to enablings after we've enabled them, or after we've12703* retained them.12704*/12705ASSERT(enab->dten_probegen == 0);12706ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);1270712708if (enab->dten_ndesc < enab->dten_maxdesc) {12709enab->dten_desc[enab->dten_ndesc++] = ecb;12710return;12711}1271212713osize = enab->dten_maxdesc * sizeof 
(dtrace_enabling_t *);1271412715if (enab->dten_maxdesc == 0) {12716enab->dten_maxdesc = 1;12717} else {12718enab->dten_maxdesc <<= 1;12719}1272012721ASSERT(enab->dten_ndesc < enab->dten_maxdesc);1272212723nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);12724ndesc = kmem_zalloc(nsize, KM_SLEEP);12725bcopy(enab->dten_desc, ndesc, osize);12726if (enab->dten_desc != NULL)12727kmem_free(enab->dten_desc, osize);1272812729enab->dten_desc = ndesc;12730enab->dten_desc[enab->dten_ndesc++] = ecb;12731}1273212733static void12734dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,12735dtrace_probedesc_t *pd)12736{12737dtrace_ecbdesc_t *new;12738dtrace_predicate_t *pred;12739dtrace_actdesc_t *act;1274012741/*12742* We're going to create a new ECB description that matches the12743* specified ECB in every way, but has the specified probe description.12744*/12745new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);1274612747if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)12748dtrace_predicate_hold(pred);1274912750for (act = ecb->dted_action; act != NULL; act = act->dtad_next)12751dtrace_actdesc_hold(act);1275212753new->dted_action = ecb->dted_action;12754new->dted_pred = ecb->dted_pred;12755new->dted_probe = *pd;12756new->dted_uarg = ecb->dted_uarg;1275712758dtrace_enabling_add(enab, new);12759}1276012761static void12762dtrace_enabling_dump(dtrace_enabling_t *enab)12763{12764int i;1276512766for (i = 0; i < enab->dten_ndesc; i++) {12767dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;1276812769#ifdef __FreeBSD__12770printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i,12771desc->dtpd_provider, desc->dtpd_mod,12772desc->dtpd_func, desc->dtpd_name);12773#else12774cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,12775desc->dtpd_provider, desc->dtpd_mod,12776desc->dtpd_func, desc->dtpd_name);12777#endif12778}12779}1278012781static void12782dtrace_enabling_destroy(dtrace_enabling_t *enab)12783{12784int i;12785dtrace_ecbdesc_t 
*ep;12786dtrace_vstate_t *vstate = enab->dten_vstate;1278712788ASSERT(MUTEX_HELD(&dtrace_lock));1278912790for (i = 0; i < enab->dten_ndesc; i++) {12791dtrace_actdesc_t *act, *next;12792dtrace_predicate_t *pred;1279312794ep = enab->dten_desc[i];1279512796if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)12797dtrace_predicate_release(pred, vstate);1279812799for (act = ep->dted_action; act != NULL; act = next) {12800next = act->dtad_next;12801dtrace_actdesc_release(act, vstate);12802}1280312804kmem_free(ep, sizeof (dtrace_ecbdesc_t));12805}1280612807if (enab->dten_desc != NULL)12808kmem_free(enab->dten_desc,12809enab->dten_maxdesc * sizeof (dtrace_enabling_t *));1281012811/*12812* If this was a retained enabling, decrement the dts_nretained count12813* and take it off of the dtrace_retained list.12814*/12815if (enab->dten_prev != NULL || enab->dten_next != NULL ||12816dtrace_retained == enab) {12817ASSERT(enab->dten_vstate->dtvs_state != NULL);12818ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);12819enab->dten_vstate->dtvs_state->dts_nretained--;12820dtrace_retained_gen++;12821}1282212823if (enab->dten_prev == NULL) {12824if (dtrace_retained == enab) {12825dtrace_retained = enab->dten_next;1282612827if (dtrace_retained != NULL)12828dtrace_retained->dten_prev = NULL;12829}12830} else {12831ASSERT(enab != dtrace_retained);12832ASSERT(dtrace_retained != NULL);12833enab->dten_prev->dten_next = enab->dten_next;12834}1283512836if (enab->dten_next != NULL) {12837ASSERT(dtrace_retained != NULL);12838enab->dten_next->dten_prev = enab->dten_prev;12839}1284012841kmem_free(enab, sizeof (dtrace_enabling_t));12842}1284312844static int12845dtrace_enabling_retain(dtrace_enabling_t *enab)12846{12847dtrace_state_t *state;1284812849ASSERT(MUTEX_HELD(&dtrace_lock));12850ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);12851ASSERT(enab->dten_vstate != NULL);1285212853state = enab->dten_vstate->dtvs_state;12854ASSERT(state != NULL);1285512856/*12857* We only allow each 
 * state to retain dtrace_retain_max enablings.
	 */
	if (state->dts_nretained >= dtrace_retain_max)
		return (ENOSPC);

	state->dts_nretained++;
	dtrace_retained_gen++;

	if (dtrace_retained == NULL) {
		dtrace_retained = enab;
		return (0);
	}

	enab->dten_next = dtrace_retained;
	dtrace_retained->dten_prev = enab;
	dtrace_retained = enab;

	return (0);
}

/*
 * For every retained enabling belonging to "state", copy each ECB
 * description whose probe description exactly matches "match", substituting
 * the probe description "create".  The copies are gathered into a new
 * enabling which is itself retained.  Returns ENOENT if nothing matched,
 * or the error from dtrace_enabling_retain().
 */
static int
dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
    dtrace_probedesc_t *create)
{
	dtrace_enabling_t *new, *enab;
	int found = 0, err = ENOENT;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
	ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
	ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
	ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);

	new = dtrace_enabling_create(&state->dts_vstate);

	/*
	 * Iterate over all retained enablings, looking for enablings that
	 * match the specified state.
	 */
	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
		int i;

		/*
		 * dtvs_state can only be NULL for helper enablings -- and
		 * helper enablings can't be retained.
		 */
		ASSERT(enab->dten_vstate->dtvs_state != NULL);

		if (enab->dten_vstate->dtvs_state != state)
			continue;

		/*
		 * Now iterate over each probe description; we're looking for
		 * an exact match to the specified probe description.
		 */
		for (i = 0; i < enab->dten_ndesc; i++) {
			dtrace_ecbdesc_t *ep = enab->dten_desc[i];
			dtrace_probedesc_t *pd = &ep->dted_probe;

			if (strcmp(pd->dtpd_provider, match->dtpd_provider))
				continue;

			if (strcmp(pd->dtpd_mod, match->dtpd_mod))
				continue;

			if (strcmp(pd->dtpd_func, match->dtpd_func))
				continue;

			if (strcmp(pd->dtpd_name, match->dtpd_name))
				continue;

			/*
			 * We have a winning probe!  Add it to our growing
			 * enabling.
			 */
			found = 1;
			dtrace_enabling_addlike(new, ep, create);
		}
	}

	if (!found || (err = dtrace_enabling_retain(new)) != 0) {
		dtrace_enabling_destroy(new);
		return (err);
	}

	return (0);
}

/*
 * Destroy every retained enabling that belongs to "state".
 */
static void
dtrace_enabling_retract(dtrace_state_t *state)
{
	dtrace_enabling_t *enab, *next;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	/*
	 * Iterate over all retained enablings, destroy the enablings retained
	 * for the specified state.
	 */
	for (enab = dtrace_retained; enab != NULL; enab = next) {
		next = enab->dten_next;

		/*
		 * dtvs_state can only be NULL for helper enablings -- and
		 * helper enablings can't be retained.
		 */
		ASSERT(enab->dten_vstate->dtvs_state != NULL);

		if (enab->dten_vstate->dtvs_state == state) {
			ASSERT(state->dts_nretained > 0);
			dtrace_enabling_destroy(enab);
		}
	}

	ASSERT(state->dts_nretained == 0);
}

/*
 * Attempt to match and enable each probe description in the enabling.  On
 * success, sets *nmatched (if non-NULL) to the number of probes matched;
 * on error, returns the first non-zero dten_error encountered.
 */
static int
dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
{
	int i = 0;
	int matched = 0;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&dtrace_lock));

	for (i = 0; i < enab->dten_ndesc; i++) {
		dtrace_ecbdesc_t *ep = enab->dten_desc[i];

		enab->dten_current = ep;
		enab->dten_error = 0;

		matched += dtrace_probe_enable(&ep->dted_probe, enab);

		if (enab->dten_error != 0) {
			/*
			 * If we get an error half-way through enabling the
			 * probes, we kick out -- perhaps with some number of
			 * them enabled.  Leaving enabled probes enabled may
			 * be slightly confusing for user-level, but we expect
			 * that no one will attempt to actually drive on in
			 * the face of such errors.  If this is an anonymous
			 * enabling (indicated with a NULL nmatched pointer),
			 * we cmn_err() a message.
 * We aren't expecting to
			 * get such an error -- such as it can exist at all,
			 * it would be a result of corrupted DOF in the driver
			 * properties.
			 */
			if (nmatched == NULL) {
				cmn_err(CE_WARN, "dtrace_enabling_match() "
				    "error on %p: %d", (void *)ep,
				    enab->dten_error);
			}

			return (enab->dten_error);
		}
	}

	enab->dten_probegen = dtrace_probegen;
	if (nmatched != NULL)
		*nmatched = matched;

	return (0);
}

/* Taskqueue entry point:  simply runs dtrace_enabling_matchall(). */
static void
dtrace_enabling_matchall_task(void *args __unused)
{
	dtrace_enabling_matchall();
}

/*
 * Re-run probe matching for every retained enabling (e.g. after new probes
 * have been published).
 */
static void
dtrace_enabling_matchall(void)
{
	dtrace_enabling_t *enab;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	/*
	 * Iterate over all retained enablings to see if any probes match
	 * against them.  We only perform this operation on enablings for which
	 * we have sufficient permissions by virtue of being in the global zone
	 * or in the same zone as the DTrace client.  Because we can be called
	 * after dtrace_detach() has been called, we cannot assert that there
	 * are retained enablings.  We can safely load from dtrace_retained,
	 * however:  the taskq_destroy() at the end of dtrace_detach() will
	 * block pending our completion.
	 */
	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
#ifdef illumos
		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;

		if (INGLOBALZONE(curproc) ||
		    cr != NULL && getzoneid() == crgetzoneid(cr))
#endif
			(void) dtrace_enabling_match(enab, NULL);
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);
}

/*
 * If an enabling is to be enabled without having matched probes (that is, if
 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
 * enabling must be _primed_ by creating an ECB for every ECB description.
 * This must be done to assure that we know the number of speculations, the
 * number of aggregations, the minimum buffer size needed, etc. before we
 * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
 * enabling any probes, we create ECBs for every ECB description, but with a
 * NULL probe -- which is exactly what this function does.
 */
static void
dtrace_enabling_prime(dtrace_state_t *state)
{
	dtrace_enabling_t *enab;
	int i;

	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
		ASSERT(enab->dten_vstate->dtvs_state != NULL);

		if (enab->dten_vstate->dtvs_state != state)
			continue;

		/*
		 * We don't want to prime an enabling more than once, lest
		 * we allow a malicious user to induce resource exhaustion.
		 * (The ECBs that result from priming an enabling aren't
		 * leaked -- but they also aren't deallocated until the
		 * consumer state is destroyed.)
		 */
		if (enab->dten_primed)
			continue;

		for (i = 0; i < enab->dten_ndesc; i++) {
			enab->dten_current = enab->dten_desc[i];
			(void) dtrace_probe_enable(NULL, enab);
		}

		enab->dten_primed =
		    1;
	}
}

/*
 * Called to indicate that probes should be provided due to retained
 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
 * must take an initial lap through the enabling calling the dtps_provide()
 * entry point explicitly to allow for autocreated probes.
 */
static void
dtrace_enabling_provide(dtrace_provider_t *prv)
{
	int i, all = 0;
	dtrace_probedesc_t desc;
	dtrace_genid_t gen;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&dtrace_provider_lock));

	/* A NULL provider means "walk every registered provider". */
	if (prv == NULL) {
		all = 1;
		prv = dtrace_provider;
	}

	do {
		dtrace_enabling_t *enab;
		void *parg = prv->dtpv_arg;

retry:
		gen = dtrace_retained_gen;
		for (enab = dtrace_retained; enab != NULL;
		    enab = enab->dten_next) {
			for (i = 0; i < enab->dten_ndesc; i++) {
				desc = enab->dten_desc[i]->dted_probe;
				/*
				 * dtrace_lock is dropped around the
				 * dtps_provide() callback; the generation
				 * check below detects concurrent changes.
				 */
				mutex_exit(&dtrace_lock);
				prv->dtpv_pops.dtps_provide(parg, &desc);
				mutex_enter(&dtrace_lock);
				/*
				 * Process the retained enablings again if
				 * they have changed while we weren't holding
				 * dtrace_lock.
				 */
				if (gen != dtrace_retained_gen)
					goto retry;
			}
		}
	} while (all && (prv = prv->dtpv_next) != NULL);

	mutex_exit(&dtrace_lock);
	dtrace_probe_provide(NULL, all ? NULL : prv);
	mutex_enter(&dtrace_lock);
}

/*
 * Called to reap ECBs that are attached to probes from defunct providers.
 */
static void
dtrace_enabling_reap(void *args __unused)
{
	dtrace_provider_t *prov;
	dtrace_probe_t *probe;
	dtrace_ecb_t *ecb;
	hrtime_t when;
	int i;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	for (i = 0; i < dtrace_nprobes; i++) {
		if ((probe = dtrace_probes[i]) == NULL)
			continue;

		if (probe->dtpr_ecb == NULL)
			continue;

		prov = probe->dtpr_provider;

		if ((when = prov->dtpv_defunct) == 0)
			continue;

		/*
		 * We have ECBs on a defunct provider:  we want to reap these
		 * ECBs to allow the provider to unregister.  The destruction
		 * of these ECBs must be done carefully:  if we destroy the ECB
		 * and the consumer later wishes to consume an EPID that
		 * corresponds to the destroyed ECB (and if the EPID metadata
		 * has not been previously consumed), the consumer will abort
		 * processing on the unknown EPID.  To reduce (but not, sadly,
		 * eliminate) the possibility of this, we will only destroy an
		 * ECB for a defunct provider if, for the state that
		 * corresponds to the ECB:
		 *
		 * (a)  There is no speculative tracing (which can effectively
		 *      cache an EPID for an arbitrary amount of time).
		 *
		 * (b)  The principal buffers have been switched twice since
		 *      the provider became defunct.
		 *
		 * (c)  The aggregation buffers are of zero size or have been
		 *      switched twice since the provider became defunct.
		 *
		 * We use dts_speculates to determine (a) and call a function
		 * (dtrace_buffer_consumed()) to determine (b) and (c).
Note13201* that as soon as we've been unable to destroy one of the ECBs13202* associated with the probe, we quit trying -- reaping is only13203* fruitful in as much as we can destroy all ECBs associated13204* with the defunct provider's probes.13205*/13206while ((ecb = probe->dtpr_ecb) != NULL) {13207dtrace_state_t *state = ecb->dte_state;13208dtrace_buffer_t *buf = state->dts_buffer;13209dtrace_buffer_t *aggbuf = state->dts_aggbuffer;1321013211if (state->dts_speculates)13212break;1321313214if (!dtrace_buffer_consumed(buf, when))13215break;1321613217if (!dtrace_buffer_consumed(aggbuf, when))13218break;1321913220dtrace_ecb_disable(ecb);13221ASSERT(probe->dtpr_ecb != ecb);13222dtrace_ecb_destroy(ecb);13223}13224}1322513226mutex_exit(&dtrace_lock);13227mutex_exit(&cpu_lock);13228}1322913230/*13231* DTrace DOF Functions13232*/13233/*ARGSUSED*/13234static void13235dtrace_dof_error(dof_hdr_t *dof, const char *str)13236{13237if (dtrace_err_verbose)13238cmn_err(CE_WARN, "failed to process DOF: %s", str);1323913240#ifdef DTRACE_ERRDEBUG13241dtrace_errdebug(str);13242#endif13243}1324413245/*13246* Create DOF out of a currently enabled state. 
 * Right now, we only create
 * DOF containing the run-time options -- but this could be expanded to create
 * complete DOF representing the enabled state.
 */
static dof_hdr_t *
dtrace_dof_create(dtrace_state_t *state)
{
	dof_hdr_t *dof;
	dof_sec_t *sec;
	dof_optdesc_t *opt;
	int i, len = sizeof (dof_hdr_t) +
	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
	    sizeof (dof_optdesc_t) * DTRACEOPT_MAX;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	dof = kmem_zalloc(len, KM_SLEEP);
	dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
	dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
	dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
	dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;

	dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
	dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
	dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
	dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
	dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
	dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;

	dof->dofh_flags = 0;
	dof->dofh_hdrsize = sizeof (dof_hdr_t);
	dof->dofh_secsize = sizeof (dof_sec_t);
	dof->dofh_secnum = 1;	/* only DOF_SECT_OPTDESC */
	dof->dofh_secoff = sizeof (dof_hdr_t);
	dof->dofh_loadsz = len;
	dof->dofh_filesz = len;
	dof->dofh_pad = 0;

	/*
	 * Fill in the option section header...
	 */
	sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
	sec->dofs_type = DOF_SECT_OPTDESC;
	sec->dofs_align = sizeof (uint64_t);
	sec->dofs_flags = DOF_SECF_LOAD;
	sec->dofs_entsize = sizeof (dof_optdesc_t);

	opt = (dof_optdesc_t *)((uintptr_t)sec +
	    roundup(sizeof (dof_sec_t), sizeof (uint64_t)));

	sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
	sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;

	/* ...and one option descriptor per run-time option. */
	for (i = 0; i < DTRACEOPT_MAX; i++) {
		opt[i].dofo_option = i;
		opt[i].dofo_strtab = DOF_SECIDX_NONE;
		opt[i].dofo_value = state->dts_options[i];
	}

	return (dof);
}

/*
 * Copy DOF in from the user address "uarg", validating the load size
 * against dtrace_dof_maxsize.  On failure, sets *errp and returns NULL.
 * On success, the caller owns the returned buffer (dofh_loadsz bytes).
 */
static dof_hdr_t *
dtrace_dof_copyin(uintptr_t uarg, int *errp)
{
	dof_hdr_t hdr, *dof;

	ASSERT(!MUTEX_HELD(&dtrace_lock));

	/*
	 * First, we're going to copyin() the sizeof (dof_hdr_t).
	 */
	if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
		dtrace_dof_error(NULL, "failed to copyin DOF header");
		*errp = EFAULT;
		return (NULL);
	}

	/*
	 * Now we'll allocate the entire DOF and copy it in -- provided
	 * that the length isn't outrageous.
	 */
	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
		dtrace_dof_error(&hdr, "load size exceeds maximum");
		*errp = E2BIG;
		return (NULL);
	}

	if (hdr.dofh_loadsz < sizeof (hdr)) {
		dtrace_dof_error(&hdr, "invalid load size");
		*errp = EINVAL;
		return (NULL);
	}

	dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);

	/* Reject DOF whose header changed between the two copies. */
	if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
	    dof->dofh_loadsz != hdr.dofh_loadsz) {
		kmem_free(dof, hdr.dofh_loadsz);
		*errp = EFAULT;
		return (NULL);
	}

	return (dof);
}

#ifdef __FreeBSD__
/*
 * Like dtrace_dof_copyin(), but reads the DOF from the address space of
 * the process "p" using proc_readmem().
 */
static dof_hdr_t *
dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp)
{
	dof_hdr_t hdr, *dof;
	struct thread *td;
	size_t loadsz;

	ASSERT(!MUTEX_HELD(&dtrace_lock));

	td = curthread;

	/*
	 * First, we're going to copyin() the sizeof (dof_hdr_t).
	 */
	if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) {
		dtrace_dof_error(NULL, "failed to copyin DOF header");
		*errp = EFAULT;
		return (NULL);
	}

	/*
	 * Now we'll allocate the entire DOF and copy it in -- provided
	 * that the length isn't outrageous.
	 */
	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
		dtrace_dof_error(&hdr, "load size exceeds maximum");
		*errp = E2BIG;
		return (NULL);
	}
	loadsz = (size_t)hdr.dofh_loadsz;

	if (loadsz <
sizeof (hdr)) {13385dtrace_dof_error(&hdr, "invalid load size");13386*errp = EINVAL;13387return (NULL);13388}1338913390dof = kmem_alloc(loadsz, KM_SLEEP);1339113392if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz ||13393dof->dofh_loadsz != loadsz) {13394kmem_free(dof, hdr.dofh_loadsz);13395*errp = EFAULT;13396return (NULL);13397}1339813399return (dof);13400}1340113402static __inline uchar_t13403dtrace_dof_char(char c)13404{1340513406switch (c) {13407case '0':13408case '1':13409case '2':13410case '3':13411case '4':13412case '5':13413case '6':13414case '7':13415case '8':13416case '9':13417return (c - '0');13418case 'A':13419case 'B':13420case 'C':13421case 'D':13422case 'E':13423case 'F':13424return (c - 'A' + 10);13425case 'a':13426case 'b':13427case 'c':13428case 'd':13429case 'e':13430case 'f':13431return (c - 'a' + 10);13432}13433/* Should not reach here. */13434return (UCHAR_MAX);13435}13436#endif /* __FreeBSD__ */1343713438static dof_hdr_t *13439dtrace_dof_property(const char *name)13440{13441#ifdef __FreeBSD__13442uint8_t *dofbuf;13443u_char *data, *eol;13444caddr_t doffile;13445size_t bytes, len, i;13446dof_hdr_t *dof;13447u_char c1, c2;1344813449dof = NULL;1345013451doffile = preload_search_by_type("dtrace_dof");13452if (doffile == NULL)13453return (NULL);1345413455data = preload_fetch_addr(doffile);13456len = preload_fetch_size(doffile);13457for (;;) {13458/* Look for the end of the line. All lines end in a newline. */13459eol = memchr(data, '\n', len);13460if (eol == NULL)13461return (NULL);1346213463if (strncmp(name, data, strlen(name)) == 0)13464break;1346513466eol++; /* skip past the newline */13467len -= eol - data;13468data = eol;13469}1347013471/* We've found the data corresponding to the specified key. 
*/1347213473data += strlen(name) + 1; /* skip past the '=' */13474len = eol - data;13475if (len % 2 != 0) {13476dtrace_dof_error(NULL, "invalid DOF encoding length");13477goto doferr;13478}13479bytes = len / 2;13480if (bytes < sizeof(dof_hdr_t)) {13481dtrace_dof_error(NULL, "truncated header");13482goto doferr;13483}1348413485/*13486* Each byte is represented by the two ASCII characters in its hex13487* representation.13488*/13489dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK);13490for (i = 0; i < bytes; i++) {13491c1 = dtrace_dof_char(data[i * 2]);13492c2 = dtrace_dof_char(data[i * 2 + 1]);13493if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) {13494dtrace_dof_error(NULL, "invalid hex char in DOF");13495goto doferr;13496}13497dofbuf[i] = c1 * 16 + c2;13498}1349913500dof = (dof_hdr_t *)dofbuf;13501if (bytes < dof->dofh_loadsz) {13502dtrace_dof_error(NULL, "truncated DOF");13503goto doferr;13504}1350513506if (dof->dofh_loadsz >= dtrace_dof_maxsize) {13507dtrace_dof_error(NULL, "oversized DOF");13508goto doferr;13509}1351013511return (dof);1351213513doferr:13514free(dof, M_SOLARIS);13515return (NULL);13516#else /* __FreeBSD__ */13517uchar_t *buf;13518uint64_t loadsz;13519unsigned int len, i;13520dof_hdr_t *dof;1352113522/*13523* Unfortunately, array of values in .conf files are always (and13524* only) interpreted to be integer arrays. 
We must read our DOF13525* as an integer array, and then squeeze it into a byte array.13526*/13527if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,13528(char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)13529return (NULL);1353013531for (i = 0; i < len; i++)13532buf[i] = (uchar_t)(((int *)buf)[i]);1353313534if (len < sizeof (dof_hdr_t)) {13535ddi_prop_free(buf);13536dtrace_dof_error(NULL, "truncated header");13537return (NULL);13538}1353913540if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {13541ddi_prop_free(buf);13542dtrace_dof_error(NULL, "truncated DOF");13543return (NULL);13544}1354513546if (loadsz >= dtrace_dof_maxsize) {13547ddi_prop_free(buf);13548dtrace_dof_error(NULL, "oversized DOF");13549return (NULL);13550}1355113552dof = kmem_alloc(loadsz, KM_SLEEP);13553bcopy(buf, dof, loadsz);13554ddi_prop_free(buf);1355513556return (dof);13557#endif /* !__FreeBSD__ */13558}1355913560static void13561dtrace_dof_destroy(dof_hdr_t *dof)13562{13563kmem_free(dof, dof->dofh_loadsz);13564}1356513566/*13567* Return the dof_sec_t pointer corresponding to a given section index. If the13568* index is not valid, dtrace_dof_error() is called and NULL is returned. 
 * If a type other than DOF_SECT_NONE is specified, the header is checked
 * against this type and NULL is returned if the types do not match.
 */
static dof_sec_t *
dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
{
	/*
	 * 'sec' is computed before the bounds check below, but it is not
	 * dereferenced until after the check succeeds.
	 */
	dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
	    ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);

	if (i >= dof->dofh_secnum) {
		dtrace_dof_error(dof, "referenced section index is invalid");
		return (NULL);
	}

	if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
		dtrace_dof_error(dof, "referenced section is not loadable");
		return (NULL);
	}

	if (type != DOF_SECT_NONE && type != sec->dofs_type) {
		dtrace_dof_error(dof, "referenced section is the wrong type");
		return (NULL);
	}

	return (sec);
}

/*
 * Fill in 'desc' from the DOF_SECT_PROBEDESC section 'sec'.  Each of the
 * four name fields is an offset into the referenced string table and is
 * bounds-checked against the table before being copied out.
 */
static dtrace_probedesc_t *
dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
{
	dof_probedesc_t *probe;
	dof_sec_t *strtab;
	uintptr_t daddr = (uintptr_t)dof;
	uintptr_t str;
	size_t size;

	if (sec->dofs_type != DOF_SECT_PROBEDESC) {
		dtrace_dof_error(dof, "invalid probe section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in probe description");
		return (NULL);
	}

	if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
		dtrace_dof_error(dof, "truncated probe description");
		return (NULL);
	}

	probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
	strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);

	if (strtab == NULL)
		return (NULL);

	str = daddr + strtab->dofs_offset;
	size = strtab->dofs_size;

	if (probe->dofp_provider >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe provider");
		return (NULL);
	}

	/*
	 * The copies below are bounded by the remaining string table space,
	 * so a missing NUL cannot run past the table.
	 */
	(void) strncpy(desc->dtpd_provider,
	    (char *)(str + probe->dofp_provider),
	    MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));

	if (probe->dofp_mod >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe module");
		return (NULL);
	}

	(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
	    MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));

	if (probe->dofp_func >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe function");
		return (NULL);
	}

	(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
	    MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));

	if (probe->dofp_name >= strtab->dofs_size) {
		dtrace_dof_error(dof, "corrupt probe name");
		return (NULL);
	}

	(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
	    MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));

	return (desc);
}

/*
 * Construct a dtrace_difo_t from a DOF_SECT_DIFOHDR section.  The header
 * lists the indices of the sub-sections (DIF text, integer table, string
 * table, variable table) that make up the DIFO; the object is validated
 * and initialized before being returned.
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_difo_t *dp;
	size_t ttl = 0;
	dof_difohdr_t *dofd;
	uintptr_t daddr = (uintptr_t)dof;
	size_t max = dtrace_difo_maxsize;
	int i, l, n;

	/*
	 * Table driving per-section handling: where the buffer pointer and
	 * length live within dtrace_difo_t, the expected entry size and
	 * alignment, and the error message used for a duplicate section.
	 */
	static const struct {
		int section;
		int bufoffs;
		int lenoffs;
		int entsize;
		int align;
		const char *msg;
	} difo[] = {
		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
		sizeof (dif_instr_t), "multiple DIF sections" },

		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
		sizeof (uint64_t), "multiple integer tables" },

		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
		offsetof(dtrace_difo_t, dtdo_strlen), 0,
		sizeof (char), "multiple string tables" },

		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
		sizeof
 (uint_t), "multiple variable tables" },

		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
	};

	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
		dtrace_dof_error(dof, "invalid DIFO header section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in DIFO header");
		return (NULL);
	}

	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
	    sec->dofs_size % sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad size in DIFO header");
		return (NULL);
	}

	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	/* Number of section links; the header itself embeds the first. */
	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
	dp->dtdo_rtype = dofd->dofd_rtype;

	for (l = 0; l < n; l++) {
		dof_sec_t *subsec;
		void **bufp;
		uint32_t *lenp;

		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
		    dofd->dofd_links[l])) == NULL)
			goto err; /* invalid section link */

		/* 'ttl' caps the aggregate size of all sub-sections. */
		if (ttl + subsec->dofs_size > max) {
			dtrace_dof_error(dof, "exceeds maximum size");
			goto err;
		}

		ttl += subsec->dofs_size;

		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
			if (subsec->dofs_type != difo[i].section)
				continue;

			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
				dtrace_dof_error(dof, "section not loaded");
				goto err;
			}

			if (subsec->dofs_align != difo[i].align) {
				dtrace_dof_error(dof, "bad alignment");
				goto err;
			}

			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

			if (*bufp != NULL) {
				dtrace_dof_error(dof, difo[i].msg);
				goto err;
			}

			if (difo[i].entsize != subsec->dofs_entsize) {
				dtrace_dof_error(dof, "entry size mismatch");
				goto err;
			}

			if (subsec->dofs_entsize != 0 &&
			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
				dtrace_dof_error(dof, "corrupt entry size");
				goto err;
			}

			*lenp = subsec->dofs_size;
			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
			    *bufp, subsec->dofs_size);

			/* Table lengths are kept in units of entries. */
			if (subsec->dofs_entsize != 0)
				*lenp /= subsec->dofs_entsize;

			break;
		}

		/*
		 * If we encounter a loadable DIFO sub-section that is not
		 * known to us, assume this is a broken program and fail.
		 */
		if (difo[i].section == DOF_SECT_NONE &&
		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
			dtrace_dof_error(dof, "unrecognized DIFO subsection");
			goto err;
		}
	}

	if (dp->dtdo_buf == NULL) {
		/*
		 * We can't have a DIF object without DIF text.
		 */
		dtrace_dof_error(dof, "missing DIF text");
		goto err;
	}

	/*
	 * Before we validate the DIF object, run through the variable table
	 * looking for the strings -- if any of their size are under, we'll set
	 * their size to be the system-wide default string size.  Note that
	 * this should _not_ happen if the "strsize" option has been set --
	 * in this case, the compiler should have set the size to reflect the
	 * setting of the option.
	 */
	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_diftype_t *t = &v->dtdv_type;

		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
			continue;

		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
			t->dtdt_size = dtrace_strsize_default;
	}

	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
		goto err;

	dtrace_difo_init(dp, vstate);
	return (dp);

err:
	/* Release whichever sub-section buffers were populated above. */
	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
	return (NULL);
}

/*
 * Build a predicate from the DIFO described by a DOF_SECT_DIFOHDR section.
 */
static dtrace_predicate_t *
dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_difo_t *dp;

	if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
		return (NULL);

	return (dtrace_predicate_create(dp));
}

/*
 * Translate a DOF_SECT_ACTDESC section into a linked list of
 * dtrace_actdesc_t structures.  On error the partially-constructed list
 * is released and NULL is returned.
 */
static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
	dof_actdesc_t *desc;
	dof_sec_t *difosec;
	size_t offs;
	uintptr_t daddr = (uintptr_t)dof;
	uint64_t arg;
	dtrace_actkind_t kind;

	if (sec->dofs_type != DOF_SECT_ACTDESC) {
		dtrace_dof_error(dof, "invalid action section");
		return (NULL);
	}

	if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
		dtrace_dof_error(dof, "truncated action description");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (uint64_t))
 {
		dtrace_dof_error(dof, "bad alignment in action description");
		return (NULL);
	}

	if (sec->dofs_size < sec->dofs_entsize) {
		dtrace_dof_error(dof, "section entry size exceeds total size");
		return (NULL);
	}

	if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
		dtrace_dof_error(dof, "bad entry size in action description");
		return (NULL);
	}

	if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
		dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
		return (NULL);
	}

	for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
		desc = (dof_actdesc_t *)(daddr +
		    (uintptr_t)sec->dofs_offset + offs);
		kind = (dtrace_actkind_t)desc->dofa_kind;

		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
		    (kind != DTRACEACT_PRINTA ||
		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
		    (kind == DTRACEACT_DIFEXPR &&
		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
			dof_sec_t *strtab;
			char *str, *fmt;
			uint64_t i;

			/*
			 * The argument to these actions is an index into the
			 * DOF string table.  For printf()-like actions, this
			 * is the format string.  For print(), this is the
			 * CTF type of the expression result.
			 */
			if ((strtab = dtrace_dof_sect(dof,
			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
				goto err;

			str = (char *)((uintptr_t)dof +
			    (uintptr_t)strtab->dofs_offset);

			/* Find the NUL terminator within the string table. */
			for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
				if (str[i] == '\0')
					break;
			}

			if (i >= strtab->dofs_size) {
				dtrace_dof_error(dof, "bogus format string");
				goto err;
			}

			if (i == desc->dofa_arg) {
				dtrace_dof_error(dof, "empty format string");
				goto err;
			}

			/* Copy the string into a private, NUL-terminated buffer. */
			i -= desc->dofa_arg;
			fmt = kmem_alloc(i + 1, KM_SLEEP);
			bcopy(&str[desc->dofa_arg], fmt, i + 1);
			arg = (uint64_t)(uintptr_t)fmt;
		} else {
			if (kind == DTRACEACT_PRINTA) {
				ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
				arg = 0;
			} else {
				arg = desc->dofa_arg;
			}
		}

		act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
		    desc->dofa_uarg, arg);

		/* Append to the list we are building. */
		if (last != NULL) {
			last->dtad_next = act;
		} else {
			first = act;
		}

		last = act;

		if (desc->dofa_difo == DOF_SECIDX_NONE)
			continue;

		if ((difosec = dtrace_dof_sect(dof,
		    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
			goto err;

		act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);

		if (act->dtad_difo == NULL)
			goto err;
	}

	ASSERT(first != NULL);
	return (first);

err:
	/* Unwind: release every action descriptor created so far. */
	for (act = first; act != NULL; act = next) {
		next = act->dtad_next;
		dtrace_actdesc_release(act, vstate);
	}

	return (NULL);
}

/*
 * Build a dtrace_ecbdesc_t from a DOF_SECT_ECBDESC section, pulling in the
 * probe description, optional predicate and optional action list that it
 * references.
 */
static dtrace_ecbdesc_t *
dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_ecbdesc_t *ep;
	dof_ecbdesc_t *ecb;
	dtrace_probedesc_t *desc;
	dtrace_predicate_t *pred = NULL;

	if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
		dtrace_dof_error(dof, "truncated ECB description");
		return
 (NULL);
	}

	if (sec->dofs_align != sizeof (uint64_t)) {
		dtrace_dof_error(dof, "bad alignment in ECB description");
		return (NULL);
	}

	ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
	sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);

	if (sec == NULL)
		return (NULL);

	ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
	ep->dted_uarg = ecb->dofe_uarg;
	desc = &ep->dted_probe;

	if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
		goto err;

	if (ecb->dofe_pred != DOF_SECIDX_NONE) {
		if ((sec = dtrace_dof_sect(dof,
		    DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
			goto err;

		if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
			goto err;

		ep->dted_pred.dtpdd_predicate = pred;
	}

	if (ecb->dofe_actions != DOF_SECIDX_NONE) {
		if ((sec = dtrace_dof_sect(dof,
		    DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
			goto err;

		ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);

		if (ep->dted_action == NULL)
			goto err;
	}

	return (ep);

err:
	if (pred != NULL)
		dtrace_predicate_release(pred, vstate);
	kmem_free(ep, sizeof (dtrace_ecbdesc_t));
	return (NULL);
}

/*
 * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
 * specified DOF.  SETX relocations are computed using 'ubase', the base load
 * address of the object containing the DOF, and DOFREL relocations are
 * relative to the relocation offset within the DOF.
 */
static int
dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase,
    uint64_t udaddr)
{
	uintptr_t daddr = (uintptr_t)dof;
	uintptr_t ts_end;
	dof_relohdr_t *dofr =
	    (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	dof_sec_t *ss, *rs, *ts;
	dof_relodesc_t *r;
	uint_t i, n;

	if (sec->dofs_size < sizeof (dof_relohdr_t) ||
	    sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "invalid relocation header");
		return (-1);
	}

	ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
	rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
	ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
	/*
	 * NOTE(review): ts_end is computed before 'ts' is NULL-checked;
	 * the value is never used on the NULL path since we return below.
	 */
	ts_end = (uintptr_t)ts + sizeof (dof_sec_t);

	if (ss == NULL || rs == NULL || ts == NULL)
		return (-1); /* dtrace_dof_error() has been called already */

	if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
	    rs->dofs_align != sizeof (uint64_t)) {
		dtrace_dof_error(dof, "invalid relocation section");
		return (-1);
	}

	r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
	n = rs->dofs_size / rs->dofs_entsize;

	for (i = 0; i < n; i++) {
		uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;

		switch (r->dofr_type) {
		case DOF_RELO_NONE:
			break;
		case DOF_RELO_SETX:
		case DOF_RELO_DOFREL:
			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
			    sizeof (uint64_t) > ts->dofs_size) {
				dtrace_dof_error(dof, "bad relocation offset");
				return (-1);
			}

			/* Refuse relocations that target the target-section header. */
			if (taddr >= (uintptr_t)ts && taddr < ts_end) {
				dtrace_dof_error(dof, "bad relocation offset");
				return (-1);
			}

			if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
				dtrace_dof_error(dof, "misaligned setx relo");
				return (-1);
			}

			if (r->dofr_type == DOF_RELO_SETX)
				*(uint64_t *)taddr += ubase;
			else
				*(uint64_t *)taddr +=
				    udaddr + ts->dofs_offset + r->dofr_offset;
			break;
		default:
			dtrace_dof_error(dof, "invalid relocation type");
			return (-1);
		}

		r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
	}

	return (0);
}

/*
 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
 * header:  it should be at the front of a memory region that is at least
 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
 * size.  It need not be validated in any other way.
 */
static int
dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
    dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes)
{
	uint64_t len = dof->dofh_loadsz, seclen;
	uintptr_t daddr = (uintptr_t)dof;
	dtrace_ecbdesc_t *ep;
	dtrace_enabling_t *enab;
	uint_t i;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));

	/*
	 * Check the DOF header identification bytes.  In addition to checking
In addition to checking14150* valid settings, we also verify that unused bits/bytes are zeroed so14151* we can use them later without fear of regressing existing binaries.14152*/14153if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],14154DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {14155dtrace_dof_error(dof, "DOF magic string mismatch");14156return (-1);14157}1415814159if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&14160dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {14161dtrace_dof_error(dof, "DOF has invalid data model");14162return (-1);14163}1416414165if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {14166dtrace_dof_error(dof, "DOF encoding mismatch");14167return (-1);14168}1416914170if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&14171dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {14172dtrace_dof_error(dof, "DOF version mismatch");14173return (-1);14174}1417514176if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {14177dtrace_dof_error(dof, "DOF uses unsupported instruction set");14178return (-1);14179}1418014181if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {14182dtrace_dof_error(dof, "DOF uses too many integer registers");14183return (-1);14184}1418514186if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {14187dtrace_dof_error(dof, "DOF uses too many tuple registers");14188return (-1);14189}1419014191for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {14192if (dof->dofh_ident[i] != 0) {14193dtrace_dof_error(dof, "DOF has invalid ident byte set");14194return (-1);14195}14196}1419714198if (dof->dofh_flags & ~DOF_FL_VALID) {14199dtrace_dof_error(dof, "DOF has invalid flag bits set");14200return (-1);14201}1420214203if (dof->dofh_secsize == 0) {14204dtrace_dof_error(dof, "zero section header size");14205return (-1);14206}1420714208/*14209* Check that the section headers don't exceed the amount of DOF14210* data. 
Note that we cast the section size and number of sections14211* to uint64_t's to prevent possible overflow in the multiplication.14212*/14213seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;1421414215if (dof->dofh_secoff > len || seclen > len ||14216dof->dofh_secoff + seclen > len) {14217dtrace_dof_error(dof, "truncated section headers");14218return (-1);14219}1422014221if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {14222dtrace_dof_error(dof, "misaligned section headers");14223return (-1);14224}1422514226if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {14227dtrace_dof_error(dof, "misaligned section size");14228return (-1);14229}1423014231/*14232* Take an initial pass through the section headers to be sure that14233* the headers don't have stray offsets. If the 'noprobes' flag is14234* set, do not permit sections relating to providers, probes, or args.14235*/14236for (i = 0; i < dof->dofh_secnum; i++) {14237dof_sec_t *sec = (dof_sec_t *)(daddr +14238(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);1423914240if (noprobes) {14241switch (sec->dofs_type) {14242case DOF_SECT_PROVIDER:14243case DOF_SECT_PROBES:14244case DOF_SECT_PRARGS:14245case DOF_SECT_PROFFS:14246dtrace_dof_error(dof, "illegal sections "14247"for enabling");14248return (-1);14249}14250}1425114252if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&14253!(sec->dofs_flags & DOF_SECF_LOAD)) {14254dtrace_dof_error(dof, "loadable section with load "14255"flag unset");14256return (-1);14257}1425814259if (!(sec->dofs_flags & DOF_SECF_LOAD))14260continue; /* just ignore non-loadable sections */1426114262if (!ISP2(sec->dofs_align)) {14263dtrace_dof_error(dof, "bad section alignment");14264return (-1);14265}1426614267if (sec->dofs_offset & (sec->dofs_align - 1)) {14268dtrace_dof_error(dof, "misaligned section");14269return (-1);14270}1427114272if (sec->dofs_offset > len || sec->dofs_size > len ||14273sec->dofs_offset + sec->dofs_size > len) {14274dtrace_dof_error(dof, "corrupt section 
header");14275return (-1);14276}1427714278if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +14279sec->dofs_offset + sec->dofs_size - 1) != '\0') {14280dtrace_dof_error(dof, "non-terminating string table");14281return (-1);14282}14283}1428414285/*14286* Take a second pass through the sections and locate and perform any14287* relocations that are present. We do this after the first pass to14288* be sure that all sections have had their headers validated.14289*/14290for (i = 0; i < dof->dofh_secnum; i++) {14291dof_sec_t *sec = (dof_sec_t *)(daddr +14292(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);1429314294if (!(sec->dofs_flags & DOF_SECF_LOAD))14295continue; /* skip sections that are not loadable */1429614297switch (sec->dofs_type) {14298case DOF_SECT_URELHDR:14299if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0)14300return (-1);14301break;14302}14303}1430414305if ((enab = *enabp) == NULL)14306enab = *enabp = dtrace_enabling_create(vstate);1430714308for (i = 0; i < dof->dofh_secnum; i++) {14309dof_sec_t *sec = (dof_sec_t *)(daddr +14310(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);1431114312if (sec->dofs_type != DOF_SECT_ECBDESC)14313continue;1431414315if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {14316dtrace_enabling_destroy(enab);14317*enabp = NULL;14318return (-1);14319}1432014321dtrace_enabling_add(enab, ep);14322}1432314324return (0);14325}1432614327/*14328* Process DOF for any options. 
 * This routine assumes that the DOF has been
 * at least processed by dtrace_dof_slurp().
 */
static int
dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
{
	int i, rval;
	uint32_t entsize;
	size_t offs;
	dof_optdesc_t *desc;

	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);

		if (sec->dofs_type != DOF_SECT_OPTDESC)
			continue;

		if (sec->dofs_align != sizeof (uint64_t)) {
			dtrace_dof_error(dof, "bad alignment in "
			    "option description");
			return (EINVAL);
		}

		if ((entsize = sec->dofs_entsize) == 0) {
			dtrace_dof_error(dof, "zeroed option entry size");
			return (EINVAL);
		}

		if (entsize < sizeof (dof_optdesc_t)) {
			dtrace_dof_error(dof, "bad option entry size");
			return (EINVAL);
		}

		/* Apply each option descriptor to the consumer state. */
		for (offs = 0; offs < sec->dofs_size; offs += entsize) {
			desc = (dof_optdesc_t *)((uintptr_t)dof +
			    (uintptr_t)sec->dofs_offset + offs);

			if (desc->dofo_strtab != DOF_SECIDX_NONE) {
				dtrace_dof_error(dof, "non-zero option string");
				return (EINVAL);
			}

			if (desc->dofo_value == DTRACEOPT_UNSET) {
				dtrace_dof_error(dof, "unset option");
				return (EINVAL);
			}

			if ((rval = dtrace_state_option(state,
			    desc->dofo_option, desc->dofo_value)) != 0) {
				dtrace_dof_error(dof, "rejected option");
				return (rval);
			}
		}
	}

	return (0);
}

/*
 * DTrace Consumer State Functions
 */
static int
dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
{
	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
	void *base;
	uintptr_t limit;
	dtrace_dynvar_t *dvar, *next, *start;
	int i;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);

	bzero(dstate, sizeof (dtrace_dstate_t));

	if ((dstate->dtds_chunksize = chunksize) == 0)
		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;

	VERIFY(dstate->dtds_chunksize < LONG_MAX);

	/* Room for at least one chunk plus one hash bucket. */
	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
		size = min;

	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
		return (ENOMEM);

	dstate->dtds_size = size;
	dstate->dtds_base = base;
	dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
	bzero(dstate->dtds_percpu,
	    (mp_maxid + 1) * sizeof (dtrace_dstate_percpu_t));

	hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));

	/* Keep the hash size odd (unless it is 1) for better distribution. */
	if (hashsize != 1 && (hashsize & 1))
		hashsize--;

	dstate->dtds_hashsize = hashsize;
	dstate->dtds_hash = dstate->dtds_base;

	/*
	 * Set all of our hash buckets to point to the single sink, and (if
	 * it hasn't already been set), set the sink's hash value to be the
	 * sink sentinel value.  The sink is needed for dynamic variable
	 * lookups to know that they have iterated over an entire, valid hash
	 * chain.
	 */
	for (i = 0; i < hashsize; i++)
		dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;

	if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
		dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;

	/*
	 * Determine number of active CPUs.  Divide free list evenly among
	 * active CPUs.
	 */
	start = (dtrace_dynvar_t *)
	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
	limit = (uintptr_t)base + size;

	VERIFY((uintptr_t)start < limit);
	VERIFY((uintptr_t)start >= (uintptr_t)base);

	maxper = (limit - (uintptr_t)start) / (mp_maxid + 1);
	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;

	CPU_FOREACH(i) {
		dstate->dtds_percpu[i].dtdsc_free = dvar = start;

		/*
		 * If we don't even have enough chunks to make it once through
		 * NCPUs, we're just going to allocate everything to the first
		 * CPU.  And if we're on the last CPU, we're going to allocate
		 * whatever is left over.  In either case, we set the limit to
		 * be the limit of the dynamic variable space.
		 */
		if (maxper == 0 || i == mp_maxid) {
			limit = (uintptr_t)base + size;
			start = NULL;
		} else {
			limit = (uintptr_t)start + maxper;
			start = (dtrace_dynvar_t *)limit;
		}

		VERIFY(limit <= (uintptr_t)base + size);

		/* Chain this CPU's chunks into its free list. */
		for (;;) {
			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
			    dstate->dtds_chunksize);

			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
				break;

			VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
			    (uintptr_t)dvar <= (uintptr_t)base + size);
			dvar->dtdv_next = next;
			dvar = next;
		}

		if (maxper == 0)
			break;
	}

	return (0);
}

static void
dtrace_dstate_fini(dtrace_dstate_t *dstate)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (dstate->dtds_base == NULL)
		return;

	kmem_free(dstate->dtds_base, dstate->dtds_size);
	kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
}

static void
dtrace_vstate_fini(dtrace_vstate_t *vstate)
{
	/*
	 * Logical XOR, where are you?
	 */
	ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));

	if (vstate->dtvs_nglobals > 0)
{14517kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *14518sizeof (dtrace_statvar_t *));14519}1452014521if (vstate->dtvs_ntlocals > 0) {14522kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *14523sizeof (dtrace_difv_t));14524}1452514526ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));1452714528if (vstate->dtvs_nlocals > 0) {14529kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *14530sizeof (dtrace_statvar_t *));14531}14532}1453314534#ifdef illumos14535static void14536dtrace_state_clean(dtrace_state_t *state)14537{14538if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)14539return;1454014541dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);14542dtrace_speculation_clean(state);14543}1454414545static void14546dtrace_state_deadman(dtrace_state_t *state)14547{14548hrtime_t now;1454914550dtrace_sync();1455114552now = dtrace_gethrtime();1455314554if (state != dtrace_anon.dta_state &&14555now - state->dts_laststatus >= dtrace_deadman_user)14556return;1455714558/*14559* We must be sure that dts_alive never appears to be less than the14560* value upon entry to dtrace_state_deadman(), and because we lack a14561* dtrace_cas64(), we cannot store to it atomically. We thus instead14562* store INT64_MAX to it, followed by a memory barrier, followed by14563* the new value. 
	 * This assures that dts_alive never appears to be
	 * less than its true value, regardless of the order in which the
	 * stores to the underlying storage are issued.
	 */
	state->dts_alive = INT64_MAX;
	dtrace_membar_producer();
	state->dts_alive = now;
}
#else	/* !illumos */
/*
 * FreeBSD counterpart of the cyclic-driven cleaner: same cleaning work,
 * but self-rearming via callout_reset() at the configured clean rate.
 * Note that an INACTIVE state returns without rearming the callout.
 */
static void
dtrace_state_clean(void *arg)
{
	dtrace_state_t *state = arg;
	dtrace_optval_t *opt = state->dts_options;

	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
		return;

	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
	dtrace_speculation_clean(state);

	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
	    dtrace_state_clean, state);
}

/*
 * FreeBSD deadman callout: refresh dts_alive (see the ordering comment
 * below) and rearm itself at dtrace_deadman_interval.
 */
static void
dtrace_state_deadman(void *arg)
{
	dtrace_state_t *state = arg;
	hrtime_t now;

	dtrace_sync();

	dtrace_debug_output();

	now = dtrace_gethrtime();

	if (state != dtrace_anon.dta_state &&
	    now - state->dts_laststatus >= dtrace_deadman_user)
		return;

	/*
	 * We must be sure that dts_alive never appears to be less than the
	 * value upon entry to dtrace_state_deadman(), and because we lack a
	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
	 * store INT64_MAX to it, followed by a memory barrier, followed by
	 * the new value.
	 * This assures that dts_alive never appears to be
	 * less than its true value, regardless of the order in which the
	 * stores to the underlying storage are issued.
	 */
	state->dts_alive = INT64_MAX;
	dtrace_membar_producer();
	state->dts_alive = now;

	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
	    dtrace_state_deadman, state);
}
#endif	/* illumos */

/*
 * Allocate and initialize a new consumer state: device bookkeeping,
 * principal and aggregation buffer arrays, per-CPU PRNG state, cleaner and
 * deadman timers, default option values, and the credential-derived
 * visibility/destructiveness flags.  Returns the new state, or NULL on
 * illumos if soft-state allocation fails.  Called with dtrace_lock and
 * cpu_lock held.
 */
static dtrace_state_t *
#ifdef illumos
dtrace_state_create(dev_t *devp, cred_t *cr)
#else
dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
#endif
{
#ifdef illumos
	minor_t minor;
	major_t major;
#else
	cred_t *cr = NULL;
	int m = 0;
#endif
	char c[30];
	dtrace_state_t *state;
	dtrace_optval_t *opt;
	int bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t), i;
	int cpu_it;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));

#ifdef illumos
	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
	    VM_BESTFIT | VM_SLEEP);

	if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
		vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
		return (NULL);
	}

	state = ddi_get_soft_state(dtrace_softstate, minor);
#else
	if (dev != NULL) {
		cr = dev->si_cred;
		m = dev2unit(dev);
	}

	/* Allocate memory for the state. */
	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
#endif

	state->dts_epid = DTRACE_EPIDNONE + 1;

	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
#ifdef illumos
	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);

	if (devp != NULL) {
		major = getemajor(*devp);
	} else {
		major = ddi_driver_major(dtrace_devi);
	}

	state->dts_dev = makedevice(major, minor);

	if (devp != NULL)
		*devp = state->dts_dev;
#else
	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
	state->dts_dev = dev;
#endif

	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);

	/*
	 * Allocate and initialise the per-process per-CPU random state.
	 * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is
	 * assumed to be seeded at this point (if from Fortuna seed file).
	 */
	arc4random_buf(&state->dts_rstate[0], 2 * sizeof(uint64_t));
	for (cpu_it = 1; cpu_it <= mp_maxid; cpu_it++) {
		/*
		 * Each CPU is assigned a 2^64 period, non-overlapping
		 * subsequence.
		 */
		dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it - 1],
		    state->dts_rstate[cpu_it]);
	}

#ifdef illumos
	state->dts_cleaner = CYCLIC_NONE;
	state->dts_deadman = CYCLIC_NONE;
#else
	callout_init(&state->dts_cleaner, 1);
	callout_init(&state->dts_deadman, 1);
#endif
	state->dts_vstate.dtvs_state = state;

	for (i = 0; i < DTRACEOPT_MAX; i++)
		state->dts_options[i] = DTRACEOPT_UNSET;

	/*
	 * Set the default options.
	 */
	opt = state->dts_options;
	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;

	state->dts_activity = DTRACE_ACTIVITY_INACTIVE;

	/*
	 * Depending on the user credentials, we set flag bits which alter probe
	 * visibility or the amount of destructiveness allowed.  In the case of
	 * actual anonymous tracing, or the possession of all privileges, all of
	 * the normal checks are bypassed.
	 */
	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
	} else {
		/*
		 * Set up the credentials for this instantiation.  We take a
		 * hold on the credential to prevent it from disappearing on
		 * us; this in turn prevents the zone_t referenced by this
		 * credential from disappearing.  This means that we can
		 * examine the credential and the zone from probe context.
		 */
		crhold(cr);
		state->dts_cred.dcr_cred = cr;

		/*
		 * CRA_PROC means "we have *some* privilege for dtrace" and
		 * unlocks the use of variables like pid, zonename, etc.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
		}

		/*
		 * dtrace_user allows use of syscall and profile providers.
		 * If the user also has proc_owner and/or proc_zone, we
		 * extend the scope to include additional visibility and
		 * destructive power.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
				state->dts_cred.dcr_visible |=
				    DTRACE_CRV_ALLPROC;

				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
			}

			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
				state->dts_cred.dcr_visible |=
				    DTRACE_CRV_ALLZONE;

				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
			}

			/*
			 * If we have all privs in whatever zone this is,
			 * we can do destructive things to processes which
			 * have altered credentials.
			 */
#ifdef illumos
			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
			    cr->cr_zone->zone_privset)) {
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
			}
#endif
		}

		/*
		 * Holding the dtrace_kernel privilege also implies that
		 * the user has the dtrace_user privilege from a visibility
		 * perspective.  But without further privileges, some
		 * destructive actions are not available.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
			/*
			 * Make all probes in all zones visible.  However,
			 * this doesn't mean that all actions become available
			 * to all zones.
			 */
			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;

			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
			    DTRACE_CRA_PROC;
			/*
			 * Holding proc_owner means that destructive actions
			 * for *this* zone are allowed.
			 */
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;

			/*
			 * Holding proc_zone means that destructive actions
			 * for this user/group ID in all zones is allowed.
			 */
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;

#ifdef illumos
			/*
			 * If we have all privs in whatever zone this is,
			 * we can do destructive things to processes which
			 * have altered credentials.
			 */
			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
			    cr->cr_zone->zone_privset)) {
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
			}
#endif
		}

		/*
		 * Holding the dtrace_proc privilege gives control over fasttrap
		 * and pid providers.  We need to grant wider destructive
		 * privileges in the event that the user has proc_owner and/or
		 * proc_zone.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;

			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
		}
	}

	return (state);
}

/*
 * Allocate the buffer selected by 'which' (principal, aggregation, or
 * speculation size option) according to the state's options, retrying with
 * smaller sizes on ENOMEM unless bufresize is manual.  Returns 0 on
 * success, E2BIG if the size cannot cover the prereserved space, or ENOMEM
 * if no size could be allocated.  Called with dtrace_lock and cpu_lock
 * held.
 */
static int
dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
{
	dtrace_optval_t *opt = state->dts_options, size;
	processorid_t cpu = 0;
	int flags = 0, rval, factor, divisor = 1;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(which < DTRACEOPT_MAX);
	ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
	    (state == dtrace_anon.dta_state &&
	    state->dts_activity == DTRACE_ACTIVITY_ACTIVE));

	if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
		return (0);

	if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
		cpu = opt[DTRACEOPT_CPU];

	if (which == DTRACEOPT_SPECSIZE)
		flags |= DTRACEBUF_NOSWITCH;

	if (which == DTRACEOPT_BUFSIZE) {
		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
			flags |= DTRACEBUF_RING;

		if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
			flags |= DTRACEBUF_FILL;

		if (state != dtrace_anon.dta_state ||
		    state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
			flags |= DTRACEBUF_INACTIVE;
	}

	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
		/*
		 * The size must be 8-byte aligned.
If the size is not 8-byte14909* aligned, drop it down by the difference.14910*/14911if (size & (sizeof (uint64_t) - 1))14912size -= size & (sizeof (uint64_t) - 1);1491314914if (size < state->dts_reserve) {14915/*14916* Buffers always must be large enough to accommodate14917* their prereserved space. We return E2BIG instead14918* of ENOMEM in this case to allow for user-level14919* software to differentiate the cases.14920*/14921return (E2BIG);14922}1492314924rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);1492514926if (rval != ENOMEM) {14927opt[which] = size;14928return (rval);14929}1493014931if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)14932return (rval);1493314934for (divisor = 2; divisor < factor; divisor <<= 1)14935continue;14936}1493714938return (ENOMEM);14939}1494014941static int14942dtrace_state_buffers(dtrace_state_t *state)14943{14944dtrace_speculation_t *spec = state->dts_speculations;14945int rval, i;1494614947if ((rval = dtrace_state_buffer(state, state->dts_buffer,14948DTRACEOPT_BUFSIZE)) != 0)14949return (rval);1495014951if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,14952DTRACEOPT_AGGSIZE)) != 0)14953return (rval);1495414955for (i = 0; i < state->dts_nspeculations; i++) {14956if ((rval = dtrace_state_buffer(state,14957spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)14958return (rval);14959}1496014961return (0);14962}1496314964static void14965dtrace_state_prereserve(dtrace_state_t *state)14966{14967dtrace_ecb_t *ecb;14968dtrace_probe_t *probe;1496914970state->dts_reserve = 0;1497114972if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)14973return;1497414975/*14976* If our buffer policy is a "fill" buffer policy, we need to set the14977* prereserved space to be the space required by the END probes.14978*/14979probe = dtrace_probes[dtrace_probeid_end - 1];14980ASSERT(probe != NULL);1498114982for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {14983if (ecb->dte_state != 
		    state)
			continue;

		state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
	}
}

/*
 * Start tracing for a state: prime retained enablings, allocate
 * speculation, principal, aggregation, and dynamic-variable buffers, arm
 * the cleaner and deadman timers, fire the BEGIN probe, and activate the
 * per-CPU buffers.  On success, returns 0 with *cpu set to the CPU on
 * which BEGIN fired.  Takes cpu_lock and dtrace_lock for the duration.
 */
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
	dtrace_optval_t *opt = state->dts_options, sz, nspec;
	dtrace_speculation_t *spec;
	dtrace_buffer_t *buf;
#ifdef illumos
	cyc_handler_t hdlr;
	cyc_time_t when;
#endif
	int rval = 0, i, bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t);
	dtrace_icookie_t cookie;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
		rval = EBUSY;
		goto out;
	}

	/*
	 * Before we can perform any checks, we must prime all of the
	 * retained enablings that correspond to this state.
	 */
	dtrace_enabling_prime(state);

	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
		rval = EACCES;
		goto out;
	}

	dtrace_state_prereserve(state);

	/*
	 * Now what we want to do is try to allocate our speculations.
	 * We do not automatically resize the number of speculations; if
	 * this fails, we will fail the operation.
	 */
	nspec = opt[DTRACEOPT_NSPEC];
	ASSERT(nspec != DTRACEOPT_UNSET);

	if (nspec > INT_MAX) {
		rval = ENOMEM;
		goto out;
	}

	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
	    KM_NOSLEEP | KM_NORMALPRI);

	if (spec == NULL) {
		rval = ENOMEM;
		goto out;
	}

	state->dts_speculations = spec;
	state->dts_nspeculations = (int)nspec;

	for (i = 0; i < nspec; i++) {
		if ((buf = kmem_zalloc(bufsize,
		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
			rval = ENOMEM;
			goto err;
		}

		spec[i].dtsp_buffer = buf;
	}

	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
		if (dtrace_anon.dta_state == NULL) {
			rval = ENOENT;
			goto out;
		}

		if (state->dts_necbs != 0) {
			rval = EALREADY;
			goto out;
		}

		/*
		 * Grab the anonymous state; from here on, 'state' refers to
		 * the grabbed (anonymous) state, not the grabbing one.
		 */
		state->dts_anon = dtrace_anon_grab();
		ASSERT(state->dts_anon != NULL);
		state = state->dts_anon;

		/*
		 * We want "grabanon" to be set in the grabbed state, so we'll
		 * copy that option value from the grabbing state into the
		 * grabbed state.
		 */
		state->dts_options[DTRACEOPT_GRABANON] =
		    opt[DTRACEOPT_GRABANON];

		*cpu = dtrace_anon.dta_beganon;

		/*
		 * If the anonymous state is active (as it almost certainly
		 * is if the anonymous enabling ultimately matched anything),
		 * we don't allow any further option processing -- but we
		 * don't return failure.
		 */
		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
			goto out;
	}

	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
	    opt[DTRACEOPT_AGGSIZE] != 0) {
		if (state->dts_aggregations == NULL) {
			/*
			 * We're not going to create an aggregation buffer
			 * because we don't have any ECBs that contain
			 * aggregations -- set this option to 0.
			 */
			opt[DTRACEOPT_AGGSIZE] = 0;
		} else {
			/*
			 * If we have an aggregation buffer, we must also have
			 * a buffer to use as scratch.
			 */
			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
			    opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
			}
		}
	}

	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
	    opt[DTRACEOPT_SPECSIZE] != 0) {
		if (!state->dts_speculates) {
			/*
			 * We're not going to create speculation buffers
			 * because we don't have any ECBs that actually
			 * speculate -- set the speculation size to 0.
			 */
			opt[DTRACEOPT_SPECSIZE] = 0;
		}
	}

	/*
	 * The bare minimum size for any buffer that we're actually going to
	 * do anything to is sizeof (uint64_t).
	 */
	sz = sizeof (uint64_t);

	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
		/*
		 * A buffer size has been explicitly set to 0 (or to a size
		 * that will be adjusted to 0) and we need the space -- we
		 * need to return failure.  We return ENOSPC to differentiate
		 * it from failing to allocate a buffer due to failure to meet
		 * the reserve (for which we return E2BIG).
		 */
		rval = ENOSPC;
		goto out;
	}

	if ((rval = dtrace_state_buffers(state)) != 0)
		goto err;

	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
		sz = dtrace_dstate_defsize;

	/*
	 * Initialize the dynamic variable space, halving the size on each
	 * failure (unless bufresize is manual).
	 */
	do {
		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);

		if (rval == 0)
			break;

		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
			goto err;
	} while (sz >>= 1);

	opt[DTRACEOPT_DYNVARSIZE] = sz;

	if (rval != 0)
		goto err;

	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;

	if (opt[DTRACEOPT_CLEANRATE] == 0)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;

	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
#ifdef illumos
	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
	hdlr.cyh_arg = state;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;
	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];

	state->dts_cleaner = cyclic_add(&hdlr, &when);

	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
	hdlr.cyh_arg = state;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;
	when.cyt_interval = dtrace_deadman_interval;

	state->dts_deadman = cyclic_add(&hdlr, &when);
#else
	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
	    dtrace_state_clean, state);
	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
	    dtrace_state_deadman, state);
#endif

	state->dts_activity = DTRACE_ACTIVITY_WARMUP;

#ifdef illumos
	if (state->dts_getf != 0 &&
	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
		/*
		 * We don't have kernel privs but we have at least one call
		 * to getf(); we need to bump our zone's count, and (if
		 * this is the first enabling to have an unprivileged call
		 * to getf()) we need to hook into closef().
		 */
		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;

		if (dtrace_getf++ == 0) {
			ASSERT(dtrace_closef == NULL);
			dtrace_closef = dtrace_getf_barrier;
		}
	}
#endif

	/*
	 * Now it's time to actually fire the BEGIN probe.  We need to disable
	 * interrupts here both to record the CPU on which we fired the BEGIN
	 * probe (the data from this CPU will be processed first at user
	 * level) and to manually activate the buffer for this CPU.
	 */
	cookie = dtrace_interrupt_disable();
	*cpu = curcpu;
	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;

	dtrace_probe(dtrace_probeid_begin,
	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
	dtrace_interrupt_enable(cookie);
	/*
	 * We may have had an exit action from a BEGIN probe; only change our
	 * state to ACTIVE if we're still in WARMUP.
	 */
	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);

	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;

#ifdef __FreeBSD__
	/*
	 * We enable anonymous tracing before APs are started, so we must
	 * activate buffers using the current CPU.
	 */
	if (state == dtrace_anon.dta_state) {
		CPU_FOREACH(i)
			dtrace_buffer_activate_cpu(state, i);
	} else
		dtrace_xcall(DTRACE_CPUALL,
		    (dtrace_xcall_t)dtrace_buffer_activate, state);
#else
	/*
	 * Regardless of whether or not now we're in ACTIVE or DRAINING, we
	 * want each CPU to transition its principal buffer out of the
	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
	 * atomically transition from processing none of a state's ECBs to
	 * processing all of them.
	 */
	dtrace_xcall(DTRACE_CPUALL,
	    (dtrace_xcall_t)dtrace_buffer_activate, state);
#endif
	goto out;

err:
	dtrace_buffer_free(state->dts_buffer);
	dtrace_buffer_free(state->dts_aggbuffer);

	if ((nspec = state->dts_nspeculations) == 0) {
		ASSERT(state->dts_speculations == NULL);
		goto out;
	}

	spec = state->dts_speculations;
	ASSERT(spec != NULL);

	for (i = 0; i < state->dts_nspeculations; i++) {
		if ((buf = spec[i].dtsp_buffer) == NULL)
			break;

		dtrace_buffer_free(buf);
		kmem_free(buf, bufsize);
	}

	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
	state->dts_nspeculations = 0;
	state->dts_speculations = NULL;

out:
	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);

	return (rval);
}

/*
 * Stop tracing for a state: drain in-flight probe processing, fire the END
 * probe with interrupts disabled (reporting its CPU via *cpu so user-land
 * can process that buffer last), and move the state to STOPPED.  Returns
 * EINVAL if the state is not currently ACTIVE or DRAINING.  Called with
 * dtrace_lock held.
 */
static int
dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
{
	dtrace_icookie_t cookie;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
	    state->dts_activity != DTRACE_ACTIVITY_DRAINING)
		return (EINVAL);

	/*
	 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
	 * to be sure that every CPU has seen it.
	 * See below for the details
	 * on why this is done.
	 */
	state->dts_activity = DTRACE_ACTIVITY_DRAINING;
	dtrace_sync();

	/*
	 * By this point, it is impossible for any CPU to be still processing
	 * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
	 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
	 * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
	 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
	 * iff we're in the END probe.
	 */
	state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
	dtrace_sync();
	ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);

	/*
	 * Finally, we can release the reserve and call the END probe.  We
	 * disable interrupts across calling the END probe to allow us to
	 * return the CPU on which we actually called the END probe.  This
	 * allows user-land to be sure that this CPU's principal buffer is
	 * processed last.
	 */
	state->dts_reserve = 0;

	cookie = dtrace_interrupt_disable();
	*cpu = curcpu;
	dtrace_probe(dtrace_probeid_end,
	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
	dtrace_interrupt_enable(cookie);

	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
	dtrace_sync();

#ifdef illumos
	if (state->dts_getf != 0 &&
	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
		/*
		 * We don't have kernel privs but we have at least one call
		 * to getf(); we need to lower our zone's count, and (if
		 * this is the last enabling to have an unprivileged call
		 * to getf()) we need to clear the closef() hook.
		 */
		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
		ASSERT(dtrace_closef == dtrace_getf_barrier);
		ASSERT(dtrace_getf > 0);

		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;

		if (--dtrace_getf == 0)
			dtrace_closef = NULL;
	}
#endif

	return (0);
}

/*
 * Set a single option on an inactive state, validating the option ID and
 * value and clamping over-large buffer sizes.  Returns 0 on success, EBUSY
 * if the state is already active, EINVAL for a bad option or value, or
 * EACCES if destructive actions are administratively disallowed.  Called
 * with dtrace_lock held.
 */
static int
dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
    dtrace_optval_t val)
{
	ASSERT(MUTEX_HELD(&dtrace_lock));

	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
		return (EBUSY);

	if (option >= DTRACEOPT_MAX)
		return (EINVAL);

	if (option != DTRACEOPT_CPU && val < 0)
		return (EINVAL);

	switch (option) {
	case DTRACEOPT_DESTRUCTIVE:
		if (dtrace_destructive_disallow)
			return (EACCES);

		state->dts_cred.dcr_destructive = 1;
		break;

	case DTRACEOPT_BUFSIZE:
	case DTRACEOPT_DYNVARSIZE:
	case DTRACEOPT_AGGSIZE:
	case DTRACEOPT_SPECSIZE:
	case DTRACEOPT_STRSIZE:
		if (val < 0)
			return (EINVAL);

		if (val >= LONG_MAX) {
			/*
			 * If this is an otherwise negative value, set it to
			 * the highest multiple of 128m less than LONG_MAX.
			 * Technically, we're adjusting the size without
			 * regard to the buffer resizing policy, but in fact,
			 * this has no effect -- if we set the buffer size to
			 * ~LONG_MAX and the buffer policy is ultimately set to
			 * be "manual", the buffer allocation is guaranteed to
			 * fail, if only because the allocation requires two
			 * buffers.  (We set the size to the highest
			 * multiple of 128m because it ensures that the size
			 * will remain a multiple of a megabyte when
			 * repeatedly halved -- all the way down to 15m.)
			 */
			val = LONG_MAX - (1 << 27) + 1;
		}
	}

	state->dts_options[option] = val;

	return (0);
}

/*
 * Tear down a consumer state: retract retained enablings, kill a still-hot
 * enabling if necessary, disable and destroy all ECBs, free every buffer,
 * stop the cleaner and deadman timers, and release all per-state
 * allocations.  Called with dtrace_lock and cpu_lock held.
 */
static void
dtrace_state_destroy(dtrace_state_t *state)
{
	dtrace_ecb_t *ecb;
	dtrace_vstate_t *vstate = &state->dts_vstate;
#ifdef illumos
	minor_t minor = getminor(state->dts_dev);
#endif
	int i, bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t);
	dtrace_speculation_t *spec = state->dts_speculations;
	int nspec = state->dts_nspeculations;
	uint32_t match;

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * First, retract any retained enablings for this state.
	 */
	dtrace_enabling_retract(state);
	ASSERT(state->dts_nretained == 0);

	if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
	    state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
		/*
		 * We have managed to come into dtrace_state_destroy() on a
		 * hot enabling -- almost certainly because of a disorderly
		 * shutdown of a consumer.  (That is, a consumer that is
		 * exiting without having called dtrace_stop().)  In this case,
		 * we're going to set our activity to be KILLED, and then
		 * issue a sync to be sure that everyone is out of probe
		 * context before we start blowing away ECBs.
		 */
		state->dts_activity = DTRACE_ACTIVITY_KILLED;
		dtrace_sync();
	}

	/*
	 * Release the credential hold we took in dtrace_state_create().
	 */
	if (state->dts_cred.dcr_cred != NULL)
		crfree(state->dts_cred.dcr_cred);

	/*
	 * Now we can safely disable and destroy any enabled probes.
	 * Because
	 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
	 * (especially if they're all enabled), we take two passes through the
	 * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
	 * in the second we disable whatever is left over.
	 */
	for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
		for (i = 0; i < state->dts_necbs; i++) {
			if ((ecb = state->dts_ecbs[i]) == NULL)
				continue;

			if (match && ecb->dte_probe != NULL) {
				dtrace_probe_t *probe = ecb->dte_probe;
				dtrace_provider_t *prov = probe->dtpr_provider;

				if (!(prov->dtpv_priv.dtpp_flags & match))
					continue;
			}

			dtrace_ecb_disable(ecb);
			dtrace_ecb_destroy(ecb);
		}

		if (!match)
			break;
	}

	/*
	 * Before we free the buffers, perform one more sync to assure that
	 * every CPU is out of probe context.
	 */
	dtrace_sync();

	dtrace_buffer_free(state->dts_buffer);
	dtrace_buffer_free(state->dts_aggbuffer);

	for (i = 0; i < nspec; i++)
		dtrace_buffer_free(spec[i].dtsp_buffer);

#ifdef illumos
	if (state->dts_cleaner != CYCLIC_NONE)
		cyclic_remove(state->dts_cleaner);

	if (state->dts_deadman != CYCLIC_NONE)
		cyclic_remove(state->dts_deadman);
#else
	callout_stop(&state->dts_cleaner);
	callout_drain(&state->dts_cleaner);
	callout_stop(&state->dts_deadman);
	callout_drain(&state->dts_deadman);
#endif

	dtrace_dstate_fini(&vstate->dtvs_dynvars);
	dtrace_vstate_fini(vstate);
	if (state->dts_ecbs != NULL)
		kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));

	if (state->dts_aggregations != NULL) {
#ifdef DEBUG
		for (i = 0; i < state->dts_naggregations; i++)
			ASSERT(state->dts_aggregations[i] == NULL);
#endif
		ASSERT(state->dts_naggregations > 0);
		kmem_free(state->dts_aggregations,
		    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
	}

	kmem_free(state->dts_buffer, bufsize);
	kmem_free(state->dts_aggbuffer, bufsize);

	for (i = 0; i < nspec; i++)
		kmem_free(spec[i].dtsp_buffer, bufsize);

	if (spec != NULL)
		kmem_free(spec, nspec * sizeof (dtrace_speculation_t));

	dtrace_format_destroy(state);

	if (state->dts_aggid_arena != NULL) {
#ifdef illumos
		vmem_destroy(state->dts_aggid_arena);
#else
		delete_unrhdr(state->dts_aggid_arena);
#endif
		state->dts_aggid_arena = NULL;
	}
#ifdef illumos
	ddi_soft_state_free(dtrace_softstate, minor);
	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
#endif
}

/*
 * DTrace Anonymous Enabling Functions
 */

/*
 * Detach the anonymous state from dtrace_anon, destroying its enabling,
 * and hand the state to the caller.  Returns NULL if there is no anonymous
 * state.  Called with dtrace_lock held.
 */
static dtrace_state_t *
dtrace_anon_grab(void)
{
	dtrace_state_t *state;

	ASSERT(MUTEX_HELD(&dtrace_lock));

	if ((state = dtrace_anon.dta_state) == NULL) {
		ASSERT(dtrace_anon.dta_enabling == NULL);
		return (NULL);
	}

	ASSERT(dtrace_anon.dta_enabling != NULL);
	ASSERT(dtrace_retained != NULL);

	dtrace_enabling_destroy(dtrace_anon.dta_enabling);
	dtrace_anon.dta_enabling = NULL;
	dtrace_anon.dta_state = NULL;

	return (state);
}

/*
 * Process the "dof-data-N" properties: create the anonymous state on first
 * use and slurp each DOF blob into the anonymous enabling, retaining the
 * enabling once all blobs have been processed.  Called with dtrace_lock
 * and cpu_lock held.
 */
static void
dtrace_anon_property(void)
{
	int i, rv;
	dtrace_state_t *state;
	dof_hdr_t *dof;
	char c[32];		/* enough for "dof-data-" + digits */

	ASSERT(MUTEX_HELD(&dtrace_lock));
	ASSERT(MUTEX_HELD(&cpu_lock));

	for (i = 0; ; i++) {
		(void) snprintf(c, sizeof (c), "dof-data-%d", i);

		dtrace_err_verbose = 1;

		if ((dof = dtrace_dof_property(c)) == NULL) {
			dtrace_err_verbose = 0;
			break;
		}

#ifdef illumos
		/*
		 * We want to create anonymous state, so we need to transition
		 * the kernel debugger to indicate that DTrace is active.  If
		 * this fails (e.g.
		 * because the debugger has modified text in
		 * some way), we won't continue with the processing.
		 */
		if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
			cmn_err(CE_NOTE, "kernel debugger active; anonymous "
			    "enabling ignored.");
			dtrace_dof_destroy(dof);
			break;
		}
#endif

		/*
		 * If we haven't allocated an anonymous state, we'll do so now.
		 */
		if ((state = dtrace_anon.dta_state) == NULL) {
			state = dtrace_state_create(NULL, NULL);
			dtrace_anon.dta_state = state;

			if (state == NULL) {
				/*
				 * This basically shouldn't happen:  the only
				 * failure mode from dtrace_state_create() is a
				 * failure of ddi_soft_state_zalloc() that
				 * itself should never happen.  Still, the
				 * interface allows for a failure mode, and
				 * we want to fail as gracefully as possible:
				 * we'll emit an error message and cease
				 * processing anonymous state in this case.
				 */
				cmn_err(CE_WARN, "failed to create "
				    "anonymous state");
				dtrace_dof_destroy(dof);
				break;
			}
		}

		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
		    &dtrace_anon.dta_enabling, 0, 0, B_TRUE);

		if (rv == 0)
			rv = dtrace_dof_options(dof, state);

		dtrace_err_verbose = 0;
		dtrace_dof_destroy(dof);

		if (rv != 0) {
			/*
			 * This is malformed DOF; chuck any anonymous state
			 * that we created.
			 */
			ASSERT(dtrace_anon.dta_enabling == NULL);
			dtrace_state_destroy(state);
			dtrace_anon.dta_state = NULL;
			break;
		}

		ASSERT(dtrace_anon.dta_enabling != NULL);
	}

	if (dtrace_anon.dta_enabling != NULL) {
		int rval;

		/*
		 * dtrace_enabling_retain() can only fail because we are
		 * trying to retain more enablings than are allowed -- but
		 * we only have one anonymous enabling, and we are guaranteed
		 * to be allowed at least one retained enabling; we assert
		 * that dtrace_enabling_retain() returns success.
		 */
		rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
		ASSERT(rval == 0);

		dtrace_enabling_dump(dtrace_anon.dta_enabling);
	}
}

/*
 * DTrace Helper Functions
 */

/*
 * Record one entry in the global helper trace buffer -- the framework's
 * own debugging aid for helpers: which helper ran and where, the fault
 * state from this CPU's cpu_core flags, and a snapshot of the vstate's
 * local variables for this CPU.  A no-op when helper tracing is disabled
 * (dtrace_helptrace_buffer == NULL).
 */
static void
dtrace_helper_trace(dtrace_helper_action_t *helper,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
{
	uint32_t size, next, nnext, i;
	dtrace_helptrace_t *ent, *buffer;
	uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;

	if ((buffer = dtrace_helptrace_buffer) == NULL)
		return;

	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);

	/*
	 * What would a tracing framework be without its own tracing
	 * framework?  (Well, a hell of a lot simpler, for starters...)
	 */
	size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
	    sizeof (uint64_t) - sizeof (uint64_t);

	/*
	 * Iterate until we can allocate a slot in the trace buffer.
	 */
	do {
		next = dtrace_helptrace_next;

		if (next + size < dtrace_helptrace_bufsize) {
			nnext = next + size;
		} else {
			nnext = size;
		}
	} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);

	/*
	 * We have our slot; fill it in.
	 */
	if (nnext == size) {
		dtrace_helptrace_wrapped++;
		next = 0;
	}

	ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
	ent->dtht_helper = helper;
	ent->dtht_where = where;
	ent->dtht_nlocals = vstate->dtvs_nlocals;

	ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
	    mstate->dtms_fltoffs : -1;
	ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
	ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;

	for (i = 0; i < vstate->dtvs_nlocals; i++) {
		dtrace_statvar_t *svar;

		if ((svar = vstate->dtvs_locals[i]) == NULL)
			continue;

		ASSERT(svar->dtsv_size >= (mp_maxid + 1) * sizeof (uint64_t));
		ent->dtht_locals[i] =
		    ((uint64_t
*)(uintptr_t)svar->dtsv_data)[curcpu];15757}15758}1575915760static uint64_t15761dtrace_helper(int which, dtrace_mstate_t *mstate,15762dtrace_state_t *state, uint64_t arg0, uint64_t arg1)15763{15764uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;15765uint64_t sarg0 = mstate->dtms_arg[0];15766uint64_t sarg1 = mstate->dtms_arg[1];15767uint64_t rval = 0;15768dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;15769dtrace_helper_action_t *helper;15770dtrace_vstate_t *vstate;15771dtrace_difo_t *pred;15772int i, trace = dtrace_helptrace_buffer != NULL;1577315774ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);1577515776if (helpers == NULL)15777return (0);1577815779if ((helper = helpers->dthps_actions[which]) == NULL)15780return (0);1578115782vstate = &helpers->dthps_vstate;15783mstate->dtms_arg[0] = arg0;15784mstate->dtms_arg[1] = arg1;1578515786/*15787* Now iterate over each helper. If its predicate evaluates to 'true',15788* we'll call the corresponding actions. Note that the below calls15789* to dtrace_dif_emulate() may set faults in machine state. 
This is15790* okay: our caller (the outer dtrace_dif_emulate()) will simply plow15791* the stored DIF offset with its own (which is the desired behavior).15792* Also, note the calls to dtrace_dif_emulate() may allocate scratch15793* from machine state; this is okay, too.15794*/15795for (; helper != NULL; helper = helper->dtha_next) {15796if ((pred = helper->dtha_predicate) != NULL) {15797if (trace)15798dtrace_helper_trace(helper, mstate, vstate, 0);1579915800if (!dtrace_dif_emulate(pred, mstate, vstate, state))15801goto next;1580215803if (*flags & CPU_DTRACE_FAULT)15804goto err;15805}1580615807for (i = 0; i < helper->dtha_nactions; i++) {15808if (trace)15809dtrace_helper_trace(helper,15810mstate, vstate, i + 1);1581115812rval = dtrace_dif_emulate(helper->dtha_actions[i],15813mstate, vstate, state);1581415815if (*flags & CPU_DTRACE_FAULT)15816goto err;15817}1581815819next:15820if (trace)15821dtrace_helper_trace(helper, mstate, vstate,15822DTRACE_HELPTRACE_NEXT);15823}1582415825if (trace)15826dtrace_helper_trace(helper, mstate, vstate,15827DTRACE_HELPTRACE_DONE);1582815829/*15830* Restore the arg0 that we saved upon entry.15831*/15832mstate->dtms_arg[0] = sarg0;15833mstate->dtms_arg[1] = sarg1;1583415835return (rval);1583615837err:15838if (trace)15839dtrace_helper_trace(helper, mstate, vstate,15840DTRACE_HELPTRACE_ERR);1584115842/*15843* Restore the arg0 that we saved upon entry.15844*/15845mstate->dtms_arg[0] = sarg0;15846mstate->dtms_arg[1] = sarg1;1584715848return (0);15849}1585015851static void15852dtrace_helper_action_destroy(dtrace_helper_action_t *helper,15853dtrace_vstate_t *vstate)15854{15855int i;1585615857if (helper->dtha_predicate != NULL)15858dtrace_difo_release(helper->dtha_predicate, vstate);1585915860for (i = 0; i < helper->dtha_nactions; i++) {15861ASSERT(helper->dtha_actions[i] != NULL);15862dtrace_difo_release(helper->dtha_actions[i], vstate);15863}1586415865kmem_free(helper->dtha_actions,15866helper->dtha_nactions * sizeof (dtrace_difo_t 
*));15867kmem_free(helper, sizeof (dtrace_helper_action_t));15868}1586915870static int15871dtrace_helper_destroygen(dtrace_helpers_t *help, int gen)15872{15873proc_t *p = curproc;15874dtrace_vstate_t *vstate;15875int i;1587615877if (help == NULL)15878help = p->p_dtrace_helpers;1587915880ASSERT(MUTEX_HELD(&dtrace_lock));1588115882if (help == NULL || gen > help->dthps_generation)15883return (EINVAL);1588415885vstate = &help->dthps_vstate;1588615887for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {15888dtrace_helper_action_t *last = NULL, *h, *next;1588915890for (h = help->dthps_actions[i]; h != NULL; h = next) {15891next = h->dtha_next;1589215893if (h->dtha_generation == gen) {15894if (last != NULL) {15895last->dtha_next = next;15896} else {15897help->dthps_actions[i] = next;15898}1589915900dtrace_helper_action_destroy(h, vstate);15901} else {15902last = h;15903}15904}15905}1590615907/*15908* Interate until we've cleared out all helper providers with the15909* given generation number.15910*/15911for (;;) {15912dtrace_helper_provider_t *prov;1591315914/*15915* Look for a helper provider with the right generation. We15916* have to start back at the beginning of the list each time15917* because we drop dtrace_lock. 
It's unlikely that we'll make15918* more than two passes.15919*/15920for (i = 0; i < help->dthps_nprovs; i++) {15921prov = help->dthps_provs[i];1592215923if (prov->dthp_generation == gen)15924break;15925}1592615927/*15928* If there were no matches, we're done.15929*/15930if (i == help->dthps_nprovs)15931break;1593215933/*15934* Move the last helper provider into this slot.15935*/15936help->dthps_nprovs--;15937help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];15938help->dthps_provs[help->dthps_nprovs] = NULL;1593915940mutex_exit(&dtrace_lock);1594115942/*15943* If we have a meta provider, remove this helper provider.15944*/15945mutex_enter(&dtrace_meta_lock);15946if (dtrace_meta_pid != NULL) {15947ASSERT(dtrace_deferred_pid == NULL);15948dtrace_helper_provider_remove(&prov->dthp_prov,15949p->p_pid);15950}15951mutex_exit(&dtrace_meta_lock);1595215953dtrace_helper_provider_destroy(prov);1595415955mutex_enter(&dtrace_lock);15956}1595715958return (0);15959}1596015961static int15962dtrace_helper_validate(dtrace_helper_action_t *helper)15963{15964int err = 0, i;15965dtrace_difo_t *dp;1596615967if ((dp = helper->dtha_predicate) != NULL)15968err += dtrace_difo_validate_helper(dp);1596915970for (i = 0; i < helper->dtha_nactions; i++)15971err += dtrace_difo_validate_helper(helper->dtha_actions[i]);1597215973return (err == 0);15974}1597515976static int15977dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep,15978dtrace_helpers_t *help)15979{15980dtrace_helper_action_t *helper, *last;15981dtrace_actdesc_t *act;15982dtrace_vstate_t *vstate;15983dtrace_predicate_t *pred;15984int count = 0, nactions = 0, i;1598515986if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)15987return (EINVAL);1598815989last = help->dthps_actions[which];15990vstate = &help->dthps_vstate;1599115992for (count = 0; last != NULL; last = last->dtha_next) {15993count++;15994if (last->dtha_next == NULL)15995break;15996}1599715998/*15999* If we already have dtrace_helper_actions_max helper actions 
for this16000* helper action type, we'll refuse to add a new one.16001*/16002if (count >= dtrace_helper_actions_max)16003return (ENOSPC);1600416005helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);16006helper->dtha_generation = help->dthps_generation;1600716008if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {16009ASSERT(pred->dtp_difo != NULL);16010dtrace_difo_hold(pred->dtp_difo);16011helper->dtha_predicate = pred->dtp_difo;16012}1601316014for (act = ep->dted_action; act != NULL; act = act->dtad_next) {16015if (act->dtad_kind != DTRACEACT_DIFEXPR)16016goto err;1601716018if (act->dtad_difo == NULL)16019goto err;1602016021nactions++;16022}1602316024helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *16025(helper->dtha_nactions = nactions), KM_SLEEP);1602616027for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {16028dtrace_difo_hold(act->dtad_difo);16029helper->dtha_actions[i++] = act->dtad_difo;16030}1603116032if (!dtrace_helper_validate(helper))16033goto err;1603416035if (last == NULL) {16036help->dthps_actions[which] = helper;16037} else {16038last->dtha_next = helper;16039}1604016041if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {16042dtrace_helptrace_nlocals = vstate->dtvs_nlocals;16043dtrace_helptrace_next = 0;16044}1604516046return (0);16047err:16048dtrace_helper_action_destroy(helper, vstate);16049return (EINVAL);16050}1605116052static void16053dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,16054dof_helper_t *dofhp)16055{16056ASSERT(MUTEX_NOT_HELD(&dtrace_lock));1605716058mutex_enter(&dtrace_meta_lock);16059mutex_enter(&dtrace_lock);1606016061if (!dtrace_attached() || dtrace_meta_pid == NULL) {16062/*16063* If the dtrace module is loaded but not attached, or if16064* there aren't isn't a meta provider registered to deal with16065* these provider descriptions, we need to postpone creating16066* the actual providers until later.16067*/1606816069if (help->dthps_next == NULL && 
help->dthps_prev == NULL &&16070dtrace_deferred_pid != help) {16071help->dthps_deferred = 1;16072help->dthps_pid = p->p_pid;16073help->dthps_next = dtrace_deferred_pid;16074help->dthps_prev = NULL;16075if (dtrace_deferred_pid != NULL)16076dtrace_deferred_pid->dthps_prev = help;16077dtrace_deferred_pid = help;16078}1607916080mutex_exit(&dtrace_lock);1608116082} else if (dofhp != NULL) {16083/*16084* If the dtrace module is loaded and we have a particular16085* helper provider description, pass that off to the16086* meta provider.16087*/1608816089mutex_exit(&dtrace_lock);1609016091dtrace_helper_provide(dofhp, p->p_pid);1609216093} else {16094/*16095* Otherwise, just pass all the helper provider descriptions16096* off to the meta provider.16097*/1609816099int i;16100mutex_exit(&dtrace_lock);1610116102for (i = 0; i < help->dthps_nprovs; i++) {16103dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,16104p->p_pid);16105}16106}1610716108mutex_exit(&dtrace_meta_lock);16109}1611016111static int16112dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen)16113{16114dtrace_helper_provider_t *hprov, **tmp_provs;16115uint_t tmp_maxprovs, i;1611616117ASSERT(MUTEX_HELD(&dtrace_lock));16118ASSERT(help != NULL);1611916120/*16121* If we already have dtrace_helper_providers_max helper providers,16122* we're refuse to add a new one.16123*/16124if (help->dthps_nprovs >= dtrace_helper_providers_max)16125return (ENOSPC);1612616127/*16128* Check to make sure this isn't a duplicate.16129*/16130for (i = 0; i < help->dthps_nprovs; i++) {16131if (dofhp->dofhp_addr ==16132help->dthps_provs[i]->dthp_prov.dofhp_addr)16133return (EALREADY);16134}1613516136hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);16137hprov->dthp_prov = *dofhp;16138hprov->dthp_ref = 1;16139hprov->dthp_generation = gen;1614016141/*16142* Allocate a bigger table for helper providers if it's already full.16143*/16144if (help->dthps_maxprovs == help->dthps_nprovs) {16145tmp_maxprovs 
= help->dthps_maxprovs;16146tmp_provs = help->dthps_provs;1614716148if (help->dthps_maxprovs == 0)16149help->dthps_maxprovs = 2;16150else16151help->dthps_maxprovs *= 2;16152if (help->dthps_maxprovs > dtrace_helper_providers_max)16153help->dthps_maxprovs = dtrace_helper_providers_max;1615416155ASSERT(tmp_maxprovs < help->dthps_maxprovs);1615616157help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *16158sizeof (dtrace_helper_provider_t *), KM_SLEEP);1615916160if (tmp_provs != NULL) {16161bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *16162sizeof (dtrace_helper_provider_t *));16163kmem_free(tmp_provs, tmp_maxprovs *16164sizeof (dtrace_helper_provider_t *));16165}16166}1616716168help->dthps_provs[help->dthps_nprovs] = hprov;16169help->dthps_nprovs++;1617016171return (0);16172}1617316174static void16175dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)16176{16177mutex_enter(&dtrace_lock);1617816179if (--hprov->dthp_ref == 0) {16180dof_hdr_t *dof;16181mutex_exit(&dtrace_lock);16182dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;16183dtrace_dof_destroy(dof);16184kmem_free(hprov, sizeof (dtrace_helper_provider_t));16185} else {16186mutex_exit(&dtrace_lock);16187}16188}1618916190static int16191dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)16192{16193uintptr_t daddr = (uintptr_t)dof;16194dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;16195dof_provider_t *provider;16196dof_probe_t *probe;16197uint8_t *arg;16198char *strtab, *typestr;16199dof_stridx_t typeidx;16200size_t typesz;16201uint_t nprobes, j, k;1620216203ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);1620416205if (sec->dofs_offset & (sizeof (uint_t) - 1)) {16206dtrace_dof_error(dof, "misaligned section offset");16207return (-1);16208}1620916210/*16211* The section needs to be large enough to contain the DOF provider16212* structure appropriate for the given version.16213*/16214if (sec->dofs_size <16215((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) 
?16216offsetof(dof_provider_t, dofpv_prenoffs) :16217sizeof (dof_provider_t))) {16218dtrace_dof_error(dof, "provider section too small");16219return (-1);16220}1622116222provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);16223str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);16224prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);16225arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);16226off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);1622716228if (str_sec == NULL || prb_sec == NULL ||16229arg_sec == NULL || off_sec == NULL)16230return (-1);1623116232enoff_sec = NULL;1623316234if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&16235provider->dofpv_prenoffs != DOF_SECT_NONE &&16236(enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,16237provider->dofpv_prenoffs)) == NULL)16238return (-1);1623916240strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);1624116242if (provider->dofpv_name >= str_sec->dofs_size ||16243strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {16244dtrace_dof_error(dof, "invalid provider name");16245return (-1);16246}1624716248if (prb_sec->dofs_entsize == 0 ||16249prb_sec->dofs_entsize > prb_sec->dofs_size) {16250dtrace_dof_error(dof, "invalid entry size");16251return (-1);16252}1625316254if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {16255dtrace_dof_error(dof, "misaligned entry size");16256return (-1);16257}1625816259if (off_sec->dofs_entsize != sizeof (uint32_t)) {16260dtrace_dof_error(dof, "invalid entry size");16261return (-1);16262}1626316264if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {16265dtrace_dof_error(dof, "misaligned section offset");16266return (-1);16267}1626816269if (arg_sec->dofs_entsize != sizeof (uint8_t)) {16270dtrace_dof_error(dof, "invalid entry size");16271return (-1);16272}1627316274arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);1627516276nprobes = prb_sec->dofs_size / 
prb_sec->dofs_entsize;1627716278/*16279* Take a pass through the probes to check for errors.16280*/16281for (j = 0; j < nprobes; j++) {16282probe = (dof_probe_t *)(uintptr_t)(daddr +16283prb_sec->dofs_offset + j * prb_sec->dofs_entsize);1628416285if (probe->dofpr_func >= str_sec->dofs_size) {16286dtrace_dof_error(dof, "invalid function name");16287return (-1);16288}1628916290if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {16291dtrace_dof_error(dof, "function name too long");16292/*16293* Keep going if the function name is too long.16294* Unlike provider and probe names, we cannot reasonably16295* impose restrictions on function names, since they're16296* a property of the code being instrumented. We will16297* skip this probe in dtrace_helper_provide_one().16298*/16299}1630016301if (probe->dofpr_name >= str_sec->dofs_size ||16302strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {16303dtrace_dof_error(dof, "invalid probe name");16304return (-1);16305}1630616307/*16308* The offset count must not wrap the index, and the offsets16309* must also not overflow the section's data.16310*/16311if (probe->dofpr_offidx + probe->dofpr_noffs <16312probe->dofpr_offidx ||16313(probe->dofpr_offidx + probe->dofpr_noffs) *16314off_sec->dofs_entsize > off_sec->dofs_size) {16315dtrace_dof_error(dof, "invalid probe offset");16316return (-1);16317}1631816319if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {16320/*16321* If there's no is-enabled offset section, make sure16322* there aren't any is-enabled offsets. 
Otherwise16323* perform the same checks as for probe offsets16324* (immediately above).16325*/16326if (enoff_sec == NULL) {16327if (probe->dofpr_enoffidx != 0 ||16328probe->dofpr_nenoffs != 0) {16329dtrace_dof_error(dof, "is-enabled "16330"offsets with null section");16331return (-1);16332}16333} else if (probe->dofpr_enoffidx +16334probe->dofpr_nenoffs < probe->dofpr_enoffidx ||16335(probe->dofpr_enoffidx + probe->dofpr_nenoffs) *16336enoff_sec->dofs_entsize > enoff_sec->dofs_size) {16337dtrace_dof_error(dof, "invalid is-enabled "16338"offset");16339return (-1);16340}1634116342if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {16343dtrace_dof_error(dof, "zero probe and "16344"is-enabled offsets");16345return (-1);16346}16347} else if (probe->dofpr_noffs == 0) {16348dtrace_dof_error(dof, "zero probe offsets");16349return (-1);16350}1635116352if (probe->dofpr_argidx + probe->dofpr_xargc <16353probe->dofpr_argidx ||16354(probe->dofpr_argidx + probe->dofpr_xargc) *16355arg_sec->dofs_entsize > arg_sec->dofs_size) {16356dtrace_dof_error(dof, "invalid args");16357return (-1);16358}1635916360typeidx = probe->dofpr_nargv;16361typestr = strtab + probe->dofpr_nargv;16362for (k = 0; k < probe->dofpr_nargc; k++) {16363if (typeidx >= str_sec->dofs_size) {16364dtrace_dof_error(dof, "bad "16365"native argument type");16366return (-1);16367}1636816369typesz = strlen(typestr) + 1;16370if (typesz > DTRACE_ARGTYPELEN) {16371dtrace_dof_error(dof, "native "16372"argument type too long");16373return (-1);16374}16375typeidx += typesz;16376typestr += typesz;16377}1637816379typeidx = probe->dofpr_xargv;16380typestr = strtab + probe->dofpr_xargv;16381for (k = 0; k < probe->dofpr_xargc; k++) {16382if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {16383dtrace_dof_error(dof, "bad "16384"native argument index");16385return (-1);16386}1638716388if (typeidx >= str_sec->dofs_size) {16389dtrace_dof_error(dof, "bad "16390"translated argument type");16391return (-1);16392}1639316394typesz = 
strlen(typestr) + 1;16395if (typesz > DTRACE_ARGTYPELEN) {16396dtrace_dof_error(dof, "translated argument "16397"type too long");16398return (-1);16399}1640016401typeidx += typesz;16402typestr += typesz;16403}16404}1640516406return (0);16407}1640816409static int16410dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p)16411{16412dtrace_helpers_t *help;16413dtrace_vstate_t *vstate;16414dtrace_enabling_t *enab = NULL;16415int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;16416uintptr_t daddr = (uintptr_t)dof;1641716418ASSERT(MUTEX_HELD(&dtrace_lock));1641916420if ((help = p->p_dtrace_helpers) == NULL)16421help = dtrace_helpers_create(p);1642216423vstate = &help->dthps_vstate;1642416425if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp->dofhp_addr,16426dhp->dofhp_dof, B_FALSE)) != 0) {16427dtrace_dof_destroy(dof);16428return (rv);16429}1643016431/*16432* Look for helper providers and validate their descriptions.16433*/16434for (i = 0; i < dof->dofh_secnum; i++) {16435dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +16436dof->dofh_secoff + i * dof->dofh_secsize);1643716438if (sec->dofs_type != DOF_SECT_PROVIDER)16439continue;1644016441if (dtrace_helper_provider_validate(dof, sec) != 0) {16442dtrace_enabling_destroy(enab);16443dtrace_dof_destroy(dof);16444return (-1);16445}1644616447nprovs++;16448}1644916450/*16451* Now we need to walk through the ECB descriptions in the enabling.16452*/16453for (i = 0; i < enab->dten_ndesc; i++) {16454dtrace_ecbdesc_t *ep = enab->dten_desc[i];16455dtrace_probedesc_t *desc = &ep->dted_probe;1645616457if (strcmp(desc->dtpd_provider, "dtrace") != 0)16458continue;1645916460if (strcmp(desc->dtpd_mod, "helper") != 0)16461continue;1646216463if (strcmp(desc->dtpd_func, "ustack") != 0)16464continue;1646516466if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,16467ep, help)) != 0) {16468/*16469* Adding this helper action failed -- we are now going16470* to rip out the entire generation and return 
failure.16471*/16472(void) dtrace_helper_destroygen(help,16473help->dthps_generation);16474dtrace_enabling_destroy(enab);16475dtrace_dof_destroy(dof);16476return (-1);16477}1647816479nhelpers++;16480}1648116482if (nhelpers < enab->dten_ndesc)16483dtrace_dof_error(dof, "unmatched helpers");1648416485gen = help->dthps_generation++;16486dtrace_enabling_destroy(enab);1648716488if (nprovs > 0) {16489/*16490* Now that this is in-kernel, we change the sense of the16491* members: dofhp_dof denotes the in-kernel copy of the DOF16492* and dofhp_addr denotes the address at user-level.16493*/16494dhp->dofhp_addr = dhp->dofhp_dof;16495dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;1649616497if (dtrace_helper_provider_add(dhp, help, gen) == 0) {16498mutex_exit(&dtrace_lock);16499dtrace_helper_provider_register(p, help, dhp);16500mutex_enter(&dtrace_lock);1650116502destroy = 0;16503}16504}1650516506if (destroy)16507dtrace_dof_destroy(dof);1650816509return (gen);16510}1651116512static dtrace_helpers_t *16513dtrace_helpers_create(proc_t *p)16514{16515dtrace_helpers_t *help;1651616517ASSERT(MUTEX_HELD(&dtrace_lock));16518ASSERT(p->p_dtrace_helpers == NULL);1651916520help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);16521help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *16522DTRACE_NHELPER_ACTIONS, KM_SLEEP);1652316524p->p_dtrace_helpers = help;16525dtrace_helpers++;1652616527return (help);16528}1652916530#ifdef illumos16531static16532#endif16533void16534dtrace_helpers_destroy(proc_t *p)16535{16536dtrace_helpers_t *help;16537dtrace_vstate_t *vstate;16538#ifdef illumos16539proc_t *p = curproc;16540#endif16541int i;1654216543mutex_enter(&dtrace_lock);1654416545ASSERT(p->p_dtrace_helpers != NULL);16546ASSERT(dtrace_helpers > 0);1654716548help = p->p_dtrace_helpers;16549vstate = &help->dthps_vstate;1655016551/*16552* We're now going to lose the help from this process.16553*/16554p->p_dtrace_helpers = NULL;16555dtrace_sync();1655616557/*16558* Destory the helper 
actions.16559*/16560for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {16561dtrace_helper_action_t *h, *next;1656216563for (h = help->dthps_actions[i]; h != NULL; h = next) {16564next = h->dtha_next;16565dtrace_helper_action_destroy(h, vstate);16566h = next;16567}16568}1656916570mutex_exit(&dtrace_lock);1657116572/*16573* Destroy the helper providers.16574*/16575if (help->dthps_maxprovs > 0) {16576mutex_enter(&dtrace_meta_lock);16577if (dtrace_meta_pid != NULL) {16578ASSERT(dtrace_deferred_pid == NULL);1657916580for (i = 0; i < help->dthps_nprovs; i++) {16581dtrace_helper_provider_remove(16582&help->dthps_provs[i]->dthp_prov, p->p_pid);16583}16584} else {16585mutex_enter(&dtrace_lock);16586ASSERT(help->dthps_deferred == 0 ||16587help->dthps_next != NULL ||16588help->dthps_prev != NULL ||16589help == dtrace_deferred_pid);1659016591/*16592* Remove the helper from the deferred list.16593*/16594if (help->dthps_next != NULL)16595help->dthps_next->dthps_prev = help->dthps_prev;16596if (help->dthps_prev != NULL)16597help->dthps_prev->dthps_next = help->dthps_next;16598if (dtrace_deferred_pid == help) {16599dtrace_deferred_pid = help->dthps_next;16600ASSERT(help->dthps_prev == NULL);16601}1660216603mutex_exit(&dtrace_lock);16604}1660516606mutex_exit(&dtrace_meta_lock);1660716608for (i = 0; i < help->dthps_nprovs; i++) {16609dtrace_helper_provider_destroy(help->dthps_provs[i]);16610}1661116612kmem_free(help->dthps_provs, help->dthps_maxprovs *16613sizeof (dtrace_helper_provider_t *));16614}1661516616mutex_enter(&dtrace_lock);1661716618dtrace_vstate_fini(&help->dthps_vstate);16619kmem_free(help->dthps_actions,16620sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);16621kmem_free(help, sizeof (dtrace_helpers_t));1662216623--dtrace_helpers;16624mutex_exit(&dtrace_lock);16625}1662616627#ifdef illumos16628static16629#endif16630void16631dtrace_helpers_duplicate(proc_t *from, proc_t *to)16632{16633dtrace_helpers_t *help, *newhelp;16634dtrace_helper_action_t *helper, *new, 
*last;16635dtrace_difo_t *dp;16636dtrace_vstate_t *vstate;16637int i, j, sz, hasprovs = 0;1663816639mutex_enter(&dtrace_lock);16640ASSERT(from->p_dtrace_helpers != NULL);16641ASSERT(dtrace_helpers > 0);1664216643help = from->p_dtrace_helpers;16644newhelp = dtrace_helpers_create(to);16645ASSERT(to->p_dtrace_helpers != NULL);1664616647newhelp->dthps_generation = help->dthps_generation;16648vstate = &newhelp->dthps_vstate;1664916650/*16651* Duplicate the helper actions.16652*/16653for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {16654if ((helper = help->dthps_actions[i]) == NULL)16655continue;1665616657for (last = NULL; helper != NULL; helper = helper->dtha_next) {16658new = kmem_zalloc(sizeof (dtrace_helper_action_t),16659KM_SLEEP);16660new->dtha_generation = helper->dtha_generation;1666116662if ((dp = helper->dtha_predicate) != NULL) {16663dp = dtrace_difo_duplicate(dp, vstate);16664new->dtha_predicate = dp;16665}1666616667new->dtha_nactions = helper->dtha_nactions;16668sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;16669new->dtha_actions = kmem_alloc(sz, KM_SLEEP);1667016671for (j = 0; j < new->dtha_nactions; j++) {16672dtrace_difo_t *dp = helper->dtha_actions[j];1667316674ASSERT(dp != NULL);16675dp = dtrace_difo_duplicate(dp, vstate);16676new->dtha_actions[j] = dp;16677}1667816679if (last != NULL) {16680last->dtha_next = new;16681} else {16682newhelp->dthps_actions[i] = new;16683}1668416685last = new;16686}16687}1668816689/*16690* Duplicate the helper providers and register them with the16691* DTrace framework.16692*/16693if (help->dthps_nprovs > 0) {16694newhelp->dthps_nprovs = help->dthps_nprovs;16695newhelp->dthps_maxprovs = help->dthps_nprovs;16696newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *16697sizeof (dtrace_helper_provider_t *), KM_SLEEP);16698for (i = 0; i < newhelp->dthps_nprovs; i++) {16699newhelp->dthps_provs[i] = help->dthps_provs[i];16700newhelp->dthps_provs[i]->dthp_ref++;16701}1670216703hasprovs = 
1;16704}1670516706mutex_exit(&dtrace_lock);1670716708if (hasprovs)16709dtrace_helper_provider_register(to, newhelp, NULL);16710}1671116712/*16713* DTrace Hook Functions16714*/16715static void16716dtrace_module_loaded(modctl_t *ctl)16717{16718dtrace_provider_t *prv;1671916720mutex_enter(&dtrace_provider_lock);16721#ifdef illumos16722mutex_enter(&mod_lock);16723#endif1672416725#ifdef illumos16726ASSERT(ctl->mod_busy);16727#endif1672816729/*16730* We're going to call each providers per-module provide operation16731* specifying only this module.16732*/16733for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)16734prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);1673516736#ifdef illumos16737mutex_exit(&mod_lock);16738#endif16739mutex_exit(&dtrace_provider_lock);1674016741/*16742* If we have any retained enablings, we need to match against them.16743* Enabling probes requires that cpu_lock be held, and we cannot hold16744* cpu_lock here -- it is legal for cpu_lock to be held when loading a16745* module. (In particular, this happens when loading scheduling16746* classes.) So if we have any retained enablings, we need to dispatch16747* our task queue to do the match for us.16748*/16749mutex_enter(&dtrace_lock);1675016751if (dtrace_retained == NULL) {16752mutex_exit(&dtrace_lock);16753return;16754}1675516756(void)taskq_dispatch(dtrace_taskq,16757(task_func_t *)dtrace_enabling_matchall_task, NULL, TQ_SLEEP);1675816759mutex_exit(&dtrace_lock);1676016761/*16762* And now, for a little heuristic sleaze: in general, we want to16763* match modules as soon as they load. However, we cannot guarantee16764* this, because it would lead us to the lock ordering violation16765* outlined above. The common case, of course, is that cpu_lock is16766* _not_ held -- so we delay here for a clock tick, hoping that that's16767* long enough for the task queue to do its work. 
If it's not, it's16768* not a serious problem -- it just means that the module that we16769* just loaded may not be immediately instrumentable.16770*/16771delay(1);16772}1677316774static void16775#ifdef illumos16776dtrace_module_unloaded(modctl_t *ctl)16777#else16778dtrace_module_unloaded(modctl_t *ctl, int *error)16779#endif16780{16781dtrace_probe_t template, *probe, *first, *next;16782dtrace_provider_t *prov;16783#ifndef illumos16784char modname[DTRACE_MODNAMELEN];16785size_t len;16786#endif1678716788#ifdef illumos16789template.dtpr_mod = ctl->mod_modname;16790#else16791/* Handle the fact that ctl->filename may end in ".ko". */16792strlcpy(modname, ctl->filename, sizeof(modname));16793len = strlen(ctl->filename);16794if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)16795modname[len - 3] = '\0';16796template.dtpr_mod = modname;16797#endif1679816799mutex_enter(&dtrace_provider_lock);16800#ifdef illumos16801mutex_enter(&mod_lock);16802#endif16803mutex_enter(&dtrace_lock);1680416805#ifndef illumos16806if (ctl->nenabled > 0) {16807/* Don't allow unloads if a probe is enabled. 
*/16808mutex_exit(&dtrace_provider_lock);16809mutex_exit(&dtrace_lock);16810*error = -1;16811printf(16812"kldunload: attempt to unload module that has DTrace probes enabled\n");16813return;16814}16815#endif1681616817if (dtrace_bymod == NULL) {16818/*16819* The DTrace module is loaded (obviously) but not attached;16820* we don't have any work to do.16821*/16822mutex_exit(&dtrace_provider_lock);16823#ifdef illumos16824mutex_exit(&mod_lock);16825#endif16826mutex_exit(&dtrace_lock);16827return;16828}1682916830for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);16831probe != NULL; probe = probe->dtpr_nextmod) {16832if (probe->dtpr_ecb != NULL) {16833mutex_exit(&dtrace_provider_lock);16834#ifdef illumos16835mutex_exit(&mod_lock);16836#endif16837mutex_exit(&dtrace_lock);1683816839/*16840* This shouldn't _actually_ be possible -- we're16841* unloading a module that has an enabled probe in it.16842* (It's normally up to the provider to make sure that16843* this can't happen.) However, because dtps_enable()16844* doesn't have a failure mode, there can be an16845* enable/unload race. 
Upshot: we don't want to16846* assert, but we're not going to disable the16847* probe, either.16848*/16849if (dtrace_err_verbose) {16850#ifdef illumos16851cmn_err(CE_WARN, "unloaded module '%s' had "16852"enabled probes", ctl->mod_modname);16853#else16854cmn_err(CE_WARN, "unloaded module '%s' had "16855"enabled probes", modname);16856#endif16857}1685816859return;16860}16861}1686216863probe = first;1686416865for (first = NULL; probe != NULL; probe = next) {16866ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);1686716868dtrace_probes[probe->dtpr_id - 1] = NULL;1686916870next = probe->dtpr_nextmod;16871dtrace_hash_remove(dtrace_bymod, probe);16872dtrace_hash_remove(dtrace_byfunc, probe);16873dtrace_hash_remove(dtrace_byname, probe);1687416875if (first == NULL) {16876first = probe;16877probe->dtpr_nextmod = NULL;16878} else {16879probe->dtpr_nextmod = first;16880first = probe;16881}16882}1688316884/*16885* We've removed all of the module's probes from the hash chains and16886* from the probe array. 
#ifndef illumos
/*
 * kld load-event handler: hand the newly loaded linker file to
 * dtrace_module_loaded() so that providers can create probes for it.
 */
static void
dtrace_kld_load(void *arg __unused, linker_file_t lf)
{

	dtrace_module_loaded(lf);
}

/*
 * kld unload-attempt handler: if no earlier hook has already vetoed the
 * unload (*error != 0), defer to dtrace_module_unloaded(), which will set
 * *error if the module still has enabled probes.
 */
static void
dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
{

	if (*error != 0)
		/* We already have an error, so don't do anything. */
		return;
	dtrace_module_unloaded(lf, error);
}
#endif

#ifdef illumos
/* Invoke every provider's dtps_suspend entry point. */
static void
dtrace_suspend(void)
{
	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
}

/* Invoke every provider's dtps_resume entry point. */
static void
dtrace_resume(void)
{
	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
}
#endif

/*
 * CPU configuration callback.  On CPU_CONFIG, allocate per-CPU buffers for
 * an active anonymous state whose CPU option covers the newly configured
 * CPU; CPU_UNCONFIG deliberately frees nothing.  Always returns 0.
 * Caller must hold cpu_lock; dtrace_lock is taken internally.
 */
static int
dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&dtrace_lock);

	switch (what) {
	case CPU_CONFIG: {
		dtrace_state_t *state;
		dtrace_optval_t *opt, rs, c;

		/*
		 * For now, we only allocate a new buffer for anonymous state.
		 */
		if ((state = dtrace_anon.dta_state) == NULL)
			break;

		if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
			break;

		opt = state->dts_options;
		c = opt[DTRACEOPT_CPU];

		if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
			break;

		/*
		 * Regardless of what the actual policy is, we're going to
		 * temporarily set our resize policy to be manual.  We're
		 * also going to temporarily set our CPU option to denote
		 * the newly configured CPU.
		 */
		rs = opt[DTRACEOPT_BUFRESIZE];
		opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
		opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;

		(void) dtrace_state_buffers(state);

		/* Restore the caller-visible options we temporarily changed. */
		opt[DTRACEOPT_BUFRESIZE] = rs;
		opt[DTRACEOPT_CPU] = c;

		break;
	}

	case CPU_UNCONFIG:
		/*
		 * We don't free the buffer in the CPU_UNCONFIG case.  (The
		 * buffer will be freed when the consumer exits.)
		 */
		break;

	default:
		break;
	}

	mutex_exit(&dtrace_lock);
	return (0);
}

#ifdef illumos
/* Initial-configuration shim installed as dtrace_cpu_init in attach. */
static void
dtrace_cpu_setup_initial(processorid_t cpu)
{
	(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
}
#endif

/*
 * Append the toxic address range [base, limit) to the global toxic-range
 * table.  The table is grown by doubling: a new zeroed array is allocated,
 * the old contents are bcopy'd over, and the old array is freed.  Entries
 * are append-only; nothing here removes a range.
 */
static void
dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
{
	if (dtrace_toxranges >= dtrace_toxranges_max) {
		int osize, nsize;
		dtrace_toxrange_t *range;

		osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);

		if (osize == 0) {
			/* First range ever: start the table at one entry. */
			ASSERT(dtrace_toxrange == NULL);
			ASSERT(dtrace_toxranges_max == 0);
			dtrace_toxranges_max = 1;
		} else {
			dtrace_toxranges_max <<= 1;
		}

		nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
		range = kmem_zalloc(nsize, KM_SLEEP);

		if (dtrace_toxrange != NULL) {
			ASSERT(osize != 0);
			bcopy(dtrace_toxrange, range, osize);
			kmem_free(dtrace_toxrange, osize);
		}

		dtrace_toxrange = range;
	}

	/* The slot must be virgin (kmem_zalloc guarantees zeroed memory). */
	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
	ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);

	dtrace_toxrange[dtrace_toxranges].dtt_base = base;
	dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
	dtrace_toxranges++;
}

/*
 * Synchronization barrier for getf()-using enablings; a no-op on FreeBSD
 * (the body is illumos-only).
 */
static void
dtrace_getf_barrier(void)
{
#ifdef illumos
	/*
	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
	 * that contain calls to getf(), this routine will be called on every
	 * closef() before either the underlying vnode is released or the
	 * file_t itself is freed.  By the time we are here, it is essential
	 * that the file_t can no longer be accessed from a call to getf()
	 * in probe context -- that assures that a dtrace_sync() can be used
	 * to clear out any enablings referring to the old structures.
	 */
	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
	    kcred->cr_zone->zone_dtrace_getf != 0)
		dtrace_sync();
#endif
}
/*
 * DTrace Driver Cookbook Functions
 */
#ifdef illumos
/*
 * DDI attach entry point for the dtrace pseudo-device.
 *
 * With cpu_lock, dtrace_provider_lock and dtrace_lock held, this:
 *   - initializes the soft-state table and creates the "dtrace" and
 *     "helper" minor nodes (failing with DDI_FAILURE on either);
 *   - installs the framework's kernel hooks (module load/unload, CPU
 *     setup, helper cleanup/fork, CPU-start and debugger suspend/resume);
 *   - creates the probe-ID and minor-number vmem arenas, the taskq, the
 *     per-CPU state cache, and the by-module/function/name probe hashes;
 *   - clamps dtrace_retain_max to at least 1 and discovers toxic ranges;
 *   - registers the "dtrace" pseudo provider and creates its BEGIN, END
 *     and ERROR probes;
 *   - pulls in any anonymous (boot-time) enabling via
 *     dtrace_anon_property() and, if one exists, asks providers to
 *     provide probes and matches the retained anonymous enabling --
 *     note the deliberate drop/reacquire of the locks around that step
 *     because cpu_lock cannot be held across dtrace_enabling_provide();
 *   - finally starts any anonymous state with dtrace_state_go().
 */
/*ARGSUSED*/
static int
dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	dtrace_provider_id_t id;
	dtrace_state_t *state = NULL;
	dtrace_enabling_t *enab;

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_provider_lock);
	mutex_enter(&dtrace_lock);

	if (ddi_soft_state_init(&dtrace_softstate,
	    sizeof (dtrace_state_t), 0) != 0) {
		cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
	    DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
	    ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
	    DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
		cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
		ddi_remove_minor_node(devi, NULL);
		ddi_soft_state_fini(&dtrace_softstate);
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_provider_lock);
		mutex_exit(&dtrace_lock);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	dtrace_devi = devi;

	/* Hook the framework into the rest of the kernel. */
	dtrace_modload = dtrace_module_loaded;
	dtrace_modunload = dtrace_module_unloaded;
	dtrace_cpu_init = dtrace_cpu_setup_initial;
	dtrace_helpers_cleanup = dtrace_helpers_destroy;
	dtrace_helpers_fork = dtrace_helpers_duplicate;
	dtrace_cpustart_init = dtrace_suspend;
	dtrace_cpustart_fini = dtrace_resume;
	dtrace_debugger_init = dtrace_suspend;
	dtrace_debugger_fini = dtrace_resume;

	register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);

	ASSERT(MUTEX_HELD(&cpu_lock));

	dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
	dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
	    UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
	    VM_SLEEP | VMC_IDENTIFIER);
	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
	    1, INT_MAX, 0);

	dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
	    NULL, NULL, NULL, NULL, NULL, 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
	    offsetof(dtrace_probe_t, dtpr_nextmod),
	    offsetof(dtrace_probe_t, dtpr_prevmod));

	dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
	    offsetof(dtrace_probe_t, dtpr_nextfunc),
	    offsetof(dtrace_probe_t, dtpr_prevfunc));

	dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
	    offsetof(dtrace_probe_t, dtpr_nextname),
	    offsetof(dtrace_probe_t, dtpr_prevname));

	if (dtrace_retain_max < 1) {
		cmn_err(CE_WARN, "illegal value (%zu) for dtrace_retain_max; "
		    "setting to 1", dtrace_retain_max);
		dtrace_retain_max = 1;
	}

	/*
	 * Now discover our toxic ranges.
	 */
	dtrace_toxic_ranges(dtrace_toxrange_add);

	/*
	 * Before we register ourselves as a provider to our own framework,
	 * we would like to assert that dtrace_provider is NULL -- but that's
	 * not true if we were loaded as a dependency of a DTrace provider.
	 * Once we've registered, we can assert that dtrace_provider is our
	 * pseudo provider.
	 */
	(void) dtrace_register("dtrace", &dtrace_provider_attr,
	    DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);

	ASSERT(dtrace_provider != NULL);
	ASSERT((dtrace_provider_id_t)dtrace_provider == id);

	dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
	    dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
	dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
	    dtrace_provider, NULL, NULL, "END", 0, NULL);
	dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);

	dtrace_anon_property();
	mutex_exit(&cpu_lock);

	/*
	 * If there are already providers, we must ask them to provide their
	 * probes, and then match any anonymous enabling against them.  Note
	 * that there should be no other retained enablings at this time:
	 * the only retained enablings at this time should be the anonymous
	 * enabling.
	 */
	if (dtrace_anon.dta_enabling != NULL) {
		ASSERT(dtrace_retained == dtrace_anon.dta_enabling);

		dtrace_enabling_provide(NULL);
		state = dtrace_anon.dta_state;

		/*
		 * We couldn't hold cpu_lock across the above call to
		 * dtrace_enabling_provide(), but we must hold it to actually
		 * enable the probes.  We have to drop all of our locks, pick
		 * up cpu_lock, and regain our locks before matching the
		 * retained anonymous enabling.
		 */
		mutex_exit(&dtrace_lock);
		mutex_exit(&dtrace_provider_lock);

		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&dtrace_lock);

		if ((enab = dtrace_anon.dta_enabling) != NULL)
			(void) dtrace_enabling_match(enab, NULL);

		mutex_exit(&cpu_lock);
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&dtrace_provider_lock);

	if (state != NULL) {
		/*
		 * If we created any anonymous state, set it going now.
		 */
		(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
	}

	return (DDI_SUCCESS);
}
#endif	/* illumos */
#ifndef illumos
static void dtrace_dtr(void *);
#endif

/*
 * Open entry point for the dtrace device.  On illumos the "helper" minor
 * opens trivially and the clone-parent "dtrace" minor is rejected; on
 * FreeBSD the caller's credential is checked for any DTRACE_PRIV_* bit.
 * On success a new consumer state is created (and, on FreeBSD, attached
 * to the cdev's per-open private data with dtrace_dtr as its destructor),
 * and dtrace_opens is bumped under dtrace_lock.
 */
/*ARGSUSED*/
static int
#ifdef illumos
dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
#else
dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
#endif
{
	dtrace_state_t *state;
	uint32_t priv;
	uid_t uid;
	zoneid_t zoneid;

#ifdef illumos
	if (getminor(*devp) == DTRACEMNRN_HELPER)
		return (0);

	/*
	 * If this wasn't an open with the "helper" minor, then it must be
	 * the "dtrace" minor.
	 */
	if (getminor(*devp) == DTRACEMNRN_DTRACE)
		return (ENXIO);
#else
	cred_t *cred_p = NULL;
	cred_p = dev->si_cred;

	/*
	 * If no DTRACE_PRIV_* bits are set in the credential, then the
	 * caller lacks sufficient permission to do anything with DTrace.
	 */
	dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
	if (priv == DTRACE_PRIV_NONE) {
#endif

		return (EACCES);
	}

	/*
	 * Ask all providers to provide all their probes.
	 */
	mutex_enter(&dtrace_provider_lock);
	dtrace_probe_provide(NULL, NULL);
	mutex_exit(&dtrace_provider_lock);

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);
	dtrace_opens++;
	dtrace_membar_producer();

#ifdef illumos
	/*
	 * If the kernel debugger is active (that is, if the kernel debugger
	 * modified text in some way), we won't allow the open.
	 */
	if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
		dtrace_opens--;
		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_lock);
		return (EBUSY);
	}

	if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
		/*
		 * If DTrace helper tracing is enabled, we need to allocate the
		 * trace buffer and initialize the values.
		 */
		dtrace_helptrace_buffer =
		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
		dtrace_helptrace_next = 0;
		dtrace_helptrace_wrapped = 0;
		dtrace_helptrace_enable = 0;
	}

	state = dtrace_state_create(devp, cred_p);
#else
	state = dtrace_state_create(dev, NULL);
	devfs_set_cdevpriv(state, dtrace_dtr);
#endif

	mutex_exit(&cpu_lock);

	if (state == NULL) {
		/* Creation failed: undo the open count before bailing. */
#ifdef illumos
		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
		--dtrace_opens;
#endif
		mutex_exit(&dtrace_lock);
		return (EAGAIN);
	}

	mutex_exit(&dtrace_lock);

	return (0);
}

/*
 * Close entry point.  On illumos this is dtrace_close() (driver close(9E));
 * on FreeBSD it is dtrace_dtr(), the cdevpriv destructor installed by
 * dtrace_open().  Destroys any anonymous state first, then the consumer's
 * own state, tears down the helper-trace buffer if disabling was requested,
 * and decrements dtrace_opens (relinquishing the kernel debugger interface
 * on illumos when the last consumer goes away with no anonymous enabling).
 */
/*ARGSUSED*/
#ifdef illumos
static int
dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
#else
static void
dtrace_dtr(void *data)
#endif
{
#ifdef illumos
	minor_t minor = getminor(dev);
	dtrace_state_t *state;
#endif
	dtrace_helptrace_t *buf = NULL;

#ifdef illumos
	if (minor == DTRACEMNRN_HELPER)
		return (0);

	state = ddi_get_soft_state(dtrace_softstate, minor);
#else
	dtrace_state_t *state = data;
#endif

	mutex_enter(&cpu_lock);
	mutex_enter(&dtrace_lock);

#ifdef illumos
	if (state->dts_anon)
#else
	if (state != NULL && state->dts_anon)
#endif
	{
		/*
		 * There is anonymous state. Destroy that first.
		 */
		ASSERT(dtrace_anon.dta_state == NULL);
		dtrace_state_destroy(state->dts_anon);
	}

	if (dtrace_helptrace_disable) {
		/*
		 * If we have been told to disable helper tracing, set the
		 * buffer to NULL before calling into dtrace_state_destroy();
		 * we take advantage of its dtrace_sync() to know that no
		 * CPU is in probe context with enabled helper tracing
		 * after it returns.
		 */
		buf = dtrace_helptrace_buffer;
		dtrace_helptrace_buffer = NULL;
	}

#ifdef illumos
	dtrace_state_destroy(state);
#else
	if (state != NULL) {
		dtrace_state_destroy(state);
		kmem_free(state, 0);
	}
#endif
	ASSERT(dtrace_opens > 0);

#ifdef illumos
	/*
	 * Only relinquish control of the kernel debugger interface when there
	 * are no consumers and no anonymous enablings.
	 */
	if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
		(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
	--dtrace_opens;
#endif

	if (buf != NULL) {
		/* Safe to free now: dtrace_state_destroy() sync'd all CPUs. */
		kmem_free(buf, dtrace_helptrace_bufsize);
		dtrace_helptrace_disable = 0;
	}

	mutex_exit(&dtrace_lock);
	mutex_exit(&cpu_lock);

#ifdef illumos
	return (0);
#endif
}

#ifdef illumos
/*
 * ioctl handler for the "helper" minor: DTRACEHIOC_ADDDOF/ADD load helper
 * DOF (via dtrace_helper_slurp(), which assumes ownership of the dof),
 * DTRACEHIOC_REMOVE destroys a helper generation.  Anything else: ENOTTY.
 */
/*ARGSUSED*/
static int
dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
{
	int rval;
	dof_helper_t help, *dhp = NULL;

	switch (cmd) {
	case DTRACEHIOC_ADDDOF:
		if (copyin((void *)arg, &help, sizeof (help)) != 0) {
			dtrace_dof_error(NULL, "failed to copyin DOF helper");
			return (EFAULT);
		}

		dhp = &help;
		arg = (intptr_t)help.dofhp_dof;
		/*FALLTHROUGH*/

	case DTRACEHIOC_ADD: {
		dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);

		if (dof == NULL)
			return (rval);

		mutex_enter(&dtrace_lock);

		/*
		 * dtrace_helper_slurp() takes responsibility for the dof --
		 * it may free it now or it may save it and free it later.
		 */
		if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
			*rv = rval;
			rval = 0;
		} else {
			rval = EINVAL;
		}

		mutex_exit(&dtrace_lock);
		return (rval);
	}

	case DTRACEHIOC_REMOVE: {
		mutex_enter(&dtrace_lock);
		rval = dtrace_helper_destroygen(NULL, arg);
		mutex_exit(&dtrace_lock);

		return (rval);
	}

	default:
		break;
	}

	return (ENOTTY);
}
/*
 * Main ioctl handler for the dtrace device (illumos variant).  Helper-minor
 * requests are routed to dtrace_ioctl_helper(); everything else operates on
 * the consumer state for this minor (or its anonymous state, if grabbed).
 *
 * A recurring pattern below: data that must be copied out is first staged
 * in a temporary kernel buffer under dtrace_lock, the lock is dropped, and
 * only then is copyout() performed -- copyout may fault/sleep and must not
 * run with dtrace_lock held.
 */
/*ARGSUSED*/
static int
dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
	minor_t minor = getminor(dev);
	dtrace_state_t *state;
	int rval;

	if (minor == DTRACEMNRN_HELPER)
		return (dtrace_ioctl_helper(cmd, arg, rv));

	state = ddi_get_soft_state(dtrace_softstate, minor);

	if (state->dts_anon) {
		/* This consumer grabbed anonymous state; operate on it. */
		ASSERT(dtrace_anon.dta_state == NULL);
		state = state->dts_anon;
	}

	switch (cmd) {
	/* Look up a provider by name and return its privileges/attributes. */
	case DTRACEIOC_PROVIDER: {
		dtrace_providerdesc_t pvd;
		dtrace_provider_t *pvp;

		if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
			return (EFAULT);

		pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
		mutex_enter(&dtrace_provider_lock);

		for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
			if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
				break;
		}

		mutex_exit(&dtrace_provider_lock);

		if (pvp == NULL)
			return (ESRCH);

		bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
		bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));

		if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Describe an enabled probe (its non-aggregating record layout). */
	case DTRACEIOC_EPROBE: {
		dtrace_eprobedesc_t epdesc;
		dtrace_ecb_t *ecb;
		dtrace_action_t *act;
		void *buf;
		size_t size;
		uintptr_t dest;
		int nrecs;

		if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		if (ecb->dte_probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
		epdesc.dtepd_uarg = ecb->dte_uarg;
		epdesc.dtepd_size = ecb->dte_size;

		/* Count the records, skipping aggregations and tuple members. */
		nrecs = epdesc.dtepd_nrecs;
		epdesc.dtepd_nrecs = 0;
		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			epdesc.dtepd_nrecs++;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description.  We need
		 * the temporary buffer to be able to drop dtrace_lock()
		 * across the copyout(), below.
		 */
		size = sizeof (dtrace_eprobedesc_t) +
		    (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));

		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&epdesc, (void *)dest, sizeof (epdesc));
		dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);

		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
				continue;

			/* Never emit more records than the caller asked for. */
			if (nrecs-- == 0)
				break;

			bcopy(&act->dta_rec, (void *)dest,
			    sizeof (dtrace_recdesc_t));
			dest += sizeof (dtrace_recdesc_t);
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	/* Describe an aggregation (its record layout, sizes and offsets). */
	case DTRACEIOC_AGGDESC: {
		dtrace_aggdesc_t aggdesc;
		dtrace_action_t *act;
		dtrace_aggregation_t *agg;
		int nrecs;
		uint32_t offs;
		dtrace_recdesc_t *lrec;
		void *buf;
		size_t size;
		uintptr_t dest;

		if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;

		nrecs = aggdesc.dtagd_nrecs;
		aggdesc.dtagd_nrecs = 0;

		offs = agg->dtag_base;
		lrec = &agg->dtag_action.dta_rec;
		aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;

		for (act = agg->dtag_first; ; act = act->dta_next) {
			ASSERT(act->dta_intuple ||
			    DTRACEACT_ISAGG(act->dta_kind));

			/*
			 * If this action has a record size of zero, it
			 * denotes an argument to the aggregating action.
			 * Because the presence of this record doesn't (or
			 * shouldn't) affect the way the data is interpreted,
			 * we don't copy it out to save user-level the
			 * confusion of dealing with a zero-length record.
			 */
			if (act->dta_rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			aggdesc.dtagd_nrecs++;

			if (act == &agg->dtag_action)
				break;
		}

		/*
		 * Now that we have the size, we need to allocate a temporary
		 * buffer in which to store the complete description.  We need
		 * the temporary buffer to be able to drop dtrace_lock()
		 * across the copyout(), below.
		 */
		size = sizeof (dtrace_aggdesc_t) +
		    (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));

		buf = kmem_alloc(size, KM_SLEEP);
		dest = (uintptr_t)buf;

		bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
		dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);

		for (act = agg->dtag_first; ; act = act->dta_next) {
			dtrace_recdesc_t rec = act->dta_rec;

			/*
			 * See the comment in the above loop for why we pass
			 * over zero-length records.
			 */
			if (rec.dtrd_size == 0) {
				ASSERT(agg->dtag_hasarg);
				continue;
			}

			if (nrecs-- == 0)
				break;

			/* Make record offsets relative to the aggregation base. */
			rec.dtrd_offset -= offs;
			bcopy(&rec, (void *)dest, sizeof (rec));
			dest += sizeof (dtrace_recdesc_t);

			if (act == &agg->dtag_action)
				break;
		}

		mutex_exit(&dtrace_lock);

		if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
			kmem_free(buf, size);
			return (EFAULT);
		}

		kmem_free(buf, size);
		return (0);
	}

	/* Enable probes described by a DOF; NULL arg means re-match all. */
	case DTRACEIOC_ENABLE: {
		dof_hdr_t *dof;
		dtrace_enabling_t *enab = NULL;
		dtrace_vstate_t *vstate;
		int err = 0;

		*rv = 0;

		/*
		 * If a NULL argument has been passed, we take this as our
		 * cue to reevaluate our enablings.
		 */
		if (arg == NULL) {
			dtrace_enabling_matchall();

			return (0);
		}

		if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
			return (rval);

		mutex_enter(&cpu_lock);
		mutex_enter(&dtrace_lock);
		vstate = &state->dts_vstate;

		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EBUSY);
		}

		if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (EINVAL);
		}

		if ((rval = dtrace_dof_options(dof, state)) != 0) {
			dtrace_enabling_destroy(enab);
			mutex_exit(&dtrace_lock);
			mutex_exit(&cpu_lock);
			dtrace_dof_destroy(dof);
			return (rval);
		}

		/* Retain the enabling only if matching succeeded. */
		if ((err = dtrace_enabling_match(enab, rv)) == 0) {
			err = dtrace_enabling_retain(enab);
		} else {
			dtrace_enabling_destroy(enab);
		}

		mutex_exit(&cpu_lock);
		mutex_exit(&dtrace_lock);
		dtrace_dof_destroy(dof);

		return (err);
	}

	/* Replicate retained enablings that match one description as another. */
	case DTRACEIOC_REPLICATE: {
		dtrace_repldesc_t desc;
		dtrace_probedesc_t *match = &desc.dtrpd_match;
		dtrace_probedesc_t *create = &desc.dtrpd_create;
		int err;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		/* Defensively NUL-terminate all user-supplied name fields. */
		match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		mutex_enter(&dtrace_lock);
		err = dtrace_enabling_replicate(state, match, create);
		mutex_exit(&dtrace_lock);

		return (err);
	}

	/* Iterate/match probes visible to the caller's privileges. */
	case DTRACEIOC_PROBEMATCH:
	case DTRACEIOC_PROBES: {
		dtrace_probe_t *probe = NULL;
		dtrace_probedesc_t desc;
		dtrace_probekey_t pkey;
		dtrace_id_t i;
		int m = 0;
		uint32_t priv;
		uid_t uid;
		zoneid_t zoneid;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
		desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
		desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
		desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';

		/*
		 * Before we attempt to match this probe, we want to give
		 * all providers the opportunity to provide it.
		 */
		if (desc.dtpd_id == DTRACE_IDNONE) {
			mutex_enter(&dtrace_provider_lock);
			dtrace_probe_provide(&desc, NULL);
			mutex_exit(&dtrace_provider_lock);
			desc.dtpd_id++;
		}

		if (cmd == DTRACEIOC_PROBEMATCH) {
			dtrace_probekey(&desc, &pkey);
			pkey.dtpk_id = DTRACE_IDNONE;
		}

		dtrace_cred2priv(cr, &priv, &uid, &zoneid);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_PROBEMATCH) {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    (m = dtrace_match_probe(probe, &pkey,
				    priv, uid, zoneid)) != 0)
					break;
			}

			/* Negative match result indicates a bad probekey. */
			if (m < 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}

		} else {
			for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
				if ((probe = dtrace_probes[i - 1]) != NULL &&
				    dtrace_match_priv(probe, priv, uid, zoneid))
					break;
			}
		}

		if (probe == NULL) {
			mutex_exit(&dtrace_lock);
			return (ESRCH);
		}

		dtrace_probe_description(probe, &desc);
		mutex_exit(&dtrace_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Return typed argument information for a probe, if available. */
	case DTRACEIOC_PROBEARG: {
		dtrace_argdesc_t desc;
		dtrace_probe_t *probe;
		dtrace_provider_t *prov;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtargd_id == DTRACE_IDNONE)
			return (EINVAL);

		if (desc.dtargd_ndx == DTRACE_ARGNONE)
			return (EINVAL);

		mutex_enter(&dtrace_provider_lock);
		mutex_enter(&mod_lock);
		mutex_enter(&dtrace_lock);

		if (desc.dtargd_id > dtrace_nprobes) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
			mutex_exit(&dtrace_lock);
			mutex_exit(&mod_lock);
			mutex_exit(&dtrace_provider_lock);
			return (EINVAL);
		}

		/*
		 * Drop dtrace_lock before calling into the provider; the
		 * provider and mod locks remain held across dtps_getargdesc.
		 */
		mutex_exit(&dtrace_lock);

		prov = probe->dtpr_provider;

		if (prov->dtpv_pops.dtps_getargdesc == NULL) {
			/*
			 * There isn't any typed information for this probe.
			 * Set the argument number to DTRACE_ARGNONE.
			 */
			desc.dtargd_ndx = DTRACE_ARGNONE;
		} else {
			desc.dtargd_native[0] = '\0';
			desc.dtargd_xlate[0] = '\0';
			desc.dtargd_mapping = desc.dtargd_ndx;

			prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
			    probe->dtpr_id, probe->dtpr_arg, &desc);
		}

		mutex_exit(&mod_lock);
		mutex_exit(&dtrace_provider_lock);

		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Start tracing; returns the CPU chosen for the BEGIN probe. */
	case DTRACEIOC_GO: {
		processorid_t cpuid;
		rval = dtrace_state_go(state, &cpuid);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Stop tracing; returns the CPU chosen for the END probe. */
	case DTRACEIOC_STOP: {
		processorid_t cpuid;

		mutex_enter(&dtrace_lock);
		rval = dtrace_state_stop(state, &cpuid);
		mutex_exit(&dtrace_lock);

		if (rval != 0)
			return (rval);

		if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Return the DOF representing this consumer's current enablings. */
	case DTRACEIOC_DOFGET: {
		dof_hdr_t hdr, *dof;
		uint64_t len;

		if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);
		dof = dtrace_dof_create(state);
		mutex_exit(&dtrace_lock);

		/* Copy out no more than the caller's buffer can hold. */
		len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
		rval = copyout(dof, (void *)arg, len);
		dtrace_dof_destroy(dof);

		return (rval == 0 ? 0 : EFAULT);
	}

	/* Snapshot a principal or aggregation buffer for one CPU. */
	case DTRACEIOC_AGGSNAP:
	case DTRACEIOC_BUFSNAP: {
		dtrace_bufdesc_t desc;
		caddr_t cached;
		dtrace_buffer_t *buf;

		if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
			return (EFAULT);

		if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
			return (EINVAL);

		mutex_enter(&dtrace_lock);

		if (cmd == DTRACEIOC_BUFSNAP) {
			buf = &state->dts_buffer[desc.dtbd_cpu];
		} else {
			buf = &state->dts_aggbuffer[desc.dtbd_cpu];
		}

		if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
			size_t sz = buf->dtb_offset;

			/* Ring/fill buffers may only be read once stopped. */
			if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
				mutex_exit(&dtrace_lock);
				return (EBUSY);
			}

			/*
			 * If this buffer has already been consumed, we're
			 * going to indicate that there's nothing left here
			 * to consume.
			 */
			if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
				mutex_exit(&dtrace_lock);

				desc.dtbd_size = 0;
				desc.dtbd_drops = 0;
				desc.dtbd_errors = 0;
				desc.dtbd_oldest = 0;
				sz = sizeof (desc);

				if (copyout(&desc, (void *)arg, sz) != 0)
					return (EFAULT);

				return (0);
			}

			/*
			 * If this is a ring buffer that has wrapped, we want
			 * to copy the whole thing out.
			 */
			if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
				dtrace_buffer_polish(buf);
				sz = buf->dtb_size;
			}

			if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
				mutex_exit(&dtrace_lock);
				return (EFAULT);
			}

			desc.dtbd_size = sz;
			desc.dtbd_drops = buf->dtb_drops;
			desc.dtbd_errors = buf->dtb_errors;
			desc.dtbd_oldest = buf->dtb_xamot_offset;
			desc.dtbd_timestamp = dtrace_gethrtime();

			mutex_exit(&dtrace_lock);

			if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
				return (EFAULT);

			buf->dtb_flags |= DTRACEBUF_CONSUMED;

			return (0);
		}

		if (buf->dtb_tomax == NULL) {
			ASSERT(buf->dtb_xamot == NULL);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		cached = buf->dtb_tomax;
		ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));

		/* Cross-call the target CPU to switch its active buffer. */
		dtrace_xcall(desc.dtbd_cpu,
		    (dtrace_xcall_t)dtrace_buffer_switch, buf);

		state->dts_errors += buf->dtb_xamot_errors;

		/*
		 * If the buffers did not actually switch, then the cross call
		 * did not take place -- presumably because the given CPU is
		 * not in the ready set.  If this is the case, we'll return
		 * ENOENT.
		 */
		if (buf->dtb_tomax == cached) {
			ASSERT(buf->dtb_xamot != cached);
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		ASSERT(cached == buf->dtb_xamot);

		/*
		 * We have our snapshot; now copy it out.
		 */
		if (copyout(buf->dtb_xamot, desc.dtbd_data,
		    buf->dtb_xamot_offset) != 0) {
			mutex_exit(&dtrace_lock);
			return (EFAULT);
		}

		desc.dtbd_size = buf->dtb_xamot_offset;
		desc.dtbd_drops = buf->dtb_xamot_drops;
		desc.dtbd_errors = buf->dtb_xamot_errors;
		desc.dtbd_oldest = 0;
		desc.dtbd_timestamp = buf->dtb_switched;

		mutex_exit(&dtrace_lock);

		/*
		 * Finally, copy out the buffer description.
		 */
		if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Report static framework configuration (DIF version, registers). */
	case DTRACEIOC_CONF: {
		dtrace_conf_t conf;

		bzero(&conf, sizeof (conf));
		conf.dtc_difversion = DIF_VERSION;
		conf.dtc_difintregs = DIF_DIR_NREGS;
		conf.dtc_diftupregs = DIF_DTR_NREGS;
		conf.dtc_ctfmodel = CTF_MODEL_NATIVE;

		if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Report per-consumer status: drops, errors, fill state, etc. */
	case DTRACEIOC_STATUS: {
		dtrace_status_t stat;
		dtrace_dstate_t *dstate;
		int i, j;
		uint64_t nerrs;

		/*
		 * See the comment in dtrace_state_deadman() for the reason
		 * for setting dts_laststatus to INT64_MAX before setting
		 * it to the correct value.
		 */
		state->dts_laststatus = INT64_MAX;
		dtrace_membar_producer();
		state->dts_laststatus = dtrace_gethrtime();

		bzero(&stat, sizeof (stat));

		mutex_enter(&dtrace_lock);

		if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
			mutex_exit(&dtrace_lock);
			return (ENOENT);
		}

		if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
			stat.dtst_exiting = 1;

		nerrs = state->dts_errors;
		dstate = &state->dts_vstate.dtvs_dynvars;

		/* Accumulate per-CPU drop and error counters. */
		for (i = 0; i < NCPU; i++) {
			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];

			stat.dtst_dyndrops += dcpu->dtdsc_drops;
			stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
			stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;

			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
				stat.dtst_filled++;

			nerrs += state->dts_buffer[i].dtb_errors;

			for (j = 0; j < state->dts_nspeculations; j++) {
				dtrace_speculation_t *spec;
				dtrace_buffer_t *buf;

				spec = &state->dts_speculations[j];
				buf = &spec->dtsp_buffer[i];
				stat.dtst_specdrops += buf->dtb_xamot_drops;
			}
		}

		stat.dtst_specdrops_busy = state->dts_speculations_busy;
		stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
		stat.dtst_stkstroverflows = state->dts_stkstroverflows;
		stat.dtst_dblerrors = state->dts_dblerrors;
		stat.dtst_killed =
		    (state->dts_activity == DTRACE_ACTIVITY_KILLED);
		stat.dtst_errors = nerrs;

		mutex_exit(&dtrace_lock);

		if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
			return (EFAULT);

		return (0);
	}

	/* Fetch a format string by index (1-based). */
	case DTRACEIOC_FORMAT: {
		dtrace_fmtdesc_t fmt;
		char *str;
		int len;

		if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
			return (EFAULT);

		mutex_enter(&dtrace_lock);

		if (fmt.dtfd_format == 0 ||
		    fmt.dtfd_format > state->dts_nformats) {
			mutex_exit(&dtrace_lock);
			return (EINVAL);
		}

		/*
		 * Format strings are allocated contiguously and they are
		 * never freed; if a format index is less than the number
		 * of formats, we can assert that the format map is non-NULL
		 * and that the format for the specified index is non-NULL.
		 */
		ASSERT(state->dts_formats != NULL);
		str = state->dts_formats[fmt.dtfd_format - 1];
		ASSERT(str != NULL);

		len = strlen(str) + 1;

		if (len > fmt.dtfd_length) {
			/*
			 * Caller's buffer is too small: report the required
			 * length instead of the string.
			 * NOTE(review): copyout failure in this case returns
			 * EINVAL rather than EFAULT; preserved as-is.
			 */
			fmt.dtfd_length = len;

			if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		} else {
			if (copyout(str, fmt.dtfd_string, len) != 0) {
				mutex_exit(&dtrace_lock);
				return (EINVAL);
			}
		}

		mutex_exit(&dtrace_lock);
		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}
kernel debugger that DTrace is now inactive.18287*/18288(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);18289}1829018291bzero(&dtrace_anon, sizeof (dtrace_anon_t));18292unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);18293dtrace_cpu_init = NULL;18294dtrace_helpers_cleanup = NULL;18295dtrace_helpers_fork = NULL;18296dtrace_cpustart_init = NULL;18297dtrace_cpustart_fini = NULL;18298dtrace_debugger_init = NULL;18299dtrace_debugger_fini = NULL;18300dtrace_modload = NULL;18301dtrace_modunload = NULL;1830218303ASSERT(dtrace_getf == 0);18304ASSERT(dtrace_closef == NULL);1830518306mutex_exit(&cpu_lock);1830718308kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));18309dtrace_probes = NULL;18310dtrace_nprobes = 0;1831118312dtrace_hash_destroy(dtrace_bymod);18313dtrace_hash_destroy(dtrace_byfunc);18314dtrace_hash_destroy(dtrace_byname);18315dtrace_bymod = NULL;18316dtrace_byfunc = NULL;18317dtrace_byname = NULL;1831818319kmem_cache_destroy(dtrace_state_cache);18320vmem_destroy(dtrace_minor);18321vmem_destroy(dtrace_arena);1832218323if (dtrace_toxrange != NULL) {18324kmem_free(dtrace_toxrange,18325dtrace_toxranges_max * sizeof (dtrace_toxrange_t));18326dtrace_toxrange = NULL;18327dtrace_toxranges = 0;18328dtrace_toxranges_max = 0;18329}1833018331ddi_remove_minor_node(dtrace_devi, NULL);18332dtrace_devi = NULL;1833318334ddi_soft_state_fini(&dtrace_softstate);1833518336ASSERT(dtrace_vtime_references == 0);18337ASSERT(dtrace_opens == 0);18338ASSERT(dtrace_retained == NULL);1833918340mutex_exit(&dtrace_lock);18341mutex_exit(&dtrace_provider_lock);1834218343/*18344* We don't destroy the task queue until after we have dropped our18345* locks (taskq_destroy() may block on running tasks). 
To prevent18346* attempting to do work after we have effectively detached but before18347* the task queue has been destroyed, all tasks dispatched via the18348* task queue must check that DTrace is still attached before18349* performing any operation.18350*/18351taskq_destroy(dtrace_taskq);18352dtrace_taskq = NULL;1835318354return (DDI_SUCCESS);18355}18356#endif1835718358#ifdef illumos18359/*ARGSUSED*/18360static int18361dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)18362{18363int error;1836418365switch (infocmd) {18366case DDI_INFO_DEVT2DEVINFO:18367*result = (void *)dtrace_devi;18368error = DDI_SUCCESS;18369break;18370case DDI_INFO_DEVT2INSTANCE:18371*result = (void *)0;18372error = DDI_SUCCESS;18373break;18374default:18375error = DDI_FAILURE;18376}18377return (error);18378}18379#endif1838018381#ifdef illumos18382static struct cb_ops dtrace_cb_ops = {18383dtrace_open, /* open */18384dtrace_close, /* close */18385nulldev, /* strategy */18386nulldev, /* print */18387nodev, /* dump */18388nodev, /* read */18389nodev, /* write */18390dtrace_ioctl, /* ioctl */18391nodev, /* devmap */18392nodev, /* mmap */18393nodev, /* segmap */18394nochpoll, /* poll */18395ddi_prop_op, /* cb_prop_op */183960, /* streamtab */18397D_NEW | D_MP /* Driver compatibility flag */18398};1839918400static struct dev_ops dtrace_ops = {18401DEVO_REV, /* devo_rev */184020, /* refcnt */18403dtrace_info, /* get_dev_info */18404nulldev, /* identify */18405nulldev, /* probe */18406dtrace_attach, /* attach */18407dtrace_detach, /* detach */18408nodev, /* reset */18409&dtrace_cb_ops, /* driver operations */18410NULL, /* bus operations */18411nodev /* dev power */18412};1841318414static struct modldrv modldrv = {18415&mod_driverops, /* module type (this is a pseudo driver) */18416"Dynamic Tracing", /* name of module */18417&dtrace_ops, /* driver ops */18418};1841918420static struct modlinkage modlinkage = {18421MODREV_1,18422(void 
*)&modldrv,18423NULL18424};1842518426int18427_init(void)18428{18429return (mod_install(&modlinkage));18430}1843118432int18433_info(struct modinfo *modinfop)18434{18435return (mod_info(&modlinkage, modinfop));18436}1843718438int18439_fini(void)18440{18441return (mod_remove(&modlinkage));18442}18443#else1844418445static d_ioctl_t dtrace_ioctl;18446static d_ioctl_t dtrace_ioctl_helper;18447static void dtrace_load(void *);18448static int dtrace_unload(void);18449static struct cdev *dtrace_dev;18450static struct cdev *helper_dev;1845118452void dtrace_invop_init(void);18453void dtrace_invop_uninit(void);1845418455static struct cdevsw dtrace_cdevsw = {18456.d_version = D_VERSION,18457.d_ioctl = dtrace_ioctl,18458.d_open = dtrace_open,18459.d_name = "dtrace",18460};1846118462static struct cdevsw helper_cdevsw = {18463.d_version = D_VERSION,18464.d_ioctl = dtrace_ioctl_helper,18465.d_name = "helper",18466};1846718468#include <dtrace_anon.c>18469#include <dtrace_ioctl.c>18470#include <dtrace_load.c>18471#include <dtrace_modevent.c>18472#include <dtrace_sysctl.c>18473#include <dtrace_unload.c>18474#include <dtrace_vtime.c>18475#include <dtrace_hacks.c>1847618477SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);18478SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);18479SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);1848018481DEV_MODULE(dtrace, dtrace_modevent, NULL);18482MODULE_VERSION(dtrace, 1);18483MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);18484#endif184851848618487