Path: blob/master/arch/blackfin/kernel/perf_event.c
/*
 * Blackfin performance counters
 *
 * Copyright 2011 Analog Devices Inc.
 *
 * Ripped from SuperH version:
 *
 * Copyright (C) 2009 Paul Mundt
 *
 * Heavily based on the x86 and PowerPC implementations.
 *
 * x86:
 * Copyright (C) 2008 Thomas Gleixner <[email protected]>
 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2009 Jaswinder Singh Rajput
 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <[email protected]>
 * Copyright (C) 2009 Intel Corporation, <[email protected]>
 *
 * ppc:
 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
 *
 * Licensed under the GPL-2 or later.
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/perf_event.h>
#include <asm/bfin_pfmon.h>

/*
 * We have two counters, and each counter can support an event type.
 * The 'o' events are PFCNTx=1 and the 's' events are PFCNTx=0.
 *
 * 0x04 o pc invariant branches
 * 0x06 o mispredicted branches
 * 0x09 o predicted branches taken
 * 0x0B o EXCPT insn
 * 0x0C o CSYNC/SSYNC insn
 * 0x0D o Insns committed
 * 0x0E o Interrupts taken
 * 0x0F o Misaligned address exceptions
 * 0x80 o Code memory fetches stalled due to DMA
 * 0x83 o 64bit insn fetches delivered
 * 0x9A o data cache fills (bank a)
 * 0x9B o data cache fills (bank b)
 * 0x9C o data cache lines evicted (bank a)
 * 0x9D o data cache lines evicted (bank b)
 * 0x9E o data cache high priority fills
 * 0x9F o data cache low priority fills
 * 0x00 s loop 0 iterations
 * 0x01 s loop 1 iterations
 * 0x0A s CSYNC/SSYNC stalls
 * 0x10 s DAG read/after write hazards
 * 0x13 s RAW data hazards
 * 0x81 s code TAG stalls
 * 0x82 s code fill stalls
 * 0x90 s processor to memory stalls
 * 0x91 s data memory stalls not hidden by 0x90
 * 0x92 s data store buffer full stalls
 * 0x93 s data memory write buffer full stalls due to high->low priority
 * 0x95 s data memory fill buffer stalls
 * 0x96 s data TAG collision stalls
 * 0x97 s data collision stalls
 * 0x98 s data stalls
 * 0x99 s data stalls sent to processor
 */

static const int event_map[] = {
	/* use CYCLES cpu register */
	[PERF_COUNT_HW_CPU_CYCLES]          = -1,
	[PERF_COUNT_HW_INSTRUCTIONS]        = 0x0D,
	[PERF_COUNT_HW_CACHE_REFERENCES]    = -1,
	[PERF_COUNT_HW_CACHE_MISSES]        = 0x83,
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x09,
	[PERF_COUNT_HW_BRANCH_MISSES]       = 0x06,
	[PERF_COUNT_HW_BUS_CYCLES]          = -1,
};

#define C(x) PERF_COUNT_HW_CACHE_##x

static const int cache_events[PERF_COUNT_HW_CACHE_MAX]
			     [PERF_COUNT_HW_CACHE_OP_MAX]
			     [PERF_COUNT_HW_CACHE_RESULT_MAX] =
{
	[C(L1D)] = {	/* Data bank A */
		[C(OP_READ)] = {
			[C(RESULT_ACCESS)] = 0,
			[C(RESULT_MISS)  ] = 0x9A,
		},
		[C(OP_WRITE)] = {
			[C(RESULT_ACCESS)] = 0,
			[C(RESULT_MISS)  ] = 0,
		},
		[C(OP_PREFETCH)] = {
			[C(RESULT_ACCESS)] = 0,
			[C(RESULT_MISS)  ] = 0,
		},
	},

	[C(L1I)] = {
		[C(OP_READ)] = {
			[C(RESULT_ACCESS)] = 0,
			[C(RESULT_MISS)  ] = 0x83,
		},
		[C(OP_WRITE)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_PREFETCH)] = {
			[C(RESULT_ACCESS)] = 0,
			[C(RESULT_MISS)  ] = 0,
		},
	},

	[C(LL)] = {
		[C(OP_READ)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_WRITE)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_PREFETCH)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
	},

	[C(DTLB)] = {
		[C(OP_READ)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_WRITE)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_PREFETCH)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
	},

	[C(ITLB)] = {
		[C(OP_READ)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_WRITE)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_PREFETCH)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
	},

	[C(BPU)] = {
		[C(OP_READ)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_WRITE)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
		[C(OP_PREFETCH)] = {
			[C(RESULT_ACCESS)] = -1,
			[C(RESULT_MISS)  ] = -1,
		},
	},
};
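/*
 * Annotation (not in the original file): hw_perf_cache_event() below
 * unpacks attr->config as type | (op << 8) | (result << 16) and uses
 * the three fields to index this table.  For example, an L1D read miss
 * arrives as
 *
 *	config = C(L1D) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16)
 *	       = 0x10000
 *
 * and resolves to hardware event 0x9A (data cache fills, bank a).
 * A 0 entry means "not supported" (-EOPNOTSUPP); -1 means "invalid
 * combination" (-EINVAL).
 */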
const char *perf_pmu_name(void)
{
	return "bfin";
}
EXPORT_SYMBOL(perf_pmu_name);

int perf_num_counters(void)
{
	return ARRAY_SIZE(event_map);
}
EXPORT_SYMBOL(perf_num_counters);

static u64 bfin_pfmon_read(int idx)
{
	return bfin_read32(PFCNTR0 + (idx * 4));
}

static void bfin_pfmon_disable(struct hw_perf_event *hwc, int idx)
{
	bfin_write_PFCTL(bfin_read_PFCTL() & ~PFCEN(idx, PFCEN_MASK));
}

static void bfin_pfmon_enable(struct hw_perf_event *hwc, int idx)
{
	u32 val, mask;

	val = PFPWR;
	if (idx) {
		mask = ~(PFCNT1 | PFMON1 | PFCEN1 | PEMUSW1);
		/* The packed config is for event0, so shift it to event1 slots */
		val |= (hwc->config << (PFMON1_P - PFMON0_P));
		val |= (hwc->config & PFCNT0) << (PFCNT1_P - PFCNT0_P);
		bfin_write_PFCNTR1(0);
	} else {
		mask = ~(PFCNT0 | PFMON0 | PFCEN0 | PEMUSW0);
		val |= hwc->config;
		bfin_write_PFCNTR0(0);
	}

	bfin_write_PFCTL((bfin_read_PFCTL() & mask) | val);
}

static void bfin_pfmon_disable_all(void)
{
	bfin_write_PFCTL(bfin_read_PFCTL() & ~PFPWR);
}

static void bfin_pfmon_enable_all(void)
{
	bfin_write_PFCTL(bfin_read_PFCTL() | PFPWR);
}

struct cpu_hw_events {
	struct perf_event *events[MAX_HWEVENTS];
	unsigned long used_mask[BITS_TO_LONGS(MAX_HWEVENTS)];
};
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

static int hw_perf_cache_event(int config, int *evp)
{
	unsigned long type, op, result;
	int ev;

	/* unpack config */
	type = config & 0xff;
	op = (config >> 8) & 0xff;
	result = (config >> 16) & 0xff;

	if (type >= PERF_COUNT_HW_CACHE_MAX ||
	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	ev = cache_events[type][op][result];
	if (ev == 0)
		return -EOPNOTSUPP;
	if (ev == -1)
		return -EINVAL;
	*evp = ev;
	return 0;
}

static void bfin_perf_event_update(struct perf_event *event,
				   struct hw_perf_event *hwc, int idx)
{
	u64 prev_raw_count, new_raw_count;
	s64 delta;
	int shift = 0;

	/*
	 * Depending on the counter configuration, they may or may not
	 * be chained, in which case the previous counter value can be
	 * updated underneath us if the lower-half overflows.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic counter atomically.
	 *
	 * As there is no interrupt associated with the overflow events,
	 * this is the simplest approach for maintaining consistency.
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	new_raw_count = bfin_pfmon_read(idx);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (counter-)time and add that to the generic counter.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
}
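/*
 * Worked example (annotation, not in the original file): if prev_count
 * is 100 and the hardware counter now reads 250, the cmpxchg publishes
 * 250 as the new prev_count and delta = 150 is added to event->count.
 * If a concurrent update races in between, the cmpxchg fails and the
 * read is retried with the fresh prev_count, so no increment is lost
 * or double-counted.  With shift == 0, as above, the sign-extension
 * shifts are no-ops.
 */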
static void bfin_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (!(event->hw.state & PERF_HES_STOPPED)) {
		bfin_pfmon_disable(hwc, idx);
		cpuc->events[idx] = NULL;
		event->hw.state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
		bfin_perf_event_update(event, &event->hw, idx);
		event->hw.state |= PERF_HES_UPTODATE;
	}
}

static void bfin_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	cpuc->events[idx] = event;
	event->hw.state = 0;
	bfin_pfmon_enable(hwc, idx);
}

static void bfin_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	bfin_pmu_stop(event, PERF_EF_UPDATE);
	__clear_bit(event->hw.idx, cpuc->used_mask);

	perf_event_update_userpage(event);
}

static int bfin_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;
	int ret = -EAGAIN;

	perf_pmu_disable(event->pmu);

	if (__test_and_set_bit(idx, cpuc->used_mask)) {
		idx = find_first_zero_bit(cpuc->used_mask, MAX_HWEVENTS);
		if (idx == MAX_HWEVENTS)
			goto out;

		__set_bit(idx, cpuc->used_mask);
		hwc->idx = idx;
	}

	bfin_pfmon_disable(hwc, idx);

	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (flags & PERF_EF_START)
		bfin_pmu_start(event, PERF_EF_RELOAD);

	perf_event_update_userpage(event);
	ret = 0;
out:
	perf_pmu_enable(event->pmu);
	return ret;
}

static void bfin_pmu_read(struct perf_event *event)
{
	bfin_perf_event_update(event, &event->hw, event->hw.idx);
}

static int bfin_pmu_event_init(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	int config = -1;
	int ret;

	if (attr->exclude_hv || attr->exclude_idle)
		return -EPERM;

	/*
	 * All of the on-chip counters are "limited", in that they have
	 * no interrupts, and are therefore unable to do sampling without
	 * further work and timer assistance.
	 */
	if (hwc->sample_period)
		return -EINVAL;

	ret = 0;
	switch (attr->type) {
	case PERF_TYPE_RAW:
		config = PFMON(0, attr->config & PFMON_MASK) |
			PFCNT(0, !(attr->config & 0x100));
		break;
	case PERF_TYPE_HW_CACHE:
		ret = hw_perf_cache_event(attr->config, &config);
		break;
	case PERF_TYPE_HARDWARE:
		if (attr->config >= ARRAY_SIZE(event_map))
			return -EINVAL;

		config = event_map[attr->config];
		break;
	}

	if (config == -1)
		return -EINVAL;

	if (!attr->exclude_kernel)
		config |= PFCEN(0, PFCEN_ENABLE_SUPV);
	if (!attr->exclude_user)
		config |= PFCEN(0, PFCEN_ENABLE_USER);

	hwc->config |= config;

	return ret;
}
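/*
 * Usage sketch (annotation, not in the original file): PERF_TYPE_RAW
 * configs take the event number from the table at the top of this file,
 * with bit 8 of the raw config selecting an 's' (PFCNTx=0) event, per
 * the !(attr->config & 0x100) test above.  Assuming the usual perf
 * raw-event syntax, that would look like:
 *
 *	perf stat -e r0D  ...	# 'o' event 0x0D, insns committed
 *	perf stat -e r190 ...	# 's' event 0x90, processor to memory stalls
 */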
static void bfin_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i;

	for (i = 0; i < MAX_HWEVENTS; ++i) {
		event = cpuc->events[i];
		if (!event)
			continue;
		hwc = &event->hw;
		bfin_pfmon_enable(hwc, hwc->idx);
	}

	bfin_pfmon_enable_all();
}

static void bfin_pmu_disable(struct pmu *pmu)
{
	bfin_pfmon_disable_all();
}

static struct pmu pmu = {
	.pmu_enable  = bfin_pmu_enable,
	.pmu_disable = bfin_pmu_disable,
	.event_init  = bfin_pmu_event_init,
	.add         = bfin_pmu_add,
	.del         = bfin_pmu_del,
	.start       = bfin_pmu_start,
	.stop        = bfin_pmu_stop,
	.read        = bfin_pmu_read,
};

static void bfin_pmu_setup(int cpu)
{
	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);

	memset(cpuhw, 0, sizeof(struct cpu_hw_events));
}

static int __cpuinit
bfin_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		bfin_write_PFCTL(0);
		bfin_pmu_setup(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static int __init bfin_pmu_init(void)
{
	int ret;

	ret = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	if (!ret)
		perf_cpu_notifier(bfin_pmu_notifier);

	return ret;
}
early_initcall(bfin_pmu_init);
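/*
 * Usage sketch (annotation, not in the original file): once this PMU is
 * registered, the generic hardware events wired up in event_map can be
 * requested from userspace in the usual way, e.g.:
 *
 *	perf stat -e instructions,branches,branch-misses ./app
 *
 * which map to hardware events 0x0D, 0x09 and 0x06 respectively.  Events
 * mapped to -1 (cycles, cache-references, bus-cycles) are rejected by
 * bfin_pmu_event_init() with -EINVAL.
 */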