Path: blob/master/arch/powerpc/oprofile/cell/spu_profiler.c
/*
 * Cell Broadband Engine OProfile Support
 *
 * (C) Copyright IBM Corporation 2006
 *
 * Authors: Maynard Johnson <[email protected]>
 *          Carl Love <[email protected]>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/hrtimer.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <asm/cell-pmu.h>
#include <asm/time.h>
#include "pr_util.h"

#define SCALE_SHIFT 14

static u32 *samples;

/* spu_prof_running is a flag used to indicate if spu profiling is enabled
 * or not. It is set by the routines start_spu_profiling_cycles() and
 * start_spu_profiling_events(). The flag is cleared by the routines
 * stop_spu_profiling_cycles() and stop_spu_profiling_events(). These
 * routines are called via global_start() and global_stop() which are called in
 * op_powerpc_start() and op_powerpc_stop(). These routines are called once
 * per system as a result of the user starting/stopping oprofile. Hence, only
 * one CPU per user at a time will be changing the value of spu_prof_running.
 * In general, OProfile does not protect against multiple users trying to run
 * OProfile at a time.
 */
int spu_prof_running;
static unsigned int profiling_interval;

#define NUM_SPU_BITS_TRBUF 16
#define SPUS_PER_TB_ENTRY   4

#define SPU_PC_MASK	     0xFFFF

DEFINE_SPINLOCK(oprof_spu_smpl_arry_lck);
unsigned long oprof_spu_smpl_arry_lck_flags;

void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
{
	unsigned long ns_per_cyc;

	if (!freq_khz)
		freq_khz = ppc_proc_freq / 1000;

	/* To calculate a timeout in nanoseconds, the basic
	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
	 * To avoid floating point math, we use the scale math
	 * technique as described in linux/jiffies.h.  We use
	 * a scale factor of SCALE_SHIFT, which provides 4 decimal places
	 * of precision.  This is close enough for the purpose at hand.
	 *
	 * The value of the timeout should be small enough that the hw
	 * trace buffer will not get more than about 1/3 full for the
	 * maximum user specified (the LFSR value) hw sampling frequency.
	 * This is to ensure the trace buffer will never fill even if the
	 * kernel thread scheduling varies under a heavy system load.
	 */

	ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT) / freq_khz;
	profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT;
}
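
/*
 * Worked example of the scale math above (illustrative numbers, not taken
 * from the source): assuming a 3.2 GHz Cell (ppc_proc_freq / 1000 =
 * 3,200,000 kHz) and a hypothetical user-specified cycles_reset of 100,000:
 *
 *	ns_per_cyc         = (1,000,000 << 14) / 3,200,000 = 5120
 *	                     (0.3125 ns per cycle, scaled by 2^14)
 *	profiling_interval = (5120 * 100,000) >> 14 = 31,250 ns
 *
 * so the hrtimer driving profile_spus() below would fire roughly every
 * 31.25 microseconds, i.e. once per 100,000 CPU cycles at 3.2 GHz.
 */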

/*
 * Extract SPU PC from trace buffer entry
 */
static void spu_pc_extract(int cpu, int entry)
{
	/* the trace buffer is 128 bits */
	u64 trace_buffer[2];
	u64 spu_mask;
	int spu;

	spu_mask = SPU_PC_MASK;

	/* Each SPU PC is 16 bits; hence, four spus in each of
	 * the two 64-bit buffer entries that make up the
	 * 128-bit trace_buffer entry.  Process two 64-bit values
	 * simultaneously.
	 * trace[0] SPU PC contents are:  0 1 2 3
	 * trace[1] SPU PC contents are:  4 5 6 7
	 */

	cbe_read_trace_buffer(cpu, trace_buffer);

	for (spu = SPUS_PER_TB_ENTRY - 1; spu >= 0; spu--) {
		/* spu PC trace entry is upper 16 bits of the
		 * 18 bit SPU program counter
		 */
		samples[spu * TRACE_ARRAY_SIZE + entry]
			= (spu_mask & trace_buffer[0]) << 2;
		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
			= (spu_mask & trace_buffer[1]) << 2;

		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
	}
}
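
/*
 * Sketch of the unpacking done above, with a made-up trace value: if
 * trace_buffer[0] were 0x0011002200330044, the loop peels off 16 bits per
 * iteration, low-order halfword first (spu 3 down to spu 0):
 *
 *	samples[3 * TRACE_ARRAY_SIZE + entry] = 0x0044 << 2 = 0x0110   (SPU 3)
 *	samples[2 * TRACE_ARRAY_SIZE + entry] = 0x0033 << 2 = 0x00cc   (SPU 2)
 *	samples[1 * TRACE_ARRAY_SIZE + entry] = 0x0022 << 2 = 0x0088   (SPU 1)
 *	samples[0 * TRACE_ARRAY_SIZE + entry] = 0x0011 << 2 = 0x0044   (SPU 0)
 *
 * The << 2 puts back the two low-order bits that the 16-bit trace entry
 * drops from the 18-bit SPU program counter, so the stored sample is the
 * PC with its bottom two bits zeroed.  trace_buffer[1] is unpacked the
 * same way into SPUs 4-7 of the node.
 */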

static int cell_spu_pc_collection(int cpu)
{
	u32 trace_addr;
	int entry;

	/* process the collected SPU PC for the node */

	entry = 0;

	trace_addr = cbe_read_pm(cpu, trace_address);
	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
		/* there is data in the trace buffer to process */
		spu_pc_extract(cpu, entry);

		entry++;

		if (entry >= TRACE_ARRAY_SIZE)
			/* spu_samples is full */
			break;

		trace_addr = cbe_read_pm(cpu, trace_address);
	}

	return entry;
}

static enum hrtimer_restart profile_spus(struct hrtimer *timer)
{
	ktime_t kt;
	int cpu, node, k, num_samples, spu_num;

	if (!spu_prof_running)
		goto stop;

	for_each_online_cpu(cpu) {
		if (cbe_get_hw_thread_id(cpu))
			continue;

		node = cbe_cpu_to_node(cpu);

		/* There should only be one kernel thread at a time processing
		 * the samples.  In the very unlikely case that processing is
		 * taking so long that multiple kernel threads get started,
		 * make sure only one of them works on the samples array at a
		 * time.  The sample array must be loaded and then processed
		 * for a given cpu.  The sample array is not per cpu.
		 */
		spin_lock_irqsave(&oprof_spu_smpl_arry_lck,
				  oprof_spu_smpl_arry_lck_flags);
		num_samples = cell_spu_pc_collection(cpu);

		if (num_samples == 0) {
			spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck,
					       oprof_spu_smpl_arry_lck_flags);
			continue;
		}

		for (k = 0; k < SPUS_PER_NODE; k++) {
			spu_num = k + (node * SPUS_PER_NODE);
			spu_sync_buffer(spu_num,
					samples + (k * TRACE_ARRAY_SIZE),
					num_samples);
		}

		spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck,
				       oprof_spu_smpl_arry_lck_flags);
	}
	smp_wmb();	/* ensure spu event buffer updates are written */
			/* don't want events intermingled... */

	kt = ktime_set(0, profiling_interval);
	if (!spu_prof_running)
		goto stop;
	hrtimer_forward(timer, timer->base->get_time(), kt);
	return HRTIMER_RESTART;

 stop:
	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
	return HRTIMER_NORESTART;
}

static struct hrtimer timer;
/*
 * Entry point for SPU cycle profiling.
 * NOTE: SPU profiling is done system-wide, not per-CPU.
 *
 * cycles_reset is the count value specified by the user when
 * setting up OProfile to count SPU_CYCLES.
 */
int start_spu_profiling_cycles(unsigned int cycles_reset)
{
	ktime_t kt;

	pr_debug("timer resolution: %lu\n", TICK_NSEC);
	kt = ktime_set(0, profiling_interval);
	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_set_expires(&timer, kt);
	timer.function = profile_spus;

	/* Allocate arrays for collecting SPU PC samples */
	samples = kzalloc(SPUS_PER_NODE *
			  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);

	if (!samples)
		return -ENOMEM;

	spu_prof_running = 1;
	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);

	return 0;
}

/*
 * Entry point for SPU event profiling.
 * NOTE: SPU profiling is done system-wide, not per-CPU.
 */
void start_spu_profiling_events(void)
{
	spu_prof_running = 1;
	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
}

void stop_spu_profiling_cycles(void)
{
	spu_prof_running = 0;
	hrtimer_cancel(&timer);
	kfree(samples);
	pr_debug("SPU_PROF: stop_spu_profiling_cycles issued\n");
}

void stop_spu_profiling_events(void)
{
	spu_prof_running = 0;
}