#define KMSG_COMPONENT "hd"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/cpufeature.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>
/* Default value of hd_delay_factor. */
#define HD_DELAY_FACTOR (4)
/* Base delay, in jiffies, used when (re)arming hd_capacity_work. */
#define HD_DELAY_INTERVAL (HZ / 4)
/* Default value of hd_steal_threshold (compared against a steal percentage). */
#define HD_STEAL_THRESHOLD 30
/* Weight of the running average kept by hd_steal_avg(). */
#define HD_STEAL_AVG_WEIGHT 16
/* CPUs passed to hd_add_core() whose polarization is vertical low. */
static cpumask_t hd_vl_coremask;
/* Sibling masks of all vertical medium and vertical low cores added so far. */
static cpumask_t hd_vmvl_cpumask;
/* Number of cores currently accounted as having high capacity. */
static int hd_high_capacity_cores;
/* Number of added cores with vertical high or vertical medium polarization. */
static int hd_entitled_cores;
/* Number of cores added via hd_add_core() since the last hd_reset_state(). */
static int hd_online_cores;
/* Summed CPUTIME_STEAL seen by the previous steal calculation (0 = no sample). */
static unsigned long hd_previous_steal;
/* Accumulated milliseconds with hd_high_capacity_cores == hd_online_cores. */
static unsigned long hd_high_time;
/* Accumulated milliseconds with hd_high_capacity_cores != hd_online_cores. */
static unsigned long hd_low_time;
/* Number of times the work function changed hd_high_capacity_cores. */
static atomic64_t hd_adjustments;
/* Averaged steal percentage at or above which only entitled cores stay high. */
static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
/* Multiplier applied to HD_DELAY_INTERVAL by hd_enable_hiperdispatch(). */
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
/* Non-zero while hiperdispatch mode is requested. */
static int hd_enabled;
/* Periodic work that re-evaluates how many cores should have high capacity. */
static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
/*
 * Record the requested hiperdispatch mode in hd_enabled.
 *
 * The request is forced to 0 when cpu_has_topology() reports false.
 *
 * Returns 1 if hd_enabled was changed, 0 if it already held the
 * resulting value.
 */
static int hd_set_hiperdispatch_mode(int enable)
{
	int requested = cpu_has_topology() ? enable : 0;

	if (requested == hd_enabled)
		return 0;
	hd_enabled = requested;
	return 1;
}
/*
 * Forget all cores registered through hd_add_core(): zero the core
 * counters and empty both polarization masks.
 */
void hd_reset_state(void)
{
	hd_online_cores = 0;
	hd_entitled_cores = 0;
	cpumask_clear(&hd_vmvl_cpumask);
	cpumask_clear(&hd_vl_coremask);
}
/*
 * Account one core, identified by @cpu, according to its polarization:
 *  - vertical high:   counted as entitled
 *  - vertical medium: counted as entitled, siblings added to hd_vmvl_cpumask
 *  - vertical low:    @cpu added to hd_vl_coremask, siblings added to
 *                     hd_vmvl_cpumask
 * Any other polarization only increases the online core count.
 */
void hd_add_core(int cpu)
{
	int pol = smp_cpu_get_polarization(cpu);
	const struct cpumask *threads = topology_sibling_cpumask(cpu);

	hd_online_cores++;
	if (pol == POLARIZATION_VH || pol == POLARIZATION_VM)
		hd_entitled_cores++;
	if (pol == POLARIZATION_VL)
		cpumask_set_cpu(cpu, &hd_vl_coremask);
	if (pol == POLARIZATION_VM || pol == POLARIZATION_VL)
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, threads);
}
/* Held around hd_update_times() and the debugfs reads of the time counters. */
static DEFINE_MUTEX(hd_counter_mutex);
/*
 * Add the time elapsed since the previous call to hd_high_time or
 * hd_low_time, depending on whether all online cores currently count
 * as high capacity.
 *
 * While there are no entitled cores or hiperdispatch is not enabled the
 * remembered timestamp is zeroed, so the first call after that only
 * establishes a new baseline and accounts nothing.
 */
static void hd_update_times(void)
{
	static ktime_t prev;
	unsigned long *bucket;
	ktime_t now;

	if (!hd_entitled_cores || !hd_enabled) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		bucket = &hd_low_time;
		if (hd_high_capacity_cores == hd_online_cores)
			bucket = &hd_high_time;
		*bucket += ktime_ms_delta(now, prev);
	}
	prev = now;
}
/*
 * Apply the current hd_high_capacity_cores target to the vertical low
 * cores.
 *
 * The number of vertical low cores to upscale is the target minus the
 * entitled cores. Walking hd_vl_coremask, that many cores get
 * CPU_CAPACITY_HIGH and all remaining ones CPU_CAPACITY_LOW.
 * hd_high_capacity_cores is recomputed as the entitled cores plus the
 * cores that were actually upscaled.
 */
static void hd_update_capacities(void)
{
	unsigned long capacity = CPU_CAPACITY_LOW;
	int remaining, cpu;

	remaining = hd_high_capacity_cores - hd_entitled_cores;
	if (remaining > 0)
		capacity = CPU_CAPACITY_HIGH;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity == CPU_CAPACITY_HIGH) {
			hd_high_capacity_cores++;
			/* Budget used up: the rest stays at low capacity. */
			if (--remaining == 0)
				capacity = CPU_CAPACITY_LOW;
		}
	}
}
/*
 * Stop the periodic capacity work (waiting for a running instance to
 * finish), then forget the last steal sample and account every online
 * core as high capacity.
 */
void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_previous_steal = 0;
	hd_high_capacity_cores = hd_online_cores;
}
/*
 * Bring the time counters up to date and, if hiperdispatch has work to
 * do, (re)arm the capacity work and apply the current capacity target.
 *
 * Nothing is armed when hiperdispatch is not enabled, when there are no
 * entitled cores, or when there are no online cores beyond the entitled
 * ones.
 *
 * Returns 1 if the work was armed and capacities were updated,
 * 0 otherwise.
 */
int hd_enable_hiperdispatch(void)
{
	unsigned long delay;

	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	/*
	 * hd_delay_factor can be any non-zero unsigned int. Do the
	 * multiplication in unsigned long, the type mod_delayed_work()
	 * takes, so a large factor cannot wrap around in 32-bit
	 * arithmetic and turn into a very short delay.
	 */
	delay = (unsigned long)HD_DELAY_INTERVAL * hd_delay_factor;
	mod_delayed_work(system_wq, &hd_capacity_work, delay);
	hd_update_capacities();
	return 1;
}
/*
 * Fold @new into a running average kept across calls and return the
 * updated average. The previous average is weighted with
 * HD_STEAL_AVG_WEIGHT - 1 and the new sample with 1.
 */
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;
	unsigned long weighted_sum = steal * (HD_STEAL_AVG_WEIGHT - 1) + new;

	steal = weighted_sum / HD_STEAL_AVG_WEIGHT;
	return steal;
}
/*
 * Compute the per-CPU steal percentage of the CPUs in hd_vmvl_cpumask
 * since the previous call.
 *
 * The CPUTIME_STEAL counters of those CPUs are summed and the growth of
 * that sum is related to the elapsed time in nanoseconds (presumably
 * the steal counters use the same unit -- confirm against kcpustat).
 *
 * Returns 0 if the mask is empty, if there is no previous sample
 * (hd_previous_steal == 0) or if the summed steal did not grow.
 */
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long elapsed_ns, total_steal = 0, result = 0;
	static ktime_t prev;
	int nr_cpus = 0;
	ktime_t now;
	int cpu;

	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		total_steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		nr_cpus++;
	}
	if (!nr_cpus)
		return result;
	now = ktime_get();
	elapsed_ns = ktime_to_ns(ktime_sub(now, prev));
	if (hd_previous_steal != 0 && total_steal > hd_previous_steal)
		result = (total_steal - hd_previous_steal) * 100 / elapsed_ns / nr_cpus;
	/* Remember this sample as the baseline for the next call. */
	hd_previous_steal = total_steal;
	prev = now;
	return result;
}
/*
 * Periodic work: choose how many cores should have high capacity based
 * on the averaged steal percentage.
 *
 * Below hd_steal_threshold all online cores are targeted, otherwise only
 * the entitled ones. If the target differs from the current count, the
 * new target is stored, the adjustment counter is incremented and a
 * topology update is scheduled. The work then re-queues itself after
 * HD_DELAY_INTERVAL.
 *
 * If there are no online cores beyond the entitled ones, only a topology
 * update is scheduled and the work is not re-queued.
 */
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long avg_steal, target;

	mutex_lock(&smp_cpu_state_mutex);
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	avg_steal = hd_steal_avg(hd_calculate_steal_percentage());
	target = avg_steal < hd_steal_threshold ? hd_online_cores : hd_entitled_cores;
	if (hd_high_capacity_cores != target) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, target);
		hd_high_capacity_cores = target;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(avg_steal, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}
/*
 * sysctl handler for the "hiperdispatch" entry.
 *
 * Reads report the current hd_enabled value. Writes accept 0 or 1; if
 * the written value changes the mode, a topology update is scheduled
 * under smp_cpu_state_mutex.
 *
 * Returns the result of proc_douintvec_minmax() for reads and for failed
 * writes, 0 for successful writes.
 */
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int requested = hd_enabled;
	/* Temporary table entry so parsing works on a local copy. */
	struct ctl_table tmp = {
		.procname	= ctl->procname,
		.data		= &requested,
		.maxlen		= sizeof(int),
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};
	int ret;

	ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (ret < 0)
		return ret;
	if (!write)
		return ret;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(requested))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}
/* sysctl table registered under "s390" by hd_init(). */
static const struct ctl_table hiperdispatch_ctl_table[] = {
{
.procname = "hiperdispatch",
.mode = 0644,
.proc_handler = hiperdispatch_ctl_handler,
},
};
/* sysfs show: emit the current steal threshold followed by a newline. */
static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	unsigned int threshold = hd_steal_threshold;

	return sysfs_emit(buf, "%u\n", threshold);
}
/*
 * sysfs store: parse an unsigned integer and use it as the new steal
 * threshold.
 *
 * Returns @count on success, the kstrtouint() error if parsing fails,
 * or -ERANGE if the value is greater than 100.
 */
static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int threshold;
	int err = kstrtouint(buf, 0, &threshold);

	if (err)
		return err;
	if (threshold > 100)
		return -ERANGE;
	hd_steal_threshold = threshold;
	return count;
}
static DEVICE_ATTR_RW(hd_steal_threshold);
/* sysfs show: emit the current delay factor followed by a newline. */
static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	unsigned int factor = hd_delay_factor;

	return sysfs_emit(buf, "%u\n", factor);
}
/*
 * sysfs store: parse an unsigned integer and use it as the new delay
 * factor.
 *
 * Returns @count on success, the kstrtouint() error if parsing fails,
 * or -ERANGE if the value is 0.
 */
static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int factor;
	int err = kstrtouint(buf, 0, &factor);

	if (err)
		return err;
	if (factor == 0)
		return -ERANGE;
	hd_delay_factor = factor;
	return count;
}
static DEVICE_ATTR_RW(hd_delay_factor);
/* NULL-terminated list of the tunables exposed through hd_attr_group. */
static struct attribute *hd_attrs[] = {
&dev_attr_hd_steal_threshold.attr,
&dev_attr_hd_delay_factor.attr,
NULL,
};
/* Attribute group named "hiperdispatch"; created by hd_create_attributes(). */
static const struct attribute_group hd_attr_group = {
.name = "hiperdispatch",
.attrs = hd_attrs,
};
/*
 * debugfs getter: bring the time counters up to date and report
 * hd_high_time through @val. Always returns 0.
 */
static int hd_greedy_time_get(void *unused, u64 *val)
{
	u64 total;

	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	total = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	*val = total;
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");
/*
 * debugfs getter: bring the time counters up to date and report
 * hd_low_time through @val. Always returns 0.
 */
static int hd_conservative_time_get(void *unused, u64 *val)
{
	u64 total;

	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	total = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	*val = total;
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");
/*
 * debugfs getter: report the current value of the hd_adjustments
 * counter through @val. Always returns 0.
 */
static int hd_adjustment_count_get(void *unused, u64 *val)
{
	u64 count = atomic64_read(&hd_adjustments);

	*val = count;
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");
/*
 * Create the "hiperdispatch" debugfs directory below arch_debugfs_dir
 * with three owner-readable counter files.
 */
static void __init hd_create_debugfs_counters(void)
{
	struct dentry *hd_dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);

	debugfs_create_file("conservative_time_ms", 0400, hd_dir, NULL,
			    &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, hd_dir, NULL,
			    &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, hd_dir, NULL,
			    &hd_adjustments_fops);
}
/*
 * Attach hd_attr_group to the root device of the cpu subsystem.
 * Does nothing if there is no root device; a failure to create the
 * group is only reported with a warning.
 */
static void __init hd_create_attributes(void)
{
	struct device *cpu_root = bus_get_dev_root(&cpu_subsys);
	int rc;

	if (!cpu_root)
		return;
	rc = sysfs_create_group(&cpu_root->kobj, &hd_attr_group);
	if (rc)
		pr_warn("Unable to create hiperdispatch attribute group\n");
	/* Drop the reference taken by bus_get_dev_root(). */
	put_device(cpu_root);
}
/*
 * Late initcall: switch hiperdispatch on if CONFIG_HIPERDISPATCH_ON is
 * set, then register the sysctl entry, the debugfs counters and the
 * sysfs attributes. A sysctl registration failure is only reported with
 * a warning. Always returns 0.
 */
static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}

	if (register_sysctl("s390", hiperdispatch_ctl_table) == NULL)
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");

	hd_create_debugfs_counters();
	hd_create_attributes();

	return 0;
}
late_initcall(hd_init);