GitHub Repository: torvalds/linux
Path: blob/master/arch/s390/kernel/hiperdispatch.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define pr_fmt(fmt) "hd: " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacities only to entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can
 * change between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * therefore delaying the throughput loss caused by using SMP threads.
 */

#include <linux/cpufeature.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR (4)
#define HD_DELAY_INTERVAL (HZ / 4)
#define HD_STEAL_THRESHOLD 10
#define HD_STEAL_AVG_WEIGHT 16
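
/*
 * Note: the capacity work below runs every HD_DELAY_INTERVAL (HZ / 4,
 * i.e. roughly 250ms). Whenever hd_enable_hiperdispatch() re-arms the
 * work it defers the first run by hd_delay_factor * HD_DELAY_INTERVAL
 * so steal values can settle after a topology change. A smoothed steal
 * percentage at or above hd_steal_threshold means only entitled COREs
 * keep high capacity.
 */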

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
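
/*
 * Switch hiperdispatch on or off. Returns 1 when the mode actually
 * changed, in which case the caller is expected to schedule a topology
 * update; machines without topology support never enable it.
 */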
static int hd_set_hiperdispatch_mode(int enable)
{
	if (!cpu_has_topology())
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}
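
/*
 * Called for one CPU of each online CORE during a topology update:
 * vertical high and medium COREs count toward the entitlement, and the
 * SMP siblings of vertical medium and low COREs are collected in
 * hd_vmvl_cpumask for the steal time calculation.
 */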
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This way it is possible to differentiate the first update
	 * iteration after enabling hiperdispatch.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}
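
/*
 * Apply the current decision to the vertical low COREs: entitled COREs
 * always keep high capacity, so when the last decision requested more
 * high capacity COREs than the entitlement, the difference
 * (upscaling_cores) is handed to vertical low COREs and the remaining
 * ones are set back to low capacity.
 */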
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}
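
/*
 * Exponentially weighted moving average of the steal percentage: each
 * new sample contributes 1/HD_STEAL_AVG_WEIGHT of its value, so a
 * single short steal spike cannot flip the capacity decision on its
 * own.
 */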
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}
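
/*
 * Sum the raw CPUTIME_STEAL counters (nanoseconds) of all vertical
 * medium and low CPUs and convert the increase since the previous
 * invocation into an average per-CPU percentage of the elapsed
 * wall-clock time.
 */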
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}
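
/*
 * Recurring capacity work: compare the smoothed steal percentage
 * against hd_steal_threshold and schedule a topology update whenever
 * the high capacity CORE target changes.
 */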
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If online cores are less than or equal to entitled cores,
	 * hiperdispatch does not need to make any adjustments; call a
	 * topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during cpu
	 * hot-unplug, topology and cpu mask updates are done in reverse
	 * order, causing hd_enable_hiperdispatch() to get stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}
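
/*
 * Handler for the s390.hiperdispatch sysctl: the value is bounced
 * through a local ctl_table entry so that proc_douintvec_minmax() can
 * range check it (0 or 1) before the mode switch is performed under
 * smp_cpu_state_mutex.
 */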
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname	= ctl->procname,
		.data		= &hiperdispatch,
		.maxlen		= sizeof(int),
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static const struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname	= "hiperdispatch",
		.mode		= 0644,
		.proc_handler	= hiperdispatch_ctl_handler,
	},
};
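
/*
 * The tunables below are exposed through the "hiperdispatch" attribute
 * group on the cpu subsystem root device, i.e. under
 * /sys/devices/system/cpu/hiperdispatch/.
 */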
static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name  = "hiperdispatch",
	.attrs = hd_attrs,
};
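
/*
 * Debugfs counters: greedy_time_ms accumulates the time spent while
 * every CORE has high capacity, conservative_time_ms the time while
 * vertical low COREs are capped. Both are refreshed via
 * hd_update_times() under hd_counter_mutex before being read.
 */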
static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);