GitHub Repository: torvalds/linux
Path: blob/master/arch/s390/kernel/hiperdispatch.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2024
 */

#define pr_fmt(fmt) "hd: " fmt

/*
 * Hiperdispatch:
 * Dynamically calculates the optimum number of high capacity COREs
 * by considering the state the system is in. When hiperdispatch decides
 * that a capacity update is necessary, it schedules a topology update.
 * During topology updates the CPU capacities are always re-adjusted.
 *
 * There are two places where CPU capacities are accessed within
 * hiperdispatch:
 * -> hiperdispatch's recurring work function reads CPU capacities to
 *    determine the high capacity CPU count.
 * -> during a topology update hiperdispatch's adjustment function
 *    updates CPU capacities.
 * These two can run on different CPUs in parallel, which can cause
 * hiperdispatch to make wrong decisions. This can potentially cause
 * some overhead by leading to extra rebuild_sched_domains() calls
 * for correction. Access to capacities within hiperdispatch has to be
 * serialized to prevent this overhead.
 *
 * Hiperdispatch decision making revolves around steal time.
 * The HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 * crosses the threshold value, hiperdispatch falls back to giving high
 * capacities only to entitled CPUs. When steal time drops below the
 * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 * of them high capacity.
 *
 * The theory behind HD_STEAL_THRESHOLD is related to SMP thread
 * performance. Comparing the throughput of:
 * - a single CORE, with N threads, running N tasks
 * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 * performance. This performance difference is roughly ~30% (it can
 * change between machine generations).
 *
 * Hiperdispatch tries to hint the scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 * therefore delaying the throughput loss caused by using SMP threads.
 */

#include <linux/cpufeature.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/kernel_stat.h>
#include <linux/kstrtox.h>
#include <linux/ktime.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <asm/hiperdispatch.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/topology.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/hiperdispatch.h>

#define HD_DELAY_FACTOR (4)
#define HD_DELAY_INTERVAL (HZ / 4)
#define HD_STEAL_THRESHOLD 10
#define HD_STEAL_AVG_WEIGHT 16
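
/*
 * Note: the capacity work below runs every HD_DELAY_INTERVAL (HZ / 4,
 * i.e. roughly 250ms). Whenever hd_enable_hiperdispatch() re-arms the
 * work it defers the first run by hd_delay_factor * HD_DELAY_INTERVAL
 * so steal values can settle after a topology change. A smoothed steal
 * percentage at or above hd_steal_threshold means only entitled COREs
 * keep high capacity.
 */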

static cpumask_t hd_vl_coremask;	/* Mask containing all vertical low COREs */
static cpumask_t hd_vmvl_cpumask;	/* Mask containing vertical medium and low CPUs */
static int hd_high_capacity_cores;	/* Current CORE count with high capacity */
static int hd_entitled_cores;		/* Total vertical high and medium CORE count */
static int hd_online_cores;		/* Current online CORE count */

static unsigned long hd_previous_steal;	/* Previous iteration's CPU steal timer total */
static unsigned long hd_high_time;	/* Total time spent while all cpus have high capacity */
static unsigned long hd_low_time;	/* Total time spent while vl cpus have low capacity */
static atomic64_t hd_adjustments;	/* Total occurrence count of hiperdispatch adjustments */

static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
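
/*
 * Switch hiperdispatch on or off. Returns 1 when the mode actually
 * changed, in which case the caller is expected to schedule a topology
 * update; machines without topology support never enable it.
 */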
static int hd_set_hiperdispatch_mode(int enable)
{
	if (!cpu_has_topology())
		enable = 0;
	if (hd_enabled == enable)
		return 0;
	hd_enabled = enable;
	return 1;
}

void hd_reset_state(void)
{
	cpumask_clear(&hd_vl_coremask);
	cpumask_clear(&hd_vmvl_cpumask);
	hd_entitled_cores = 0;
	hd_online_cores = 0;
}
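
/*
 * Called for one CPU of each online CORE during a topology update:
 * vertical high and medium COREs count toward the entitlement, and the
 * SMP siblings of vertical medium and low COREs are collected in
 * hd_vmvl_cpumask for the steal time calculation.
 */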
void hd_add_core(int cpu)
{
	const struct cpumask *siblings;
	int polarization;

	hd_online_cores++;
	polarization = smp_cpu_get_polarization(cpu);
	siblings = topology_sibling_cpumask(cpu);
	switch (polarization) {
	case POLARIZATION_VH:
		hd_entitled_cores++;
		break;
	case POLARIZATION_VM:
		hd_entitled_cores++;
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	case POLARIZATION_VL:
		cpumask_set_cpu(cpu, &hd_vl_coremask);
		cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
		break;
	}
}

/* Serialize update and read operations of debug counters. */
static DEFINE_MUTEX(hd_counter_mutex);

static void hd_update_times(void)
{
	static ktime_t prev;
	ktime_t now;

	/*
	 * Check if hiperdispatch is active; if not, set prev to 0.
	 * This way it is possible to differentiate the first update
	 * iteration after enabling hiperdispatch.
	 */
	if (hd_entitled_cores == 0 || hd_enabled == 0) {
		prev = ktime_set(0, 0);
		return;
	}
	now = ktime_get();
	if (ktime_after(prev, 0)) {
		if (hd_high_capacity_cores == hd_online_cores)
			hd_high_time += ktime_ms_delta(now, prev);
		else
			hd_low_time += ktime_ms_delta(now, prev);
	}
	prev = now;
}
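
/*
 * Apply the current decision to the vertical low COREs: entitled COREs
 * always keep high capacity, so when the last decision requested more
 * high capacity COREs than the entitlement, the difference
 * (upscaling_cores) is handed to vertical low COREs and the remaining
 * ones are set back to low capacity.
 */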
static void hd_update_capacities(void)
{
	int cpu, upscaling_cores;
	unsigned long capacity;

	upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
	capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
	hd_high_capacity_cores = hd_entitled_cores;
	for_each_cpu(cpu, &hd_vl_coremask) {
		smp_set_core_capacity(cpu, capacity);
		if (capacity != CPU_CAPACITY_HIGH)
			continue;
		hd_high_capacity_cores++;
		upscaling_cores--;
		if (upscaling_cores == 0)
			capacity = CPU_CAPACITY_LOW;
	}
}

void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}

int hd_enable_hiperdispatch(void)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	mutex_unlock(&hd_counter_mutex);
	if (hd_enabled == 0)
		return 0;
	if (hd_entitled_cores == 0)
		return 0;
	if (hd_online_cores <= hd_entitled_cores)
		return 0;
	mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
	hd_update_capacities();
	return 1;
}
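
/*
 * Exponentially weighted moving average of the steal percentage: each
 * new sample contributes 1/HD_STEAL_AVG_WEIGHT of its value, so a
 * single short steal spike cannot flip the capacity decision on its
 * own.
 */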
static unsigned long hd_steal_avg(unsigned long new)
{
	static unsigned long steal;

	steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
	return steal;
}
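
/*
 * Sum the raw CPUTIME_STEAL counters (nanoseconds) of all vertical
 * medium and low CPUs and convert the increase since the previous
 * invocation into an average per-CPU percentage of the elapsed
 * wall-clock time.
 */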
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs, steal time
	 * is 0, as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}
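
/*
 * Recurring capacity work: compare the smoothed steal percentage
 * against hd_steal_threshold and schedule a topology update whenever
 * the high capacity CORE target changes.
 */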
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If online cores are less than or equal to entitled cores,
	 * hiperdispatch does not need to make any adjustments; call a
	 * topology update to disable hiperdispatch.
	 * Normally this check is handled on topology update, but during cpu
	 * hot-unplug, topology and cpu mask updates are done in reverse
	 * order, causing hd_enable_hiperdispatch() to get stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}
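
/*
 * Handler for the s390.hiperdispatch sysctl: the value is bounced
 * through a local ctl_table entry so that proc_douintvec_minmax() can
 * range check it (0 or 1) before the mode switch is performed under
 * smp_cpu_state_mutex.
 */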
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname	= ctl->procname,
		.data		= &hiperdispatch,
		.maxlen		= sizeof(int),
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}

static const struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname	= "hiperdispatch",
		.mode		= 0644,
		.proc_handler	= hiperdispatch_ctl_handler,
	},
};
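
/*
 * The tunables below are exposed through the "hiperdispatch" attribute
 * group on the cpu subsystem root device, i.e. under
 * /sys/devices/system/cpu/hiperdispatch/.
 */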
static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}

static ssize_t hd_steal_threshold_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf,
					size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (val > 100)
		return -ERANGE;
	hd_steal_threshold = val;
	return count;
}

static DEVICE_ATTR_RW(hd_steal_threshold);

static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}

static ssize_t hd_delay_factor_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf,
				     size_t count)
{
	unsigned int val;
	int rc;

	rc = kstrtouint(buf, 0, &val);
	if (rc)
		return rc;
	if (!val)
		return -ERANGE;
	hd_delay_factor = val;
	return count;
}

static DEVICE_ATTR_RW(hd_delay_factor);

static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name  = "hiperdispatch",
	.attrs = hd_attrs,
};
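
/*
 * Debugfs counters: greedy_time_ms accumulates the time spent while
 * every CORE has high capacity, conservative_time_ms the time while
 * vertical low COREs are capped. Both are refreshed via
 * hd_update_times() under hd_counter_mutex before being read.
 */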
static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");

static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");

static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");

static void __init hd_create_debugfs_counters(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
	debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
	debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
	debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
}

static void __init hd_create_attributes(void)
{
	struct device *dev;

	dev = bus_get_dev_root(&cpu_subsys);
	if (!dev)
		return;
	if (sysfs_create_group(&dev->kobj, &hd_attr_group))
		pr_warn("Unable to create hiperdispatch attribute group\n");
	put_device(dev);
}

static int __init hd_init(void)
{
	if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
		hd_set_hiperdispatch_mode(1);
		topology_schedule_update();
	}
	if (!register_sysctl("s390", hiperdispatch_ctl_table))
		pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
	hd_create_debugfs_counters();
	hd_create_attributes();
	return 0;
}
late_initcall(hd_init);