GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kernel/cpu/aperfmperf.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <[email protected]>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
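
/*
 * Illustrative example with hypothetical numbers (not taken from any real
 * part): assume freq_base = 2000 MHz and a 4C turbo, i.e. freq_max, of
 * 3000 MHz. A tick observing delta_APERF = 1500 and delta_MPERF = 1000
 * gives BusyMHz = 1500 / 1000 * 2000 = 3000 MHz, so the invariance ratio
 * freq_curr / freq_max = 3000 / 3000 = 1, the upper clip value.
 */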

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					       arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo */

	return true;
}

#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;

		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}

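/*
 * Illustrative walk-through of the loop above (hypothetical MSR contents):
 * with num_delta_fratio == 1 and a 1C ratio of 0x20 in bits 15:8, the first
 * non-zero 3-bit delta field encountered (say 2) is subtracted once, so the
 * next iteration sets *turbo_freq to 0x20 - 2 = 0x1E and returns.
 */
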
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}

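/*
 * Illustrative example (hypothetical MSR contents): if the first 8-bit
 * group-size field of MSR_TURBO_RATIO_LIMIT1 holds 2 and the second holds 4,
 * a call with size == 4 skips the 2-core group and returns the ratio taken
 * from the second 8-bit field of MSR_TURBO_RATIO_LIMIT.
 */
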
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo */

	/* The CPU may have fewer than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo */

	return true;
}

static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}

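/*
 * Illustrative example (hypothetical ratios, not from a real SKU): with a
 * base ratio of 24 (2400 MHz) and a 4C turbo ratio of 36 (3600 MHz),
 * turbo_ratio = 36 * 1024 / 24 = 1536, i.e. freq_max is 1.5x freq_base
 * expressed in SCHED_CAPACITY_SCALE units.
 */
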
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

/* Refresh the APERF/MPERF reference snapshots after resume from suspend. */
static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable_cpuslocked(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio()) {
		guard(cpus_read_lock)();
		freq_invariance_enable();
	}
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long capacity;
	unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
		return true;
	}

	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	static_branch_enable(&arch_hybrid_cap_scale_key);

	pr_info("Hybrid CPU capacity scaling enabled\n");

	return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter. Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
	} else {
		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
	}
}
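
/*
 * Illustrative example (hypothetical hybrid platform): an E-core with
 * cap = 700 on a system whose biggest P-core has max_cap = 1024 gets
 * capacity = (700 << 10) / 1024 = 700. If that E-core's cap_freq is
 * 3000 MHz and its base_freq (the frequency at which MPERF counts) is
 * 2000 MHz, its freq_ratio becomes (3000 << 10) / 2000 = 1536.
 */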

unsigned long arch_scale_cpu_capacity(int cpu)
{
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

	return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale, freq_ratio;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
	else
		freq_ratio = arch_max_freq_ratio;

	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (!freq_scale)
		goto error;

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
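
/*
 * Worked example of the arithmetic above (hypothetical counts): with
 * acnt = 1200, mcnt = 1000 and freq_ratio = 1536, the computation is
 * (1200 << 20) / (1000 * 1536) = 1258291200 / 1536000 ~= 819, i.e. the CPU
 * ran at roughly 819/1024 = 80% of freq_max over the last tick.
 */
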
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrq(MSR_IA32_APERF, aperf);
	rdmsrq(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)

int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}

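/*
 * Illustrative example (hypothetical counts): with cpu_khz = 2400000 (a
 * 2.4 GHz base clock) and a recent sample where acnt = 1500 and mcnt = 1000,
 * the reported frequency is 2400000 * 1500 / 1000 = 3600000 kHz, i.e. the
 * CPU was effectively running at 3.6 GHz while busy.
 */
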
static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}