// Copyright 2024 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Source: GitHub repository google/crosvm, path devices/src/virtcpufreq_v2.rs (main branch).

use std::fs::File;
use std::path::PathBuf;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use base::sched_attr;
use base::sched_setattr;
use base::set_cpu_affinity;
use base::warn;
use base::Error;
use base::Event;
use base::EventToken;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use sync::Mutex;
use vm_control::DeviceId;
use vm_control::PlatformDeviceId;

use crate::BusAccessInfo;
use crate::BusDevice;
use crate::Suspendable;

/// Util scale factor (in percent) used when the host cpufreq governor is not `schedutil`.
/// It is also the divisor applied after multiplying the requested util by the scale factor.
const CPUFREQ_GOV_SCALE_FACTOR_DEFAULT: u32 = 100;
/// Util scale factor (in percent) used when the host cpufreq governor is `schedutil`.
// NOTE(review): presumably compensates for the headroom schedutil adds on top of the
// requested utilization - confirm against the host kernel's governor.
const CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL: u32 = 80;

// Flag bits placed in `sched_attr::sched_flags` when calling `sched_setattr`.
const SCHED_FLAG_RESET_ON_FORK: u64 = 0x1;
const SCHED_FLAG_KEEP_POLICY: u64 = 0x08;
const SCHED_FLAG_KEEP_PARAMS: u64 = 0x10;
const SCHED_FLAG_UTIL_CLAMP_MIN: u64 = 0x20;
const SCHED_FLAG_UTIL_CLAMP_MAX: u64 = 0x40;

// Register offsets of the device, matched against the bus access offset.
/// Read: current performance of the vCPU.
const VCPUFREQ_CUR_PERF: u32 = 0x0;
/// Write: performance requested by the vCPU.
const VCPUFREQ_SET_PERF: u32 = 0x4;
/// Read: number of entries in the vCPU frequency table.
const VCPUFREQ_FREQTBL_LEN: u32 = 0x8;
/// Write: index of the frequency table entry returned by `VCPUFREQ_FREQTBL_RD`.
const VCPUFREQ_FREQTBL_SEL: u32 = 0xc;
/// Read: frequency table entry at the selected index (0 when the index is out of range).
const VCPUFREQ_FREQTBL_RD: u32 = 0x10;
/// Read: performance domain of the vCPU.
const VCPUFREQ_PERF_DOMAIN: u32 = 0x14;

/// Keep both the current scheduling policy and its parameters; only the util clamps change.
const SCHED_FLAG_KEEP_ALL: u64 = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS;
/// Largest utilization value used for the scheduler util clamps.
const SCHED_CAPACITY_SCALE: u32 = 1024;

// Timer values in microseconds
/// Throttle duration sent with every throttle request.
const MIN_TIMER_US: u32 = 75;
/// Added to `MIN_TIMER_US` when computing the period of the throttle timer.
const TIMER_OVERHEAD_US: u32 = 15;

/// Upstream linux compatible version of the virtual cpufreq interface
57
pub struct VirtCpufreqV2 {
58
vcpu_freq_table: Vec<u32>,
59
pcpu_fmax: u32,
60
pcpu_capacity: u32,
61
pcpu: u32,
62
util_factor: u32,
63
freqtbl_sel: u32,
64
vcpu_domain: u32,
65
domain_uclamp_min: Option<File>,
66
domain_uclamp_max: Option<File>,
67
vcpu_fmax: u32,
68
vcpu_capacity: u32,
69
vcpu_relative_capacity: u32,
70
worker: Option<WorkerThread<()>>,
71
timer: Arc<Mutex<Timer>>,
72
vm_ctrl: Arc<Mutex<Tube>>,
73
pcpu_min_cap: u32,
74
/// The largest(or the last) pCPU index to be used by all the vCPUs. This index is used to
75
/// figure out the proper placement of the throttle workers which are placed on pCPUs right
76
/// after the last pCPU being used the vCPUs. Throttle workers require their own exclusive
77
/// pCPU allocation and this ensure that the workers are placed contiguously and makes it
78
/// easier for user to manage pCPU allocations when running multiple instances on a large
79
/// server.
80
largest_pcpu_idx: usize,
81
//TODO: Put the shared_domain_members in a struct
82
shared_domain_vcpus: Vec<usize>,
83
shared_domain_perf: Arc<AtomicU32>,
84
}
85
86
fn get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error> {
87
let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
88
std::fs::read_to_string(path)?
89
.trim()
90
.parse()
91
.map_err(|_| Error::new(libc::EINVAL))
92
}
93
94
fn get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error> {
95
let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
96
std::fs::read_to_string(path).map_err(|_| Error::new(libc::EINVAL))
97
}
98
99
fn get_cpu_capacity(cpu_id: u32) -> Result<u32, Error> {
100
get_cpu_info(cpu_id, "cpu_capacity")
101
}
102
103
fn get_cpu_maxfreq_khz(cpu_id: u32) -> Result<u32, Error> {
104
get_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
105
}
106
107
fn get_cpu_minfreq_khz(cpu_id: u32) -> Result<u32, Error> {
108
get_cpu_info(cpu_id, "cpufreq/cpuinfo_min_freq")
109
}
110
111
fn get_cpu_curfreq_khz(cpu_id: u32) -> Result<u32, Error> {
112
get_cpu_info(cpu_id, "cpufreq/scaling_cur_freq")
113
}
114
115
fn get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error> {
116
let gov = get_cpu_info_str(cpu_id, "cpufreq/scaling_governor")?;
117
match gov.trim() {
118
"schedutil" => Ok(CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL),
119
_ => Ok(CPUFREQ_GOV_SCALE_FACTOR_DEFAULT),
120
}
121
}
122
123
impl VirtCpufreqV2 {
124
pub fn new(
125
pcpu: u32,
126
vcpu_freq_table: Vec<u32>,
127
vcpu_domain_path: Option<PathBuf>,
128
vcpu_domain: u32,
129
vcpu_capacity: u32,
130
largest_pcpu_idx: usize,
131
vm_ctrl: Arc<Mutex<Tube>>,
132
shared_domain_vcpus: Vec<usize>,
133
shared_domain_perf: Arc<AtomicU32>,
134
) -> Self {
135
let pcpu_capacity = get_cpu_capacity(pcpu).expect("Error reading capacity");
136
let pcpu_fmax = get_cpu_maxfreq_khz(pcpu).expect("Error reading max freq");
137
let util_factor = get_cpu_util_factor(pcpu).expect("Error getting util factor");
138
let freqtbl_sel = 0;
139
let mut domain_uclamp_min = None;
140
let mut domain_uclamp_max = None;
141
// The vcpu_capacity passed in is normalized for frequency, reverse the normalization to
142
// get the performance per clock ratio between the vCPU and the pCPU its running on. This
143
// "relative capacity" is an approximation of the delta in IPC (Instructions per Cycle)
144
// between the pCPU vs vCPU running a usecase containing a mix of instruction types.
145
let vcpu_fmax = vcpu_freq_table.clone().into_iter().max().unwrap();
146
let vcpu_relative_capacity =
147
u32::try_from(u64::from(vcpu_capacity) * u64::from(pcpu_fmax) / u64::from(vcpu_fmax))
148
.unwrap();
149
let pcpu_min_cap =
150
get_cpu_minfreq_khz(pcpu).expect("Error reading min freq") * pcpu_capacity / pcpu_fmax;
151
152
if let Some(cgroup_path) = &vcpu_domain_path {
153
domain_uclamp_min = Some(
154
File::create(cgroup_path.join("cpu.uclamp.min")).unwrap_or_else(|err| {
155
panic!(
156
"Err: {}, Unable to open: {}",
157
err,
158
cgroup_path.join("cpu.uclamp.min").display()
159
)
160
}),
161
);
162
domain_uclamp_max = Some(
163
File::create(cgroup_path.join("cpu.uclamp.max")).unwrap_or_else(|err| {
164
panic!(
165
"Err: {}, Unable to open: {}",
166
err,
167
cgroup_path.join("cpu.uclamp.max").display()
168
)
169
}),
170
);
171
}
172
173
VirtCpufreqV2 {
174
vcpu_freq_table,
175
pcpu_fmax,
176
pcpu_capacity,
177
pcpu,
178
util_factor,
179
freqtbl_sel,
180
vcpu_domain,
181
domain_uclamp_min,
182
domain_uclamp_max,
183
vcpu_fmax,
184
vcpu_capacity,
185
vcpu_relative_capacity,
186
worker: None,
187
timer: Arc::new(Mutex::new(Timer::new().expect("failed to create Timer"))),
188
vm_ctrl,
189
pcpu_min_cap,
190
largest_pcpu_idx,
191
shared_domain_vcpus,
192
shared_domain_perf,
193
}
194
}
195
}
196
197
impl BusDevice for VirtCpufreqV2 {
198
fn device_id(&self) -> DeviceId {
199
PlatformDeviceId::VirtCpufreq.into()
200
}
201
202
fn debug_label(&self) -> String {
203
"VirtCpufreq Device".to_owned()
204
}
205
206
fn read(&mut self, info: BusAccessInfo, data: &mut [u8]) {
207
if data.len() != std::mem::size_of::<u32>() {
208
warn!(
209
"{}: unsupported read length {}, only support 4bytes read",
210
self.debug_label(),
211
data.len()
212
);
213
return;
214
}
215
216
let val = match info.offset as u32 {
217
VCPUFREQ_CUR_PERF => {
218
let shared_util = self.shared_domain_perf.load(Ordering::SeqCst);
219
if shared_util != 0 && shared_util < self.pcpu_min_cap {
220
shared_util * self.vcpu_fmax / self.vcpu_capacity
221
} else {
222
match get_cpu_curfreq_khz(self.pcpu) {
223
Ok(freq) => u32::try_from(
224
u64::from(freq) * u64::from(self.pcpu_capacity)
225
/ u64::from(self.vcpu_relative_capacity),
226
)
227
.unwrap(),
228
Err(_) => 0,
229
}
230
}
231
}
232
VCPUFREQ_FREQTBL_LEN => self.vcpu_freq_table.len() as u32,
233
VCPUFREQ_PERF_DOMAIN => self.vcpu_domain,
234
VCPUFREQ_FREQTBL_RD => *self
235
.vcpu_freq_table
236
.get(self.freqtbl_sel as usize)
237
.unwrap_or(&0),
238
_ => {
239
warn!("{}: unsupported read address {}", self.debug_label(), info);
240
return;
241
}
242
};
243
244
let val_arr = val.to_ne_bytes();
245
data.copy_from_slice(&val_arr);
246
}
247
248
fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
249
let val: u32 = match data.try_into().map(u32::from_ne_bytes) {
250
Ok(v) => v,
251
Err(e) => {
252
warn!(
253
"{}: unsupported write length {:#}, only support 4bytes write",
254
self.debug_label(),
255
e
256
);
257
return;
258
}
259
};
260
261
match info.offset as u32 {
262
VCPUFREQ_SET_PERF => {
263
// Util margin depends on the cpufreq governor on the host
264
let util_raw = match u32::try_from(
265
u64::from(self.vcpu_capacity) * u64::from(val) / u64::from(self.vcpu_fmax),
266
) {
267
Ok(util) => util,
268
Err(e) => {
269
warn!("Potential overflow {:#}", e);
270
SCHED_CAPACITY_SCALE
271
}
272
};
273
274
let util = util_raw * self.util_factor / CPUFREQ_GOV_SCALE_FACTOR_DEFAULT;
275
276
if let (Some(domain_uclamp_min), Some(domain_uclamp_max)) =
277
(&mut self.domain_uclamp_min, &mut self.domain_uclamp_max)
278
{
279
use std::io::Write;
280
let val = util as f32 * 100.0 / SCHED_CAPACITY_SCALE as f32;
281
let val_formatted = format!("{val:4}").into_bytes();
282
283
if self.vcpu_fmax != self.pcpu_fmax {
284
if let Err(e) = domain_uclamp_max.write(&val_formatted) {
285
warn!("Error setting uclamp_max: {:#}", e);
286
}
287
}
288
if let Err(e) = domain_uclamp_min.write(&val_formatted) {
289
warn!("Error setting uclamp_min: {:#}", e);
290
}
291
} else {
292
let mut sched_attr = sched_attr {
293
sched_flags: SCHED_FLAG_KEEP_ALL
294
| SCHED_FLAG_UTIL_CLAMP_MIN
295
| SCHED_FLAG_UTIL_CLAMP_MAX
296
| SCHED_FLAG_RESET_ON_FORK,
297
sched_util_min: util,
298
..Default::default()
299
};
300
301
if self.vcpu_fmax != self.pcpu_fmax {
302
sched_attr.sched_util_max = util;
303
} else {
304
sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
305
}
306
307
if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
308
panic!("{}: Error setting util value: {:#}", self.debug_label(), e);
309
}
310
}
311
312
// Return early if vcpu_fmax matches pcpu_fmax as that denotes no vCPU throttling
313
// is required.
314
if self.vcpu_fmax == self.pcpu_fmax {
315
return;
316
}
317
318
self.shared_domain_perf.store(util_raw, Ordering::SeqCst);
319
let timer = self.timer.clone();
320
if self.worker.is_none() {
321
let vcpu_id = info.id;
322
let vm_ctrl = self.vm_ctrl.clone();
323
let worker_cpu_affinity = self.largest_pcpu_idx + self.vcpu_domain as usize + 1;
324
let shared_domain_vcpus = self.shared_domain_vcpus.clone();
325
326
self.worker = Some(WorkerThread::start(
327
format!("vcpu_throttle{vcpu_id}"),
328
move |kill_evt| {
329
vcpufreq_worker_thread(
330
shared_domain_vcpus,
331
kill_evt,
332
timer,
333
vm_ctrl,
334
worker_cpu_affinity,
335
)
336
.expect("error running vpucfreq_worker")
337
},
338
));
339
} else if util_raw < self.pcpu_min_cap {
340
// The period is porportional to the performance requested by the vCPU, we
341
// reduce the timeout period to increase the amount of throttling applied to
342
// the vCPU as the performance decreases. Ex. If vCPU requests half of the
343
// performance relatively to its pCPU@FMin, the vCPU will spend 50% of its
344
// cycles being throttled to increase time for the same workload that otherwise
345
// would've taken 1/2 of the time if ran at pCPU@FMin. We could've
346
// alternatively adjusted the workload and used some fixed period (such as
347
// 250us), but there's a floor for the minimum delay we add (cost of handling
348
// the userspace exit) and limits the range of performance we can emulate.
349
let timeout_period = (MIN_TIMER_US + TIMER_OVERHEAD_US) as f32
350
/ (1.0 - (util_raw as f32 / self.pcpu_min_cap as f32));
351
let _ = timer
352
.lock()
353
.reset_repeating(Duration::from_micros(timeout_period as u64));
354
} else {
355
let _ = timer.lock().clear();
356
}
357
}
358
VCPUFREQ_FREQTBL_SEL => self.freqtbl_sel = val,
359
_ => {
360
warn!("{}: unsupported read address {}", self.debug_label(), info);
361
}
362
}
363
}
364
}
365
366
pub fn vcpufreq_worker_thread(
367
shared_domain_vcpus: Vec<usize>,
368
kill_evt: Event,
369
timer: Arc<Mutex<Timer>>,
370
vm_ctrl: Arc<Mutex<Tube>>,
371
cpu_affinity: usize,
372
) -> anyhow::Result<()> {
373
#[derive(EventToken)]
374
enum Token {
375
// The timer expired.
376
TimerExpire,
377
// The parent thread requested an exit.
378
Kill,
379
}
380
381
let wait_ctx = WaitContext::build_with(&[
382
(&*timer.lock(), Token::TimerExpire),
383
(&kill_evt, Token::Kill),
384
])
385
.context("Failed to create wait_ctx")?;
386
387
// The vcpufreq thread has strict scheduling requirements, let's affine it away from the vCPU
388
// threads and clamp its util to high value.
389
let cpu_set: Vec<usize> = vec![cpu_affinity];
390
set_cpu_affinity(cpu_set)?;
391
392
let mut sched_attr = sched_attr {
393
sched_flags: SCHED_FLAG_KEEP_ALL
394
| SCHED_FLAG_UTIL_CLAMP_MIN
395
| SCHED_FLAG_UTIL_CLAMP_MAX
396
| SCHED_FLAG_RESET_ON_FORK,
397
sched_util_min: SCHED_CAPACITY_SCALE,
398
sched_util_max: SCHED_CAPACITY_SCALE,
399
..Default::default()
400
};
401
if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
402
warn!("Error setting util value: {}", e);
403
}
404
405
loop {
406
let events = wait_ctx.wait().context("Failed to wait for events")?;
407
for event in events.iter().filter(|e| e.is_readable) {
408
match event.token {
409
Token::TimerExpire => {
410
timer
411
.lock()
412
.mark_waited()
413
.context("failed to reset timer")?;
414
let vm_ctrl_unlocked = vm_ctrl.lock();
415
for vcpu_id in &shared_domain_vcpus {
416
let msg = vm_control::VmRequest::Throttle(*vcpu_id, MIN_TIMER_US);
417
vm_ctrl_unlocked
418
.send(&msg)
419
.context("failed to stall vCPUs")?;
420
}
421
}
422
Token::Kill => {
423
return Ok(());
424
}
425
}
426
}
427
}
428
}
429
430
impl Suspendable for VirtCpufreqV2 {}
431
432