CoCalc -- virtcpufreq

GitHub Repository: google/crosvm
Path: blob/main/devices/src/virtcpufreq_v2.rs
⁵³⁹² views
1
// Copyright 2024 The ChromiumOS Authors
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4

5
use std::fs::File;
6
use std::path::PathBuf;
7
use std::sync::atomic::AtomicU32;
8
use std::sync::atomic::Ordering;
9
use std::sync::Arc;
10
use std::time::Duration;
11

12
use anyhow::Context;
13
use base::sched_attr;
14
use base::sched_setattr;
15
use base::set_cpu_affinity;
16
use base::warn;
17
use base::Error;
18
use base::Event;
19
use base::EventToken;
20
use base::Timer;
21
use base::TimerTrait;
22
use base::Tube;
23
use base::WaitContext;
24
use base::WorkerThread;
25
use sync::Mutex;
26
use vm_control::DeviceId;
27
use vm_control::PlatformDeviceId;
28

29
use crate::BusAccessInfo;
30
use crate::BusDevice;
31
use crate::Suspendable;
32

33
// Percentage applied to the utilization requested by the guest before it is handed to the host
// scheduler. The value is selected from the host's cpufreq governor: `schedutil` gets the reduced
// factor, every other governor gets the default.
const CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL: u32 = 80;
const CPUFREQ_GOV_SCALE_FACTOR_DEFAULT: u32 = 100;

// Bits placed in `sched_attr.sched_flags` when calling `sched_setattr`, mirroring the same-named
// flags of the Linux scheduler ABI.
const SCHED_FLAG_RESET_ON_FORK: u64 = 0x1;
const SCHED_FLAG_KEEP_POLICY: u64 = 0x08;
const SCHED_FLAG_KEEP_PARAMS: u64 = 0x10;
const SCHED_FLAG_UTIL_CLAMP_MIN: u64 = 0x20;
const SCHED_FLAG_UTIL_CLAMP_MAX: u64 = 0x40;
// Convenience mask: keep both the current policy and the current parameters.
const SCHED_FLAG_KEEP_ALL: u64 = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS;

// Full-scale utilization value. Utilization clamps are expressed relative to this scale.
const SCHED_CAPACITY_SCALE: u32 = 1024;

// Register offsets of the device's MMIO window. All registers are accessed as 32-bit values.
// Read: current performance of the vCPU.
const VCPUFREQ_CUR_PERF: u32 = 0x0;
// Write: performance requested by the guest.
const VCPUFREQ_SET_PERF: u32 = 0x4;
// Read: number of entries in the vCPU frequency table.
const VCPUFREQ_FREQTBL_LEN: u32 = 0x8;
// Write: index of the frequency table entry to expose through `VCPUFREQ_FREQTBL_RD`.
const VCPUFREQ_FREQTBL_SEL: u32 = 0xc;
// Read: the frequency table entry currently selected (0 when the index is out of range).
const VCPUFREQ_FREQTBL_RD: u32 = 0x10;
// Read: performance domain the vCPU belongs to.
const VCPUFREQ_PERF_DOMAIN: u32 = 0x14;

// Timer values in microseconds.
// Throttle duration sent with every throttle request; also the base of the timer period.
const MIN_TIMER_US: u32 = 75;
// Added to `MIN_TIMER_US` when the throttle timer period is computed.
const TIMER_OVERHEAD_US: u32 = 15;
55

56
/// Upstream linux compatible version of the virtual cpufreq interface
57
pub struct VirtCpufreqV2 {
58
    vcpu_freq_table: Vec<u32>,
59
    pcpu_fmax: u32,
60
    pcpu_capacity: u32,
61
    pcpu: u32,
62
    util_factor: u32,
63
    freqtbl_sel: u32,
64
    vcpu_domain: u32,
65
    domain_uclamp_min: Option<File>,
66
    domain_uclamp_max: Option<File>,
67
    vcpu_fmax: u32,
68
    vcpu_capacity: u32,
69
    vcpu_relative_capacity: u32,
70
    worker: Option<WorkerThread<()>>,
71
    timer: Arc<Mutex<Timer>>,
72
    vm_ctrl: Arc<Mutex<Tube>>,
73
    pcpu_min_cap: u32,
74
    /// The largest(or the last) pCPU index to be used by all the vCPUs. This index is used to
75
    /// figure out the proper placement of the throttle workers which are placed on pCPUs right
76
    /// after the last pCPU being used the vCPUs. Throttle workers require their own exclusive
77
    /// pCPU allocation and this ensure that the workers are placed contiguously and makes it
78
    /// easier for user to manage pCPU allocations when running multiple instances on a large
79
    /// server.
80
    largest_pcpu_idx: usize,
81
    //TODO: Put the shared_domain_members in a struct
82
    shared_domain_vcpus: Vec<usize>,
83
    shared_domain_perf: Arc<AtomicU32>,
84
}
85

86
fn get_cpu_info(cpu_id: u32, property: &str) -> Result<u32, Error> {
87
    let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
88
    std::fs::read_to_string(path)?
89
        .trim()
90
        .parse()
91
        .map_err(|_| Error::new(libc::EINVAL))
92
}
93

94
fn get_cpu_info_str(cpu_id: u32, property: &str) -> Result<String, Error> {
95
    let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
96
    std::fs::read_to_string(path).map_err(|_| Error::new(libc::EINVAL))
97
}
98

99
fn get_cpu_capacity(cpu_id: u32) -> Result<u32, Error> {
100
    get_cpu_info(cpu_id, "cpu_capacity")
101
}
102

103
fn get_cpu_maxfreq_khz(cpu_id: u32) -> Result<u32, Error> {
104
    get_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
105
}
106

107
fn get_cpu_minfreq_khz(cpu_id: u32) -> Result<u32, Error> {
108
    get_cpu_info(cpu_id, "cpufreq/cpuinfo_min_freq")
109
}
110

111
fn get_cpu_curfreq_khz(cpu_id: u32) -> Result<u32, Error> {
112
    get_cpu_info(cpu_id, "cpufreq/scaling_cur_freq")
113
}
114

115
fn get_cpu_util_factor(cpu_id: u32) -> Result<u32, Error> {
116
    let gov = get_cpu_info_str(cpu_id, "cpufreq/scaling_governor")?;
117
    match gov.trim() {
118
        "schedutil" => Ok(CPUFREQ_GOV_SCALE_FACTOR_SCHEDUTIL),
119
        _ => Ok(CPUFREQ_GOV_SCALE_FACTOR_DEFAULT),
120
    }
121
}
122

123
impl VirtCpufreqV2 {
124
    pub fn new(
125
        pcpu: u32,
126
        vcpu_freq_table: Vec<u32>,
127
        vcpu_domain_path: Option<PathBuf>,
128
        vcpu_domain: u32,
129
        vcpu_capacity: u32,
130
        largest_pcpu_idx: usize,
131
        vm_ctrl: Arc<Mutex<Tube>>,
132
        shared_domain_vcpus: Vec<usize>,
133
        shared_domain_perf: Arc<AtomicU32>,
134
    ) -> Self {
135
        let pcpu_capacity = get_cpu_capacity(pcpu).expect("Error reading capacity");
136
        let pcpu_fmax = get_cpu_maxfreq_khz(pcpu).expect("Error reading max freq");
137
        let util_factor = get_cpu_util_factor(pcpu).expect("Error getting util factor");
138
        let freqtbl_sel = 0;
139
        let mut domain_uclamp_min = None;
140
        let mut domain_uclamp_max = None;
141
        // The vcpu_capacity passed in is normalized for frequency, reverse the normalization to
142
        // get the performance per clock ratio between the vCPU and the pCPU its running on. This
143
        // "relative capacity" is an approximation of the delta in IPC (Instructions per Cycle)
144
        // between the pCPU vs vCPU running a usecase containing a mix of instruction types.
145
        let vcpu_fmax = vcpu_freq_table.clone().into_iter().max().unwrap();
146
        let vcpu_relative_capacity =
147
            u32::try_from(u64::from(vcpu_capacity) * u64::from(pcpu_fmax) / u64::from(vcpu_fmax))
148
                .unwrap();
149
        let pcpu_min_cap =
150
            get_cpu_minfreq_khz(pcpu).expect("Error reading min freq") * pcpu_capacity / pcpu_fmax;
151

152
        if let Some(cgroup_path) = &vcpu_domain_path {
153
            domain_uclamp_min = Some(
154
                File::create(cgroup_path.join("cpu.uclamp.min")).unwrap_or_else(|err| {
155
                    panic!(
156
                        "Err: {}, Unable to open: {}",
157
                        err,
158
                        cgroup_path.join("cpu.uclamp.min").display()
159
                    )
160
                }),
161
            );
162
            domain_uclamp_max = Some(
163
                File::create(cgroup_path.join("cpu.uclamp.max")).unwrap_or_else(|err| {
164
                    panic!(
165
                        "Err: {}, Unable to open: {}",
166
                        err,
167
                        cgroup_path.join("cpu.uclamp.max").display()
168
                    )
169
                }),
170
            );
171
        }
172

173
        VirtCpufreqV2 {
174
            vcpu_freq_table,
175
            pcpu_fmax,
176
            pcpu_capacity,
177
            pcpu,
178
            util_factor,
179
            freqtbl_sel,
180
            vcpu_domain,
181
            domain_uclamp_min,
182
            domain_uclamp_max,
183
            vcpu_fmax,
184
            vcpu_capacity,
185
            vcpu_relative_capacity,
186
            worker: None,
187
            timer: Arc::new(Mutex::new(Timer::new().expect("failed to create Timer"))),
188
            vm_ctrl,
189
            pcpu_min_cap,
190
            largest_pcpu_idx,
191
            shared_domain_vcpus,
192
            shared_domain_perf,
193
        }
194
    }
195
}
196

197
impl BusDevice for VirtCpufreqV2 {
198
    fn device_id(&self) -> DeviceId {
199
        PlatformDeviceId::VirtCpufreq.into()
200
    }
201

202
    fn debug_label(&self) -> String {
203
        "VirtCpufreq Device".to_owned()
204
    }
205

206
    fn read(&mut self, info: BusAccessInfo, data: &mut [u8]) {
207
        if data.len() != std::mem::size_of::<u32>() {
208
            warn!(
209
                "{}: unsupported read length {}, only support 4bytes read",
210
                self.debug_label(),
211
                data.len()
212
            );
213
            return;
214
        }
215

216
        let val = match info.offset as u32 {
217
            VCPUFREQ_CUR_PERF => {
218
                let shared_util = self.shared_domain_perf.load(Ordering::SeqCst);
219
                if shared_util != 0 && shared_util < self.pcpu_min_cap {
220
                    shared_util * self.vcpu_fmax / self.vcpu_capacity
221
                } else {
222
                    match get_cpu_curfreq_khz(self.pcpu) {
223
                        Ok(freq) => u32::try_from(
224
                            u64::from(freq) * u64::from(self.pcpu_capacity)
225
                                / u64::from(self.vcpu_relative_capacity),
226
                        )
227
                        .unwrap(),
228
                        Err(_) => 0,
229
                    }
230
                }
231
            }
232
            VCPUFREQ_FREQTBL_LEN => self.vcpu_freq_table.len() as u32,
233
            VCPUFREQ_PERF_DOMAIN => self.vcpu_domain,
234
            VCPUFREQ_FREQTBL_RD => *self
235
                .vcpu_freq_table
236
                .get(self.freqtbl_sel as usize)
237
                .unwrap_or(&0),
238
            _ => {
239
                warn!("{}: unsupported read address {}", self.debug_label(), info);
240
                return;
241
            }
242
        };
243

244
        let val_arr = val.to_ne_bytes();
245
        data.copy_from_slice(&val_arr);
246
    }
247

248
    fn write(&mut self, info: BusAccessInfo, data: &[u8]) {
249
        let val: u32 = match data.try_into().map(u32::from_ne_bytes) {
250
            Ok(v) => v,
251
            Err(e) => {
252
                warn!(
253
                    "{}: unsupported write length {:#}, only support 4bytes write",
254
                    self.debug_label(),
255
                    e
256
                );
257
                return;
258
            }
259
        };
260

261
        match info.offset as u32 {
262
            VCPUFREQ_SET_PERF => {
263
                // Util margin depends on the cpufreq governor on the host
264
                let util_raw = match u32::try_from(
265
                    u64::from(self.vcpu_capacity) * u64::from(val) / u64::from(self.vcpu_fmax),
266
                ) {
267
                    Ok(util) => util,
268
                    Err(e) => {
269
                        warn!("Potential overflow {:#}", e);
270
                        SCHED_CAPACITY_SCALE
271
                    }
272
                };
273

274
                let util = util_raw * self.util_factor / CPUFREQ_GOV_SCALE_FACTOR_DEFAULT;
275

276
                if let (Some(domain_uclamp_min), Some(domain_uclamp_max)) =
277
                    (&mut self.domain_uclamp_min, &mut self.domain_uclamp_max)
278
                {
279
                    use std::io::Write;
280
                    let val = util as f32 * 100.0 / SCHED_CAPACITY_SCALE as f32;
281
                    let val_formatted = format!("{val:4}").into_bytes();
282

283
                    if self.vcpu_fmax != self.pcpu_fmax {
284
                        if let Err(e) = domain_uclamp_max.write(&val_formatted) {
285
                            warn!("Error setting uclamp_max: {:#}", e);
286
                        }
287
                    }
288
                    if let Err(e) = domain_uclamp_min.write(&val_formatted) {
289
                        warn!("Error setting uclamp_min: {:#}", e);
290
                    }
291
                } else {
292
                    let mut sched_attr = sched_attr {
293
                        sched_flags: SCHED_FLAG_KEEP_ALL
294
                            | SCHED_FLAG_UTIL_CLAMP_MIN
295
                            | SCHED_FLAG_UTIL_CLAMP_MAX
296
                            | SCHED_FLAG_RESET_ON_FORK,
297
                        sched_util_min: util,
298
                        ..Default::default()
299
                    };
300

301
                    if self.vcpu_fmax != self.pcpu_fmax {
302
                        sched_attr.sched_util_max = util;
303
                    } else {
304
                        sched_attr.sched_util_max = SCHED_CAPACITY_SCALE;
305
                    }
306

307
                    if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
308
                        panic!("{}: Error setting util value: {:#}", self.debug_label(), e);
309
                    }
310
                }
311

312
                // Return early if vcpu_fmax matches pcpu_fmax as that denotes no vCPU throttling
313
                // is required.
314
                if self.vcpu_fmax == self.pcpu_fmax {
315
                    return;
316
                }
317

318
                self.shared_domain_perf.store(util_raw, Ordering::SeqCst);
319
                let timer = self.timer.clone();
320
                if self.worker.is_none() {
321
                    let vcpu_id = info.id;
322
                    let vm_ctrl = self.vm_ctrl.clone();
323
                    let worker_cpu_affinity = self.largest_pcpu_idx + self.vcpu_domain as usize + 1;
324
                    let shared_domain_vcpus = self.shared_domain_vcpus.clone();
325

326
                    self.worker = Some(WorkerThread::start(
327
                        format!("vcpu_throttle{vcpu_id}"),
328
                        move |kill_evt| {
329
                            vcpufreq_worker_thread(
330
                                shared_domain_vcpus,
331
                                kill_evt,
332
                                timer,
333
                                vm_ctrl,
334
                                worker_cpu_affinity,
335
                            )
336
                            .expect("error running vpucfreq_worker")
337
                        },
338
                    ));
339
                } else if util_raw < self.pcpu_min_cap {
340
                    // The period is porportional to the performance requested by the vCPU, we
341
                    // reduce the timeout period to increase the amount of throttling applied to
342
                    // the vCPU as the performance decreases. Ex. If vCPU requests half of the
343
                    // performance relatively to its pCPU@FMin, the vCPU will spend 50% of its
344
                    // cycles being throttled to increase time for the same workload that otherwise
345
                    // would've taken 1/2 of the time if ran at pCPU@FMin. We could've
346
                    // alternatively adjusted the workload and used some fixed period (such as
347
                    // 250us), but there's a floor for the minimum delay we add (cost of handling
348
                    // the userspace exit) and limits the range of performance we can emulate.
349
                    let timeout_period = (MIN_TIMER_US + TIMER_OVERHEAD_US) as f32
350
                        / (1.0 - (util_raw as f32 / self.pcpu_min_cap as f32));
351
                    let _ = timer
352
                        .lock()
353
                        .reset_repeating(Duration::from_micros(timeout_period as u64));
354
                } else {
355
                    let _ = timer.lock().clear();
356
                }
357
            }
358
            VCPUFREQ_FREQTBL_SEL => self.freqtbl_sel = val,
359
            _ => {
360
                warn!("{}: unsupported read address {}", self.debug_label(), info);
361
            }
362
        }
363
    }
364
}
365

366
pub fn vcpufreq_worker_thread(
367
    shared_domain_vcpus: Vec<usize>,
368
    kill_evt: Event,
369
    timer: Arc<Mutex<Timer>>,
370
    vm_ctrl: Arc<Mutex<Tube>>,
371
    cpu_affinity: usize,
372
) -> anyhow::Result<()> {
373
    #[derive(EventToken)]
374
    enum Token {
375
        // The timer expired.
376
        TimerExpire,
377
        // The parent thread requested an exit.
378
        Kill,
379
    }
380

381
    let wait_ctx = WaitContext::build_with(&[
382
        (&*timer.lock(), Token::TimerExpire),
383
        (&kill_evt, Token::Kill),
384
    ])
385
    .context("Failed to create wait_ctx")?;
386

387
    // The vcpufreq thread has strict scheduling requirements, let's affine it away from the vCPU
388
    // threads and clamp its util to high value.
389
    let cpu_set: Vec<usize> = vec![cpu_affinity];
390
    set_cpu_affinity(cpu_set)?;
391

392
    let mut sched_attr = sched_attr {
393
        sched_flags: SCHED_FLAG_KEEP_ALL
394
            | SCHED_FLAG_UTIL_CLAMP_MIN
395
            | SCHED_FLAG_UTIL_CLAMP_MAX
396
            | SCHED_FLAG_RESET_ON_FORK,
397
        sched_util_min: SCHED_CAPACITY_SCALE,
398
        sched_util_max: SCHED_CAPACITY_SCALE,
399
        ..Default::default()
400
    };
401
    if let Err(e) = sched_setattr(0, &mut sched_attr, 0) {
402
        warn!("Error setting util value: {}", e);
403
    }
404

405
    loop {
406
        let events = wait_ctx.wait().context("Failed to wait for events")?;
407
        for event in events.iter().filter(|e| e.is_readable) {
408
            match event.token {
409
                Token::TimerExpire => {
410
                    timer
411
                        .lock()
412
                        .mark_waited()
413
                        .context("failed to reset timer")?;
414
                    let vm_ctrl_unlocked = vm_ctrl.lock();
415
                    for vcpu_id in &shared_domain_vcpus {
416
                        let msg = vm_control::VmRequest::Throttle(*vcpu_id, MIN_TIMER_US);
417
                        vm_ctrl_unlocked
418
                            .send(&msg)
419
                            .context("failed to stall vCPUs")?;
420
                    }
421
                }
422
                Token::Kill => {
423
                    return Ok(());
424
                }
425
            }
426
        }
427
    }
428
}
429

430
// No device-specific suspend/resume handling is implemented: the empty body means every
// `Suspendable` method uses whatever default the trait itself provides.
impl Suspendable for VirtCpufreqV2 {}
431

432
Product

Resources

Company