Path: blob/main/metrics/src/sys/windows/system_metrics.rs
5394 views
// Copyright 2022 The ChromiumOS Authors1// Use of this source code is governed by a BSD-style license that can be2// found in the LICENSE file.34use std::fmt;5use std::fmt::Write as _;6use std::mem;7use std::sync::Arc;8use std::sync::Mutex;9use std::sync::Weak;10use std::thread;11use std::thread::JoinHandle;12use std::time::Duration;13use std::time::Instant;1415use base::error;16use base::AsRawDescriptor;17use base::Error as SysError;18use base::Event;19use base::EventToken;20use base::FromRawDescriptor;21use base::SafeDescriptor;22use base::WaitContext;23use chrono::DateTime;24use chrono::Utc;25use winapi::shared::minwindef::DWORD;26use winapi::shared::minwindef::FILETIME;27use winapi::um::processthreadsapi::GetProcessTimes;28use winapi::um::processthreadsapi::GetSystemTimes;29use winapi::um::processthreadsapi::OpenProcess;30use winapi::um::psapi::GetProcessMemoryInfo;31use winapi::um::psapi::PROCESS_MEMORY_COUNTERS;32use winapi::um::psapi::PROCESS_MEMORY_COUNTERS_EX;33use winapi::um::winbase::GetProcessIoCounters;34use winapi::um::winnt::IO_COUNTERS;35use winapi::um::winnt::LARGE_INTEGER;36use winapi::um::winnt::LONGLONG;37use winapi::um::winnt::PROCESS_QUERY_LIMITED_INFORMATION;38use winapi::um::winnt::PROCESS_VM_READ;39use winapi::um::winnt::SYNCHRONIZE;4041use crate::log_metric;42use crate::sys::windows::Error;43use crate::sys::windows::Result;44use crate::sys::windows::METRICS_UPLOAD_INTERVAL;45use crate::MetricEventType;4647const BYTES_PER_MB: usize = 1024 * 1024;48const WORKER_REPORT_INTERVAL: Duration = Duration::from_secs(1);4950type SysResult<T> = std::result::Result<T, SysError>;5152/// A worker job which periodically logs system metrics.53struct Worker {54exit_evt: Event,55io: Arc<Mutex<Option<ProcessIoRecord>>>,56measurements: Arc<Mutex<Option<Measurements>>>,57memory: Arc<Mutex<ProcessMemory>>,58memory_acc: Arc<Mutex<Option<ProcessMemoryAccumulated>>>,59metrics_string: Arc<Mutex<String>>,60}6162impl Worker {63fn run(&mut self) {64#[derive(EventToken)]65enum Token {66Exit,67}68let event_ctx: WaitContext<Token> =69match WaitContext::build_with(&[(&self.exit_evt, Token::Exit)]) {70Ok(event_ctx) => event_ctx,71Err(e) => {72error!("failed creating WaitContext: {}", e);73return;74}75};76let mut last_metric_upload_time = Instant::now();77'poll: loop {78let events = match event_ctx.wait_timeout(WORKER_REPORT_INTERVAL) {79Ok(events) => events,80Err(e) => {81error!("failed polling for events: {}", e);82return;83}84};85if events.is_empty() {86self.collect_metrics();87// Time budget for UI thread is very limited.88// We make the metric string for displaying in UI in the89// worker thread for best performance.90self.make_metrics_string();9192self.upload_metrics(&mut last_metric_upload_time);93}9495if events.into_iter().any(|e| e.is_readable) {96break 'poll;97}98}99}100101fn make_metrics_string(&mut self) {102let mut metrics_string = self.metrics_string.lock().unwrap();103*metrics_string = format!(104"{}\n{}",105self.cpu_metrics_string(),106self.mem_metrics_string()107);108}109110fn upload_metrics(&self, last_metric_upload_time: &mut Instant) {111if last_metric_upload_time.elapsed() >= METRICS_UPLOAD_INTERVAL {112let mut memory_acc = self.memory_acc.lock().unwrap();113if let Some(acc) = &*memory_acc {114let mem = acc.accumulated.physical / acc.accumulated_count / BYTES_PER_MB;115// The i64 cast will not cause overflow because the mem is at most 10TB for116// Windows 10.117log_metric(MetricEventType::MemoryUsage, mem as i64);118}119*memory_acc = None;120121let mut cpu_measurements = self.measurements.lock().unwrap();122if let Some(measurements) = &*cpu_measurements {123let sys_time = measurements.current.sys_time;124let process_time = measurements.current.process_time;125let prev_sys_time = measurements.last_upload.sys_time;126let prev_process_time = measurements.last_upload.process_time;127128let diff_systime_kernel =129compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);130let diff_systime_user =131compute_filetime_subtraction(sys_time.user, prev_sys_time.user);132133let diff_processtime_kernel =134compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);135let diff_processtime_user =136compute_filetime_subtraction(process_time.user, prev_process_time.user);137138let total_systime = diff_systime_kernel + diff_systime_user;139let total_processtime = diff_processtime_kernel + diff_processtime_user;140141if total_systime > 0 {142let cpu_usage = 100 * total_processtime / total_systime;143// The i64 cast will not cause overflow because the usage is at most 100.144log_metric(MetricEventType::CpuUsage, cpu_usage);145}146}147*cpu_measurements = None;148149let mut io = self.io.lock().unwrap();150if let Some(io_record) = &*io {151let new_io_read_bytes =152io_record.current.read_bytes - io_record.last_upload.read_bytes;153let new_io_write_bytes =154io_record.current.write_bytes - io_record.last_upload.write_bytes;155156let ms_elapsed =157(io_record.current_time - io_record.last_upload_time).num_milliseconds();158if ms_elapsed > 0 {159let io_read_bytes_per_sec =160(new_io_read_bytes as f32) / (ms_elapsed as f32) * 1000.0;161let io_write_bytes_per_sec =162(new_io_write_bytes as f32) / (ms_elapsed as f32) * 1000.0;163log_metric(MetricEventType::ReadIo, io_read_bytes_per_sec as i64);164log_metric(MetricEventType::WriteIo, io_write_bytes_per_sec as i64);165}166}167*io = None;168*last_metric_upload_time = Instant::now();169}170}171172fn collect_metrics(&mut self) {173match self.get_cpu_metrics() {174Ok(new_measurement) => {175let mut measurements = self.measurements.lock().unwrap();176let next_measurements = match *measurements {177Some(Measurements {178current,179last_upload,180..181}) => Measurements {182current: new_measurement,183previous: current,184last_upload,185},186None => Measurements {187current: new_measurement,188previous: new_measurement,189last_upload: new_measurement,190},191};192*measurements = Some(next_measurements);193}194Err(e) => {195// Do not panic because of cpu query related failures.196error!("Get cpu measurement failed: {}", e);197}198}199200match self.get_mem_metrics() {201Ok(mem) => {202// Keep running sum and count to calculate averages.203let mut memory_acc = self.memory_acc.lock().unwrap();204let updated_memory_acc = match *memory_acc {205Some(acc) => accumulate_process_memory(acc, mem),206None => ProcessMemoryAccumulated {207accumulated: mem,208accumulated_count: 1,209},210};211*memory_acc = Some(updated_memory_acc);212*self.memory.lock().unwrap() = mem213}214Err(e) => {215// Do not panic because of memory query failures.216error!("Get cpu measurement failed: {}", e);217}218}219220match self.get_io_metrics() {221Ok(new_io) => {222let mut io_record = self.io.lock().unwrap();223let updated_io = match *io_record {224Some(io) => ProcessIoRecord {225current: new_io,226current_time: Utc::now(),227last_upload: io.last_upload,228last_upload_time: io.last_upload_time,229},230None => ProcessIoRecord {231current: new_io,232current_time: Utc::now(),233last_upload: new_io,234last_upload_time: Utc::now(),235},236};237*io_record = Some(updated_io);238}239Err(e) => {240// Do not panic because of io query failures.241error!("Get io measurement failed: {}", e);242}243}244}245246fn get_mem_metrics(&self) -> SysResult<ProcessMemory> {247let process_handle = CoreWinMetrics::get_process_handle()?;248249let mut counters = PROCESS_MEMORY_COUNTERS_EX::default();250251// SAFETY:252// Safe because we own the process handle and all memory was allocated.253let result = unsafe {254GetProcessMemoryInfo(255process_handle.as_raw_descriptor(),256// Converting is necessary because the `winapi`' GetProcessMemoryInfo257// does NOT support `PROCESS_MEMORY_COUNTERS_EX`, but only258// 'PROCESS_MEMORY_COUNTERS'. The casting is safe because the underlining259// Windows api does `PROCESS_MEMORY_COUNTERS_EX`.260&mut counters as *mut PROCESS_MEMORY_COUNTERS_EX as *mut PROCESS_MEMORY_COUNTERS,261mem::size_of::<PROCESS_MEMORY_COUNTERS_EX>() as DWORD,262)263};264if result == 0 {265return Err(SysError::last());266}267268Ok(ProcessMemory {269page_fault_count: counters.PageFaultCount,270working_set_size: counters.WorkingSetSize,271working_set_peak: counters.PeakWorkingSetSize,272page_file_usage: counters.PagefileUsage,273page_file_peak: counters.PeakPagefileUsage,274physical: counters.PrivateUsage,275})276}277278fn get_cpu_metrics(&self) -> SysResult<Measurement> {279let mut sys_time: SystemCpuTime = Default::default();280let mut process_time: ProcessCpuTime = Default::default();281let sys_time_success: i32;282283// SAFETY:284// Safe because memory is allocated for sys_time before the windows call.285// And the value were initilized to 0s.286unsafe {287// First get kernel cpu time.288sys_time_success =289GetSystemTimes(&mut sys_time.idle, &mut sys_time.kernel, &mut sys_time.user);290}291if sys_time_success == 0 {292error!("Systime collection failed.\n");293return Err(SysError::last());294} else {295// Query current process cpu time.296let process_handle = CoreWinMetrics::get_process_handle()?;297let process_time_success: i32;298// SAFETY:299// Safe because memory is allocated for process_time before the windows call.300// And the value were initilized to 0s.301unsafe {302process_time_success = GetProcessTimes(303process_handle.as_raw_descriptor(),304&mut process_time.create,305&mut process_time.exit,306&mut process_time.kernel,307&mut process_time.user,308);309}310if process_time_success == 0 {311error!("Systime collection failed.\n");312return Err(SysError::last());313}314}315Ok(Measurement {316sys_time: SystemCpuTime {317idle: sys_time.idle,318kernel: sys_time.kernel,319user: sys_time.user,320},321process_time: ProcessCpuTime {322create: process_time.create,323exit: process_time.exit,324kernel: process_time.kernel,325user: process_time.user,326},327})328}329330fn get_io_metrics(&self) -> SysResult<ProcessIo> {331let process_handle = CoreWinMetrics::get_process_handle()?;332let mut io_counters = IO_COUNTERS::default();333// SAFETY:334// Safe because we own the process handle and all memory was allocated.335let result = unsafe {336GetProcessIoCounters(337process_handle.as_raw_descriptor(),338&mut io_counters as *mut IO_COUNTERS,339)340};341if result == 0 {342return Err(SysError::last());343}344Ok(ProcessIo {345read_bytes: io_counters.ReadTransferCount,346write_bytes: io_counters.WriteTransferCount,347})348}349350fn mem_metrics_string(&self) -> String {351let guard = self.memory.lock().unwrap();352let memory: ProcessMemory = *guard;353let mut buf = format!(354"Physical memory used: {} mb.\n",355memory.physical / BYTES_PER_MB356);357let _ = writeln!(358buf,359"Total working memory: {} mb.",360memory.working_set_size / BYTES_PER_MB361);362let _ = writeln!(363buf,364"Peak working memory: {} mb.",365memory.working_set_peak / BYTES_PER_MB366);367let _ = writeln!(buf, "Page fault count: {}.", memory.page_fault_count);368let _ = writeln!(369buf,370"Page file used: {} mb.",371memory.page_file_usage / BYTES_PER_MB372);373let _ = writeln!(374buf,375"Peak page file used: {} mb.",376memory.page_file_peak / BYTES_PER_MB377);378buf379}380381fn cpu_metrics_string(&self) -> String {382let guard = self.measurements.lock().unwrap();383let mut buf = String::new();384385// Now we use current and last cpu measurment data to calculate cpu usage386// as a percentage.387if let Some(measurements) = &*guard {388let sys_time = measurements.current.sys_time;389let process_time = measurements.current.process_time;390let prev_sys_time = measurements.previous.sys_time;391let prev_process_time = measurements.previous.process_time;392393let diff_systime_kernel =394compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);395let diff_systime_user = compute_filetime_subtraction(sys_time.user, prev_sys_time.user);396397let diff_processtime_kernel =398compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);399let diff_processtime_user =400compute_filetime_subtraction(process_time.user, prev_process_time.user);401402let total_systime = diff_systime_kernel + diff_systime_user;403let total_processtime = diff_processtime_kernel + diff_processtime_user;404405let mut process_cpu = String::from("still calculating...");406if total_systime > 0 {407process_cpu = format!("{}%", (100 * total_processtime / total_systime));408}409let _ = writeln!(buf, "Process cpu usage is: {process_cpu}");410411#[cfg(debug_assertions)]412{413// Show data supporting our cpu usage calculation.414// Output system cpu time.415let _ = writeln!(416buf,417"Systime Idle: low {} / high {}",418sys_time.idle.dwLowDateTime, sys_time.idle.dwHighDateTime419);420let _ = writeln!(421buf,422"Systime User: low {} / high {}",423sys_time.user.dwLowDateTime, sys_time.user.dwHighDateTime424);425let _ = writeln!(426buf,427"Systime kernel: low {} / high {}",428sys_time.kernel.dwLowDateTime, sys_time.kernel.dwHighDateTime429);430// Output process cpu time.431let _ = writeln!(432buf,433"Process Create: low {} / high {}",434process_time.create.dwLowDateTime, process_time.create.dwHighDateTime435);436let _ = writeln!(437buf,438"Process Exit: low {} / high {}",439process_time.exit.dwLowDateTime, process_time.exit.dwHighDateTime440);441let _ = writeln!(442buf,443"Process kernel: low {} / high {}",444process_time.kernel.dwLowDateTime, process_time.kernel.dwHighDateTime445);446let _ = writeln!(447buf,448"Process user: low {} / high {}",449process_time.user.dwLowDateTime, process_time.user.dwHighDateTime450);451}452} else {453let _ = write!(buf, "Calculating cpu usage...");454}455buf456}457}458459fn compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG {460// SAFETY:461// safe because we are initializing the struct to 0s.462unsafe {463let mut a: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();464a.u_mut().LowPart = fta.dwLowDateTime;465a.u_mut().HighPart = fta.dwHighDateTime as i32;466let mut b: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();467b.u_mut().LowPart = ftb.dwLowDateTime;468b.u_mut().HighPart = ftb.dwHighDateTime as i32;469a.QuadPart() - b.QuadPart()470}471}472473// Adds to a running total of memory metrics over the course of a collection period.474// Can divide these sums to calculate averages.475fn accumulate_process_memory(476acc: ProcessMemoryAccumulated,477mem: ProcessMemory,478) -> ProcessMemoryAccumulated {479ProcessMemoryAccumulated {480accumulated: ProcessMemory {481page_fault_count: mem.page_fault_count,482working_set_size: acc.accumulated.working_set_size + mem.working_set_size,483working_set_peak: mem.working_set_peak,484page_file_usage: acc.accumulated.page_file_usage + mem.page_file_usage,485page_file_peak: mem.page_file_peak,486physical: acc.accumulated.physical + mem.physical,487},488accumulated_count: acc.accumulated_count + 1,489}490}491492#[derive(Copy, Clone, Default)]493struct SystemCpuTime {494idle: FILETIME,495kernel: FILETIME,496user: FILETIME,497}498499#[derive(Copy, Clone, Default)]500struct ProcessCpuTime {501create: FILETIME,502exit: FILETIME,503kernel: FILETIME,504user: FILETIME,505}506507#[derive(Copy, Clone, Default)]508struct ProcessMemory {509page_fault_count: u32,510working_set_size: usize,511working_set_peak: usize,512page_file_usage: usize,513page_file_peak: usize,514physical: usize,515}516517#[derive(Copy, Clone)]518struct ProcessMemoryAccumulated {519accumulated: ProcessMemory,520accumulated_count: usize,521}522523#[derive(Copy, Clone, Default)]524struct ProcessIo {525read_bytes: u64,526write_bytes: u64,527}528529#[derive(Copy, Clone)]530struct ProcessIoRecord {531current: ProcessIo,532current_time: DateTime<Utc>,533last_upload: ProcessIo,534last_upload_time: DateTime<Utc>,535}536537#[derive(Copy, Clone)]538struct Measurement {539sys_time: SystemCpuTime,540process_time: ProcessCpuTime,541}542543struct Measurements {544current: Measurement,545previous: Measurement,546last_upload: Measurement,547}548549/// A managing struct for a job which defines regular logging of core Windows system metrics.550pub(crate) struct CoreWinMetrics {551metrics_string: Weak<Mutex<String>>,552exit_evt: Event,553worker_thread: Option<JoinHandle<()>>,554}555556impl CoreWinMetrics {557pub fn new() -> Result<Self> {558let exit_evt = match Event::new() {559Ok(evt) => evt,560Err(_e) => return Err(Error::CannotInstantiateEvent),561};562563let metrics_string = String::new();564let arc_metrics_memory = Arc::new(Mutex::new(metrics_string));565let weak_metrics_memory = Arc::downgrade(&arc_metrics_memory);566567let mut me = Self {568metrics_string: weak_metrics_memory,569exit_evt,570worker_thread: None,571};572let exit_evt_clone = match me.exit_evt.try_clone() {573Ok(evt) => evt,574Err(_) => return Err(Error::CannotCloneEvent),575};576me.worker_thread.replace(thread::spawn(|| {577Worker {578exit_evt: exit_evt_clone,579io: Arc::new(Mutex::new(None)),580measurements: Arc::new(Mutex::new(None)),581memory: Arc::new(Mutex::new(Default::default())),582memory_acc: Arc::new(Mutex::new(None)),583metrics_string: arc_metrics_memory,584}585.run();586}));587Ok(me)588}589590fn get_process_handle() -> SysResult<SafeDescriptor> {591// SAFETY:592// Safe because we own the current process.593let process_handle = unsafe {594OpenProcess(595PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_VM_READ | SYNCHRONIZE,5960,597std::process::id(),598)599};600if process_handle.is_null() {601return Err(SysError::last());602}603// SAFETY:604// Safe as the SafeDescriptor is the only thing with access to the handle after this.605Ok(unsafe { SafeDescriptor::from_raw_descriptor(process_handle) })606}607}608609impl Drop for CoreWinMetrics {610fn drop(&mut self) {611if let Some(join_handle) = self.worker_thread.take() {612let _ = self.exit_evt.signal();613join_handle614.join()615.expect("fail to join the worker thread of a win core metrics collector.");616}617}618}619620impl fmt::Display for CoreWinMetrics {621fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {622match self.metrics_string.upgrade() {623Some(metrics_string) => write!(f, "{}", *metrics_string.lock().unwrap()),624None => write!(f, ""),625}626}627}628629630