// SPDX-License-Identifier: GPL-2.0
/*
 * Clocksource driver for the synthetic counter and timers
 * provided by the Hyper-V hypervisor to guest VMs, as described
 * in the Hyper-V Top Level Functional Spec (TLFS). This driver
 * is instruction set architecture independent.
 *
 * Copyright (C) 2019, Microsoft, Inc.
 *
 * Author: Michael Kelley <[email protected]>
 */

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/clockchips.h>
#include <linux/clocksource.h>
#include <linux/sched_clock.h>
#include <linux/mm.h>
#include <linux/cpuhotplug.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/acpi.h>
#include <linux/hyperv.h>
#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>

/* Per-cpu clockevent devices backed by stimer0; NULL until hv_stimer_alloc() */
static struct clock_event_device __percpu *hv_clock_event;
/* Note: offset can hold negative values after hibernation. */
static u64 hv_sched_clock_offset __read_mostly;

/*
 * If false, we're using the old mechanism for stimer0 interrupts
 * where it sends a VMbus message when it expires. The old
 * mechanism is used when running on older versions of Hyper-V
 * that don't support Direct Mode. While Hyper-V provides
 * four stimer's per CPU, Linux uses only stimer0.
 *
 * Because Direct Mode does not require processing a VMbus
 * message, stimer interrupts can be enabled earlier in the
 * process of booting a CPU, and consistent with when timer
 * interrupts are enabled for other clocksource drivers.
 * However, for legacy versions of Hyper-V when Direct Mode
 * is not enabled, setting up stimer interrupts must be
 * delayed until VMbus is initialized and can process the
 * interrupt message.
 */
static bool direct_mode_enabled;

static int stimer0_irq = -1;
static int stimer0_message_sint;
static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt);

/*
 * Common code for stimer0 interrupts coming via Direct Mode or
 * as a VMbus message.
 */
void hv_stimer0_isr(void)
{
	struct clock_event_device *ce;

	ce = this_cpu_ptr(hv_clock_event);
	ce->event_handler(ce);
}
EXPORT_SYMBOL_GPL(hv_stimer0_isr);

/*
 * stimer0 interrupt handler for architectures that support
 * per-cpu interrupts, which also implies Direct Mode.
 */
static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id)
{
	hv_stimer0_isr();
	return IRQ_HANDLED;
}

/*
 * Clockevent "set_next_event" callback: program stimer0 to expire
 * 'delta' ticks from the current reference-counter reading.
 */
static int hv_ce_set_next_event(unsigned long delta,
				struct clock_event_device *evt)
{
	u64 current_tick;

	current_tick = hv_read_reference_counter();
	current_tick += delta;
	hv_set_msr(HV_MSR_STIMER0_COUNT, current_tick);
	return 0;
}

/*
 * Clockevent "shutdown" callback: clear the stimer0 count and config
 * MSRs so the timer stops, and disable its per-cpu IRQ if one is in
 * use (Direct Mode with per-cpu IRQs).
 */
static int hv_ce_shutdown(struct clock_event_device *evt)
{
	hv_set_msr(HV_MSR_STIMER0_COUNT, 0);
	hv_set_msr(HV_MSR_STIMER0_CONFIG, 0);
	if (direct_mode_enabled && stimer0_irq >= 0)
		disable_percpu_irq(stimer0_irq);

	return 0;
}

/*
 * Clockevent "set_state_oneshot" callback: enable stimer0 in
 * auto-enable mode, delivering expirations either directly on a
 * hardware vector (Direct Mode) or as a VMbus message on the
 * configured SINT (legacy mode).
 */
static int hv_ce_set_oneshot(struct clock_event_device *evt)
{
	union hv_stimer_config timer_cfg;

	timer_cfg.as_uint64 = 0;
	timer_cfg.enable = 1;
	timer_cfg.auto_enable = 1;
	if (direct_mode_enabled) {
		/*
		 * When it expires, the timer will directly interrupt
		 * on the specified hardware vector/IRQ.
		 */
		timer_cfg.direct_mode = 1;
		timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR;
		if (stimer0_irq >= 0)
			enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE);
	} else {
		/*
		 * When it expires, the timer will generate a VMbus message,
		 * to be handled by the normal VMbus interrupt handler.
		 */
		timer_cfg.direct_mode = 0;
		timer_cfg.sintx = stimer0_message_sint;
	}
	hv_set_msr(HV_MSR_STIMER0_CONFIG, timer_cfg.as_uint64);
	return 0;
}

/*
 * hv_stimer_init - Per-cpu initialization of the clockevent
 */
static int hv_stimer_init(unsigned int cpu)
{
	struct clock_event_device *ce;

	if (!hv_clock_event)
		return 0;

	ce = per_cpu_ptr(hv_clock_event, cpu);
	ce->name = "Hyper-V clockevent";
	ce->features = CLOCK_EVT_FEAT_ONESHOT;
	ce->cpumask = cpumask_of(cpu);

	/*
	 * Lower the rating of the Hyper-V timer in a TDX VM without paravisor,
	 * so the local APIC timer (lapic_clockevent) is the default timer in
	 * such a VM. The Hyper-V timer is not preferred in such a VM because
	 * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC
	 * page is not enabled in such a VM because the VM uses Invariant TSC
	 * as a better clocksource and it's challenging to mark the Hyper-V
	 * TSC page shared in very early boot).
	 */
	if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx())
		ce->rating = 90;
	else
		ce->rating = 1000;

	ce->set_state_shutdown = hv_ce_shutdown;
	ce->set_state_oneshot = hv_ce_set_oneshot;
	ce->set_next_event = hv_ce_set_next_event;

	clockevents_config_and_register(ce,
					HV_CLOCK_HZ,
					HV_MIN_DELTA_TICKS,
					HV_MAX_MAX_DELTA_TICKS);
	return 0;
}

/*
 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent
 */
int hv_stimer_cleanup(unsigned int cpu)
{
	struct clock_event_device *ce;

	if (!hv_clock_event)
		return 0;

	/*
	 * In the legacy case where Direct Mode is not enabled
	 * (which can only be on x86/64), stimer cleanup happens
	 * relatively early in the CPU offlining process. We
	 * must unbind the stimer-based clockevent device so
	 * that the LAPIC timer can take over until clockevents
	 * are no longer needed in the offlining process. Note
	 * that clockevents_unbind_device() eventually calls
	 * hv_ce_shutdown().
	 *
	 * The unbind should not be done when Direct Mode is
	 * enabled because we may be on an architecture where
	 * there are no other clockevent devices to fallback to.
	 */
	ce = per_cpu_ptr(hv_clock_event, cpu);
	if (direct_mode_enabled)
		hv_ce_shutdown(ce);
	else
		clockevents_unbind_device(ce, cpu);

	return 0;
}
EXPORT_SYMBOL_GPL(hv_stimer_cleanup);

/*
 * These placeholders are overridden by arch specific code on
 * architectures that need special setup of the stimer0 IRQ because
 * they don't support per-cpu IRQs (such as x86/x64).
 */
void __weak hv_setup_stimer0_handler(void (*handler)(void))
{
};

void __weak hv_remove_stimer0_handler(void)
{
};

#ifdef CONFIG_ACPI
/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */
static int hv_setup_stimer0_irq(void)
{
	int ret;

	/* Map the stimer0 vector to a Linux IRQ via the ACPI GSI layer */
	ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR,
				ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH);
	if (ret < 0) {
		pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret);
		return ret;
	}
	stimer0_irq = ret;

	ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr,
				 "Hyper-V stimer0", &stimer0_evt);
	if (ret) {
		pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d",
		       stimer0_irq, ret);
		/* Undo the GSI registration so state stays consistent */
		acpi_unregister_gsi(stimer0_irq);
		stimer0_irq = -1;
	}
	return ret;
}

static void hv_remove_stimer0_irq(void)
{
	if (stimer0_irq == -1) {
		hv_remove_stimer0_handler();
	} else {
		free_percpu_irq(stimer0_irq, &stimer0_evt);
		acpi_unregister_gsi(stimer0_irq);
		stimer0_irq = -1;
	}
}
#else
static int hv_setup_stimer0_irq(void)
{
	return 0;
}

static void hv_remove_stimer0_irq(void)
{
}
#endif

/* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */
int hv_stimer_alloc(bool have_percpu_irqs)
{
	int ret;

	/*
	 * Synthetic timers are always available except on old versions of
	 * Hyper-V on x86. In that case, return as error as Linux will use a
	 * clockevent based on emulated LAPIC timer hardware.
	 */
	if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE))
		return -EINVAL;

	hv_clock_event = alloc_percpu(struct clock_event_device);
	if (!hv_clock_event)
		return -ENOMEM;

	direct_mode_enabled = ms_hyperv.misc_features &
			HV_STIMER_DIRECT_MODE_AVAILABLE;

	/*
	 * If Direct Mode isn't enabled, the remainder of the initialization
	 * is done later by hv_stimer_legacy_init()
	 */
	if (!direct_mode_enabled)
		return 0;

	if (have_percpu_irqs) {
		ret = hv_setup_stimer0_irq();
		if (ret)
			goto free_clock_event;
	} else {
		hv_setup_stimer0_handler(hv_stimer0_isr);
	}

	/*
	 * Since we are in Direct Mode, stimer initialization
	 * can be done now with a CPUHP value in the same range
	 * as other clockevent devices.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING,
				"clockevents/hyperv/stimer:starting",
				hv_stimer_init, hv_stimer_cleanup);
	if (ret < 0) {
		hv_remove_stimer0_irq();
		goto free_clock_event;
	}
	return ret;

free_clock_event:
	free_percpu(hv_clock_event);
	hv_clock_event = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(hv_stimer_alloc);

/*
 * hv_stimer_legacy_init -- Called from the VMbus driver to handle
 * the case when Direct Mode is not enabled, and the stimer
 * must be initialized late in the CPU onlining process.
 *
 */
void hv_stimer_legacy_init(unsigned int cpu, int sint)
{
	if (direct_mode_enabled)
		return;

	/*
	 * This function gets called by each vCPU, so setting the
	 * global stimer_message_sint value each time is conceptually
	 * not ideal, but the value passed in is always the same and
	 * it avoids introducing yet another interface into this
	 * clocksource driver just to set the sint in the legacy case.
	 */
	stimer0_message_sint = sint;
	(void)hv_stimer_init(cpu);
}
EXPORT_SYMBOL_GPL(hv_stimer_legacy_init);

/*
 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to
 * handle the case when Direct Mode is not enabled, and the
 * stimer must be cleaned up early in the CPU offlining
 * process.
 */
void hv_stimer_legacy_cleanup(unsigned int cpu)
{
	if (direct_mode_enabled)
		return;
	(void)hv_stimer_cleanup(cpu);
}
EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup);

/*
 * Do a global cleanup of clockevents for the cases of kexec and
 * vmbus exit
 */
void hv_stimer_global_cleanup(void)
{
	int cpu;

	/*
	 * hv_stimer_legacy_cleanup() will stop the stimer if Direct
	 * Mode is not enabled, and fallback to the LAPIC timer.
	 */
	for_each_present_cpu(cpu) {
		hv_stimer_legacy_cleanup(cpu);
	}

	if (!hv_clock_event)
		return;

	if (direct_mode_enabled) {
		cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING);
		hv_remove_stimer0_irq();
		stimer0_irq = -1;
	}
	free_percpu(hv_clock_event);
	hv_clock_event = NULL;

}
EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);

static __always_inline u64 read_hv_clock_msr(void)
{
	/*
	 * Read the partition counter to get the current tick count. This count
	 * is set to 0 when the partition is created and is incremented in 100
	 * nanosecond units.
	 *
	 * Use hv_raw_get_msr() because this function is used from
	 * noinstr. Notably, while HV_MSR_TIME_REF_COUNT is a synthetic
	 * register it doesn't need the GHCB path.
	 */
	return hv_raw_get_msr(HV_MSR_TIME_REF_COUNT);
}

/*
 * Code and definitions for the Hyper-V clocksources. Two
 * clocksources are defined: one that reads the Hyper-V defined MSR, and
 * the other that uses the TSC reference page feature as defined in the
 * TLFS. The MSR version is for compatibility with old versions of
 * Hyper-V and 32-bit x86. The TSC reference page version is preferred.
 */

static union {
	struct ms_hyperv_tsc_page page;
	u8 reserved[PAGE_SIZE];
} tsc_pg __bss_decrypted __aligned(PAGE_SIZE);

static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page;
static unsigned long tsc_pfn;

unsigned long hv_get_tsc_pfn(void)
{
	return tsc_pfn;
}
EXPORT_SYMBOL_GPL(hv_get_tsc_pfn);

struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{
	return tsc_page;
}
EXPORT_SYMBOL_GPL(hv_get_tsc_page);

static __always_inline u64 read_hv_clock_tsc(void)
{
	u64 cur_tsc, time;

	/*
	 * The Hyper-V Top-Level Function Spec (TLFS), section Timers,
	 * subsection Reference Counter, guarantees that the TSC and MSR
	 * times are in sync and monotonic. Therefore we can fall back
	 * to the MSR in case the TSC page indicates unavailability.
	 */
	if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
		time = read_hv_clock_msr();

	return time;
}

/* Clocksource ->read wrapper around read_hv_clock_tsc() */
static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
{
	return read_hv_clock_tsc();
}

/* sched_clock read function: convert 100ns ticks to nanoseconds */
static u64 noinstr read_hv_sched_clock_tsc(void)
{
	return (read_hv_clock_tsc() - hv_sched_clock_offset) *
		(NSEC_PER_SEC / HV_CLOCK_HZ);
}

static void suspend_hv_clock_tsc(struct clocksource *arg)
{
	union hv_reference_tsc_msr tsc_msr;

	/* Disable the TSC page */
	tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC);
	tsc_msr.enable = 0;
	hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}


static void resume_hv_clock_tsc(struct clocksource *arg)
{
	union hv_reference_tsc_msr tsc_msr;

	/* Re-enable the TSC page */
	tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC);
	tsc_msr.enable = 1;
	tsc_msr.pfn = tsc_pfn;
	hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}

/*
 * Called during resume from hibernation, from overridden
 * x86_platform.restore_sched_clock_state routine. This is to adjust offsets
 * used to calculate time for hv tsc page based sched_clock, to account for
 * time spent before hibernation.
 */
void hv_adj_sched_clock_offset(u64 offset)
{
	hv_sched_clock_offset -= offset;
}

#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
static int hv_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
	return 0;
}
#endif

static struct clocksource hyperv_cs_tsc = {
	.name	= "hyperv_clocksource_tsc_page",
	.rating	= 500,
	.read	= read_hv_clock_tsc_cs,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.suspend= suspend_hv_clock_tsc,
	.resume	= resume_hv_clock_tsc,
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
	.enable = hv_cs_enable,
	.vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK,
#else
	.vdso_clock_mode = VDSO_CLOCKMODE_NONE,
#endif
};

/* Clocksource ->read wrapper around read_hv_clock_msr() */
static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
{
	return read_hv_clock_msr();
}

static struct clocksource hyperv_cs_msr = {
	.name	= "hyperv_clocksource_msr",
	.rating	= 495,
	.read	= read_hv_clock_msr_cs,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

/*
 * Reference to pv_ops must be inline so objtool
 * detection of noinstr violations can work correctly.
 */
#ifdef CONFIG_GENERIC_SCHED_CLOCK
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
	/*
	 * We're on an architecture with generic sched clock (not x86/x64).
	 * The Hyper-V sched clock read function returns nanoseconds, not
	 * the normal 100ns units of the Hyper-V synthetic clock.
	 */
	sched_clock_register(sched_clock, 64, NSEC_PER_SEC);
}
#elif defined CONFIG_PARAVIRT
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
	/* We're on x86/x64 *and* using PV ops */
	paravirt_set_sched_clock(sched_clock);
}
#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */
static __always_inline void hv_setup_sched_clock(void *sched_clock) {}
#endif /* CONFIG_GENERIC_SCHED_CLOCK */

static void __init hv_init_tsc_clocksource(void)
{
	union hv_reference_tsc_msr tsc_msr;

	/*
	 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly
	 * handles frequency and offset changes due to live migration,
	 * pause/resume, and other VM management operations. So lower the
	 * Hyper-V Reference TSC rating, causing the generic TSC to be used.
	 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference
	 * TSC will be preferred over the virtualized ARM64 arch counter.
	 */
	if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
		hyperv_cs_tsc.rating = 250;
		hyperv_cs_msr.rating = 245;
	}

	if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
		return;

	hv_read_reference_counter = read_hv_clock_tsc;

	/*
	 * TSC page mapping works differently in root compared to guest.
	 * - In guest partition the guest PFN has to be passed to the
	 *   hypervisor.
	 * - In root partition it's other way around: it has to map the PFN
	 *   provided by the hypervisor.
	 *   But it can't be mapped right here as it's too early and MMU isn't
	 *   ready yet. So, we only set the enable bit here and will remap the
	 *   page later in hv_remap_tsc_clocksource().
	 *
	 * It's worth mentioning that the TSC clocksource read function
	 * (read_hv_clock_tsc) has a MSR-based fallback mechanism, used when
	 * TSC page is zeroed (which is the case until the PFN is remapped) and
	 * thus TSC clocksource will work even without the real TSC page
	 * mapped.
	 */
	tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC);
	if (hv_root_partition())
		tsc_pfn = tsc_msr.pfn;
	else
		tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page));
	tsc_msr.enable = 1;
	tsc_msr.pfn = tsc_pfn;
	hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);

	clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);

	/*
	 * If TSC is invariant, then let it stay as the sched clock since it
	 * will be faster than reading the TSC page. But if not invariant, use
	 * the TSC page so that live migrations across hosts with different
	 * frequencies is handled correctly.
	 */
	if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) {
		hv_sched_clock_offset = hv_read_reference_counter();
		hv_setup_sched_clock(read_hv_sched_clock_tsc);
	}
}

void __init hv_init_clocksource(void)
{
	/*
	 * Try to set up the TSC page clocksource, then the MSR clocksource.
	 * At least one of these will always be available except on very old
	 * versions of Hyper-V on x86. In that case we won't have a Hyper-V
	 * clocksource, but Linux will still run with a clocksource based
	 * on the emulated PIT or LAPIC timer.
	 *
	 * Never use the MSR clocksource as sched clock. It's too slow.
	 * Better to use the native sched clock as the fallback.
	 */
	hv_init_tsc_clocksource();

	if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)
		clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100);
}

void __init hv_remap_tsc_clocksource(void)
{
	if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
		return;

	/* Only the root partition maps a hypervisor-provided TSC page PFN */
	if (!hv_root_partition()) {
		WARN(1, "%s: attempt to remap TSC page in guest partition\n",
		     __func__);
		return;
	}

	tsc_page = memremap(tsc_pfn << HV_HYP_PAGE_SHIFT, sizeof(tsc_pg),
			    MEMREMAP_WB);
	if (!tsc_page)
		pr_err("Failed to remap Hyper-V TSC page.\n");
}