Path: blob/master/arch/powerpc/platforms/pseries/mobility.c
26481 views
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support for Partition Mobility/Migration
 *
 * Copyright (C) 2010 Nathan Fontenot
 * Copyright (C) 2010 IBM Corporation
 */


#define pr_fmt(fmt) "mobility: " fmt

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/stat.h>
#include <linux/stop_machine.h>
#include <linux/completion.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/stringify.h>

#include <asm/machdep.h>
#include <asm/nmi.h>
#include <asm/rtas.h>
#include "pseries.h"
#include "vas.h"	/* vas_migration_handler() */
#include "../../kernel/cacheinfo.h"

/* /sys/kernel/mobility directory; created by mobility_sysfs_init(). */
static struct kobject *mobility_kobj;

/*
 * Work area header returned by the ibm,update-properties RTAS call.
 * The property data follows this header in the RTAS data buffer.
 */
struct update_props_workarea {
	__be32 phandle;
	__be32 state;
	__be64 reserved;
	__be32 nprops;
} __packed;

/*
 * Each ibm,update-nodes entry word encodes an action in the top byte
 * and a node count in the low three bytes.
 */
#define NODE_ACTION_MASK	0xff000000
#define NODE_COUNT_MASK		0x00ffffff

#define DELETE_DT_NODE	0x01000000
#define UPDATE_DT_NODE	0x02000000
#define ADD_DT_NODE	0x03000000

/* Scope arguments for the ibm,update-nodes/properties RTAS calls. */
#define MIGRATION_SCOPE	(1)
#define PRRN_SCOPE -2

#ifdef CONFIG_PPC_WATCHDOG
/*
 * Percentage by which the hardlockup watchdog timeout is lengthened
 * around a migration; tunable via the sysctl below.
 */
static unsigned int nmi_wd_lpm_factor = 200;

#ifdef CONFIG_SYSCTL
/* Exposes nmi_wd_lpm_factor as /proc/sys/kernel/nmi_wd_lpm_factor. */
static const struct ctl_table nmi_wd_lpm_factor_ctl_table[] = {
	{
		.procname	= "nmi_wd_lpm_factor",
		.data		= &nmi_wd_lpm_factor,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
	},
};

static int __init register_nmi_wd_lpm_factor_sysctl(void)
{
	register_sysctl("kernel", nmi_wd_lpm_factor_ctl_table);

	return 0;
}
device_initcall(register_nmi_wd_lpm_factor_sysctl);
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_PPC_WATCHDOG */

/*
 * Marshal @buf through the globally shared RTAS data buffer (which the
 * firmware call requires) under rtas_data_buf_lock, and copy the result
 * back out. Returns the RTAS call status.
 */
static int mobility_rtas_call(int token, char *buf, s32 scope)
{
	int rc;

	spin_lock(&rtas_data_buf_lock);

	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);

	spin_unlock(&rtas_data_buf_lock);
	return rc;
}

/*
 * Detach @dn from the device tree, except for nodes in the
 * ibm,platform-facilities hierarchy, which are deliberately ignored.
 */
static int delete_dt_node(struct device_node *dn)
{
	struct device_node *pdn;
	bool is_platfac;

	pdn = of_get_parent(dn);
	is_platfac = of_node_is_type(dn, "ibm,platform-facilities") ||
		     of_node_is_type(pdn, "ibm,platform-facilities");
	of_node_put(pdn);

	/*
	 * The drivers that bind to nodes in the platform-facilities
	 * hierarchy don't support node removal, and the removal directive
	 * from firmware is always followed by an add of an equivalent
	 * node. The capability (e.g. RNG, encryption, compression)
	 * represented by the node is never interrupted by the migration.
	 * So ignore changes to this part of the tree.
	 */
	if (is_platfac) {
		pr_notice("ignoring remove operation for %pOFfp\n", dn);
		return 0;
	}

	pr_debug("removing node %pOFfp\n", dn);
	dlpar_detach_node(dn);
	return 0;
}

/*
 * Build up (possibly across multiple calls, for values larger than one
 * RTAS buffer) and apply a new value for property @name on @dn.
 * *@prop carries the partially assembled property between calls and is
 * reset to NULL once the complete value has been applied.
 * Returns 0 on success or -ENOMEM.
 */
static int update_dt_property(struct device_node *dn, struct property **prop,
			      const char *name, u32 vd, char *value)
{
	struct property *new_prop = *prop;
	int more = 0;

	/* A negative 'vd' value indicates that only part of the new property
	 * value is contained in the buffer and we need to call
	 * ibm,update-properties again to get the rest of the value.
	 *
	 * A negative value is also the two's complement of the actual value.
	 */
	if (vd & 0x80000000) {
		vd = ~vd + 1;	/* recover the magnitude of the length */
		more = 1;
	}

	if (new_prop) {
		/* partial property fixup: append this chunk to the value */
		char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
		if (!new_data)
			return -ENOMEM;

		memcpy(new_data, new_prop->value, new_prop->length);
		memcpy(new_data + new_prop->length, value, vd);

		kfree(new_prop->value);
		new_prop->value = new_data;
		new_prop->length += vd;
	} else {
		new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
		if (!new_prop)
			return -ENOMEM;

		new_prop->name = kstrdup(name, GFP_KERNEL);
		if (!new_prop->name) {
			kfree(new_prop);
			return -ENOMEM;
		}

		new_prop->length = vd;
		new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
		if (!new_prop->value) {
			kfree(new_prop->name);
			kfree(new_prop);
			return -ENOMEM;
		}

		memcpy(new_prop->value, value, vd);
		*prop = new_prop;
	}

	if (!more) {
		/* value is complete; publish it to the device tree */
		pr_debug("updating node %pOF property %s\n", dn, name);
		of_update_property(dn, new_prop);
		*prop = NULL;
	}

	return 0;
}

/*
 * Fetch updated property values for @dn from firmware via the
 * ibm,update-properties RTAS call and apply them (add, update or remove
 * properties). Loops while firmware indicates more data (rtas_rc == 1).
 * NOTE(review): RTAS/property-update failures are logged but the
 * function always returns 0 (or -EINVAL/-ENOMEM for setup failures).
 */
static int update_dt_node(struct device_node *dn, s32 scope)
{
	struct update_props_workarea *upwa;
	struct property *prop = NULL;
	int i, rc, rtas_rc;
	char *prop_data;
	char *rtas_buf;
	int update_properties_token;
	u32 nprops;
	u32 vd;

	update_properties_token = rtas_function_token(RTAS_FN_IBM_UPDATE_PROPERTIES);
	if (update_properties_token == RTAS_UNKNOWN_SERVICE)
		return -EINVAL;

	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!rtas_buf)
		return -ENOMEM;

	upwa = (struct update_props_workarea *)&rtas_buf[0];
	upwa->phandle = cpu_to_be32(dn->phandle);

	do {
		rtas_rc = mobility_rtas_call(update_properties_token, rtas_buf,
					     scope);
		if (rtas_rc < 0)
			break;

		prop_data = rtas_buf + sizeof(*upwa);
		nprops = be32_to_cpu(upwa->nprops);

		/* On the first call to ibm,update-properties for a node the
		 * first property value descriptor contains an empty
		 * property name, the property value length encoded as u32,
		 * and the property value is the node path being updated.
		 */
		if (*prop_data == 0) {
			prop_data++;
			vd = be32_to_cpu(*(__be32 *)prop_data);
			prop_data += vd + sizeof(vd);
			nprops--;
		}

		for (i = 0; i < nprops; i++) {
			char *prop_name;

			/* each descriptor: NUL-terminated name, u32 vd, value */
			prop_name = prop_data;
			prop_data += strlen(prop_name) + 1;
			vd = be32_to_cpu(*(__be32 *)prop_data);
			prop_data += sizeof(vd);

			switch (vd) {
			case 0x00000000:
				/* name only property, nothing to do */
				break;

			case 0x80000000:
				/* special vd: delete the property */
				of_remove_property(dn, of_find_property(dn,
							prop_name, NULL));
				prop = NULL;
				break;

			default:
				rc = update_dt_property(dn, &prop, prop_name,
							vd, prop_data);
				if (rc) {
					pr_err("updating %s property failed: %d\n",
					       prop_name, rc);
				}

				prop_data += vd;
				break;
			}

			cond_resched();
		}

		cond_resched();
	} while (rtas_rc == 1);

	kfree(rtas_buf);
	return 0;
}

/*
 * Configure the device tree node identified by @drc_index via firmware
 * and attach it under @parent_dn. Mirrors delete_dt_node() in ignoring
 * the ibm,platform-facilities hierarchy.
 */
static int add_dt_node(struct device_node *parent_dn, __be32 drc_index)
{
	struct device_node *dn;
	int rc;

	dn = dlpar_configure_connector(drc_index, parent_dn);
	if (!dn)
		return -ENOENT;

	/*
	 * Since delete_dt_node() ignores this node type, this is the
	 * necessary counterpart. We also know that a platform-facilities
	 * node returned from dlpar_configure_connector() has children
	 * attached, and dlpar_attach_node() only adds the parent, leaking
	 * the children. So ignore these on the add side for now.
	 */
	if (of_node_is_type(dn, "ibm,platform-facilities")) {
		pr_notice("ignoring add operation for %pOF\n", dn);
		dlpar_free_cc_nodes(dn);
		return 0;
	}

	rc = dlpar_attach_node(dn, parent_dn);
	if (rc)
		dlpar_free_cc_nodes(dn);

	pr_debug("added node %pOFfp\n", dn);

	return rc;
}

/*
 * Drive the ibm,update-nodes RTAS call and apply the returned stream of
 * delete/update/add directives to the device tree. Loops while firmware
 * reports more updates pending (rc == 1).
 */
static int pseries_devicetree_update(s32 scope)
{
	char *rtas_buf;
	__be32 *data;
	int update_nodes_token;
	int rc;

	update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES);
	if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
		return 0;

	rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
	if (!rtas_buf)
		return -ENOMEM;

	do {
		rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
		if (rc && rc != 1)
			break;

		/* skip the 4-word work area header */
		data = (__be32 *)rtas_buf + 4;
		while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
			int i;
			u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK;
			u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;

			data++;

			for (i = 0; i < node_count; i++) {
				struct device_node *np;
				__be32 phandle = *data++;
				__be32 drc_index;

				np = of_find_node_by_phandle(be32_to_cpu(phandle));
				if (!np) {
					pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n",
						be32_to_cpu(phandle), action);
					continue;
				}

				switch (action) {
				case DELETE_DT_NODE:
					delete_dt_node(np);
					break;
				case UPDATE_DT_NODE:
					update_dt_node(np, scope);
					break;
				case ADD_DT_NODE:
					/* add entries carry an extra drc_index word */
					drc_index = *data++;
					add_dt_node(np, drc_index);
					break;
				}

				of_node_put(np);
				cond_resched();
			}
		}

		cond_resched();
	} while (rc == 1);

	kfree(rtas_buf);
	return rc;
}

/*
 * Post-resume fixups on the destination system: activate destination
 * firmware, refresh the device tree (with the cacheinfo hierarchy torn
 * down around the update), and re-evaluate security mitigations and
 * hv-24x7 system information.
 */
void post_mobility_fixup(void)
{
	int rc;

	rtas_activate_firmware();

	/*
	 * We don't want CPUs to go online/offline while the device
	 * tree is being updated.
	 */
	cpus_read_lock();

	/*
	 * It's common for the destination firmware to replace cache
	 * nodes. Release all of the cacheinfo hierarchy's references
	 * before updating the device tree.
	 */
	cacheinfo_teardown();

	rc = pseries_devicetree_update(MIGRATION_SCOPE);
	if (rc)
		pr_err("device tree update failed: %d\n", rc);

	cacheinfo_rebuild();

	cpus_read_unlock();

	/* Possibly switch to a new L1 flush type */
	pseries_setup_security_mitigations();

	/* Reinitialise system information for hv-24x7 */
	read_24x7_sys_info();

	return;
}

/*
 * Query the VASI state for migration @handle via H_VASI_STATE.
 * On success stores the state in *@res and returns 0; otherwise maps
 * the hcall status to -EINVAL/-EOPNOTSUPP/-EIO.
 */
static int poll_vasi_state(u64 handle, unsigned long *res)
{
	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
	long hvrc;
	int ret;

	hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle);
	switch (hvrc) {
	case H_SUCCESS:
		ret = 0;
		*res = retbuf[0];
		break;
	case H_PARAMETER:
		ret = -EINVAL;
		break;
	case H_FUNCTION:
		ret = -EOPNOTSUPP;
		break;
	case H_HARDWARE:
	default:
		pr_err("unexpected H_VASI_STATE result %ld\n", hvrc);
		ret = -EIO;
		break;
	}
	return ret;
}

/*
 * Block (polling once per second) until the VASI session for @handle
 * reaches H_VASI_SUSPENDING. Returns 0 when suspending (or when
 * H_VASI_STATE is unimplemented), negative errno otherwise.
 */
static int wait_for_vasi_session_suspending(u64 handle)
{
	unsigned long state;
	int ret;

	/*
	 * Wait for transition from H_VASI_ENABLED to
	 * H_VASI_SUSPENDING. Treat anything else as an error.
	 */
	while (true) {
		ret = poll_vasi_state(handle, &state);

		if (ret != 0 || state == H_VASI_SUSPENDING) {
			break;
		} else if (state == H_VASI_ENABLED) {
			ssleep(1);
		} else {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			ret = -EIO;
			break;
		}
	}

	/*
	 * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or
	 * ibm,suspend-me are also unimplemented, we'll recover then.
	 */
	if (ret == -EOPNOTSUPP)
		ret = 0;

	return ret;
}

/*
 * After a successful suspend/resume, block (polling every 500 ms) until
 * the post-resume memory transfer finishes, so user space sees a fully
 * migrated partition when the syscall returns.
 */
static void wait_for_vasi_session_completed(u64 handle)
{
	unsigned long state = 0;
	int ret;

	pr_info("waiting for memory transfer to complete...\n");

	/*
	 * Wait for transition from H_VASI_RESUMED to H_VASI_COMPLETED.
	 */
	while (true) {
		ret = poll_vasi_state(handle, &state);

		/*
		 * If the memory transfer is already complete and the
		 * migration has been cleaned up by the hypervisor,
		 * H_PARAMETER is returned, which is translated to
		 * -EINVAL by poll_vasi_state().
		 */
		if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) {
			pr_info("memory transfer completed.\n");
			break;
		}

		if (ret) {
			pr_err("H_VASI_STATE return error (%d)\n", ret);
			break;
		}

		if (state != H_VASI_RESUMED) {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			break;
		}

		msleep(500);
	}
}

/* Wake @target_cpu out of H_JOIN with an H_PROD hcall. */
static void prod_single(unsigned int target_cpu)
{
	long hvrc;
	int hwid;

	hwid = get_hard_smp_processor_id(target_cpu);
	hvrc = plpar_hcall_norets(H_PROD, hwid);
	if (hvrc == H_SUCCESS)
		return;
	pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n",
			   target_cpu, hwid, hvrc);
}

/* Prod every online CPU except the caller's. */
static void prod_others(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		if (cpu != smp_processor_id())
			prod_single(cpu);
	}
}

/*
 * Shrink the SLB to SLB_MIN_SIZE and return the previous size so it can
 * be restored on suspend failure. Returns 0 when the hash MMU SLB is
 * not configured.
 */
static u16 clamp_slb_size(void)
{
#ifdef CONFIG_PPC_64S_HASH_MMU
	u16 prev = mmu_slb_size;

	slb_set_size(SLB_MIN_SIZE);

	return prev;
#else
	return 0;
#endif
}

/*
 * Perform the actual ibm,suspend-me RTAS call on this CPU, with the SLB
 * clamped to a safe minimum for the destination (restored on failure).
 */
static int do_suspend(void)
{
	u16 saved_slb_size;
	int status;
	int ret;

	pr_info("calling ibm,suspend-me on CPU %i\n", smp_processor_id());

	/*
	 * The destination processor model may have fewer SLB entries
	 * than the source. We reduce mmu_slb_size to a safe minimum
	 * before suspending in order to minimize the possibility of
	 * programming non-existent entries on the destination. If
	 * suspend fails, we restore it before returning. On success
	 * the OF reconfig path will update it from the new device
	 * tree after resuming on the destination.
	 */
	saved_slb_size = clamp_slb_size();

	ret = rtas_ibm_suspend_me(&status);
	if (ret != 0) {
		pr_err("ibm,suspend-me error: %d\n", status);
		slb_set_size(saved_slb_size);
	}

	return ret;
}

/**
 * struct pseries_suspend_info - State shared between CPUs for join/suspend.
 * @counter: Threads are to increment this upon resuming from suspend
 *           or if an error is received from H_JOIN. The thread which performs
 *           the first increment (i.e. sets it to 1) is responsible for
 *           waking the other threads.
 * @done: False if join/suspend is in progress. True if the operation is
 *        complete (successful or not).
 */
struct pseries_suspend_info {
	atomic_t counter;
	bool done;
};

/*
 * stop_machine() callback run on every online CPU. All but one CPU park
 * in H_JOIN; the CPU that receives H_CONTINUE performs the suspend, then
 * the first CPU to resume wakes the rest via H_PROD.
 */
static int do_join(void *arg)
{
	struct pseries_suspend_info *info = arg;
	atomic_t *counter = &info->counter;
	long hvrc;
	int ret;

retry:
	/* Must ensure MSR.EE off for H_JOIN. */
	hard_irq_disable();
	hvrc = plpar_hcall_norets(H_JOIN);

	switch (hvrc) {
	case H_CONTINUE:
		/*
		 * All other CPUs are offline or in H_JOIN. This CPU
		 * attempts the suspend.
		 */
		ret = do_suspend();
		break;
	case H_SUCCESS:
		/*
		 * The suspend is complete and this cpu has received a
		 * prod, or we've received a stray prod from unrelated
		 * code (e.g. paravirt spinlocks) and we need to join
		 * again.
		 *
		 * This barrier orders the return from H_JOIN above vs
		 * the load of info->done. It pairs with the barrier
		 * in the wakeup/prod path below.
		 */
		smp_mb();
		if (READ_ONCE(info->done) == false) {
			pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying",
					    smp_processor_id());
			goto retry;
		}
		ret = 0;
		break;
	case H_BAD_MODE:
	case H_HARDWARE:
	default:
		ret = -EIO;
		pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
				   hvrc, smp_processor_id());
		break;
	}

	if (atomic_inc_return(counter) == 1) {
		/* first CPU out of the suspend wakes everyone else */
		pr_info("CPU %u waking all threads\n", smp_processor_id());
		WRITE_ONCE(info->done, true);
		/*
		 * This barrier orders the store to info->done vs subsequent
		 * H_PRODs to wake the other CPUs. It pairs with the barrier
		 * in the H_SUCCESS case above.
		 */
		smp_mb();
		prod_others();
	}
	/*
	 * Execution may have been suspended for several seconds, so reset
	 * the watchdogs. touch_nmi_watchdog() also touches the soft lockup
	 * watchdog.
	 */
	rcu_cpu_stall_reset();
	touch_nmi_watchdog();

	return ret;
}

/*
 * Abort reason code byte 0. We use only the 'Migrating partition' value.
 */
enum vasi_aborting_entity {
	ORCHESTRATOR        = 1,
	VSP_SOURCE          = 2,
	PARTITION_FIRMWARE  = 3,
	PLATFORM_FIRMWARE   = 4,
	VSP_TARGET          = 5,
	MIGRATING_PARTITION = 6,
};

/*
 * Tell the platform the migration identified by @handle is being
 * abandoned, encoding @err into the H_VASI_SIGNAL cancel reason code
 * (entity byte | 24-bit detail).
 */
static void pseries_cancel_migration(u64 handle, int err)
{
	u32 reason_code;
	u32 detail;
	u8 entity;
	long hvrc;

	entity = MIGRATING_PARTITION;
	detail = abs(err) & 0xffffff;
	reason_code = (entity << 24) | detail;

	hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle,
				  H_VASI_SIGNAL_CANCEL, reason_code);
	if (hvrc)
		pr_err("H_VASI_SIGNAL error: %ld\n", hvrc);
}

/*
 * Attempt the join/suspend via stop_machine(), retrying up to
 * max_attempts times with exponential backoff (1 ms, 10 ms, ...) as
 * long as the VASI stream remains in the Suspending state.
 */
static int pseries_suspend(u64 handle)
{
	const unsigned int max_attempts = 5;
	unsigned int retry_interval_ms = 1;
	unsigned int attempt = 1;
	int ret;

	while (true) {
		struct pseries_suspend_info info;
		unsigned long vasi_state;
		int vasi_err;

		info = (struct pseries_suspend_info) {
			.counter = ATOMIC_INIT(0),
			.done = false,
		};

		ret = stop_machine(do_join, &info, cpu_online_mask);
		if (ret == 0)
			break;
		/*
		 * Encountered an error. If the VASI stream is still
		 * in Suspending state, it's likely a transient
		 * condition related to some device in the partition
		 * and we can retry in the hope that the cause has
		 * cleared after some delay.
		 *
		 * A better design would allow drivers etc to prepare
		 * for the suspend and avoid conditions which prevent
		 * the suspend from succeeding. For now, we have this
		 * mitigation.
		 */
		pr_notice("Partition suspend attempt %u of %u error: %d\n",
			  attempt, max_attempts, ret);

		if (attempt == max_attempts)
			break;

		vasi_err = poll_vasi_state(handle, &vasi_state);
		if (vasi_err == 0) {
			if (vasi_state != H_VASI_SUSPENDING) {
				/* stream left Suspending: retrying is pointless */
				pr_notice("VASI state %lu after failed suspend\n",
					  vasi_state);
				break;
			}
		} else if (vasi_err != -EOPNOTSUPP) {
			pr_err("VASI state poll error: %d", vasi_err);
			break;
		}

		pr_notice("Will retry partition suspend after %u ms\n",
			  retry_interval_ms);

		msleep(retry_interval_ms);
		retry_interval_ms *= 10;
		attempt++;
	}

	return ret;
}

/*
 * Top-level migration sequence for VASI stream @handle: close VAS
 * windows, wait for the Suspending state, relax the hardlockup
 * watchdog, suspend/resume, then either run post-resume fixups and
 * wait for the memory transfer, or cancel the migration on failure.
 */
static int pseries_migrate_partition(u64 handle)
{
	int ret;
	unsigned int factor = 0;

#ifdef CONFIG_PPC_WATCHDOG
	factor = nmi_wd_lpm_factor;
#endif
	/*
	 * When the migration is initiated, the hypervisor changes VAS
	 * mappings to prepare before OS gets the notification and
	 * closes all VAS windows. NX generates continuous faults during
	 * this time and the user space can not differentiate these
	 * faults from the migration event. So reduce this time window
	 * by closing VAS windows at the beginning of this function.
	 */
	vas_migration_handler(VAS_SUSPEND);

	ret = wait_for_vasi_session_suspending(handle);
	if (ret)
		goto out;

	if (factor)
		watchdog_hardlockup_set_timeout_pct(factor);

	ret = pseries_suspend(handle);
	if (ret == 0) {
		post_mobility_fixup();
		/*
		 * Wait until the memory transfer is complete, so that the user
		 * space process returns from the syscall after the transfer is
		 * complete. This allows the user hooks to be executed at
		 * the right time.
		 */
		wait_for_vasi_session_completed(handle);
	} else
		pseries_cancel_migration(handle, ret);

	if (factor)
		watchdog_hardlockup_set_timeout_pct(0);

out:
	vas_migration_handler(VAS_RESUME);

	return ret;
}

/* Entry point for the ibm,suspend-me path of the rtas syscall. */
int rtas_syscall_dispatch_ibm_suspend_me(u64 handle)
{
	return pseries_migrate_partition(handle);
}

/*
 * sysfs 'migration' store handler: parse the VASI stream id written by
 * user space (drmgr) and run the migration. Returns @count on success.
 */
static ssize_t migration_store(const struct class *class,
			       const struct class_attribute *attr, const char *buf,
			       size_t count)
{
	u64 streamid;
	int rc;

	rc = kstrtou64(buf, 0, &streamid);
	if (rc)
		return rc;

	rc = pseries_migrate_partition(streamid);
	if (rc)
		return rc;

	return count;
}

/*
 * Used by drmgr to determine the kernel behavior of the migration interface.
 *
 * Version 1: Performs all PAPR requirements for migration including
 *            firmware activation and device tree update.
 */
#define MIGRATION_API_VERSION	1

static CLASS_ATTR_WO(migration);
static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION));

/*
 * Create /sys/kernel/mobility with the 'migration' (write-only trigger)
 * and 'api_version' (read-only) attributes. Attribute-creation failures
 * are logged but not fatal.
 */
static int __init mobility_sysfs_init(void)
{
	int rc;

	mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
	if (!mobility_kobj)
		return -ENOMEM;

	rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
	if (rc)
		pr_err("unable to create migration sysfs file (%d)\n", rc);

	rc = sysfs_create_file(mobility_kobj, &class_attr_api_version.attr.attr);
	if (rc)
		pr_err("unable to create api_version sysfs file (%d)\n", rc);

	return 0;
}
machine_device_initcall(pseries, mobility_sysfs_init);