Path: blob/master/arch/powerpc/platforms/pseries/eeh_driver.c
10818 views
/*1* PCI Error Recovery Driver for RPA-compliant PPC64 platform.2* Copyright IBM Corp. 2004 20053* Copyright Linas Vepstas <[email protected]> 2004, 20054*5* All rights reserved.6*7* This program is free software; you can redistribute it and/or modify8* it under the terms of the GNU General Public License as published by9* the Free Software Foundation; either version 2 of the License, or (at10* your option) any later version.11*12* This program is distributed in the hope that it will be useful, but13* WITHOUT ANY WARRANTY; without even the implied warranty of14* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or15* NON INFRINGEMENT. See the GNU General Public License for more16* details.17*18* You should have received a copy of the GNU General Public License19* along with this program; if not, write to the Free Software20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.21*22* Send comments and feedback to Linas Vepstas <[email protected]>23*/24#include <linux/delay.h>25#include <linux/interrupt.h>26#include <linux/irq.h>27#include <linux/pci.h>28#include <asm/eeh.h>29#include <asm/eeh_event.h>30#include <asm/ppc-pci.h>31#include <asm/pci-bridge.h>32#include <asm/prom.h>33#include <asm/rtas.h>343536static inline const char * pcid_name (struct pci_dev *pdev)37{38if (pdev && pdev->dev.driver)39return pdev->dev.driver->name;40return "";41}4243#if 044static void print_device_node_tree(struct pci_dn *pdn, int dent)45{46int i;47struct device_node *pc;4849if (!pdn)50return;51for (i = 0; i < dent; i++)52printk(" ");53printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",54pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,55pdn->eeh_pe_config_addr, pdn->node->full_name);56dent += 3;57pc = pdn->node->child;58while (pc) {59print_device_node_tree(PCI_DN(pc), dent);60pc = pc->sibling;61}62}63#endif6465/**66* eeh_disable_irq - disable interrupt for the recovering device67*/68static void eeh_disable_irq(struct pci_dev *dev)69{70struct device_node *dn = pci_device_to_OF_node(dev);7172/* Don't disable MSI and MSI-X interrupts. They are73* effectively disabled by the DMA Stopped state74* when an EEH error occurs.75*/76if (dev->msi_enabled || dev->msix_enabled)77return;7879if (!irq_has_action(dev->irq))80return;8182PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;83disable_irq_nosync(dev->irq);84}8586/**87* eeh_enable_irq - enable interrupt for the recovering device88*/89static void eeh_enable_irq(struct pci_dev *dev)90{91struct device_node *dn = pci_device_to_OF_node(dev);9293if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) {94PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;95enable_irq(dev->irq);96}97}9899/* ------------------------------------------------------- */100/**101* eeh_report_error - report pci error to each device driver102*103* Report an EEH error to each device driver, collect up and104* merge the device driver responses. Cumulative response105* passed back in "userdata".106*/107108static int eeh_report_error(struct pci_dev *dev, void *userdata)109{110enum pci_ers_result rc, *res = userdata;111struct pci_driver *driver = dev->driver;112113dev->error_state = pci_channel_io_frozen;114115if (!driver)116return 0;117118eeh_disable_irq(dev);119120if (!driver->err_handler ||121!driver->err_handler->error_detected)122return 0;123124rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen);125126/* A driver that needs a reset trumps all others */127if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;128if (*res == PCI_ERS_RESULT_NONE) *res = rc;129130return 0;131}132133/**134* eeh_report_mmio_enabled - tell drivers that MMIO has been enabled135*136* Tells each device driver that IO ports, MMIO and config space I/O137* are now enabled. Collects up and merges the device driver responses.138* Cumulative response passed back in "userdata".139*/140141static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)142{143enum pci_ers_result rc, *res = userdata;144struct pci_driver *driver = dev->driver;145146if (!driver ||147!driver->err_handler ||148!driver->err_handler->mmio_enabled)149return 0;150151rc = driver->err_handler->mmio_enabled (dev);152153/* A driver that needs a reset trumps all others */154if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;155if (*res == PCI_ERS_RESULT_NONE) *res = rc;156157return 0;158}159160/**161* eeh_report_reset - tell device that slot has been reset162*/163164static int eeh_report_reset(struct pci_dev *dev, void *userdata)165{166enum pci_ers_result rc, *res = userdata;167struct pci_driver *driver = dev->driver;168169if (!driver)170return 0;171172dev->error_state = pci_channel_io_normal;173174eeh_enable_irq(dev);175176if (!driver->err_handler ||177!driver->err_handler->slot_reset)178return 0;179180rc = driver->err_handler->slot_reset(dev);181if ((*res == PCI_ERS_RESULT_NONE) ||182(*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;183if (*res == PCI_ERS_RESULT_DISCONNECT &&184rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;185186return 0;187}188189/**190* eeh_report_resume - tell device to resume normal operations191*/192193static int eeh_report_resume(struct pci_dev *dev, void *userdata)194{195struct pci_driver *driver = dev->driver;196197dev->error_state = pci_channel_io_normal;198199if (!driver)200return 0;201202eeh_enable_irq(dev);203204if (!driver->err_handler ||205!driver->err_handler->resume)206return 0;207208driver->err_handler->resume(dev);209210return 0;211}212213/**214* eeh_report_failure - tell device driver that device is dead.215*216* This informs the device driver that the device is permanently217* dead, and that no further recovery attempts will be made on it.218*/219220static int eeh_report_failure(struct pci_dev *dev, void *userdata)221{222struct pci_driver *driver = dev->driver;223224dev->error_state = pci_channel_io_perm_failure;225226if (!driver)227return 0;228229eeh_disable_irq(dev);230231if (!driver->err_handler ||232!driver->err_handler->error_detected)233return 0;234235driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);236237return 0;238}239240/* ------------------------------------------------------- */241/**242* handle_eeh_events -- reset a PCI device after hard lockup.243*244* pSeries systems will isolate a PCI slot if the PCI-Host245* bridge detects address or data parity errors, DMA's246* occurring to wild addresses (which usually happen due to247* bugs in device drivers or in PCI adapter firmware).248* Slot isolations also occur if #SERR, #PERR or other misc249* PCI-related errors are detected.250*251* Recovery process consists of unplugging the device driver252* (which generated hotplug events to userspace), then issuing253* a PCI #RST to the device, then reconfiguring the PCI config254* space for all bridges & devices under this slot, and then255* finally restarting the device drivers (which cause a second256* set of hotplug events to go out to userspace).257*/258259/**260* eeh_reset_device() -- perform actual reset of a pci slot261* @bus: pointer to the pci bus structure corresponding262* to the isolated slot. A non-null value will263* cause all devices under the bus to be removed264* and then re-added.265* @pe_dn: pointer to a "Partionable Endpoint" device node.266* This is the top-level structure on which pci267* bus resets can be performed.268*/269270static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)271{272struct device_node *dn;273int cnt, rc;274275/* pcibios will clear the counter; save the value */276cnt = pe_dn->eeh_freeze_count;277278if (bus)279pcibios_remove_pci_devices(bus);280281/* Reset the pci controller. (Asserts RST#; resets config space).282* Reconfigure bridges and devices. Don't try to bring the system283* up if the reset failed for some reason. */284rc = rtas_set_slot_reset(pe_dn);285if (rc)286return rc;287288/* Walk over all functions on this device. */289dn = pe_dn->node;290if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))291dn = dn->parent->child;292293while (dn) {294struct pci_dn *ppe = PCI_DN(dn);295/* On Power4, always true because eeh_pe_config_addr=0 */296if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) {297rtas_configure_bridge(ppe);298eeh_restore_bars(ppe);299}300dn = dn->sibling;301}302303/* Give the system 5 seconds to finish running the user-space304* hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,305* this is a hack, but if we don't do this, and try to bring306* the device up before the scripts have taken it down,307* potentially weird things happen.308*/309if (bus) {310ssleep (5);311pcibios_add_pci_devices(bus);312}313pe_dn->eeh_freeze_count = cnt;314315return 0;316}317318/* The longest amount of time to wait for a pci device319* to come back on line, in seconds.320*/321#define MAX_WAIT_FOR_RECOVERY 150322323struct pci_dn * handle_eeh_events (struct eeh_event *event)324{325struct device_node *frozen_dn;326struct pci_dn *frozen_pdn;327struct pci_bus *frozen_bus;328int rc = 0;329enum pci_ers_result result = PCI_ERS_RESULT_NONE;330const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;331332frozen_dn = find_device_pe(event->dn);333if (!frozen_dn) {334335location = of_get_property(event->dn, "ibm,loc-code", NULL);336location = location ? location : "unknown";337printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "338"for location=%s pci addr=%s\n",339location, eeh_pci_name(event->dev));340return NULL;341}342343frozen_bus = pcibios_find_pci_bus(frozen_dn);344location = of_get_property(frozen_dn, "ibm,loc-code", NULL);345location = location ? location : "unknown";346347/* There are two different styles for coming up with the PE.348* In the old style, it was the highest EEH-capable device349* which was always an EADS pci bridge. In the new style,350* there might not be any EADS bridges, and even when there are,351* the firmware marks them as "EEH incapable". So another352* two-step is needed to find the pci bus.. */353if (!frozen_bus)354frozen_bus = pcibios_find_pci_bus (frozen_dn->parent);355356if (!frozen_bus) {357printk(KERN_ERR "EEH: Cannot find PCI bus "358"for location=%s dn=%s\n",359location, frozen_dn->full_name);360return NULL;361}362363frozen_pdn = PCI_DN(frozen_dn);364frozen_pdn->eeh_freeze_count++;365366pci_str = eeh_pci_name(event->dev);367drv_str = pcid_name(event->dev);368369if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)370goto excess_failures;371372printk(KERN_WARNING373"EEH: This PCI device has failed %d times in the last hour:\n",374frozen_pdn->eeh_freeze_count);375376if (frozen_pdn->pcidev) {377bus_pci_str = pci_name(frozen_pdn->pcidev);378bus_drv_str = pcid_name(frozen_pdn->pcidev);379printk(KERN_WARNING380"EEH: Bus location=%s driver=%s pci addr=%s\n",381location, bus_drv_str, bus_pci_str);382}383384printk(KERN_WARNING385"EEH: Device location=%s driver=%s pci addr=%s\n",386location, drv_str, pci_str);387388/* Walk the various device drivers attached to this slot through389* a reset sequence, giving each an opportunity to do what it needs390* to accomplish the reset. Each child gets a report of the391* status ... if any child can't handle the reset, then the entire392* slot is dlpar removed and added.393*/394pci_walk_bus(frozen_bus, eeh_report_error, &result);395396/* Get the current PCI slot state. This can take a long time,397* sometimes over 3 seconds for certain systems. */398rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000);399if (rc < 0) {400printk(KERN_WARNING "EEH: Permanent failure\n");401goto hard_fail;402}403404/* Since rtas may enable MMIO when posting the error log,405* don't post the error log until after all dev drivers406* have been informed.407*/408eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE);409410/* If all device drivers were EEH-unaware, then shut411* down all of the device drivers, and hope they412* go down willingly, without panicing the system.413*/414if (result == PCI_ERS_RESULT_NONE) {415rc = eeh_reset_device(frozen_pdn, frozen_bus);416if (rc) {417printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);418goto hard_fail;419}420}421422/* If all devices reported they can proceed, then re-enable MMIO */423if (result == PCI_ERS_RESULT_CAN_RECOVER) {424rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO);425426if (rc < 0)427goto hard_fail;428if (rc) {429result = PCI_ERS_RESULT_NEED_RESET;430} else {431result = PCI_ERS_RESULT_NONE;432pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);433}434}435436/* If all devices reported they can proceed, then re-enable DMA */437if (result == PCI_ERS_RESULT_CAN_RECOVER) {438rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA);439440if (rc < 0)441goto hard_fail;442if (rc)443result = PCI_ERS_RESULT_NEED_RESET;444else445result = PCI_ERS_RESULT_RECOVERED;446}447448/* If any device has a hard failure, then shut off everything. */449if (result == PCI_ERS_RESULT_DISCONNECT) {450printk(KERN_WARNING "EEH: Device driver gave up\n");451goto hard_fail;452}453454/* If any device called out for a reset, then reset the slot */455if (result == PCI_ERS_RESULT_NEED_RESET) {456rc = eeh_reset_device(frozen_pdn, NULL);457if (rc) {458printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);459goto hard_fail;460}461result = PCI_ERS_RESULT_NONE;462pci_walk_bus(frozen_bus, eeh_report_reset, &result);463}464465/* All devices should claim they have recovered by now. */466if ((result != PCI_ERS_RESULT_RECOVERED) &&467(result != PCI_ERS_RESULT_NONE)) {468printk(KERN_WARNING "EEH: Not recovered\n");469goto hard_fail;470}471472/* Tell all device drivers that they can resume operations */473pci_walk_bus(frozen_bus, eeh_report_resume, NULL);474475return frozen_pdn;476477excess_failures:478/*479* About 90% of all real-life EEH failures in the field480* are due to poorly seated PCI cards. Only 10% or so are481* due to actual, failed cards.482*/483printk(KERN_ERR484"EEH: PCI device at location=%s driver=%s pci addr=%s\n"485"has failed %d times in the last hour "486"and has been permanently disabled.\n"487"Please try reseating this device or replacing it.\n",488location, drv_str, pci_str, frozen_pdn->eeh_freeze_count);489goto perm_error;490491hard_fail:492printk(KERN_ERR493"EEH: Unable to recover from failure of PCI device "494"at location=%s driver=%s pci addr=%s\n"495"Please try reseating this device or replacing it.\n",496location, drv_str, pci_str);497498perm_error:499eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE);500501/* Notify all devices that they're about to go down. */502pci_walk_bus(frozen_bus, eeh_report_failure, NULL);503504/* Shut down the device drivers for good. */505pcibios_remove_pci_devices(frozen_bus);506507return NULL;508}509510/* ---------- end of file ---------- */511512513