Newer
Older
* Copyright IBM Corporation 2001, 2005, 2006
* Copyright Dave Engebretsen & Todd Inglett 2001
* Copyright Linas Vepstas 2005, 2006
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <asm/rtas.h>
/** Overview:
* EEH, or "Extended Error Handling" is a PCI bridge technology for
* dealing with PCI bus errors that can't be dealt with within the
* usual PCI framework, except by check-stopping the CPU. Systems
* that are designed for high-availability/reliability cannot afford
* to crash due to a "mere" PCI error, thus the need for EEH.
* An EEH-capable bridge operates by converting a detected error
* into a "slot freeze", taking the PCI adapter off-line, making
* the slot behave, from the OS'es point of view, as if the slot
* were "empty": all reads return 0xff's and all writes are silently
* ignored. EEH slot isolation events can be triggered by parity
* errors on the address or data busses (e.g. during posted writes),
* which in turn might be caused by low voltage on the bus, dust,
* vibration, humidity, radioactivity or plain-old failed hardware.
*
* Note, however, that one of the leading causes of EEH slot
* freeze events are buggy device drivers, buggy device microcode,
* or buggy device hardware. This is because any attempt by the
* device to bus-master data to a memory address that is not
* assigned to the device will trigger a slot freeze. (The idea
* is to prevent devices-gone-wild from corrupting system memory).
* Buggy hardware/drivers will have a miserable time co-existing
* with EEH.
*
* Ideally, a PCI device driver, when suspecting that an isolation
* event has occurred (e.g. by reading 0xff's), will then ask EEH
* whether this is the case, and then take appropriate steps to
* reset the PCI slot, the PCI device, and then resume operations.
* However, until that day, the checking is done here, with the
* eeh_check_failure() routine embedded in the MMIO macros. If
* the slot is found to be isolated, an "EEH Event" is synthesized
* and sent out for processing.
*/
/* If a device driver keeps reading an MMIO register in an interrupt
* handler after a slot isolation event, it might be broken.
* This sets the threshold for how many read attempts we allow
* before printing an error message.
/* Time to wait for a PCI slot to report status, in milliseconds */
#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
/* RTAS tokens */
static int ibm_set_eeh_option;
static int ibm_set_slot_reset;
static int ibm_read_slot_reset_state;
static int ibm_read_slot_reset_state2;
static int ibm_slot_error_detail;
static int ibm_get_config_addr_info;
static int ibm_get_config_addr_info2;
static int ibm_configure_bridge;
static int ibm_configure_pe;
int eeh_subsystem_enabled;
EXPORT_SYMBOL(eeh_subsystem_enabled);
/* Lock to avoid races due to multiple reports of an error */
static DEFINE_RAW_SPINLOCK(confirm_error_lock);
/* Buffer for reporting slot-error-detail rtas calls. Its here
* in BSS, and not dynamically alloced, so that it ends up in
* RMO where RTAS can access it.
*/
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(slot_errbuf_lock);
static int eeh_error_buf_size;
/* Buffer for reporting pci register dumps. Its here in BSS, and
* not dynamically alloced, so that it ends up in RMO where RTAS
* can access it.
*/
#define EEH_PCI_REGS_LOG_LEN 4096
static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
static unsigned long no_device;
static unsigned long no_dn;
static unsigned long no_cfg_addr;
static unsigned long ignored_check;
static unsigned long total_mmio_ffs;
static unsigned long false_positives;
static unsigned long slot_resets;
#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
/* --------------------------------------------------------------- */
/* Below lies the EEH event infrastructure */
static void rtas_slot_error_detail(struct pci_dn *pdn, int severity,
char *driver_log, size_t loglen)
int config_addr;
unsigned long flags;
int rc;
/* Log the error with the rtas logger */
spin_lock_irqsave(&slot_errbuf_lock, flags);
memset(slot_errbuf, 0, eeh_error_buf_size);
/* Use PE configuration address, if present */
config_addr = pdn->eeh_config_addr;
if (pdn->eeh_pe_config_addr)
config_addr = pdn->eeh_pe_config_addr;
rc = rtas_call(ibm_slot_error_detail,
8, 1, NULL, config_addr,
BUID_HI(pdn->phb->buid),
BUID_LO(pdn->phb->buid),
virt_to_phys(driver_log), loglen,
virt_to_phys(slot_errbuf),
eeh_error_buf_size,
severity);
if (rc == 0)
log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
spin_unlock_irqrestore(&slot_errbuf_lock, flags);
}
/**
* gather_pci_data - copy assorted PCI config space registers to buff
* @pdn: device to report data for
* @buf: point to buffer in which to log
* @len: amount of room in buffer
*
* This routine captures assorted PCI configuration space data,
* and puts them into a buffer for RTAS error logging.
*/
static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len)
{
struct pci_dev *dev = pdn->pcidev;
n += scnprintf(buf+n, len-n, "%s\n", pdn->node->full_name);
printk(KERN_WARNING "EEH: of node=%s\n", pdn->node->full_name);
rtas_read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
rtas_read_config(pdn, PCI_COMMAND, 4, &cfg);
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg);
if (!dev) {
printk(KERN_WARNING "EEH: no PCI device for this of node\n");
return n;
}
/* Gather bridge-specific registers */
if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
rtas_read_config(pdn, PCI_SEC_STATUS, 2, &cfg);
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg);
Loading
Loading full blame...