Newer
Older
/*
* eeh.c
* Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/init.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/rbtree.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <asm/rtas.h>
#undef DEBUG
/** Overview:
* EEH, or "Extended Error Handling" is a PCI bridge technology for
* dealing with PCI bus errors that can't be dealt with within the
* usual PCI framework, except by check-stopping the CPU. Systems
* that are designed for high-availability/reliability cannot afford
* to crash due to a "mere" PCI error, thus the need for EEH.
* An EEH-capable bridge operates by converting a detected error
* into a "slot freeze", taking the PCI adapter off-line, making
* the slot behave, from the OS'es point of view, as if the slot
* were "empty": all reads return 0xff's and all writes are silently
* ignored. EEH slot isolation events can be triggered by parity
* errors on the address or data busses (e.g. during posted writes),
* which in turn might be caused by low voltage on the bus, dust,
* vibration, humidity, radioactivity or plain-old failed hardware.
*
* Note, however, that one of the leading causes of EEH slot
* freeze events are buggy device drivers, buggy device microcode,
* or buggy device hardware. This is because any attempt by the
* device to bus-master data to a memory address that is not
* assigned to the device will trigger a slot freeze. (The idea
* is to prevent devices-gone-wild from corrupting system memory).
* Buggy hardware/drivers will have a miserable time co-existing
* with EEH.
*
* Ideally, a PCI device driver, when suspecting that an isolation
* event has occured (e.g. by reading 0xff's), will then ask EEH
* whether this is the case, and then take appropriate steps to
* reset the PCI slot, the PCI device, and then resume operations.
* However, until that day, the checking is done here, with the
* eeh_check_failure() routine embedded in the MMIO macros. If
* the slot is found to be isolated, an "EEH Event" is synthesized
* and sent out for processing.
*/
/* If a device driver keeps reading an MMIO register in an interrupt
* handler after a slot isolation event has occurred, we assume it
* is broken and panic. This sets the threshold for how many read
* attempts we allow before panicking.
*/
#define EEH_MAX_FAILS 100000
/* RTAS tokens */
static int ibm_set_eeh_option;
static int ibm_set_slot_reset;
static int ibm_read_slot_reset_state;
static int ibm_read_slot_reset_state2;
static int ibm_slot_error_detail;
static int ibm_get_config_addr_info;
static int ibm_configure_bridge;
int eeh_subsystem_enabled;
EXPORT_SYMBOL(eeh_subsystem_enabled);
/* Lock to avoid races due to multiple reports of an error */
static DEFINE_SPINLOCK(confirm_error_lock);
/* Buffer for reporting slot-error-detail rtas calls */
static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(slot_errbuf_lock);
static int eeh_error_buf_size;
/* System monitoring statistics */
static DEFINE_PER_CPU(unsigned long, no_device);
static DEFINE_PER_CPU(unsigned long, no_dn);
static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
static DEFINE_PER_CPU(unsigned long, ignored_check);
static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
static DEFINE_PER_CPU(unsigned long, false_positives);
static DEFINE_PER_CPU(unsigned long, ignored_failures);
static DEFINE_PER_CPU(unsigned long, slot_resets);
#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
/* --------------------------------------------------------------- */
/* Below lies the EEH event infrastructure */
void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
{
int config_addr;
unsigned long flags;
int rc;
/* Log the error with the rtas logger */
spin_lock_irqsave(&slot_errbuf_lock, flags);
memset(slot_errbuf, 0, eeh_error_buf_size);
/* Use PE configuration address, if present */
config_addr = pdn->eeh_config_addr;
if (pdn->eeh_pe_config_addr)
config_addr = pdn->eeh_pe_config_addr;
rc = rtas_call(ibm_slot_error_detail,
8, 1, NULL, config_addr,
BUID_HI(pdn->phb->buid),
BUID_LO(pdn->phb->buid), NULL, 0,
virt_to_phys(slot_errbuf),
eeh_error_buf_size,
severity);
if (rc == 0)
log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
spin_unlock_irqrestore(&slot_errbuf_lock, flags);
}
/**
* read_slot_reset_state - Read the reset state of a device node's slot
* @dn: device node to read
* @rets: array to return results in
*/
static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
int config_addr;
if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
token = ibm_read_slot_reset_state2;
outputs = 4;
} else {
token = ibm_read_slot_reset_state;
rets[2] = 0; /* fake PE Unavailable info */
/* Use PE configuration address, if present */
config_addr = pdn->eeh_config_addr;
if (pdn->eeh_pe_config_addr)
config_addr = pdn->eeh_pe_config_addr;
return rtas_call(token, 3, outputs, rets, config_addr,
BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid));
}
/**
* eeh_token_to_phys - convert EEH address token to phys address
* @token i/o token, should be address in the form 0xA....
*/
static inline unsigned long eeh_token_to_phys(unsigned long token)
{
pte_t *ptep;
unsigned long pa;
ptep = find_linux_pte(init_mm.pgd, token);
if (!ptep)
return token;
pa = pte_pfn(*ptep) << PAGE_SHIFT;
return pa | (token & (PAGE_SIZE-1));
}
/**
* Return the "partitionable endpoint" (pe) under which this device lies
*/
struct device_node * find_device_pe(struct device_node *dn)
{
while ((dn->parent) && PCI_DN(dn->parent) &&
(PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
dn = dn->parent;
}
return dn;
}
/** Mark all devices that are peers of this device as failed.
* Mark the device driver too, so that it can see the failure
Loading
Loading full blame...