Commit b1c089b7 authored by Hidetoshi Seto, committed by Jesse Barnes

PCI: pcie, aer: report all error before recovery

This patch is required so that error records are not lost to actions
invoked during error recovery, such as a slot reset.

The following sample (real machine + dummy record injected by aer-inject)
shows that the record of 28:00.1 could not be retrieved because the recovery
of 28:00.0 ran first:

- Before:

pcieport-driver 0000:00:02.0: AER: Multiple Uncorrected (Non-Fatal) error received: id=2801
e1000e 0000:28:00.0: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2800(Receiver ID)
e1000e 0000:28:00.0:   device [8086:1096] error status/mask=00001000/00100000
e1000e 0000:28:00.0:    [12] Poisoned TLP           (First)
e1000e 0000:28:00.0:   TLP Header: 00000000 00000001 00000002 00000003
e1000e 0000:28:00.0: broadcast error_detected message
e1000e 0000:28:00.0: broadcast slot_reset message
e1000e 0000:28:00.0: setting latency timer to 64
e1000e 0000:28:00.0: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.1: setting latency timer to 64
e1000e 0000:28:00.1: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.0: broadcast resume message
e1000e 0000:28:00.0: AER driver successfully recovered
e1000e: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX

- After:

pcieport-driver 0000:00:02.0: AER: Multiple Uncorrected (Non-Fatal) error received: id=2801
e1000e 0000:28:00.0: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2800(Receiver ID)
e1000e 0000:28:00.0:   device [8086:1096] error status/mask=00001000/00100000
e1000e 0000:28:00.0:    [12] Poisoned TLP           (First)
e1000e 0000:28:00.0:   TLP Header: 00000000 00000001 00000002 00000003
e1000e 0000:28:00.1: PCIE Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, id=2801(Receiver ID)
e1000e 0000:28:00.1:   device [8086:1096] error status/mask=00081000/00100000
e1000e 0000:28:00.1:    [12] Poisoned TLP           (First)
e1000e 0000:28:00.1:    [19] ECRC
e1000e 0000:28:00.1:   TLP Header: 00000000 00000001 00000002 00000003
e1000e 0000:28:00.1:   Error of this Agent(2801) is reported first
e1000e 0000:28:00.0: broadcast error_detected message
e1000e 0000:28:00.0: broadcast slot_reset message
e1000e 0000:28:00.0: setting latency timer to 64
e1000e 0000:28:00.0: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.0: PME# disabled
e1000e 0000:28:00.1: setting latency timer to 64
e1000e 0000:28:00.1: restoring config space at offset 0x1 (was 0x100547, writing 0x100147)
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.1: PME# disabled
e1000e 0000:28:00.0: broadcast resume message
e1000e 0000:28:00.0: AER driver successfully recovered
e1000e: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX/TX
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
parent 79e4b89b
...@@ -29,8 +29,6 @@ ...@@ -29,8 +29,6 @@
#define ERR_COR_ID(d) (d & 0xffff) #define ERR_COR_ID(d) (d & 0xffff)
#define ERR_UNCOR_ID(d) (d >> 16) #define ERR_UNCOR_ID(d) (d >> 16)
#define AER_SUCCESS 0
#define AER_UNSUCCESS 1
#define AER_ERROR_SOURCES_MAX 100 #define AER_ERROR_SOURCES_MAX 100
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
......
...@@ -696,6 +696,13 @@ static struct aer_err_source *get_e_source(struct aer_rpc *rpc) ...@@ -696,6 +696,13 @@ static struct aer_err_source *get_e_source(struct aer_rpc *rpc)
return e_source; return e_source;
} }
/**
* get_device_error_info - read error status from dev and store it to info
* @dev: pointer to the device expected to have a error record
* @info: pointer to structure to store the error record
*
* Return 1 on success, 0 on error.
*/
static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
{ {
int pos, temp; int pos, temp;
...@@ -707,7 +714,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) ...@@ -707,7 +714,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
/* The device might not support AER */ /* The device might not support AER */
if (!pos) if (!pos)
return AER_SUCCESS; return 1;
if (info->severity == AER_CORRECTABLE) { if (info->severity == AER_CORRECTABLE) {
pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS, pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS,
...@@ -715,7 +722,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) ...@@ -715,7 +722,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK,
&info->mask); &info->mask);
if (!(info->status & ~info->mask)) if (!(info->status & ~info->mask))
return AER_UNSUCCESS; return 0;
} else if (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE || } else if (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE ||
info->severity == AER_NONFATAL) { info->severity == AER_NONFATAL) {
...@@ -725,7 +732,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) ...@@ -725,7 +732,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK,
&info->mask); &info->mask);
if (!(info->status & ~info->mask)) if (!(info->status & ~info->mask))
return AER_UNSUCCESS; return 0;
/* Get First Error Pointer */ /* Get First Error Pointer */
pci_read_config_dword(dev, pos + PCI_ERR_CAP, &temp); pci_read_config_dword(dev, pos + PCI_ERR_CAP, &temp);
...@@ -744,7 +751,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) ...@@ -744,7 +751,7 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
} }
} }
return AER_SUCCESS; return 1;
} }
static inline void aer_process_err_devices(struct pcie_device *p_device, static inline void aer_process_err_devices(struct pcie_device *p_device,
...@@ -758,14 +765,14 @@ static inline void aer_process_err_devices(struct pcie_device *p_device, ...@@ -758,14 +765,14 @@ static inline void aer_process_err_devices(struct pcie_device *p_device,
e_info->id); e_info->id);
} }
/* Report all before handle them, not to lost records by reset etc. */
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (get_device_error_info(e_info->dev[i], e_info) == if (get_device_error_info(e_info->dev[i], e_info))
AER_SUCCESS) {
aer_print_error(e_info->dev[i], e_info); aer_print_error(e_info->dev[i], e_info);
handle_error_source(p_device,
e_info->dev[i],
e_info);
} }
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (get_device_error_info(e_info->dev[i], e_info))
handle_error_source(p_device, e_info->dev[i], e_info);
} }
} }
...@@ -870,5 +877,5 @@ int aer_init(struct pcie_device *dev) ...@@ -870,5 +877,5 @@ int aer_init(struct pcie_device *dev)
if (aer_osc_setup(dev) && !forceload) if (aer_osc_setup(dev) && !forceload)
return -ENXIO; return -ENXIO;
return AER_SUCCESS; return 0;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment