Commit 0918472ceeffad234df5589e45b646a94476f835

Authored by Huang Ying
Committed by Jesse Barnes
1 parent 0aba496fc8

PCI: PCIe AER: add aer_recover_queue

In addition to native PCIe AER, APEI (ACPI Platform Error Interface)
GHES (Generic Hardware Error Source) can now be used to report PCIe
AER errors.  To support recovery from PCIe AER errors reported via
APEI GHES, aer_recover_queue is added to export the recovery function
of the native PCIe AER driver.

Recoverable PCIe AER errors are reported via NMI in APEI GHES.  APEI
GHES then uses irq_work to defer the error processing to an IRQ
handler.  Because PCIe AER recovery can be very time-consuming,
aer_recover_queue, which can be called from an IRQ handler, defers the
actual recovery action to process context, that is, to a work queue.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>

Showing 3 changed files with 74 additions and 8 deletions Side-by-side Diff

drivers/pci/pcie/aer/aerdrv_core.c
... ... @@ -24,6 +24,7 @@
24 24 #include <linux/suspend.h>
25 25 #include <linux/delay.h>
26 26 #include <linux/slab.h>
  27 +#include <linux/kfifo.h>
27 28 #include "aerdrv.h"
28 29  
29 30 static int forceload;
... ... @@ -445,8 +446,7 @@
445 446 return drv;
446 447 }
447 448  
448   -static pci_ers_result_t reset_link(struct pcie_device *aerdev,
449   - struct pci_dev *dev)
  449 +static pci_ers_result_t reset_link(struct pci_dev *dev)
450 450 {
451 451 struct pci_dev *udev;
452 452 pci_ers_result_t status;
... ... @@ -486,7 +486,6 @@
486 486  
487 487 /**
488 488 * do_recovery - handle nonfatal/fatal error recovery process
489   - * @aerdev: pointer to a pcie_device data structure of root port
490 489 * @dev: pointer to a pci_dev data structure of agent detecting an error
491 490 * @severity: error severity type
492 491 *
... ... @@ -494,8 +493,7 @@
494 493 * error detected message to all downstream drivers within a hierarchy in
495 494 * question and return the returned code.
496 495 */
497   -static void do_recovery(struct pcie_device *aerdev, struct pci_dev *dev,
498   - int severity)
  496 +static void do_recovery(struct pci_dev *dev, int severity)
499 497 {
500 498 pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
501 499 enum pci_channel_state state;
... ... @@ -511,7 +509,7 @@
511 509 report_error_detected);
512 510  
513 511 if (severity == AER_FATAL) {
514   - result = reset_link(aerdev, dev);
  512 + result = reset_link(dev);
515 513 if (result != PCI_ERS_RESULT_RECOVERED)
516 514 goto failed;
517 515 }
518 516  
... ... @@ -576,8 +574,72 @@
576 574 pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
577 575 info->status);
578 576 } else
579   - do_recovery(aerdev, dev, info->severity);
  577 + do_recovery(dev, info->severity);
580 578 }
  579 +
  580 +#ifdef CONFIG_ACPI_APEI_PCIEAER
  581 +static void aer_recover_work_func(struct work_struct *work);
  582 +
  583 +#define AER_RECOVER_RING_ORDER 4
  584 +#define AER_RECOVER_RING_SIZE (1 << AER_RECOVER_RING_ORDER)
  585 +
  586 +struct aer_recover_entry
  587 +{
  588 + u8 bus;
  589 + u8 devfn;
  590 + u16 domain;
  591 + int severity;
  592 +};
  593 +
  594 +static DEFINE_KFIFO(aer_recover_ring, struct aer_recover_entry,
  595 + AER_RECOVER_RING_SIZE);
  596 +/*
  597 + * Mutual exclusion for writers of aer_recover_ring.  The reader side
  598 + * doesn't need a lock, because there is only one reader and no lock is
  599 + * needed between the reader and a writer.
  600 + */
  601 +static DEFINE_SPINLOCK(aer_recover_ring_lock);
  602 +static DECLARE_WORK(aer_recover_work, aer_recover_work_func);
  603 +
  604 +void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
  605 + int severity)
  606 +{
  607 + unsigned long flags;
  608 + struct aer_recover_entry entry = {
  609 + .bus = bus,
  610 + .devfn = devfn,
  611 + .domain = domain,
  612 + .severity = severity,
  613 + };
  614 +
  615 + spin_lock_irqsave(&aer_recover_ring_lock, flags);
  616 + if (kfifo_put(&aer_recover_ring, &entry))
  617 + schedule_work(&aer_recover_work);
  618 + else
  619 + pr_err("AER recover: Buffer overflow when recovering AER for %04x:%02x:%02x:%x\n",
  620 + domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
  621 + spin_unlock_irqrestore(&aer_recover_ring_lock, flags);
  622 +}
  623 +EXPORT_SYMBOL_GPL(aer_recover_queue);
  624 +
  625 +static void aer_recover_work_func(struct work_struct *work)
  626 +{
  627 + struct aer_recover_entry entry;
  628 + struct pci_dev *pdev;
  629 +
  630 + while (kfifo_get(&aer_recover_ring, &entry)) {
  631 + pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
  632 + entry.devfn);
  633 + if (!pdev) {
  634 + pr_err("AER recover: Can not find pci_dev for %04x:%02x:%02x:%x\n",
  635 + entry.domain, entry.bus,
  636 + PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
  637 + continue;
  638 + }
  639 + do_recovery(pdev, entry.severity);
  640 + }
  641 +}
  642 +#endif
581 643  
582 644 /**
583 645 * get_device_error_info - read error status from dev and store it to info
drivers/pci/pcie/aer/aerdrv_errprint.c
... ... @@ -204,7 +204,7 @@
204 204 }
205 205  
206 206 #ifdef CONFIG_ACPI_APEI_PCIEAER
207   -static int cper_severity_to_aer(int cper_severity)
  207 +int cper_severity_to_aer(int cper_severity)
208 208 {
209 209 switch (cper_severity) {
210 210 case CPER_SEV_RECOVERABLE:
... ... @@ -215,6 +215,7 @@
215 215 return AER_CORRECTABLE;
216 216 }
217 217 }
  218 +EXPORT_SYMBOL_GPL(cper_severity_to_aer);
218 219  
219 220 void cper_print_aer(const char *prefix, int cper_severity,
220 221 struct aer_capability_regs *aer)
... ... @@ -51,5 +51,8 @@
51 51  
52 52 extern void cper_print_aer(const char *prefix, int cper_severity,
53 53 struct aer_capability_regs *aer);
  54 +extern int cper_severity_to_aer(int cper_severity);
  55 +extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
  56 + int severity);
54 57 #endif //_AER_H_