Commit da8c7d807ac7dd7a692321977ee072a24b3006e3

Authored by Gavin Shan
Committed by Greg Kroah-Hartman
1 parent 1a20d22f3e

powerpc/powernv: Dump PHB diag-data immediately

commit 947166043732b69878123bf31f51933ad0316080 upstream.

The PHB diag-data is important for locating the root cause of
EEH errors such as a frozen PE or a fenced PHB. However, the EEH core
enables the IO path by clearing part of the HW registers before
collecting this data, causing it to be corrupted.

This patch fixes this by dumping the PHB diag-data immediately when a
frozen/fenced state on a PE or PHB is detected for the first time in
the eeh_ops::get_state() or next_error() backend.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 43 additions and 56 deletions Side-by-side Diff

arch/powerpc/platforms/powernv/eeh-ioda.c
... ... @@ -114,6 +114,7 @@
114 114 ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
115 115 #endif /* CONFIG_DEBUG_FS */
116 116  
  117 +
117 118 /**
118 119 * ioda_eeh_post_init - Chip dependent post initialization
119 120 * @hose: PCI controller
... ... @@ -221,6 +222,22 @@
221 222 return ret;
222 223 }
223 224  
  225 +static void ioda_eeh_phb_diag(struct pci_controller *hose)
  226 +{
  227 + struct pnv_phb *phb = hose->private_data;
  228 + long rc;
  229 +
  230 + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
  231 + PNV_PCI_DIAG_BUF_SIZE);
  232 + if (rc != OPAL_SUCCESS) {
  233 + pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
  234 + __func__, hose->global_number, rc);
  235 + return;
  236 + }
  237 +
  238 + pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
  239 +}
  240 +
224 241 /**
225 242 * ioda_eeh_get_state - Retrieve the state of PE
226 243 * @pe: EEH PE
... ... @@ -272,6 +289,9 @@
272 289 result |= EEH_STATE_DMA_ACTIVE;
273 290 result |= EEH_STATE_MMIO_ENABLED;
274 291 result |= EEH_STATE_DMA_ENABLED;
  292 + } else if (!(pe->state & EEH_PE_ISOLATED)) {
  293 + eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
  294 + ioda_eeh_phb_diag(hose);
275 295 }
276 296  
277 297 return result;
... ... @@ -315,6 +335,15 @@
315 335 __func__, fstate, hose->global_number, pe_no);
316 336 }
317 337  
  338 + /* Dump PHB diag-data for frozen PE */
  339 + if (result != EEH_STATE_NOT_SUPPORT &&
  340 + (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) !=
  341 + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) &&
  342 + !(pe->state & EEH_PE_ISOLATED)) {
  343 + eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
  344 + ioda_eeh_phb_diag(hose);
  345 + }
  346 +
318 347 return result;
319 348 }
320 349  
... ... @@ -530,45 +559,6 @@
530 559 }
531 560  
532 561 /**
533   - * ioda_eeh_get_log - Retrieve error log
534   - * @pe: EEH PE
535   - * @severity: Severity level of the log
536   - * @drv_log: buffer to store the log
537   - * @len: space of the log buffer
538   - *
539   - * The function is used to retrieve error log from P7IOC.
540   - */
541   -static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
542   - char *drv_log, unsigned long len)
543   -{
544   - s64 ret;
545   - unsigned long flags;
546   - struct pci_controller *hose = pe->phb;
547   - struct pnv_phb *phb = hose->private_data;
548   -
549   - spin_lock_irqsave(&phb->lock, flags);
550   -
551   - ret = opal_pci_get_phb_diag_data2(phb->opal_id,
552   - phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
553   - if (ret) {
554   - spin_unlock_irqrestore(&phb->lock, flags);
555   - pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
556   - __func__, hose->global_number, pe->addr, ret);
557   - return -EIO;
558   - }
559   -
560   - /*
561   - * FIXME: We probably need log the error in somewhere.
562   - * Lets make it up in future.
563   - */
564   - /* pr_info("%s", phb->diag.blob); */
565   -
566   - spin_unlock_irqrestore(&phb->lock, flags);
567   -
568   - return 0;
569   -}
570   -
571   -/**
572 562 * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
573 563 * @pe: EEH PE
574 564 *
... ... @@ -649,22 +639,6 @@
649 639 }
650 640 }
651 641  
652   -static void ioda_eeh_phb_diag(struct pci_controller *hose)
653   -{
654   - struct pnv_phb *phb = hose->private_data;
655   - long rc;
656   -
657   - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
658   - PNV_PCI_DIAG_BUF_SIZE);
659   - if (rc != OPAL_SUCCESS) {
660   - pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
661   - __func__, hose->global_number, rc);
662   - return;
663   - }
664   -
665   - pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
666   -}
667   -
668 642 static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
669 643 struct eeh_pe **pe)
670 644 {
... ... @@ -827,6 +801,20 @@
827 801 }
828 802  
829 803 /*
  804 + * EEH core will try recover from fenced PHB or
  805 + * frozen PE. In the time for frozen PE, EEH core
  806 + * enable IO path for that before collecting logs,
  807 + * but it ruins the site. So we have to dump the
  808 + * log in advance here.
  809 + */
  810 + if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
  811 + ret == EEH_NEXT_ERR_FENCED_PHB) &&
  812 + !((*pe)->state & EEH_PE_ISOLATED)) {
  813 + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
  814 + ioda_eeh_phb_diag(hose);
  815 + }
  816 +
  817 + /*
830 818 * If we have no errors on the specific PHB or only
831 819 * informative error there, we continue poking it.
832 820 * Otherwise, we need actions to be taken by upper
... ... @@ -844,7 +832,6 @@
844 832 .set_option = ioda_eeh_set_option,
845 833 .get_state = ioda_eeh_get_state,
846 834 .reset = ioda_eeh_reset,
847   - .get_log = ioda_eeh_get_log,
848 835 .configure_bridge = ioda_eeh_configure_bridge,
849 836 .next_error = ioda_eeh_next_error
850 837 };