powerpc/powernv: Dump PHB diag-data immediately

commit 947166043732b69878123bf31f51933ad0316080 upstream.

The PHB diag-data is important for locating the root cause of EEH errors such as a frozen PE or a fenced PHB. However, the EEH core enables the IO path by clearing part of the HW registers before collecting this data, causing it to be corrupted.

This patch fixes this by dumping the PHB diag-data immediately when a frozen/fenced state on a PE or PHB is detected for the first time in the eeh_ops::get_state() or next_error() backend.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

powerpc/powernv: Dump PHB diag-data immediately
commit 947166043732b69878123bf31f51933ad0316080 upstream.

The PHB diag-data is important for locating the root cause of EEH errors such as a frozen PE or a fenced PHB. However, the EEH core enables the IO path by clearing part of the HW registers before collecting this data, causing it to be corrupted.

This patch fixes this by dumping the PHB diag-data immediately when a frozen/fenced state on a PE or PHB is detected for the first time in the eeh_ops::get_state() or next_error() backend.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Gavin Shan · Greg Kroah-Hartman
1 parent 1a20d22f3e
Showing 1 changed file with 43 additions and 56 deletions Side-by-side Diff
arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -114,6 +114,7 @@
 			ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
 #endif /* CONFIG_DEBUG_FS */
  
+
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
  * @hose: PCI controller
@@ -221,6 +222,22 @@
 	return ret;
 }
  
+static void ioda_eeh_phb_diag(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+	long rc;
+
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+					 PNV_PCI_DIAG_BUF_SIZE);
+	if (rc != OPAL_SUCCESS) {
+		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
+			    __func__, hose->global_number, rc);
+		return;
+	}
+
+	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
+}
+
 /**
  * ioda_eeh_get_state - Retrieve the state of PE
  * @pe: EEH PE
@@ -272,6 +289,9 @@
 			result |= EEH_STATE_DMA_ACTIVE;
 			result |= EEH_STATE_MMIO_ENABLED;
 			result |= EEH_STATE_DMA_ENABLED;
+		} else if (!(pe->state & EEH_PE_ISOLATED)) {
+			eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+			ioda_eeh_phb_diag(hose);
 		}
  
 		return result;
@@ -315,6 +335,15 @@
 			   __func__, fstate, hose->global_number, pe_no);
 	}
  
+	/* Dump PHB diag-data for frozen PE */
+	if (result != EEH_STATE_NOT_SUPPORT &&
+	    (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) !=
+	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) &&
+	    !(pe->state & EEH_PE_ISOLATED)) {
+		eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+		ioda_eeh_phb_diag(hose);
+	}
+
 	return result;
 }
  
@@ -530,45 +559,6 @@
 }
  
 /**
- * ioda_eeh_get_log - Retrieve error log
- * @pe: EEH PE
- * @severity: Severity level of the log
- * @drv_log: buffer to store the log
- * @len: space of the log buffer
- *
- * The function is used to retrieve error log from P7IOC.
- */
-static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
-			    char *drv_log, unsigned long len)
-{
-	s64 ret;
-	unsigned long flags;
-	struct pci_controller *hose = pe->phb;
-	struct pnv_phb *phb = hose->private_data;
-
-	spin_lock_irqsave(&phb->lock, flags);
-
-	ret = opal_pci_get_phb_diag_data2(phb->opal_id,
-			phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
-	if (ret) {
-		spin_unlock_irqrestore(&phb->lock, flags);
-		pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
-			   __func__, hose->global_number, pe->addr, ret);
-		return -EIO;
-	}
-
-	/*
-	 * FIXME: We probably need log the error in somewhere.
-	 * Lets make it up in future.
-	 */
-	/* pr_info("%s", phb->diag.blob); */
-
-	spin_unlock_irqrestore(&phb->lock, flags);
-
-	return 0;
-}
-
-/**
  * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
  * @pe: EEH PE
  *
@@ -649,22 +639,6 @@
 	}
 }
  
-static void ioda_eeh_phb_diag(struct pci_controller *hose)
-{
-	struct pnv_phb *phb = hose->private_data;
-	long rc;
-
-	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
-					 PNV_PCI_DIAG_BUF_SIZE);
-	if (rc != OPAL_SUCCESS) {
-		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
-			    __func__, hose->global_number, rc);
-		return;
-	}
-
-	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
-}
-
 static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
 			       struct eeh_pe **pe)
 {
@@ -827,6 +801,20 @@
 		}
  
 		/*
+		 * EEH core will try recover from fenced PHB or
+		 * frozen PE. In the time for frozen PE, EEH core
+		 * enable IO path for that before collecting logs,
+		 * but it ruins the site. So we have to dump the
+		 * log in advance here.
+		 */
+		if ((ret == EEH_NEXT_ERR_FROZEN_PE  ||
+		    ret == EEH_NEXT_ERR_FENCED_PHB) &&
+		    !((*pe)->state & EEH_PE_ISOLATED)) {
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+			ioda_eeh_phb_diag(hose);
+		}
+
+		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
 		 * Otherwise, we need actions to be taken by upper
@@ -844,7 +832,6 @@
 	.set_option		= ioda_eeh_set_option,
 	.get_state		= ioda_eeh_get_state,
 	.reset			= ioda_eeh_reset,
-	.get_log		= ioda_eeh_get_log,
 	.configure_bridge	= ioda_eeh_configure_bridge,
 	.next_error		= ioda_eeh_next_error
 };
...	...	@@ -114,6 +114,7 @@
114	114	ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
115	115	#endif /* CONFIG_DEBUG_FS */
116	116
	117	+
117	118	/**
118	119	* ioda_eeh_post_init - Chip dependent post initialization
119	120	* @hose: PCI controller
...	...	@@ -221,6 +222,22 @@
221	222	return ret;
222	223	}
223	224
	225	+static void ioda_eeh_phb_diag(struct pci_controller *hose)
	226	+{
	227	+ struct pnv_phb *phb = hose->private_data;
	228	+ long rc;
	229	+
	230	+ rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
	231	+ PNV_PCI_DIAG_BUF_SIZE);
	232	+ if (rc != OPAL_SUCCESS) {
	233	+ pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
	234	+ __func__, hose->global_number, rc);
	235	+ return;
	236	+ }
	237	+
	238	+ pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
	239	+}
	240	+
224	241	/**
225	242	* ioda_eeh_get_state - Retrieve the state of PE
226	243	* @pe: EEH PE
...	...	@@ -272,6 +289,9 @@
272	289	result \|= EEH_STATE_DMA_ACTIVE;
273	290	result \|= EEH_STATE_MMIO_ENABLED;
274	291	result \|= EEH_STATE_DMA_ENABLED;
	292	+ } else if (!(pe->state & EEH_PE_ISOLATED)) {
	293	+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
	294	+ ioda_eeh_phb_diag(hose);
275	295	}
276	296
277	297	return result;
...	...	@@ -315,6 +335,15 @@
315	335	__func__, fstate, hose->global_number, pe_no);
316	336	}
317	337
	338	+ /* Dump PHB diag-data for frozen PE */
	339	+ if (result != EEH_STATE_NOT_SUPPORT &&
	340	+ (result & (EEH_STATE_MMIO_ACTIVE \| EEH_STATE_DMA_ACTIVE)) !=
	341	+ (EEH_STATE_MMIO_ACTIVE \| EEH_STATE_DMA_ACTIVE) &&
	342	+ !(pe->state & EEH_PE_ISOLATED)) {
	343	+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
	344	+ ioda_eeh_phb_diag(hose);
	345	+ }
	346	+
318	347	return result;
319	348	}
320	349
...	...	@@ -530,45 +559,6 @@
530	559	}
531	560
532	561	/**
533		- * ioda_eeh_get_log - Retrieve error log
534		- * @pe: EEH PE
535		- * @severity: Severity level of the log
536		- * @drv_log: buffer to store the log
537		- * @len: space of the log buffer
538		- *
539		- * The function is used to retrieve error log from P7IOC.
540		- */
541		-static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
542		- char *drv_log, unsigned long len)
543		-{
544		- s64 ret;
545		- unsigned long flags;
546		- struct pci_controller *hose = pe->phb;
547		- struct pnv_phb *phb = hose->private_data;
548		-
549		- spin_lock_irqsave(&phb->lock, flags);
550		-
551		- ret = opal_pci_get_phb_diag_data2(phb->opal_id,
552		- phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
553		- if (ret) {
554		- spin_unlock_irqrestore(&phb->lock, flags);
555		- pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
556		- __func__, hose->global_number, pe->addr, ret);
557		- return -EIO;
558		- }
559		-
560		- /*
561		- * FIXME: We probably need log the error in somewhere.
562		- * Lets make it up in future.
563		- */
564		- /* pr_info("%s", phb->diag.blob); */
565		-
566		- spin_unlock_irqrestore(&phb->lock, flags);
567		-
568		- return 0;
569		-}
570		-
571		-/**
572	562	* ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
573	563	* @pe: EEH PE
574	564	*
...	...	@@ -649,22 +639,6 @@
649	639	}
650	640	}
651	641
652		-static void ioda_eeh_phb_diag(struct pci_controller *hose)
653		-{
654		- struct pnv_phb *phb = hose->private_data;
655		- long rc;
656		-
657		- rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
658		- PNV_PCI_DIAG_BUF_SIZE);
659		- if (rc != OPAL_SUCCESS) {
660		- pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
661		- __func__, hose->global_number, rc);
662		- return;
663		- }
664		-
665		- pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
666		-}
667		-
668	642	static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
669	643	struct eeh_pe **pe)
670	644	{
...	...	@@ -827,6 +801,20 @@
827	801	}
828	802
829	803	/*
	804	+ * EEH core will try recover from fenced PHB or
	805	+ * frozen PE. In the time for frozen PE, EEH core
	806	+ * enable IO path for that before collecting logs,
	807	+ * but it ruins the site. So we have to dump the
	808	+ * log in advance here.
	809	+ */
	810	+ if ((ret == EEH_NEXT_ERR_FROZEN_PE \|\|
	811	+ ret == EEH_NEXT_ERR_FENCED_PHB) &&
	812	+ !((*pe)->state & EEH_PE_ISOLATED)) {
	813	+ eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
	814	+ ioda_eeh_phb_diag(hose);
	815	+ }
	816	+
	817	+ /*
830	818	* If we have no errors on the specific PHB or only
831	819	* informative error there, we continue poking it.
832	820	* Otherwise, we need actions to be taken by upper
...	...	@@ -844,7 +832,6 @@
844	832	.set_option = ioda_eeh_set_option,
845	833	.get_state = ioda_eeh_get_state,
846	834	.reset = ioda_eeh_reset,
847		- .get_log = ioda_eeh_get_log,
848	835	.configure_bridge = ioda_eeh_configure_bridge,
849	836	.next_error = ioda_eeh_next_error
850	837	};