amd64_edac: Remove polling mechanism

Switch to reusing the mcheck core's machine check polling mechanism instead of duplicating functionality by using the EDAC polling routine. Correct formatting while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>

amd64_edac: Remove polling mechanism
Switch to reusing the mcheck core's machine check polling mechanism instead of duplicating functionality by using the EDAC polling routine. Correct formatting while at it.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>
Borislav Petkov
1 parent 98a5ae2d99
Showing 2 changed files with 8 additions and 126 deletions Side-by-side Diff
drivers/edac/amd64_edac.c
drivers/edac/edac_mce_amd.c
@@ -1979,107 +1979,6 @@
 }
  
 /*
- * Check for valid error in the NB Status High register. If so, proceed to read
- * NB Status Low, NB Address Low and NB Address High registers and store data
- * into error structure.
- *
- * Returns:
- *	- 1: if hardware regs contains valid error info
- *	- 0: if no valid error is indicated
- */
-static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
-				     struct err_regs *regs)
-{
-	struct amd64_pvt *pvt;
-	struct pci_dev *misc_f3_ctl;
-
-	pvt = mci->pvt_info;
-	misc_f3_ctl = pvt->misc_f3_ctl;
-
-	if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh))
-		return 0;
-
-	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
-		return 0;
-
-	/* valid error, read remaining error information registers */
-	if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
-	    amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
-	    amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
-	    amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
-		return 0;
-
-	return 1;
-}
-
-/*
- * This function is called to retrieve the error data from hardware and store it
- * in the info structure.
- *
- * Returns:
- *	- 1: if a valid error is found
- *	- 0: if no error is found
- */
-static int amd64_get_error_info(struct mem_ctl_info *mci,
-				struct err_regs *info)
-{
-	struct amd64_pvt *pvt;
-	struct err_regs regs;
-
-	pvt = mci->pvt_info;
-
-	if (!amd64_get_error_info_regs(mci, info))
-		return 0;
-
-	/*
-	 * Here's the problem with the K8's EDAC reporting: There are four
-	 * registers which report pieces of error information. They are shared
-	 * between CEs and UEs. Furthermore, contrary to what is stated in the
-	 * BKDG, the overflow bit is never used! Every error always updates the
-	 * reporting registers.
-	 *
-	 * Can you see the race condition? All four error reporting registers
-	 * must be read before a new error updates them! There is no way to read
-	 * all four registers atomically. The best than can be done is to detect
-	 * that a race has occured and then report the error without any kind of
-	 * precision.
-	 *
-	 * What is still positive is that errors are still reported and thus
-	 * problems can still be detected - just not localized because the
-	 * syndrome and address are spread out across registers.
-	 *
-	 * Grrrrr!!!!!  Here's hoping that AMD fixes this in some future K8 rev.
-	 * UEs and CEs should have separate register sets with proper overflow
-	 * bits that are used! At very least the problem can be fixed by
-	 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
-	 * set the overflow bit - unless the current error is CE and the new
-	 * error is UE which would be the only situation for overwriting the
-	 * current values.
-	 */
-
-	regs = *info;
-
-	/* Use info from the second read - most current */
-	if (unlikely(!amd64_get_error_info_regs(mci, info)))
-		return 0;
-
-	/* clear the error bits in hardware */
-	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
-
-	/* Check for the possible race condition */
-	if ((regs.nbsh != info->nbsh) ||
-	     (regs.nbsl != info->nbsl) ||
-	     (regs.nbeah != info->nbeah) ||
-	     (regs.nbeal != info->nbeal)) {
-		amd64_mc_printk(mci, KERN_WARNING,
-				"hardware STATUS read access race condition "
-				"detected!\n");
-		return 0;
-	}
-	return 1;
-}
-
-/*
  * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
  * ADDRESS and process.
  */
@@ -2203,20 +2102,6 @@
 }
  
 /*
- * The main polling 'check' function, called FROM the edac core to perform the
- * error checking and if an error is encountered, error processing.
- */
-static void amd64_check(struct mem_ctl_info *mci)
-{
-	struct err_regs regs;
-
-	if (amd64_get_error_info(mci, &regs)) {
-		struct amd64_pvt *pvt = mci->pvt_info;
-		amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
-	}
-}
-
-/*
  * Input:
  *	1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
  *	2) AMD Family index value
@@ -2755,9 +2640,6 @@
 	mci->ctl_name		= get_amd_family_name(pvt->mc_type_index);
 	mci->dev_name		= pci_name(pvt->dram_f2_ctl);
 	mci->ctl_page_to_phys	= NULL;
-
-	/* IMPORTANT: Set the polling 'check' function in this module */
-	mci->edac_check		= amd64_check;
  
 	/* memory scrubber interface */
 	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
@@ -133,7 +133,7 @@
 	u32 ec  = mc0_status & 0xffff;
 	u32 xec = (mc0_status >> 16) & 0xf;
  
-	pr_emerg(" Data Cache Error");
+	pr_emerg("Data Cache Error");
  
 	if (xec == 1 && TLB_ERROR(ec))
 		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -176,7 +176,7 @@
 	u32 ec  = mc1_status & 0xffff;
 	u32 xec = (mc1_status >> 16) & 0xf;
  
-	pr_emerg(" Instruction Cache Error");
+	pr_emerg("Instruction Cache Error");
  
 	if (xec == 1 && TLB_ERROR(ec))
 		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
@@ -233,7 +233,7 @@
 	u32 ec = mc2_status & 0xffff;
 	u32 xec = (mc2_status >> 16) & 0xf;
  
-	pr_emerg(" Bus Unit Error");
+	pr_emerg("Bus Unit Error");
  
 	if (xec == 0x1)
 		pr_cont(" in the write data buffers.\n");
@@ -275,7 +275,7 @@
 	u32 ec  = mc3_status & 0xffff;
 	u32 xec = (mc3_status >> 16) & 0xf;
  
-	pr_emerg(" Load Store Error");
+	pr_emerg("Load Store Error");
  
 	if (xec == 0x0) {
 		u8 rrrr = (ec >> 4) & 0xf;
@@ -304,7 +304,7 @@
 	if (TLB_ERROR(ec) && !report_gart_errors)
 		return;
  
-	pr_emerg(" Northbridge Error, node %d", node_id);
+	pr_emerg("Northbridge Error, node %d", node_id);
  
 	/*
 	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
  
  
@@ -342,13 +342,13 @@
 static inline void amd_decode_err_code(unsigned int ec)
 {
 	if (TLB_ERROR(ec)) {
-		pr_emerg(" Transaction: %s, Cache Level %s\n",
+		pr_emerg("Transaction: %s, Cache Level %s\n",
 			 TT_MSG(ec), LL_MSG(ec));
 	} else if (MEM_ERROR(ec)) {
-		pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
+		pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
 			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
 	} else if (BUS_ERROR(ec)) {
-		pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+		pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
 			 "Participating Processor: %s\n",
 			  RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
 			  PP_MSG(ec));
...	...	@@ -1979,107 +1979,6 @@
1979	1979	}
1980	1980
1981	1981	/*
1982		- * Check for valid error in the NB Status High register. If so, proceed to read
1983		- * NB Status Low, NB Address Low and NB Address High registers and store data
1984		- * into error structure.
1985		- *
1986		- * Returns:
1987		- * - 1: if hardware regs contains valid error info
1988		- * - 0: if no valid error is indicated
1989		- */
1990		-static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
1991		- struct err_regs *regs)
1992		-{
1993		- struct amd64_pvt *pvt;
1994		- struct pci_dev *misc_f3_ctl;
1995		-
1996		- pvt = mci->pvt_info;
1997		- misc_f3_ctl = pvt->misc_f3_ctl;
1998		-
1999		- if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh))
2000		- return 0;
2001		-
2002		- if (!(regs->nbsh & K8_NBSH_VALID_BIT))
2003		- return 0;
2004		-
2005		- /* valid error, read remaining error information registers */
2006		- if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
2007		- amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
2008		- amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
2009		- amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
2010		- return 0;
2011		-
2012		- return 1;
2013		-}
2014		-
2015		-/*
2016		- * This function is called to retrieve the error data from hardware and store it
2017		- * in the info structure.
2018		- *
2019		- * Returns:
2020		- * - 1: if a valid error is found
2021		- * - 0: if no error is found
2022		- */
2023		-static int amd64_get_error_info(struct mem_ctl_info *mci,
2024		- struct err_regs *info)
2025		-{
2026		- struct amd64_pvt *pvt;
2027		- struct err_regs regs;
2028		-
2029		- pvt = mci->pvt_info;
2030		-
2031		- if (!amd64_get_error_info_regs(mci, info))
2032		- return 0;
2033		-
2034		- /*
2035		- * Here's the problem with the K8's EDAC reporting: There are four
2036		- * registers which report pieces of error information. They are shared
2037		- * between CEs and UEs. Furthermore, contrary to what is stated in the
2038		- * BKDG, the overflow bit is never used! Every error always updates the
2039		- * reporting registers.
2040		- *
2041		- * Can you see the race condition? All four error reporting registers
2042		- * must be read before a new error updates them! There is no way to read
2043		- * all four registers atomically. The best than can be done is to detect
2044		- * that a race has occured and then report the error without any kind of
2045		- * precision.
2046		- *
2047		- * What is still positive is that errors are still reported and thus
2048		- * problems can still be detected - just not localized because the
2049		- * syndrome and address are spread out across registers.
2050		- *
2051		- * Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
2052		- * UEs and CEs should have separate register sets with proper overflow
2053		- * bits that are used! At very least the problem can be fixed by
2054		- * honoring the ErrValid bit in 'nbsh' and not updating registers - just
2055		- * set the overflow bit - unless the current error is CE and the new
2056		- * error is UE which would be the only situation for overwriting the
2057		- * current values.
2058		- */
2059		-
2060		- regs = *info;
2061		-
2062		- /* Use info from the second read - most current */
2063		- if (unlikely(!amd64_get_error_info_regs(mci, info)))
2064		- return 0;
2065		-
2066		- /* clear the error bits in hardware */
2067		- pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
2068		-
2069		- /* Check for the possible race condition */
2070		- if ((regs.nbsh != info->nbsh) ||
2071		- (regs.nbsl != info->nbsl) ||
2072		- (regs.nbeah != info->nbeah) ||
2073		- (regs.nbeal != info->nbeal)) {
2074		- amd64_mc_printk(mci, KERN_WARNING,
2075		- "hardware STATUS read access race condition "
2076		- "detected!\n");
2077		- return 0;
2078		- }
2079		- return 1;
2080		-}
2081		-
2082		-/*
2083	1982	* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
2084	1983	* ADDRESS and process.
2085	1984	*/
...	...	@@ -2203,20 +2102,6 @@
2203	2102	}
2204	2103
2205	2104	/*
2206		- * The main polling 'check' function, called FROM the edac core to perform the
2207		- * error checking and if an error is encountered, error processing.
2208		- */
2209		-static void amd64_check(struct mem_ctl_info *mci)
2210		-{
2211		- struct err_regs regs;
2212		-
2213		- if (amd64_get_error_info(mci, &regs)) {
2214		- struct amd64_pvt *pvt = mci->pvt_info;
2215		- amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2216		- }
2217		-}
2218		-
2219		-/*
2220	2105	* Input:
2221	2106	* 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
2222	2107	* 2) AMD Family index value
...	...	@@ -2755,9 +2640,6 @@
2755	2640	mci->ctl_name = get_amd_family_name(pvt->mc_type_index);
2756	2641	mci->dev_name = pci_name(pvt->dram_f2_ctl);
2757	2642	mci->ctl_page_to_phys = NULL;
2758		-
2759		- /* IMPORTANT: Set the polling 'check' function in this module */
2760		- mci->edac_check = amd64_check;
2761	2643
2762	2644	/* memory scrubber interface */
2763	2645	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
...	...	@@ -133,7 +133,7 @@
133	133	u32 ec = mc0_status & 0xffff;
134	134	u32 xec = (mc0_status >> 16) & 0xf;
135	135
136		- pr_emerg(" Data Cache Error");
	136	+ pr_emerg("Data Cache Error");
137	137
138	138	if (xec == 1 && TLB_ERROR(ec))
139	139	pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
...	...	@@ -176,7 +176,7 @@
176	176	u32 ec = mc1_status & 0xffff;
177	177	u32 xec = (mc1_status >> 16) & 0xf;
178	178
179		- pr_emerg(" Instruction Cache Error");
	179	+ pr_emerg("Instruction Cache Error");
180	180
181	181	if (xec == 1 && TLB_ERROR(ec))
182	182	pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
...	...	@@ -233,7 +233,7 @@
233	233	u32 ec = mc2_status & 0xffff;
234	234	u32 xec = (mc2_status >> 16) & 0xf;
235	235
236		- pr_emerg(" Bus Unit Error");
	236	+ pr_emerg("Bus Unit Error");
237	237
238	238	if (xec == 0x1)
239	239	pr_cont(" in the write data buffers.\n");
...	...	@@ -275,7 +275,7 @@
275	275	u32 ec = mc3_status & 0xffff;
276	276	u32 xec = (mc3_status >> 16) & 0xf;
277	277
278		- pr_emerg(" Load Store Error");
	278	+ pr_emerg("Load Store Error");
279	279
280	280	if (xec == 0x0) {
281	281	u8 rrrr = (ec >> 4) & 0xf;
...	...	@@ -304,7 +304,7 @@
304	304	if (TLB_ERROR(ec) && !report_gart_errors)
305	305	return;
306	306
307		- pr_emerg(" Northbridge Error, node %d", node_id);
	307	+ pr_emerg("Northbridge Error, node %d", node_id);
308	308
309	309	/*
310	310	* F10h, revD can disable ErrCpu[3:0] so check that first and also the
311	311
312	312
...	...	@@ -342,13 +342,13 @@
342	342	static inline void amd_decode_err_code(unsigned int ec)
343	343	{
344	344	if (TLB_ERROR(ec)) {
345		- pr_emerg(" Transaction: %s, Cache Level %s\n",
	345	+ pr_emerg("Transaction: %s, Cache Level %s\n",
346	346	TT_MSG(ec), LL_MSG(ec));
347	347	} else if (MEM_ERROR(ec)) {
348		- pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
	348	+ pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
349	349	RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
350	350	} else if (BUS_ERROR(ec)) {
351		- pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
	351	+ pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
352	352	"Participating Processor: %s\n",
353	353	RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
354	354	PP_MSG(ec));