Commit f4347553b30ec66530bfe63c84530afea3803396

Authored by Borislav Petkov
1 parent 98a5ae2d99

amd64_edac: Remove polling mechanism

Reuse the mcheck core's machine check polling mechanism instead of
duplicating that functionality with the EDAC polling routine.

While at it, drop the leading space from the pr_emerg() messages in
edac_mce_amd.c.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Acked-by: Doug Thompson <dougthompson@xmission.com>

Showing 2 changed files with 8 additions and 126 deletions.

drivers/edac/amd64_edac.c
... ... @@ -1979,107 +1979,6 @@
1979 1979 }
1980 1980  
1981 1981 /*
1982   - * Check for valid error in the NB Status High register. If so, proceed to read
1983   - * NB Status Low, NB Address Low and NB Address High registers and store data
1984   - * into error structure.
1985   - *
1986   - * Returns:
1987   - * - 1: if hardware regs contains valid error info
1988   - * - 0: if no valid error is indicated
1989   - */
1990   -static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
1991   - struct err_regs *regs)
1992   -{
1993   - struct amd64_pvt *pvt;
1994   - struct pci_dev *misc_f3_ctl;
1995   -
1996   - pvt = mci->pvt_info;
1997   - misc_f3_ctl = pvt->misc_f3_ctl;
1998   -
1999   - if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSH, &regs->nbsh))
2000   - return 0;
2001   -
2002   - if (!(regs->nbsh & K8_NBSH_VALID_BIT))
2003   - return 0;
2004   -
2005   - /* valid error, read remaining error information registers */
2006   - if (amd64_read_pci_cfg(misc_f3_ctl, K8_NBSL, &regs->nbsl) ||
2007   - amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAL, &regs->nbeal) ||
2008   - amd64_read_pci_cfg(misc_f3_ctl, K8_NBEAH, &regs->nbeah) ||
2009   - amd64_read_pci_cfg(misc_f3_ctl, K8_NBCFG, &regs->nbcfg))
2010   - return 0;
2011   -
2012   - return 1;
2013   -}
2014   -
2015   -/*
2016   - * This function is called to retrieve the error data from hardware and store it
2017   - * in the info structure.
2018   - *
2019   - * Returns:
2020   - * - 1: if a valid error is found
2021   - * - 0: if no error is found
2022   - */
2023   -static int amd64_get_error_info(struct mem_ctl_info *mci,
2024   - struct err_regs *info)
2025   -{
2026   - struct amd64_pvt *pvt;
2027   - struct err_regs regs;
2028   -
2029   - pvt = mci->pvt_info;
2030   -
2031   - if (!amd64_get_error_info_regs(mci, info))
2032   - return 0;
2033   -
2034   - /*
2035   - * Here's the problem with the K8's EDAC reporting: There are four
2036   - * registers which report pieces of error information. They are shared
2037   - * between CEs and UEs. Furthermore, contrary to what is stated in the
2038   - * BKDG, the overflow bit is never used! Every error always updates the
2039   - * reporting registers.
2040   - *
2041   - * Can you see the race condition? All four error reporting registers
2042   - * must be read before a new error updates them! There is no way to read
2043   - * all four registers atomically. The best that can be done is to detect
2044   - * that a race has occurred and then report the error without any kind of
2045   - * precision.
2046   - *
2047   - * What is still positive is that errors are still reported and thus
2048   - * problems can still be detected - just not localized because the
2049   - * syndrome and address are spread out across registers.
2050   - *
2051   - * Grrrrr!!!!! Here's hoping that AMD fixes this in some future K8 rev.
2052   - * UEs and CEs should have separate register sets with proper overflow
2053   - * bits that are used! At very least the problem can be fixed by
2054   - * honoring the ErrValid bit in 'nbsh' and not updating registers - just
2055   - * set the overflow bit - unless the current error is CE and the new
2056   - * error is UE which would be the only situation for overwriting the
2057   - * current values.
2058   - */
2059   -
2060   - regs = *info;
2061   -
2062   - /* Use info from the second read - most current */
2063   - if (unlikely(!amd64_get_error_info_regs(mci, info)))
2064   - return 0;
2065   -
2066   - /* clear the error bits in hardware */
2067   - pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
2068   -
2069   - /* Check for the possible race condition */
2070   - if ((regs.nbsh != info->nbsh) ||
2071   - (regs.nbsl != info->nbsl) ||
2072   - (regs.nbeah != info->nbeah) ||
2073   - (regs.nbeal != info->nbeal)) {
2074   - amd64_mc_printk(mci, KERN_WARNING,
2075   - "hardware STATUS read access race condition "
2076   - "detected!\n");
2077   - return 0;
2078   - }
2079   - return 1;
2080   -}
2081   -
2082   -/*
2083 1982 * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
2084 1983 * ADDRESS and process.
2085 1984 */
... ... @@ -2203,20 +2102,6 @@
2203 2102 }
2204 2103  
2205 2104 /*
2206   - * The main polling 'check' function, called FROM the edac core to perform the
2207   - * error checking and if an error is encountered, error processing.
2208   - */
2209   -static void amd64_check(struct mem_ctl_info *mci)
2210   -{
2211   - struct err_regs regs;
2212   -
2213   - if (amd64_get_error_info(mci, &regs)) {
2214   - struct amd64_pvt *pvt = mci->pvt_info;
2215   - amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
2216   - }
2217   -}
2218   -
2219   -/*
2220 2105 * Input:
2221 2106 * 1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
2222 2107 * 2) AMD Family index value
... ... @@ -2755,9 +2640,6 @@
2755 2640 mci->ctl_name = get_amd_family_name(pvt->mc_type_index);
2756 2641 mci->dev_name = pci_name(pvt->dram_f2_ctl);
2757 2642 mci->ctl_page_to_phys = NULL;
2758   -
2759   - /* IMPORTANT: Set the polling 'check' function in this module */
2760   - mci->edac_check = amd64_check;
2761 2643  
2762 2644 /* memory scrubber interface */
2763 2645 mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
drivers/edac/edac_mce_amd.c
... ... @@ -133,7 +133,7 @@
133 133 u32 ec = mc0_status & 0xffff;
134 134 u32 xec = (mc0_status >> 16) & 0xf;
135 135  
136   - pr_emerg(" Data Cache Error");
  136 + pr_emerg("Data Cache Error");
137 137  
138 138 if (xec == 1 && TLB_ERROR(ec))
139 139 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
... ... @@ -176,7 +176,7 @@
176 176 u32 ec = mc1_status & 0xffff;
177 177 u32 xec = (mc1_status >> 16) & 0xf;
178 178  
179   - pr_emerg(" Instruction Cache Error");
  179 + pr_emerg("Instruction Cache Error");
180 180  
181 181 if (xec == 1 && TLB_ERROR(ec))
182 182 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
... ... @@ -233,7 +233,7 @@
233 233 u32 ec = mc2_status & 0xffff;
234 234 u32 xec = (mc2_status >> 16) & 0xf;
235 235  
236   - pr_emerg(" Bus Unit Error");
  236 + pr_emerg("Bus Unit Error");
237 237  
238 238 if (xec == 0x1)
239 239 pr_cont(" in the write data buffers.\n");
... ... @@ -275,7 +275,7 @@
275 275 u32 ec = mc3_status & 0xffff;
276 276 u32 xec = (mc3_status >> 16) & 0xf;
277 277  
278   - pr_emerg(" Load Store Error");
  278 + pr_emerg("Load Store Error");
279 279  
280 280 if (xec == 0x0) {
281 281 u8 rrrr = (ec >> 4) & 0xf;
... ... @@ -304,7 +304,7 @@
304 304 if (TLB_ERROR(ec) && !report_gart_errors)
305 305 return;
306 306  
307   - pr_emerg(" Northbridge Error, node %d", node_id);
  307 + pr_emerg("Northbridge Error, node %d", node_id);
308 308  
309 309 /*
310 310 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
311 311  
312 312  
... ... @@ -342,13 +342,13 @@
342 342 static inline void amd_decode_err_code(unsigned int ec)
343 343 {
344 344 if (TLB_ERROR(ec)) {
345   - pr_emerg(" Transaction: %s, Cache Level %s\n",
  345 + pr_emerg("Transaction: %s, Cache Level %s\n",
346 346 TT_MSG(ec), LL_MSG(ec));
347 347 } else if (MEM_ERROR(ec)) {
348   - pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
  348 + pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
349 349 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
350 350 } else if (BUS_ERROR(ec)) {
351   - pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
  351 + pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
352 352 "Participating Processor: %s\n",
353 353 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
354 354 PP_MSG(ec));