Commit 549d042df240dfb4203bab40ad44f9336751b7d6

Authored by Borislav Petkov
1 parent ecaf5606de

x86, mce: pass mce info to EDAC for decoding

Move NB decoder along with required defines to EDAC MCE core. Add
registration routines for further decoding of the MCE info in the AMD64
EDAC module.

CC: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

Showing 6 changed files with 185 additions and 111 deletions (side-by-side diff)

arch/x86/kernel/cpu/mcheck/mce.c
... ... @@ -183,6 +183,11 @@
183 183 set_bit(0, &mce_need_notify);
184 184 }
185 185  
  186 +void __weak decode_mce(struct mce *m)
  187 +{
  188 + return;
  189 +}
  190 +
186 191 static void print_mce(struct mce *m)
187 192 {
188 193 printk(KERN_EMERG
... ... @@ -205,6 +210,8 @@
205 210 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 211 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 212 m->apicid);
  213 +
  214 + decode_mce(m);
208 215 }
209 216  
210 217 static void print_mce_head(void)
drivers/edac/amd64_edac.c
... ... @@ -2282,8 +2282,8 @@
2282 2282 }
2283 2283 }
2284 2284  
2285   -static void amd64_decode_bus_error(struct mem_ctl_info *mci,
2286   - struct err_regs *info, int ecc_type)
  2285 +static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
  2286 + struct err_regs *info, int ecc_type)
2287 2287 {
2288 2288 u32 ec = ERROR_CODE(info->nbsl);
2289 2289 u32 xec = EXT_ERROR_CODE(info->nbsl);
2290 2290  
2291 2291  
2292 2292  
2293 2293  
2294 2294  
2295 2295  
... ... @@ -2316,86 +2316,23 @@
2316 2316 edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
2317 2317 }
2318 2318  
2319   -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
2320   - int handle_errors)
  2319 +void amd64_decode_bus_error(int node_id, struct err_regs *regs,
  2320 + int ecc_type)
2321 2321 {
2322   - struct amd64_pvt *pvt = mci->pvt_info;
2323   - int ecc;
2324   - u32 ec = ERROR_CODE(regs->nbsl);
2325   - u32 xec = EXT_ERROR_CODE(regs->nbsl);
  2322 + struct mem_ctl_info *mci = mci_lookup[node_id];
2326 2323  
2327   - if (!handle_errors)
2328   - return;
  2324 + __amd64_decode_bus_error(mci, regs, ecc_type);
2329 2325  
2330   - pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
2331   -
2332 2326 /*
2333   - * F10h, revD can disable ErrCpu[3:0] so check that first and also the
2334   - * value encoding has changed so interpret those differently
2335   - */
2336   - if ((boot_cpu_data.x86 == 0x10) &&
2337   - (boot_cpu_data.x86_model > 8)) {
2338   - if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
2339   - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
2340   - } else {
2341   - pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
2342   - }
2343   -
2344   - pr_emerg(" Error: %sorrected",
2345   - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
2346   - pr_cont(", Report Error: %s",
2347   - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
2348   - pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
2349   - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
2350   - ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
2351   -
2352   - /* do the two bits[14:13] together */
2353   - ecc = regs->nbsh & (0x3 << 13);
2354   - if (ecc)
2355   - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
2356   -
2357   - pr_cont("\n");
2358   -
2359   - if (TLB_ERROR(ec)) {
2360   - /*
2361   - * GART errors are intended to help graphics driver developers
2362   - * to detect bad GART PTEs. It is recommended by AMD to disable
2363   - * GART table walk error reporting by default[1] (currently
2364   - * being disabled in mce_cpu_quirks()) and according to the
2365   - * comment in mce_cpu_quirks(), such GART errors can be
2366   - * incorrectly triggered. We may see these errors anyway and
2367   - * unless requested by the user, they won't be reported.
2368   - *
2369   - * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
2370   - * AMD NPT family 0Fh processors
2371   - */
2372   - if (!report_gart_errors)
2373   - return;
2374   -
2375   - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
2376   - TT_MSG(ec), LL_MSG(ec));
2377   - } else if (MEM_ERROR(ec)) {
2378   - pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
2379   - " Cache Level: %s",
2380   - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
2381   - } else if (BUS_ERROR(ec)) {
2382   - pr_emerg(" Bus (Link/DRAM) error\n");
2383   - amd64_decode_bus_error(mci, regs, ecc);
2384   - } else {
2385   - /* shouldn't reach here! */
2386   - amd64_mc_printk(mci, KERN_WARNING,
2387   - "%s(): unknown MCE error 0x%x\n", __func__, ec);
2388   - }
2389   -
2390   - pr_emerg("%s.\n", EXT_ERR_MSG(xec));
2391   -
2392   - /*
2393 2327 * Check the UE bit of the NB status high register, if set generate some
2394 2328 * logs. If NOT a GART error, then process the event as a NO-INFO event.
2395 2329 * If it was a GART error, skip that process.
  2330 + *
  2331 + * FIXME: this should go somewhere else, if at all.
2396 2332 */
2397 2333 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2398 2334 edac_mc_handle_ue_no_info(mci, "UE bit is set");
  2335 +
2399 2336 }
2400 2337  
2401 2338 /*
... ... @@ -2406,8 +2343,10 @@
2406 2343 {
2407 2344 struct err_regs regs;
2408 2345  
2409   - if (amd64_get_error_info(mci, &regs))
2410   - amd64_decode_nb_mce(mci, &regs, 1);
  2346 + if (amd64_get_error_info(mci, &regs)) {
  2347 + struct amd64_pvt *pvt = mci->pvt_info;
  2348 + amd_decode_nb_mce(pvt->mc_node_id, &regs, 1);
  2349 + }
2411 2350 }
2412 2351  
2413 2352 /*
... ... @@ -3103,6 +3042,13 @@
3103 3042  
3104 3043 mci_lookup[node_id] = mci;
3105 3044 pvt_lookup[node_id] = NULL;
  3045 +
  3046 + /* register stuff with EDAC MCE */
  3047 + if (report_gart_errors)
  3048 + amd_report_gart_errors(true);
  3049 +
  3050 + amd_register_ecc_decoder(amd64_decode_bus_error);
  3051 +
3106 3052 return 0;
3107 3053  
3108 3054 err_add_mc:
... ... @@ -3168,6 +3114,10 @@
3168 3114 mci->pvt_info = NULL;
3169 3115  
3170 3116 mci_lookup[pvt->mc_node_id] = NULL;
  3117 +
  3118 + /* unregister from EDAC MCE */
  3119 + amd_report_gart_errors(false);
  3120 + amd_unregister_ecc_decoder(amd64_decode_bus_error);
3171 3121  
3172 3122 /* Free the EDAC CORE resources */
3173 3123 edac_mc_free(mci);
drivers/edac/amd64_edac.h
... ... @@ -346,24 +346,8 @@
346 346 #define K8_NBSL_PP_OBS 0x2
347 347 #define K8_NBSL_PP_GENERIC 0x3
348 348  
349   -
350   -#define K8_NBSH 0x4C
351   -
352   -#define K8_NBSH_VALID_BIT BIT(31)
353   -#define K8_NBSH_OVERFLOW BIT(30)
354   -#define K8_NBSH_UC_ERR BIT(29)
355   -#define K8_NBSH_ERR_EN BIT(28)
356   -#define K8_NBSH_MISCV BIT(27)
357   -#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
358   -#define K8_NBSH_PCC BIT(25)
359   -#define K8_NBSH_ERR_CPU_VAL BIT(24)
360   -#define K8_NBSH_CECC BIT(14)
361   -#define K8_NBSH_UECC BIT(13)
362   -#define K8_NBSH_ERR_SCRUBER BIT(8)
363   -
364 349 #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
365 350  
366   -
367 351 #define K8_NBEAL 0x50
368 352 #define K8_NBEAH 0x54
369 353 #define K8_SCRCTRL 0x58
... ... @@ -428,23 +412,6 @@
428 412 F11_CPUS,
429 413 };
430 414  
431   -/*
432   - * Structure to hold:
433   - *
434   - * 1) dynamically read status and error address HW registers
435   - * 2) sysfs entered values
436   - * 3) MCE values
437   - *
438   - * Depends on entry into the modules
439   - */
440   -struct err_regs {
441   - u32 nbcfg;
442   - u32 nbsh;
443   - u32 nbsl;
444   - u32 nbeah;
445   - u32 nbeal;
446   -};
447   -
448 415 /* Error injection control structure */
449 416 struct error_injection {
450 417 u32 section;
... ... @@ -609,9 +576,6 @@
609 576 #define K8_MIN_SCRUB_RATE_BITS 0x0
610 577 #define F10_MIN_SCRUB_RATE_BITS 0x5
611 578 #define F11_MIN_SCRUB_RATE_BITS 0x6
612   -
613   -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
614   - int handle_errors);
615 579  
616 580 int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
617 581 u64 *hole_offset, u64 *hole_size);
drivers/edac/amd64_edac_dbg.c
... ... @@ -24,7 +24,7 @@
24 24  
25 25 /* Process the Mapping request */
26 26 /* TODO: Add race prevention */
27   - amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1);
  27 + amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
28 28  
29 29 return count;
30 30 }
drivers/edac/edac_mce_amd.c
1 1 #include <linux/module.h>
2 2 #include "edac_mce_amd.h"
3 3  
  4 +static bool report_gart_errors;
  5 +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type);
  6 +
  7 +void amd_report_gart_errors(bool v)
  8 +{
  9 + report_gart_errors = v;
  10 +}
  11 +EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  12 +
  13 +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int))
  14 +{
  15 + nb_bus_decoder = f;
  16 +}
  17 +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  18 +
  19 +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int))
  20 +{
  21 + if (nb_bus_decoder) {
  22 + WARN_ON(nb_bus_decoder != f);
  23 +
  24 + nb_bus_decoder = NULL;
  25 + }
  26 +}
  27 +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  28 +
4 29 /*
5 30 * string representation for the different MCA reported error types, see F3x48
6 31 * or MSR0000_0411.
... ... @@ -102,4 +127,94 @@
102 127 "Probe Filter error" /* 1_1111b */
103 128 };
104 129 EXPORT_SYMBOL_GPL(ext_msgs);
  130 +
  131 +void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
  132 +{
  133 + int ecc;
  134 + u32 ec = ERROR_CODE(regs->nbsl);
  135 + u32 xec = EXT_ERROR_CODE(regs->nbsl);
  136 +
  137 + if (!handle_errors)
  138 + return;
  139 +
  140 + pr_emerg(" Northbridge Error, node %d", node_id);
  141 +
  142 + /*
  143 + * F10h, revD can disable ErrCpu[3:0] so check that first and also the
  144 + * value encoding has changed so interpret those differently
  145 + */
  146 + if ((boot_cpu_data.x86 == 0x10) &&
  147 + (boot_cpu_data.x86_model > 8)) {
  148 + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
  149 + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
  150 + } else {
  151 + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
  152 + }
  153 +
  154 + pr_emerg(" Error: %sorrected",
  155 + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
  156 + pr_cont(", Report Error: %s",
  157 + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
  158 + pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
  159 + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
  160 + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
  161 +
  162 + /* do the two bits[14:13] together */
  163 + ecc = regs->nbsh & (0x3 << 13);
  164 + if (ecc)
  165 + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
  166 +
  167 + pr_cont("\n");
  168 +
  169 + if (TLB_ERROR(ec)) {
  170 + /*
  171 + * GART errors are intended to help graphics driver developers
  172 + * to detect bad GART PTEs. It is recommended by AMD to disable
  173 + * GART table walk error reporting by default[1] (currently
  174 + * being disabled in mce_cpu_quirks()) and according to the
  175 + * comment in mce_cpu_quirks(), such GART errors can be
  176 + * incorrectly triggered. We may see these errors anyway and
  177 + * unless requested by the user, they won't be reported.
  178 + *
  179 + * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
  180 + * AMD NPT family 0Fh processors
  181 + */
  182 + if (!report_gart_errors)
  183 + return;
  184 +
  185 + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
  186 + TT_MSG(ec), LL_MSG(ec));
  187 + } else if (MEM_ERROR(ec)) {
  188 + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
  189 + " Cache Level: %s",
  190 + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
  191 + } else if (BUS_ERROR(ec)) {
  192 + pr_emerg(" Bus (Link/DRAM) error\n");
  193 + if (nb_bus_decoder)
  194 + nb_bus_decoder(node_id, regs, ecc);
  195 + } else {
  196 + /* shouldn't reach here! */
  197 + pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec);
  198 + }
  199 +
  200 + pr_emerg("%s.\n", EXT_ERR_MSG(xec));
  201 +}
  202 +EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
  203 +
  204 +void decode_mce(struct mce *m)
  205 +{
  206 + struct err_regs regs;
  207 + int node;
  208 +
  209 + if (m->bank != 4)
  210 + return;
  211 +
  212 + regs.nbsl = (u32) m->status;
  213 + regs.nbsh = (u32)(m->status >> 32);
  214 + regs.nbeal = (u32) m->addr;
  215 + regs.nbeah = (u32)(m->addr >> 32);
  216 + node = topology_cpu_node_id(m->extcpu);
  217 +
  218 + amd_decode_nb_mce(node, &regs, 1);
  219 +}
drivers/edac/edac_mce_amd.h
  1 +#ifndef _EDAC_MCE_AMD_H
  2 +#define _EDAC_MCE_AMD_H
  3 +
  4 +#include <asm/mce.h>
  5 +
1 6 #define ERROR_CODE(x) ((x) & 0xffff)
2 7 #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
3 8 #define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
... ... @@ -22,6 +27,20 @@
22 27 #define PP(x) (((x) >> 9) & 0x3)
23 28 #define PP_MSG(x) pp_msgs[PP(x)]
24 29  
  30 +#define K8_NBSH 0x4C
  31 +
  32 +#define K8_NBSH_VALID_BIT BIT(31)
  33 +#define K8_NBSH_OVERFLOW BIT(30)
  34 +#define K8_NBSH_UC_ERR BIT(29)
  35 +#define K8_NBSH_ERR_EN BIT(28)
  36 +#define K8_NBSH_MISCV BIT(27)
  37 +#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
  38 +#define K8_NBSH_PCC BIT(25)
  39 +#define K8_NBSH_ERR_CPU_VAL BIT(24)
  40 +#define K8_NBSH_CECC BIT(14)
  41 +#define K8_NBSH_UECC BIT(13)
  42 +#define K8_NBSH_ERR_SCRUBER BIT(8)
  43 +
25 44 extern const char *tt_msgs[];
26 45 extern const char *ll_msgs[];
27 46 extern const char *rrrr_msgs[];
... ... @@ -29,4 +48,23 @@
29 48 extern const char *to_msgs[];
30 49 extern const char *ii_msgs[];
31 50 extern const char *ext_msgs[];
  51 +
  52 +/*
  53 + * relevant NB regs
  54 + */
  55 +struct err_regs {
  56 + u32 nbcfg;
  57 + u32 nbsh;
  58 + u32 nbsl;
  59 + u32 nbeah;
  60 + u32 nbeal;
  61 +};
  62 +
  63 +
  64 +void amd_report_gart_errors(bool);
  65 +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int));
  66 +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int));
  67 +void amd_decode_nb_mce(int, struct err_regs *, int);
  68 +
  69 +#endif /* _EDAC_MCE_AMD_H */