Commit 47ca08a40b043815134d489e21870b53276f1a4a

Authored by Borislav Petkov
Committed by Borislav Petkov
1 parent 9cdeb404a1

EDAC, MCE: Rename files

Drop "edac_" string from the filenames since they're prefixed with edac/
in their pathname anyway.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>

Showing 7 changed files with 489 additions and 488 deletions Side-by-side Diff

drivers/edac/Makefile
... ... @@ -19,6 +19,7 @@
19 19  
20 20 obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o
21 21  
  22 +edac_mce_amd-objs := mce_amd.o
22 23 obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o
23 24  
24 25 obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o
drivers/edac/amd64_edac.h
... ... @@ -72,7 +72,7 @@
72 72 #include <linux/edac.h>
73 73 #include <asm/msr.h>
74 74 #include "edac_core.h"
75   -#include "edac_mce_amd.h"
  75 +#include "mce_amd.h"
76 76  
77 77 #define amd64_printk(level, fmt, arg...) \
78 78 edac_printk(level, "amd64", fmt, ##arg)
drivers/edac/edac_mce_amd.c
1   -#include <linux/module.h>
2   -#include "edac_mce_amd.h"
3   -
4   -static bool report_gart_errors;
5   -static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
6   -
7   -void amd_report_gart_errors(bool v)
8   -{
9   - report_gart_errors = v;
10   -}
11   -EXPORT_SYMBOL_GPL(amd_report_gart_errors);
12   -
13   -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
14   -{
15   - nb_bus_decoder = f;
16   -}
17   -EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
18   -
19   -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
20   -{
21   - if (nb_bus_decoder) {
22   - WARN_ON(nb_bus_decoder != f);
23   -
24   - nb_bus_decoder = NULL;
25   - }
26   -}
27   -EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
28   -
29   -/*
30   - * string representation for the different MCA reported error types, see F3x48
31   - * or MSR0000_0411.
32   - */
33   -
34   -/* transaction type */
35   -const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
36   -EXPORT_SYMBOL_GPL(tt_msgs);
37   -
38   -/* cache level */
39   -const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
40   -EXPORT_SYMBOL_GPL(ll_msgs);
41   -
42   -/* memory transaction type */
43   -const char *rrrr_msgs[] = {
44   - "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
45   -};
46   -EXPORT_SYMBOL_GPL(rrrr_msgs);
47   -
48   -/* participating processor */
49   -const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
50   -EXPORT_SYMBOL_GPL(pp_msgs);
51   -
52   -/* request timeout */
53   -const char *to_msgs[] = { "no timeout", "timed out" };
54   -EXPORT_SYMBOL_GPL(to_msgs);
55   -
56   -/* memory or i/o */
57   -const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
58   -EXPORT_SYMBOL_GPL(ii_msgs);
59   -
60   -/*
61   - * Map the 4 or 5 (family-specific) bits of Extended Error code to the
62   - * string table.
63   - */
64   -const char *ext_msgs[] = {
65   - "K8 ECC error", /* 0_0000b */
66   - "CRC error on link", /* 0_0001b */
67   - "Sync error packets on link", /* 0_0010b */
68   - "Master Abort during link operation", /* 0_0011b */
69   - "Target Abort during link operation", /* 0_0100b */
70   - "Invalid GART PTE entry during table walk", /* 0_0101b */
71   - "Unsupported atomic RMW command received", /* 0_0110b */
72   - "WDT error: NB transaction timeout", /* 0_0111b */
73   - "ECC/ChipKill ECC error", /* 0_1000b */
74   - "SVM DEV Error", /* 0_1001b */
75   - "Link Data error", /* 0_1010b */
76   - "Link/L3/Probe Filter Protocol error", /* 0_1011b */
77   - "NB Internal Arrays Parity error", /* 0_1100b */
78   - "DRAM Address/Control Parity error", /* 0_1101b */
79   - "Link Transmission error", /* 0_1110b */
80   - "GART/DEV Table Walk Data error" /* 0_1111b */
81   - "Res 0x100 error", /* 1_0000b */
82   - "Res 0x101 error", /* 1_0001b */
83   - "Res 0x102 error", /* 1_0010b */
84   - "Res 0x103 error", /* 1_0011b */
85   - "Res 0x104 error", /* 1_0100b */
86   - "Res 0x105 error", /* 1_0101b */
87   - "Res 0x106 error", /* 1_0110b */
88   - "Res 0x107 error", /* 1_0111b */
89   - "Res 0x108 error", /* 1_1000b */
90   - "Res 0x109 error", /* 1_1001b */
91   - "Res 0x10A error", /* 1_1010b */
92   - "Res 0x10B error", /* 1_1011b */
93   - "ECC error in L3 Cache Data", /* 1_1100b */
94   - "L3 Cache Tag error", /* 1_1101b */
95   - "L3 Cache LRU Parity error", /* 1_1110b */
96   - "Probe Filter error" /* 1_1111b */
97   -};
98   -EXPORT_SYMBOL_GPL(ext_msgs);
99   -
100   -static void amd_decode_dc_mce(struct mce *m)
101   -{
102   - u32 ec = m->status & 0xffff;
103   - u32 xec = (m->status >> 16) & 0xf;
104   -
105   - pr_emerg(HW_ERR "Data Cache Error: ");
106   -
107   - if (xec == 1 && TLB_ERROR(ec))
108   - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
109   - else if (xec == 0) {
110   - if (m->status & (1ULL << 40))
111   - pr_cont(" during Data Scrub.\n");
112   - else if (TLB_ERROR(ec))
113   - pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
114   - else if (MEM_ERROR(ec)) {
115   - u8 ll = ec & 0x3;
116   - u8 tt = (ec >> 2) & 0x3;
117   - u8 rrrr = (ec >> 4) & 0xf;
118   -
119   - /* see F10h BKDG (31116), Table 92. */
120   - if (ll == 0x1) {
121   - if (tt != 0x1)
122   - goto wrong_dc_mce;
123   -
124   - pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
125   -
126   - } else if (ll == 0x2 && rrrr == 0x3)
127   - pr_cont(" during L1 linefill from L2.\n");
128   - else
129   - goto wrong_dc_mce;
130   - } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
131   - pr_cont(" during system linefill.\n");
132   - else
133   - goto wrong_dc_mce;
134   - } else
135   - goto wrong_dc_mce;
136   -
137   - return;
138   -
139   -wrong_dc_mce:
140   - pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
141   -}
142   -
143   -static void amd_decode_ic_mce(struct mce *m)
144   -{
145   - u32 ec = m->status & 0xffff;
146   - u32 xec = (m->status >> 16) & 0xf;
147   -
148   - pr_emerg(HW_ERR "Instruction Cache Error");
149   -
150   - if (xec == 1 && TLB_ERROR(ec))
151   - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
152   - else if (xec == 0) {
153   - if (TLB_ERROR(ec))
154   - pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
155   - else if (BUS_ERROR(ec)) {
156   - if (boot_cpu_data.x86 == 0xf &&
157   - (m->status & BIT(58)))
158   - pr_cont(" during system linefill.\n");
159   - else
160   - pr_cont(" during attempted NB data read.\n");
161   - } else if (MEM_ERROR(ec)) {
162   - u8 ll = ec & 0x3;
163   - u8 rrrr = (ec >> 4) & 0xf;
164   -
165   - if (ll == 0x2)
166   - pr_cont(" during a linefill from L2.\n");
167   - else if (ll == 0x1) {
168   -
169   - switch (rrrr) {
170   - case 0x5:
171   - pr_cont(": Parity error during "
172   - "data load.\n");
173   - break;
174   -
175   - case 0x7:
176   - pr_cont(": Copyback Parity/Victim"
177   - " error.\n");
178   - break;
179   -
180   - case 0x8:
181   - pr_cont(": Tag Snoop error.\n");
182   - break;
183   -
184   - default:
185   - goto wrong_ic_mce;
186   - break;
187   - }
188   - }
189   - } else
190   - goto wrong_ic_mce;
191   - } else
192   - goto wrong_ic_mce;
193   -
194   - return;
195   -
196   -wrong_ic_mce:
197   - pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
198   -}
199   -
200   -static void amd_decode_bu_mce(struct mce *m)
201   -{
202   - u32 ec = m->status & 0xffff;
203   - u32 xec = (m->status >> 16) & 0xf;
204   -
205   - pr_emerg(HW_ERR "Bus Unit Error");
206   -
207   - if (xec == 0x1)
208   - pr_cont(" in the write data buffers.\n");
209   - else if (xec == 0x3)
210   - pr_cont(" in the victim data buffers.\n");
211   - else if (xec == 0x2 && MEM_ERROR(ec))
212   - pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
213   - else if (xec == 0x0) {
214   - if (TLB_ERROR(ec))
215   - pr_cont(": %s error in a Page Descriptor Cache or "
216   - "Guest TLB.\n", TT_MSG(ec));
217   - else if (BUS_ERROR(ec))
218   - pr_cont(": %s/ECC error in data read from NB: %s.\n",
219   - RRRR_MSG(ec), PP_MSG(ec));
220   - else if (MEM_ERROR(ec)) {
221   - u8 rrrr = (ec >> 4) & 0xf;
222   -
223   - if (rrrr >= 0x7)
224   - pr_cont(": %s error during data copyback.\n",
225   - RRRR_MSG(ec));
226   - else if (rrrr <= 0x1)
227   - pr_cont(": %s parity/ECC error during data "
228   - "access from L2.\n", RRRR_MSG(ec));
229   - else
230   - goto wrong_bu_mce;
231   - } else
232   - goto wrong_bu_mce;
233   - } else
234   - goto wrong_bu_mce;
235   -
236   - return;
237   -
238   -wrong_bu_mce:
239   - pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
240   -}
241   -
242   -static void amd_decode_ls_mce(struct mce *m)
243   -{
244   - u32 ec = m->status & 0xffff;
245   - u32 xec = (m->status >> 16) & 0xf;
246   -
247   - pr_emerg(HW_ERR "Load Store Error");
248   -
249   - if (xec == 0x0) {
250   - u8 rrrr = (ec >> 4) & 0xf;
251   -
252   - if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
253   - goto wrong_ls_mce;
254   -
255   - pr_cont(" during %s.\n", RRRR_MSG(ec));
256   - }
257   - return;
258   -
259   -wrong_ls_mce:
260   - pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
261   -}
262   -
263   -void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
264   -{
265   - u32 ec = m->status & 0xffff;
266   - u32 nbsh = (u32)(m->status >> 32);
267   - u32 nbsl = (u32)m->status;
268   -
269   - /*
270   - * GART TLB error reporting is disabled by default. Bail out early.
271   - */
272   - if (TLB_ERROR(ec) && !report_gart_errors)
273   - return;
274   -
275   - pr_emerg(HW_ERR "Northbridge Error, node %d", node_id);
276   -
277   - /*
278   - * F10h, revD can disable ErrCpu[3:0] so check that first and also the
279   - * value encoding has changed so interpret those differently
280   - */
281   - if ((boot_cpu_data.x86 == 0x10) &&
282   - (boot_cpu_data.x86_model > 7)) {
283   - if (nbsh & K8_NBSH_ERR_CPU_VAL)
284   - pr_cont(", core: %u\n", (u8)(nbsh & 0xf));
285   - } else {
286   - u8 assoc_cpus = nbsh & 0xf;
287   -
288   - if (assoc_cpus > 0)
289   - pr_cont(", core: %d", fls(assoc_cpus) - 1);
290   -
291   - pr_cont("\n");
292   - }
293   -
294   - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl));
295   -
296   - if (BUS_ERROR(ec) && nb_bus_decoder)
297   - nb_bus_decoder(node_id, m, nbcfg);
298   -}
299   -EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
300   -
301   -static void amd_decode_fr_mce(struct mce *m)
302   -{
303   - /* we have only one error signature so match all fields at once. */
304   - if ((m->status & 0xffff) == 0x0f0f)
305   - pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n");
306   - else
307   - pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
308   -}
309   -
310   -static inline void amd_decode_err_code(u16 ec)
311   -{
312   - if (TLB_ERROR(ec)) {
313   - pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
314   - TT_MSG(ec), LL_MSG(ec));
315   - } else if (MEM_ERROR(ec)) {
316   - pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
317   - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
318   - } else if (BUS_ERROR(ec)) {
319   - pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
320   - "Participating Processor: %s\n",
321   - RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
322   - PP_MSG(ec));
323   - } else
324   - pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
325   -}
326   -
327   -int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
328   -{
329   - struct mce *m = (struct mce *)data;
330   - int node, ecc;
331   -
332   - pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
333   -
334   - pr_cont("%sorrected error, other errors lost: %s, "
335   - "CPU context corrupt: %s",
336   - ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
337   - ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
338   - ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
339   -
340   - /* do the two bits[14:13] together */
341   - ecc = (m->status >> 45) & 0x3;
342   - if (ecc)
343   - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
344   -
345   - pr_cont("\n");
346   -
347   - switch (m->bank) {
348   - case 0:
349   - amd_decode_dc_mce(m);
350   - break;
351   -
352   - case 1:
353   - amd_decode_ic_mce(m);
354   - break;
355   -
356   - case 2:
357   - amd_decode_bu_mce(m);
358   - break;
359   -
360   - case 3:
361   - amd_decode_ls_mce(m);
362   - break;
363   -
364   - case 4:
365   - node = amd_get_nb_id(m->extcpu);
366   - amd_decode_nb_mce(node, m, 0);
367   - break;
368   -
369   - case 5:
370   - amd_decode_fr_mce(m);
371   - break;
372   -
373   - default:
374   - break;
375   - }
376   -
377   - amd_decode_err_code(m->status & 0xffff);
378   -
379   - return NOTIFY_STOP;
380   -}
381   -EXPORT_SYMBOL_GPL(amd_decode_mce);
382   -
383   -static struct notifier_block amd_mce_dec_nb = {
384   - .notifier_call = amd_decode_mce,
385   -};
386   -
387   -static int __init mce_amd_init(void)
388   -{
389   - /*
390   - * We can decode MCEs for K8, F10h and F11h CPUs:
391   - */
392   - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
393   - return 0;
394   -
395   - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
396   - return 0;
397   -
398   - atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
399   -
400   - return 0;
401   -}
402   -early_initcall(mce_amd_init);
403   -
404   -#ifdef MODULE
405   -static void __exit mce_amd_exit(void)
406   -{
407   - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
408   -}
409   -
410   -MODULE_DESCRIPTION("AMD MCE decoder");
411   -MODULE_ALIAS("edac-mce-amd");
412   -MODULE_LICENSE("GPL");
413   -module_exit(mce_amd_exit);
414   -#endif
drivers/edac/edac_mce_amd.h
1   -#ifndef _EDAC_MCE_AMD_H
2   -#define _EDAC_MCE_AMD_H
3   -
4   -#include <linux/notifier.h>
5   -
6   -#include <asm/mce.h>
7   -
8   -#define ERROR_CODE(x) ((x) & 0xffff)
9   -#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
10   -#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
11   -
12   -#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
13   -#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
14   -
15   -#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
16   -#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
17   -#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
18   -
19   -#define TT(x) (((x) >> 2) & 0x3)
20   -#define TT_MSG(x) tt_msgs[TT(x)]
21   -#define II(x) (((x) >> 2) & 0x3)
22   -#define II_MSG(x) ii_msgs[II(x)]
23   -#define LL(x) (((x) >> 0) & 0x3)
24   -#define LL_MSG(x) ll_msgs[LL(x)]
25   -#define TO(x) (((x) >> 8) & 0x1)
26   -#define TO_MSG(x) to_msgs[TO(x)]
27   -#define PP(x) (((x) >> 9) & 0x3)
28   -#define PP_MSG(x) pp_msgs[PP(x)]
29   -
30   -#define RRRR(x) (((x) >> 4) & 0xf)
31   -#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!")
32   -
33   -#define K8_NBSH 0x4C
34   -
35   -#define K8_NBSH_VALID_BIT BIT(31)
36   -#define K8_NBSH_OVERFLOW BIT(30)
37   -#define K8_NBSH_UC_ERR BIT(29)
38   -#define K8_NBSH_ERR_EN BIT(28)
39   -#define K8_NBSH_MISCV BIT(27)
40   -#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
41   -#define K8_NBSH_PCC BIT(25)
42   -#define K8_NBSH_ERR_CPU_VAL BIT(24)
43   -#define K8_NBSH_CECC BIT(14)
44   -#define K8_NBSH_UECC BIT(13)
45   -#define K8_NBSH_ERR_SCRUBER BIT(8)
46   -
47   -extern const char *tt_msgs[];
48   -extern const char *ll_msgs[];
49   -extern const char *rrrr_msgs[];
50   -extern const char *pp_msgs[];
51   -extern const char *to_msgs[];
52   -extern const char *ii_msgs[];
53   -extern const char *ext_msgs[];
54   -
55   -/*
56   - * relevant NB regs
57   - */
58   -struct err_regs {
59   - u32 nbcfg;
60   - u32 nbsh;
61   - u32 nbsl;
62   - u32 nbeah;
63   - u32 nbeal;
64   -};
65   -
66   -void amd_report_gart_errors(bool);
67   -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
68   -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
69   -void amd_decode_nb_mce(int, struct mce *, u32);
70   -int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data);
71   -
72   -#endif /* _EDAC_MCE_AMD_H */
drivers/edac/mce_amd.c
  1 +#include <linux/module.h>
  2 +#include "mce_amd.h"
  3 +
  4 +static bool report_gart_errors;
  5 +static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
  6 +
  7 +void amd_report_gart_errors(bool v)
  8 +{
  9 + report_gart_errors = v;
  10 +}
  11 +EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  12 +
  13 +void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
  14 +{
  15 + nb_bus_decoder = f;
  16 +}
  17 +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  18 +
  19 +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
  20 +{
  21 + if (nb_bus_decoder) {
  22 + WARN_ON(nb_bus_decoder != f);
  23 +
  24 + nb_bus_decoder = NULL;
  25 + }
  26 +}
  27 +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  28 +
  29 +/*
  30 + * string representation for the different MCA reported error types, see F3x48
  31 + * or MSR0000_0411.
  32 + */
  33 +
  34 +/* transaction type */
  35 +const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
  36 +EXPORT_SYMBOL_GPL(tt_msgs);
  37 +
  38 +/* cache level */
  39 +const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
  40 +EXPORT_SYMBOL_GPL(ll_msgs);
  41 +
  42 +/* memory transaction type */
  43 +const char *rrrr_msgs[] = {
  44 + "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
  45 +};
  46 +EXPORT_SYMBOL_GPL(rrrr_msgs);
  47 +
  48 +/* participating processor */
  49 +const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
  50 +EXPORT_SYMBOL_GPL(pp_msgs);
  51 +
  52 +/* request timeout */
  53 +const char *to_msgs[] = { "no timeout", "timed out" };
  54 +EXPORT_SYMBOL_GPL(to_msgs);
  55 +
  56 +/* memory or i/o */
  57 +const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
  58 +EXPORT_SYMBOL_GPL(ii_msgs);
  59 +
  60 +/*
  61 + * Map the 4 or 5 (family-specific) bits of Extended Error code to the
  62 + * string table.
  63 + */
  64 +const char *ext_msgs[] = {
  65 + "K8 ECC error", /* 0_0000b */
  66 + "CRC error on link", /* 0_0001b */
  67 + "Sync error packets on link", /* 0_0010b */
  68 + "Master Abort during link operation", /* 0_0011b */
  69 + "Target Abort during link operation", /* 0_0100b */
  70 + "Invalid GART PTE entry during table walk", /* 0_0101b */
  71 + "Unsupported atomic RMW command received", /* 0_0110b */
  72 + "WDT error: NB transaction timeout", /* 0_0111b */
  73 + "ECC/ChipKill ECC error", /* 0_1000b */
  74 + "SVM DEV Error", /* 0_1001b */
  75 + "Link Data error", /* 0_1010b */
  76 + "Link/L3/Probe Filter Protocol error", /* 0_1011b */
  77 + "NB Internal Arrays Parity error", /* 0_1100b */
  78 + "DRAM Address/Control Parity error", /* 0_1101b */
  79 + "Link Transmission error", /* 0_1110b */
  80 + "GART/DEV Table Walk Data error" /* 0_1111b */
  81 + "Res 0x100 error", /* 1_0000b */
  82 + "Res 0x101 error", /* 1_0001b */
  83 + "Res 0x102 error", /* 1_0010b */
  84 + "Res 0x103 error", /* 1_0011b */
  85 + "Res 0x104 error", /* 1_0100b */
  86 + "Res 0x105 error", /* 1_0101b */
  87 + "Res 0x106 error", /* 1_0110b */
  88 + "Res 0x107 error", /* 1_0111b */
  89 + "Res 0x108 error", /* 1_1000b */
  90 + "Res 0x109 error", /* 1_1001b */
  91 + "Res 0x10A error", /* 1_1010b */
  92 + "Res 0x10B error", /* 1_1011b */
  93 + "ECC error in L3 Cache Data", /* 1_1100b */
  94 + "L3 Cache Tag error", /* 1_1101b */
  95 + "L3 Cache LRU Parity error", /* 1_1110b */
  96 + "Probe Filter error" /* 1_1111b */
  97 +};
  98 +EXPORT_SYMBOL_GPL(ext_msgs);
  99 +
  100 +static void amd_decode_dc_mce(struct mce *m)
  101 +{
  102 + u32 ec = m->status & 0xffff;
  103 + u32 xec = (m->status >> 16) & 0xf;
  104 +
  105 + pr_emerg(HW_ERR "Data Cache Error: ");
  106 +
  107 + if (xec == 1 && TLB_ERROR(ec))
  108 + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
  109 + else if (xec == 0) {
  110 + if (m->status & (1ULL << 40))
  111 + pr_cont(" during Data Scrub.\n");
  112 + else if (TLB_ERROR(ec))
  113 + pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
  114 + else if (MEM_ERROR(ec)) {
  115 + u8 ll = ec & 0x3;
  116 + u8 tt = (ec >> 2) & 0x3;
  117 + u8 rrrr = (ec >> 4) & 0xf;
  118 +
  119 + /* see F10h BKDG (31116), Table 92. */
  120 + if (ll == 0x1) {
  121 + if (tt != 0x1)
  122 + goto wrong_dc_mce;
  123 +
  124 + pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
  125 +
  126 + } else if (ll == 0x2 && rrrr == 0x3)
  127 + pr_cont(" during L1 linefill from L2.\n");
  128 + else
  129 + goto wrong_dc_mce;
  130 + } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
  131 + pr_cont(" during system linefill.\n");
  132 + else
  133 + goto wrong_dc_mce;
  134 + } else
  135 + goto wrong_dc_mce;
  136 +
  137 + return;
  138 +
  139 +wrong_dc_mce:
  140 + pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
  141 +}
  142 +
  143 +static void amd_decode_ic_mce(struct mce *m)
  144 +{
  145 + u32 ec = m->status & 0xffff;
  146 + u32 xec = (m->status >> 16) & 0xf;
  147 +
  148 + pr_emerg(HW_ERR "Instruction Cache Error");
  149 +
  150 + if (xec == 1 && TLB_ERROR(ec))
  151 + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
  152 + else if (xec == 0) {
  153 + if (TLB_ERROR(ec))
  154 + pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
  155 + else if (BUS_ERROR(ec)) {
  156 + if (boot_cpu_data.x86 == 0xf &&
  157 + (m->status & BIT(58)))
  158 + pr_cont(" during system linefill.\n");
  159 + else
  160 + pr_cont(" during attempted NB data read.\n");
  161 + } else if (MEM_ERROR(ec)) {
  162 + u8 ll = ec & 0x3;
  163 + u8 rrrr = (ec >> 4) & 0xf;
  164 +
  165 + if (ll == 0x2)
  166 + pr_cont(" during a linefill from L2.\n");
  167 + else if (ll == 0x1) {
  168 +
  169 + switch (rrrr) {
  170 + case 0x5:
  171 + pr_cont(": Parity error during "
  172 + "data load.\n");
  173 + break;
  174 +
  175 + case 0x7:
  176 + pr_cont(": Copyback Parity/Victim"
  177 + " error.\n");
  178 + break;
  179 +
  180 + case 0x8:
  181 + pr_cont(": Tag Snoop error.\n");
  182 + break;
  183 +
  184 + default:
  185 + goto wrong_ic_mce;
  186 + break;
  187 + }
  188 + }
  189 + } else
  190 + goto wrong_ic_mce;
  191 + } else
  192 + goto wrong_ic_mce;
  193 +
  194 + return;
  195 +
  196 +wrong_ic_mce:
  197 + pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
  198 +}
  199 +
  200 +static void amd_decode_bu_mce(struct mce *m)
  201 +{
  202 + u32 ec = m->status & 0xffff;
  203 + u32 xec = (m->status >> 16) & 0xf;
  204 +
  205 + pr_emerg(HW_ERR "Bus Unit Error");
  206 +
  207 + if (xec == 0x1)
  208 + pr_cont(" in the write data buffers.\n");
  209 + else if (xec == 0x3)
  210 + pr_cont(" in the victim data buffers.\n");
  211 + else if (xec == 0x2 && MEM_ERROR(ec))
  212 + pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
  213 + else if (xec == 0x0) {
  214 + if (TLB_ERROR(ec))
  215 + pr_cont(": %s error in a Page Descriptor Cache or "
  216 + "Guest TLB.\n", TT_MSG(ec));
  217 + else if (BUS_ERROR(ec))
  218 + pr_cont(": %s/ECC error in data read from NB: %s.\n",
  219 + RRRR_MSG(ec), PP_MSG(ec));
  220 + else if (MEM_ERROR(ec)) {
  221 + u8 rrrr = (ec >> 4) & 0xf;
  222 +
  223 + if (rrrr >= 0x7)
  224 + pr_cont(": %s error during data copyback.\n",
  225 + RRRR_MSG(ec));
  226 + else if (rrrr <= 0x1)
  227 + pr_cont(": %s parity/ECC error during data "
  228 + "access from L2.\n", RRRR_MSG(ec));
  229 + else
  230 + goto wrong_bu_mce;
  231 + } else
  232 + goto wrong_bu_mce;
  233 + } else
  234 + goto wrong_bu_mce;
  235 +
  236 + return;
  237 +
  238 +wrong_bu_mce:
  239 + pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
  240 +}
  241 +
  242 +static void amd_decode_ls_mce(struct mce *m)
  243 +{
  244 + u32 ec = m->status & 0xffff;
  245 + u32 xec = (m->status >> 16) & 0xf;
  246 +
  247 + pr_emerg(HW_ERR "Load Store Error");
  248 +
  249 + if (xec == 0x0) {
  250 + u8 rrrr = (ec >> 4) & 0xf;
  251 +
  252 + if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
  253 + goto wrong_ls_mce;
  254 +
  255 + pr_cont(" during %s.\n", RRRR_MSG(ec));
  256 + }
  257 + return;
  258 +
  259 +wrong_ls_mce:
  260 + pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
  261 +}
  262 +
  263 +void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
  264 +{
  265 + u32 ec = m->status & 0xffff;
  266 + u32 nbsh = (u32)(m->status >> 32);
  267 + u32 nbsl = (u32)m->status;
  268 +
  269 + /*
  270 + * GART TLB error reporting is disabled by default. Bail out early.
  271 + */
  272 + if (TLB_ERROR(ec) && !report_gart_errors)
  273 + return;
  274 +
  275 + pr_emerg(HW_ERR "Northbridge Error, node %d", node_id);
  276 +
  277 + /*
  278 + * F10h, revD can disable ErrCpu[3:0] so check that first and also the
  279 + * value encoding has changed so interpret those differently
  280 + */
  281 + if ((boot_cpu_data.x86 == 0x10) &&
  282 + (boot_cpu_data.x86_model > 7)) {
  283 + if (nbsh & K8_NBSH_ERR_CPU_VAL)
  284 + pr_cont(", core: %u\n", (u8)(nbsh & 0xf));
  285 + } else {
  286 + u8 assoc_cpus = nbsh & 0xf;
  287 +
  288 + if (assoc_cpus > 0)
  289 + pr_cont(", core: %d", fls(assoc_cpus) - 1);
  290 +
  291 + pr_cont("\n");
  292 + }
  293 +
  294 + pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl));
  295 +
  296 + if (BUS_ERROR(ec) && nb_bus_decoder)
  297 + nb_bus_decoder(node_id, m, nbcfg);
  298 +}
  299 +EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
  300 +
  301 +static void amd_decode_fr_mce(struct mce *m)
  302 +{
  303 + /* we have only one error signature so match all fields at once. */
  304 + if ((m->status & 0xffff) == 0x0f0f)
  305 + pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n");
  306 + else
  307 + pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
  308 +}
  309 +
  310 +static inline void amd_decode_err_code(u16 ec)
  311 +{
  312 + if (TLB_ERROR(ec)) {
  313 + pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
  314 + TT_MSG(ec), LL_MSG(ec));
  315 + } else if (MEM_ERROR(ec)) {
  316 + pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
  317 + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
  318 + } else if (BUS_ERROR(ec)) {
  319 + pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
  320 + "Participating Processor: %s\n",
  321 + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
  322 + PP_MSG(ec));
  323 + } else
  324 + pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
  325 +}
  326 +
  327 +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
  328 +{
  329 + struct mce *m = (struct mce *)data;
  330 + int node, ecc;
  331 +
  332 + pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
  333 +
  334 + pr_cont("%sorrected error, other errors lost: %s, "
  335 + "CPU context corrupt: %s",
  336 + ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
  337 + ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
  338 + ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
  339 +
  340 + /* do the two bits[14:13] together */
  341 + ecc = (m->status >> 45) & 0x3;
  342 + if (ecc)
  343 + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
  344 +
  345 + pr_cont("\n");
  346 +
  347 + switch (m->bank) {
  348 + case 0:
  349 + amd_decode_dc_mce(m);
  350 + break;
  351 +
  352 + case 1:
  353 + amd_decode_ic_mce(m);
  354 + break;
  355 +
  356 + case 2:
  357 + amd_decode_bu_mce(m);
  358 + break;
  359 +
  360 + case 3:
  361 + amd_decode_ls_mce(m);
  362 + break;
  363 +
  364 + case 4:
  365 + node = amd_get_nb_id(m->extcpu);
  366 + amd_decode_nb_mce(node, m, 0);
  367 + break;
  368 +
  369 + case 5:
  370 + amd_decode_fr_mce(m);
  371 + break;
  372 +
  373 + default:
  374 + break;
  375 + }
  376 +
  377 + amd_decode_err_code(m->status & 0xffff);
  378 +
  379 + return NOTIFY_STOP;
  380 +}
  381 +EXPORT_SYMBOL_GPL(amd_decode_mce);
  382 +
  383 +static struct notifier_block amd_mce_dec_nb = {
  384 + .notifier_call = amd_decode_mce,
  385 +};
  386 +
  387 +static int __init mce_amd_init(void)
  388 +{
  389 + /*
  390 + * We can decode MCEs for K8, F10h and F11h CPUs:
  391 + */
  392 + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
  393 + return 0;
  394 +
  395 + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
  396 + return 0;
  397 +
  398 + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
  399 +
  400 + return 0;
  401 +}
  402 +early_initcall(mce_amd_init);
  403 +
  404 +#ifdef MODULE
  405 +static void __exit mce_amd_exit(void)
  406 +{
  407 + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
  408 +}
  409 +
  410 +MODULE_DESCRIPTION("AMD MCE decoder");
  411 +MODULE_ALIAS("edac-mce-amd");
  412 +MODULE_LICENSE("GPL");
  413 +module_exit(mce_amd_exit);
  414 +#endif
drivers/edac/mce_amd.h
  1 +#ifndef _EDAC_MCE_AMD_H
  2 +#define _EDAC_MCE_AMD_H
  3 +
  4 +#include <linux/notifier.h>
  5 +
  6 +#include <asm/mce.h>
  7 +
  8 +#define ERROR_CODE(x) ((x) & 0xffff)
  9 +#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
  10 +#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
  11 +
  12 +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
  13 +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
  14 +
  15 +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
  16 +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
  17 +#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
  18 +
  19 +#define TT(x) (((x) >> 2) & 0x3)
  20 +#define TT_MSG(x) tt_msgs[TT(x)]
  21 +#define II(x) (((x) >> 2) & 0x3)
  22 +#define II_MSG(x) ii_msgs[II(x)]
  23 +#define LL(x) (((x) >> 0) & 0x3)
  24 +#define LL_MSG(x) ll_msgs[LL(x)]
  25 +#define TO(x) (((x) >> 8) & 0x1)
  26 +#define TO_MSG(x) to_msgs[TO(x)]
  27 +#define PP(x) (((x) >> 9) & 0x3)
  28 +#define PP_MSG(x) pp_msgs[PP(x)]
  29 +
  30 +#define RRRR(x) (((x) >> 4) & 0xf)
  31 +#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!")
  32 +
  33 +#define K8_NBSH 0x4C
  34 +
  35 +#define K8_NBSH_VALID_BIT BIT(31)
  36 +#define K8_NBSH_OVERFLOW BIT(30)
  37 +#define K8_NBSH_UC_ERR BIT(29)
  38 +#define K8_NBSH_ERR_EN BIT(28)
  39 +#define K8_NBSH_MISCV BIT(27)
  40 +#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
  41 +#define K8_NBSH_PCC BIT(25)
  42 +#define K8_NBSH_ERR_CPU_VAL BIT(24)
  43 +#define K8_NBSH_CECC BIT(14)
  44 +#define K8_NBSH_UECC BIT(13)
  45 +#define K8_NBSH_ERR_SCRUBER BIT(8)
  46 +
  47 +extern const char *tt_msgs[];
  48 +extern const char *ll_msgs[];
  49 +extern const char *rrrr_msgs[];
  50 +extern const char *pp_msgs[];
  51 +extern const char *to_msgs[];
  52 +extern const char *ii_msgs[];
  53 +extern const char *ext_msgs[];
  54 +
  55 +/*
  56 + * relevant NB regs
  57 + */
  58 +struct err_regs {
  59 + u32 nbcfg;
  60 + u32 nbsh;
  61 + u32 nbsl;
  62 + u32 nbeah;
  63 + u32 nbeal;
  64 +};
  65 +
  66 +void amd_report_gart_errors(bool);
  67 +void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
  68 +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
  69 +void amd_decode_nb_mce(int, struct mce *, u32);
  70 +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data);
  71 +
  72 +#endif /* _EDAC_MCE_AMD_H */
drivers/edac/mce_amd_inj.c
... ... @@ -15,7 +15,7 @@
15 15 #include <linux/edac.h>
16 16 #include <asm/mce.h>
17 17  
18   -#include "edac_mce_amd.h"
  18 +#include "mce_amd.h"
19 19  
20 20 struct edac_mce_attr {
21 21 struct attribute attr;