Commit 3045f94a20cc54e3e5b20a843701eeab86f57163
Exists in
smarc-imx_3.14.28_1.0.0_ga
and in
1 other branch
Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS update from Ingo Molnar:
 "The changes in this tree are:

   - ACPI APEI (ACPI Platform Error Interface) improvements, by Chen Gong
   - misc MCE fixes/cleanups"

* 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Update MCE severity condition check
  mce: acpi/apei: Add comments to clarify usage of the various bitfields in the MCA subsystem
  ACPI/APEI: Update einj documentation for param1/param2
  ACPI/APEI: Add parameter check before error injection
  ACPI, APEI, EINJ: Fix error return code in einj_init()
  x86, mce: Fix "braodcast" typo
Showing 8 changed files Side-by-side Diff
Documentation/acpi/apei/einj.txt
... | ... | @@ -47,11 +47,16 @@ |
47 | 47 | |
48 | 48 | - param1 |
49 | 49 | This file is used to set the first error parameter value. Effect of |
50 | - parameter depends on error_type specified. | |
50 | + parameter depends on error_type specified. For example, if error | |
51 | + type is memory related type, the param1 should be a valid physical | |
52 | + memory address. | |
51 | 53 | |
52 | 54 | - param2 |
53 | 55 | This file is used to set the second error parameter value. Effect of |
54 | - parameter depends on error_type specified. | |
56 | + parameter depends on error_type specified. For example, if error | |
57 | + type is memory related type, the param2 should be a physical memory | |
58 | + address mask. Linux requires page or narrower granularity, say, | |
59 | + 0xfffffffffffff000. | |
55 | 60 | |
56 | 61 | - notrigger |
57 | 62 | The EINJ mechanism is a two step process. First inject the error, then |
arch/x86/include/asm/mce.h
... | ... | @@ -61,7 +61,7 @@ |
61 | 61 | #define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */ |
62 | 62 | #define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */ |
63 | 63 | #define MCJ_EXCEPTION 0x8 /* raise as exception */ |
64 | -#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */ | |
64 | +#define MCJ_IRQ_BROADCAST 0x10 /* do IRQ broadcasting */ | |
65 | 65 | |
66 | 66 | #define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */ |
67 | 67 |
arch/x86/kernel/cpu/mcheck/mce-inject.c
... | ... | @@ -153,7 +153,7 @@ |
153 | 153 | return; |
154 | 154 | |
155 | 155 | #ifdef CONFIG_X86_LOCAL_APIC |
156 | - if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { | |
156 | + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { | |
157 | 157 | unsigned long start; |
158 | 158 | int cpu; |
159 | 159 | |
... | ... | @@ -167,7 +167,7 @@ |
167 | 167 | cpumask_clear_cpu(cpu, mce_inject_cpumask); |
168 | 168 | } |
169 | 169 | if (!cpumask_empty(mce_inject_cpumask)) { |
170 | - if (m->inject_flags & MCJ_IRQ_BRAODCAST) { | |
170 | + if (m->inject_flags & MCJ_IRQ_BROADCAST) { | |
171 | 171 | /* |
172 | 172 | * don't wait because mce_irq_ipi is necessary |
173 | 173 | * to be sync with following raise_local |
arch/x86/kernel/cpu/mcheck/mce-severity.c
... | ... | @@ -110,22 +110,17 @@ |
110 | 110 | /* known AR MCACODs: */ |
111 | 111 | #ifdef CONFIG_MEMORY_FAILURE |
112 | 112 | MCESEV( |
113 | - KEEP, "HT thread notices Action required: data load error", | |
114 | - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), | |
115 | - MCGMASK(MCG_STATUS_EIPV, 0) | |
113 | + KEEP, "Action required but unaffected thread is continuable", | |
114 | + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR), | |
115 | + MCGMASK(MCG_STATUS_RIPV, MCG_STATUS_RIPV) | |
116 | 116 | ), |
117 | 117 | MCESEV( |
118 | - AR, "Action required: data load error", | |
118 | + AR, "Action required: data load error in a user process", | |
119 | 119 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), |
120 | 120 | USER |
121 | 121 | ), |
122 | 122 | MCESEV( |
123 | - KEEP, "HT thread notices Action required: instruction fetch error", | |
124 | - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), | |
125 | - MCGMASK(MCG_STATUS_EIPV, 0) | |
126 | - ), | |
127 | - MCESEV( | |
128 | - AR, "Action required: instruction fetch error", | |
123 | + AR, "Action required: instruction fetch error in a user process", | |
129 | 124 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), |
130 | 125 | USER |
131 | 126 | ), |
arch/x86/kernel/cpu/mcheck/mce.c
... | ... | @@ -89,7 +89,10 @@ |
89 | 89 | static DEFINE_PER_CPU(struct mce, mces_seen); |
90 | 90 | static int cpu_missing; |
91 | 91 | |
92 | -/* MCA banks polled by the period polling timer for corrected events */ | |
92 | +/* | |
93 | + * MCA banks polled by the period polling timer for corrected events. | |
94 | + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). | |
95 | + */ | |
93 | 96 | DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { |
94 | 97 | [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL |
95 | 98 | }; |
arch/x86/kernel/cpu/mcheck/mce_intel.c
... | ... | @@ -24,6 +24,18 @@ |
24 | 24 | * Also supports reliable discovery of shared banks. |
25 | 25 | */ |
26 | 26 | |
27 | +/* | |
28 | + * CMCI can be delivered to multiple cpus that share a machine check bank | |
29 | + * so we need to designate a single cpu to process errors logged in each bank | |
30 | + * in the interrupt handler (otherwise we would have many races and potential | |
31 | + * double reporting of the same error). | |
32 | + * Note that this can change when a cpu is offlined or brought online since | |
33 | + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() | |
34 | + * disables CMCI on all banks owned by the cpu and clears this bitfield. At | |
35 | + * this point, cmci_rediscover() kicks in and a different cpu may end up | |
36 | + * taking ownership of some of the shared MCA banks that were previously | |
37 | + * owned by the offlined cpu. | |
38 | + */ | |
27 | 39 | static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); |
28 | 40 | |
29 | 41 | /* |
drivers/acpi/apei/einj.c
... | ... | @@ -32,6 +32,7 @@ |
32 | 32 | #include <linux/seq_file.h> |
33 | 33 | #include <linux/nmi.h> |
34 | 34 | #include <linux/delay.h> |
35 | +#include <linux/mm.h> | |
35 | 36 | #include <acpi/acpi.h> |
36 | 37 | |
37 | 38 | #include "apei-internal.h" |
... | ... | @@ -41,6 +42,10 @@ |
41 | 42 | #define SPIN_UNIT 100 /* 100ns */ |
42 | 43 | /* Firmware should respond within 1 milliseconds */ |
43 | 44 | #define FIRMWARE_TIMEOUT (1 * NSEC_PER_MSEC) |
45 | +#define ACPI5_VENDOR_BIT BIT(31) | |
46 | +#define MEM_ERROR_MASK (ACPI_EINJ_MEMORY_CORRECTABLE | \ | |
47 | + ACPI_EINJ_MEMORY_UNCORRECTABLE | \ | |
48 | + ACPI_EINJ_MEMORY_FATAL) | |
44 | 49 | |
45 | 50 | /* |
46 | 51 | * ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action. |
... | ... | @@ -367,7 +372,7 @@ |
367 | 372 | * This will cause resource conflict with regular memory. So |
368 | 373 | * remove it from trigger table resources. |
369 | 374 | */ |
370 | - if ((param_extension || acpi5) && (type & 0x0038) && param2) { | |
375 | + if ((param_extension || acpi5) && (type & MEM_ERROR_MASK) && param2) { | |
371 | 376 | struct apei_resources addr_resources; |
372 | 377 | apei_resources_init(&addr_resources); |
373 | 378 | trigger_param_region = einj_get_trigger_parameter_region( |
... | ... | @@ -427,7 +432,7 @@ |
427 | 432 | struct set_error_type_with_address *v5param = einj_param; |
428 | 433 | |
429 | 434 | v5param->type = type; |
430 | - if (type & 0x80000000) { | |
435 | + if (type & ACPI5_VENDOR_BIT) { | |
431 | 436 | switch (vendor_flags) { |
432 | 437 | case SETWA_FLAGS_APICID: |
433 | 438 | v5param->apicid = param1; |
434 | 439 | |
... | ... | @@ -512,7 +517,34 @@ |
512 | 517 | static int einj_error_inject(u32 type, u64 param1, u64 param2) |
513 | 518 | { |
514 | 519 | int rc; |
520 | + unsigned long pfn; | |
515 | 521 | |
522 | + /* | |
523 | + * We need extra sanity checks for memory errors. | |
524 | + * Other types leap directly to injection. | |
525 | + */ | |
526 | + | |
527 | + /* ensure param1/param2 existed */ | |
528 | + if (!(param_extension || acpi5)) | |
529 | + goto inject; | |
530 | + | |
531 | + /* ensure injection is memory related */ | |
532 | + if (type & ACPI5_VENDOR_BIT) { | |
533 | + if (vendor_flags != SETWA_FLAGS_MEM) | |
534 | + goto inject; | |
535 | + } else if (!(type & MEM_ERROR_MASK)) | |
536 | + goto inject; | |
537 | + | |
538 | + /* | |
539 | + * Disallow crazy address masks that give BIOS leeway to pick | |
540 | + * injection address almost anywhere. Insist on page or | |
541 | + * better granularity and that target address is normal RAM. | |
542 | + */ | |
543 | + pfn = PFN_DOWN(param1 & param2); | |
544 | + if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK)) | |
545 | + return -EINVAL; | |
546 | + | |
547 | +inject: | |
516 | 548 | mutex_lock(&einj_mutex); |
517 | 549 | rc = __einj_error_inject(type, param1, param2); |
518 | 550 | mutex_unlock(&einj_mutex); |
... | ... | @@ -590,7 +622,7 @@ |
590 | 622 | * Vendor defined types have 0x80000000 bit set, and |
591 | 623 | * are not enumerated by ACPI_EINJ_GET_ERROR_TYPE |
592 | 624 | */ |
593 | - vendor = val & 0x80000000; | |
625 | + vendor = val & ACPI5_VENDOR_BIT; | |
594 | 626 | tval = val & 0x7fffffff; |
595 | 627 | |
596 | 628 | /* Only one error type can be specified */ |
... | ... | @@ -694,6 +726,7 @@ |
694 | 726 | if (rc) |
695 | 727 | goto err_release; |
696 | 728 | |
729 | + rc = -ENOMEM; | |
697 | 730 | einj_param = einj_get_parameter_address(); |
698 | 731 | if ((param_extension || acpi5) && einj_param) { |
699 | 732 | fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR, |