Commit 55672ecfa21f23616541c50e0e687f14f9ecf165

Authored by Mahesh Salgaonkar
Committed by Benjamin Herrenschmidt
1 parent d2a36071ef

powerpc/book3s: Recover from MC in sapphire on SCOM read via MMIO.

Detect and recover from a machine check that occurs inside OPAL on special
SCOM load instructions. On a specific SCOM read via MMIO we may get a machine
check exception with SRR0 pointing inside OPAL. To recover from the MC in
this scenario, look up a recovery instruction address and return to it from
the MC handler.

OPAL exports the machine check recoverable ranges through the device-tree
property mcheck-recoverable-ranges under the ibm,opal node:

# hexdump /proc/device-tree/ibm,opal/mcheck-recoverable-ranges
0000000 0000 0000 3000 2804 0000 000c 0000 0000
0000010 3000 2814 0000 0000 3000 27f0 0000 000c
0000020 0000 0000 3000 2814 xxxx xxxx xxxx xxxx
0000030 llll llll yyyy yyyy yyyy yyyy
...
...
#

where:
	xxxx xxxx xxxx xxxx = Starting instruction address
	llll llll           = Length of the address range.
	yyyy yyyy yyyy yyyy = recovery address
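
For example, the first entry in the hexdump above decodes to start address
0x30002804, length 0xc and recovery address 0x30002814; the second entry
covers 0x300027f0..0x300027fc with the same recovery address 0x30002814.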

Each recoverable address range entry is a (start address, len, recovery
address) tuple: 2 cells each for the start and recovery addresses and 1 cell
for the length, totalling 5 cells per entry. During kernel boot, build the
recovery table from the list of ranges in this device-tree property; the
table is then consulted during a machine check exception to recover from an
MMIO SCOM UE.
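
For illustration only (not part of this patch), a minimal standalone sketch of
the 5-cell decode and of the NIP-to-recovery-address lookup described above,
fed with the first two entries from the hexdump; the helper names and the
main() harness are made up for this example:

#include <stdint.h>
#include <stdio.h>

struct recoverable_range {
	uint64_t start;
	uint64_t end;
	uint64_t recover;
};

/* Concatenate n big-endian-ordered 32-bit cells into one value,
 * mimicking what of_read_number() does on the flattened device tree. */
static uint64_t read_cells(const uint32_t *p, int n)
{
	uint64_t v = 0;

	while (n--)
		v = (v << 32) | *p++;
	return v;
}

/* Return the recovery address for a NIP falling inside a range, else 0. */
static uint64_t find_recovery_addr(const struct recoverable_range *r,
				   int len, uint64_t nip)
{
	for (int i = 0; i < len; i++)
		if (nip >= r[i].start && nip < r[i].end)
			return r[i].recover;
	return 0;
}

int main(void)
{
	/* First two entries from the hexdump above, as 32-bit cells:
	 * start (2 cells), len (1 cell), recovery address (2 cells). */
	const uint32_t prop[] = {
		0x00000000, 0x30002804, 0x0000000c, 0x00000000, 0x30002814,
		0x00000000, 0x300027f0, 0x0000000c, 0x00000000, 0x30002814,
	};
	int nr = sizeof(prop) / (sizeof(prop[0]) * 5);
	struct recoverable_range r[2];

	for (int i = 0; i < nr; i++) {
		r[i].start   = read_cells(prop + i * 5 + 0, 2);
		r[i].end     = r[i].start + read_cells(prop + i * 5 + 2, 1);
		r[i].recover = read_cells(prop + i * 5 + 3, 2);
	}

	/* A NIP inside the first range resolves to its recovery address. */
	printf("recovery address: 0x%llx\n",
	       (unsigned long long)find_recovery_addr(r, nr, 0x30002808));
	return 0;
}

The kernel performs the same decode in early_init_dt_scan_recoverable_ranges()
and the same lookup in opal_mce_check_early_recovery(), shown in the diff below.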

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Showing 8 changed files with 146 additions and 10 deletions

arch/powerpc/include/asm/machdep.h
... ... @@ -170,6 +170,9 @@
170 170 int (*system_reset_exception)(struct pt_regs *regs);
171 171 int (*machine_check_exception)(struct pt_regs *regs);
172 172  
  173 + /* Called during machine check exception to retrieve fixup address. */
  174 + bool (*mce_check_early_recovery)(struct pt_regs *regs);
  175 +
173 176 /* Motherboard/chipset features. This is a kind of general purpose
174 177 * hook used to control some machine specific features (like reset
175 178 * lines, chip power control, etc...).
arch/powerpc/include/asm/mce.h
... ... @@ -187,7 +187,8 @@
187 187 #define MCE_EVENT_DONTRELEASE false
188 188  
189 189 extern void save_mce_event(struct pt_regs *regs, long handled,
190   - struct mce_error_info *mce_err, uint64_t addr);
  190 + struct mce_error_info *mce_err, uint64_t nip,
  191 + uint64_t addr);
191 192 extern int get_mce_event(struct machine_check_event *mce, bool release);
192 193 extern void release_mce_event(void);
193 194 extern void machine_check_queue_event(void);
arch/powerpc/include/asm/opal.h
... ... @@ -833,6 +833,8 @@
833 833  
834 834 /* Internal functions */
835 835 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
  836 +extern int early_init_dt_scan_recoverable_ranges(unsigned long node,
  837 + const char *uname, int depth, void *data);
836 838  
837 839 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
838 840 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
... ... @@ -863,6 +865,7 @@
863 865 extern void opal_flash_init(void);
864 866  
865 867 extern int opal_machine_check(struct pt_regs *regs);
  868 +extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
866 869  
867 870 extern void opal_shutdown(void);
868 871  
arch/powerpc/kernel/mce.c
... ... @@ -70,7 +70,7 @@
70 70 */
71 71 void save_mce_event(struct pt_regs *regs, long handled,
72 72 struct mce_error_info *mce_err,
73   - uint64_t addr)
  73 + uint64_t nip, uint64_t addr)
74 74 {
75 75 uint64_t srr1;
76 76 int index = __get_cpu_var(mce_nest_count)++;
... ... @@ -86,7 +86,7 @@
86 86  
87 87 /* Populate generic machine check info */
88 88 mce->version = MCE_V1;
89   - mce->srr0 = regs->nip;
  89 + mce->srr0 = nip;
90 90 mce->srr1 = regs->msr;
91 91 mce->gpr3 = regs->gpr[3];
92 92 mce->in_use = 1;
arch/powerpc/kernel/mce_power.c
... ... @@ -26,6 +26,7 @@
26 26 #include <linux/ptrace.h>
27 27 #include <asm/mmu.h>
28 28 #include <asm/mce.h>
  29 +#include <asm/machdep.h>
29 30  
30 31 /* flush SLBs and reload */
31 32 static void flush_and_reload_slb(void)
32 33  
33 34  
... ... @@ -197,13 +198,32 @@
197 198 }
198 199 }
199 200  
  201 +static long mce_handle_ue_error(struct pt_regs *regs)
  202 +{
  203 + long handled = 0;
  204 +
  205 + /*
  206 + * On specific SCOM read via MMIO we may get a machine check
  207 + * exception with SRR0 pointing inside opal. If that is the
  208 + * case, OPAL may have a recovery address to re-read the SCOM
  209 + * data in a different way, and hence we can recover from this MC.
  210 + */
  211 +
  212 + if (ppc_md.mce_check_early_recovery) {
  213 + if (ppc_md.mce_check_early_recovery(regs))
  214 + handled = 1;
  215 + }
  216 + return handled;
  217 +}
  218 +
200 219 long __machine_check_early_realmode_p7(struct pt_regs *regs)
201 220 {
202   - uint64_t srr1, addr;
  221 + uint64_t srr1, nip, addr;
203 222 long handled = 1;
204 223 struct mce_error_info mce_error_info = { 0 };
205 224  
206 225 srr1 = regs->msr;
  226 + nip = regs->nip;
207 227  
208 228 /*
209 229 * Handle memory errors depending whether this was a load/store or
... ... @@ -221,7 +241,11 @@
221 241 addr = regs->nip;
222 242 }
223 243  
224   - save_mce_event(regs, handled, &mce_error_info, addr);
  244 + /* Handle UE error. */
  245 + if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
  246 + handled = mce_handle_ue_error(regs);
  247 +
  248 + save_mce_event(regs, handled, &mce_error_info, nip, addr);
225 249 return handled;
226 250 }
227 251  
228 252  
... ... @@ -263,11 +287,12 @@
263 287  
264 288 long __machine_check_early_realmode_p8(struct pt_regs *regs)
265 289 {
266   - uint64_t srr1, addr;
  290 + uint64_t srr1, nip, addr;
267 291 long handled = 1;
268 292 struct mce_error_info mce_error_info = { 0 };
269 293  
270 294 srr1 = regs->msr;
  295 + nip = regs->nip;
271 296  
272 297 if (P7_SRR1_MC_LOADSTORE(srr1)) {
273 298 handled = mce_handle_derror_p8(regs->dsisr);
... ... @@ -279,7 +304,11 @@
279 304 addr = regs->nip;
280 305 }
281 306  
282   - save_mce_event(regs, handled, &mce_error_info, addr);
  307 + /* Handle UE error. */
  308 + if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
  309 + handled = mce_handle_ue_error(regs);
  310 +
  311 + save_mce_event(regs, handled, &mce_error_info, nip, addr);
283 312 return handled;
284 313 }
arch/powerpc/kernel/prom.c
... ... @@ -752,6 +752,11 @@
752 752 spinning_secondaries = boot_cpu_count - 1;
753 753 #endif
754 754  
  755 +#ifdef CONFIG_PPC_POWERNV
  756 + /* Scan and build the list of machine check recoverable ranges */
  757 + of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
  758 +#endif
  759 +
755 760 DBG(" <- early_init_devtree()\n");
756 761 }
757 762  
arch/powerpc/platforms/powernv/opal.c
... ... @@ -21,6 +21,7 @@
21 21 #include <linux/sched.h>
22 22 #include <linux/kobject.h>
23 23 #include <linux/delay.h>
  24 +#include <linux/memblock.h>
24 25 #include <asm/opal.h>
25 26 #include <asm/firmware.h>
26 27 #include <asm/mce.h>
27 28  
... ... @@ -33,8 +34,18 @@
33 34 struct opal {
34 35 u64 base;
35 36 u64 entry;
  37 + u64 size;
36 38 } opal;
37 39  
  40 +struct mcheck_recoverable_range {
  41 + u64 start_addr;
  42 + u64 end_addr;
  43 + u64 recover_addr;
  44 +};
  45 +
  46 +static struct mcheck_recoverable_range *mc_recoverable_range;
  47 +static int mc_recoverable_range_len;
  48 +
38 49 static struct device_node *opal_node;
39 50 static DEFINE_SPINLOCK(opal_write_lock);
40 51 extern u64 opal_mc_secondary_handler[];
41 52  
42 53  
43 54  
44 55  
... ... @@ -49,25 +60,29 @@
49 60 int __init early_init_dt_scan_opal(unsigned long node,
50 61 const char *uname, int depth, void *data)
51 62 {
52   - const void *basep, *entryp;
53   - unsigned long basesz, entrysz;
  63 + const void *basep, *entryp, *sizep;
  64 + unsigned long basesz, entrysz, runtimesz;
54 65  
55 66 if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
56 67 return 0;
57 68  
58 69 basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
59 70 entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
  71 + sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
60 72  
61   - if (!basep || !entryp)
  73 + if (!basep || !entryp || !sizep)
62 74 return 1;
63 75  
64 76 opal.base = of_read_number(basep, basesz/4);
65 77 opal.entry = of_read_number(entryp, entrysz/4);
  78 + opal.size = of_read_number(sizep, runtimesz/4);
66 79  
67 80 pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%ld)\n",
68 81 opal.base, basep, basesz);
69 82 pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%ld)\n",
70 83 opal.entry, entryp, entrysz);
  84 + pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%ld)\n",
  85 + opal.size, sizep, runtimesz);
71 86  
72 87 powerpc_firmware_features |= FW_FEATURE_OPAL;
73 88 if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
... ... @@ -84,6 +99,53 @@
84 99 return 1;
85 100 }
86 101  
  102 +int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
  103 + const char *uname, int depth, void *data)
  104 +{
  105 + unsigned long i, size;
  106 + const __be32 *prop;
  107 +
  108 + if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
  109 + return 0;
  110 +
  111 + prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &size);
  112 +
  113 + if (!prop)
  114 + return 1;
  115 +
  116 + pr_debug("Found machine check recoverable ranges.\n");
  117 +
  118 + /*
  119 + * Allocate a buffer to hold the MC recoverable ranges. We would be
  120 + * accessing them in real mode, hence it needs to be within
  121 + * RMO region.
  122 + */
  123 + mc_recoverable_range = __va(memblock_alloc_base(size, __alignof__(u64),
  124 + ppc64_rma_size));
  125 + memset(mc_recoverable_range, 0, size);
  126 +
  127 + /*
  128 + * Each recoverable address entry is a (start address, len,
  129 + * recovery address) tuple: 2 + 1 + 2 cells, totalling 5 cells per entry.
  130 + */
  131 + for (i = 0; i < size / (sizeof(*prop) * 5); i++) {
  132 + mc_recoverable_range[i].start_addr =
  133 + of_read_number(prop + (i * 5) + 0, 2);
  134 + mc_recoverable_range[i].end_addr =
  135 + mc_recoverable_range[i].start_addr +
  136 + of_read_number(prop + (i * 5) + 2, 1);
  137 + mc_recoverable_range[i].recover_addr =
  138 + of_read_number(prop + (i * 5) + 3, 2);
  139 +
  140 + pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
  141 + mc_recoverable_range[i].start_addr,
  142 + mc_recoverable_range[i].end_addr,
  143 + mc_recoverable_range[i].recover_addr);
  144 + }
  145 + mc_recoverable_range_len = i;
  146 + return 1;
  147 +}
  148 +
87 149 static int __init opal_register_exception_handlers(void)
88 150 {
89 151 #ifdef __BIG_ENDIAN__
... ... @@ -399,6 +461,38 @@
399 461 if (opal_recover_mce(regs, &evt))
400 462 return 1;
401 463 return 0;
  464 +}
  465 +
  466 +static uint64_t find_recovery_address(uint64_t nip)
  467 +{
  468 + int i;
  469 +
  470 + for (i = 0; i < mc_recoverable_range_len; i++)
  471 + if ((nip >= mc_recoverable_range[i].start_addr) &&
  472 + (nip < mc_recoverable_range[i].end_addr))
  473 + return mc_recoverable_range[i].recover_addr;
  474 + return 0;
  475 +}
  476 +
  477 +bool opal_mce_check_early_recovery(struct pt_regs *regs)
  478 +{
  479 + uint64_t recover_addr = 0;
  480 +
  481 + if (!opal.base || !opal.size)
  482 + goto out;
  483 +
  484 + if ((regs->nip >= opal.base) &&
  485 + (regs->nip <= (opal.base + opal.size)))
  486 + recover_addr = find_recovery_address(regs->nip);
  487 +
  488 + /*
  489 + * Set up regs->nip to rfi into the fixup address.
  490 + */
  491 + if (recover_addr)
  492 + regs->nip = recover_addr;
  493 +
  494 +out:
  495 + return !!recover_addr;
402 496 }
403 497  
404 498 static irqreturn_t opal_interrupt(int irq, void *data)
arch/powerpc/platforms/powernv/setup.c
... ... @@ -188,6 +188,7 @@
188 188 ppc_md.power_off = pnv_power_off;
189 189 ppc_md.halt = pnv_halt;
190 190 ppc_md.machine_check_exception = opal_machine_check;
  191 + ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
191 192 }
192 193  
193 194 #ifdef CONFIG_PPC_POWERNV_RTAS