Commit d2b0f6f77ee525811b6efe864efa6a4eb82eea73

Authored by Gavin Shan
Committed by Benjamin Herrenschmidt
1 parent 7f52a526f6

powerpc/eeh: No hotplug on permanently removed dev

The issue was detected in a bit complicated test case where
we have multiple hierarchical PEs shown as following figure:

                +-----------------+
                | PE#3     p2p#0  |
                |          p2p#1  |
                +-----------------+
                        |
                +-----------------+
                | PE#4     pdev#0 |
                |          pdev#1 |
                +-----------------+

PE#4 (have 2 PCI devices) is the child of PE#3, which has 2 p2p
bridges. We accidentally had less-known scenario: PE#4 was removed
permanently from the system because of permanent failure (e.g.
exceeding the max allowd failure times in last hour), then we detects
EEH errors on PE#3 and tried to recover it. However, eeh_dev instances
for pdev#0/1 were not detached from PE#4, which was still connected to
PE#3. All of that was because of the fact that we rely on count-based
pcibios_release_device(), which isn't reliable enough. When doing
recovery for PE#3, we still apply hotplug on PE#4 and pdev#0/1, which
are not valid any more. Eventually, we run into kernel crash.

The patch fixes above issue from two aspects. For unplug, we simply
skip those permanently removed PE, whose state is (EEH_PE_STATE_ISOLATED
&& !EEH_PE_STATE_RECOVERING) and its frozen count should be greater
than EEH_MAX_ALLOWED_FREEZES. For plug, we marked all permanently
removed EEH devices with EEH_DEV_REMOVED and return 0xFF's on read
its PCI config so that PCI core will omit them.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Showing 6 changed files with 100 additions and 19 deletions Side-by-side Diff

arch/powerpc/include/asm/eeh.h
... ... @@ -98,6 +98,7 @@
98 98  
99 99 #define EEH_DEV_NO_HANDLER (1 << 8) /* No error handler */
100 100 #define EEH_DEV_SYSFS (1 << 9) /* Sysfs created */
  101 +#define EEH_DEV_REMOVED (1 << 10) /* Removed permanently */
101 102  
102 103 struct eeh_dev {
103 104 int mode; /* EEH mode */
arch/powerpc/include/asm/ppc-pci.h
... ... @@ -58,6 +58,7 @@
58 58 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
59 59 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
60 60 void eeh_pe_state_clear(struct eeh_pe *pe, int state);
  61 +void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
61 62  
62 63 void eeh_sysfs_add_device(struct pci_dev *pdev);
63 64 void eeh_sysfs_remove_device(struct pci_dev *pdev);
arch/powerpc/kernel/eeh_driver.c
... ... @@ -171,6 +171,15 @@
171 171 }
172 172 }
173 173  
  174 +static bool eeh_dev_removed(struct eeh_dev *edev)
  175 +{
  176 + /* EEH device removed ? */
  177 + if (!edev || (edev->mode & EEH_DEV_REMOVED))
  178 + return true;
  179 +
  180 + return false;
  181 +}
  182 +
174 183 /**
175 184 * eeh_report_error - Report pci error to each device driver
176 185 * @data: eeh device
... ... @@ -187,10 +196,8 @@
187 196 enum pci_ers_result rc, *res = userdata;
188 197 struct pci_driver *driver;
189 198  
190   - /* We might not have the associated PCI device,
191   - * then we should continue for next one.
192   - */
193   - if (!dev) return NULL;
  199 + if (!dev || eeh_dev_removed(edev))
  200 + return NULL;
194 201 dev->error_state = pci_channel_io_frozen;
195 202  
196 203 driver = eeh_pcid_get(dev);
... ... @@ -230,6 +237,9 @@
230 237 enum pci_ers_result rc, *res = userdata;
231 238 struct pci_driver *driver;
232 239  
  240 + if (!dev || eeh_dev_removed(edev))
  241 + return NULL;
  242 +
233 243 driver = eeh_pcid_get(dev);
234 244 if (!driver) return NULL;
235 245  
... ... @@ -267,7 +277,8 @@
267 277 enum pci_ers_result rc, *res = userdata;
268 278 struct pci_driver *driver;
269 279  
270   - if (!dev) return NULL;
  280 + if (!dev || eeh_dev_removed(edev))
  281 + return NULL;
271 282 dev->error_state = pci_channel_io_normal;
272 283  
273 284 driver = eeh_pcid_get(dev);
... ... @@ -307,7 +318,8 @@
307 318 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
308 319 struct pci_driver *driver;
309 320  
310   - if (!dev) return NULL;
  321 + if (!dev || eeh_dev_removed(edev))
  322 + return NULL;
311 323 dev->error_state = pci_channel_io_normal;
312 324  
313 325 driver = eeh_pcid_get(dev);
... ... @@ -343,7 +355,8 @@
343 355 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
344 356 struct pci_driver *driver;
345 357  
346   - if (!dev) return NULL;
  358 + if (!dev || eeh_dev_removed(edev))
  359 + return NULL;
347 360 dev->error_state = pci_channel_io_perm_failure;
348 361  
349 362 driver = eeh_pcid_get(dev);
... ... @@ -380,6 +393,16 @@
380 393 if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
381 394 return NULL;
382 395  
  396 + /*
  397 + * We rely on count-based pcibios_release_device() to
  398 + * detach permanently offlined PEs. Unfortunately, that's
  399 + * not reliable enough. We might have the permanently
  400 + * offlined PEs attached, but we needn't take care of
  401 + * them and their child devices.
  402 + */
  403 + if (eeh_dev_removed(edev))
  404 + return NULL;
  405 +
383 406 driver = eeh_pcid_get(dev);
384 407 if (driver) {
385 408 eeh_pcid_put(dev);
386 409  
... ... @@ -694,8 +717,17 @@
694 717 /* Notify all devices that they're about to go down. */
695 718 eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
696 719  
697   - /* Shut down the device drivers for good. */
  720 + /* Mark the PE to be removed permanently */
  721 + pe->freeze_count = EEH_MAX_ALLOWED_FREEZES + 1;
  722 +
  723 + /*
  724 + * Shut down the device drivers for good. We mark
  725 + * all removed devices correctly to avoid access
  726 + * the their PCI config any more.
  727 + */
698 728 if (frozen_bus) {
  729 + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
  730 +
699 731 pci_lock_rescan_remove();
700 732 pcibios_remove_pci_devices(frozen_bus);
701 733 pci_unlock_rescan_remove();
arch/powerpc/kernel/eeh_pe.c
... ... @@ -503,13 +503,17 @@
503 503 struct eeh_dev *edev, *tmp;
504 504 struct pci_dev *pdev;
505 505  
506   - /*
507   - * Mark the PE with the indicated state. Also,
508   - * the associated PCI device will be put into
509   - * I/O frozen state to avoid I/O accesses from
510   - * the PCI device driver.
511   - */
  506 + /* Keep the state of permanently removed PE intact */
  507 + if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
  508 + (state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
  509 + return NULL;
  510 +
512 511 pe->state |= state;
  512 +
  513 + /* Offline PCI devices if applicable */
  514 + if (state != EEH_PE_ISOLATED)
  515 + return NULL;
  516 +
513 517 eeh_pe_for_each_dev(pe, edev, tmp) {
514 518 pdev = eeh_dev_to_pci_dev(edev);
515 519 if (pdev)
516 520  
... ... @@ -532,7 +536,28 @@
532 536 eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
533 537 }
534 538  
  539 +static void *__eeh_pe_dev_mode_mark(void *data, void *flag)
  540 +{
  541 + struct eeh_dev *edev = data;
  542 + int mode = *((int *)flag);
  543 +
  544 + edev->mode |= mode;
  545 +
  546 + return NULL;
  547 +}
  548 +
535 549 /**
  550 + * eeh_pe_dev_state_mark - Mark state for all device under the PE
  551 + * @pe: EEH PE
  552 + *
  553 + * Mark specific state for all child devices of the PE.
  554 + */
  555 +void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
  556 +{
  557 + eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode);
  558 +}
  559 +
  560 +/**
536 561 * __eeh_pe_state_clear - Clear state for the PE
537 562 * @data: EEH PE
538 563 * @flag: state
539 564  
... ... @@ -546,8 +571,16 @@
546 571 struct eeh_pe *pe = (struct eeh_pe *)data;
547 572 int state = *((int *)flag);
548 573  
  574 + /* Keep the state of permanently removed PE intact */
  575 + if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
  576 + (state & EEH_PE_ISOLATED))
  577 + return NULL;
  578 +
549 579 pe->state &= ~state;
550   - pe->check_count = 0;
  580 +
  581 + /* Clear check count since last isolation */
  582 + if (state & EEH_PE_ISOLATED)
  583 + pe->check_count = 0;
551 584  
552 585 return NULL;
553 586 }
arch/powerpc/kernel/pci_of_scan.c
... ... @@ -304,6 +304,9 @@
304 304 struct pci_dev *dev = NULL;
305 305 const __be32 *reg;
306 306 int reglen, devfn;
  307 +#ifdef CONFIG_EEH
  308 + struct eeh_dev *edev = of_node_to_eeh_dev(dn);
  309 +#endif
307 310  
308 311 pr_debug(" * %s\n", dn->full_name);
309 312 if (!of_device_is_available(dn))
... ... @@ -320,6 +323,12 @@
320 323 pci_dev_put(dev);
321 324 return dev;
322 325 }
  326 +
  327 + /* Device removed permanently ? */
  328 +#ifdef CONFIG_EEH
  329 + if (edev && (edev->mode & EEH_DEV_REMOVED))
  330 + return NULL;
  331 +#endif
323 332  
324 333 /* create a new pci_dev for this device */
325 334 dev = of_create_pci_dev(dn, bus, devfn);
arch/powerpc/platforms/powernv/pci.c
... ... @@ -441,11 +441,16 @@
441 441 if (!(phb->flags & PNV_PHB_FLAG_EEH))
442 442 return true;
443 443  
444   - /* PE reset ? */
  444 + /* PE reset or device removed ? */
445 445 edev = of_node_to_eeh_dev(dn);
446   - if (edev && edev->pe &&
447   - (edev->pe->state & EEH_PE_RESET))
448   - return false;
  446 + if (edev) {
  447 + if (edev->pe &&
  448 + (edev->pe->state & EEH_PE_RESET))
  449 + return false;
  450 +
  451 + if (edev->mode & EEH_DEV_REMOVED)
  452 + return false;
  453 + }
449 454  
450 455 return true;
451 456 }