Commit b3b30f5e8a0c50db3d76b6f7c7cc50245aeb57fd
Committed by
Roland Dreier
1 parent
07eeec0627
Exists in
master
and in
7 other branches
IB/mthca: Recover from catastrophic errors
Trigger device remove and then add when a catastrophic error is detected in hardware. This, in turn, will cause a device reset, which we hope will recover from the catastrophic condition. Since this might interfere with debugging the root cause, add a module option to suppress this behaviour. Signed-off-by: Jack Morgenstein <jackm@mellanox.co.il> Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
Showing 3 changed files with 136 additions and 21 deletions Side-by-side Diff
drivers/infiniband/hw/mthca/mthca_catas.c
... | ... | @@ -34,6 +34,7 @@ |
34 | 34 | |
35 | 35 | #include <linux/jiffies.h> |
36 | 36 | #include <linux/timer.h> |
37 | +#include <linux/workqueue.h> | |
37 | 38 | |
38 | 39 | #include "mthca_dev.h" |
39 | 40 | |
40 | 41 | |
... | ... | @@ -48,9 +49,41 @@ |
48 | 49 | |
49 | 50 | static DEFINE_SPINLOCK(catas_lock); |
50 | 51 | |
52 | +static LIST_HEAD(catas_list); | |
53 | +static struct workqueue_struct *catas_wq; | |
54 | +static struct work_struct catas_work; | |
55 | + | |
56 | +static int catas_reset_disable; | |
57 | +module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); | |
58 | +MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); | |
59 | + | |
60 | +static void catas_reset(void *work_ptr) | |
61 | +{ | |
62 | + struct mthca_dev *dev, *tmpdev; | |
63 | + LIST_HEAD(tlist); | |
64 | + int ret; | |
65 | + | |
66 | + mutex_lock(&mthca_device_mutex); | |
67 | + | |
68 | + spin_lock_irq(&catas_lock); | |
69 | + list_splice_init(&catas_list, &tlist); | |
70 | + spin_unlock_irq(&catas_lock); | |
71 | + | |
72 | + list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { | |
73 | + ret = __mthca_restart_one(dev->pdev); | |
74 | + if (ret) | |
75 | + mthca_err(dev, "Reset failed (%d)\n", ret); | |
76 | + else | |
77 | + mthca_dbg(dev, "Reset succeeded\n"); | |
78 | + } | |
79 | + | |
80 | + mutex_unlock(&mthca_device_mutex); | |
81 | +} | |
82 | + | |
51 | 83 | static void handle_catas(struct mthca_dev *dev) |
52 | 84 | { |
53 | 85 | struct ib_event event; |
86 | + unsigned long flags; | |
54 | 87 | const char *type; |
55 | 88 | int i; |
56 | 89 | |
... | ... | @@ -82,6 +115,14 @@ |
82 | 115 | for (i = 0; i < dev->catas_err.size; ++i) |
83 | 116 | mthca_err(dev, " buf[%02x]: %08x\n", |
84 | 117 | i, swab32(readl(dev->catas_err.map + i))); |
118 | + | |
119 | + if (catas_reset_disable) | |
120 | + return; | |
121 | + | |
122 | + spin_lock_irqsave(&catas_lock, flags); | |
123 | + list_add(&dev->catas_err.list, &catas_list); | |
124 | + queue_work(catas_wq, &catas_work); | |
125 | + spin_unlock_irqrestore(&catas_lock, flags); | |
85 | 126 | } |
86 | 127 | |
87 | 128 | static void poll_catas(unsigned long dev_ptr) |
... | ... | @@ -135,6 +176,7 @@ |
135 | 176 | dev->catas_err.timer.data = (unsigned long) dev; |
136 | 177 | dev->catas_err.timer.function = poll_catas; |
137 | 178 | dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; |
179 | + INIT_LIST_HEAD(&dev->catas_err.list); | |
138 | 180 | add_timer(&dev->catas_err.timer); |
139 | 181 | } |
140 | 182 | |
... | ... | @@ -153,5 +195,25 @@ |
153 | 195 | dev->catas_err.addr), |
154 | 196 | dev->catas_err.size * 4); |
155 | 197 | } |
198 | + | |
199 | + spin_lock_irq(&catas_lock); | |
200 | + list_del(&dev->catas_err.list); | |
201 | + spin_unlock_irq(&catas_lock); | |
202 | +} | |
203 | + | |
204 | +int __init mthca_catas_init(void) | |
205 | +{ | |
206 | + INIT_WORK(&catas_work, catas_reset, NULL); | |
207 | + | |
208 | + catas_wq = create_singlethread_workqueue("mthca_catas"); | |
209 | + if (!catas_wq) | |
210 | + return -ENOMEM; | |
211 | + | |
212 | + return 0; | |
213 | +} | |
214 | + | |
215 | +void mthca_catas_cleanup(void) | |
216 | +{ | |
217 | + destroy_workqueue(catas_wq); | |
156 | 218 | } |
drivers/infiniband/hw/mthca/mthca_dev.h
... | ... | @@ -45,6 +45,7 @@ |
45 | 45 | #include <linux/dma-mapping.h> |
46 | 46 | #include <linux/timer.h> |
47 | 47 | #include <linux/mutex.h> |
48 | +#include <linux/list.h> | |
48 | 49 | |
49 | 50 | #include <asm/semaphore.h> |
50 | 51 | |
51 | 52 | |
... | ... | @@ -283,8 +284,11 @@ |
283 | 284 | unsigned long stop; |
284 | 285 | u32 size; |
285 | 286 | struct timer_list timer; |
287 | + struct list_head list; | |
286 | 288 | }; |
287 | 289 | |
290 | +extern struct mutex mthca_device_mutex; | |
291 | + | |
288 | 292 | struct mthca_dev { |
289 | 293 | struct ib_device ib_dev; |
290 | 294 | struct pci_dev *pdev; |
... | ... | @@ -450,6 +454,9 @@ |
450 | 454 | |
451 | 455 | void mthca_start_catas_poll(struct mthca_dev *dev); |
452 | 456 | void mthca_stop_catas_poll(struct mthca_dev *dev); |
457 | +int __mthca_restart_one(struct pci_dev *pdev); | |
458 | +int mthca_catas_init(void); | |
459 | +void mthca_catas_cleanup(void); | |
453 | 460 | |
454 | 461 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); |
455 | 462 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); |
drivers/infiniband/hw/mthca/mthca_main.c
... | ... | @@ -80,6 +80,8 @@ |
80 | 80 | module_param(tune_pci, int, 0444); |
81 | 81 | MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); |
82 | 82 | |
83 | +struct mutex mthca_device_mutex; | |
84 | + | |
83 | 85 | static const char mthca_version[] __devinitdata = |
84 | 86 | DRV_NAME ": Mellanox InfiniBand HCA driver v" |
85 | 87 | DRV_VERSION " (" DRV_RELDATE ")\n"; |
86 | 88 | |
87 | 89 | |
88 | 90 | |
... | ... | @@ -978,28 +980,15 @@ |
978 | 980 | MTHCA_FLAG_SINAI_OPT } |
979 | 981 | }; |
980 | 982 | |
981 | -static int __devinit mthca_init_one(struct pci_dev *pdev, | |
982 | - const struct pci_device_id *id) | |
983 | +static int __mthca_init_one(struct pci_dev *pdev, int hca_type) | |
983 | 984 | { |
984 | - static int mthca_version_printed = 0; | |
985 | 985 | int ddr_hidden = 0; |
986 | 986 | int err; |
987 | 987 | struct mthca_dev *mdev; |
988 | 988 | |
989 | - if (!mthca_version_printed) { | |
990 | - printk(KERN_INFO "%s", mthca_version); | |
991 | - ++mthca_version_printed; | |
992 | - } | |
993 | - | |
994 | 989 | printk(KERN_INFO PFX "Initializing %s\n", |
995 | 990 | pci_name(pdev)); |
996 | 991 | |
997 | - if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { | |
998 | - printk(KERN_ERR PFX "%s has invalid driver data %lx\n", | |
999 | - pci_name(pdev), id->driver_data); | |
1000 | - return -ENODEV; | |
1001 | - } | |
1002 | - | |
1003 | 992 | err = pci_enable_device(pdev); |
1004 | 993 | if (err) { |
1005 | 994 | dev_err(&pdev->dev, "Cannot enable PCI device, " |
... | ... | @@ -1065,7 +1054,7 @@ |
1065 | 1054 | |
1066 | 1055 | mdev->pdev = pdev; |
1067 | 1056 | |
1068 | - mdev->mthca_flags = mthca_hca_table[id->driver_data].flags; | |
1057 | + mdev->mthca_flags = mthca_hca_table[hca_type].flags; | |
1069 | 1058 | if (ddr_hidden) |
1070 | 1059 | mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; |
1071 | 1060 | |
1072 | 1061 | |
... | ... | @@ -1099,13 +1088,13 @@ |
1099 | 1088 | if (err) |
1100 | 1089 | goto err_cmd; |
1101 | 1090 | |
1102 | - if (mdev->fw_ver < mthca_hca_table[id->driver_data].latest_fw) { | |
1091 | + if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { | |
1103 | 1092 | mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", |
1104 | 1093 | (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, |
1105 | 1094 | (int) (mdev->fw_ver & 0xffff), |
1106 | - (int) (mthca_hca_table[id->driver_data].latest_fw >> 32), | |
1107 | - (int) (mthca_hca_table[id->driver_data].latest_fw >> 16) & 0xffff, | |
1108 | - (int) (mthca_hca_table[id->driver_data].latest_fw & 0xffff)); | |
1095 | + (int) (mthca_hca_table[hca_type].latest_fw >> 32), | |
1096 | + (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, | |
1097 | + (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); | |
1109 | 1098 | mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); |
1110 | 1099 | } |
1111 | 1100 | |
... | ... | @@ -1122,6 +1111,7 @@ |
1122 | 1111 | goto err_unregister; |
1123 | 1112 | |
1124 | 1113 | pci_set_drvdata(pdev, mdev); |
1114 | + mdev->hca_type = hca_type; | |
1125 | 1115 | |
1126 | 1116 | return 0; |
1127 | 1117 | |
... | ... | @@ -1166,7 +1156,7 @@ |
1166 | 1156 | return err; |
1167 | 1157 | } |
1168 | 1158 | |
1169 | -static void __devexit mthca_remove_one(struct pci_dev *pdev) | |
1159 | +static void __mthca_remove_one(struct pci_dev *pdev) | |
1170 | 1160 | { |
1171 | 1161 | struct mthca_dev *mdev = pci_get_drvdata(pdev); |
1172 | 1162 | u8 status; |
... | ... | @@ -1211,6 +1201,51 @@ |
1211 | 1201 | } |
1212 | 1202 | } |
1213 | 1203 | |
1204 | +int __mthca_restart_one(struct pci_dev *pdev) | |
1205 | +{ | |
1206 | + struct mthca_dev *mdev; | |
1207 | + | |
1208 | + mdev = pci_get_drvdata(pdev); | |
1209 | + if (!mdev) | |
1210 | + return -ENODEV; | |
1211 | + __mthca_remove_one(pdev); | |
1212 | + return __mthca_init_one(pdev, mdev->hca_type); | |
1213 | +} | |
1214 | + | |
1215 | +static int __devinit mthca_init_one(struct pci_dev *pdev, | |
1216 | + const struct pci_device_id *id) | |
1217 | +{ | |
1218 | + static int mthca_version_printed = 0; | |
1219 | + int ret; | |
1220 | + | |
1221 | + mutex_lock(&mthca_device_mutex); | |
1222 | + | |
1223 | + if (!mthca_version_printed) { | |
1224 | + printk(KERN_INFO "%s", mthca_version); | |
1225 | + ++mthca_version_printed; | |
1226 | + } | |
1227 | + | |
1228 | + if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { | |
1229 | + printk(KERN_ERR PFX "%s has invalid driver data %lx\n", | |
1230 | + pci_name(pdev), id->driver_data); | |
1231 | + mutex_unlock(&mthca_device_mutex); | |
1232 | + return -ENODEV; | |
1233 | + } | |
1234 | + | |
1235 | + ret = __mthca_init_one(pdev, id->driver_data); | |
1236 | + | |
1237 | + mutex_unlock(&mthca_device_mutex); | |
1238 | + | |
1239 | + return ret; | |
1240 | +} | |
1241 | + | |
1242 | +static void __devexit mthca_remove_one(struct pci_dev *pdev) | |
1243 | +{ | |
1244 | + mutex_lock(&mthca_device_mutex); | |
1245 | + __mthca_remove_one(pdev); | |
1246 | + mutex_unlock(&mthca_device_mutex); | |
1247 | +} | |
1248 | + | |
1214 | 1249 | static struct pci_device_id mthca_pci_table[] = { |
1215 | 1250 | { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), |
1216 | 1251 | .driver_data = TAVOR }, |
1217 | 1252 | |
1218 | 1253 | |
... | ... | @@ -1248,13 +1283,24 @@ |
1248 | 1283 | { |
1249 | 1284 | int ret; |
1250 | 1285 | |
1286 | + mutex_init(&mthca_device_mutex); | |
1287 | + ret = mthca_catas_init(); | |
1288 | + if (ret) | |
1289 | + return ret; | |
1290 | + | |
1251 | 1291 | ret = pci_register_driver(&mthca_driver); |
1252 | - return ret < 0 ? ret : 0; | |
1292 | + if (ret < 0) { | |
1293 | + mthca_catas_cleanup(); | |
1294 | + return ret; | |
1295 | + } | |
1296 | + | |
1297 | + return 0; | |
1253 | 1298 | } |
1254 | 1299 | |
1255 | 1300 | static void __exit mthca_cleanup(void) |
1256 | 1301 | { |
1257 | 1302 | pci_unregister_driver(&mthca_driver); |
1303 | + mthca_catas_cleanup(); | |
1258 | 1304 | } |
1259 | 1305 | |
1260 | 1306 | module_init(mthca_init); |