blob: 7859af897058c3d1ae6ac602578e9db6087561ed [file] [log] [blame]
Linas Vepstas77bd7412005-11-03 18:52:49 -06001/*
2 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
Linas Vepstas3c8c90a2007-05-24 03:28:01 +10003 * Copyright IBM Corp. 2004 2005
4 * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
Linas Vepstas77bd7412005-11-03 18:52:49 -06005 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or (at
11 * your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
Linas Vepstas3c8c90a2007-05-24 03:28:01 +100023 * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
Linas Vepstas77bd7412005-11-03 18:52:49 -060024 */
25#include <linux/delay.h>
Linas Vepstas77bd7412005-11-03 18:52:49 -060026#include <linux/interrupt.h>
Linas Vepstasac325ac2006-04-18 21:05:21 -070027#include <linux/irq.h>
Gavin Shanfeadf7c2012-09-17 04:34:27 +000028#include <linux/module.h>
Linas Vepstas77bd7412005-11-03 18:52:49 -060029#include <linux/pci.h>
30#include <asm/eeh.h>
31#include <asm/eeh_event.h>
32#include <asm/ppc-pci.h>
33#include <asm/pci-bridge.h>
34#include <asm/prom.h>
35#include <asm/rtas.h>
36
Wei Yang67086e32016-03-04 10:53:11 +110037struct eeh_rmv_data {
38 struct list_head edev_list;
39 int removed;
40};
41
Sam Bobroff30424e32018-05-25 13:11:34 +100042static int eeh_result_priority(enum pci_ers_result result)
43{
44 switch (result) {
45 case PCI_ERS_RESULT_NONE:
46 return 1;
47 case PCI_ERS_RESULT_NO_AER_DRIVER:
48 return 2;
49 case PCI_ERS_RESULT_RECOVERED:
50 return 3;
51 case PCI_ERS_RESULT_CAN_RECOVER:
52 return 4;
53 case PCI_ERS_RESULT_DISCONNECT:
54 return 5;
55 case PCI_ERS_RESULT_NEED_RESET:
56 return 6;
57 default:
58 WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result);
59 return 0;
60 }
61};
62
Sam Bobroff20b34492018-05-25 13:11:40 +100063const char *pci_ers_result_name(enum pci_ers_result result)
64{
65 switch (result) {
66 case PCI_ERS_RESULT_NONE:
67 return "none";
68 case PCI_ERS_RESULT_CAN_RECOVER:
69 return "can recover";
70 case PCI_ERS_RESULT_NEED_RESET:
71 return "need reset";
72 case PCI_ERS_RESULT_DISCONNECT:
73 return "disconnect";
74 case PCI_ERS_RESULT_RECOVERED:
75 return "recovered";
76 case PCI_ERS_RESULT_NO_AER_DRIVER:
77 return "no AER driver";
78 default:
79 WARN_ONCE(1, "Unknown result type: %d\n", (int)result);
80 return "unknown";
81 }
82};
83
84static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
85 const char *fmt, ...)
86{
87 struct va_format vaf;
88 va_list args;
89
90 va_start(args, fmt);
91
92 vaf.fmt = fmt;
93 vaf.va = &args;
94
95 printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
96 edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
97
98 va_end(args);
99}
100
Sam Bobroff30424e32018-05-25 13:11:34 +1000101static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
102 enum pci_ers_result new)
103{
104 if (eeh_result_priority(new) > eeh_result_priority(old))
105 return new;
106 return old;
107}
108
Sam Bobroffe2b810d52018-05-25 13:11:36 +1000109static bool eeh_dev_removed(struct eeh_dev *edev)
110{
111 return !edev || (edev->mode & EEH_DEV_REMOVED);
112}
113
114static bool eeh_edev_actionable(struct eeh_dev *edev)
115{
116 return (edev->pdev && !eeh_dev_removed(edev) &&
117 !eeh_pe_passed(edev->pe));
118}
119
Gavin Shan29f8bf12012-02-27 20:04:02 +0000120/**
Gavin Shanfeadf7c2012-09-17 04:34:27 +0000121 * eeh_pcid_get - Get the PCI device driver
122 * @pdev: PCI device
123 *
124 * The function is used to retrieve the PCI device driver for
125 * the indicated PCI device. Besides, we will increase the reference
126 * of the PCI device driver to prevent that being unloaded on
127 * the fly. Otherwise, kernel crash would be seen.
128 */
129static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
130{
131 if (!pdev || !pdev->driver)
132 return NULL;
133
134 if (!try_module_get(pdev->driver->driver.owner))
135 return NULL;
136
137 return pdev->driver;
138}
139
140/**
141 * eeh_pcid_put - Dereference on the PCI device driver
142 * @pdev: PCI device
143 *
144 * The function is called to do dereference on the PCI device
145 * driver of the indicated PCI device.
146 */
147static inline void eeh_pcid_put(struct pci_dev *pdev)
148{
149 if (!pdev || !pdev->driver)
150 return;
151
152 module_put(pdev->driver->driver.owner);
153}
154
Mike Mason8535ef02009-02-10 11:12:21 +0000155/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000156 * eeh_disable_irq - Disable interrupt for the recovering device
157 * @dev: PCI device
158 *
159 * This routine must be called when reporting temporary or permanent
160 * error to the particular PCI device to disable interrupt of that
161 * device. If the device has enabled MSI or MSI-X interrupt, we needn't
162 * do real work because EEH should freeze DMA transfers for those PCI
163 * devices encountering EEH errors, which includes MSI or MSI-X.
Mike Mason8535ef02009-02-10 11:12:21 +0000164 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000165static void eeh_disable_irq(struct eeh_dev *edev)
Mike Mason8535ef02009-02-10 11:12:21 +0000166{
Mike Mason8535ef02009-02-10 11:12:21 +0000167 /* Don't disable MSI and MSI-X interrupts. They are
168 * effectively disabled by the DMA Stopped state
169 * when an EEH error occurs.
Gavin Shan29f8bf12012-02-27 20:04:02 +0000170 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000171 if (edev->pdev->msi_enabled || edev->pdev->msix_enabled)
Mike Mason8535ef02009-02-10 11:12:21 +0000172 return;
173
Sam Bobroff010acfa2018-05-25 13:11:38 +1000174 if (!irq_has_action(edev->pdev->irq))
Mike Mason8535ef02009-02-10 11:12:21 +0000175 return;
176
Gavin Shandbbceee2012-09-07 22:44:20 +0000177 edev->mode |= EEH_DEV_IRQ_DISABLED;
Sam Bobroff010acfa2018-05-25 13:11:38 +1000178 disable_irq_nosync(edev->pdev->irq);
Mike Mason8535ef02009-02-10 11:12:21 +0000179}
180
181/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000182 * eeh_enable_irq - Enable interrupt for the recovering device
183 * @dev: PCI device
184 *
185 * This routine must be called to enable interrupt while failed
186 * device could be resumed.
Mike Mason8535ef02009-02-10 11:12:21 +0000187 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000188static void eeh_enable_irq(struct eeh_dev *edev)
Mike Mason8535ef02009-02-10 11:12:21 +0000189{
Gavin Shandbbceee2012-09-07 22:44:20 +0000190 if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
191 edev->mode &= ~EEH_DEV_IRQ_DISABLED;
Thomas Gleixnerb8a9a112014-02-23 21:40:09 +0000192 /*
193 * FIXME !!!!!
194 *
195 * This is just ass backwards. This maze has
196 * unbalanced irq_enable/disable calls. So instead of
197 * finding the root cause it works around the warning
198 * in the irq_enable code by conditionally calling
199 * into it.
200 *
201 * That's just wrong.The warning in the core code is
Michael Ellerman027dfac2016-06-01 16:34:37 +1000202 * there to tell people to fix their asymmetries in
Thomas Gleixnerb8a9a112014-02-23 21:40:09 +0000203 * their own code, not by abusing the core information
204 * to avoid it.
205 *
206 * I so wish that the assymetry would be the other way
207 * round and a few more irq_disable calls render that
208 * shit unusable forever.
209 *
210 * tglx
211 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000212 if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq)))
213 enable_irq(edev->pdev->irq);
Thomas Gleixner57310c32014-03-05 00:06:11 +0100214 }
Mike Mason8535ef02009-02-10 11:12:21 +0000215}
216
Sam Bobroffd6c49322018-05-25 13:11:32 +1000217static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000218{
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000219 struct pci_dev *pdev;
220
221 if (!edev)
222 return NULL;
223
Gavin Shan5a0cdbf2016-04-27 11:14:51 +1000224 /*
225 * We cannot access the config space on some adapters.
226 * Otherwise, it will cause fenced PHB. We don't save
227 * the content in their config space and will restore
228 * from the initial config space saved when the EEH
229 * device is created.
230 */
231 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
232 return NULL;
233
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000234 pdev = eeh_dev_to_pci_dev(edev);
235 if (!pdev)
236 return NULL;
237
238 pci_save_state(pdev);
239 return NULL;
240}
241
Sam Bobroff47cc8c12018-05-25 13:11:37 +1000242static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
243{
244 struct eeh_pe *pe;
245 struct eeh_dev *edev, *tmp;
246
247 eeh_for_each_pe(root, pe)
248 eeh_pe_for_each_dev(pe, edev, tmp)
249 if (eeh_edev_actionable(edev))
250 edev->pdev->error_state = s;
251}
252
Sam Bobroff010acfa2018-05-25 13:11:38 +1000253static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
254{
255 struct eeh_pe *pe;
256 struct eeh_dev *edev, *tmp;
257
258 eeh_for_each_pe(root, pe) {
259 eeh_pe_for_each_dev(pe, edev, tmp) {
260 if (!eeh_edev_actionable(edev))
261 continue;
262
263 if (!eeh_pcid_get(edev->pdev))
264 continue;
265
266 if (enable)
267 eeh_enable_irq(edev);
268 else
269 eeh_disable_irq(edev);
270
271 eeh_pcid_put(edev->pdev);
272 }
273 }
274}
275
Sam Bobroff20b34492018-05-25 13:11:40 +1000276typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
277 struct pci_driver *);
278static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
279 enum pci_ers_result *result)
280{
281 struct pci_driver *driver;
282 enum pci_ers_result new_result;
283
Sam Bobroffbcbe3732018-09-12 11:23:21 +1000284 if (!edev->pdev) {
285 eeh_edev_info(edev, "no device");
286 return;
287 }
Sam Bobroff20b34492018-05-25 13:11:40 +1000288 device_lock(&edev->pdev->dev);
289 if (eeh_edev_actionable(edev)) {
290 driver = eeh_pcid_get(edev->pdev);
291
292 if (!driver)
293 eeh_edev_info(edev, "no driver");
294 else if (!driver->err_handler)
295 eeh_edev_info(edev, "driver not EEH aware");
296 else if (edev->mode & EEH_DEV_NO_HANDLER)
297 eeh_edev_info(edev, "driver bound too late");
298 else {
299 new_result = fn(edev, driver);
300 eeh_edev_info(edev, "%s driver reports: '%s'",
301 driver->name,
302 pci_ers_result_name(new_result));
303 if (result)
304 *result = pci_ers_merge_result(*result,
305 new_result);
306 }
307 if (driver)
308 eeh_pcid_put(edev->pdev);
309 } else {
310 eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
311 !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
312 }
313 device_unlock(&edev->pdev->dev);
314}
315
316static void eeh_pe_report(const char *name, struct eeh_pe *root,
317 eeh_report_fn fn, enum pci_ers_result *result)
318{
319 struct eeh_pe *pe;
320 struct eeh_dev *edev, *tmp;
321
322 pr_info("EEH: Beginning: '%s'\n", name);
323 eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
324 eeh_pe_report_edev(edev, fn, result);
325 if (result)
326 pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
327 name, pci_ers_result_name(*result));
328 else
329 pr_info("EEH: Finished:'%s'", name);
330}
331
Linas Vepstascb5b56242006-09-15 18:56:35 -0500332/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000333 * eeh_report_error - Report pci error to each device driver
Sam Bobroff20b34492018-05-25 13:11:40 +1000334 * @edev: eeh device
335 * @driver: device's PCI driver
Gavin Shana84f2732013-06-20 13:20:51 +0800336 *
Sam Bobroff20b34492018-05-25 13:11:40 +1000337 * Report an EEH error to each device driver.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600338 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000339static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
340 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600341{
Sam Bobroff20b34492018-05-25 13:11:40 +1000342 enum pci_ers_result rc;
343 struct pci_dev *dev = edev->pdev;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600344
Sam Bobroff20b34492018-05-25 13:11:40 +1000345 if (!driver->err_handler->error_detected)
346 return PCI_ERS_RESULT_NONE;
Michael Neulingf0295e02018-03-26 15:17:07 +1100347
Sam Bobroff20b34492018-05-25 13:11:40 +1000348 eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
349 driver->name);
Gavin Shan29f8bf12012-02-27 20:04:02 +0000350 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
Linas Vepstas2a50f142007-11-03 07:27:50 +1100351
Wei Yang67086e32016-03-04 10:53:11 +1100352 edev->in_error = true;
Bryant G. Ly856e1eb2018-01-05 10:45:47 -0600353 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
Sam Bobroff20b34492018-05-25 13:11:40 +1000354 return rc;
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500355}
356
357/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000358 * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
Sam Bobroff20b34492018-05-25 13:11:40 +1000359 * @edev: eeh device
360 * @driver: device's PCI driver
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500361 *
Linas Vepstas638799b2007-11-03 07:25:55 +1100362 * Tells each device driver that IO ports, MMIO and config space I/O
Sam Bobroff20b34492018-05-25 13:11:40 +1000363 * are now enabled.
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500364 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000365static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
366 struct pci_driver *driver)
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500367{
Sam Bobroff20b34492018-05-25 13:11:40 +1000368 if (!driver->err_handler->mmio_enabled)
369 return PCI_ERS_RESULT_NONE;
370 eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
371 return driver->err_handler->mmio_enabled(edev->pdev);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600372}
373
Linas Vepstascb5b56242006-09-15 18:56:35 -0500374/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000375 * eeh_report_reset - Tell device that slot has been reset
Sam Bobroff20b34492018-05-25 13:11:40 +1000376 * @edev: eeh device
377 * @driver: device's PCI driver
Gavin Shan29f8bf12012-02-27 20:04:02 +0000378 *
379 * This routine must be called while EEH tries to reset particular
380 * PCI device so that the associated PCI device driver could take
381 * some actions, usually to save data the driver needs so that the
382 * driver can work again while the device is recovered.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600383 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000384static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
385 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600386{
Sam Bobroff20b34492018-05-25 13:11:40 +1000387 if (!driver->err_handler->slot_reset || !edev->in_error)
388 return PCI_ERS_RESULT_NONE;
389 eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
390 return driver->err_handler->slot_reset(edev->pdev);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600391}
392
Sam Bobroffd6c49322018-05-25 13:11:32 +1000393static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000394{
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000395 struct pci_dev *pdev;
396
397 if (!edev)
398 return NULL;
399
Gavin Shan5a0cdbf2016-04-27 11:14:51 +1000400 /*
401 * The content in the config space isn't saved because
402 * the blocked config space on some adapters. We have
403 * to restore the initial saved config space when the
404 * EEH device is created.
405 */
406 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
Sam Bobroff80e65b02018-09-12 11:23:26 +1000407 if (list_is_last(&edev->entry, &edev->pe->edevs))
Gavin Shan5a0cdbf2016-04-27 11:14:51 +1000408 eeh_pe_restore_bars(edev->pe);
409
410 return NULL;
411 }
412
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000413 pdev = eeh_dev_to_pci_dev(edev);
414 if (!pdev)
415 return NULL;
416
417 pci_restore_state(pdev);
418 return NULL;
419}
420
Linas Vepstascb5b56242006-09-15 18:56:35 -0500421/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000422 * eeh_report_resume - Tell device to resume normal operations
Sam Bobroff20b34492018-05-25 13:11:40 +1000423 * @edev: eeh device
424 * @driver: device's PCI driver
Gavin Shan29f8bf12012-02-27 20:04:02 +0000425 *
426 * This routine must be called to notify the device driver that it
427 * could resume so that the device driver can do some initialization
428 * to make the recovered device work again.
Linas Vepstascb5b56242006-09-15 18:56:35 -0500429 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000430static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
431 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600432{
Sam Bobroff20b34492018-05-25 13:11:40 +1000433 if (!driver->err_handler->resume || !edev->in_error)
434 return PCI_ERS_RESULT_NONE;
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000435
Sam Bobroff20b34492018-05-25 13:11:40 +1000436 eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
437 driver->err_handler->resume(edev->pdev);
Michael Neulingf0295e02018-03-26 15:17:07 +1100438
Sam Bobroff20b34492018-05-25 13:11:40 +1000439 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
Bryant G. Ly856e1eb2018-01-05 10:45:47 -0600440#ifdef CONFIG_PCI_IOV
Juan J. Alvarez521ca5a2018-02-15 12:49:51 -0600441 if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
442 eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
Bryant G. Ly856e1eb2018-01-05 10:45:47 -0600443#endif
Sam Bobroff20b34492018-05-25 13:11:40 +1000444 return PCI_ERS_RESULT_NONE;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600445}
446
Linas Vepstascb5b56242006-09-15 18:56:35 -0500447/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000448 * eeh_report_failure - Tell device driver that device is dead.
Sam Bobroff20b34492018-05-25 13:11:40 +1000449 * @edev: eeh device
450 * @driver: device's PCI driver
Linas Vepstascb5b56242006-09-15 18:56:35 -0500451 *
452 * This informs the device driver that the device is permanently
453 * dead, and that no further recovery attempts will be made on it.
454 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000455static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
456 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600457{
Sam Bobroff20b34492018-05-25 13:11:40 +1000458 enum pci_ers_result rc;
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000459
Sam Bobroff20b34492018-05-25 13:11:40 +1000460 if (!driver->err_handler->error_detected)
461 return PCI_ERS_RESULT_NONE;
Michael Neulingf0295e02018-03-26 15:17:07 +1100462
Sam Bobroff20b34492018-05-25 13:11:40 +1000463 eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
464 driver->name);
465 rc = driver->err_handler->error_detected(edev->pdev,
466 pci_channel_io_perm_failure);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600467
Sam Bobroff20b34492018-05-25 13:11:40 +1000468 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT);
469 return rc;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600470}
471
Sam Bobroffbf773df2018-09-12 11:23:25 +1000472static void *eeh_add_virt_device(struct eeh_dev *edev)
Wei Yang67086e32016-03-04 10:53:11 +1100473{
474 struct pci_driver *driver;
Wei Yang67086e32016-03-04 10:53:11 +1100475 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
476 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
477
478 if (!(edev->physfn)) {
479 pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
Alexey Kardashevskiy69672bd2017-08-29 17:34:01 +1000480 __func__, pdn->phb->global_number, pdn->busno,
Wei Yang67086e32016-03-04 10:53:11 +1100481 PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
482 return NULL;
483 }
484
485 driver = eeh_pcid_get(dev);
486 if (driver) {
Sam Bobroff46d4be42018-05-25 13:11:30 +1000487 if (driver->err_handler) {
488 eeh_pcid_put(dev);
Wei Yang67086e32016-03-04 10:53:11 +1100489 return NULL;
Sam Bobroff46d4be42018-05-25 13:11:30 +1000490 }
491 eeh_pcid_put(dev);
Wei Yang67086e32016-03-04 10:53:11 +1100492 }
493
Bryant G. Ly988fc3b2017-11-09 08:00:33 -0600494#ifdef CONFIG_PCI_IOV
Jan H. Schönherr753f6122017-09-26 12:53:23 -0500495 pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
Wei Yang67086e32016-03-04 10:53:11 +1100496#endif
497 return NULL;
498}
499
Sam Bobroffd6c49322018-05-25 13:11:32 +1000500static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
Gavin Shanf5c57712013-07-24 10:24:58 +0800501{
502 struct pci_driver *driver;
Gavin Shanf5c57712013-07-24 10:24:58 +0800503 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
Wei Yang67086e32016-03-04 10:53:11 +1100504 struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
505 int *removed = rmv_data ? &rmv_data->removed : NULL;
Gavin Shanf5c57712013-07-24 10:24:58 +0800506
507 /*
508 * Actually, we should remove the PCI bridges as well.
509 * However, that's lots of complexity to do that,
510 * particularly some of devices under the bridge might
511 * support EEH. So we just care about PCI devices for
512 * simplicity here.
513 */
Bjorn Helgaas93de6902015-12-03 13:18:18 -0600514 if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
Gavin Shanf5c57712013-07-24 10:24:58 +0800515 return NULL;
Thadeu Lima de Souza Cascardo8cc6b6c2014-02-05 16:20:45 -0200516
Gavin Shand2b0f6f2014-04-24 18:00:19 +1000517 /*
518 * We rely on count-based pcibios_release_device() to
519 * detach permanently offlined PEs. Unfortunately, that's
520 * not reliable enough. We might have the permanently
521 * offlined PEs attached, but we needn't take care of
522 * them and their child devices.
523 */
524 if (eeh_dev_removed(edev))
525 return NULL;
526
Sam Bobroff46d4be42018-05-25 13:11:30 +1000527 if (removed) {
528 if (eeh_pe_passed(edev->pe))
Gavin Shan3fa7bf72016-03-04 10:53:13 +1100529 return NULL;
Sam Bobroff46d4be42018-05-25 13:11:30 +1000530 driver = eeh_pcid_get(dev);
531 if (driver) {
532 if (driver->err_handler &&
533 driver->err_handler->error_detected &&
534 driver->err_handler->slot_reset) {
535 eeh_pcid_put(dev);
536 return NULL;
537 }
538 eeh_pcid_put(dev);
539 }
Thadeu Lima de Souza Cascardo8cc6b6c2014-02-05 16:20:45 -0200540 }
Gavin Shanf5c57712013-07-24 10:24:58 +0800541
542 /* Remove it from PCI subsystem */
543 pr_debug("EEH: Removing %s without EEH sensitive driver\n",
544 pci_name(dev));
Gavin Shanf5c57712013-07-24 10:24:58 +0800545 edev->mode |= EEH_DEV_DISCONNECTED;
Wei Yang67086e32016-03-04 10:53:11 +1100546 if (removed)
547 (*removed)++;
Gavin Shanf5c57712013-07-24 10:24:58 +0800548
Wei Yang67086e32016-03-04 10:53:11 +1100549 if (edev->physfn) {
Bryant G. Ly988fc3b2017-11-09 08:00:33 -0600550#ifdef CONFIG_PCI_IOV
Wei Yang67086e32016-03-04 10:53:11 +1100551 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
552
Jan H. Schönherr753f6122017-09-26 12:53:23 -0500553 pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
Wei Yang67086e32016-03-04 10:53:11 +1100554 edev->pdev = NULL;
555
556 /*
557 * We have to set the VF PE number to invalid one, which is
558 * required to plug the VF successfully.
559 */
560 pdn->pe_number = IODA_INVALID_PE;
561#endif
562 if (rmv_data)
Sam Bobroff80e65b02018-09-12 11:23:26 +1000563 list_add(&edev->rmv_entry, &rmv_data->edev_list);
Wei Yang67086e32016-03-04 10:53:11 +1100564 } else {
565 pci_lock_rescan_remove();
566 pci_stop_and_remove_bus_device(dev);
567 pci_unlock_rescan_remove();
568 }
Gavin Shanf5c57712013-07-24 10:24:58 +0800569
570 return NULL;
571}
572
Sam Bobroffd6c49322018-05-25 13:11:32 +1000573static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
Gavin Shanf5c57712013-07-24 10:24:58 +0800574{
Gavin Shanf5c57712013-07-24 10:24:58 +0800575 struct eeh_dev *edev, *tmp;
576
577 eeh_pe_for_each_dev(pe, edev, tmp) {
578 if (!(edev->mode & EEH_DEV_DISCONNECTED))
579 continue;
580
581 edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
582 eeh_rmv_from_parent_pe(edev);
583 }
584
585 return NULL;
586}
587
Gavin Shan78954702014-04-24 18:00:14 +1000588/*
589 * Explicitly clear PE's frozen state for PowerNV where
590 * we have frozen PE until BAR restore is completed. It's
591 * harmless to clear it for pSeries. To be consistent with
592 * PE reset (for 3 times), we try to clear the frozen state
593 * for 3 times as well.
594 */
Sam Bobroffd6c49322018-05-25 13:11:32 +1000595static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag)
Gavin Shan78954702014-04-24 18:00:14 +1000596{
Gavin Shanf05fea52017-01-19 10:10:16 +1100597 bool clear_sw_state = *(bool *)flag;
Gavin Shanc9dd0142014-09-30 12:39:02 +1000598 int i, rc = 1;
Gavin Shan78954702014-04-24 18:00:14 +1000599
Gavin Shanc9dd0142014-09-30 12:39:02 +1000600 for (i = 0; rc && i < 3; i++)
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000601 rc = eeh_unfreeze_pe(pe, clear_sw_state);
Gavin Shan78954702014-04-24 18:00:14 +1000602
Gavin Shanc9dd0142014-09-30 12:39:02 +1000603 /* Stop immediately on any errors */
Gavin Shan2c665992014-05-05 09:29:02 +1000604 if (rc) {
Gavin Shanc9dd0142014-09-30 12:39:02 +1000605 pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n",
606 __func__, rc, pe->phb->global_number, pe->addr);
Gavin Shan2c665992014-05-05 09:29:02 +1000607 return (void *)pe;
608 }
609
610 return NULL;
611}
612
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000613static int eeh_clear_pe_frozen_state(struct eeh_pe *pe,
614 bool clear_sw_state)
Gavin Shan2c665992014-05-05 09:29:02 +1000615{
616 void *rc;
617
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000618 rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state);
Gavin Shan2c665992014-05-05 09:29:02 +1000619 if (!rc)
Gavin Shan78954702014-04-24 18:00:14 +1000620 eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
621
Gavin Shan2c665992014-05-05 09:29:02 +1000622 return rc ? -EIO : 0;
Gavin Shan78954702014-04-24 18:00:14 +1000623}
624
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000625int eeh_pe_reset_and_recover(struct eeh_pe *pe)
626{
Gavin Shan2efc7712016-04-27 11:14:52 +1000627 int ret;
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000628
629 /* Bail if the PE is being recovered */
630 if (pe->state & EEH_PE_RECOVERING)
631 return 0;
632
633 /* Put the PE into recovery mode */
634 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
635
636 /* Save states */
637 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
638
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000639 /* Issue reset */
Russell Currey6654c932016-11-17 16:07:47 +1100640 ret = eeh_pe_reset_full(pe);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000641 if (ret) {
Gavin Shan28bf36f2014-11-14 10:47:29 +1100642 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000643 return ret;
644 }
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000645
646 /* Unfreeze the PE */
647 ret = eeh_clear_pe_frozen_state(pe, true);
648 if (ret) {
649 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
650 return ret;
651 }
652
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000653 /* Restore device state */
654 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
655
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000656 /* Clear recovery mode */
657 eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
658
659 return 0;
660}
661
Linas Vepstas77bd7412005-11-03 18:52:49 -0600662/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000663 * eeh_reset_device - Perform actual reset of a pci slot
Sam Bobroff5fd13462018-03-19 13:48:55 +1100664 * @driver_eeh_aware: Does the device's driver provide EEH support?
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000665 * @pe: EEH PE
Gavin Shan29f8bf12012-02-27 20:04:02 +0000666 * @bus: PCI bus corresponding to the isolcated slot
Sam Bobroff5fd13462018-03-19 13:48:55 +1100667 * @rmv_data: Optional, list to record removed devices
Linas Vepstas77bd7412005-11-03 18:52:49 -0600668 *
Gavin Shan29f8bf12012-02-27 20:04:02 +0000669 * This routine must be called to do reset on the indicated PE.
670 * During the reset, udev might be invoked because those affected
671 * PCI devices will be removed and then added.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600672 */
Wei Yang67086e32016-03-04 10:53:11 +1100673static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
Sam Bobroff5fd13462018-03-19 13:48:55 +1100674 struct eeh_rmv_data *rmv_data,
675 bool driver_eeh_aware)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600676{
Arnd Bergmannedfd17f2017-11-04 22:26:52 +0100677 time64_t tstamp;
Wei Yang67086e32016-03-04 10:53:11 +1100678 int cnt, rc;
679 struct eeh_dev *edev;
Linas Vepstas42405452006-04-28 17:39:38 -0500680
681 /* pcibios will clear the counter; save the value */
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000682 cnt = pe->freeze_count;
Gavin Shan5a719782013-06-20 13:21:01 +0800683 tstamp = pe->tstamp;
Linas Vepstas42405452006-04-28 17:39:38 -0500684
Gavin Shan20ee6a92012-09-11 19:16:17 +0000685 /*
686 * We don't remove the corresponding PE instances because
687 * we need the information afterwords. The attached EEH
688 * devices are expected to be attached soon when calling
Gavin Shanbd251b82016-05-03 15:41:37 +1000689 * into pci_hp_add_devices().
Gavin Shan20ee6a92012-09-11 19:16:17 +0000690 */
Gavin Shanf5c57712013-07-24 10:24:58 +0800691 eeh_pe_state_mark(pe, EEH_PE_KEEP);
Sam Bobroff54048cf2018-03-21 13:06:40 +1100692 if (driver_eeh_aware || (pe->type & EEH_PE_VF)) {
Gavin Shancca0e5422016-06-24 14:49:02 +1000693 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
Sam Bobroff54048cf2018-03-21 13:06:40 +1100694 } else {
695 pci_lock_rescan_remove();
696 pci_hp_remove_devices(bus);
697 pci_unlock_rescan_remove();
Rafael J. Wysocki1c2042c2014-01-15 14:33:20 +0100698 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600699
Gavin Shand0914f52014-04-24 18:00:12 +1000700 /*
701 * Reset the pci controller. (Asserts RST#; resets config space).
Linas Vepstasb6495c02005-11-03 18:54:54 -0600702 * Reconfigure bridges and devices. Don't try to bring the system
Gavin Shan29f8bf12012-02-27 20:04:02 +0000703 * up if the reset failed for some reason.
Gavin Shand0914f52014-04-24 18:00:12 +1000704 *
705 * During the reset, it's very dangerous to have uncontrolled PCI
706 * config accesses. So we prefer to block them. However, controlled
707 * PCI config accesses initiated from EEH itself are allowed.
Gavin Shan29f8bf12012-02-27 20:04:02 +0000708 */
Russell Currey6654c932016-11-17 16:07:47 +1100709 rc = eeh_pe_reset_full(pe);
Gavin Shan28bf36f2014-11-14 10:47:29 +1100710 if (rc)
Linas Vepstasb6495c02005-11-03 18:54:54 -0600711 return rc;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600712
Rafael J. Wysocki1c2042c2014-01-15 14:33:20 +0100713 pci_lock_rescan_remove();
714
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000715 /* Restore PE */
716 eeh_ops->configure_bridge(pe);
717 eeh_pe_restore_bars(pe);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600718
Andrew Donnellandc9c41b2015-12-08 16:59:25 +1100719 /* Clear frozen state */
720 rc = eeh_clear_pe_frozen_state(pe, false);
Andrew Donnellan409bf7f2016-12-01 11:23:05 +1100721 if (rc) {
722 pci_unlock_rescan_remove();
Andrew Donnellandc9c41b2015-12-08 16:59:25 +1100723 return rc;
Andrew Donnellan409bf7f2016-12-01 11:23:05 +1100724 }
Gavin Shan78954702014-04-24 18:00:14 +1000725
Linas Vepstas77bd7412005-11-03 18:52:49 -0600726 /* Give the system 5 seconds to finish running the user-space
Gavin Shana84f2732013-06-20 13:20:51 +0800727 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
728 * this is a hack, but if we don't do this, and try to bring
729 * the device up before the scripts have taken it down,
Linas Vepstas77bd7412005-11-03 18:52:49 -0600730 * potentially weird things happen.
731 */
Sam Bobroff54048cf2018-03-21 13:06:40 +1100732 if (!driver_eeh_aware || rmv_data->removed) {
733 pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
734 (driver_eeh_aware ? "partial" : "complete"));
Gavin Shan29f8bf12012-02-27 20:04:02 +0000735 ssleep(5);
Gavin Shanf5c57712013-07-24 10:24:58 +0800736
737 /*
738 * The EEH device is still connected with its parent
739 * PE. We should disconnect it so the binding can be
740 * rebuilt when adding PCI devices.
741 */
Sam Bobroff80e65b02018-09-12 11:23:26 +1000742 edev = list_first_entry(&pe->edevs, struct eeh_dev, entry);
Gavin Shanf5c57712013-07-24 10:24:58 +0800743 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
Gavin Shana3aa2562016-06-17 13:05:11 +1000744 if (pe->type & EEH_PE_VF) {
Sam Bobroffbf773df2018-09-12 11:23:25 +1000745 eeh_add_virt_device(edev);
Gavin Shana3aa2562016-06-17 13:05:11 +1000746 } else {
Sam Bobroff54048cf2018-03-21 13:06:40 +1100747 if (!driver_eeh_aware)
748 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
Gavin Shanbd251b82016-05-03 15:41:37 +1000749 pci_hp_add_devices(bus);
Gavin Shana3aa2562016-06-17 13:05:11 +1000750 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600751 }
Gavin Shanf5c57712013-07-24 10:24:58 +0800752 eeh_pe_state_clear(pe, EEH_PE_KEEP);
Gavin Shan5a719782013-06-20 13:21:01 +0800753
754 pe->tstamp = tstamp;
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000755 pe->freeze_count = cnt;
Linas Vepstasb6495c02005-11-03 18:54:54 -0600756
Rafael J. Wysocki1c2042c2014-01-15 14:33:20 +0100757 pci_unlock_rescan_remove();
Linas Vepstasb6495c02005-11-03 18:54:54 -0600758 return 0;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600759}
760
/*
 * The longest amount of time to wait for a PCI device
 * to come back online, in seconds.
 */
#define MAX_WAIT_FOR_RECOVERY 300
Linas Vepstas77bd7412005-11-03 18:52:49 -0600765
/**
 * eeh_handle_normal_event - Handle EEH events on a specific PE
 * @pe: EEH PE - which should not be used after we return, as it may
 * have been invalidated.
 *
 * Attempts to recover the given PE.  If recovery fails or the PE has failed
 * too many times, remove the PE.
 *
 * When the PHB detects address or data parity errors on a particular PCI
 * slot, the associated PE will be frozen.  Besides, DMAs occurring
 * to wild addresses (which usually happen due to bugs in device
 * drivers or in PCI adapter firmware) can cause an EEH error.  #SERR,
 * #PERR or other misc PCI-related errors can also trigger EEH errors.
 *
 * The recovery process consists of unplugging the device driver (which
 * generates hotplug events to userspace), then issuing a PCI #RST to
 * the device, then reconfiguring the PCI config space for all bridges
 * & devices under this slot, and then finally restarting the device
 * drivers (which causes a second set of hotplug events to go out to
 * userspace).
 *
 * The function returns nothing; the outcome is reported through the
 * kernel log and through the driver callbacks invoked via eeh_pe_report().
 */
void eeh_handle_normal_event(struct eeh_pe *pe)
{
	struct pci_bus *bus;
	struct eeh_dev *edev, *tmp;
	struct eeh_pe *tmp_pe;
	int rc = 0;
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
	/* Collects the devices that the reset below hands back for re-adding */
	struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};

	/* Without a bus nothing can be recovered; bail out before touching state */
	bus = eeh_pe_bus_get(pe);
	if (!bus) {
		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
		       __func__, pe->phb->global_number, pe->addr);
		return;
	}

	/*
	 * Flag the PE as being recovered.  The flag is cleared again at the
	 * "final" label; the non-VF hard-failure path returns without
	 * clearing it.
	 */
	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);

	/* Account for this freeze and give up once the limit is exceeded */
	eeh_pe_update_time_stamp(pe);
	pe->freeze_count++;
	if (pe->freeze_count > eeh_max_freezes) {
		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
		       pe->phb->global_number, pe->addr,
		       pe->freeze_count);
		goto hard_fail;
	}
	pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
		pe->freeze_count, eeh_max_freezes);

	/* Walk the various device drivers attached to this slot through
	 * a reset sequence, giving each an opportunity to do what it needs
	 * to accomplish the reset.  Each child gets a report of the
	 * status ... if any child can't handle the reset, then the entire
	 * slot is dlpar removed and added.
	 *
	 * When the PHB is fenced, we have to issue a reset to recover from
	 * the error. Override the result if necessary to have partial
	 * hotplug for this case.
	 */
	pr_info("EEH: Notify device drivers to shutdown\n");
	eeh_set_channel_state(pe, pci_channel_io_frozen);
	eeh_set_irq_state(pe, false);
	eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error,
		      &result);
	/* For a PHB PE, anything other than NONE/NEED_RESET is forced to NEED_RESET */
	if ((pe->type & EEH_PE_PHB) &&
	    result != PCI_ERS_RESULT_NONE &&
	    result != PCI_ERS_RESULT_NEED_RESET)
		result = PCI_ERS_RESULT_NEED_RESET;

	/* Get the current PCI slot state. This can take a long time,
	 * sometimes over 300 seconds for certain systems.
	 *
	 * NOTE(review): MAX_WAIT_FOR_RECOVERY is in seconds; the factor of
	 * 1000 presumably converts to milliseconds - confirm against the
	 * wait_state() implementations.
	 */
	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
		pr_warn("EEH: Permanent failure\n");
		goto hard_fail;
	}

	/* Since rtas may enable MMIO when posting the error log,
	 * don't post the error log until after all dev drivers
	 * have been informed.
	 */
	pr_info("EEH: Collect temporary log\n");
	eeh_slot_error_detail(pe, EEH_LOG_TEMP);

	/* If all device drivers were EEH-unaware, then shut
	 * down all of the device drivers, and hope they
	 * go down willingly, without panicking the system.
	 */
	if (result == PCI_ERS_RESULT_NONE) {
		pr_info("EEH: Reset with hotplug activity\n");
		rc = eeh_reset_device(pe, bus, NULL, false);
		if (rc) {
			pr_warn("%s: Unable to reset, err=%d\n",
				__func__, rc);
			goto hard_fail;
		}
	}

	/* If all devices reported they can proceed, then re-enable MMIO */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enable I/O for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);

		/* Negative: give up; other non-zero: fall back to a reset */
		if (rc < 0)
			goto hard_fail;
		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			pr_info("EEH: Notify device drivers to resume I/O\n");
			eeh_pe_report("mmio_enabled", pe,
				      eeh_report_mmio_enabled, &result);
		}
	}

	/* If all devices reported they can proceed, then re-enable DMA */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enabled DMA for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);

		/* Negative: give up; other non-zero: fall back to a reset */
		if (rc < 0)
			goto hard_fail;
		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			/*
			 * We didn't do PE reset for the case. The PE
			 * is still in frozen state. Clear it before
			 * resuming the PE.
			 */
			eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
			result = PCI_ERS_RESULT_RECOVERED;
		}
	}

	/* If any device has a hard failure, then shut off everything. */
	if (result == PCI_ERS_RESULT_DISCONNECT) {
		pr_warn("EEH: Device driver gave up\n");
		goto hard_fail;
	}

	/* If any device called out for a reset, then reset the slot */
	if (result == PCI_ERS_RESULT_NEED_RESET) {
		pr_info("EEH: Reset without hotplug activity\n");
		rc = eeh_reset_device(pe, bus, &rmv_data, true);
		if (rc) {
			pr_warn("%s: Cannot reset, err=%d\n",
				__func__, rc);
			goto hard_fail;
		}

		pr_info("EEH: Notify device drivers "
			"the completion of reset\n");
		/* Start from a clean result before collecting slot_reset reports */
		result = PCI_ERS_RESULT_NONE;
		eeh_set_channel_state(pe, pci_channel_io_normal);
		eeh_set_irq_state(pe, true);
		eeh_pe_report("slot_reset", pe, eeh_report_reset, &result);
	}

	/* All devices should claim they have recovered by now. */
	if ((result != PCI_ERS_RESULT_RECOVERED) &&
	    (result != PCI_ERS_RESULT_NONE)) {
		pr_warn("EEH: Not recovered\n");
		goto hard_fail;
	}

	/*
	 * For those hot removed VFs, we should add them back after the PF
	 * has recovered properly.  Each entry is unlinked from the local
	 * list once it has been re-added.
	 */
	list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_entry) {
		eeh_add_virt_device(edev);
		list_del(&edev->rmv_entry);
	}

	/* Tell all device drivers that they can resume operations */
	pr_info("EEH: Notify device driver to resume\n");
	eeh_set_channel_state(pe, pci_channel_io_normal);
	eeh_set_irq_state(pe, true);
	eeh_pe_report("resume", pe, eeh_report_resume, NULL);
	/* Reset the per-device error bookkeeping for every PE visited from pe */
	eeh_for_each_pe(pe, tmp_pe) {
		eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
			edev->mode &= ~EEH_DEV_NO_HANDLER;
			edev->in_error = false;
		}
	}

	pr_info("EEH: Recovery successful.\n");
	goto final;

hard_fail:
	/*
	 * About 90% of all real-life EEH failures in the field
	 * are due to poorly seated PCI cards. Only 10% or so are
	 * due to actual, failed cards.
	 */
	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
	       "Please try reseating or replacing it\n",
		pe->phb->global_number, pe->addr);

	eeh_slot_error_detail(pe, EEH_LOG_PERM);

	/* Notify all devices that they're about to go down. */
	eeh_set_channel_state(pe, pci_channel_io_perm_failure);
	eeh_set_irq_state(pe, false);
	eeh_pe_report("error_detected(permanent failure)", pe,
		      eeh_report_failure, NULL);

	/* Mark the PE to be removed permanently */
	eeh_pe_state_mark(pe, EEH_PE_REMOVED);

	/*
	 * Shut down the device drivers for good. We mark
	 * all removed devices correctly to avoid accessing
	 * their PCI config any more.
	 */
	if (pe->type & EEH_PE_VF) {
		/* VF PE: remove its devices, then fall through to "final" */
		eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
	} else {
		eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);

		pci_lock_rescan_remove();
		pci_hp_remove_devices(bus);
		pci_unlock_rescan_remove();
		/* The passed PE should no longer be used */
		return;
	}
final:
	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
}
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800999
Russell Curreyc0b64972017-04-19 17:39:27 +10001000/**
1001 * eeh_handle_special_event - Handle EEH events without a specific failing PE
1002 *
1003 * Called when an EEH event is detected but can't be narrowed down to a
1004 * specific PE. Iterates through possible failures and handles them as
1005 * necessary.
1006 */
Sam Bobroff68701782018-03-19 13:46:20 +11001007void eeh_handle_special_event(void)
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001008{
1009 struct eeh_pe *pe, *phb_pe;
1010 struct pci_bus *bus;
Gavin Shan7e4e7862014-01-15 13:16:11 +08001011 struct pci_controller *hose;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001012 unsigned long flags;
Gavin Shan7e4e7862014-01-15 13:16:11 +08001013 int rc;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001014
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001015
Gavin Shan7e4e7862014-01-15 13:16:11 +08001016 do {
1017 rc = eeh_ops->next_error(&pe);
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001018
Gavin Shan7e4e7862014-01-15 13:16:11 +08001019 switch (rc) {
1020 case EEH_NEXT_ERR_DEAD_IOC:
1021 /* Mark all PHBs in dead state */
1022 eeh_serialize_lock(&flags);
1023
1024 /* Purge all events */
Gavin Shan5c7a35e2014-06-04 17:31:52 +10001025 eeh_remove_event(NULL, true);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001026
1027 list_for_each_entry(hose, &hose_list, list_node) {
1028 phb_pe = eeh_phb_pe_get(hose);
1029 if (!phb_pe) continue;
1030
Gavin Shan9e049372014-04-24 18:00:07 +10001031 eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001032 }
1033
1034 eeh_serialize_unlock(flags);
1035
1036 break;
1037 case EEH_NEXT_ERR_FROZEN_PE:
1038 case EEH_NEXT_ERR_FENCED_PHB:
1039 case EEH_NEXT_ERR_DEAD_PHB:
1040 /* Mark the PE in fenced state */
1041 eeh_serialize_lock(&flags);
1042
1043 /* Purge all events of the PHB */
Gavin Shan5c7a35e2014-06-04 17:31:52 +10001044 eeh_remove_event(pe, true);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001045
1046 if (rc == EEH_NEXT_ERR_DEAD_PHB)
Gavin Shan9e049372014-04-24 18:00:07 +10001047 eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001048 else
1049 eeh_pe_state_mark(pe,
1050 EEH_PE_ISOLATED | EEH_PE_RECOVERING);
1051
1052 eeh_serialize_unlock(flags);
1053
1054 break;
1055 case EEH_NEXT_ERR_NONE:
1056 return;
1057 default:
1058 pr_warn("%s: Invalid value %d from next_error()\n",
1059 __func__, rc);
1060 return;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001061 }
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001062
Gavin Shan7e4e7862014-01-15 13:16:11 +08001063 /*
1064 * For fenced PHB and frozen PE, it's handled as normal
1065 * event. We have to remove the affected PHBs for dead
1066 * PHB and IOC
1067 */
1068 if (rc == EEH_NEXT_ERR_FROZEN_PE ||
1069 rc == EEH_NEXT_ERR_FENCED_PHB) {
Sam Bobroff37fd8122018-03-19 13:46:30 +11001070 eeh_handle_normal_event(pe);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001071 } else {
Linus Torvalds1b173662014-01-27 21:11:26 -08001072 pci_lock_rescan_remove();
Gavin Shan7e4e7862014-01-15 13:16:11 +08001073 list_for_each_entry(hose, &hose_list, list_node) {
1074 phb_pe = eeh_phb_pe_get(hose);
1075 if (!phb_pe ||
Gavin Shan9e049372014-04-24 18:00:07 +10001076 !(phb_pe->state & EEH_PE_ISOLATED) ||
1077 (phb_pe->state & EEH_PE_RECOVERING))
Gavin Shan7e4e7862014-01-15 13:16:11 +08001078 continue;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001079
Gavin Shan7e4e7862014-01-15 13:16:11 +08001080 /* Notify all devices to be down */
Gavin Shan05ba75f2016-02-09 15:50:21 +11001081 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
Sam Bobroff47cc8c12018-05-25 13:11:37 +10001082 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
Sam Bobroff20b34492018-05-25 13:11:40 +10001083 eeh_pe_report(
1084 "error_detected(permanent failure)", pe,
Russell Curreyaf2e3a02016-09-12 14:17:24 +10001085 eeh_report_failure, NULL);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001086 bus = eeh_pe_bus_get(phb_pe);
Russell Currey04fec21c2016-09-12 14:17:22 +10001087 if (!bus) {
1088 pr_err("%s: Cannot find PCI bus for "
Russell Currey1f52f172016-11-16 14:02:15 +11001089 "PHB#%x-PE#%x\n",
Russell Currey04fec21c2016-09-12 14:17:22 +10001090 __func__,
1091 pe->phb->global_number,
1092 pe->addr);
1093 break;
1094 }
Gavin Shanbd251b82016-05-03 15:41:37 +10001095 pci_hp_remove_devices(bus);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001096 }
Linus Torvalds1b173662014-01-27 21:11:26 -08001097 pci_unlock_rescan_remove();
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001098 }
Gavin Shan7e4e7862014-01-15 13:16:11 +08001099
1100 /*
1101 * If we have detected dead IOC, we needn't proceed
1102 * any more since all PHBs would have been removed
1103 */
1104 if (rc == EEH_NEXT_ERR_DEAD_IOC)
1105 break;
1106 } while (rc != EEH_NEXT_ERR_NONE);
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001107}