blob: 89623962c7275235db369f3bf28581e13ec019cb [file] [log] [blame]
Linas Vepstas77bd7412005-11-03 18:52:49 -06001/*
2 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
Linas Vepstas3c8c90a2007-05-24 03:28:01 +10003 * Copyright IBM Corp. 2004 2005
4 * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
Linas Vepstas77bd7412005-11-03 18:52:49 -06005 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or (at
11 * your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
Linas Vepstas3c8c90a2007-05-24 03:28:01 +100023 * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
Linas Vepstas77bd7412005-11-03 18:52:49 -060024 */
25#include <linux/delay.h>
Linas Vepstas77bd7412005-11-03 18:52:49 -060026#include <linux/interrupt.h>
Linas Vepstasac325ac2006-04-18 21:05:21 -070027#include <linux/irq.h>
Gavin Shanfeadf7c2012-09-17 04:34:27 +000028#include <linux/module.h>
Linas Vepstas77bd7412005-11-03 18:52:49 -060029#include <linux/pci.h>
30#include <asm/eeh.h>
31#include <asm/eeh_event.h>
32#include <asm/ppc-pci.h>
33#include <asm/pci-bridge.h>
34#include <asm/prom.h>
35#include <asm/rtas.h>
36
/* Bookkeeping handed to eeh_rmv_device() through its userdata pointer. */
struct eeh_rmv_data {
	/* List onto which eeh_rmv_device() links the rmv_entry of removed VFs */
	struct list_head removed_vf_list;
	/* Incremented by eeh_rmv_device() for every device it removes */
	int removed_dev_count;
};
41
Sam Bobroff30424e32018-05-25 13:11:34 +100042static int eeh_result_priority(enum pci_ers_result result)
43{
44 switch (result) {
45 case PCI_ERS_RESULT_NONE:
46 return 1;
47 case PCI_ERS_RESULT_NO_AER_DRIVER:
48 return 2;
49 case PCI_ERS_RESULT_RECOVERED:
50 return 3;
51 case PCI_ERS_RESULT_CAN_RECOVER:
52 return 4;
53 case PCI_ERS_RESULT_DISCONNECT:
54 return 5;
55 case PCI_ERS_RESULT_NEED_RESET:
56 return 6;
57 default:
58 WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result);
59 return 0;
60 }
61};
62
Breno Leitaoc36c5ff2018-10-22 11:54:13 -030063static const char *pci_ers_result_name(enum pci_ers_result result)
Sam Bobroff20b34492018-05-25 13:11:40 +100064{
65 switch (result) {
66 case PCI_ERS_RESULT_NONE:
67 return "none";
68 case PCI_ERS_RESULT_CAN_RECOVER:
69 return "can recover";
70 case PCI_ERS_RESULT_NEED_RESET:
71 return "need reset";
72 case PCI_ERS_RESULT_DISCONNECT:
73 return "disconnect";
74 case PCI_ERS_RESULT_RECOVERED:
75 return "recovered";
76 case PCI_ERS_RESULT_NO_AER_DRIVER:
77 return "no AER driver";
78 default:
79 WARN_ONCE(1, "Unknown result type: %d\n", (int)result);
80 return "unknown";
81 }
82};
83
84static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
85 const char *fmt, ...)
86{
87 struct va_format vaf;
88 va_list args;
89
90 va_start(args, fmt);
91
92 vaf.fmt = fmt;
93 vaf.va = &args;
94
95 printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
96 edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
97
98 va_end(args);
99}
100
Sam Bobroff30424e32018-05-25 13:11:34 +1000101static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
102 enum pci_ers_result new)
103{
104 if (eeh_result_priority(new) > eeh_result_priority(old))
105 return new;
106 return old;
107}
108
Sam Bobroffe2b810d52018-05-25 13:11:36 +1000109static bool eeh_dev_removed(struct eeh_dev *edev)
110{
111 return !edev || (edev->mode & EEH_DEV_REMOVED);
112}
113
114static bool eeh_edev_actionable(struct eeh_dev *edev)
115{
116 return (edev->pdev && !eeh_dev_removed(edev) &&
117 !eeh_pe_passed(edev->pe));
118}
119
Gavin Shan29f8bf12012-02-27 20:04:02 +0000120/**
Gavin Shanfeadf7c2012-09-17 04:34:27 +0000121 * eeh_pcid_get - Get the PCI device driver
122 * @pdev: PCI device
123 *
124 * The function is used to retrieve the PCI device driver for
125 * the indicated PCI device. Besides, we will increase the reference
126 * of the PCI device driver to prevent that being unloaded on
127 * the fly. Otherwise, kernel crash would be seen.
128 */
129static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
130{
131 if (!pdev || !pdev->driver)
132 return NULL;
133
134 if (!try_module_get(pdev->driver->driver.owner))
135 return NULL;
136
137 return pdev->driver;
138}
139
140/**
141 * eeh_pcid_put - Dereference on the PCI device driver
142 * @pdev: PCI device
143 *
144 * The function is called to do dereference on the PCI device
145 * driver of the indicated PCI device.
146 */
147static inline void eeh_pcid_put(struct pci_dev *pdev)
148{
149 if (!pdev || !pdev->driver)
150 return;
151
152 module_put(pdev->driver->driver.owner);
153}
154
Mike Mason8535ef02009-02-10 11:12:21 +0000155/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000156 * eeh_disable_irq - Disable interrupt for the recovering device
157 * @dev: PCI device
158 *
159 * This routine must be called when reporting temporary or permanent
160 * error to the particular PCI device to disable interrupt of that
161 * device. If the device has enabled MSI or MSI-X interrupt, we needn't
162 * do real work because EEH should freeze DMA transfers for those PCI
163 * devices encountering EEH errors, which includes MSI or MSI-X.
Mike Mason8535ef02009-02-10 11:12:21 +0000164 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000165static void eeh_disable_irq(struct eeh_dev *edev)
Mike Mason8535ef02009-02-10 11:12:21 +0000166{
Mike Mason8535ef02009-02-10 11:12:21 +0000167 /* Don't disable MSI and MSI-X interrupts. They are
168 * effectively disabled by the DMA Stopped state
169 * when an EEH error occurs.
Gavin Shan29f8bf12012-02-27 20:04:02 +0000170 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000171 if (edev->pdev->msi_enabled || edev->pdev->msix_enabled)
Mike Mason8535ef02009-02-10 11:12:21 +0000172 return;
173
Sam Bobroff010acfa2018-05-25 13:11:38 +1000174 if (!irq_has_action(edev->pdev->irq))
Mike Mason8535ef02009-02-10 11:12:21 +0000175 return;
176
Gavin Shandbbceee2012-09-07 22:44:20 +0000177 edev->mode |= EEH_DEV_IRQ_DISABLED;
Sam Bobroff010acfa2018-05-25 13:11:38 +1000178 disable_irq_nosync(edev->pdev->irq);
Mike Mason8535ef02009-02-10 11:12:21 +0000179}
180
181/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000182 * eeh_enable_irq - Enable interrupt for the recovering device
183 * @dev: PCI device
184 *
185 * This routine must be called to enable interrupt while failed
186 * device could be resumed.
Mike Mason8535ef02009-02-10 11:12:21 +0000187 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000188static void eeh_enable_irq(struct eeh_dev *edev)
Mike Mason8535ef02009-02-10 11:12:21 +0000189{
Gavin Shandbbceee2012-09-07 22:44:20 +0000190 if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
191 edev->mode &= ~EEH_DEV_IRQ_DISABLED;
Thomas Gleixnerb8a9a112014-02-23 21:40:09 +0000192 /*
193 * FIXME !!!!!
194 *
195 * This is just ass backwards. This maze has
196 * unbalanced irq_enable/disable calls. So instead of
197 * finding the root cause it works around the warning
198 * in the irq_enable code by conditionally calling
199 * into it.
200 *
201 * That's just wrong.The warning in the core code is
Michael Ellerman027dfac2016-06-01 16:34:37 +1000202 * there to tell people to fix their asymmetries in
Thomas Gleixnerb8a9a112014-02-23 21:40:09 +0000203 * their own code, not by abusing the core information
204 * to avoid it.
205 *
206 * I so wish that the assymetry would be the other way
207 * round and a few more irq_disable calls render that
208 * shit unusable forever.
209 *
210 * tglx
211 */
Sam Bobroff010acfa2018-05-25 13:11:38 +1000212 if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq)))
213 enable_irq(edev->pdev->irq);
Thomas Gleixner57310c32014-03-05 00:06:11 +0100214 }
Mike Mason8535ef02009-02-10 11:12:21 +0000215}
216
Sam Bobroffd6c49322018-05-25 13:11:32 +1000217static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000218{
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000219 struct pci_dev *pdev;
220
221 if (!edev)
222 return NULL;
223
Gavin Shan5a0cdbf2016-04-27 11:14:51 +1000224 /*
225 * We cannot access the config space on some adapters.
226 * Otherwise, it will cause fenced PHB. We don't save
227 * the content in their config space and will restore
228 * from the initial config space saved when the EEH
229 * device is created.
230 */
231 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
232 return NULL;
233
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000234 pdev = eeh_dev_to_pci_dev(edev);
235 if (!pdev)
236 return NULL;
237
238 pci_save_state(pdev);
239 return NULL;
240}
241
Sam Bobroff47cc8c12018-05-25 13:11:37 +1000242static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s)
243{
244 struct eeh_pe *pe;
245 struct eeh_dev *edev, *tmp;
246
247 eeh_for_each_pe(root, pe)
248 eeh_pe_for_each_dev(pe, edev, tmp)
249 if (eeh_edev_actionable(edev))
250 edev->pdev->error_state = s;
251}
252
Sam Bobroff010acfa2018-05-25 13:11:38 +1000253static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
254{
255 struct eeh_pe *pe;
256 struct eeh_dev *edev, *tmp;
257
258 eeh_for_each_pe(root, pe) {
259 eeh_pe_for_each_dev(pe, edev, tmp) {
260 if (!eeh_edev_actionable(edev))
261 continue;
262
263 if (!eeh_pcid_get(edev->pdev))
264 continue;
265
266 if (enable)
267 eeh_enable_irq(edev);
268 else
269 eeh_disable_irq(edev);
270
271 eeh_pcid_put(edev->pdev);
272 }
273 }
274}
275
Sam Bobroff20b34492018-05-25 13:11:40 +1000276typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
277 struct pci_driver *);
278static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
279 enum pci_ers_result *result)
280{
281 struct pci_driver *driver;
282 enum pci_ers_result new_result;
283
Sam Bobroffbcbe3732018-09-12 11:23:21 +1000284 if (!edev->pdev) {
285 eeh_edev_info(edev, "no device");
286 return;
287 }
Sam Bobroff20b34492018-05-25 13:11:40 +1000288 device_lock(&edev->pdev->dev);
289 if (eeh_edev_actionable(edev)) {
290 driver = eeh_pcid_get(edev->pdev);
291
292 if (!driver)
293 eeh_edev_info(edev, "no driver");
294 else if (!driver->err_handler)
295 eeh_edev_info(edev, "driver not EEH aware");
296 else if (edev->mode & EEH_DEV_NO_HANDLER)
297 eeh_edev_info(edev, "driver bound too late");
298 else {
299 new_result = fn(edev, driver);
300 eeh_edev_info(edev, "%s driver reports: '%s'",
301 driver->name,
302 pci_ers_result_name(new_result));
303 if (result)
304 *result = pci_ers_merge_result(*result,
305 new_result);
306 }
307 if (driver)
308 eeh_pcid_put(edev->pdev);
309 } else {
310 eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
311 !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
312 }
313 device_unlock(&edev->pdev->dev);
314}
315
316static void eeh_pe_report(const char *name, struct eeh_pe *root,
317 eeh_report_fn fn, enum pci_ers_result *result)
318{
319 struct eeh_pe *pe;
320 struct eeh_dev *edev, *tmp;
321
322 pr_info("EEH: Beginning: '%s'\n", name);
323 eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
324 eeh_pe_report_edev(edev, fn, result);
325 if (result)
326 pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
327 name, pci_ers_result_name(*result));
328 else
329 pr_info("EEH: Finished:'%s'", name);
330}
331
Linas Vepstascb5b56242006-09-15 18:56:35 -0500332/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000333 * eeh_report_error - Report pci error to each device driver
Sam Bobroff20b34492018-05-25 13:11:40 +1000334 * @edev: eeh device
335 * @driver: device's PCI driver
Gavin Shana84f2732013-06-20 13:20:51 +0800336 *
Sam Bobroff20b34492018-05-25 13:11:40 +1000337 * Report an EEH error to each device driver.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600338 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000339static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
340 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600341{
Sam Bobroff20b34492018-05-25 13:11:40 +1000342 enum pci_ers_result rc;
343 struct pci_dev *dev = edev->pdev;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600344
Sam Bobroff20b34492018-05-25 13:11:40 +1000345 if (!driver->err_handler->error_detected)
346 return PCI_ERS_RESULT_NONE;
Michael Neulingf0295e02018-03-26 15:17:07 +1100347
Sam Bobroff20b34492018-05-25 13:11:40 +1000348 eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
349 driver->name);
Gavin Shan29f8bf12012-02-27 20:04:02 +0000350 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
Linas Vepstas2a50f142007-11-03 07:27:50 +1100351
Wei Yang67086e32016-03-04 10:53:11 +1100352 edev->in_error = true;
Bryant G. Ly856e1eb2018-01-05 10:45:47 -0600353 pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
Sam Bobroff20b34492018-05-25 13:11:40 +1000354 return rc;
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500355}
356
357/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000358 * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
Sam Bobroff20b34492018-05-25 13:11:40 +1000359 * @edev: eeh device
360 * @driver: device's PCI driver
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500361 *
Linas Vepstas638799b2007-11-03 07:25:55 +1100362 * Tells each device driver that IO ports, MMIO and config space I/O
Sam Bobroff20b34492018-05-25 13:11:40 +1000363 * are now enabled.
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500364 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000365static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
366 struct pci_driver *driver)
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500367{
Sam Bobroff20b34492018-05-25 13:11:40 +1000368 if (!driver->err_handler->mmio_enabled)
369 return PCI_ERS_RESULT_NONE;
370 eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
371 return driver->err_handler->mmio_enabled(edev->pdev);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600372}
373
Linas Vepstascb5b56242006-09-15 18:56:35 -0500374/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000375 * eeh_report_reset - Tell device that slot has been reset
Sam Bobroff20b34492018-05-25 13:11:40 +1000376 * @edev: eeh device
377 * @driver: device's PCI driver
Gavin Shan29f8bf12012-02-27 20:04:02 +0000378 *
379 * This routine must be called while EEH tries to reset particular
380 * PCI device so that the associated PCI device driver could take
381 * some actions, usually to save data the driver needs so that the
382 * driver can work again while the device is recovered.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600383 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000384static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
385 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600386{
Sam Bobroff20b34492018-05-25 13:11:40 +1000387 if (!driver->err_handler->slot_reset || !edev->in_error)
388 return PCI_ERS_RESULT_NONE;
389 eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
390 return driver->err_handler->slot_reset(edev->pdev);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600391}
392
Sam Bobroffd6c49322018-05-25 13:11:32 +1000393static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000394{
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000395 struct pci_dev *pdev;
396
397 if (!edev)
398 return NULL;
399
Gavin Shan5a0cdbf2016-04-27 11:14:51 +1000400 /*
401 * The content in the config space isn't saved because
402 * the blocked config space on some adapters. We have
403 * to restore the initial saved config space when the
404 * EEH device is created.
405 */
406 if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
Sam Bobroff80e65b02018-09-12 11:23:26 +1000407 if (list_is_last(&edev->entry, &edev->pe->edevs))
Gavin Shan5a0cdbf2016-04-27 11:14:51 +1000408 eeh_pe_restore_bars(edev->pe);
409
410 return NULL;
411 }
412
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000413 pdev = eeh_dev_to_pci_dev(edev);
414 if (!pdev)
415 return NULL;
416
417 pci_restore_state(pdev);
418 return NULL;
419}
420
Linas Vepstascb5b56242006-09-15 18:56:35 -0500421/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000422 * eeh_report_resume - Tell device to resume normal operations
Sam Bobroff20b34492018-05-25 13:11:40 +1000423 * @edev: eeh device
424 * @driver: device's PCI driver
Gavin Shan29f8bf12012-02-27 20:04:02 +0000425 *
426 * This routine must be called to notify the device driver that it
427 * could resume so that the device driver can do some initialization
428 * to make the recovered device work again.
Linas Vepstascb5b56242006-09-15 18:56:35 -0500429 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000430static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
431 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600432{
Sam Bobroff20b34492018-05-25 13:11:40 +1000433 if (!driver->err_handler->resume || !edev->in_error)
434 return PCI_ERS_RESULT_NONE;
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000435
Sam Bobroff20b34492018-05-25 13:11:40 +1000436 eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
437 driver->err_handler->resume(edev->pdev);
Michael Neulingf0295e02018-03-26 15:17:07 +1100438
Sam Bobroff20b34492018-05-25 13:11:40 +1000439 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
Bryant G. Ly856e1eb2018-01-05 10:45:47 -0600440#ifdef CONFIG_PCI_IOV
Juan J. Alvarez521ca5a2018-02-15 12:49:51 -0600441 if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
442 eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
Bryant G. Ly856e1eb2018-01-05 10:45:47 -0600443#endif
Sam Bobroff20b34492018-05-25 13:11:40 +1000444 return PCI_ERS_RESULT_NONE;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600445}
446
Linas Vepstascb5b56242006-09-15 18:56:35 -0500447/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000448 * eeh_report_failure - Tell device driver that device is dead.
Sam Bobroff20b34492018-05-25 13:11:40 +1000449 * @edev: eeh device
450 * @driver: device's PCI driver
Linas Vepstascb5b56242006-09-15 18:56:35 -0500451 *
452 * This informs the device driver that the device is permanently
453 * dead, and that no further recovery attempts will be made on it.
454 */
Sam Bobroff20b34492018-05-25 13:11:40 +1000455static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
456 struct pci_driver *driver)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600457{
Sam Bobroff20b34492018-05-25 13:11:40 +1000458 enum pci_ers_result rc;
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000459
Sam Bobroff20b34492018-05-25 13:11:40 +1000460 if (!driver->err_handler->error_detected)
461 return PCI_ERS_RESULT_NONE;
Michael Neulingf0295e02018-03-26 15:17:07 +1100462
Sam Bobroff20b34492018-05-25 13:11:40 +1000463 eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
464 driver->name);
465 rc = driver->err_handler->error_detected(edev->pdev,
466 pci_channel_io_perm_failure);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600467
Sam Bobroff20b34492018-05-25 13:11:40 +1000468 pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT);
469 return rc;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600470}
471
Sam Bobroffbf773df2018-09-12 11:23:25 +1000472static void *eeh_add_virt_device(struct eeh_dev *edev)
Wei Yang67086e32016-03-04 10:53:11 +1100473{
474 struct pci_driver *driver;
Wei Yang67086e32016-03-04 10:53:11 +1100475 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
476 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
477
478 if (!(edev->physfn)) {
479 pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
Alexey Kardashevskiy69672bd2017-08-29 17:34:01 +1000480 __func__, pdn->phb->global_number, pdn->busno,
Wei Yang67086e32016-03-04 10:53:11 +1100481 PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
482 return NULL;
483 }
484
485 driver = eeh_pcid_get(dev);
486 if (driver) {
Sam Bobroff46d4be42018-05-25 13:11:30 +1000487 if (driver->err_handler) {
488 eeh_pcid_put(dev);
Wei Yang67086e32016-03-04 10:53:11 +1100489 return NULL;
Sam Bobroff46d4be42018-05-25 13:11:30 +1000490 }
491 eeh_pcid_put(dev);
Wei Yang67086e32016-03-04 10:53:11 +1100492 }
493
Bryant G. Ly988fc3b2017-11-09 08:00:33 -0600494#ifdef CONFIG_PCI_IOV
Jan H. Schönherr753f6122017-09-26 12:53:23 -0500495 pci_iov_add_virtfn(edev->physfn, pdn->vf_index);
Wei Yang67086e32016-03-04 10:53:11 +1100496#endif
497 return NULL;
498}
499
Sam Bobroffd6c49322018-05-25 13:11:32 +1000500static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata)
Gavin Shanf5c57712013-07-24 10:24:58 +0800501{
502 struct pci_driver *driver;
Gavin Shanf5c57712013-07-24 10:24:58 +0800503 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
Wei Yang67086e32016-03-04 10:53:11 +1100504 struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
Gavin Shanf5c57712013-07-24 10:24:58 +0800505
506 /*
507 * Actually, we should remove the PCI bridges as well.
508 * However, that's lots of complexity to do that,
509 * particularly some of devices under the bridge might
510 * support EEH. So we just care about PCI devices for
511 * simplicity here.
512 */
Sam Bobroff1ef52072018-11-29 14:16:41 +1100513 if (!eeh_edev_actionable(edev) ||
514 (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
Gavin Shand2b0f6f2014-04-24 18:00:19 +1000515 return NULL;
516
Sam Bobroff1c5c5332018-09-12 11:23:27 +1000517 if (rmv_data) {
Sam Bobroff46d4be42018-05-25 13:11:30 +1000518 driver = eeh_pcid_get(dev);
519 if (driver) {
520 if (driver->err_handler &&
521 driver->err_handler->error_detected &&
522 driver->err_handler->slot_reset) {
523 eeh_pcid_put(dev);
524 return NULL;
525 }
526 eeh_pcid_put(dev);
527 }
Thadeu Lima de Souza Cascardo8cc6b6c2014-02-05 16:20:45 -0200528 }
Gavin Shanf5c57712013-07-24 10:24:58 +0800529
530 /* Remove it from PCI subsystem */
Sam Bobroff1ef52072018-11-29 14:16:41 +1100531 pr_info("EEH: Removing %s without EEH sensitive driver\n",
532 pci_name(dev));
Gavin Shanf5c57712013-07-24 10:24:58 +0800533 edev->mode |= EEH_DEV_DISCONNECTED;
Sam Bobroff1c5c5332018-09-12 11:23:27 +1000534 if (rmv_data)
535 rmv_data->removed_dev_count++;
Gavin Shanf5c57712013-07-24 10:24:58 +0800536
Wei Yang67086e32016-03-04 10:53:11 +1100537 if (edev->physfn) {
Bryant G. Ly988fc3b2017-11-09 08:00:33 -0600538#ifdef CONFIG_PCI_IOV
Wei Yang67086e32016-03-04 10:53:11 +1100539 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
540
Jan H. Schönherr753f6122017-09-26 12:53:23 -0500541 pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
Wei Yang67086e32016-03-04 10:53:11 +1100542 edev->pdev = NULL;
543
544 /*
545 * We have to set the VF PE number to invalid one, which is
546 * required to plug the VF successfully.
547 */
548 pdn->pe_number = IODA_INVALID_PE;
549#endif
550 if (rmv_data)
Sam Bobroff1c5c5332018-09-12 11:23:27 +1000551 list_add(&edev->rmv_entry, &rmv_data->removed_vf_list);
Wei Yang67086e32016-03-04 10:53:11 +1100552 } else {
553 pci_lock_rescan_remove();
554 pci_stop_and_remove_bus_device(dev);
555 pci_unlock_rescan_remove();
556 }
Gavin Shanf5c57712013-07-24 10:24:58 +0800557
558 return NULL;
559}
560
Sam Bobroffd6c49322018-05-25 13:11:32 +1000561static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
Gavin Shanf5c57712013-07-24 10:24:58 +0800562{
Gavin Shanf5c57712013-07-24 10:24:58 +0800563 struct eeh_dev *edev, *tmp;
564
565 eeh_pe_for_each_dev(pe, edev, tmp) {
566 if (!(edev->mode & EEH_DEV_DISCONNECTED))
567 continue;
568
569 edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
570 eeh_rmv_from_parent_pe(edev);
571 }
572
573 return NULL;
574}
575
Gavin Shan78954702014-04-24 18:00:14 +1000576/*
577 * Explicitly clear PE's frozen state for PowerNV where
578 * we have frozen PE until BAR restore is completed. It's
579 * harmless to clear it for pSeries. To be consistent with
580 * PE reset (for 3 times), we try to clear the frozen state
581 * for 3 times as well.
582 */
Sam Bobroff4d8e3252018-11-29 14:16:40 +1100583static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed)
Gavin Shan78954702014-04-24 18:00:14 +1000584{
Sam Bobroff3376cb92018-11-29 14:16:37 +1100585 struct eeh_pe *pe;
586 int i;
Gavin Shan78954702014-04-24 18:00:14 +1000587
Sam Bobroff3376cb92018-11-29 14:16:37 +1100588 eeh_for_each_pe(root, pe) {
Sam Bobroff4d8e3252018-11-29 14:16:40 +1100589 if (include_passed || !eeh_pe_passed(pe)) {
590 for (i = 0; i < 3; i++)
591 if (!eeh_unfreeze_pe(pe))
592 break;
593 if (i >= 3)
594 return -EIO;
595 }
Gavin Shan2c665992014-05-05 09:29:02 +1000596 }
Sam Bobroff4d8e3252018-11-29 14:16:40 +1100597 eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed);
Sam Bobroff3376cb92018-11-29 14:16:37 +1100598 return 0;
Gavin Shan78954702014-04-24 18:00:14 +1000599}
600
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000601int eeh_pe_reset_and_recover(struct eeh_pe *pe)
602{
Gavin Shan2efc7712016-04-27 11:14:52 +1000603 int ret;
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000604
605 /* Bail if the PE is being recovered */
606 if (pe->state & EEH_PE_RECOVERING)
607 return 0;
608
609 /* Put the PE into recovery mode */
610 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
611
612 /* Save states */
613 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
614
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000615 /* Issue reset */
Sam Bobroff1ef52072018-11-29 14:16:41 +1100616 ret = eeh_pe_reset_full(pe, true);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000617 if (ret) {
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100618 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000619 return ret;
620 }
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000621
622 /* Unfreeze the PE */
Sam Bobroff4d8e3252018-11-29 14:16:40 +1100623 ret = eeh_clear_pe_frozen_state(pe, true);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000624 if (ret) {
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100625 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000626 return ret;
627 }
628
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000629 /* Restore device state */
630 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
631
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000632 /* Clear recovery mode */
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100633 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
Gavin Shan5cfb20b2014-09-30 12:39:07 +1000634
635 return 0;
636}
637
Linas Vepstas77bd7412005-11-03 18:52:49 -0600638/**
Gavin Shan29f8bf12012-02-27 20:04:02 +0000639 * eeh_reset_device - Perform actual reset of a pci slot
Sam Bobroff5fd13462018-03-19 13:48:55 +1100640 * @driver_eeh_aware: Does the device's driver provide EEH support?
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000641 * @pe: EEH PE
Gavin Shan29f8bf12012-02-27 20:04:02 +0000642 * @bus: PCI bus corresponding to the isolcated slot
Sam Bobroff5fd13462018-03-19 13:48:55 +1100643 * @rmv_data: Optional, list to record removed devices
Linas Vepstas77bd7412005-11-03 18:52:49 -0600644 *
Gavin Shan29f8bf12012-02-27 20:04:02 +0000645 * This routine must be called to do reset on the indicated PE.
646 * During the reset, udev might be invoked because those affected
647 * PCI devices will be removed and then added.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600648 */
Wei Yang67086e32016-03-04 10:53:11 +1100649static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
Sam Bobroff5fd13462018-03-19 13:48:55 +1100650 struct eeh_rmv_data *rmv_data,
651 bool driver_eeh_aware)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600652{
Arnd Bergmannedfd17f2017-11-04 22:26:52 +0100653 time64_t tstamp;
Wei Yang67086e32016-03-04 10:53:11 +1100654 int cnt, rc;
655 struct eeh_dev *edev;
Sam Bobroff1ef52072018-11-29 14:16:41 +1100656 struct eeh_pe *tmp_pe;
657 bool any_passed = false;
658
659 eeh_for_each_pe(pe, tmp_pe)
660 any_passed |= eeh_pe_passed(tmp_pe);
Linas Vepstas42405452006-04-28 17:39:38 -0500661
662 /* pcibios will clear the counter; save the value */
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000663 cnt = pe->freeze_count;
Gavin Shan5a719782013-06-20 13:21:01 +0800664 tstamp = pe->tstamp;
Linas Vepstas42405452006-04-28 17:39:38 -0500665
Gavin Shan20ee6a92012-09-11 19:16:17 +0000666 /*
667 * We don't remove the corresponding PE instances because
668 * we need the information afterwords. The attached EEH
669 * devices are expected to be attached soon when calling
Gavin Shanbd251b82016-05-03 15:41:37 +1000670 * into pci_hp_add_devices().
Gavin Shan20ee6a92012-09-11 19:16:17 +0000671 */
Gavin Shanf5c57712013-07-24 10:24:58 +0800672 eeh_pe_state_mark(pe, EEH_PE_KEEP);
Sam Bobroff1ef52072018-11-29 14:16:41 +1100673 if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
Gavin Shancca0e5422016-06-24 14:49:02 +1000674 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
Sam Bobroff54048cf2018-03-21 13:06:40 +1100675 } else {
676 pci_lock_rescan_remove();
677 pci_hp_remove_devices(bus);
678 pci_unlock_rescan_remove();
Rafael J. Wysocki1c2042c2014-01-15 14:33:20 +0100679 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600680
Gavin Shand0914f52014-04-24 18:00:12 +1000681 /*
682 * Reset the pci controller. (Asserts RST#; resets config space).
Linas Vepstasb6495c02005-11-03 18:54:54 -0600683 * Reconfigure bridges and devices. Don't try to bring the system
Gavin Shan29f8bf12012-02-27 20:04:02 +0000684 * up if the reset failed for some reason.
Gavin Shand0914f52014-04-24 18:00:12 +1000685 *
686 * During the reset, it's very dangerous to have uncontrolled PCI
687 * config accesses. So we prefer to block them. However, controlled
688 * PCI config accesses initiated from EEH itself are allowed.
Gavin Shan29f8bf12012-02-27 20:04:02 +0000689 */
Sam Bobroff1ef52072018-11-29 14:16:41 +1100690 rc = eeh_pe_reset_full(pe, false);
Gavin Shan28bf36f2014-11-14 10:47:29 +1100691 if (rc)
Linas Vepstasb6495c02005-11-03 18:54:54 -0600692 return rc;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600693
Rafael J. Wysocki1c2042c2014-01-15 14:33:20 +0100694 pci_lock_rescan_remove();
695
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000696 /* Restore PE */
697 eeh_ops->configure_bridge(pe);
698 eeh_pe_restore_bars(pe);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600699
Andrew Donnellandc9c41b2015-12-08 16:59:25 +1100700 /* Clear frozen state */
Sam Bobroff1ef52072018-11-29 14:16:41 +1100701 rc = eeh_clear_pe_frozen_state(pe, false);
Andrew Donnellan409bf7f2016-12-01 11:23:05 +1100702 if (rc) {
703 pci_unlock_rescan_remove();
Andrew Donnellandc9c41b2015-12-08 16:59:25 +1100704 return rc;
Andrew Donnellan409bf7f2016-12-01 11:23:05 +1100705 }
Gavin Shan78954702014-04-24 18:00:14 +1000706
Linas Vepstas77bd7412005-11-03 18:52:49 -0600707 /* Give the system 5 seconds to finish running the user-space
Gavin Shana84f2732013-06-20 13:20:51 +0800708 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
709 * this is a hack, but if we don't do this, and try to bring
710 * the device up before the scripts have taken it down,
Linas Vepstas77bd7412005-11-03 18:52:49 -0600711 * potentially weird things happen.
712 */
Sam Bobroff1c5c5332018-09-12 11:23:27 +1000713 if (!driver_eeh_aware || rmv_data->removed_dev_count) {
Sam Bobroff54048cf2018-03-21 13:06:40 +1100714 pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
715 (driver_eeh_aware ? "partial" : "complete"));
Gavin Shan29f8bf12012-02-27 20:04:02 +0000716 ssleep(5);
Gavin Shanf5c57712013-07-24 10:24:58 +0800717
718 /*
719 * The EEH device is still connected with its parent
720 * PE. We should disconnect it so the binding can be
721 * rebuilt when adding PCI devices.
722 */
Sam Bobroff80e65b02018-09-12 11:23:26 +1000723 edev = list_first_entry(&pe->edevs, struct eeh_dev, entry);
Gavin Shanf5c57712013-07-24 10:24:58 +0800724 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
Gavin Shana3aa2562016-06-17 13:05:11 +1000725 if (pe->type & EEH_PE_VF) {
Sam Bobroffbf773df2018-09-12 11:23:25 +1000726 eeh_add_virt_device(edev);
Gavin Shana3aa2562016-06-17 13:05:11 +1000727 } else {
Sam Bobroff54048cf2018-03-21 13:06:40 +1100728 if (!driver_eeh_aware)
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100729 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
Gavin Shanbd251b82016-05-03 15:41:37 +1000730 pci_hp_add_devices(bus);
Gavin Shana3aa2562016-06-17 13:05:11 +1000731 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600732 }
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100733 eeh_pe_state_clear(pe, EEH_PE_KEEP, true);
Gavin Shan5a719782013-06-20 13:21:01 +0800734
735 pe->tstamp = tstamp;
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000736 pe->freeze_count = cnt;
Linas Vepstasb6495c02005-11-03 18:54:54 -0600737
Rafael J. Wysocki1c2042c2014-01-15 14:33:20 +0100738 pci_unlock_rescan_remove();
Linas Vepstasb6495c02005-11-03 18:54:54 -0600739 return 0;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600740}
741
/*
 * The longest amount of time to wait for a PCI device to come back
 * online, in seconds. eeh_handle_normal_event() scales this by 1000
 * before handing it to eeh_wait_state().
 */
#define MAX_WAIT_FOR_RECOVERY 300
Linas Vepstas77bd7412005-11-03 18:52:49 -0600746
Russell Curreyc0b64972017-04-19 17:39:27 +1000747/**
748 * eeh_handle_normal_event - Handle EEH events on a specific PE
Sam Bobroff37fd8122018-03-19 13:46:30 +1100749 * @pe: EEH PE - which should not be used after we return, as it may
750 * have been invalidated.
Russell Curreyc0b64972017-04-19 17:39:27 +1000751 *
752 * Attempts to recover the given PE. If recovery fails or the PE has failed
753 * too many times, remove the PE.
754 *
Sam Bobroff68701782018-03-19 13:46:20 +1100755 * While PHB detects address or data parity errors on particular PCI
756 * slot, the associated PE will be frozen. Besides, DMA's occurring
757 * to wild addresses (which usually happen due to bugs in device
758 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
759 * #PERR or other misc PCI-related errors also can trigger EEH errors.
760 *
761 * Recovery process consists of unplugging the device driver (which
762 * generated hotplug events to userspace), then issuing a PCI #RST to
763 * the device, then reconfiguring the PCI config space for all bridges
764 * & devices under this slot, and then finally restarting the device
765 * drivers (which cause a second set of hotplug events to go out to
766 * userspace).
Russell Curreyc0b64972017-04-19 17:39:27 +1000767 */
Sam Bobroff37fd8122018-03-19 13:46:30 +1100768void eeh_handle_normal_event(struct eeh_pe *pe)
Linas Vepstas77bd7412005-11-03 18:52:49 -0600769{
Sam Bobroffcd95f802018-03-19 13:47:02 +1100770 struct pci_bus *bus;
Wei Yang67086e32016-03-04 10:53:11 +1100771 struct eeh_dev *edev, *tmp;
Sam Bobroff665012c2018-05-25 13:11:39 +1000772 struct eeh_pe *tmp_pe;
Linas Vepstasb6495c02005-11-03 18:54:54 -0600773 int rc = 0;
Paul Mackerras18eb3b32005-11-29 17:17:02 +1100774 enum pci_ers_result result = PCI_ERS_RESULT_NONE;
Sam Bobroff1c5c5332018-09-12 11:23:27 +1000775 struct eeh_rmv_data rmv_data =
776 {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
Linas Vepstas77bd7412005-11-03 18:52:49 -0600777
Sam Bobroffcd95f802018-03-19 13:47:02 +1100778 bus = eeh_pe_bus_get(pe);
779 if (!bus) {
Russell Currey1f52f172016-11-16 14:02:15 +1100780 pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000781 __func__, pe->phb->global_number, pe->addr);
Sam Bobroff37fd8122018-03-19 13:46:30 +1100782 return;
Linas Vepstas77bd7412005-11-03 18:52:49 -0600783 }
784
Sam Bobroff37fd8122018-03-19 13:46:30 +1100785 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
786
Gavin Shan5a719782013-06-20 13:21:01 +0800787 eeh_pe_update_time_stamp(pe);
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000788 pe->freeze_count++;
Russell Curreyc0b64972017-04-19 17:39:27 +1000789 if (pe->freeze_count > eeh_max_freezes) {
Sam Bobroff796b9f52018-05-25 13:11:28 +1000790 pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
Russell Curreyc0b64972017-04-19 17:39:27 +1000791 pe->phb->global_number, pe->addr,
792 pe->freeze_count);
Sam Bobroffb90484e2018-09-12 11:23:33 +1000793 result = PCI_ERS_RESULT_DISCONNECT;
Russell Curreyc0b64972017-04-19 17:39:27 +1000794 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600795
796 /* Walk the various device drivers attached to this slot through
797 * a reset sequence, giving each an opportunity to do what it needs
798 * to accomplish the reset. Each child gets a report of the
799 * status ... if any child can't handle the reset, then the entire
800 * slot is dlpar removed and added.
Gavin Shan8234fce2015-10-08 14:58:54 +1100801 *
802 * When the PHB is fenced, we have to issue a reset to recover from
803 * the error. Override the result if necessary to have partially
804 * hotplug for this case.
Linas Vepstas77bd7412005-11-03 18:52:49 -0600805 */
Sam Bobroffb90484e2018-09-12 11:23:33 +1000806 if (result != PCI_ERS_RESULT_DISCONNECT) {
807 pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
808 pe->freeze_count, eeh_max_freezes);
809 pr_info("EEH: Notify device drivers to shutdown\n");
810 eeh_set_channel_state(pe, pci_channel_io_frozen);
811 eeh_set_irq_state(pe, false);
812 eeh_pe_report("error_detected(IO frozen)", pe,
813 eeh_report_error, &result);
814 if ((pe->type & EEH_PE_PHB) &&
815 result != PCI_ERS_RESULT_NONE &&
816 result != PCI_ERS_RESULT_NEED_RESET)
817 result = PCI_ERS_RESULT_NEED_RESET;
818 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600819
Linas Vepstas5f1a7c82007-11-16 05:58:36 +1100820 /* Get the current PCI slot state. This can take a long time,
Wei Yang2ac39902015-04-27 09:25:10 +0800821 * sometimes over 300 seconds for certain systems.
Gavin Shan29f8bf12012-02-27 20:04:02 +0000822 */
Sam Bobroffb90484e2018-09-12 11:23:33 +1000823 if (result != PCI_ERS_RESULT_DISCONNECT) {
824 rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
825 if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
826 pr_warn("EEH: Permanent failure\n");
827 result = PCI_ERS_RESULT_DISCONNECT;
828 }
Linas Vepstas5f1a7c82007-11-16 05:58:36 +1100829 }
830
Linas Vepstasede8ca22007-05-09 09:33:29 +1000831 /* Since rtas may enable MMIO when posting the error log,
832 * don't post the error log until after all dev drivers
Linas Vepstas17213c32007-05-10 02:38:11 +1000833 * have been informed.
834 */
Sam Bobroffb90484e2018-09-12 11:23:33 +1000835 if (result != PCI_ERS_RESULT_DISCONNECT) {
836 pr_info("EEH: Collect temporary log\n");
837 eeh_slot_error_detail(pe, EEH_LOG_TEMP);
838 }
Linas Vepstasede8ca22007-05-09 09:33:29 +1000839
Linas Vepstas77bd7412005-11-03 18:52:49 -0600840 /* If all device drivers were EEH-unaware, then shut
841 * down all of the device drivers, and hope they
842 * go down willingly, without panicing the system.
843 */
Paul Mackerras18eb3b32005-11-29 17:17:02 +1100844 if (result == PCI_ERS_RESULT_NONE) {
Gavin Shan56ca4fd2013-06-27 13:46:46 +0800845 pr_info("EEH: Reset with hotplug activity\n");
Sam Bobroff5fd13462018-03-19 13:48:55 +1100846 rc = eeh_reset_device(pe, bus, NULL, false);
Linas Vepstase0f90b62007-03-19 14:52:04 -0500847 if (rc) {
Gavin Shan0dae2742014-07-17 14:41:41 +1000848 pr_warn("%s: Unable to reset, err=%d\n",
849 __func__, rc);
Sam Bobroffb90484e2018-09-12 11:23:33 +1000850 result = PCI_ERS_RESULT_DISCONNECT;
Linas Vepstase0f90b62007-03-19 14:52:04 -0500851 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600852 }
853
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500854 /* If all devices reported they can proceed, then re-enable MMIO */
855 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
Gavin Shan56ca4fd2013-06-27 13:46:46 +0800856 pr_info("EEH: Enable I/O for affected devices\n");
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000857 rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500858
Sam Bobroffb90484e2018-09-12 11:23:33 +1000859 if (rc < 0) {
860 result = PCI_ERS_RESULT_DISCONNECT;
861 } else if (rc) {
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500862 result = PCI_ERS_RESULT_NEED_RESET;
863 } else {
Gavin Shan56ca4fd2013-06-27 13:46:46 +0800864 pr_info("EEH: Notify device drivers to resume I/O\n");
Sam Bobroff20b34492018-05-25 13:11:40 +1000865 eeh_pe_report("mmio_enabled", pe,
866 eeh_report_mmio_enabled, &result);
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500867 }
868 }
869
870 /* If all devices reported they can proceed, then re-enable DMA */
871 if (result == PCI_ERS_RESULT_CAN_RECOVER) {
Gavin Shan56ca4fd2013-06-27 13:46:46 +0800872 pr_info("EEH: Enabled DMA for affected devices\n");
Gavin Shan9b3c76f2012-09-07 22:44:19 +0000873 rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500874
Sam Bobroffb90484e2018-09-12 11:23:33 +1000875 if (rc < 0) {
876 result = PCI_ERS_RESULT_DISCONNECT;
877 } else if (rc) {
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500878 result = PCI_ERS_RESULT_NEED_RESET;
Gavin Shan35845a72014-04-24 18:00:26 +1000879 } else {
880 /*
881 * We didn't do PE reset for the case. The PE
882 * is still in frozen state. Clear it before
883 * resuming the PE.
884 */
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100885 eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
Linas Vepstasd0e70342006-12-06 12:32:20 -0600886 result = PCI_ERS_RESULT_RECOVERED;
Gavin Shan35845a72014-04-24 18:00:26 +1000887 }
Linas Vepstas6a1ca372006-09-15 18:58:59 -0500888 }
889
Linas Vepstas77bd7412005-11-03 18:52:49 -0600890 /* If any device called out for a reset, then reset the slot */
Paul Mackerras18eb3b32005-11-29 17:17:02 +1100891 if (result == PCI_ERS_RESULT_NEED_RESET) {
Gavin Shan56ca4fd2013-06-27 13:46:46 +0800892 pr_info("EEH: Reset without hotplug activity\n");
Sam Bobroff5fd13462018-03-19 13:48:55 +1100893 rc = eeh_reset_device(pe, bus, &rmv_data, true);
Linas Vepstase0f90b62007-03-19 14:52:04 -0500894 if (rc) {
Gavin Shan0dae2742014-07-17 14:41:41 +1000895 pr_warn("%s: Cannot reset, err=%d\n",
896 __func__, rc);
Sam Bobroffb90484e2018-09-12 11:23:33 +1000897 result = PCI_ERS_RESULT_DISCONNECT;
898 } else {
899 result = PCI_ERS_RESULT_NONE;
900 eeh_set_channel_state(pe, pci_channel_io_normal);
901 eeh_set_irq_state(pe, true);
902 eeh_pe_report("slot_reset", pe, eeh_report_reset,
903 &result);
904 }
905 }
906
907 if ((result == PCI_ERS_RESULT_RECOVERED) ||
908 (result == PCI_ERS_RESULT_NONE)) {
909 /*
910 * For those hot removed VFs, we should add back them after PF
911 * get recovered properly.
912 */
913 list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
914 rmv_entry) {
915 eeh_add_virt_device(edev);
916 list_del(&edev->rmv_entry);
Linas Vepstase0f90b62007-03-19 14:52:04 -0500917 }
Gavin Shan56ca4fd2013-06-27 13:46:46 +0800918
Sam Bobroffb90484e2018-09-12 11:23:33 +1000919 /* Tell all device drivers that they can resume operations */
920 pr_info("EEH: Notify device driver to resume\n");
Sam Bobroff47cc8c12018-05-25 13:11:37 +1000921 eeh_set_channel_state(pe, pci_channel_io_normal);
Sam Bobroff010acfa2018-05-25 13:11:38 +1000922 eeh_set_irq_state(pe, true);
Sam Bobroffb90484e2018-09-12 11:23:33 +1000923 eeh_pe_report("resume", pe, eeh_report_resume, NULL);
924 eeh_for_each_pe(pe, tmp_pe) {
925 eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
926 edev->mode &= ~EEH_DEV_NO_HANDLER;
927 edev->in_error = false;
928 }
929 }
Linas Vepstas77bd7412005-11-03 18:52:49 -0600930
Sam Bobroffb90484e2018-09-12 11:23:33 +1000931 pr_info("EEH: Recovery successful.\n");
932 } else {
933 /*
934 * About 90% of all real-life EEH failures in the field
935 * are due to poorly seated PCI cards. Only 10% or so are
936 * due to actual, failed cards.
937 */
938 pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
939 "Please try reseating or replacing it\n",
940 pe->phb->global_number, pe->addr);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600941
Sam Bobroffb90484e2018-09-12 11:23:33 +1000942 eeh_slot_error_detail(pe, EEH_LOG_PERM);
Wei Yang67086e32016-03-04 10:53:11 +1100943
Sam Bobroffb90484e2018-09-12 11:23:33 +1000944 /* Notify all devices that they're about to go down. */
945 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
946 eeh_set_irq_state(pe, false);
947 eeh_pe_report("error_detected(permanent failure)", pe,
948 eeh_report_failure, NULL);
949
950 /* Mark the PE to be removed permanently */
951 eeh_pe_state_mark(pe, EEH_PE_REMOVED);
952
953 /*
954 * Shut down the device drivers for good. We mark
955 * all removed devices correctly to avoid access
956 * the their PCI config any more.
957 */
958 if (pe->type & EEH_PE_VF) {
959 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
960 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
961 } else {
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100962 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
Sam Bobroffb90484e2018-09-12 11:23:33 +1000963 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
964
965 pci_lock_rescan_remove();
966 pci_hp_remove_devices(bus);
967 pci_unlock_rescan_remove();
968 /* The passed PE should no longer be used */
969 return;
Sam Bobroff20b34492018-05-25 13:11:40 +1000970 }
971 }
Sam Bobroff9ed5ca62018-11-29 14:16:39 +1100972 eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
Linas Vepstas77bd7412005-11-03 18:52:49 -0600973}
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800974
Russell Curreyc0b64972017-04-19 17:39:27 +1000975/**
976 * eeh_handle_special_event - Handle EEH events without a specific failing PE
977 *
978 * Called when an EEH event is detected but can't be narrowed down to a
979 * specific PE. Iterates through possible failures and handles them as
980 * necessary.
981 */
Sam Bobroff68701782018-03-19 13:46:20 +1100982void eeh_handle_special_event(void)
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800983{
984 struct eeh_pe *pe, *phb_pe;
985 struct pci_bus *bus;
Gavin Shan7e4e7862014-01-15 13:16:11 +0800986 struct pci_controller *hose;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800987 unsigned long flags;
Gavin Shan7e4e7862014-01-15 13:16:11 +0800988 int rc;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800989
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800990
Gavin Shan7e4e7862014-01-15 13:16:11 +0800991 do {
992 rc = eeh_ops->next_error(&pe);
Gavin Shan8a6b1bc2013-06-20 13:21:04 +0800993
Gavin Shan7e4e7862014-01-15 13:16:11 +0800994 switch (rc) {
995 case EEH_NEXT_ERR_DEAD_IOC:
996 /* Mark all PHBs in dead state */
997 eeh_serialize_lock(&flags);
998
999 /* Purge all events */
Gavin Shan5c7a35e2014-06-04 17:31:52 +10001000 eeh_remove_event(NULL, true);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001001
1002 list_for_each_entry(hose, &hose_list, list_node) {
1003 phb_pe = eeh_phb_pe_get(hose);
1004 if (!phb_pe) continue;
1005
Sam Bobroffe762bb82018-09-12 11:23:31 +10001006 eeh_pe_mark_isolated(phb_pe);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001007 }
1008
1009 eeh_serialize_unlock(flags);
1010
1011 break;
1012 case EEH_NEXT_ERR_FROZEN_PE:
1013 case EEH_NEXT_ERR_FENCED_PHB:
1014 case EEH_NEXT_ERR_DEAD_PHB:
1015 /* Mark the PE in fenced state */
1016 eeh_serialize_lock(&flags);
1017
1018 /* Purge all events of the PHB */
Gavin Shan5c7a35e2014-06-04 17:31:52 +10001019 eeh_remove_event(pe, true);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001020
Sam Bobroffe762bb82018-09-12 11:23:31 +10001021 if (rc != EEH_NEXT_ERR_DEAD_PHB)
1022 eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
1023 eeh_pe_mark_isolated(pe);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001024
1025 eeh_serialize_unlock(flags);
1026
1027 break;
1028 case EEH_NEXT_ERR_NONE:
1029 return;
1030 default:
1031 pr_warn("%s: Invalid value %d from next_error()\n",
1032 __func__, rc);
1033 return;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001034 }
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001035
Gavin Shan7e4e7862014-01-15 13:16:11 +08001036 /*
1037 * For fenced PHB and frozen PE, it's handled as normal
1038 * event. We have to remove the affected PHBs for dead
1039 * PHB and IOC
1040 */
1041 if (rc == EEH_NEXT_ERR_FROZEN_PE ||
1042 rc == EEH_NEXT_ERR_FENCED_PHB) {
Sam Bobroff37fd8122018-03-19 13:46:30 +11001043 eeh_handle_normal_event(pe);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001044 } else {
Linus Torvalds1b173662014-01-27 21:11:26 -08001045 pci_lock_rescan_remove();
Gavin Shan7e4e7862014-01-15 13:16:11 +08001046 list_for_each_entry(hose, &hose_list, list_node) {
1047 phb_pe = eeh_phb_pe_get(hose);
1048 if (!phb_pe ||
Gavin Shan9e049372014-04-24 18:00:07 +10001049 !(phb_pe->state & EEH_PE_ISOLATED) ||
1050 (phb_pe->state & EEH_PE_RECOVERING))
Gavin Shan7e4e7862014-01-15 13:16:11 +08001051 continue;
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001052
Gavin Shan7e4e7862014-01-15 13:16:11 +08001053 /* Notify all devices to be down */
Sam Bobroff9ed5ca62018-11-29 14:16:39 +11001054 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
Sam Bobroff47cc8c12018-05-25 13:11:37 +10001055 eeh_set_channel_state(pe, pci_channel_io_perm_failure);
Sam Bobroff20b34492018-05-25 13:11:40 +10001056 eeh_pe_report(
1057 "error_detected(permanent failure)", pe,
Russell Curreyaf2e3a02016-09-12 14:17:24 +10001058 eeh_report_failure, NULL);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001059 bus = eeh_pe_bus_get(phb_pe);
Russell Currey04fec21c2016-09-12 14:17:22 +10001060 if (!bus) {
1061 pr_err("%s: Cannot find PCI bus for "
Russell Currey1f52f172016-11-16 14:02:15 +11001062 "PHB#%x-PE#%x\n",
Russell Currey04fec21c2016-09-12 14:17:22 +10001063 __func__,
1064 pe->phb->global_number,
1065 pe->addr);
1066 break;
1067 }
Gavin Shanbd251b82016-05-03 15:41:37 +10001068 pci_hp_remove_devices(bus);
Gavin Shan7e4e7862014-01-15 13:16:11 +08001069 }
Linus Torvalds1b173662014-01-27 21:11:26 -08001070 pci_unlock_rescan_remove();
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001071 }
Gavin Shan7e4e7862014-01-15 13:16:11 +08001072
1073 /*
1074 * If we have detected dead IOC, we needn't proceed
1075 * any more since all PHBs would have been removed
1076 */
1077 if (rc == EEH_NEXT_ERR_DEAD_IOC)
1078 break;
1079 } while (rc != EEH_NEXT_ERR_NONE);
Gavin Shan8a6b1bc2013-06-20 13:21:04 +08001080}