amd64_edac: Cleanup NBCTL code
[linux-2.6.git] / drivers / edac / i5000_edac.c
index 4a16b5b..a5cefab 100644 (file)
 #define                        FERR_NF_UNCORRECTABLE   (FERR_NF_M12ERR | \
                                                        FERR_NF_M11ERR | \
                                                        FERR_NF_M10ERR | \
+                                                       FERR_NF_M9ERR | \
                                                        FERR_NF_M8ERR | \
                                                        FERR_NF_M7ERR | \
                                                        FERR_NF_M6ERR | \
@@ -301,6 +302,9 @@ static char *numcol_toString[] = {
 };
 #endif
 
+/* enables the report of miscellaneous messages as CE errors - default off */
+static int misc_messages;
+
 /* Enumeration of supported devices */
 enum i5000_chips {
        I5000P = 0,
@@ -466,7 +470,8 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
                                        struct i5000_error_info *info,
                                        int handle_errors)
 {
-       char msg[EDAC_MC_LABEL_LEN + 1 + 90];
+       char msg[EDAC_MC_LABEL_LEN + 1 + 160];
+       char *specific = NULL;
        u32 allErrors;
        int branch;
        int channel;
@@ -480,11 +485,6 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
        if (!allErrors)
                return;         /* if no error, return now */
 
-       /* ONLY ONE of the possible error bits will be set, as per the docs */
-       i5000_mc_printk(mci, KERN_ERR,
-                       "FATAL ERRORS Found!!! 1st FATAL Err Reg= 0x%x\n",
-                       allErrors);
-
        branch = EXTRACT_FBDCHAN_INDX(info->ferr_fat_fbd);
        channel = branch;
 
@@ -501,28 +501,42 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
                rdwr ? "Write" : "Read", ras, cas);
 
        /* Only 1 bit will be on */
-       if (allErrors & FERR_FAT_M1ERR) {
-               i5000_mc_printk(mci, KERN_ERR,
-                               "Alert on non-redundant retry or fast "
-                               "reset timeout\n");
-
-       } else if (allErrors & FERR_FAT_M2ERR) {
-               i5000_mc_printk(mci, KERN_ERR,
-                               "Northbound CRC error on non-redundant "
-                               "retry\n");
-
-       } else if (allErrors & FERR_FAT_M3ERR) {
-               i5000_mc_printk(mci, KERN_ERR,
-                               ">Tmid Thermal event with intelligent "
-                               "throttling disabled\n");
+       switch (allErrors) {
+       case FERR_FAT_M1ERR:
+               specific = "Alert on non-redundant retry or fast "
+                               "reset timeout";
+               break;
+       case FERR_FAT_M2ERR:
+               specific = "Northbound CRC error on non-redundant "
+                               "retry";
+               break;
+       case FERR_FAT_M3ERR:
+               {
+               static int done;
+
+               /*
+                * This error is generated to inform that the intelligent
+                * throttling is disabled and the temperature passed the
+                * specified middle point. Since this is something the BIOS
+                * should take care of, we'll warn only once to avoid
+                * worthlessly flooding the log.
+                */
+               if (done)
+                       return;
+               done++;
+
+               specific = ">Tmid Thermal event with intelligent "
+                          "throttling disabled";
+               }
+               break;
        }
 
        /* Form out message */
        snprintf(msg, sizeof(msg),
                 "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d CAS=%d "
-                "FATAL Err=0x%x)",
+                "FATAL Err=0x%x (%s))",
                 branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas,
-                allErrors);
+                allErrors, specific);
 
        /* Call the helper to output message */
        edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
@@ -539,7 +553,8 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
                                        struct i5000_error_info *info,
                                        int handle_errors)
 {
-       char msg[EDAC_MC_LABEL_LEN + 1 + 90];
+       char msg[EDAC_MC_LABEL_LEN + 1 + 170];
+       char *specific = NULL;
        u32 allErrors;
        u32 ue_errors;
        u32 ce_errors;
@@ -557,16 +572,18 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
                return;         /* if no error, return now */
 
        /* ONLY ONE of the possible error bits will be set, as per the docs */
-       i5000_mc_printk(mci, KERN_WARNING,
-                       "NON-FATAL ERRORS Found!!! 1st NON-FATAL Err "
-                       "Reg= 0x%x\n", allErrors);
-
        ue_errors = allErrors & FERR_NF_UNCORRECTABLE;
        if (ue_errors) {
                debugf0("\tUncorrected bits= 0x%x\n", ue_errors);
 
                branch = EXTRACT_FBDCHAN_INDX(info->ferr_nf_fbd);
-               channel = branch;
+
+               /*
+                * According to the i5000 datasheet, bit 28 has no significance
+                * for errors M4Err-M12Err and M17Err-M21Err, on FERR_NF_FBD
+                */
+               channel = branch & 2;
+
                bank = NREC_BANK(info->nrecmema);
                rank = NREC_RANK(info->nrecmema);
                rdwr = NREC_RDWR(info->nrecmema);
@@ -579,12 +596,47 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
                        rank, channel, channel + 1, branch >> 1, bank,
                        rdwr ? "Write" : "Read", ras, cas);
 
+               switch (ue_errors) {
+               case FERR_NF_M12ERR:
+                       specific = "Non-Aliased Uncorrectable Patrol Data ECC";
+                       break;
+               case FERR_NF_M11ERR:
+                       specific = "Non-Aliased Uncorrectable Spare-Copy "
+                                       "Data ECC";
+                       break;
+               case FERR_NF_M10ERR:
+                       specific = "Non-Aliased Uncorrectable Mirrored Demand "
+                                       "Data ECC";
+                       break;
+               case FERR_NF_M9ERR:
+                       specific = "Non-Aliased Uncorrectable Non-Mirrored "
+                                       "Demand Data ECC";
+                       break;
+               case FERR_NF_M8ERR:
+                       specific = "Aliased Uncorrectable Patrol Data ECC";
+                       break;
+               case FERR_NF_M7ERR:
+                       specific = "Aliased Uncorrectable Spare-Copy Data ECC";
+                       break;
+               case FERR_NF_M6ERR:
+                       specific = "Aliased Uncorrectable Mirrored Demand "
+                                       "Data ECC";
+                       break;
+               case FERR_NF_M5ERR:
+                       specific = "Aliased Uncorrectable Non-Mirrored Demand "
+                                       "Data ECC";
+                       break;
+               case FERR_NF_M4ERR:
+                       specific = "Uncorrectable Data ECC on Replay";
+                       break;
+               }
+
                /* Form out message */
                snprintf(msg, sizeof(msg),
                         "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d "
-                        "CAS=%d, UE Err=0x%x)",
+                        "CAS=%d, UE Err=0x%x (%s))",
                         branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas,
-                        ue_errors);
+                        ue_errors, specific);
 
                /* Call the helper to output message */
                edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
@@ -616,51 +668,74 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
                        rank, channel, branch >> 1, bank,
                        rdwr ? "Write" : "Read", ras, cas);
 
+               switch (ce_errors) {
+               case FERR_NF_M17ERR:
+                       specific = "Correctable Non-Mirrored Demand Data ECC";
+                       break;
+               case FERR_NF_M18ERR:
+                       specific = "Correctable Mirrored Demand Data ECC";
+                       break;
+               case FERR_NF_M19ERR:
+                       specific = "Correctable Spare-Copy Data ECC";
+                       break;
+               case FERR_NF_M20ERR:
+                       specific = "Correctable Patrol Data ECC";
+                       break;
+               }
+
                /* Form out message */
                snprintf(msg, sizeof(msg),
                         "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d "
-                        "CAS=%d, CE Err=0x%x)", branch >> 1, bank,
-                        rdwr ? "Write" : "Read", ras, cas, ce_errors);
+                        "CAS=%d, CE Err=0x%x (%s))", branch >> 1, bank,
+                        rdwr ? "Write" : "Read", ras, cas, ce_errors,
+                        specific);
 
                /* Call the helper to output message */
                edac_mc_handle_fbd_ce(mci, rank, channel, msg);
        }
 
-       /* See if any of the thermal errors have fired */
-       misc_errors = allErrors & FERR_NF_THERMAL;
-       if (misc_errors) {
-               i5000_printk(KERN_WARNING, "\tTHERMAL Error, bits= 0x%x\n",
-                       misc_errors);
-       }
-
-       /* See if any of the thermal errors have fired */
-       misc_errors = allErrors & FERR_NF_NON_RETRY;
-       if (misc_errors) {
-               i5000_printk(KERN_WARNING, "\tNON-Retry  Errors, bits= 0x%x\n",
-                       misc_errors);
-       }
+       if (!misc_messages)
+               return;
 
-       /* See if any of the thermal errors have fired */
-       misc_errors = allErrors & FERR_NF_NORTH_CRC;
+       misc_errors = allErrors & (FERR_NF_NON_RETRY | FERR_NF_NORTH_CRC |
+                                  FERR_NF_SPD_PROTOCOL | FERR_NF_DIMM_SPARE);
        if (misc_errors) {
-               i5000_printk(KERN_WARNING,
-                       "\tNORTHBOUND CRC  Error, bits= 0x%x\n",
-                       misc_errors);
-       }
+               switch (misc_errors) {
+               case FERR_NF_M13ERR:
+                       specific = "Non-Retry or Redundant Retry FBD Memory "
+                                       "Alert or Redundant Fast Reset Timeout";
+                       break;
+               case FERR_NF_M14ERR:
+                       specific = "Non-Retry or Redundant Retry FBD "
+                                       "Configuration Alert";
+                       break;
+               case FERR_NF_M15ERR:
+                       specific = "Non-Retry or Redundant Retry FBD "
+                                       "Northbound CRC error on read data";
+                       break;
+               case FERR_NF_M21ERR:
+                       specific = "FBD Northbound CRC error on "
+                                       "FBD Sync Status";
+                       break;
+               case FERR_NF_M22ERR:
+                       specific = "SPD protocol error";
+                       break;
+               case FERR_NF_M27ERR:
+                       specific = "DIMM-spare copy started";
+                       break;
+               case FERR_NF_M28ERR:
+                       specific = "DIMM-spare copy completed";
+                       break;
+               }
+               branch = EXTRACT_FBDCHAN_INDX(info->ferr_nf_fbd);
 
-       /* See if any of the thermal errors have fired */
-       misc_errors = allErrors & FERR_NF_SPD_PROTOCOL;
-       if (misc_errors) {
-               i5000_printk(KERN_WARNING,
-                       "\tSPD Protocol  Error, bits= 0x%x\n",
-                       misc_errors);
-       }
+               /* Form out message */
+               snprintf(msg, sizeof(msg),
+                        "(Branch=%d Err=%#x (%s))", branch >> 1,
+                        misc_errors, specific);
 
-       /* See if any of the thermal errors have fired */
-       misc_errors = allErrors & FERR_NF_DIMM_SPARE;
-       if (misc_errors) {
-               i5000_printk(KERN_WARNING, "\tDIMM-Spare  Error, bits= 0x%x\n",
-                       misc_errors);
+               /* Call the helper to output message */
+               edac_mc_handle_fbd_ce(mci, 0, 0, msg);
        }
 }
 
@@ -699,7 +774,7 @@ static void i5000_clear_error(struct mem_ctl_info *mci)
 static void i5000_check_error(struct mem_ctl_info *mci)
 {
        struct i5000_error_info info;
-       debugf4("MC%d: " __FILE__ ": %s()\n", mci->mc_idx, __func__);
+       debugf4("MC%d: %s: %s()\n", mci->mc_idx, __FILE__, __func__);
        i5000_get_error_info(mci, &info);
        i5000_process_error_info(mci, &info, 1);
 }
@@ -1104,7 +1179,7 @@ static void i5000_get_mc_regs(struct mem_ctl_info *mci)
                        pci_read_config_word(pvt->branch_1, where,
                                        &pvt->b1_mtr[slot_row]);
                        debugf2("MTR%d where=0x%x B1 value=0x%x\n", slot_row,
-                               where, pvt->b0_mtr[slot_row]);
+                               where, pvt->b1_mtr[slot_row]);
                } else {
                        pvt->b1_mtr[slot_row] = 0;
                }
@@ -1163,7 +1238,7 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
        struct csrow_info *p_csrow;
        int empty, channel_count;
        int max_csrows;
-       int mtr;
+       int mtr, mtr1;
        int csrow_megs;
        int channel;
        int csrow;
@@ -1182,9 +1257,10 @@ static int i5000_init_csrows(struct mem_ctl_info *mci)
 
                /* use branch 0 for the basis */
                mtr = pvt->b0_mtr[csrow >> 1];
+               mtr1 = pvt->b1_mtr[csrow >> 1];
 
                /* if no DIMMS on this row, continue */
-               if (!MTR_DIMMS_PRESENT(mtr))
+               if (!MTR_DIMMS_PRESENT(mtr) && !MTR_DIMMS_PRESENT(mtr1))
                        continue;
 
                /* FAKE OUT VALUES, FIXME */
@@ -1277,8 +1353,8 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
        int num_dimms_per_channel;
        int num_csrows;
 
-       debugf0("MC: " __FILE__ ": %s(), pdev bus %u dev=0x%x fn=0x%x\n",
-               __func__,
+       debugf0("MC: %s: %s(), pdev bus %u dev=0x%x fn=0x%x\n",
+               __FILE__, __func__,
                pdev->bus->number,
                PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
 
@@ -1312,7 +1388,8 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
        if (mci == NULL)
                return -ENOMEM;
 
-       debugf0("MC: " __FILE__ ": %s(): mci = %p\n", __func__, mci);
+       kobject_get(&mci->edac_mci_kobj);
+       debugf0("MC: %s: %s(): mci = %p\n", __FILE__, __func__, mci);
 
        mci->dev = &pdev->dev;  /* record ptr  to the generic device */
 
@@ -1355,8 +1432,8 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx)
 
        /* add this new MC control structure to EDAC's list of MCs */
        if (edac_mc_add_mc(mci)) {
-               debugf0("MC: " __FILE__
-                       ": %s(): failed edac_mc_add_mc()\n", __func__);
+               debugf0("MC: %s: %s(): failed edac_mc_add_mc()\n",
+                       __FILE__, __func__);
                /* FIXME: perhaps some code should go here that disables error
                 * reporting if we just enabled it
                 */
@@ -1384,6 +1461,7 @@ fail1:
        i5000_put_devices(mci);
 
 fail0:
+       kobject_put(&mci->edac_mci_kobj);
        edac_mc_free(mci);
        return -ENODEV;
 }
@@ -1400,11 +1478,11 @@ static int __devinit i5000_init_one(struct pci_dev *pdev,
 {
        int rc;
 
-       debugf0("MC: " __FILE__ ": %s()\n", __func__);
+       debugf0("MC: %s: %s()\n", __FILE__, __func__);
 
        /* wake up device */
        rc = pci_enable_device(pdev);
-       if (rc == -EIO)
+       if (rc)
                return rc;
 
        /* now probe and enable the device */
@@ -1419,7 +1497,7 @@ static void __devexit i5000_remove_one(struct pci_dev *pdev)
 {
        struct mem_ctl_info *mci;
 
-       debugf0(__FILE__ ": %s()\n", __func__);
+       debugf0("%s: %s()\n", __FILE__, __func__);
 
        if (i5000_pci)
                edac_pci_release_generic_ctl(i5000_pci);
@@ -1429,7 +1507,7 @@ static void __devexit i5000_remove_one(struct pci_dev *pdev)
 
        /* retrieve references to resources, and free those resources */
        i5000_put_devices(mci);
-
+       kobject_put(&mci->edac_mci_kobj);
        edac_mc_free(mci);
 }
 
@@ -1466,7 +1544,7 @@ static int __init i5000_init(void)
 {
        int pci_rc;
 
-       debugf2("MC: " __FILE__ ": %s()\n", __func__);
+       debugf2("MC: %s: %s()\n", __FILE__, __func__);
 
        /* Ensure that the OPSTATE is set correctly for POLL or NMI */
        opstate_init();
@@ -1482,7 +1560,7 @@ static int __init i5000_init(void)
  */
 static void __exit i5000_exit(void)
 {
-       debugf2("MC: " __FILE__ ": %s()\n", __func__);
+       debugf2("MC: %s: %s()\n", __FILE__, __func__);
        pci_unregister_driver(&i5000_driver);
 }
 
@@ -1497,3 +1575,6 @@ MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
 
 module_param(edac_op_state, int, 0444);
 MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
+module_param(misc_messages, int, 0444);
+MODULE_PARM_DESC(misc_messages, "Log miscellaneous non fatal messages");
+