]> nv-tegra.nvidia Code Review - linux-3.10.git/blobdiff - drivers/edac/edac_mc.c
RAS: Add a tracepoint for reporting memory controller events
[linux-3.10.git] / drivers / edac / edac_mc.c
index 10f375032e9686f7c719c857ceca5dd3f9ef81ed..ce25750a83f904cdf2c4a7306ba5eb33527a0059 100644 (file)
 #include <linux/list.h>
 #include <linux/ctype.h>
 #include <linux/edac.h>
+#include <linux/bitops.h>
 #include <asm/uaccess.h>
 #include <asm/page.h>
 #include <asm/edac.h>
 #include "edac_core.h"
 #include "edac_module.h"
 
+#define CREATE_TRACE_POINTS
+#define TRACE_INCLUDE_PATH ../../include/ras
+#include <ras/ras_event.h>
+
 /* lock to memory controller's control array */
 static DEFINE_MUTEX(mem_ctls_mutex);
 static LIST_HEAD(mc_devices);
@@ -384,6 +389,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
         * which will perform kobj unregistration and the actual free
         * will occur during the kobject callback operation
         */
+
        return mci;
 }
 EXPORT_SYMBOL_GPL(edac_mc_alloc);
@@ -902,19 +908,19 @@ static void edac_ce_error(struct mem_ctl_info *mci,
                          const bool enable_per_layer_report,
                          const unsigned long page_frame_number,
                          const unsigned long offset_in_page,
-                         u32 grain)
+                         long grain)
 {
        unsigned long remapped_page;
 
        if (edac_mc_get_log_ce()) {
                if (other_detail && *other_detail)
                        edac_mc_printk(mci, KERN_WARNING,
-                                      "CE %s on %s (%s%s - %s)\n",
+                                      "CE %s on %s (%s %s - %s)\n",
                                       msg, label, location,
                                       detail, other_detail);
                else
                        edac_mc_printk(mci, KERN_WARNING,
-                                      "CE %s on %s (%s%s)\n",
+                                      "CE %s on %s (%s %s)\n",
                                       msg, label, location,
                                       detail);
        }
@@ -953,12 +959,12 @@ static void edac_ue_error(struct mem_ctl_info *mci,
        if (edac_mc_get_log_ue()) {
                if (other_detail && *other_detail)
                        edac_mc_printk(mci, KERN_WARNING,
-                                      "UE %s on %s (%s%s - %s)\n",
+                                      "UE %s on %s (%s %s - %s)\n",
                                       msg, label, location, detail,
                                       other_detail);
                else
                        edac_mc_printk(mci, KERN_WARNING,
-                                      "UE %s on %s (%s%s)\n",
+                                      "UE %s on %s (%s %s)\n",
                                       msg, label, location, detail);
        }
 
@@ -975,27 +981,50 @@ static void edac_ue_error(struct mem_ctl_info *mci,
 }
 
 #define OTHER_LABEL " or "
+
+/**
+ * edac_mc_handle_error - reports a memory event to userspace
+ *
+ * @type:              severity of the error (CE/UE/Fatal)
+ * @mci:               a struct mem_ctl_info pointer
+ * @page_frame_number: mem page where the error occurred
+ * @offset_in_page:    offset of the error inside the page
+ * @syndrome:          ECC syndrome
+ * @top_layer:         Memory layer[0] position
+ * @mid_layer:         Memory layer[1] position
+ * @low_layer:         Memory layer[2] position
+ * @msg:               Message meaningful to the end users that
+ *                     explains the event
+ * @other_detail:      Technical details about the event that
+ *                     may help hardware manufacturers and
+ *                     EDAC developers to analyse the event
+ * @arch_log:          Architecture-specific struct that can
+ *                     be used to add extended information to the
+ *                     tracepoint, like dumping MCE registers.
+ */
 void edac_mc_handle_error(const enum hw_event_mc_err_type type,
                          struct mem_ctl_info *mci,
                          const unsigned long page_frame_number,
                          const unsigned long offset_in_page,
                          const unsigned long syndrome,
-                         const int layer0,
-                         const int layer1,
-                         const int layer2,
+                         const int top_layer,
+                         const int mid_layer,
+                         const int low_layer,
                          const char *msg,
                          const char *other_detail,
-                         const void *mcelog)
+                         const void *arch_log)
 {
        /* FIXME: too much for stack: move it to some pre-alocated area */
        char detail[80], location[80];
        char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
        char *p;
        int row = -1, chan = -1;
-       int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
+       int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
        int i;
-       u32 grain;
+       long grain;
        bool enable_per_layer_report = false;
+       u16 error_count;        /* FIXME: make it a parameter */
+       u8 grain_bits;
 
        debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
 
@@ -1045,11 +1074,11 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
        for (i = 0; i < mci->tot_dimms; i++) {
                struct dimm_info *dimm = &mci->dimms[i];
 
-               if (layer0 >= 0 && layer0 != dimm->location[0])
+               if (top_layer >= 0 && top_layer != dimm->location[0])
                        continue;
-               if (layer1 >= 0 && layer1 != dimm->location[1])
+               if (mid_layer >= 0 && mid_layer != dimm->location[1])
                        continue;
-               if (layer2 >= 0 && layer2 != dimm->location[2])
+               if (low_layer >= 0 && low_layer != dimm->location[2])
                        continue;
 
                /* get the max grain, over the error match range */
@@ -1120,11 +1149,22 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
                             edac_layer_name[mci->layers[i].type],
                             pos[i]);
        }
+       if (p > location)
+               *(p - 1) = '\0';
+
+       /* Report the error via the trace interface */
+
+       error_count = 1;        /* FIXME: allow change it */
+       grain_bits = fls_long(grain) + 1;
+       trace_mc_event(type, msg, label, error_count,
+                      mci->mc_idx, top_layer, mid_layer, low_layer,
+                      PAGES_TO_MiB(page_frame_number) | offset_in_page,
+                      grain_bits, syndrome, other_detail);
 
        /* Memory type dependent details about the error */
        if (type == HW_EVENT_ERR_CORRECTED) {
                snprintf(detail, sizeof(detail),
-                       "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
+                       "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
                        page_frame_number, offset_in_page,
                        grain, syndrome);
                edac_ce_error(mci, pos, msg, location, label, detail,
@@ -1132,7 +1172,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
                              page_frame_number, offset_in_page, grain);
        } else {
                snprintf(detail, sizeof(detail),
-                       "page:0x%lx offset:0x%lx grain:%d",
+                       "page:0x%lx offset:0x%lx grain:%ld",
                        page_frame_number, offset_in_page, grain);
 
                edac_ue_error(mci, pos, msg, location, label, detail,