Commit 4275be63559719c3149b19751029f1b0f1b26775

Authored by Mauro Carvalho Chehab
1 parent 982216a429

edac: Change internal representation to work with layers

Change the EDAC internal representation to work with non-csrow
based memory controllers.

There are lots of those memory controllers nowadays, and more
are coming. So, the EDAC internal representation needs to be
changed, in order to work with those memory controllers, while
preserving backward compatibility with the old ones.

The edac core was written with the idea that memory controllers
are able to directly access csrows.

This is not true for FB-DIMM and RAMBUS memory controllers.

Also, some recent advanced memory controllers don't present a per-csrows
view. Instead, they view memory as DIMMs rather than ranks.

So, change the allocation and error report routines to allow
them to work with all types of architectures.

This will allow the removal of several hacks with FB-DIMM and RAMBUS
memory controllers.

Also, several tests were done on different platforms using different
x86 drivers.

TODO: multi-rank DIMMs are currently represented by multiple DIMM
entries in struct dimm_info. That means that changing a label for one
rank won't change the same label for the other ranks at the same DIMM.
This bug has been present since the beginning of EDAC, so it is not a big
deal. However, on several drivers, it is possible to fix this issue, but
it should be a per-driver fix, as the csrow => DIMM arrangement may not
be equal for all. So, don't try to fix it here yet.

I tried to make this patch as short as possible, preceding it with
several other patches that simplified the logic here. Yet, as the
internal API changes, all drivers need changes. The changes are
generally bigger in the drivers for FB-DIMMs.

Cc: Aristeu Rozanski <arozansk@redhat.com>
Cc: Doug Thompson <norsk5@yahoo.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Mark Gross <mark.gross@intel.com>
Cc: Jason Uhlenkott <juhlenko@akamai.com>
Cc: Tim Small <tim@buttersideup.com>
Cc: Ranganathan Desikan <ravi@jetztechnologies.com>
Cc: "Arvind R." <arvino55@gmail.com>
Cc: Olof Johansson <olof@lixom.net>
Cc: Egor Martovetsky <egor@pasemi.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Joe Perches <joe@perches.com>
Cc: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Hitoshi Mitake <h.mitake@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Niklas Söderlund" <niklas.soderlund@ericsson.com>
Cc: Shaohui Xie <Shaohui.Xie@freescale.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>

Showing 3 changed files with 552 additions and 287 deletions Side-by-side Diff

drivers/edac/edac_core.h
... ... @@ -447,8 +447,12 @@
447 447  
448 448 #endif /* CONFIG_PCI */
449 449  
450   -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
451   - unsigned nr_chans, int edac_index);
  450 +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
  451 + unsigned nr_chans, int edac_index);
  452 +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
  453 + unsigned n_layers,
  454 + struct edac_mc_layer *layers,
  455 + unsigned sz_pvt);
452 456 extern int edac_mc_add_mc(struct mem_ctl_info *mci);
453 457 extern void edac_mc_free(struct mem_ctl_info *mci);
454 458 extern struct mem_ctl_info *edac_mc_find(int idx);
455 459  
... ... @@ -467,25 +471,79 @@
467 471 * reporting logic and function interface - reduces conditional
468 472 * statement clutter and extra function arguments.
469 473 */
470   -extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
471   - unsigned long page_frame_number,
472   - unsigned long offset_in_page,
473   - unsigned long syndrome, int row, int channel,
474   - const char *msg);
475   -extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
476   - const char *msg);
477   -extern void edac_mc_handle_ue(struct mem_ctl_info *mci,
478   - unsigned long page_frame_number,
479   - unsigned long offset_in_page, int row,
480   - const char *msg);
481   -extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
482   - const char *msg);
483   -extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow,
484   - unsigned int channel0, unsigned int channel1,
485   - char *msg);
486   -extern void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, unsigned int csrow,
487   - unsigned int channel, char *msg);
488 474  
  475 +void edac_mc_handle_error(const enum hw_event_mc_err_type type,
  476 + struct mem_ctl_info *mci,
  477 + const unsigned long page_frame_number,
  478 + const unsigned long offset_in_page,
  479 + const unsigned long syndrome,
  480 + const int layer0,
  481 + const int layer1,
  482 + const int layer2,
  483 + const char *msg,
  484 + const char *other_detail,
  485 + const void *mcelog);
  486 +
  487 +static inline void edac_mc_handle_ce(struct mem_ctl_info *mci,
  488 + unsigned long page_frame_number,
  489 + unsigned long offset_in_page,
  490 + unsigned long syndrome, int row, int channel,
  491 + const char *msg)
  492 +{
  493 + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
  494 + page_frame_number, offset_in_page, syndrome,
  495 + row, channel, -1, msg, NULL, NULL);
  496 +}
  497 +
  498 +static inline void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
  499 + const char *msg)
  500 +{
  501 + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
  502 + 0, 0, 0, -1, -1, -1, msg, NULL, NULL);
  503 +}
  504 +
  505 +static inline void edac_mc_handle_ue(struct mem_ctl_info *mci,
  506 + unsigned long page_frame_number,
  507 + unsigned long offset_in_page, int row,
  508 + const char *msg)
  509 +{
  510 + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
  511 + page_frame_number, offset_in_page, 0,
  512 + row, -1, -1, msg, NULL, NULL);
  513 +}
  514 +
  515 +static inline void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
  516 + const char *msg)
  517 +{
  518 + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
  519 + 0, 0, 0, -1, -1, -1, msg, NULL, NULL);
  520 +}
  521 +
  522 +static inline void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
  523 + unsigned int csrow,
  524 + unsigned int channel0,
  525 + unsigned int channel1,
  526 + char *msg)
  527 +{
  528 + /*
  529 + *FIXME: The error can also be at channel1 (e. g. at the second
  530 + * channel of the same branch). The fix is to push
  531 + * edac_mc_handle_error() call into each driver
  532 + */
  533 + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
  534 + 0, 0, 0,
  535 + csrow, channel0, -1, msg, NULL, NULL);
  536 +}
  537 +
  538 +static inline void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
  539 + unsigned int csrow,
  540 + unsigned int channel, char *msg)
  541 +{
  542 + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
  543 + 0, 0, 0,
  544 + csrow, channel, -1, msg, NULL, NULL);
  545 +}
  546 +
489 547 /*
490 548 * edac_device APIs
491 549 */
... ... @@ -496,6 +554,7 @@
496 554 extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
497 555 int inst_nr, int block_nr, const char *msg);
498 556 extern int edac_device_alloc_index(void);
  557 +extern const char *edac_layer_name[];
499 558  
500 559 /*
501 560 * edac_pci APIs
drivers/edac/edac_mc.c
... ... @@ -44,11 +44,27 @@
44 44 debugf4("\tchannel = %p\n", chan);
45 45 debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx);
46 46 debugf4("\tchannel->csrow = %p\n\n", chan->csrow);
47   - debugf4("\tdimm->ce_count = %d\n", chan->dimm->ce_count);
48   - debugf4("\tdimm->label = '%s'\n", chan->dimm->label);
49   - debugf4("\tdimm->nr_pages = 0x%x\n", chan->dimm->nr_pages);
  47 + debugf4("\tchannel->dimm = %p\n", chan->dimm);
50 48 }
51 49  
  50 +static void edac_mc_dump_dimm(struct dimm_info *dimm)
  51 +{
  52 + int i;
  53 +
  54 + debugf4("\tdimm = %p\n", dimm);
  55 + debugf4("\tdimm->label = '%s'\n", dimm->label);
  56 + debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
  57 + debugf4("\tdimm location ");
  58 + for (i = 0; i < dimm->mci->n_layers; i++) {
  59 + printk(KERN_CONT "%d", dimm->location[i]);
  60 + if (i < dimm->mci->n_layers - 1)
  61 + printk(KERN_CONT ".");
  62 + }
  63 + printk(KERN_CONT "\n");
  64 + debugf4("\tdimm->grain = %d\n", dimm->grain);
  65 + debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages);
  66 +}
  67 +
52 68 static void edac_mc_dump_csrow(struct csrow_info *csrow)
53 69 {
54 70 debugf4("\tcsrow = %p\n", csrow);
... ... @@ -70,6 +86,8 @@
70 86 debugf4("\tmci->edac_check = %p\n", mci->edac_check);
71 87 debugf3("\tmci->nr_csrows = %d, csrows = %p\n",
72 88 mci->nr_csrows, mci->csrows);
  89 + debugf3("\tmci->nr_dimms = %d, dimms = %p\n",
  90 + mci->tot_dimms, mci->dimms);
73 91 debugf3("\tdev = %p\n", mci->dev);
74 92 debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name);
75 93 debugf3("\tpvt_info = %p\n\n", mci->pvt_info);
76 94  
77 95  
78 96  
79 97  
80 98  
81 99  
82 100  
83 101  
84 102  
... ... @@ -157,46 +175,90 @@
157 175 }
158 176  
159 177 /**
160   - * edac_mc_alloc: Allocate a struct mem_ctl_info structure
161   - * @size_pvt: size of private storage needed
162   - * @nr_csrows: Number of CWROWS needed for this MC
163   - * @nr_chans: Number of channels for the MC
  178 + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
  179 + * @mc_num: Memory controller number
  180 + * @n_layers: Number of MC hierarchy layers
  181 + * layers: Describes each layer as seen by the Memory Controller
  182 + * @size_pvt: size of private storage needed
164 183 *
  184 + *
165 185 * Everything is kmalloc'ed as one big chunk - more efficient.
166 186 * Only can be used if all structures have the same lifetime - otherwise
167 187 * you have to allocate and initialize your own structures.
168 188 *
169 189 * Use edac_mc_free() to free mc structures allocated by this function.
170 190 *
  191 + * NOTE: drivers handle multi-rank memories in different ways: in some
  192 + * drivers, one multi-rank memory stick is mapped as one entry, while, in
  193 + * others, a single multi-rank memory stick would be mapped into several
  194 + * entries. Currently, this function will allocate multiple struct dimm_info
  195 + * on such scenarios, as grouping the multiple ranks require drivers change.
  196 + *
171 197 * Returns:
172 198 * NULL allocation failed
173 199 * struct mem_ctl_info pointer
174 200 */
175   -struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
176   - unsigned nr_chans, int edac_index)
  201 +struct mem_ctl_info *new_edac_mc_alloc(unsigned mc_num,
  202 + unsigned n_layers,
  203 + struct edac_mc_layer *layers,
  204 + unsigned sz_pvt)
177 205 {
178   - void *ptr = NULL;
179 206 struct mem_ctl_info *mci;
180   - struct csrow_info *csi, *csrow;
  207 + struct edac_mc_layer *layer;
  208 + struct csrow_info *csi, *csr;
181 209 struct rank_info *chi, *chp, *chan;
182 210 struct dimm_info *dimm;
183   - void *pvt;
184   - unsigned size;
185   - int row, chn;
186   - int err;
  211 + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
  212 + unsigned pos[EDAC_MAX_LAYERS];
  213 + void *pvt, *ptr = NULL;
  214 + unsigned size, tot_dimms = 1, count = 1;
  215 + unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0;
  216 + int i, j, err, row, chn;
  217 + bool per_rank = false;
187 218  
  219 + BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0);
  220 + /*
  221 + * Calculate the total amount of dimms and csrows/cschannels while
  222 + * in the old API emulation mode
  223 + */
  224 + for (i = 0; i < n_layers; i++) {
  225 + tot_dimms *= layers[i].size;
  226 + if (layers[i].is_virt_csrow)
  227 + tot_csrows *= layers[i].size;
  228 + else
  229 + tot_channels *= layers[i].size;
  230 +
  231 + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
  232 + per_rank = true;
  233 + }
  234 +
188 235 /* Figure out the offsets of the various items from the start of an mc
189 236 * structure. We want the alignment of each item to be at least as
190 237 * stringent as what the compiler would provide if we could simply
191 238 * hardcode everything into a single struct.
192 239 */
193 240 mci = edac_align_ptr(&ptr, sizeof(*mci), 1);
194   - csi = edac_align_ptr(&ptr, sizeof(*csi), nr_csrows);
195   - chi = edac_align_ptr(&ptr, sizeof(*chi), nr_csrows * nr_chans);
196   - dimm = edac_align_ptr(&ptr, sizeof(*dimm), nr_csrows * nr_chans);
  241 + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers);
  242 + csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows);
  243 + chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels);
  244 + dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms);
  245 + for (i = 0; i < n_layers; i++) {
  246 + count *= layers[i].size;
  247 + debugf4("%s: errcount layer %d size %d\n", __func__, i, count);
  248 + ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
  249 + ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count);
  250 + tot_errcount += 2 * count;
  251 + }
  252 +
  253 + debugf4("%s: allocating %d error counters\n", __func__, tot_errcount);
197 254 pvt = edac_align_ptr(&ptr, sz_pvt, 1);
198 255 size = ((unsigned long)pvt) + sz_pvt;
199 256  
  257 + debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n",
  258 + __func__, size,
  259 + tot_dimms,
  260 + per_rank ? "ranks" : "dimms",
  261 + tot_csrows * tot_channels);
200 262 mci = kzalloc(size, GFP_KERNEL);
201 263 if (mci == NULL)
202 264 return NULL;
203 265  
204 266  
205 267  
206 268  
207 269  
208 270  
209 271  
210 272  
211 273  
212 274  
... ... @@ -204,43 +266,88 @@
204 266 /* Adjust pointers so they point within the memory we just allocated
205 267 * rather than an imaginary chunk of memory located at address 0.
206 268 */
  269 + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer));
207 270 csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
208 271 chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
209 272 dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
  273 + for (i = 0; i < n_layers; i++) {
  274 + mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i]));
  275 + mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i]));
  276 + }
210 277 pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
211 278  
212 279 /* setup index and various internal pointers */
213   - mci->mc_idx = edac_index;
  280 + mci->mc_idx = mc_num;
214 281 mci->csrows = csi;
215 282 mci->dimms = dimm;
  283 + mci->tot_dimms = tot_dimms;
216 284 mci->pvt_info = pvt;
217   - mci->nr_csrows = nr_csrows;
  285 + mci->n_layers = n_layers;
  286 + mci->layers = layer;
  287 + memcpy(mci->layers, layers, sizeof(*layer) * n_layers);
  288 + mci->nr_csrows = tot_csrows;
  289 + mci->num_cschannel = tot_channels;
  290 + mci->mem_is_per_rank = per_rank;
218 291  
219 292 /*
220   - * For now, assumes that a per-csrow arrangement for dimms.
221   - * This will be latter changed.
  293 + * Fill the csrow struct
222 294 */
223   - dimm = mci->dimms;
  295 + for (row = 0; row < tot_csrows; row++) {
  296 + csr = &csi[row];
  297 + csr->csrow_idx = row;
  298 + csr->mci = mci;
  299 + csr->nr_channels = tot_channels;
  300 + chp = &chi[row * tot_channels];
  301 + csr->channels = chp;
224 302  
225   - for (row = 0; row < nr_csrows; row++) {
226   - csrow = &csi[row];
227   - csrow->csrow_idx = row;
228   - csrow->mci = mci;
229   - csrow->nr_channels = nr_chans;
230   - chp = &chi[row * nr_chans];
231   - csrow->channels = chp;
232   -
233   - for (chn = 0; chn < nr_chans; chn++) {
  303 + for (chn = 0; chn < tot_channels; chn++) {
234 304 chan = &chp[chn];
235 305 chan->chan_idx = chn;
236   - chan->csrow = csrow;
  306 + chan->csrow = csr;
  307 + }
  308 + }
237 309  
238   - mci->csrows[row].channels[chn].dimm = dimm;
239   - dimm->csrow = row;
240   - dimm->csrow_channel = chn;
241   - dimm++;
242   - mci->nr_dimms++;
  310 + /*
  311 + * Fill the dimm struct
  312 + */
  313 + memset(&pos, 0, sizeof(pos));
  314 + row = 0;
  315 + chn = 0;
  316 + debugf4("%s: initializing %d %s\n", __func__, tot_dimms,
  317 + per_rank ? "ranks" : "dimms");
  318 + for (i = 0; i < tot_dimms; i++) {
  319 + chan = &csi[row].channels[chn];
  320 + dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers,
  321 + pos[0], pos[1], pos[2]);
  322 + dimm->mci = mci;
  323 +
  324 + debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__,
  325 + i, per_rank ? "rank" : "dimm", (dimm - mci->dimms),
  326 + pos[0], pos[1], pos[2], row, chn);
  327 +
  328 + /* Copy DIMM location */
  329 + for (j = 0; j < n_layers; j++)
  330 + dimm->location[j] = pos[j];
  331 +
  332 + /* Link it to the csrows old API data */
  333 + chan->dimm = dimm;
  334 + dimm->csrow = row;
  335 + dimm->cschannel = chn;
  336 +
  337 + /* Increment csrow location */
  338 + row++;
  339 + if (row == tot_csrows) {
  340 + row = 0;
  341 + chn++;
243 342 }
  343 +
  344 + /* Increment dimm location */
  345 + for (j = n_layers - 1; j >= 0; j--) {
  346 + pos[j]++;
  347 + if (pos[j] < layers[j].size)
  348 + break;
  349 + pos[j] = 0;
  350 + }
244 351 }
245 352  
246 353 mci->op_state = OP_ALLOC;
... ... @@ -263,6 +370,46 @@
263 370 */
264 371 return mci;
265 372 }
  373 +EXPORT_SYMBOL_GPL(new_edac_mc_alloc);
  374 +
  375 +/**
  376 + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
  377 + * @mc_num: Memory controller number
  378 + * @n_layers: Number of layers at the MC hierarchy
  379 + * layers: Describes each layer as seen by the Memory Controller
  380 + * @size_pvt: Size of private storage needed
  381 + *
  382 + *
  383 + * FIXME: drivers handle multi-rank memories in different ways: some
  384 + * drivers map multi-ranked DIMMs as one DIMM while others
  385 + * as several DIMMs.
  386 + *
  387 + * Everything is kmalloc'ed as one big chunk - more efficient.
  388 + * It can only be used if all structures have the same lifetime - otherwise
  389 + * you have to allocate and initialize your own structures.
  390 + *
  391 + * Use edac_mc_free() to free mc structures allocated by this function.
  392 + *
  393 + * Returns:
  394 + * On failure: NULL
  395 + * On success: struct mem_ctl_info pointer
  396 + */
  397 +
  398 +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
  399 + unsigned nr_chans, int mc_num)
  400 +{
  401 + unsigned n_layers = 2;
  402 + struct edac_mc_layer layers[n_layers];
  403 +
  404 + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
  405 + layers[0].size = nr_csrows;
  406 + layers[0].is_virt_csrow = true;
  407 + layers[1].type = EDAC_MC_LAYER_CHANNEL;
  408 + layers[1].size = nr_chans;
  409 + layers[1].is_virt_csrow = false;
  410 +
  411 + return new_edac_mc_alloc(mc_num, ARRAY_SIZE(layers), layers, sz_pvt);
  412 +}
266 413 EXPORT_SYMBOL_GPL(edac_mc_alloc);
267 414  
268 415 /**
... ... @@ -528,7 +675,6 @@
528 675 * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and
529 676 * create sysfs entries associated with mci structure
530 677 * @mci: pointer to the mci structure to be added to the list
531   - * @mc_idx: A unique numeric identifier to be assigned to the 'mci' structure.
532 678 *
533 679 * Return:
534 680 * 0 Success
... ... @@ -555,6 +701,8 @@
555 701 edac_mc_dump_channel(&mci->csrows[i].
556 702 channels[j]);
557 703 }
  704 + for (i = 0; i < mci->tot_dimms; i++)
  705 + edac_mc_dump_dimm(&mci->dimms[i]);
558 706 }
559 707 #endif
560 708 mutex_lock(&mem_ctls_mutex);
561 709  
562 710  
563 711  
564 712  
565 713  
566 714  
567 715  
568 716  
569 717  
570 718  
571 719  
572 720  
573 721  
574 722  
575 723  
576 724  
577 725  
578 726  
579 727  
580 728  
581 729  
582 730  
583 731  
584 732  
585 733  
586 734  
587 735  
588 736  
589 737  
590 738  
591 739  
592 740  
593 741  
594 742  
... ... @@ -712,262 +860,308 @@
712 860 }
713 861 EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page);
714 862  
715   -/* FIXME - setable log (warning/emerg) levels */
716   -/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */
717   -void edac_mc_handle_ce(struct mem_ctl_info *mci,
718   - unsigned long page_frame_number,
719   - unsigned long offset_in_page, unsigned long syndrome,
720   - int row, int channel, const char *msg)
  863 +const char *edac_layer_name[] = {
  864 + [EDAC_MC_LAYER_BRANCH] = "branch",
  865 + [EDAC_MC_LAYER_CHANNEL] = "channel",
  866 + [EDAC_MC_LAYER_SLOT] = "slot",
  867 + [EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
  868 +};
  869 +EXPORT_SYMBOL_GPL(edac_layer_name);
  870 +
  871 +static void edac_inc_ce_error(struct mem_ctl_info *mci,
  872 + bool enable_per_layer_report,
  873 + const int pos[EDAC_MAX_LAYERS])
721 874 {
722   - unsigned long remapped_page;
723   - char *label = NULL;
724   - u32 grain;
  875 + int i, index = 0;
725 876  
726   - debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
  877 + mci->ce_count++;
727 878  
728   - /* FIXME - maybe make panic on INTERNAL ERROR an option */
729   - if (row >= mci->nr_csrows || row < 0) {
730   - /* something is wrong */
731   - edac_mc_printk(mci, KERN_ERR,
732   - "INTERNAL ERROR: row out of range "
733   - "(%d >= %d)\n", row, mci->nr_csrows);
734   - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
  879 + if (!enable_per_layer_report) {
  880 + mci->ce_noinfo_count++;
735 881 return;
736 882 }
737 883  
738   - if (channel >= mci->csrows[row].nr_channels || channel < 0) {
739   - /* something is wrong */
740   - edac_mc_printk(mci, KERN_ERR,
741   - "INTERNAL ERROR: channel out of range "
742   - "(%d >= %d)\n", channel,
743   - mci->csrows[row].nr_channels);
744   - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
  884 + for (i = 0; i < mci->n_layers; i++) {
  885 + if (pos[i] < 0)
  886 + break;
  887 + index += pos[i];
  888 + mci->ce_per_layer[i][index]++;
  889 +
  890 + if (i < mci->n_layers - 1)
  891 + index *= mci->layers[i + 1].size;
  892 + }
  893 +}
  894 +
  895 +static void edac_inc_ue_error(struct mem_ctl_info *mci,
  896 + bool enable_per_layer_report,
  897 + const int pos[EDAC_MAX_LAYERS])
  898 +{
  899 + int i, index = 0;
  900 +
  901 + mci->ue_count++;
  902 +
  903 + if (!enable_per_layer_report) {
  904 + mci->ce_noinfo_count++;
745 905 return;
746 906 }
747 907  
748   - label = mci->csrows[row].channels[channel].dimm->label;
749   - grain = mci->csrows[row].channels[channel].dimm->grain;
  908 + for (i = 0; i < mci->n_layers; i++) {
  909 + if (pos[i] < 0)
  910 + break;
  911 + index += pos[i];
  912 + mci->ue_per_layer[i][index]++;
750 913  
751   - if (edac_mc_get_log_ce())
752   - /* FIXME - put in DIMM location */
753   - edac_mc_printk(mci, KERN_WARNING,
754   - "CE page 0x%lx, offset 0x%lx, grain %d, syndrome "
755   - "0x%lx, row %d, channel %d, label \"%s\": %s\n",
756   - page_frame_number, offset_in_page,
757   - grain, syndrome, row, channel,
758   - label, msg);
  914 + if (i < mci->n_layers - 1)
  915 + index *= mci->layers[i + 1].size;
  916 + }
  917 +}
759 918  
760   - mci->ce_count++;
761   - mci->csrows[row].ce_count++;
762   - mci->csrows[row].channels[channel].dimm->ce_count++;
763   - mci->csrows[row].channels[channel].ce_count++;
  919 +static void edac_ce_error(struct mem_ctl_info *mci,
  920 + const int pos[EDAC_MAX_LAYERS],
  921 + const char *msg,
  922 + const char *location,
  923 + const char *label,
  924 + const char *detail,
  925 + const char *other_detail,
  926 + const bool enable_per_layer_report,
  927 + const unsigned long page_frame_number,
  928 + const unsigned long offset_in_page,
  929 + u32 grain)
  930 +{
  931 + unsigned long remapped_page;
764 932  
  933 + if (edac_mc_get_log_ce()) {
  934 + if (other_detail && *other_detail)
  935 + edac_mc_printk(mci, KERN_WARNING,
  936 + "CE %s on %s (%s%s - %s)\n",
  937 + msg, label, location,
  938 + detail, other_detail);
  939 + else
  940 + edac_mc_printk(mci, KERN_WARNING,
  941 + "CE %s on %s (%s%s)\n",
  942 + msg, label, location,
  943 + detail);
  944 + }
  945 + edac_inc_ce_error(mci, enable_per_layer_report, pos);
  946 +
765 947 if (mci->scrub_mode & SCRUB_SW_SRC) {
766 948 /*
767   - * Some MC's can remap memory so that it is still available
768   - * at a different address when PCI devices map into memory.
769   - * MC's that can't do this lose the memory where PCI devices
770   - * are mapped. This mapping is MC dependent and so we call
771   - * back into the MC driver for it to map the MC page to
772   - * a physical (CPU) page which can then be mapped to a virtual
773   - * page - which can then be scrubbed.
774   - */
  949 + * Some memory controllers (called MCs below) can remap
  950 + * memory so that it is still available at a different
  951 + * address when PCI devices map into memory.
  952 + * MC's that can't do this, lose the memory where PCI
  953 + * devices are mapped. This mapping is MC-dependent
  954 + * and so we call back into the MC driver for it to
  955 + * map the MC page to a physical (CPU) page which can
  956 + * then be mapped to a virtual page - which can then
  957 + * be scrubbed.
  958 + */
775 959 remapped_page = mci->ctl_page_to_phys ?
776 960 mci->ctl_page_to_phys(mci, page_frame_number) :
777 961 page_frame_number;
778 962  
779   - edac_mc_scrub_block(remapped_page, offset_in_page, grain);
  963 + edac_mc_scrub_block(remapped_page,
  964 + offset_in_page, grain);
780 965 }
781 966 }
782   -EXPORT_SYMBOL_GPL(edac_mc_handle_ce);
783 967  
784   -void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg)
  968 +static void edac_ue_error(struct mem_ctl_info *mci,
  969 + const int pos[EDAC_MAX_LAYERS],
  970 + const char *msg,
  971 + const char *location,
  972 + const char *label,
  973 + const char *detail,
  974 + const char *other_detail,
  975 + const bool enable_per_layer_report)
785 976 {
786   - if (edac_mc_get_log_ce())
787   - edac_mc_printk(mci, KERN_WARNING,
788   - "CE - no information available: %s\n", msg);
  977 + if (edac_mc_get_log_ue()) {
  978 + if (other_detail && *other_detail)
  979 + edac_mc_printk(mci, KERN_WARNING,
  980 + "UE %s on %s (%s%s - %s)\n",
  981 + msg, label, location, detail,
  982 + other_detail);
  983 + else
  984 + edac_mc_printk(mci, KERN_WARNING,
  985 + "UE %s on %s (%s%s)\n",
  986 + msg, label, location, detail);
  987 + }
789 988  
790   - mci->ce_noinfo_count++;
791   - mci->ce_count++;
  989 + if (edac_mc_get_panic_on_ue()) {
  990 + if (other_detail && *other_detail)
  991 + panic("UE %s on %s (%s%s - %s)\n",
  992 + msg, label, location, detail, other_detail);
  993 + else
  994 + panic("UE %s on %s (%s%s)\n",
  995 + msg, label, location, detail);
  996 + }
  997 +
  998 + edac_inc_ue_error(mci, enable_per_layer_report, pos);
792 999 }
793   -EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info);
794 1000  
795   -void edac_mc_handle_ue(struct mem_ctl_info *mci,
796   - unsigned long page_frame_number,
797   - unsigned long offset_in_page, int row, const char *msg)
  1001 +#define OTHER_LABEL " or "
  1002 +void edac_mc_handle_error(const enum hw_event_mc_err_type type,
  1003 + struct mem_ctl_info *mci,
  1004 + const unsigned long page_frame_number,
  1005 + const unsigned long offset_in_page,
  1006 + const unsigned long syndrome,
  1007 + const int layer0,
  1008 + const int layer1,
  1009 + const int layer2,
  1010 + const char *msg,
  1011 + const char *other_detail,
  1012 + const void *mcelog)
798 1013 {
799   - int len = EDAC_MC_LABEL_LEN * 4;
800   - char labels[len + 1];
801   - char *pos = labels;
802   - int chan;
803   - int chars;
804   - char *label = NULL;
  1014 + /* FIXME: too much for stack: move it to some pre-alocated area */
  1015 + char detail[80], location[80];
  1016 + char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
  1017 + char *p;
  1018 + int row = -1, chan = -1;
  1019 + int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
  1020 + int i;
805 1021 u32 grain;
  1022 + bool enable_per_layer_report = false;
806 1023  
807 1024 debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
808 1025  
809   - /* FIXME - maybe make panic on INTERNAL ERROR an option */
810   - if (row >= mci->nr_csrows || row < 0) {
811   - /* something is wrong */
812   - edac_mc_printk(mci, KERN_ERR,
813   - "INTERNAL ERROR: row out of range "
814   - "(%d >= %d)\n", row, mci->nr_csrows);
815   - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
816   - return;
817   - }
  1026 + /*
  1027 + * Check if the event report is consistent and if the memory
  1028 + * location is known. If it is known, enable_per_layer_report will be
  1029 + * true, the DIMM(s) label info will be filled and the per-layer
  1030 + * error counters will be incremented.
  1031 + */
  1032 + for (i = 0; i < mci->n_layers; i++) {
  1033 + if (pos[i] >= (int)mci->layers[i].size) {
  1034 + if (type == HW_EVENT_ERR_CORRECTED)
  1035 + p = "CE";
  1036 + else
  1037 + p = "UE";
818 1038  
819   - grain = mci->csrows[row].channels[0].dimm->grain;
820   - label = mci->csrows[row].channels[0].dimm->label;
821   - chars = snprintf(pos, len + 1, "%s", label);
822   - len -= chars;
823   - pos += chars;
824   -
825   - for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0);
826   - chan++) {
827   - label = mci->csrows[row].channels[chan].dimm->label;
828   - chars = snprintf(pos, len + 1, ":%s", label);
829   - len -= chars;
830   - pos += chars;
  1039 + edac_mc_printk(mci, KERN_ERR,
  1040 + "INTERNAL ERROR: %s value is out of range (%d >= %d)\n",
  1041 + edac_layer_name[mci->layers[i].type],
  1042 + pos[i], mci->layers[i].size);
  1043 + /*
  1044 + * Instead of just returning it, let's use what's
  1045 + * known about the error. The increment routines and
  1046 + * the DIMM filter logic will do the right thing by
  1047 + * pointing the likely damaged DIMMs.
  1048 + */
  1049 + pos[i] = -1;
  1050 + }
  1051 + if (pos[i] >= 0)
  1052 + enable_per_layer_report = true;
831 1053 }
832 1054  
833   - if (edac_mc_get_log_ue())
834   - edac_mc_printk(mci, KERN_EMERG,
835   - "UE page 0x%lx, offset 0x%lx, grain %d, row %d, "
836   - "labels \"%s\": %s\n", page_frame_number,
837   - offset_in_page, grain, row, labels, msg);
  1055 + /*
  1056 + * Get the dimm label/grain that applies to the match criteria.
  1057 + * As the error algorithm may not be able to point to just one memory
  1058 + * stick, the logic here will get all possible labels that could
  1059 + * pottentially be affected by the error.
  1060 + * On FB-DIMM memory controllers, for uncorrected errors, it is common
  1061 + * to have only the MC channel and the MC dimm (also called "branch")
  1062 + * but the channel is not known, as the memory is arranged in pairs,
  1063 + * where each memory belongs to a separate channel within the same
  1064 + * branch.
  1065 + */
  1066 + grain = 0;
  1067 + p = label;
  1068 + *p = '\0';
  1069 + for (i = 0; i < mci->tot_dimms; i++) {
  1070 + struct dimm_info *dimm = &mci->dimms[i];
838 1071  
839   - if (edac_mc_get_panic_on_ue())
840   - panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, "
841   - "row %d, labels \"%s\": %s\n", mci->mc_idx,
842   - page_frame_number, offset_in_page,
843   - grain, row, labels, msg);
  1072 + if (layer0 >= 0 && layer0 != dimm->location[0])
  1073 + continue;
  1074 + if (layer1 >= 0 && layer1 != dimm->location[1])
  1075 + continue;
  1076 + if (layer2 >= 0 && layer2 != dimm->location[2])
  1077 + continue;
844 1078  
845   - mci->ue_count++;
846   - mci->csrows[row].ue_count++;
847   -}
848   -EXPORT_SYMBOL_GPL(edac_mc_handle_ue);
  1079 + /* get the max grain, over the error match range */
  1080 + if (dimm->grain > grain)
  1081 + grain = dimm->grain;
849 1082  
850   -void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg)
851   -{
852   - if (edac_mc_get_panic_on_ue())
853   - panic("EDAC MC%d: Uncorrected Error", mci->mc_idx);
  1083 + /*
  1084 + * If the error is memory-controller wide, there's no need to
  1085 + * seek for the affected DIMMs because the whole
  1086 + * channel/memory controller/... may be affected.
  1087 + * Also, don't show errors for empty DIMM slots.
  1088 + */
  1089 + if (enable_per_layer_report && dimm->nr_pages) {
  1090 + if (p != label) {
  1091 + strcpy(p, OTHER_LABEL);
  1092 + p += strlen(OTHER_LABEL);
  1093 + }
  1094 + strcpy(p, dimm->label);
  1095 + p += strlen(p);
  1096 + *p = '\0';
854 1097  
855   - if (edac_mc_get_log_ue())
856   - edac_mc_printk(mci, KERN_WARNING,
857   - "UE - no information available: %s\n", msg);
858   - mci->ue_noinfo_count++;
859   - mci->ue_count++;
860   -}
861   -EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info);
  1098 + /*
  1099 + * get csrow/channel of the DIMM, in order to allow
  1100 + * incrementing the compat API counters
  1101 + */
  1102 + debugf4("%s: %s csrows map: (%d,%d)\n",
  1103 + __func__,
  1104 + mci->mem_is_per_rank ? "rank" : "dimm",
  1105 + dimm->csrow, dimm->cschannel);
862 1106  
863   -/*************************************************************
864   - * On Fully Buffered DIMM modules, this help function is
865   - * called to process UE events
866   - */
867   -void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci,
868   - unsigned int csrow,
869   - unsigned int channela,
870   - unsigned int channelb, char *msg)
871   -{
872   - int len = EDAC_MC_LABEL_LEN * 4;
873   - char labels[len + 1];
874   - char *pos = labels;
875   - int chars;
876   - char *label;
  1107 + if (row == -1)
  1108 + row = dimm->csrow;
  1109 + else if (row >= 0 && row != dimm->csrow)
  1110 + row = -2;
877 1111  
878   - if (csrow >= mci->nr_csrows) {
879   - /* something is wrong */
880   - edac_mc_printk(mci, KERN_ERR,
881   - "INTERNAL ERROR: row out of range (%d >= %d)\n",
882   - csrow, mci->nr_csrows);
883   - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
884   - return;
  1112 + if (chan == -1)
  1113 + chan = dimm->cschannel;
  1114 + else if (chan >= 0 && chan != dimm->cschannel)
  1115 + chan = -2;
  1116 + }
885 1117 }
886 1118  
887   - if (channela >= mci->csrows[csrow].nr_channels) {
888   - /* something is wrong */
889   - edac_mc_printk(mci, KERN_ERR,
890   - "INTERNAL ERROR: channel-a out of range "
891   - "(%d >= %d)\n",
892   - channela, mci->csrows[csrow].nr_channels);
893   - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
894   - return;
  1119 + if (!enable_per_layer_report) {
  1120 + strcpy(label, "any memory");
  1121 + } else {
  1122 + debugf4("%s: csrow/channel to increment: (%d,%d)\n",
  1123 + __func__, row, chan);
  1124 + if (p == label)
  1125 + strcpy(label, "unknown memory");
  1126 + if (type == HW_EVENT_ERR_CORRECTED) {
  1127 + if (row >= 0) {
  1128 + mci->csrows[row].ce_count++;
  1129 + if (chan >= 0)
  1130 + mci->csrows[row].channels[chan].ce_count++;
  1131 + }
  1132 + } else
  1133 + if (row >= 0)
  1134 + mci->csrows[row].ue_count++;
895 1135 }
896 1136  
897   - if (channelb >= mci->csrows[csrow].nr_channels) {
898   - /* something is wrong */
899   - edac_mc_printk(mci, KERN_ERR,
900   - "INTERNAL ERROR: channel-b out of range "
901   - "(%d >= %d)\n",
902   - channelb, mci->csrows[csrow].nr_channels);
903   - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR");
904   - return;
905   - }
  1137 + /* Fill the RAM location data */
  1138 + p = location;
  1139 + for (i = 0; i < mci->n_layers; i++) {
  1140 + if (pos[i] < 0)
  1141 + continue;
906 1142  
907   - mci->ue_count++;
908   - mci->csrows[csrow].ue_count++;
909   -
910   - /* Generate the DIMM labels from the specified channels */
911   - label = mci->csrows[csrow].channels[channela].dimm->label;
912   - chars = snprintf(pos, len + 1, "%s", label);
913   - len -= chars;
914   - pos += chars;
915   -
916   - chars = snprintf(pos, len + 1, "-%s",
917   - mci->csrows[csrow].channels[channelb].dimm->label);
918   -
919   - if (edac_mc_get_log_ue())
920   - edac_mc_printk(mci, KERN_EMERG,
921   - "UE row %d, channel-a= %d channel-b= %d "
922   - "labels \"%s\": %s\n", csrow, channela, channelb,
923   - labels, msg);
924   -
925   - if (edac_mc_get_panic_on_ue())
926   - panic("UE row %d, channel-a= %d channel-b= %d "
927   - "labels \"%s\": %s\n", csrow, channela,
928   - channelb, labels, msg);
929   -}
930   -EXPORT_SYMBOL(edac_mc_handle_fbd_ue);
931   -
932   -/*************************************************************
933   - * On Fully Buffered DIMM modules, this help function is
934   - * called to process CE events
935   - */
936   -void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci,
937   - unsigned int csrow, unsigned int channel, char *msg)
938   -{
939   - char *label = NULL;
940   -
941   - /* Ensure boundary values */
942   - if (csrow >= mci->nr_csrows) {
943   - /* something is wrong */
944   - edac_mc_printk(mci, KERN_ERR,
945   - "INTERNAL ERROR: row out of range (%d >= %d)\n",
946   - csrow, mci->nr_csrows);
947   - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
948   - return;
  1143 + p += sprintf(p, "%s:%d ",
  1144 + edac_layer_name[mci->layers[i].type],
  1145 + pos[i]);
949 1146 }
950   - if (channel >= mci->csrows[csrow].nr_channels) {
951   - /* something is wrong */
952   - edac_mc_printk(mci, KERN_ERR,
953   - "INTERNAL ERROR: channel out of range (%d >= %d)\n",
954   - channel, mci->csrows[csrow].nr_channels);
955   - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR");
956   - return;
957   - }
958 1147  
959   - label = mci->csrows[csrow].channels[channel].dimm->label;
  1148 + /* Memory type dependent details about the error */
  1149 + if (type == HW_EVENT_ERR_CORRECTED) {
  1150 + snprintf(detail, sizeof(detail),
  1151 + "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
  1152 + page_frame_number, offset_in_page,
  1153 + grain, syndrome);
  1154 + edac_ce_error(mci, pos, msg, location, label, detail,
  1155 + other_detail, enable_per_layer_report,
  1156 + page_frame_number, offset_in_page, grain);
  1157 + } else {
  1158 + snprintf(detail, sizeof(detail),
  1159 + "page:0x%lx offset:0x%lx grain:%d",
  1160 + page_frame_number, offset_in_page, grain);
960 1161  
961   - if (edac_mc_get_log_ce())
962   - /* FIXME - put in DIMM location */
963   - edac_mc_printk(mci, KERN_WARNING,
964   - "CE row %d, channel %d, label \"%s\": %s\n",
965   - csrow, channel, label, msg);
966   -
967   - mci->ce_count++;
968   - mci->csrows[csrow].ce_count++;
969   - mci->csrows[csrow].channels[channel].dimm->ce_count++;
970   - mci->csrows[csrow].channels[channel].ce_count++;
  1162 + edac_ue_error(mci, pos, msg, location, label, detail,
  1163 + other_detail, enable_per_layer_report);
  1164 + }
971 1165 }
972   -EXPORT_SYMBOL(edac_mc_handle_fbd_ce);
  1166 +EXPORT_SYMBOL_GPL(edac_mc_handle_error);
include/linux/edac.h
... ... @@ -416,18 +416,20 @@
416 416 /* FIXME: add the proper per-location error counts */
417 417 struct dimm_info {
418 418 char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */
419   - unsigned memory_controller;
420   - unsigned csrow;
421   - unsigned csrow_channel;
422 419  
  420 + /* Memory location data */
  421 + unsigned location[EDAC_MAX_LAYERS];
  422 +
  423 + struct mem_ctl_info *mci; /* the parent */
  424 +
423 425 u32 grain; /* granularity of reported error in bytes */
424 426 enum dev_type dtype; /* memory device type */
425 427 enum mem_type mtype; /* memory dimm type */
426 428 enum edac_type edac_mode; /* EDAC mode for this dimm */
427 429  
428   - u32 nr_pages; /* number of pages in csrow */
  430 + u32 nr_pages; /* number of pages on this dimm */
429 431  
430   - u32 ce_count; /* Correctable Errors for this dimm */
  432 + unsigned csrow, cschannel; /* Points to the old API data */
431 433 };
432 434  
433 435 /**
434 436  
... ... @@ -447,9 +449,10 @@
447 449 */
448 450 struct rank_info {
449 451 int chan_idx;
450   - u32 ce_count;
451 452 struct csrow_info *csrow;
452 453 struct dimm_info *dimm;
  454 +
  455 + u32 ce_count; /* Correctable Errors for this csrow */
453 456 };
454 457  
455 458 struct csrow_info {
456 459  
457 460  
458 461  
... ... @@ -545,13 +548,18 @@
545 548 unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci,
546 549 unsigned long page);
547 550 int mc_idx;
548   - int nr_csrows;
549 551 struct csrow_info *csrows;
  552 + unsigned nr_csrows, num_cschannel;
550 553  
  554 + /* Memory Controller hierarchy */
  555 + unsigned n_layers;
  556 + struct edac_mc_layer *layers;
  557 + bool mem_is_per_rank;
  558 +
551 559 /*
552 560 * DIMM info. Will eventually remove the entire csrows_info some day
553 561 */
554   - unsigned nr_dimms;
  562 + unsigned tot_dimms;
555 563 struct dimm_info *dimms;
556 564  
557 565 /*
558 566  
... ... @@ -566,12 +574,16 @@
566 574 const char *dev_name;
567 575 char proc_name[MC_PROC_NAME_MAX_LEN + 1];
568 576 void *pvt_info;
569   - u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */
570   - u32 ce_noinfo_count; /* Correctable Errors w/o info */
571   - u32 ue_count; /* Total Uncorrectable Errors for this MC */
572   - u32 ce_count; /* Total Correctable Errors for this MC */
573 577 unsigned long start_time; /* mci load start time (in jiffies) */
574 578  
  579 + /*
  580 + * drivers shouldn't access those fields directly, as the core
  581 + * already handles that.
  582 + */
  583 + u32 ce_noinfo_count, ue_noinfo_count;
  584 + u32 ue_count, ce_count;
  585 + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
  586 +
575 587 struct completion complete;
576 588  
577 589 /* edac sysfs device control */
... ... @@ -584,7 +596,7 @@
584 596 * by the low level driver.
585 597 *
586 598 * Set by the low level driver to provide attributes at the
587   - * controller level, same level as 'ue_count' and 'ce_count' above.
  599 + * controller level.
588 600 * An array of structures, NULL terminated
589 601 *
590 602 * If attributes are desired, then set to array of attributes