Commit 982216a4290543fe73ae4f0a156f3d7906bd9b73

Authored by Mauro Carvalho Chehab
1 parent 93e4fe64ec

edac.h: Add generic layers for describing a memory location

The EDAC core was written with the idea that memory controllers
are able to directly access csrows, and that the channels are
used inside a csrow select.

This is not true for FB-DIMM and RAMBUS memory controllers.

Also, some recent advanced memory controllers don't present a per-csrow
view. They view memories as DIMMs, rather than as ranks accessed
via csrow/channel.

So, changes are needed in order to allow the EDAC core to
work with all types of architectures.

In preparation for handling non-csrows based memory controllers,
add some memory structs and a macro:

enum hw_event_mc_err_type: describes the type of error
			   (corrected, uncorrected, fatal)

To be used by the new edac_mc_handle_error function;

enum edac_mc_layer_type: describes the type of a given memory
architecture layer (branch, channel, slot, csrow).

struct edac_mc_layer: describes the properties of a memory
		      layer (type, size, and whether the layer
		      will be used on a virtual csrow).

EDAC_DIMM_PTR() - as the number of layers can vary from 1 to 3,
this macro converts from an address with up to 3 layers into
a linear address.

Reviewed-by: Borislav Petkov <bp@amd64.org>
Cc: Doug Thompson <norsk5@yahoo.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>

Showing 1 changed file with 102 additions and 1 deletions Side-by-side Diff

include/linux/edac.h
... ... @@ -71,6 +71,25 @@
71 71 #define DEV_FLAG_X64 BIT(DEV_X64)
72 72  
73 73 /**
  74 + * enum hw_event_mc_err_type - type of the detected error
  75 + *
  76 + * @HW_EVENT_ERR_CORRECTED: Corrected Error - Indicates that an ECC
  77 + * corrected error was detected
  78 + * @HW_EVENT_ERR_UNCORRECTED: Uncorrected Error - Indicates an error that
  79 + * can't be corrected by ECC, but that is not
  80 + * fatal (maybe it is on an unused memory area,
  81 + * or the memory controller could recover from
  82 + * it, for example, by re-trying the operation).
  83 + * @HW_EVENT_ERR_FATAL: Fatal Error - Uncorrected error that could not
  84 + * be recovered from.
  85 + */
  86 +enum hw_event_mc_err_type {
  87 + HW_EVENT_ERR_CORRECTED,
  88 + HW_EVENT_ERR_UNCORRECTED,
  89 + HW_EVENT_ERR_FATAL,
  90 +};
  91 +
  92 +/**
74 93 * enum mem_type - memory types. For a more detailed reference, please see
75 94 * http://en.wikipedia.org/wiki/DRAM
76 95 *
... ... @@ -312,7 +331,89 @@
312 331 * PS - I enjoyed writing all that about as much as you enjoyed reading it.
313 332 */
314 333  
315   -/* FIXME: add a per-dimm ce error count */
  334 +/**
  335 + * enum edac_mc_layer_type - memory controller hierarchy layer
  336 + *
  337 + * @EDAC_MC_LAYER_BRANCH: memory layer is named "branch"
  338 + * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
  339 + * @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
  340 + * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
  341 + *
  342 + * This enum is used by the drivers to tell edac_mc_sysfs what name should
  343 + * be used when describing a memory stick location.
  344 + */
  345 +enum edac_mc_layer_type {
  346 + EDAC_MC_LAYER_BRANCH,
  347 + EDAC_MC_LAYER_CHANNEL,
  348 + EDAC_MC_LAYER_SLOT,
  349 + EDAC_MC_LAYER_CHIP_SELECT,
  350 +};
  351 +
  352 +/**
  353 + * struct edac_mc_layer - describes the memory controller hierarchy
  354 + * @type: layer type (one of enum edac_mc_layer_type)
  355 + * @size: number of components per layer. For example,
  356 + * if the channel layer has two channels, size = 2
  357 + * @is_virt_csrow: This layer is part of the "csrow" when old API
  358 + * compatibility mode is enabled. Otherwise, it is
  359 + * a channel
  360 + */
  361 +struct edac_mc_layer {
  362 + enum edac_mc_layer_type type;
  363 + unsigned size;
  364 + bool is_virt_csrow;
  365 +};
  366 +
  367 +/*
  368 + * Maximum number of layers used by the memory controller to uniquely
  369 + * identify a single memory stick.
  370 + * NOTE: Changing this constant requires not only changing the constant
  371 + * below, but also changing the existing code in the core, as there is
  372 + * some code there that is optimized for 3 layers.
  373 + */
  374 +#define EDAC_MAX_LAYERS 3
  375 +
  376 +/**
  377 + * EDAC_DIMM_PTR - Macro responsible for finding a pointer inside a pointer
  378 + * array for the element given by the [layer0,layer1,layer2] position
  379 + *
  380 + * @layers: a struct edac_mc_layer array, describing how many elements
  381 + * were allocated for each layer
  382 + * @var: name of the var where we want to get the pointer
  383 + * (like mci->dimms); it is expanded without extra parentheses
  384 + * @nlayers: Number of layers in the @layers array
  385 + * @layer0: layer0 position
  386 + * @layer1: layer1 position. Unused if nlayers < 2
  387 + * @layer2: layer2 position. Unused if nlayers < 3
  388 + *
  389 + * For 1 layer, this macro returns &var[layer0]
  390 + * For 2 layers, this macro is similar to allocating a bi-dimensional array
  391 + * and returning "&var[layer0][layer1]"
  392 + * For 3 layers, this macro is similar to allocating a tri-dimensional array
  393 + * and returning "&var[layer0][layer1][layer2]"
  394 + *
  395 + * A loop could be used here to make it more generic, but, as we only have
  396 + * 3 layers, this is a little faster.
  397 + * By design, nlayers can never be 0 or more than 3. If that ever happens,
  398 + * a NULL is returned, causing an OOPS during the memory allocation routine,
  399 + * which would tell the developer that something is being done wrong.
  400 + */
  401 +#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \
  402 + typeof(var) __p; \
  403 + if ((nlayers) == 1) \
  404 + __p = &var[layer0]; \
  405 + else if ((nlayers) == 2) \
  406 + __p = &var[(layer1) + ((layers[1]).size * (layer0))]; \
  407 + else if ((nlayers) == 3) \
  408 + __p = &var[(layer2) + ((layers[2]).size * ((layer1) + \
  409 + ((layers[1]).size * (layer0))))]; \
  410 + else \
  411 + __p = NULL; \
  412 + __p; \
  413 +})
  414 +
  415 +
  416 +/* FIXME: add the proper per-location error counts */
316 417 struct dimm_info {
317 418 char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */
318 419 unsigned memory_controller;