13 Jun, 2018

4 commits

  • The vmalloc() function has no 2-factor argument form, so multiplication
    factors need to be wrapped in array_size(). This patch replaces cases of:

    vmalloc(a * b)

    with:

    vmalloc(array_size(a, b))

    as well as handling cases of:

    vmalloc(a * b * c)

    with:

    vmalloc(array3_size(a, b, c))

    This does, however, attempt to ignore constant size factors like:

    vmalloc(4 * 1024)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.
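
    A minimal before/after sketch of why the conversion matters (variable
    names illustrative):

    /* before: if count * sizeof(*entries) overflows, vmalloc() returns a
     * smaller buffer than intended and later writes run past its end */
    entries = vmalloc(count * sizeof(*entries));

    /* after: array_size() saturates to SIZE_MAX on overflow, so the
     * allocation fails cleanly instead */
    entries = vmalloc(array_size(count, sizeof(*entries)));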

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    vmalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    vmalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    vmalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    vmalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    vmalloc(
    - sizeof(TYPE) * (COUNT_ID)
    + array_size(COUNT_ID, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT_ID
    + array_size(COUNT_ID, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * (COUNT_CONST)
    + array_size(COUNT_CONST, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT_CONST
    + array_size(COUNT_CONST, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT_ID)
    + array_size(COUNT_ID, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT_ID
    + array_size(COUNT_ID, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT_CONST)
    + array_size(COUNT_CONST, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT_CONST
    + array_size(COUNT_CONST, sizeof(THING))
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    vmalloc(
    - SIZE * COUNT
    + array_size(COUNT, SIZE)
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    vmalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    vmalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    vmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    vmalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    vmalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    vmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    vmalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    vmalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    vmalloc(C1 * C2 * C3, ...)
    |
    vmalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2-factor products when they're not all constants.
    @@
    expression E1, E2;
    constant C1, C2;
    @@

    (
    vmalloc(C1 * C2, ...)
    |
    vmalloc(
    - E1 * E2
    + array_size(E1, E2)
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     
  • The kvzalloc() function has a 2-factor argument form, kvcalloc(). This
    patch replaces cases of:

    kvzalloc(a * b, gfp)

    with:

    kvcalloc(a, b, gfp)

    as well as handling cases of:

    kvzalloc(a * b * c, gfp)

    with:

    kvzalloc(array3_size(a, b, c), gfp)

    as it's slightly less ugly than:

    kvcalloc(array_size(a, b), c, gfp)

    This does, however, attempt to ignore constant size factors like:

    kvzalloc(4 * 1024, gfp)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.
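
    A minimal before/after sketch of the two conversions (variable names
    illustrative):

    /* 2-factor product: move to the checked kvcalloc() form */
    buf = kvzalloc(n * sizeof(*buf), GFP_KERNEL);
    buf = kvcalloc(n, sizeof(*buf), GFP_KERNEL);

    /* 3-factor product: keep kvzalloc() but compute the size safely */
    grid = kvzalloc(rows * cols * sizeof(*grid), GFP_KERNEL);
    grid = kvzalloc(array3_size(rows, cols, sizeof(*grid)), GFP_KERNEL);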

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kvzalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kvzalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kvzalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kvzalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kvzalloc
    + kvcalloc
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kvzalloc
    + kvcalloc
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kvzalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kvzalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kvzalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kvzalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kvzalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kvzalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kvzalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kvzalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kvzalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kvzalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kvzalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kvzalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kvzalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kvzalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kvzalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kvzalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kvzalloc(C1 * C2 * C3, ...)
    |
    kvzalloc(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kvzalloc(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kvzalloc(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kvzalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2-factor products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kvzalloc(sizeof(THING) * C2, ...)
    |
    kvzalloc(sizeof(TYPE) * C2, ...)
    |
    kvzalloc(C1 * C2 * C3, ...)
    |
    kvzalloc(C1 * C2, ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kvzalloc
    + kvcalloc
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     
  • The kzalloc() function has a 2-factor argument form, kcalloc(). This
    patch replaces cases of:

    kzalloc(a * b, gfp)

    with:

    kcalloc(a, b, gfp)

    as well as handling cases of:

    kzalloc(a * b * c, gfp)

    with:

    kzalloc(array3_size(a, b, c), gfp)

    as it's slightly less ugly than:

    kcalloc(array_size(a, b), c, gfp)

    This does, however, attempt to ignore constant size factors like:

    kzalloc(4 * 1024, gfp)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.
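
    For reference, a sketch of what the size helpers do (simplified from
    the idea behind include/linux/overflow.h; treat as an approximation):

    static inline size_t array_size(size_t a, size_t b)
    {
            size_t bytes;

            /* on multiplication overflow, saturate to SIZE_MAX so the
             * subsequent allocation fails instead of being undersized */
            if (check_mul_overflow(a, b, &bytes))
                    return SIZE_MAX;
            return bytes;
    }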

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kzalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kzalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kzalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kzalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kzalloc
    + kcalloc
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kzalloc
    + kcalloc
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kzalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kzalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kzalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kzalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kzalloc(C1 * C2 * C3, ...)
    |
    kzalloc(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2-factor products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kzalloc(sizeof(THING) * C2, ...)
    |
    kzalloc(sizeof(TYPE) * C2, ...)
    |
    kzalloc(C1 * C2 * C3, ...)
    |
    kzalloc(C1 * C2, ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kzalloc
    + kcalloc
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     
  • The kmalloc() function has a 2-factor argument form, kmalloc_array(). This
    patch replaces cases of:

    kmalloc(a * b, gfp)

    with:

    kmalloc_array(a, b, gfp)

    as well as handling cases of:

    kmalloc(a * b * c, gfp)

    with:

    kmalloc(array3_size(a, b, c), gfp)

    as it's slightly less ugly than:

    kmalloc_array(array_size(a, b), c, gfp)

    This does, however, attempt to ignore constant size factors like:

    kmalloc(4 * 1024, gfp)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.

    The tools/ directory was manually excluded, since it has its own
    implementation of kmalloc().
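
    A minimal before/after sketch for the common 2-factor case (variable
    names illustrative); unlike the open-coded multiplication,
    kmalloc_array() returns NULL when the product would overflow:

    ptr = kmalloc(count * sizeof(*ptr), GFP_KERNEL);
    ptr = kmalloc_array(count, sizeof(*ptr), GFP_KERNEL);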

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kmalloc(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kmalloc(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kmalloc(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kmalloc(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kmalloc
    + kmalloc_array
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kmalloc(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kmalloc(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kmalloc(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kmalloc(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kmalloc(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kmalloc(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2-factor products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kmalloc(sizeof(THING) * C2, ...)
    |
    kmalloc(sizeof(TYPE) * C2, ...)
    |
    kmalloc(C1 * C2 * C3, ...)
    |
    kmalloc(C1 * C2, ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kmalloc
    + kmalloc_array
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

09 Jun, 2018

2 commits

  • Pull libnvdimm updates from Dan Williams:
    "This adds a user for the new 'bytes-remaining' updates to
    memcpy_mcsafe() that you already received through Ingo via the
    x86-dax-for-linus pull.

    Not included here, but still targeting this cycle, is support for
    handling memory media errors (poison) consumed via userspace dax
    mappings.

    Summary:

    - DAX broke a fundamental assumption of truncate of file mapped
    pages. The truncate path assumed that it is safe to disconnect a
    pinned page from a file and let the filesystem reclaim the physical
    block. With DAX the page is equivalent to the filesystem block.
    Introduce dax_layout_busy_page() to enable filesystems to wait for
    pinned DAX pages to be released. Without this wait a filesystem
    could allocate blocks under active device-DMA to a new file.

    - DAX arranges for the block layer to be bypassed and uses
    dax_direct_access() + copy_to_iter() to satisfy read(2) calls.
    However, the memcpy_mcsafe() facility is available through the pmem
    block driver. In order to safely handle media errors, via the DAX
    block-layer bypass, introduce copy_to_iter_mcsafe().

    - Fix cache management policy relative to the ACPI NFIT Platform
    Capabilities Structure to properly elide cache flushes when they
    are not necessary. The table indicates whether CPU caches are
    power-fail protected. Clarify that a deep flush is always performed
    on REQ_{FUA,PREFLUSH} requests"

    * tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (21 commits)
    dax: Use dax_write_cache* helpers
    libnvdimm, pmem: Do not flush power-fail protected CPU caches
    libnvdimm, pmem: Unconditionally deep flush on *sync
    libnvdimm, pmem: Complete REQ_FLUSH => REQ_PREFLUSH
    acpi, nfit: Remove ecc_unit_size
    dax: dax_insert_mapping_entry always succeeds
    libnvdimm, e820: Register all pmem resources
    libnvdimm: Debug probe times
    linvdimm, pmem: Preserve read-only setting for pmem devices
    x86, nfit_test: Add unit test for memcpy_mcsafe()
    pmem: Switch to copy_to_iter_mcsafe()
    dax: Report bytes remaining in dax_iomap_actor()
    dax: Introduce a ->copy_to_iter dax operation
    uio, lib: Fix CONFIG_ARCH_HAS_UACCESS_MCSAFE compilation
    xfs, dax: introduce xfs_break_dax_layouts()
    xfs: prepare xfs_break_layouts() for another layout type
    xfs: prepare xfs_break_layouts() to be called with XFS_MMAPLOCK_EXCL
    mm, fs, dax: handle layout changes to pinned dax mappings
    mm: fix __gup_device_huge vs unmap
    mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS
    ...

    Linus Torvalds
     
  • Dan Williams
     

08 Jun, 2018

34 commits

  • Merge updates from Andrew Morton:

    - a few misc things

    - ocfs2 updates

    - v9fs updates

    - MM

    - procfs updates

    - lib/ updates

    - autofs updates

    * emailed patches from Andrew Morton: (118 commits)
    autofs: small cleanup in autofs_getpath()
    autofs: clean up includes
    autofs: comment on selinux changes needed for module autoload
    autofs: update MAINTAINERS entry for autofs
    autofs: use autofs instead of autofs4 in documentation
    autofs: rename autofs documentation files
    autofs: create autofs Kconfig and Makefile
    autofs: delete fs/autofs4 source files
    autofs: update fs/autofs4/Makefile
    autofs: update fs/autofs4/Kconfig
    autofs: copy autofs4 to autofs
    autofs4: use autofs instead of autofs4 everywhere
    autofs4: merge auto_fs.h and auto_fs4.h
    fs/binfmt_misc.c: do not allow offset overflow
    checkpatch: improve patch recognition
    lib/ucs2_string.c: add MODULE_LICENSE()
    lib/mpi: headers cleanup
    lib/percpu_ida.c: use _irqsave() instead of local_irq_save() + spin_lock
    lib/idr.c: remove simple_ida_lock
    lib/bitmap.c: micro-optimization for __bitmap_complement()
    ...

    Linus Torvalds
     
  • kvmalloc warned about an incompatible gfp_mask to catch abusers (mostly
    GFP_NOFS) with the intention that this would motivate authors of the
    code to fix those. Linus argues that this just motivates people to do
    even more hacks like

    if (gfp == GFP_KERNEL)
            kvmalloc
    else
            kmalloc

    I haven't seen this happening much (Linus pointed to bucket_lock, which
    special-cases an atomic allocation, but my git foo hasn't found much
    more) but it is true that we can grow those in the future. Therefore
    Linus suggested to simply not fall back to vmalloc for incompatible gfp
    flags and rather stick with the kmalloc path.
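
    A minimal sketch of the resulting policy (simplified; not the exact
    kernel hunk):

    /* in kvmalloc_node(): gfp flags that are not GFP_KERNEL-compatible
     * never fall back to vmalloc -- only the kmalloc path is attempted */
    if ((flags & GFP_KERNEL) != GFP_KERNEL)
            return kmalloc_node(size, flags, node);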

    Link: http://lkml.kernel.org/r/20180601115329.27807-1-mhocko@kernel.org
    Signed-off-by: Michal Hocko
    Suggested-by: Linus Torvalds
    Cc: Tom Herbert
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Michal Hocko
     
  • shmem/tmpfs uses pseudo vma to allocate page with correct NUMA policy.

    The pseudo vma doesn't have vm_page_prot set. We are going to encode
    the encryption KeyID in vm_page_prot. Having garbage there causes
    problems.

    Zero out all unused fields in the pseudo vma.
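
    A minimal sketch of the fix (illustrative, not the exact hunk): zero
    the on-stack pseudo vma before filling in the fields that are used.

    struct vm_area_struct pvma;

    memset(&pvma, 0, sizeof(pvma)); /* no stack garbage in vm_page_prot */
    /* ... then set only the fields the NUMA policy code consumes ... */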

    Link: http://lkml.kernel.org/r/20180531135602.20321-1-kirill.shutemov@linux.intel.com
    Signed-off-by: Kirill A. Shutemov
    Reviewed-by: Andrew Morton
    Cc: Hugh Dickins
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Kirill A. Shutemov
     
  • In __alloc_pages_slowpath() we reset zonelist and preferred_zoneref for
    allocations that can ignore memory policies. The zonelist is obtained
    from current CPU's node. This is a problem for __GFP_THISNODE
    allocations that want to allocate on a different node, e.g. because the
    allocating thread has been migrated to a different CPU.

    This has been observed to break SLAB in our 4.4-based kernel, because
    there it relies on __GFP_THISNODE working as intended. If a slab page
    is put on the wrong node's list, then further list manipulations may corrupt
    the list because page_to_nid() is used to determine which node's
    list_lock should be locked and thus we may take a wrong lock and race.

    Current SLAB implementation seems to be immune by luck thanks to commit
    511e3a058812 ("mm/slab: make cache_grow() handle the page allocated on
    arbitrary node") but there may be others assuming that __GFP_THISNODE
    works as promised.

    We can fix it by simply removing the zonelist reset completely. There
    is actually no reason to reset it, because memory policies and cpusets
    don't affect the zonelist choice in the first place. This was different
    when commit 183f6371aac2 ("mm: ignore mempolicies when using
    ALLOC_NO_WATERMARK") introduced the code, as mempolicies provided their
    own restricted zonelists.

    We might consider this for 4.17 although I don't know if there's
    anything currently broken.

    SLAB is currently not affected, but in kernels older than 4.7 that don't
    yet have 511e3a058812 ("mm/slab: make cache_grow() handle the page
    allocated on arbitrary node") it is. That's at least 4.4 LTS. Older
    ones I'll have to check.

    So stable backports should be more important, but will have to be
    reviewed carefully, as the code went through many changes. BTW I think
    that also the ac->preferred_zoneref reset is currently useless if we
    don't also reset ac->nodemask from a mempolicy to NULL first (which we
    probably should for the OOM victims etc?), but I would leave that for a
    separate patch.

    Link: http://lkml.kernel.org/r/20180525130853.13915-1-vbabka@suse.cz
    Signed-off-by: Vlastimil Babka
    Fixes: 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK")
    Acked-by: Mel Gorman
    Cc: Michal Hocko
    Cc: David Rientjes
    Cc: Joonsoo Kim
    Cc: Vlastimil Babka
    Cc:
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Vlastimil Babka
     
  • If a process monitored with userfaultfd changes its memory mappings
    or fork()s at the same time as the uffd monitor fills the process
    memory with UFFDIO_COPY, the actual creation of page table entries
    and copying of the data in mcopy_atomic may happen either before or
    after the memory mapping modifications, and there is no way for the
    uffd monitor to maintain a consistent view of the process memory
    layout.

    For instance, let's consider fork() running in parallel with
    userfaultfd_copy():

    process                          | uffd monitor
    ---------------------------------+------------------------------
    fork()                           | userfaultfd_copy()
    ...                              | ...
    dup_mmap()                       | down_read(mmap_sem)
    down_write(mmap_sem)             | /* create PTEs, copy data */
        dup_uffd()                   | up_read(mmap_sem)
        copy_page_range()            |
        up_write(mmap_sem)           |
    dup_uffd_complete()              |
        /* notify monitor */         |

    If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will
    be present by the time copy_page_range() is called and they will appear
    in the child's memory mappings. However, if the fork() is the first to
    take the mmap_sem, the new pages won't be mapped in the child's address
    space.

    If the pages are not present and the child tries to access them, the
    monitor will get a page fault notification and everything is fine.
    However, if the pages *are present*, the child can access them without
    uffd noticing. And if we copy them into the child it'll see the wrong
    data. Since we are talking about background copy, we'd need to decide
    whether the pages should be copied or not regardless of #PF
    notifications.

    Since the userfaultfd monitor has no way to determine what the order
    was, let's disallow userfaultfd_copy in parallel with the
    non-cooperative events. In such a case we return -EAGAIN and the uffd
    monitor can understand that userfaultfd_copy() clashed with a
    non-cooperative event and take an appropriate action.
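
    A minimal monitor-side sketch of the resulting contract (illustrative;
    the uffdio_copy setup is elided):

    /* UFFDIO_COPY now fails with -EAGAIN when it races with a
     * non-cooperative event (fork, mremap, munmap) */
    if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1 &&
        uffdio_copy.copy == -EAGAIN) {
            /* re-read the target's memory layout from the pending
             * event queue, then retry the copy */
    }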

    Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com
    Signed-off-by: Mike Rapoport
    Acked-by: Pavel Emelyanov
    Cc: Andrea Arcangeli
    Cc: Mike Kravetz
    Cc: Andrei Vagin
    Signed-off-by: Andrew Morton

    Signed-off-by: Linus Torvalds

    Mike Rapoport
     
  • Currently an attempt to set swap.max into a value lower than the actual
    swap usage fails, which causes configuration problems as there's no way
    of lowering the configuration below the current usage short of turning
    off swap entirely. This makes swap.max difficult to use and allows
    delegatees to lock the delegator out of reducing swap allocation.

    This patch updates swap_max_write() so that the limit can be lowered
    below the current usage. It doesn't implement active reclaiming of swap
    entries for the following reasons.

    * mem_cgroup_swap_full() already tells the swap machinery to
    aggressively reclaim swap entries if the usage is above 50% of
    limit, so simply lowering the limit automatically triggers gradual
    reclaim.

    * Forcing back swapped out pages is likely to heavily impact the
    workload and mess up the working set. Given that swap usually is a
    lot less valuable and less scarce, letting the existing usage
    dissipate over time through the above gradual reclaim and as they're
    faulted back in is likely the better behavior.

    Link: http://lkml.kernel.org/r/20180523185041.GR1718769@devbig577.frc2.facebook.com
    Signed-off-by: Tejun Heo
    Acked-by: Roman Gushchin
    Acked-by: Rik van Riel
    Acked-by: Johannes Weiner
    Cc: Michal Hocko
    Cc: Shaohua Li
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Tejun Heo
     
  • Use new return type vm_fault_t for fault handler. For now, this is just
    documenting that the function returns a VM_FAULT value rather than an
    errno. Once all instances are converted, vm_fault_t will become a
    distinct type.

    See commit 1c8f422059ae ("mm: change return type to vm_fault_t")

    vmf_error() is the newly introduced inline function in 4.17-rc6.
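
    A minimal sketch of the shape of such a conversion (illustrative; the
    fault-handling body is elided):

    static vm_fault_t shmem_fault(struct vm_fault *vmf)
    {
            int err = 0;

            /* ... fault handling that previously did "return err;" ... */

            if (err)
                    return vmf_error(err); /* map errno to a VM_FAULT code */
            return VM_FAULT_NOPAGE;
    }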

    Link: http://lkml.kernel.org/r/20180521202410.GA17912@jordon-HP-15-Notebook-PC
    Signed-off-by: Souptick Joarder
    Reviewed-by: Matthew Wilcox
    Cc: Hugh Dickins
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Souptick Joarder
     
  • Christoph doubts anyone was using the 'reserved' file in sysfs, so remove
    it.

    Link: http://lkml.kernel.org/r/20180518194519.3820-17-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: "Kirill A . Shutemov"
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Vlastimil Babka
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • The reserved field was only used for embedding an rcu_head in the data
    structure. With the previous commit, we no longer need it. That lets us
    remove the 'reserved' argument to a lot of functions.

    Link: http://lkml.kernel.org/r/20180518194519.3820-16-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: "Kirill A . Shutemov"
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Vlastimil Babka
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • rcu_head may now grow larger than list_head without affecting slab or
    slub.

    Link: http://lkml.kernel.org/r/20180518194519.3820-15-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Christoph Lameter
    Acked-by: Vlastimil Babka
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: "Kirill A . Shutemov"
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • This gives us five words of space in a single union in struct page. The
    compound_mapcount moves position (from offset 24 to offset 20) on 64-bit
    systems, but that does not seem likely to cause any trouble.

    Link: http://lkml.kernel.org/r/20180518194519.3820-11-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Vlastimil Babka
    Acked-by: Kirill A. Shutemov
    Cc: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • Since the LRU is two words, this does not affect the double-word alignment
    of SLUB's freelist.

    Link: http://lkml.kernel.org/r/20180518194519.3820-10-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Vlastimil Babka
    Acked-by: Kirill A. Shutemov
    Cc: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • Now that we can represent the location of 'deferred_list' in C instead of
    comments, make use of that ability.

    Link: http://lkml.kernel.org/r/20180518194519.3820-9-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Vlastimil Babka
    Acked-by: Kirill A. Shutemov
    Cc: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • By moving page->private to the fourth word of struct page, we can put the
    SLUB counters in the same word as SLAB's s_mem and still do the
    cmpxchg_double trick. Now the SLUB counters no longer overlap with the
    mapcount or refcount so we can drop the call to page_mapcount_reset() and
    simplify set_page_slub_counters() to a single line.

    Link: http://lkml.kernel.org/r/20180518194519.3820-6-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Vlastimil Babka
    Acked-by: Kirill A. Shutemov
    Cc: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • This will allow us to store slub's counters in the same bits as slab's
    s_mem. slub now needs to set page->mapping to NULL as it frees the page,
    just like slab does.

    Link: http://lkml.kernel.org/r/20180518194519.3820-5-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Christoph Lameter
    Acked-by: Vlastimil Babka
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: "Kirill A . Shutemov"
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • We're already using a union of many fields here, so stop abusing the
    _mapcount and make page_type its own field. That implies renaming some of
    the machinery that creates PageBuddy, PageBalloon and PageKmemcg; bring
    back the PG_buddy, PG_balloon and PG_kmemcg names.

    As suggested by Kirill, make page_type a bitmask. Because it starts out
    life as -1 (thanks to sharing the storage with _mapcount), setting a page
    flag means clearing the appropriate bit. This gives us space for probably
    twenty or so extra bits (depending how paranoid we want to be about
    _mapcount underflow).
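
    A sketch of the inverted convention (constants and macro shape assumed
    for illustration, not quoted from the tree):

    #define PAGE_TYPE_BASE  0xf0000000
    #define PG_buddy        0x00000080

    /* page_type shares storage with _mapcount and so starts life as -1;
     * a page "has" a type when that type's bit is cleared */
    #define PageType(page, flag) \
            (((page)->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)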

    Link: http://lkml.kernel.org/r/20180518194519.3820-3-willy@infradead.org
    Signed-off-by: Matthew Wilcox
    Acked-by: Kirill A. Shutemov
    Acked-by: Vlastimil Babka
    Cc: Christoph Lameter
    Cc: Dave Hansen
    Cc: Jérôme Glisse
    Cc: Lai Jiangshan
    Cc: Martin Schwidefsky
    Cc: Pekka Enberg
    Cc: Randy Dunlap
    Cc: Andrey Ryabinin
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Matthew Wilcox
     
  • This is to take better advantage of general huge page clearing
    optimization (commit c79b57e462b5: "mm: hugetlb: clear target sub-page
    last when clearing huge page") for hugetlbfs.

    In the general optimization patch, the sub-page to access will be
    cleared last to avoid the cache lines of the to-be-accessed sub-page
    being evicted when clearing the other sub-pages. This works better if
    we have the address of the sub-page to access, that is, the fault
    address inside the huge page. So the hugetlbfs no-page fault handler
    is changed to pass that information. This will benefit workloads which
    don't access the beginning of the hugetlbfs huge page after the page
    fault, under heavy cache contention for a shared last level cache.
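
    A sketch of the underlying idea (an illustrative simplification, not
    the kernel's actual clearing order):

    /* clear every sub-page except the one about to be accessed, then
     * clear that one last so its cache lines stay hot */
    for (i = 0; i < pages_per_huge_page; i++) {
            if (i == target)
                    continue;
            clear_user_highpage(page + i, addr_base + i * PAGE_SIZE);
    }
    clear_user_highpage(page + target, addr_base + target * PAGE_SIZE);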

    The patch is a generic optimization which should benefit quite some
    workloads, not for a specific use case. To demonstrate the performance
    benefit of the patch, we tested it with vm-scalability run on hugetlbfs.

    With this patch, the throughput increases ~28.1% in vm-scalability
    anon-w-seq test case with 88 processes on a 2 socket Xeon E5 2699 v4
    system (44 cores, 88 threads). The test case creates 88 processes, each
    process mmaps a big anonymous memory area with MAP_HUGETLB and writes to
    it from the end to the beginning. For each process, the other
    processes could be seen as other workloads which generate heavy cache
    pressure. At the
    same time, the cache miss rate reduced from ~36.3% to ~25.6%, the IPC
    (instruction per cycle) increased from 0.3 to 0.37, and the time spent
    in user space is reduced ~19.3%.

    Link: http://lkml.kernel.org/r/20180517083539.9242-1-ying.huang@intel.com
    Signed-off-by: "Huang, Ying"
    Reviewed-by: Mike Kravetz
    Cc: Michal Hocko
    Cc: David Rientjes
    Cc: Andrea Arcangeli
    Cc: "Kirill A. Shutemov"
    Cc: Andi Kleen
    Cc: Jan Kara
    Cc: Matthew Wilcox
    Cc: Hugh Dickins
    Cc: Minchan Kim
    Cc: Shaohua Li
    Cc: Christopher Lameter
    Cc: "Aneesh Kumar K.V"
    Cc: Punit Agrawal
    Cc: Anshuman Khandual
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Huang Ying
     
  • Use new return type vm_fault_t for fault handler in struct
    vm_operations_struct. For now, this is just documenting that the
    function returns a VM_FAULT value rather than an errno. Once all
    instances are converted, vm_fault_t will become a distinct type.

    See commit 1c8f422059ae ("mm: change return type to vm_fault_t")

    Link: http://lkml.kernel.org/r/20180512063745.GA26866@jordon-HP-15-Notebook-PC
    Signed-off-by: Souptick Joarder
    Reviewed-by: Matthew Wilcox
    Reviewed-by: Andrew Morton
    Cc: Joe Perches
    Cc: Michal Hocko
    Cc: Hugh Dickins
    Cc: Dan Williams
    Cc: David Rientjes
    Cc: Mike Kravetz
    Cc: Naoya Horiguchi
    Cc: Aneesh Kumar K.V
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Souptick Joarder
     
  • Use new return type vm_fault_t for fault handler in struct
    vm_operations_struct. For now, this is just documenting that the
    function returns a VM_FAULT value rather than an errno. Once all
    instances are converted, vm_fault_t will become a distinct type.

    Link: http://lkml.kernel.org/r/20180511190542.GA2412@jordon-HP-15-Notebook-PC
    Signed-off-by: Souptick Joarder
    Reviewed-by: Matthew Wilcox
    Cc: Dan Williams
    Cc: Jan Kara
    Cc: Ross Zwisler
    Cc: Rik van Riel
    Cc: Matthew Wilcox
    Cc: Hugh Dickins
    Cc: Pavel Tatashin
    Cc: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Souptick Joarder
     
  • finalise_ac() has a parameter, order, which is not used at all. Remove it.

    Signed-off-by: Huaisheng Ye
    Acked-by: Michal Hocko
    Reviewed-by: Andrew Morton
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Huaisheng Ye
     
    The new helper returns the index of the matching string in an array.
    We are going to use it here.
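
    A minimal usage sketch (assuming this refers to the match_string()
    helper; array contents illustrative):

    static const char * const modes[] = { "always", "within_size", "never" };
    int i;

    /* returns the array index of the matching string, or -EINVAL */
    i = match_string(modes, ARRAY_SIZE(modes), str);
    if (i < 0)
            return i;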

    Link: http://lkml.kernel.org/r/20180503203206.44046-1-andriy.shevchenko@linux.intel.com
    Signed-off-by: Andy Shevchenko
    Acked-by: Michal Hocko
    Reviewed-by: Andrew Morton
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andy Shevchenko
     
  • Using kstrndup() simplifies the code.

    Link: http://lkml.kernel.org/r/20180503201807.24941-1-andriy.shevchenko@linux.intel.com
    Signed-off-by: Andy Shevchenko
    Acked-by: Michal Hocko
    Cc: David Rientjes
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Andy Shevchenko
     
  • Memory controller implements the memory.low best-effort memory
    protection mechanism, which works perfectly in many cases and allows
    protecting working sets of important workloads from sudden reclaim.

    But its semantics have a significant limitation: it works only as long
    as there is a supply of reclaimable memory. This makes it pretty
    useless against any sort of slow memory leak or memory usage increase.
    This is especially true for swapless systems. If swap is enabled,
    memory soft protection effectively postpones problems, allowing a
    leaking application to fill the entire swap area, which makes no
    sense. The only effective way to guarantee memory protection in this
    case is to invoke the OOM killer.

    It's possible to handle this case in userspace by reacting on MEMCG_LOW
    events; but there is still a place for a fail-safe in-kernel mechanism
    to provide stronger guarantees.

    This patch introduces the memory.min interface for cgroup v2 memory
    controller. It works very similarly to memory.low (sharing the same
    hierarchical behavior), except that it's not disabled if there is no
    more reclaimable memory in the system.

    If a cgroup is not populated, its memory.min is ignored, because
    otherwise even the OOM killer wouldn't be able to reclaim the
    protected memory, and the system can stall.

    [guro@fb.com: s/low/min/ in docs]
    Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com
    Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com
    Signed-off-by: Roman Gushchin
    Reviewed-by: Randy Dunlap
    Acked-by: Johannes Weiner
    Cc: Michal Hocko
    Cc: Vladimir Davydov
    Cc: Tejun Heo
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Roman Gushchin
     
  • is_pageblock_removable_nolock() is not used outside of
    mm/memory_hotplug.c. Move it next to its unique caller
    is_mem_section_removable() and make it static.

    Remove the prototype to silence a gcc warning (W=1):

    mm/page_alloc.c:7704:6: warning: no previous prototype for `is_pageblock_removable_nolock' [-Wmissing-prototypes]

    Link: http://lkml.kernel.org/r/20180509190001.24789-1-malat@debian.org
    Signed-off-by: Mathieu Malaterre
    Suggested-by: Michal Hocko
    Reviewed-by: Andrew Morton
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Mathieu Malaterre
     
  • A memblock_remove report is useful to see why MemTotal in
    /proc/meminfo differs between two kernels.

    Link: http://lkml.kernel.org/r/20180508104223.8028-1-minchan@kernel.org
    Signed-off-by: Minchan Kim
    Reviewed-by: Andrew Morton
    Acked-by: Michal Hocko
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Minchan Kim
     
  • The per-cpu memcg stock can retain a charge of up to 32 pages. On a
    machine with a large number of CPUs, this can amount to a decent
    amount of memory. Additionally, the force_empty interface might be
    triggering unneeded memcg reclaims.

    Link: http://lkml.kernel.org/r/20180507201651.165879-1-shakeelb@google.com
    Signed-off-by: Junaid Shahid
    Signed-off-by: Shakeel Butt
    Acked-by: Michal Hocko
    Cc: Greg Thelen
    Cc: Johannes Weiner
    Cc: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Junaid Shahid
     
  • Resizing the memcg limit for cgroup-v2 drains the stocks before
    triggering the memcg reclaim. Do the same for cgroup-v1 to make the
    behavior consistent.

    Link: http://lkml.kernel.org/r/20180504205548.110696-1-shakeelb@google.com
    Signed-off-by: Shakeel Butt
    Acked-by: Johannes Weiner
    Acked-by: Michal Hocko
    Cc: Greg Thelen
    Cc: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Shakeel Butt
     
  • Mark memcg1_events static: it's only used by memcontrol.c. And mark it
    const: it's not modified.

    Link: http://lkml.kernel.org/r/20180503192940.94971-1-gthelen@google.com
    Signed-off-by: Greg Thelen
    Acked-by: Michal Hocko
    Cc: Johannes Weiner
    Cc: Vladimir Davydov
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Greg Thelen
     
  • mem_cgroup_cgwb_list is a very simple wrapper and it will never be used
    outside of code under CONFIG_CGROUP_WRITEBACK, so use memcg->cgwb_list
    directly.
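
    Before/after, sketched (the list_add() call site is illustrative):

        /* before: a wrapper that only returns a member */
        struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
        {
                return &memcg->cgwb_list;
        }
        list_add(&wb->memcg_node, mem_cgroup_cgwb_list(memcg));

        /* after: dereference the member at the call site */
        list_add(&wb->memcg_node, &memcg->cgwb_list);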

    Link: http://lkml.kernel.org/r/1524406173-212182-1-git-send-email-wanglong19@meituan.com
    Signed-off-by: Wang Long
    Reviewed-by: Jan Kara
    Acked-by: Tejun Heo
    Acked-by: Michal Hocko
    Reviewed-by: Andrew Morton
    Cc: Johannes Weiner
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Wang Long
     
  • tmpfs uses the helper d_find_alias() to find a dentry from a decoded
    inode, but d_find_alias() skips unhashed dentries, so unlinked files
    cannot be decoded from a file handle.

    This can be reproduced using the xfstests test program open_by_handle:

    $ open_by_handle -c /tmp/testdir
    $ open_by_handle -dk /tmp/testdir
    open_by_handle(/tmp/testdir/file000000) returned 116 incorrectly on an unlinked open file!

    To fix this, if d_find_alias() can't find a hashed alias, call
    d_find_any_alias() to return an unhashed one.
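
    In outline, the fallback fits in a small helper (a hedged sketch of
    the fix in mm/shmem.c; the helper name is shown for illustration):

        static struct dentry *shmem_find_alias(struct inode *inode)
        {
                /* prefer a hashed alias ... */
                struct dentry *alias = d_find_alias(inode);

                /* ... but fall back to an unhashed one, so unlinked
                 * (but still open) files can be decoded */
                return alias ?: d_find_any_alias(inode);
        }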

    Link: http://lkml.kernel.org/r/CAOQ4uxg+qSLP0KwdW+h1tcPqOCQd+_pGZVXiePQB1TXCMBMctQ@mail.gmail.com
    Signed-off-by: Amir Goldstein
    Reviewed-by: NeilBrown
    Cc: Hugh Dickins
    Cc: Jeff Layton
    Cc: "J. Bruce Fields"
    Cc: Al Viro
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Amir Goldstein
     
  • page_stable_node() and set_page_stable_node() are only used in mm/ksm.c
    and there is no point in keeping them in include/linux/ksm.h.

    [akpm@linux-foundation.org: fix SYSFS=n build]
    Link: http://lkml.kernel.org/r/1524552106-7356-3-git-send-email-rppt@linux.vnet.ibm.com
    Signed-off-by: Mike Rapoport
    Reviewed-by: Andrew Morton
    Cc: Andrea Arcangeli
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Mike Rapoport
     
  • While revisiting my Btrfs swapfile series [1], I introduced a situation
    in which reclaim would lock i_rwsem, and even though the swapon() path
    clearly made GFP_KERNEL allocations while holding i_rwsem, I got no
    complaints from lockdep. It turns out that the rework of the fs_reclaim
    annotation was broken: if the current task has PF_MEMALLOC set, we don't
    acquire the dummy fs_reclaim lock, but when reclaiming we always check
    this _after_ we've just set the PF_MEMALLOC flag. In most cases, we can
    fix this by moving the fs_reclaim_{acquire,release}() outside of the
    memalloc_noreclaim_{save,restore}(), although kswapd is slightly
    different. After applying this, I got the expected lockdep splats.

    1: https://lwn.net/Articles/625412/
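
    A hedged sketch of the reordering (do_reclaim_work() is a hypothetical
    stand-in for the actual reclaim body):

        static unsigned long reclaim_sketch(gfp_t gfp_mask)
        {
                unsigned int noreclaim_flag;
                unsigned long nr_reclaimed;

                /* annotate before PF_MEMALLOC is set, otherwise
                 * fs_reclaim_acquire() sees the flag and bails out */
                fs_reclaim_acquire(gfp_mask);
                noreclaim_flag = memalloc_noreclaim_save();

                nr_reclaimed = do_reclaim_work();

                memalloc_noreclaim_restore(noreclaim_flag);
                fs_reclaim_release(gfp_mask);

                return nr_reclaimed;
        }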

    Link: http://lkml.kernel.org/r/9f8aa70652a98e98d7c4de0fc96a4addcee13efe.1523778026.git.osandov@fb.com
    Fixes: d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation")
    Signed-off-by: Omar Sandoval
    Reviewed-by: Andrew Morton
    Cc: Peter Zijlstra
    Cc: Tetsuo Handa
    Cc: Ingo Molnar
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Omar Sandoval
     
  • Since tmpfs THP support was added in 4.8, hugetlbfs is no longer the
    only filesystem with huge page support. tmpfs can use huge pages via
    THP when mounted with the "huge=" mount option.

    When applications use huge pages on hugetlbfs, they just need to check
    the filesystem magic number, but that is not enough for tmpfs. Make
    stat.st_blksize return the huge page size if the filesystem is mounted
    with an appropriate "huge=" option, to give applications a hint to
    optimize their behavior for THP.

    Some applications may not behave wisely with THP. For example, QEMU
    may mmap a file with MAP_FIXED at a hint address that is not huge-page
    aligned, which results in no pages being PMD-mapped even though THP is
    used. Some applications may mmap a file at a non-huge-page-aligned
    offset. Both behaviors make THP pointless.

    statfs.f_bsize still returns 4KB for tmpfs since THP could be split,
    and tmpfs may silently fall back to 4KB pages if there are not enough
    huge pages. Furthermore, a different f_bsize would make the max_blocks
    and free_blocks calculations harder without much benefit. Returning the
    huge page size via stat.st_blksize sounds good enough.

    Since PUD-size huge pages are not yet supported for THP, it just
    returns HPAGE_PMD_SIZE for now.
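
    A hedged sketch of what the getattr change looks like, simplified
    from mm/shmem.c (is_huge_enabled() stands in for the check of the
    mount's "huge=" setting and is assumed, not quoted from the patch):

        static int shmem_getattr(const struct path *path, struct kstat *stat,
                                 u32 request_mask, unsigned int query_flags)
        {
                struct inode *inode = path->dentry->d_inode;
                struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);

                generic_fillattr(inode, stat);

                /* hint: allocations in this mount may be PMD-sized */
                if (is_huge_enabled(sb_info))
                        stat->blksize = HPAGE_PMD_SIZE;

                return 0;
        }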

    Hugh said:

    : Sorry, I have no enthusiasm for this patch; but do I feel strongly
    : enough to override you and everyone else to NAK it? No, I don't feel
    : that strongly, maybe st_blksize isn't worth arguing over.
    :
    : We did look at struct stat when designing huge tmpfs, to see if there
    : were any fields that should be adjusted for it; but concluded none.
    : Yes, it would sometimes be nice to have a quickly accessible indicator
    : for when tmpfs has been mounted huge (scanning /proc/mounts for options
    : can be tiresome, agreed); but since tmpfs tries to supply huge (or not)
    : pages transparently, no difference seemed right.
    :
    : So, because st_blksize is a not very useful field of struct stat, with
    : "size" in the name, we're going to put HPAGE_PMD_SIZE in there instead
    : of PAGE_SIZE, if the tmpfs was mounted with one of the huge "huge"
    : options (force or always, okay; within_size or advise, not so much).
    : Though HPAGE_PMD_SIZE is no more its "preferred I/O size" or "blocksize
    : for file system I/O" than PAGE_SIZE was.
    :
    : Which we can expect to speed up some applications and disadvantage
    : others, depending on how they interpret st_blksize: just like if we
    : changed it in the same way on non-huge tmpfs. (Did I actually try
    : changing st_blksize early on, and find it broke something? If so, I've
    : now forgotten what, and a search through commit messages didn't find
    : it; but I guess we'll find out soon enough.)
    :
    : If there were an mstat() syscall, returning a field "preferred
    : alignment", then we could certainly agree to put HPAGE_PMD_SIZE in
    : there; but in stat()'s st_blksize? And what happens when (in future)
    : mm maps this or that hard-disk filesystem's blocks with a pmd mapping -
    : should that filesystem then advertise a bigger st_blksize, despite the
    : same disk layout as before? What happens with DAX?
    :
    : And this change is not going to help the QEMU suboptimality that
    : brought you here (or does QEMU align mmaps according to st_blksize?).
    : QEMU ought to work well with kernels without this change, and kernels
    : with this change; and I hope it can easily deal with both by avoiding
    : that use of MAP_FIXED which prevented the kernel's intended alignment.

    [akpm@linux-foundation.org: remove unneeded `else']
    Link: http://lkml.kernel.org/r/1524665633-83806-1-git-send-email-yang.shi@linux.alibaba.com
    Signed-off-by: Yang Shi
    Suggested-by: Christoph Hellwig
    Reviewed-by: Christoph Hellwig
    Acked-by: Kirill A. Shutemov
    Cc: Hugh Dickins
    Cc: Michal Hocko
    Cc: Alexander Viro
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Yang Shi
     
  • A client can call vunmap() with some intermediate 'addr' which may not
    be the start of the VM area. The unmap code proper works with
    vm->vm_start, which is correct, but the debug object API is called with
    'addr'. This could be a problem within debug objects.

    Pass the proper start address into the debug object API.
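
    Sketched (hedged; simplified from the vunmap path, with the area
    lookup shown explicitly):

        static void vunmap_debug_sketch(const void *addr)
        {
                struct vm_struct *area = find_vm_area(addr);

                if (!area)
                        return;

                /* the debug state was registered against the area start,
                 * so check/free it with area->addr, not 'addr' */
                debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
                debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
        }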

    [akpm@linux-foundation.org: fix warning]
    Link: http://lkml.kernel.org/r/1523961828-9485-3-git-send-email-cpandya@codeaurora.org
    Signed-off-by: Chintan Pandya
    Reviewed-by: Andrew Morton
    Cc: Ard Biesheuvel
    Cc: Byungchul Park
    Cc: Catalin Marinas
    Cc: Florian Fainelli
    Cc: Johannes Weiner
    Cc: Laura Abbott
    Cc: Vlastimil Babka
    Cc: Wei Yang
    Cc: Yisheng Xie
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

    Chintan Pandya