13 Jun, 2018

1 commit

  • The kzalloc_node() function has a 2-factor argument form, kcalloc_node(). This
    patch replaces cases of:

    kzalloc_node(a * b, gfp, node)

    with:
    kcalloc_node(a * b, gfp, node)

    as well as handling cases of:

    kzalloc_node(a * b * c, gfp, node)

    with:

    kzalloc_node(array3_size(a, b, c), gfp, node)

    as it's slightly less ugly than:

    kcalloc_node(array_size(a, b), c, gfp, node)

    This does, however, attempt to ignore constant size factors like:

    kzalloc_node(4 * 1024, gfp, node)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kzalloc_node(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kzalloc_node(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kzalloc_node(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kzalloc_node
    + kcalloc_node
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kzalloc_node(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kzalloc_node(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kzalloc_node(C1 * C2 * C3, ...)
    |
    kzalloc_node(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kzalloc_node(sizeof(THING) * C2, ...)
    |
    kzalloc_node(sizeof(TYPE) * C2, ...)
    |
    kzalloc_node(C1 * C2 * C3, ...)
    |
    kzalloc_node(C1 * C2, ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

25 May, 2018

1 commit

  • When the allocation process is scheduled back and the mapped hw queue is
    changed, fake one extra wake up on previous queue for compensating wake
    up miss, so other allocations on the previous queue won't be starved.

    This patch fixes one request allocation hang issue, which can be
    triggered easily in case of very low nr_request.

    The race is as follows:

    1) 2 hw queues, nr_requests are 2, and wake_batch is one

    2) there are 3 waiters on hw queue 0

    3) two in-flight requests in hw queue 0 are completed, and only two
    waiters of 3 are waken up because of wake_batch, but both the two
    waiters can be scheduled to another CPU and cause to switch to hw
    queue 1

    4) then the 3rd waiter will wait for ever, since no in-flight request
    is in hw queue 0 any more.

    5) this patch fixes it by the fake wakeup when waiter is scheduled to
    another hw queue

    Cc:
    Reviewed-by: Omar Sandoval
    Signed-off-by: Ming Lei

    Modified commit message to make it clearer, and make it apply on
    top of the 4.18 branch.

    Signed-off-by: Jens Axboe

    Ming Lei
     

15 May, 2018

1 commit

  • If we have multiple callers of sbq_wake_up(), we can end up in a
    situation where the wait_cnt will continually go more and more
    negative. Consider the case where our wake batch is 1, hence
    wait_cnt will start out as 1.

    wait_cnt == 1

    CPU0 CPU1
    atomic_dec_return(), cnt == 0
    atomic_dec_return(), cnt == -1
    cmpxchg(-1, 0) (succeeds)
    [wait_cnt now 0]
    cmpxchg(0, 1) (fails)

    This ends up with wait_cnt being 0, we'll wakeup immediately
    next time. Going through the same loop as above again, and
    we'll have wait_cnt -1.

    For the case where we have a larger wake batch, the only
    difference is that the starting point will be higher. We'll
    still end up with continually smaller batch wakeups, which
    defeats the purpose of the rolling wakeups.

    Always reset the wait_cnt to the batch value. Then it doesn't
    matter who wins the race. But ensure that whomever does win
    the race is the one that increments the ws index and wakes up
    our batch count, loser gets to call __sbq_wake_up() again to
    account his wakeups towards the next active wait state index.

    Fixes: 6c0ca7ae292a ("sbitmap: fix wakeup hang after sbq resize")
    Reviewed-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Jens Axboe
     

11 May, 2018

2 commits

  • Make sure the user passed the right value to
    sbitmap_queue_min_shallow_depth().

    Acked-by: Paolo Valente
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • The sbitmap queue wake batch is calculated such that once allocations
    start blocking, all of the bits which are already allocated must be
    enough to fulfill the batch counters of all of the waitqueues. However,
    the shallow allocation depth can break this invariant, since we block
    before our full depth is being utilized. Add
    sbitmap_queue_min_shallow_depth(), which saves the minimum shallow depth
    the sbq will use, and update sbq_calc_wake_batch() to take it into
    account.

    Acked-by: Paolo Valente
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

01 Mar, 2018

1 commit

  • sbitmap_queue_get()/sbitmap_queue_clear() are used for
    allocating/freeing a resource, so they should provide acquire/release
    barrier semantics, respectively. sbitmap_get() currently contains a full
    barrier, which is unnecessary, so use test_and_set_bit_lock() instead of
    test_and_set_bit() (these are equivalent on x86_64). sbitmap_clear_bit()
    does not imply any barriers, which is incorrect, as accesses of the
    resource (e.g., request) could potentially get reordered to after the
    clear_bit(). Introduce sbitmap_clear_bit_unlock() and use it for
    sbitmap_queue_clear() (this only adds a compiler barrier on x86_64). The
    other existing user of sbitmap_clear_bit() (the blk-mq software queue
    pending map) is serialized through a spinlock and does not need this.

    Reported-by: Tejun Heo
    Acked-by: Tejun Heo
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

23 Dec, 2017

1 commit

  • Even with a number of waitqueues, we can get into a situation where we
    are heavily contended on the waitqueue lock. I got a report on spc1
    where we're spending seconds doing this. Arguably the use case is nasty,
    I reproduce it with one device and 1000 threads banging on the device.
    But that doesn't mean we shouldn't be handling it better.

    What ends up happening is that a thread will fail to get a tag, add
    itself to the waitqueue, and subsequently get woken up when a tag is
    freed - only to find itself going back to sleep on the waitqueue.

    Instead of waking all threads, use an exclusive wait and wake up our
    sbitmap batch count instead. This seems to work well for me (massive
    improvement for this use case), and it survives basic testing. But I
    haven't fully verified it yet.

    An additional improvement is running the queue and checking for a new
    tag BEFORE needing to add ourselves to the waitqueue.

    Signed-off-by: Jens Axboe

    Jens Axboe
     

15 Apr, 2017

1 commit

  • This operation supports the use case of limiting the number of bits that
    can be allocated for a given operation. Rather than setting aside some
    bits at the end of the bitmap, we can set aside bits in each word of the
    bitmap. This means we can keep the allocation hints spread out and
    support sbitmap_resize() nicely at the cost of lower granularity for the
    allowed depth.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

02 Mar, 2017

1 commit

  • is a low level header that is included early
    in affected kernel headers. But it includes
    which complicates the cleanup of sched.h dependencies.

    But kasan.h has almost no need for sched.h: its only use of
    scheduler functionality is in two inline functions which are
    not used very frequently - so uninline kasan_enable_current()
    and kasan_disable_current().

    Also add a dependency to a .c file that depended
    on kasan.h including it.

    This paves the way to remove the include from kasan.h.

    Acked-by: Linus Torvalds
    Cc: Mike Galbraith
    Cc: Peter Zijlstra
    Cc: Thomas Gleixner
    Cc: linux-kernel@vger.kernel.org
    Signed-off-by: Ingo Molnar

    Ingo Molnar
     

27 Jan, 2017

1 commit


19 Jan, 2017

2 commits

  • When we resize a struct sbitmap_queue, we update the wakeup batch size,
    but we don't update the wait count in the struct sbq_wait_states. If we
    resized down from a size which could use a bigger batch size, these
    counts could be too large and cause us to miss necessary wakeups. To fix
    this, update the wait counts when we resize (ensuring some careful
    memory ordering so that it's safe w.r.t. concurrent clears).

    This also fixes a theoretical issue where two threads could end up
    bumping the wait count up by the batch size, which could also
    potentially lead to hangs.

    Reported-by: Martin Raiber
    Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs")
    Fixes: 2971c35f3588 ("blk-mq: bitmap tag: fix race on blk_mq_bitmap_tags::wake_cnt")
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • We always do an atomic clear_bit() right before we call sbq_wake_up(),
    so we can use smp_mb__after_atomic(). While we're here, comment the
    memory barriers in here a little more.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

19 Sep, 2016

1 commit

  • Variable weight is not being initialized to zero before it is
    used to compute the weight sum. Ensure it is initialized to zero.

    Found with static analysis with cppcheck:
    [lib/sbitmap.c:177]: (error) Uninitialized variable: weight

    Signed-off-by: Colin Ian King
    Signed-off-by: Jens Axboe

    Colin Ian King
     

18 Sep, 2016

1 commit

  • If we have a bunch of high-numbered bits allocated and then we resize
    the struct sbitmap_queue, when those bits get cleared, we'll update the
    hint and then have to re-randomize it repeatedly. Avoid that by checking
    that the cleared bit is still a valid hint. No measurable performance
    difference in the common case.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

17 Sep, 2016

6 commits

  • After a struct sbitmap_queue is resized smaller, the allocation hints
    may still be set to bits beyond the new depth of the bitmap. This means
    that, for example, if the number of blk-mq tags is reduced through
    sysfs, more requests than the nominal queue depth may be in flight.

    It's tempting to fix this at resize time by doing a one-time
    reinitialization of the hints, but this can race with
    __sbitmap_queue_get() updating the hint. Instead, check the hint before
    we use it. This caused no measurable performance difference in my
    synthetic benchmarks.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • In order to get good cache behavior from a sbitmap, we want each CPU to
    stick to its own cacheline(s) as much as possible. This might happen
    naturally as the bitmap gets filled up and the alloc_hint values spread
    out, but we really want this behavior from the start. blk-mq apparently
    intended to do this, but the code to do this was never wired up. Get rid
    of the dead code and make it part of the sbitmap library.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • Again, there's no point in passing this in every time. Make it part of
    struct sbitmap_queue and clean up the API.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • Allocating your own per-cpu allocation hint separately makes for an
    awkward API. Instead, allocate the per-cpu hint as part of the struct
    sbitmap_queue. There's no point for a struct sbitmap_queue without the
    cache, but you can still use a bare struct sbitmap.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • The original bt_alloc() we converted from was using kzalloc(), not
    kzalloc_node(), to allocate the wait queues. This was probably an
    oversight, so fix it for sbitmap_queue_init_node().

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • This is a generally useful data structure, so make it available to
    anyone else who might want to use it. It's also a nice cleanup
    separating the allocation logic from the rest of the tag handling logic.

    The code is behind a new Kconfig option, CONFIG_SBITMAP, which is only
    selected by CONFIG_BLOCK for now.

    This should be a complete noop functionality-wise.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval