13 Jun, 2018

1 commit

  • The kzalloc_node() function has a 2-factor argument form, kcalloc_node(). This
    patch replaces cases of:

    kzalloc_node(a * b, gfp, node)

    with:

    kcalloc_node(a, b, gfp, node)

    as well as handling cases of:

    kzalloc_node(a * b * c, gfp, node)

    with:

    kzalloc_node(array3_size(a, b, c), gfp, node)

    as it's slightly less ugly than:

    kcalloc_node(array_size(a, b), c, gfp, node)

    This does, however, attempt to ignore constant size factors like:

    kzalloc_node(4 * 1024, gfp, node)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.
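
    As a minimal illustration of the 2-factor conversion (the struct and
    variable names here are invented for the example):

    struct foo **tbl;

    /* Before: the open-coded multiplication can overflow silently. */
    tbl = kzalloc_node(sizeof(struct foo *) * count, GFP_KERNEL, node);

    /* After: kcalloc_node() checks count * size for overflow and
     * returns NULL rather than allocating an undersized buffer. */
    tbl = kcalloc_node(count, sizeof(struct foo *), GFP_KERNEL, node);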

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kzalloc_node(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kzalloc_node(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kzalloc_node(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kzalloc_node
    + kcalloc_node
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kzalloc_node(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kzalloc_node(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kzalloc_node(C1 * C2 * C3, ...)
    |
    kzalloc_node(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kzalloc_node(sizeof(THING) * C2, ...)
    |
    kzalloc_node(sizeof(TYPE) * C2, ...)
    |
    kzalloc_node(C1 * C2 * C3, ...)
    |
    kzalloc_node(C1 * C2, ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

05 Jun, 2018

1 commit

  • If a hardware queue is stopped, it should not be run again before being
    explicitly started. Ignore stopped queues in blk_mq_run_work_fn(),
    fixing a regression recently introduced when the START_ON_RUN bit
    was removed.
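
    A sketch of the resulting guard (the handler is abridged; only the
    early return is the point here):

    static void blk_mq_run_work_fn(struct work_struct *work)
    {
            struct blk_mq_hw_ctx *hctx =
                    container_of(work, struct blk_mq_hw_ctx, run_work.work);

            /* A stopped hctx must stay idle until explicitly restarted. */
            if (blk_mq_hctx_stopped(hctx))
                    return;

            __blk_mq_run_hw_queue(hctx);
    }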

    Fixes: 15fe8a90bb45 ("blk-mq: remove blk_mq_delay_queue()")
    Reviewed-by: Ming Lei
    Reviewed-by: Bart Van Assche
    Signed-off-by: Jianchao Wang
    Signed-off-by: Jens Axboe

    Jianchao Wang
     

01 Jun, 2018

2 commits


29 May, 2018

5 commits

  • Signed-off-by: Christoph Hellwig
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Jens Axboe

    Christoph Hellwig
     
  • Signed-off-by: Christoph Hellwig
    Reviewed-by: Hannes Reinecke
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Jens Axboe

    Christoph Hellwig
     
  • BLK_EH_NOT_HANDLED implies that nothing happened, but very often that
    is not what is happening - instead the driver has already completed the
    command. Fix the symbolic name to reflect that a little better.

    Signed-off-by: Christoph Hellwig
    Reviewed-by: Hannes Reinecke
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Jens Axboe

    Christoph Hellwig
     
  • This patch simplifies the timeout handling by relying on the request
    reference counting to ensure the iterator is operating on an inflight
    and truly timed out request. Since the reference counting prevents the
    tag from being reallocated, the block layer no longer needs to prevent
    drivers from completing their requests while the timeout handler is
    operating on it: a driver completing a request is allowed to proceed to
    the next state without additional synchronization with the block layer.

    This also removes any need for generation sequence numbers, since the
    request can no longer be reallocated as a new instance while timeout
    handling is operating on it.

    To enable this, a refcount is added to struct request so that request
    users can be sure they're operating on the same request without it
    changing while they're processing it. The request's tag won't be
    released for reuse until both the timeout handler and the completion
    are done with it.
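
    A hedged sketch of the scheme (the iterator shape and the helper names
    are illustrative, not the exact kernel code):

    static void timeout_iter_sketch(struct request *rq)
    {
            /* Take a reference so the tag can't be recycled under us. */
            if (!refcount_inc_not_zero(&rq->ref))
                    return;

            if (rq_timed_out(rq))                   /* illustrative check */
                    handle_timed_out_rq(rq);        /* illustrative */

            /* The last reference frees the request and releases the tag. */
            if (refcount_dec_and_test(&rq->ref))
                    __blk_mq_free_request(rq);
    }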

    Signed-off-by: Keith Busch
    [hch: slight cleanups, added back submission side hctx lock, use cmpxchg
    for completions]
    Signed-off-by: Christoph Hellwig
    Signed-off-by: Jens Axboe

    Keith Busch
     
  • The block layer had been setting the state to in-flight prior to updating
    the timer. This is the wrong order since the timeout handler could observe
    the in-flight state with the older timeout, believing the request had
    expired when in fact it is just getting started.
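
    A sketch of the corrected order in blk_mq_start_request() (simplified;
    the state store follows the blk-mq code of that era):

    /* Arm the timer before publishing the in-flight state, so the
     * timeout handler can never see an in-flight request paired with
     * a stale timer. */
    blk_add_timer(rq);
    WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);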

    Signed-off-by: Keith Busch
    Reviewed-by: Hannes Reinecke
    Signed-off-by: Jens Axboe

    Keith Busch
     

22 May, 2018

1 commit


18 May, 2018

1 commit

  • When the number of hardware queues is changed, the drivers will call
    blk_mq_update_nr_hw_queues() to remap hardware queues. This changes
    the ctx mappings, but the current code doesn't clear the
    ->dispatch_from hint. This can result in dispatch_from pointing to
    a ctx that isn't mapped to the hctx anymore.

    Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")
    Signed-off-by: huhai
    Reviewed-by: Ming Lei

    Moved the placement of the clearing to where we clear other items
    pertaining to the existing mapping, added Fixes line, and reworded
    the commit message.

    Signed-off-by: Jens Axboe

    huhai
     

16 May, 2018

1 commit


11 May, 2018

1 commit


09 May, 2018

4 commits

  • Currently, struct request has four timestamp fields:

    - A start time, set at get_request time, in jiffies, used for iostats
    - An I/O start time, set at start_request time, in ktime nanoseconds,
    used for blk-stats (i.e., wbt, kyber, hybrid polling)
    - Another start time and another I/O start time, used for cfq and bfq

    These can all be consolidated into one start time and one I/O start
    time, both in ktime nanoseconds, shaving off up to 16 bytes from struct
    request depending on the kernel config.
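
    A sketch of the resulting layout (heavily abridged; only the two
    surviving timestamps are shown):

    struct request {
            /* ... */
            u64 start_time_ns;      /* set at get_request time, for iostats */
            u64 io_start_time_ns;   /* set at start_request time, for blk-stats */
            /* ... */
    };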

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • We want this next to blk_account_io_done() for the next change so that
    we can call ktime_get() only once for both.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • struct blk_issue_stat squashes three things into one u64:

    - The time the driver started working on a request
    - The original size of the request (for the io.low controller)
    - Flags for writeback throttling

    It turns out that on x86_64, we have a 4 byte hole in struct request
    which we can fill with the non-timestamp fields from blk_issue_stat,
    simplifying things quite a bit.
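
    A hedged sketch of where the non-timestamp fields land (abridged; the
    field types follow the patch's intent rather than a verbatim copy):

    struct request {
            /* ... */
            unsigned short wbt_flags;       /* writeback throttling flags */
            unsigned short throtl_size;     /* original size, for io.low */
            /* ... */
    };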

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • issue_stat is going to go away, so first make writeback throttling take
    the containing request, update the internal wbt helpers accordingly, and
    change rwb->sync_cookie to be the request pointer instead of the
    issue_stat pointer. No functional change.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

26 Apr, 2018

2 commits

  • When the blk-mq inflight implementation was added, /proc/diskstats was
    converted to use it, but /sys/block/$dev/inflight was not. Fix it by
    adding another helper to count in-flight requests by data direction.
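
    A hedged sketch of the legacy-path flavor of such a helper (blk-mq
    walks the tag sets instead; names only approximate the patch):

    void part_in_flight_rw_sketch(struct hd_struct *part,
                                  unsigned int inflight[2])
    {
            /* Index 0 counts reads, index 1 counts writes. */
            inflight[0] = atomic_read(&part->in_flight[0]);
            inflight[1] = atomic_read(&part->in_flight[1]);
    }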

    Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant")
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • In the legacy block case, we increment the counter right after we
    allocate the request, not when the driver handles it. In both the legacy
    and blk-mq cases, part_inc_in_flight() is called from
    blk_account_io_start() right after we've allocated the request. blk-mq
    only considers started requests as inflight, but this is
    inconsistent with the legacy definition and the intention in the code.
    This removes the started condition and instead counts all allocated
    requests.

    Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant")
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

25 Apr, 2018

1 commit

  • This reverts commit 37c7c6c76d431dd7ef9c29d95f6052bd425f004c.

    It turns out that some drivers (mostly FC drivers) may not use managed
    IRQ affinity and meanwhile have their own customized .map_queues, so
    keep this code to avoid regressions.

    Reported-by: Laurence Oberman
    Tested-by: Laurence Oberman
    Tested-by: Christian Borntraeger
    Tested-by: Stefan Haberland
    Cc: Ewan Milne
    Cc: Christoph Hellwig
    Cc: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     

17 Apr, 2018

1 commit

  • rq->gstate and rq->aborted_gstate are both zero before rqs are
    allocated. If we have a small timeout, then when the timer fires,
    there could be rqs that were never allocated, and there could also
    be rqs that have been allocated but not yet initialized and started.
    At that moment, rq->gstate and rq->aborted_gstate are both still 0,
    so blk_mq_terminate_expired will decide the rq has timed out and
    invoke .timeout early.

    For scsi, this causes scsi_times_out to be invoked before the
    scsi_cmnd is initialized; scsi_cmnd->device is still NULL at that
    point, and we crash.

    Cc: Bart Van Assche
    Cc: Tejun Heo
    Cc: Ming Lei
    Cc: Martin Steigerwald
    Cc: stable@vger.kernel.org
    Signed-off-by: Jianchao Wang
    Signed-off-by: Jens Axboe

    Jianchao Wang
     

10 Apr, 2018

7 commits

  • Firstly, since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"),
    blk-mq doesn't remap queues any more after the CPU topology changes.

    Secondly, set->nr_hw_queues can't be bigger than nr_cpu_ids, and now we map
    all possible CPUs to hw queues, so at least one CPU is mapped to each hctx.

    So the queue mapping has become static and fixed, just like a percpu variable,
    and we don't need to handle queue remapping any more.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • There are several reasons for removing the check:

    1) blk_mq_hw_queue_mapped() now always returns true, since each hctx
    is mapped by at least one CPU

    2) when there isn't any online CPU mapped to this hctx, there won't
    be any IO queued to it; blk_mq_run_hw_queue() only runs the queue
    if there is IO queued to the hctx

    3) if __blk_mq_delay_run_hw_queue() is called via blk_mq_delay_run_hw_queue(),
    it is run from blk_mq_dispatch_rq_list() or scsi_mq_get_budget(), so
    the hctx to be handled has to be mapped.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • No driver uses this interface any more, so remove it.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • This patch introduces a helper, blk_mq_hw_queue_first_cpu(), for
    figuring out the hctx's first CPU, avoiding code duplication.
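
    A sketch of what the helper boils down to (simplified):

    static int blk_mq_hw_queue_first_cpu(struct blk_mq_hw_ctx *hctx)
    {
            int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

            /* Fall back to any mapped CPU if none of them is online. */
            if (cpu >= nr_cpu_ids)
                    cpu = cpumask_first(hctx->cpumask);
            return cpu;
    }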

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • This patch figures out the final selected CPU first and then writes
    it to hctx->next_cpu once, so that no intermediate next-cpu value can
    be observed from other dispatch paths.
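
    A hedged sketch of the pattern (abridged from the selection logic):

    int next_cpu = hctx->next_cpu;

    /* Work on a local copy while searching for the next online CPU... */
    next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask);
    if (next_cpu >= nr_cpu_ids)
            next_cpu = blk_mq_hw_queue_first_cpu(hctx);

    /* ...and publish the final choice with a single store. */
    hctx->next_cpu = next_cpu;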

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • Since commit 20e4d81393196 ("blk-mq: simplify queue mapping & schedule
    with each possisble CPU"), a hctx may be mapped to CPUs that are all
    offline, and then hctx->next_cpu can be set wrongly.

    This patch fixes the issue by making hctx->next_cpu point to the
    first CPU in hctx->cpumask if all CPUs in hctx->cpumask are offline.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Fixes: 20e4d81393196 ("blk-mq: simplify queue mapping & schedule with each possisble CPU")
    Cc: stable@vger.kernel.org
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • This patch orders getting the budget and the driver tag by making sure
    that the driver tag is acquired only after the budget has been obtained,
    which helps avoid the following race:

    1) before dispatching a request from the scheduler queue, one budget is
    acquired first, then a request is dequeued, call it request A.

    2) in another IO path, request B from hctx->dispatch is being dispatched:
    a driver tag is acquired, then blk_mq_dispatch_rq_list() tries to get the
    budget, but unfortunately the budget is held by request A.

    3) meanwhile blk_mq_dispatch_rq_list() is called to dispatch request
    A, and tries to get a driver tag first, but unfortunately no driver tag
    is available because the driver tag is held by request B.

    4) neither IO path can make progress, and an IO stall results.

    This issue can be observed when running dbench on USB storage.

    This patch fixes the issue by always getting the budget before getting the
    driver tag.
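
    A hedged sketch of the resulting order in the dispatch loop (abridged;
    waiting and error handling are omitted, and the signatures follow the
    blk-mq code of that era):

    if (!blk_mq_get_dispatch_budget(hctx))
            break;

    if (!blk_mq_get_driver_tag(rq, NULL, false)) {
            /* Don't sit on the budget while the tag is unavailable. */
            blk_mq_put_dispatch_budget(hctx);
            break;
    }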

    Cc: stable@vger.kernel.org
    Fixes: de1482974080ec9e ("blk-mq: introduce .get_budget and .put_budget in blk_mq_ops")
    Cc: Christoph Hellwig
    Cc: Bart Van Assche
    Cc: Omar Sandoval
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     

09 Mar, 2018

3 commits

  • Since the queue flags may be changed concurrently from multiple
    contexts after a queue becomes visible in sysfs, make these changes
    safe by protecting them with the queue lock.

    Cc: Christoph Hellwig
    Cc: Hannes Reinecke
    Cc: Ming Lei
    Reviewed-by: Martin K. Petersen
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Bart Van Assche
    Signed-off-by: Jens Axboe

    Bart Van Assche
     
  • Introduce functions that modify the queue flags and that protect
    these modifications with the request queue lock. Except for moving
    one wake_up_all() call from inside to outside a critical section,
    this patch does not change any functionality.
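
    One of the introduced helpers, sketched (this mirrors the intent of
    blk_queue_flag_set() as added by the patch):

    void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
    {
            unsigned long flags;

            spin_lock_irqsave(q->queue_lock, flags);
            queue_flag_set(flag, q);
            spin_unlock_irqrestore(q->queue_lock, flags);
    }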

    Cc: Christoph Hellwig
    Cc: Hannes Reinecke
    Cc: Ming Lei
    Reviewed-by: Johannes Thumshirn
    Reviewed-by: Martin K. Petersen
    Signed-off-by: Bart Van Assche
    Signed-off-by: Jens Axboe

    Bart Van Assche
     
  • Except for changing the atomic queue flag manipulations that are
    protected by the queue lock into non-atomic manipulations, this
    patch does not change any functionality.

    Cc: Christoph Hellwig
    Cc: Hannes Reinecke
    Cc: Ming Lei
    Reviewed-by: Johannes Thumshirn
    Reviewed-by: Martin K. Petersen
    Signed-off-by: Bart Van Assche
    Signed-off-by: Jens Axboe

    Bart Van Assche
     

01 Mar, 2018

2 commits

  • This patch does not change any functionality.

    Signed-off-by: Bart Van Assche
    Reviewed-by: Joseph Qi
    Cc: Christoph Hellwig
    Cc: Philipp Reisner
    Cc: Ulf Hansson
    Cc: Kees Cook
    Signed-off-by: Jens Axboe

    Bart Van Assche
     
  • When we insert a request, we set the software queue pending bit while
    holding the software queue lock. However, we clear it outside of the
    lock, so it's possible that a concurrent insert could reset the bit
    after we clear it but before we empty the request list. Afterwards, the
    bit would still be set but the software queue wouldn't have any requests
    in it, leading us to do a spurious run in the future. This is mostly a
    benign/theoretical issue, but it makes the following change easier to
    justify.
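
    A hedged sketch of the flush path after the change (abridged from the
    ctx iterator):

    spin_lock(&ctx->lock);
    list_splice_tail_init(&ctx->rq_list, list);
    /* Clear the pending bit before dropping the lock, so a concurrent
     * insert can't have its newly set bit wiped out. */
    sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
    spin_unlock(&ctx->lock);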

    Signed-off-by: Omar Sandoval
    Acked-by: Tejun Heo
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

25 Feb, 2018

1 commit

  • __blk_mq_requeue_request() covers two cases:

    - one is that the requeued request is added to hctx->dispatch, such as
    in blk_mq_dispatch_rq_list()

    - the other is that the request is requeued to the io scheduler, such as
    in blk_mq_requeue_request().

    We should call the io scheduler's .requeue_request callback only for the
    second case.
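
    A sketch of the scheduler notification that should only fire in that
    second case (close to blk_mq_sched_requeue_request() of that era):

    static inline void blk_mq_sched_requeue_request(struct request *rq)
    {
            struct request_queue *q = rq->q;
            struct elevator_queue *e = q->elevator;

            /* Only requests going back to the scheduler need the callback. */
            if (e && e->type->ops.mq.requeue_request)
                    e->type->ops.mq.requeue_request(rq);
    }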

    Cc: Paolo Valente
    Cc: Omar Sandoval
    Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers")
    Cc: stable@vger.kernel.org
    Reviewed-by: Bart Van Assche
    Acked-by: Paolo Valente
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     

14 Feb, 2018

1 commit

  • This removes the dependency on interrupts to wake up the task. While
    polling for IO completion, set the task state to TASK_RUNNING if
    need_resched() returns true.
    Earlier, the polling task used to sleep, relying on an interrupt to wake
    it up. This made some IO take very long when interrupt coalescing is
    enabled in NVMe.

    Reference:
    http://lists.infradead.org/pipermail/linux-nvme/2018-February/015435.html
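
    A hedged sketch of the polling-loop shape after the change (the
    io_completed() helper is illustrative; the real loop lives in blk_poll()):

    static bool classic_poll_sketch(struct request_queue *q, blk_qc_t cookie)
    {
            while (!need_resched()) {
                    if (io_completed(q, cookie))    /* illustrative */
                            return true;
                    cpu_relax();
            }
            /* Stay runnable; don't rely on an IRQ to wake the poller. */
            __set_current_state(TASK_RUNNING);
            return false;
    }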

    Changes since v2->v3:
    -using __set_current_state() instead of set_current_state()

    Changes since v1->v2:
    -setting task state once in blk_poll, instead of multiple
    callers.

    Signed-off-by: Nitesh Shetty
    Signed-off-by: Jens Axboe

    Nitesh Shetty
     

31 Jan, 2018

1 commit

  • This status is returned from the driver to the block layer if a
    device-related resource is unavailable, but the driver can guarantee
    that IO dispatch will be triggered in the future when the resource
    becomes available.

    Convert some drivers to return BLK_STS_DEV_RESOURCE. Also, if the driver
    returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun the queue after
    a delay (BLK_MQ_DELAY_QUEUE) to avoid IO stalls. BLK_MQ_DELAY_QUEUE is
    3 ms because both scsi-mq and nvmefc are using that magic value.

    If a driver can make sure there is in-flight IO, it is safe to return
    BLK_STS_DEV_RESOURCE because:

    1) If all in-flight IOs complete before examining SCHED_RESTART in
    blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so queue
    is run immediately in this case by blk_mq_dispatch_rq_list();

    2) if there is any in-flight IO after/when examining SCHED_RESTART
    in blk_mq_dispatch_rq_list():
    - if SCHED_RESTART isn't set, queue is run immediately as handled in 1)
    - otherwise, this request will be dispatched after any in-flight IO is
    completed via blk_mq_sched_restart()

    3) if SCHED_RESTART is set concurrently in this context because of
    BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() will cover the above two
    cases and make sure any IO hang is avoided.

    One invariant is that the queue will be rerun if SCHED_RESTART is set.
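
    A hedged sketch of how a driver might choose between the two statuses
    (the foo_* names are invented for illustration):

    static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
                                     const struct blk_mq_queue_data *bd)
    {
            if (!foo_reserve_resource(hctx)) {      /* hypothetical */
                    /* Safe only because in-flight IO is guaranteed to
                     * rerun the queue when it completes. */
                    return BLK_STS_DEV_RESOURCE;
            }
            return foo_issue(hctx, bd->rq);         /* hypothetical */
    }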

    Suggested-by: Jens Axboe
    Tested-by: Laurence Oberman
    Signed-off-by: Ming Lei
    Signed-off-by: Mike Snitzer
    Signed-off-by: Jens Axboe

    Ming Lei
     

30 Jan, 2018

1 commit

  • Pull block updates from Jens Axboe:
    "This is the main pull request for block IO related changes for the
    4.16 kernel. Nothing major in this pull request, but a good amount of
    improvements and fixes all over the map. This contains:

    - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
    Paolo.

    - Support for SMR zones for deadline and mq-deadline from Damien and
    Christoph.

    - Set of fixes for bcache by way of Michael Lyle, including fixes
    from himself, Kent, Rui, Tang, and Coly.

    - Series from Matias for lightnvm with fixes from Hans Holmberg,
    Javier, and Matias. Mostly centered around pblk, and removing
    rrpc 1.2 in preparation for supporting 2.0.

    - A couple of NVMe pull requests from Christoph. Nothing major in
    here, just fixes and cleanups, and support for command tracing from
    Johannes.

    - Support for blk-throttle for tracking reads and writes separately.
    From Joseph Qi. A few cleanups/fixes also for blk-throttle from
    Weiping.

    - Series from Mike Snitzer that enables dm to register its queue more
    logically, something that's always been problematic on dm since
    it's a stacked device.

    - Series from Ming cleaning up some of the bio accessor use, in
    preparation for supporting multipage bvecs.

    - Various fixes from Ming closing up holes around queue mapping and
    quiescing.

    - BSD partition fix from Richard Narron, fixing a problem where we
    can't mount newer (10/11) FreeBSD partitions.

    - Series from Tejun reworking blk-mq timeout handling. The previous
    scheme relied on atomic bits, but it had races where we would think
    a request had timed out if it got reused at the wrong time.

    - null_blk now supports faking timeouts, to enable us to better
    exercise and test that functionality separately. From me.

    - Kill the separate atomic poll bit in the request struct. After
    this, we don't use the atomic bits on blk-mq anymore at all. From
    me.

    - sgl_alloc/free helpers from Bart.

    - Heavily contended tag case scalability improvement from me.

    - Various little fixes and cleanups from Arnd, Bart, Corentin,
    Douglas, Eryu, Goldwyn, and myself"

    * 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
    block: remove smart1,2.h
    nvme: add tracepoint for nvme_complete_rq
    nvme: add tracepoint for nvme_setup_cmd
    nvme-pci: introduce RECONNECTING state to mark initializing procedure
    nvme-rdma: remove redundant boolean for inline_data
    nvme: don't free uuid pointer before printing it
    nvme-pci: Suspend queues after deleting them
    bsg: use pr_debug instead of hand crafted macros
    blk-mq-debugfs: don't allow write on attributes with seq_operations set
    nvme-pci: Fix queue double allocations
    block: Set BIO_TRACE_COMPLETION on new bio during split
    blk-throttle: use queue_is_rq_based
    block: Remove kblockd_schedule_delayed_work{,_on}()
    blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
    blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
    lib/scatterlist: Fix chaining support in sgl_alloc_order()
    blk-throttle: track read and write request individually
    block: add bdev_read_only() checks to common helpers
    block: fail op_is_write() requests to read-only partitions
    blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
    ...

    Linus Torvalds
     

20 Jan, 2018

2 commits


18 Jan, 2018

1 commit

  • If we run into blk_mq_request_direct_issue() when the queue is busy, we
    don't want to dispatch this request into hctx->dispatch_list; what we
    need to do instead is return the queue-busy info to the caller, so that
    the caller can deal with it properly.
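
    A hedged sketch of the intended behavior (abridged; bypass_insert
    marks the direct-issue path used by dm-rq):

    ret = q->mq_ops->queue_rq(hctx, &bd);
    if (ret == BLK_STS_RESOURCE) {
            if (bypass_insert)
                    return BLK_STS_RESOURCE;    /* let the caller handle busy */
            blk_mq_request_bypass_insert(rq, true);  /* normal path: requeue */
    }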

    Fixes: 396eaf21ee ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback")
    Reported-by: Laurence Oberman
    Reviewed-by: Mike Snitzer
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei