13 Jun, 2018

1 commit

  • The kzalloc_node() function has a 2-factor argument form, kcalloc_node(). This
    patch replaces cases of:

    kzalloc_node(a * b, gfp, node)

    with:

    kcalloc_node(a, b, gfp, node)

    as well as handling cases of:

    kzalloc_node(a * b * c, gfp, node)

    with:

    kzalloc_node(array3_size(a, b, c), gfp, node)

    as it's slightly less ugly than:

    kcalloc_node(array_size(a, b), c, gfp, node)

    This does, however, attempt to ignore constant size factors like:

    kzalloc_node(4 * 1024, gfp, node)

    though any constants defined via macros get caught up in the conversion.

    Any factors with a sizeof() of "unsigned char", "char", and "u8" were
    dropped, since they're redundant.
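
    As a minimal illustration of the 2-factor conversion (the struct and
    variable names here are invented for the example):

    struct foo **tbl;

    /* Before: the open-coded multiplication can overflow silently. */
    tbl = kzalloc_node(sizeof(struct foo *) * count, GFP_KERNEL, node);

    /* After: kcalloc_node() checks count * size for overflow and
     * returns NULL rather than allocating an undersized buffer. */
    tbl = kcalloc_node(count, sizeof(struct foo *), GFP_KERNEL, node);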

    The Coccinelle script used for this was:

    // Fix redundant parens around sizeof().
    @@
    type TYPE;
    expression THING, E;
    @@

    (
    kzalloc_node(
    - (sizeof(TYPE)) * E
    + sizeof(TYPE) * E
    , ...)
    |
    kzalloc_node(
    - (sizeof(THING)) * E
    + sizeof(THING) * E
    , ...)
    )

    // Drop single-byte sizes and redundant parens.
    @@
    expression COUNT;
    typedef u8;
    typedef __u8;
    @@

    (
    kzalloc_node(
    - sizeof(u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(__u8) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(unsigned char) * (COUNT)
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(__u8) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(char) * COUNT
    + COUNT
    , ...)
    |
    kzalloc_node(
    - sizeof(unsigned char) * COUNT
    + COUNT
    , ...)
    )

    // 2-factor product with sizeof(type/expression) and identifier or constant.
    @@
    type TYPE;
    expression THING;
    identifier COUNT_ID;
    constant COUNT_CONST;
    @@

    (
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (COUNT_ID)
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * COUNT_ID
    + COUNT_ID, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (COUNT_CONST)
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * COUNT_CONST
    + COUNT_CONST, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (COUNT_ID)
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * COUNT_ID
    + COUNT_ID, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (COUNT_CONST)
    + COUNT_CONST, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * COUNT_CONST
    + COUNT_CONST, sizeof(THING)
    , ...)
    )

    // 2-factor product, only identifiers.
    @@
    identifier SIZE, COUNT;
    @@

    - kzalloc_node
    + kcalloc_node
    (
    - SIZE * COUNT
    + COUNT, SIZE
    , ...)

    // 3-factor product with 1 sizeof(type) or sizeof(expression), with
    // redundant parens removed.
    @@
    expression THING;
    identifier STRIDE, COUNT;
    type TYPE;
    @@

    (
    kzalloc_node(
    - sizeof(TYPE) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(TYPE))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * (COUNT) * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * (COUNT) * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * COUNT * (STRIDE)
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING) * COUNT * STRIDE
    + array3_size(COUNT, STRIDE, sizeof(THING))
    , ...)
    )

    // 3-factor product with 2 sizeof(variable), with redundant parens removed.
    @@
    expression THING1, THING2;
    identifier COUNT;
    type TYPE1, TYPE2;
    @@

    (
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(TYPE2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(TYPE2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(THING1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(THING1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * COUNT
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    |
    kzalloc_node(
    - sizeof(TYPE1) * sizeof(THING2) * (COUNT)
    + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
    , ...)
    )

    // 3-factor product, only identifiers, with redundant parens removed.
    @@
    identifier STRIDE, SIZE, COUNT;
    @@

    (
    kzalloc_node(
    - (COUNT) * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * (STRIDE) * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * STRIDE * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - (COUNT) * (STRIDE) * (SIZE)
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    |
    kzalloc_node(
    - COUNT * STRIDE * SIZE
    + array3_size(COUNT, STRIDE, SIZE)
    , ...)
    )

    // Any remaining multi-factor products, first at least 3-factor products,
    // when they're not all constants...
    @@
    expression E1, E2, E3;
    constant C1, C2, C3;
    @@

    (
    kzalloc_node(C1 * C2 * C3, ...)
    |
    kzalloc_node(
    - (E1) * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - (E1) * (E2) * E3
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - (E1) * (E2) * (E3)
    + array3_size(E1, E2, E3)
    , ...)
    |
    kzalloc_node(
    - E1 * E2 * E3
    + array3_size(E1, E2, E3)
    , ...)
    )

    // And then all remaining 2 factors products when they're not all constants,
    // keeping sizeof() as the second factor argument.
    @@
    expression THING, E1, E2;
    type TYPE;
    constant C1, C2, C3;
    @@

    (
    kzalloc_node(sizeof(THING) * C2, ...)
    |
    kzalloc_node(sizeof(TYPE) * C2, ...)
    |
    kzalloc_node(C1 * C2 * C3, ...)
    |
    kzalloc_node(C1 * C2, ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * (E2)
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(TYPE) * E2
    + E2, sizeof(TYPE)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * (E2)
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - sizeof(THING) * E2
    + E2, sizeof(THING)
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - (E1) * E2
    + E1, E2
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - (E1) * (E2)
    + E1, E2
    , ...)
    |
    - kzalloc_node
    + kcalloc_node
    (
    - E1 * E2
    + E1, E2
    , ...)
    )

    Signed-off-by: Kees Cook

    Kees Cook
     

05 Jun, 2018

1 commit

  • If a hardware queue is stopped, it should not be run again before being
    explicitly started. Ignore stopped queues in blk_mq_run_work_fn(),
    fixing a regression recently introduced when the START_ON_RUN bit
    was removed.
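
    A sketch of the resulting guard (the handler is abridged; only the
    early return is the point here):

    static void blk_mq_run_work_fn(struct work_struct *work)
    {
            struct blk_mq_hw_ctx *hctx =
                    container_of(work, struct blk_mq_hw_ctx, run_work.work);

            /* A stopped hctx must stay idle until explicitly restarted. */
            if (blk_mq_hctx_stopped(hctx))
                    return;

            __blk_mq_run_hw_queue(hctx);
    }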

    Fixes: 15fe8a90bb45 ("blk-mq: remove blk_mq_delay_queue()")
    Reviewed-by: Ming Lei
    Reviewed-by: Bart Van Assche
    Signed-off-by: Jianchao Wang
    Signed-off-by: Jens Axboe

    Jianchao Wang
     

01 Jun, 2018

2 commits


29 May, 2018

5 commits

  • Signed-off-by: Christoph Hellwig
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Jens Axboe

    Christoph Hellwig
     
  • Signed-off-by: Christoph Hellwig
    Reviewed-by: Hannes Reinecke
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Jens Axboe

    Christoph Hellwig
     
  • BLK_EH_NOT_HANDLED implies that nothing happened, but very often that
    is not what is happening - instead the driver has already completed the
    command. Fix the symbolic name to reflect that a little better.

    Signed-off-by: Christoph Hellwig
    Reviewed-by: Hannes Reinecke
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Jens Axboe

    Christoph Hellwig
     
  • This patch simplifies the timeout handling by relying on the request
    reference counting to ensure the iterator is operating on an inflight
    and truly timed out request. Since the reference counting prevents the
    tag from being reallocated, the block layer no longer needs to prevent
    drivers from completing their requests while the timeout handler is
    operating on it: a driver completing a request is allowed to proceed to
    the next state without additional synchronization with the block layer.

    This also removes any need for generation sequence numbers, since the
    request can no longer be reallocated as a new instance while timeout
    handling is operating on it.

    To enable this, a refcount is added to struct request so that request
    users can be sure they're operating on the same request without it
    changing while they're processing it. The request's tag won't be
    released for reuse until both the timeout handler and the completion
    are done with it.
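
    A hedged sketch of the scheme (the iterator shape and the helper names
    are illustrative, not the exact kernel code):

    static void timeout_iter_sketch(struct request *rq)
    {
            /* Take a reference so the tag can't be recycled under us. */
            if (!refcount_inc_not_zero(&rq->ref))
                    return;

            if (rq_timed_out(rq))                   /* illustrative check */
                    handle_timed_out_rq(rq);        /* illustrative */

            /* The last reference frees the request and releases the tag. */
            if (refcount_dec_and_test(&rq->ref))
                    __blk_mq_free_request(rq);
    }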

    Signed-off-by: Keith Busch
    [hch: slight cleanups, added back submission side hctx lock, use cmpxchg
    for completions]
    Signed-off-by: Christoph Hellwig
    Signed-off-by: Jens Axboe

    Keith Busch
     
  • The block layer had been setting the state to in-flight prior to updating
    the timer. This is the wrong order since the timeout handler could observe
    the in-flight state with the older timeout, believing the request had
    expired when in fact it is just getting started.
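
    A sketch of the corrected order in blk_mq_start_request() (simplified;
    the state store follows the blk-mq code of that era):

    /* Arm the timer before publishing the in-flight state, so the
     * timeout handler can never see an in-flight request paired with
     * a stale timer. */
    blk_add_timer(rq);
    WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);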

    Signed-off-by: Keith Busch
    Reviewed-by: Hannes Reinecke
    Signed-off-by: Jens Axboe

    Keith Busch
     

22 May, 2018

1 commit


18 May, 2018

1 commit

  • When the number of hardware queues is changed, the drivers will call
    blk_mq_update_nr_hw_queues() to remap hardware queues. This changes
    the ctx mappings, but the current code doesn't clear the
    ->dispatch_from hint. This can result in dispatch_from pointing to
    a ctx that isn't mapped to the hctx anymore.

    Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue")
    Signed-off-by: huhai
    Reviewed-by: Ming Lei

    Moved the placement of the clearing to where we clear other items
    pertaining to the existing mapping, added Fixes line, and reworded
    the commit message.

    Signed-off-by: Jens Axboe

    huhai
     

16 May, 2018

1 commit


11 May, 2018

1 commit


09 May, 2018

4 commits

  • Currently, struct request has four timestamp fields:

    - A start time, set at get_request time, in jiffies, used for iostats
    - An I/O start time, set at start_request time, in ktime nanoseconds,
    used for blk-stats (i.e., wbt, kyber, hybrid polling)
    - Another start time and another I/O start time, used for cfq and bfq

    These can all be consolidated into one start time and one I/O start
    time, both in ktime nanoseconds, shaving off up to 16 bytes from struct
    request depending on the kernel config.
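
    A sketch of the resulting layout (heavily abridged; only the two
    surviving timestamps are shown):

    struct request {
            /* ... */
            u64 start_time_ns;      /* set at get_request time, for iostats */
            u64 io_start_time_ns;   /* set at start_request time, for blk-stats */
            /* ... */
    };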

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • We want this next to blk_account_io_done() for the next change so that
    we can call ktime_get() only once for both.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • struct blk_issue_stat squashes three things into one u64:

    - The time the driver started working on a request
    - The original size of the request (for the io.low controller)
    - Flags for writeback throttling

    It turns out that on x86_64, we have a 4 byte hole in struct request
    which we can fill with the non-timestamp fields from blk_issue_stat,
    simplifying things quite a bit.
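
    A hedged sketch of where the non-timestamp fields land (abridged; the
    field types follow the patch's intent rather than a verbatim copy):

    struct request {
            /* ... */
            unsigned short wbt_flags;       /* writeback throttling flags */
            unsigned short throtl_size;     /* original size, for io.low */
            /* ... */
    };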

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • issue_stat is going to go away, so first make writeback throttling take
    the containing request, update the internal wbt helpers accordingly, and
    change rwb->sync_cookie to be the request pointer instead of the
    issue_stat pointer. No functional change.

    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

26 Apr, 2018

2 commits

  • When the blk-mq inflight implementation was added, /proc/diskstats was
    converted to use it, but /sys/block/$dev/inflight was not. Fix it by
    adding another helper to count in-flight requests by data direction.
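
    A hedged sketch of the legacy-path flavor of such a helper (blk-mq
    walks the tag sets instead; names only approximate the patch):

    void part_in_flight_rw_sketch(struct hd_struct *part,
                                  unsigned int inflight[2])
    {
            /* Index 0 counts reads, index 1 counts writes. */
            inflight[0] = atomic_read(&part->in_flight[0]);
            inflight[1] = atomic_read(&part->in_flight[1]);
    }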

    Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant")
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     
  • In the legacy block case, we increment the counter right after we
    allocate the request, not when the driver handles it. In both the legacy
    and blk-mq cases, part_inc_in_flight() is called from
    blk_account_io_start() right after we've allocated the request. blk-mq
    only considers started requests as inflight, but this is
    inconsistent with the legacy definition and the intention in the code.
    This removes the started condition and instead counts all allocated
    requests.

    Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant")
    Signed-off-by: Omar Sandoval
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

25 Apr, 2018

1 commit

  • This reverts commit 37c7c6c76d431dd7ef9c29d95f6052bd425f004c.

    It turns out that some drivers (mostly FC drivers) may not use managed
    IRQ affinity and meanwhile have their own customized .map_queues, so
    keep this code to avoid regressions.

    Reported-by: Laurence Oberman
    Tested-by: Laurence Oberman
    Tested-by: Christian Borntraeger
    Tested-by: Stefan Haberland
    Cc: Ewan Milne
    Cc: Christoph Hellwig
    Cc: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     

17 Apr, 2018

1 commit

  • rq->gstate and rq->aborted_gstate are both zero before rqs are
    allocated. If we have a small timeout, then when the timer fires,
    there could be rqs that were never allocated, and there could also
    be rqs that have been allocated but not yet initialized and started.
    At that moment, rq->gstate and rq->aborted_gstate are both still 0,
    so blk_mq_terminate_expired will decide the rq has timed out and
    invoke .timeout early.

    For scsi, this causes scsi_times_out to be invoked before the
    scsi_cmnd is initialized; scsi_cmnd->device is still NULL at that
    point, and we crash.

    Cc: Bart Van Assche
    Cc: Tejun Heo
    Cc: Ming Lei
    Cc: Martin Steigerwald
    Cc: stable@vger.kernel.org
    Signed-off-by: Jianchao Wang
    Signed-off-by: Jens Axboe

    Jianchao Wang
     

10 Apr, 2018

7 commits

  • Firstly, since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"),
    blk-mq doesn't remap queues any more after the CPU topology changes.

    Secondly, set->nr_hw_queues can't be bigger than nr_cpu_ids, and now we map
    all possible CPUs to hw queues, so at least one CPU is mapped to each hctx.

    So the queue mapping has become static and fixed, just like a percpu variable,
    and we don't need to handle queue remapping any more.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • There are several reasons for removing the check:

    1) blk_mq_hw_queue_mapped() now always returns true, since each hctx
    is mapped by at least one CPU

    2) when there isn't any online CPU mapped to this hctx, there won't
    be any IO queued to it; blk_mq_run_hw_queue() only runs the queue
    if there is IO queued to the hctx

    3) if __blk_mq_delay_run_hw_queue() is called via blk_mq_delay_run_hw_queue(),
    it is run from blk_mq_dispatch_rq_list() or scsi_mq_get_budget(), so
    the hctx to be handled has to be mapped.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • No driver uses this interface any more, so remove it.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • This patch introduces a helper, blk_mq_hw_queue_first_cpu(), for
    figuring out the hctx's first CPU, avoiding code duplication.
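
    A sketch of what the helper boils down to (simplified):

    static int blk_mq_hw_queue_first_cpu(struct blk_mq_hw_ctx *hctx)
    {
            int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

            /* Fall back to any mapped CPU if none of them is online. */
            if (cpu >= nr_cpu_ids)
                    cpu = cpumask_first(hctx->cpumask);
            return cpu;
    }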

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • This patch figures out the final selected CPU first and then writes
    it to hctx->next_cpu once, so that no intermediate next-cpu value can
    be observed from other dispatch paths.
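
    A hedged sketch of the pattern (abridged from the selection logic):

    int next_cpu = hctx->next_cpu;

    /* Work on a local copy while searching for the next online CPU... */
    next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask);
    if (next_cpu >= nr_cpu_ids)
            next_cpu = blk_mq_hw_queue_first_cpu(hctx);

    /* ...and publish the final choice with a single store. */
    hctx->next_cpu = next_cpu;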

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • Since commit 20e4d81393196 ("blk-mq: simplify queue mapping & schedule
    with each possisble CPU"), a hctx may be mapped to CPUs that are all
    offline, and then hctx->next_cpu can be set wrongly.

    This patch fixes the issue by making hctx->next_cpu point to the
    first CPU in hctx->cpumask if all CPUs in hctx->cpumask are offline.

    Cc: Stefan Haberland
    Tested-by: Christian Borntraeger
    Reviewed-by: Christoph Hellwig
    Reviewed-by: Sagi Grimberg
    Fixes: 20e4d81393196 ("blk-mq: simplify queue mapping & schedule with each possisble CPU")
    Cc: stable@vger.kernel.org
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     
  • This patch orders getting the budget and the driver tag by making sure
    that the driver tag is acquired only after the budget has been obtained,
    which helps avoid the following race:

    1) before dispatching a request from the scheduler queue, one budget is
    acquired first, then a request is dequeued, call it request A.

    2) in another IO path, request B from hctx->dispatch is being dispatched:
    a driver tag is acquired, then blk_mq_dispatch_rq_list() tries to get the
    budget, but unfortunately the budget is held by request A.

    3) meanwhile blk_mq_dispatch_rq_list() is called to dispatch request
    A, and tries to get a driver tag first, but unfortunately no driver tag
    is available because the driver tag is held by request B.

    4) neither IO path can make progress, and an IO stall results.

    This issue can be observed when running dbench on USB storage.

    This patch fixes the issue by always getting the budget before getting the
    driver tag.
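
    A hedged sketch of the resulting order in the dispatch loop (abridged;
    waiting and error handling are omitted, and the signatures follow the
    blk-mq code of that era):

    if (!blk_mq_get_dispatch_budget(hctx))
            break;

    if (!blk_mq_get_driver_tag(rq, NULL, false)) {
            /* Don't sit on the budget while the tag is unavailable. */
            blk_mq_put_dispatch_budget(hctx);
            break;
    }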

    Cc: stable@vger.kernel.org
    Fixes: de1482974080ec9e ("blk-mq: introduce .get_budget and .put_budget in blk_mq_ops")
    Cc: Christoph Hellwig
    Cc: Bart Van Assche
    Cc: Omar Sandoval
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     

09 Mar, 2018

3 commits

  • Since the queue flags may be changed concurrently from multiple
    contexts after a queue becomes visible in sysfs, make these changes
    safe by protecting them with the queue lock.

    Cc: Christoph Hellwig
    Cc: Hannes Reinecke
    Cc: Ming Lei
    Reviewed-by: Martin K. Petersen
    Reviewed-by: Johannes Thumshirn
    Signed-off-by: Bart Van Assche
    Signed-off-by: Jens Axboe

    Bart Van Assche
     
  • Introduce functions that modify the queue flags and that protect
    these modifications with the request queue lock. Except for moving
    one wake_up_all() call from inside to outside a critical section,
    this patch does not change any functionality.
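
    One of the introduced helpers, sketched (this mirrors the intent of
    blk_queue_flag_set() as added by the patch):

    void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
    {
            unsigned long flags;

            spin_lock_irqsave(q->queue_lock, flags);
            queue_flag_set(flag, q);
            spin_unlock_irqrestore(q->queue_lock, flags);
    }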

    Cc: Christoph Hellwig
    Cc: Hannes Reinecke
    Cc: Ming Lei
    Reviewed-by: Johannes Thumshirn
    Reviewed-by: Martin K. Petersen
    Signed-off-by: Bart Van Assche
    Signed-off-by: Jens Axboe

    Bart Van Assche
     
  • Except for changing the atomic queue flag manipulations that are
    protected by the queue lock into non-atomic manipulations, this
    patch does not change any functionality.

    Cc: Christoph Hellwig
    Cc: Hannes Reinecke
    Cc: Ming Lei
    Reviewed-by: Johannes Thumshirn
    Reviewed-by: Martin K. Petersen
    Signed-off-by: Bart Van Assche
    Signed-off-by: Jens Axboe

    Bart Van Assche
     

01 Mar, 2018

2 commits

  • This patch does not change any functionality.

    Signed-off-by: Bart Van Assche
    Reviewed-by: Joseph Qi
    Cc: Christoph Hellwig
    Cc: Philipp Reisner
    Cc: Ulf Hansson
    Cc: Kees Cook
    Signed-off-by: Jens Axboe

    Bart Van Assche
     
  • When we insert a request, we set the software queue pending bit while
    holding the software queue lock. However, we clear it outside of the
    lock, so it's possible that a concurrent insert could reset the bit
    after we clear it but before we empty the request list. Afterwards, the
    bit would still be set but the software queue wouldn't have any requests
    in it, leading us to do a spurious run in the future. This is mostly a
    benign/theoretical issue, but it makes the following change easier to
    justify.
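
    A hedged sketch of the flush path after the change (abridged from the
    ctx iterator):

    spin_lock(&ctx->lock);
    list_splice_tail_init(&ctx->rq_list, list);
    /* Clear the pending bit before dropping the lock, so a concurrent
     * insert can't have its newly set bit wiped out. */
    sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
    spin_unlock(&ctx->lock);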

    Signed-off-by: Omar Sandoval
    Acked-by: Tejun Heo
    Signed-off-by: Jens Axboe

    Omar Sandoval
     

25 Feb, 2018

1 commit

  • __blk_mq_requeue_request() covers two cases:

    - one is that the requeued request is added to hctx->dispatch, such as
    in blk_mq_dispatch_rq_list()

    - the other is that the request is requeued to the io scheduler, such as
    in blk_mq_requeue_request().

    We should call the io scheduler's .requeue_request callback only for the
    second case.
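
    A sketch of the scheduler notification that should only fire in that
    second case (close to blk_mq_sched_requeue_request() of that era):

    static inline void blk_mq_sched_requeue_request(struct request *rq)
    {
            struct request_queue *q = rq->q;
            struct elevator_queue *e = q->elevator;

            /* Only requests going back to the scheduler need the callback. */
            if (e && e->type->ops.mq.requeue_request)
                    e->type->ops.mq.requeue_request(rq);
    }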

    Cc: Paolo Valente
    Cc: Omar Sandoval
    Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers")
    Cc: stable@vger.kernel.org
    Reviewed-by: Bart Van Assche
    Acked-by: Paolo Valente
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei
     

14 Feb, 2018

1 commit

  • This removes the dependency on interrupts to wake up the task. While
    polling for IO completion, set the task state to TASK_RUNNING if
    need_resched() returns true.
    Earlier, the polling task used to sleep, relying on an interrupt to wake
    it up. This made some IO take very long when interrupt coalescing is
    enabled in NVMe.

    Reference:
    http://lists.infradead.org/pipermail/linux-nvme/2018-February/015435.html
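
    A hedged sketch of the polling-loop shape after the change (the
    io_completed() helper is illustrative; the real loop lives in blk_poll()):

    static bool classic_poll_sketch(struct request_queue *q, blk_qc_t cookie)
    {
            while (!need_resched()) {
                    if (io_completed(q, cookie))    /* illustrative */
                            return true;
                    cpu_relax();
            }
            /* Stay runnable; don't rely on an IRQ to wake the poller. */
            __set_current_state(TASK_RUNNING);
            return false;
    }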

    Changes since v2->v3:
    -using __set_current_state() instead of set_current_state()

    Changes since v1->v2:
    -setting task state once in blk_poll, instead of multiple
    callers.

    Signed-off-by: Nitesh Shetty
    Signed-off-by: Jens Axboe

    Nitesh Shetty
     

31 Jan, 2018

1 commit

  • This status is returned from the driver to the block layer if a
    device-related resource is unavailable, but the driver can guarantee
    that IO dispatch will be triggered in the future when the resource
    becomes available.

    Convert some drivers to return BLK_STS_DEV_RESOURCE. Also, if the driver
    returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun the queue after
    a delay (BLK_MQ_DELAY_QUEUE) to avoid IO stalls. BLK_MQ_DELAY_QUEUE is
    3 ms because both scsi-mq and nvmefc are using that magic value.

    If a driver can make sure there is in-flight IO, it is safe to return
    BLK_STS_DEV_RESOURCE because:

    1) If all in-flight IOs complete before examining SCHED_RESTART in
    blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so queue
    is run immediately in this case by blk_mq_dispatch_rq_list();

    2) if there is any in-flight IO after/when examining SCHED_RESTART
    in blk_mq_dispatch_rq_list():
    - if SCHED_RESTART isn't set, queue is run immediately as handled in 1)
    - otherwise, this request will be dispatched after any in-flight IO is
    completed via blk_mq_sched_restart()

    3) if SCHED_RESTART is set concurrently in this context because of
    BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() will cover the above two
    cases and make sure any IO hang is avoided.

    One invariant is that the queue will be rerun if SCHED_RESTART is set.
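
    A hedged sketch of how a driver might choose between the two statuses
    (the foo_* names are invented for illustration):

    static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
                                     const struct blk_mq_queue_data *bd)
    {
            if (!foo_reserve_resource(hctx)) {      /* hypothetical */
                    /* Safe only because in-flight IO is guaranteed to
                     * rerun the queue when it completes. */
                    return BLK_STS_DEV_RESOURCE;
            }
            return foo_issue(hctx, bd->rq);         /* hypothetical */
    }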

    Suggested-by: Jens Axboe
    Tested-by: Laurence Oberman
    Signed-off-by: Ming Lei
    Signed-off-by: Mike Snitzer
    Signed-off-by: Jens Axboe

    Ming Lei
     

30 Jan, 2018

1 commit

  • Pull block updates from Jens Axboe:
    "This is the main pull request for block IO related changes for the
    4.16 kernel. Nothing major in this pull request, but a good amount of
    improvements and fixes all over the map. This contains:

    - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
    Paolo.

    - Support for SMR zones for deadline and mq-deadline from Damien and
    Christoph.

    - Set of fixes for bcache by way of Michael Lyle, including fixes
    from himself, Kent, Rui, Tang, and Coly.

    - Series from Matias for lightnvm with fixes from Hans Holmberg,
    Javier, and Matias. Mostly centered around pblk, and removing
    rrpc 1.2 in preparation for supporting 2.0.

    - A couple of NVMe pull requests from Christoph. Nothing major in
    here, just fixes and cleanups, and support for command tracing from
    Johannes.

    - Support for blk-throttle for tracking reads and writes separately.
    From Joseph Qi. A few cleanups/fixes also for blk-throttle from
    Weiping.

    - Series from Mike Snitzer that enables dm to register its queue more
    logically, something that's always been problematic on dm since
    it's a stacked device.

    - Series from Ming cleaning up some of the bio accessor use, in
    preparation for supporting multipage bvecs.

    - Various fixes from Ming closing up holes around queue mapping and
    quiescing.

    - BSD partition fix from Richard Narron, fixing a problem where we
    can't mount newer (10/11) FreeBSD partitions.

    - Series from Tejun reworking blk-mq timeout handling. The previous
    scheme relied on atomic bits, but it had races where we would think
    a request had timed out if it got reused at the wrong time.

    - null_blk now supports faking timeouts, to enable us to better
    exercise and test that functionality separately. From me.

    - Kill the separate atomic poll bit in the request struct. After
    this, we don't use the atomic bits on blk-mq anymore at all. From
    me.

    - sgl_alloc/free helpers from Bart.

    - Heavily contended tag case scalability improvement from me.

    - Various little fixes and cleanups from Arnd, Bart, Corentin,
    Douglas, Eryu, Goldwyn, and myself"

    * 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
    block: remove smart1,2.h
    nvme: add tracepoint for nvme_complete_rq
    nvme: add tracepoint for nvme_setup_cmd
    nvme-pci: introduce RECONNECTING state to mark initializing procedure
    nvme-rdma: remove redundant boolean for inline_data
    nvme: don't free uuid pointer before printing it
    nvme-pci: Suspend queues after deleting them
    bsg: use pr_debug instead of hand crafted macros
    blk-mq-debugfs: don't allow write on attributes with seq_operations set
    nvme-pci: Fix queue double allocations
    block: Set BIO_TRACE_COMPLETION on new bio during split
    blk-throttle: use queue_is_rq_based
    block: Remove kblockd_schedule_delayed_work{,_on}()
    blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
    blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
    lib/scatterlist: Fix chaining support in sgl_alloc_order()
    blk-throttle: track read and write request individually
    block: add bdev_read_only() checks to common helpers
    block: fail op_is_write() requests to read-only partitions
    blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
    ...

    Linus Torvalds
     

20 Jan, 2018

2 commits


18 Jan, 2018

1 commit

  • If we run into blk_mq_request_direct_issue() when the queue is busy, we
    don't want to dispatch this request into hctx->dispatch_list; what we
    need to do instead is return the queue-busy info to the caller, so that
    the caller can deal with it properly.
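
    A hedged sketch of the intended behavior (abridged; bypass_insert
    marks the direct-issue path used by dm-rq):

    ret = q->mq_ops->queue_rq(hctx, &bd);
    if (ret == BLK_STS_RESOURCE) {
            if (bypass_insert)
                    return BLK_STS_RESOURCE;    /* let the caller handle busy */
            blk_mq_request_bypass_insert(rq, true);  /* normal path: requeue */
    }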

    Fixes: 396eaf21ee ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback")
    Reported-by: Laurence Oberman
    Reviewed-by: Mike Snitzer
    Signed-off-by: Ming Lei
    Signed-off-by: Jens Axboe

    Ming Lei