Commit fb2dce862d9f9a68e6b9374579056ec9eca02a63

Authored by David Woodhouse
Committed by Jens Axboe
1 parent d628eaef31

Add 'discard' request handling

Some block devices benefit from a hint that they can forget the contents
of certain sectors. Add basic support for this to the block core, along
with a 'blkdev_issue_discard()' helper function which issues such
requests.

The caller doesn't get to provide an end_io function, since
blkdev_issue_discard() will automatically split the request up into
multiple bios if appropriate. Neither does the function wait for
completion -- it's expected that callers won't care about when, or even
_if_, the request completes. It's only a hint to the device anyway. By
definition, the file system doesn't _care_ about these sectors any more.
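
To make the intended usage concrete (an illustration only, not text from this
commit; 'bdev', 'start_sector' and 'nr_sectors' are hypothetical names), a
filesystem that has just freed an extent could issue the hint like this:

        int err = blkdev_issue_discard(bdev, start_sector, nr_sectors);
        if (err == -EOPNOTSUPP)
                err = 0;        /* no prepare_discard_fn registered; the hint is simply dropped */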

[With feedback from OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> and
Jens Axboe <jens.axboe@oracle.com>]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 6 changed files with 130 additions and 11 deletions

block/blk-barrier.c
... ... @@ -315,4 +315,73 @@
315 315 return ret;
316 316 }
317 317 EXPORT_SYMBOL(blkdev_issue_flush);
  318 +
  319 +static void blkdev_discard_end_io(struct bio *bio, int err)
  320 +{
  321 + if (err) {
  322 + if (err == -EOPNOTSUPP)
  323 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
  324 + clear_bit(BIO_UPTODATE, &bio->bi_flags);
  325 + }
  326 +
  327 + bio_put(bio);
  328 +}
  329 +
  330 +/**
  331 + * blkdev_issue_discard - queue a discard
  332 + * @bdev: blockdev to issue discard for
  333 + * @sector: start sector
  334 + * @nr_sects: number of sectors to discard
  335 + *
  336 + * Description:
  337 + * Issue a discard request for the sectors in question. Does not wait.
  338 + */
  339 +int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
  340 + unsigned nr_sects)
  341 +{
  342 + struct request_queue *q;
  343 + struct bio *bio;
  344 + int ret = 0;
  345 +
  346 + if (bdev->bd_disk == NULL)
  347 + return -ENXIO;
  348 +
  349 + q = bdev_get_queue(bdev);
  350 + if (!q)
  351 + return -ENXIO;
  352 +
  353 + if (!q->prepare_discard_fn)
  354 + return -EOPNOTSUPP;
  355 +
  356 + while (nr_sects && !ret) {
  357 + bio = bio_alloc(GFP_KERNEL, 0);
  358 + if (!bio)
  359 + return -ENOMEM;
  360 +
  361 + bio->bi_end_io = blkdev_discard_end_io;
  362 + bio->bi_bdev = bdev;
  363 +
  364 + bio->bi_sector = sector;
  365 +
  366 + if (nr_sects > q->max_hw_sectors) {
  367 + bio->bi_size = q->max_hw_sectors << 9;
  368 + nr_sects -= q->max_hw_sectors;
  369 + sector += q->max_hw_sectors;
  370 + } else {
  371 + bio->bi_size = nr_sects << 9;
  372 + nr_sects = 0;
  373 + }
  374 + bio_get(bio);
  375 + submit_bio(WRITE_DISCARD, bio);
  376 +
  377 + /* Check if it failed immediately */
  378 + if (bio_flagged(bio, BIO_EOPNOTSUPP))
  379 + ret = -EOPNOTSUPP;
  380 + else if (!bio_flagged(bio, BIO_UPTODATE))
  381 + ret = -EIO;
  382 + bio_put(bio);
  383 + }
  384 + return ret;
  385 +}
  386 +EXPORT_SYMBOL(blkdev_issue_discard);
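
(Worked example, not part of the patch: with q->max_hw_sectors = 1024 and a
call to discard 2500 sectors starting at sector 0, the loop above submits
three bios covering sectors 0-1023, 1024-2047 and 2048-2499, each tagged
WRITE_DISCARD and completed independently through blkdev_discard_end_io().)
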
block/blk-core.c
... ... @@ -1079,6 +1079,10 @@
1079 1079 */
1080 1080 if (unlikely(bio_barrier(bio)))
1081 1081 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
  1082 + if (unlikely(bio_discard(bio))) {
  1083 + req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD);
  1084 + req->q->prepare_discard_fn(req->q, req);
  1085 + }
1082 1086  
1083 1087 if (bio_sync(bio))
1084 1088 req->cmd_flags |= REQ_RW_SYNC;
... ... @@ -1095,7 +1099,7 @@
1095 1099 static int __make_request(struct request_queue *q, struct bio *bio)
1096 1100 {
1097 1101 struct request *req;
1098   - int el_ret, nr_sectors, barrier, err;
  1102 + int el_ret, nr_sectors, barrier, discard, err;
1099 1103 const unsigned short prio = bio_prio(bio);
1100 1104 const int sync = bio_sync(bio);
1101 1105 int rw_flags;
... ... @@ -1115,6 +1119,12 @@
1115 1119 goto end_io;
1116 1120 }
1117 1121  
  1122 + discard = bio_discard(bio);
  1123 + if (unlikely(discard) && !q->prepare_discard_fn) {
  1124 + err = -EOPNOTSUPP;
  1125 + goto end_io;
  1126 + }
  1127 +
1118 1128 spin_lock_irq(q->queue_lock);
1119 1129  
1120 1130 if (unlikely(barrier) || elv_queue_empty(q))
... ... @@ -1405,7 +1415,8 @@
1405 1415  
1406 1416 if (bio_check_eod(bio, nr_sectors))
1407 1417 goto end_io;
1408   - if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
  1418 + if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
  1419 + (bio_discard(bio) && !q->prepare_discard_fn)) {
1409 1420 err = -EOPNOTSUPP;
1410 1421 goto end_io;
1411 1422 }
... ... @@ -1487,7 +1498,6 @@
1487 1498 * go through the normal accounting stuff before submission.
1488 1499 */
1489 1500 if (bio_has_data(bio)) {
1490   -
1491 1501 if (rw & WRITE) {
1492 1502 count_vm_events(PGPGOUT, count);
1493 1503 } else {
... ... @@ -1881,7 +1891,7 @@
1881 1891 struct request_queue *q = rq->q;
1882 1892 unsigned long flags = 0UL;
1883 1893  
1884   - if (bio_has_data(rq->bio)) {
  1894 + if (bio_has_data(rq->bio) || blk_discard_rq(rq)) {
1885 1895 if (__end_that_request_first(rq, error, nr_bytes))
1886 1896 return 1;
1887 1897  
... ... @@ -1939,7 +1949,7 @@
1939 1949 **/
1940 1950 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1941 1951 {
1942   - if (bio_has_data(rq->bio) &&
  1952 + if ((bio_has_data(rq->bio) || blk_discard_rq(rq)) &&
1943 1953 __end_that_request_first(rq, error, nr_bytes))
1944 1954 return 1;
1945 1955  
1946 1956  
... ... @@ -2012,12 +2022,14 @@
2012 2022 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
2013 2023 rq->cmd_flags |= (bio->bi_rw & 3);
2014 2024  
2015   - rq->nr_phys_segments = bio_phys_segments(q, bio);
2016   - rq->nr_hw_segments = bio_hw_segments(q, bio);
  2025 + if (bio_has_data(bio)) {
  2026 + rq->nr_phys_segments = bio_phys_segments(q, bio);
  2027 + rq->nr_hw_segments = bio_hw_segments(q, bio);
  2028 + rq->buffer = bio_data(bio);
  2029 + }
2017 2030 rq->current_nr_sectors = bio_cur_sectors(bio);
2018 2031 rq->hard_cur_sectors = rq->current_nr_sectors;
2019 2032 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
2020   - rq->buffer = bio_data(bio);
2021 2033 rq->data_len = bio->bi_size;
2022 2034  
2023 2035 rq->bio = rq->biotail = bio;
block/blk-settings.c
... ... @@ -33,6 +33,23 @@
33 33 EXPORT_SYMBOL(blk_queue_prep_rq);
34 34  
35 35 /**
  36 + * blk_queue_set_discard - set a discard_sectors function for queue
  37 + * @q: queue
  38 + * @dfn: prepare_discard function
  39 + *
  40 + * It's possible for a queue to register a discard callback which is used
  41 + * to transform a discard request into the appropriate type for the
  42 + * hardware. If none is registered, then discard requests are failed
  43 + * with %EOPNOTSUPP.
  44 + *
  45 + */
  46 +void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
  47 +{
  48 + q->prepare_discard_fn = dfn;
  49 +}
  50 +EXPORT_SYMBOL(blk_queue_set_discard);
  51 +
  52 +/**
36 53 * blk_queue_merge_bvec - set a merge_bvec function for queue
37 54 * @q: queue
38 55 * @mbfn: merge_bvec_fn
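
As a hedged sketch of how a block driver might wire this up (illustrative
only; 'mydev_prepare_discard' is a hypothetical name and its body is a stub,
not code from this commit):

        static int mydev_prepare_discard(struct request_queue *q,
                                         struct request *rq)
        {
                /*
                 * Translate the REQ_DISCARD request into whatever the
                 * hardware expects.  The return value is ignored at the
                 * call site shown in the blk-core.c hunk above.
                 */
                return 0;
        }

        /* During queue setup: */
        blk_queue_set_discard(q, mydev_prepare_discard);
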
include/linux/bio.h
... ... @@ -149,6 +149,8 @@
149 149 * bit 2 -- barrier
150 150 * bit 3 -- fail fast, don't want low level driver retries
151 151 * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  152 + * bit 5 -- metadata request
  153 + * bit 6 -- discard sectors
152 154 */
153 155 #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */
154 156 #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */
... ... @@ -156,6 +158,7 @@
156 158 #define BIO_RW_FAILFAST 3
157 159 #define BIO_RW_SYNC 4
158 160 #define BIO_RW_META 5
  161 +#define BIO_RW_DISCARD 6
159 162  
160 163 /*
161 164 * upper 16 bits of bi_rw define the io priority of this bio
162 165  
... ... @@ -186,13 +189,14 @@
186 189 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
187 190 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
188 191 #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio))
  192 +#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD))
189 193  
190 194 static inline unsigned int bio_cur_sectors(struct bio *bio)
191 195 {
192 196 if (bio->bi_vcnt)
193 197 return bio_iovec(bio)->bv_len >> 9;
194   -
195   - return 0;
  198 + else /* dataless requests such as discard */
  199 + return bio->bi_size >> 9;
196 200 }
197 201  
198 202 static inline void *bio_data(struct bio *bio)
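
(Illustration, not part of the patch: a discard bio has bi_vcnt == 0 but a
non-zero bi_size, say 1024 << 9, so bio_cur_sectors() now reports 1024
instead of 0, and the request setup code in the last blk-core.c hunk above
can still fill in rq->current_nr_sectors for a dataless request.)
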
include/linux/blkdev.h
... ... @@ -89,6 +89,7 @@
89 89 enum rq_flag_bits {
90 90 __REQ_RW, /* not set, read. set, write */
91 91 __REQ_FAILFAST, /* no low level driver retries */
  92 + __REQ_DISCARD, /* request to discard sectors */
92 93 __REQ_SORTED, /* elevator knows about this request */
93 94 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
94 95 __REQ_HARDBARRIER, /* may not be passed by drive either */
... ... @@ -111,6 +112,7 @@
111 112 };
112 113  
113 114 #define REQ_RW (1 << __REQ_RW)
  115 +#define REQ_DISCARD (1 << __REQ_DISCARD)
114 116 #define REQ_FAILFAST (1 << __REQ_FAILFAST)
115 117 #define REQ_SORTED (1 << __REQ_SORTED)
116 118 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
... ... @@ -252,6 +254,7 @@
252 254 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
253 255 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
254 256 typedef void (unplug_fn) (struct request_queue *);
  257 +typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
255 258  
256 259 struct bio_vec;
257 260 struct bvec_merge_data {
... ... @@ -307,6 +310,7 @@
307 310 make_request_fn *make_request_fn;
308 311 prep_rq_fn *prep_rq_fn;
309 312 unplug_fn *unplug_fn;
  313 + prepare_discard_fn *prepare_discard_fn;
310 314 merge_bvec_fn *merge_bvec_fn;
311 315 prepare_flush_fn *prepare_flush_fn;
312 316 softirq_done_fn *softirq_done_fn;
... ... @@ -546,6 +550,7 @@
546 550 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
547 551 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
548 552 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
  553 +#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD)
549 554 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
550 555 #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
551 556 /* rq->queuelist of dequeued request must be list_empty() */
... ... @@ -796,6 +801,7 @@
796 801 extern void blk_queue_dma_alignment(struct request_queue *, int);
797 802 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
798 803 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  804 +extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
799 805 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
800 806 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
801 807 extern int blk_do_ordered(struct request_queue *, struct request **);
... ... @@ -837,6 +843,16 @@
837 843 }
838 844  
839 845 extern int blkdev_issue_flush(struct block_device *, sector_t *);
  846 +extern int blkdev_issue_discard(struct block_device *, sector_t sector,
  847 + unsigned nr_sects);
  848 +
  849 +static inline int sb_issue_discard(struct super_block *sb,
  850 + sector_t block, unsigned nr_blocks)
  851 +{
  852 + block <<= (sb->s_blocksize_bits - 9);
  853 + nr_blocks <<= (sb->s_blocksize_bits - 9);
  854 + return blkdev_issue_discard(sb->s_bdev, block, nr_blocks);
  855 +}
840 856  
841 857 /*
842 858 * command filter functions
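
(Worked example of the unit conversion in sb_issue_discard(), illustration
only: with 4KiB filesystem blocks s_blocksize_bits is 12, so both arguments
are shifted left by 12 - 9 = 3; discarding 16 blocks starting at block 100
becomes blkdev_issue_discard(sb->s_bdev, 800, 128).)
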
include/linux/fs.h
... ... @@ -86,7 +86,8 @@
86 86 #define READ_META (READ | (1 << BIO_RW_META))
87 87 #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
88 88 #define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC))
89   -#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
  89 +#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER))
  90 +#define WRITE_DISCARD (WRITE | (1 << BIO_RW_DISCARD))
90 91  
91 92 #define SEL_IN 1
92 93 #define SEL_OUT 2
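
(Putting the flag plumbing together, as a reading aid rather than patch
content: blkdev_issue_discard() submits each bio with WRITE_DISCARD, which
sets both the WRITE bit and BIO_RW_DISCARD in bio->bi_rw; bio_discard() then
evaluates true in the blk-core.c hunks above, where the request is tagged
REQ_SOFTBARRIER | REQ_DISCARD and the queue's prepare_discard_fn is called.)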