From 548bc8e1b38e48653a90f48f636f8d253504f8a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Jan 2013 08:05:13 -0800 Subject: block: RCU free request_queue RCU free request_queue so that blkcg_gq->q can be dereferenced under RCU lock. This will be used to implement hierarchical stats. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94bc83011ed..406343c43cda 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -437,6 +438,7 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif + struct rcu_head rcu_head; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From 242d98f077ac0ab80920219769eb095503b93f61 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 17 Dec 2012 10:01:27 -0500 Subject: block,elevator: use new hashtable implementation Switch elevator to use the new hashtable implementation. This reduces the amount of generic unrelated code in the elevator. This also removes the dynamic allocation of the hash table. The size of the table is constant so there's no point in paying the price of an extra dereference when accessing it. This patch depends on d9b482c ("hashtable: introduce a small and naive hashtable") which was merged in v3.6. 
Signed-off-by: Sasha Levin Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/elevator.c | 23 ++++------------------- include/linux/elevator.h | 5 ++++- 3 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index 47fdfdd41520..e837b8f619b7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -61,7 +61,7 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +#define ELV_ON_HASH(rq) hash_hashed(&(rq)->hash) void blk_insert_flush(struct request *rq); void blk_abort_flushes(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 9edba1b8323e..11683bb10b7b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -46,11 +46,6 @@ static LIST_HEAD(elv_list); /* * Merge hash stuff. */ -static const int elv_hash_shift = 6; -#define ELV_HASH_BLOCK(sec) ((sec) >> 3) -#define ELV_HASH_FN(sec) \ - (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) -#define ELV_HASH_ENTRIES (1 << elv_hash_shift) #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) /* @@ -142,7 +137,6 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) { struct elevator_queue *eq; - int i; eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); if (unlikely(!eq)) @@ -151,14 +145,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); - - eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, - GFP_KERNEL, q->node); - if (!eq->hash) - goto err; - - for (i = 0; i < ELV_HASH_ENTRIES; i++) - INIT_HLIST_HEAD(&eq->hash[i]); + hash_init(eq->hash); return eq; err: @@ -173,7 +160,6 @@ static void elevator_release(struct kobject *kobj) e = container_of(kobj, struct elevator_queue, kobj); elevator_put(e->type); - kfree(e->hash); kfree(e); } @@ -240,7 +226,7 @@ 
EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { - hlist_del_init(&rq->hash); + hash_del(&rq->hash); } static void elv_rqhash_del(struct request_queue *q, struct request *rq) @@ -254,7 +240,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq) struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); - hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); + hash_add(e->hash, &rq->hash, rq_hash_key(rq)); } static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) @@ -266,11 +252,10 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; - struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; struct hlist_node *entry, *next; struct request *rq; - hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { + hash_for_each_possible_safe(e->hash, rq, entry, next, hash, offset) { BUG_ON(!ELV_ON_HASH(rq)); if (unlikely(!rq_mergeable(rq))) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index c03af7687bb4..7c5a7c9789ee 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -2,6 +2,7 @@ #define _LINUX_ELEVATOR_H #include +#include #ifdef CONFIG_BLOCK @@ -96,6 +97,8 @@ struct elevator_type struct list_head list; }; +#define ELV_HASH_BITS 6 + /* * each queue has an elevator_queue associated with it */ @@ -105,8 +108,8 @@ struct elevator_queue void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; - struct hlist_head *hash; unsigned int registered:1; + DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; /* -- cgit v1.2.3 From 422765c2638924da10ff363b5eed77924911bdc7 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Fri, 11 Jan 2013 14:46:09 +0100 Subject: block: Remove should_sort judgement when flush blk_plug In commit 975927b942c932,it add blk_rq_pos to sort rq when flushing. 
Although this commit was intended for the situation in which a blk_plug handles multiple devices at the same time, like an md device, I think there must be similar situations involving only a single device. So remove the should_sort judgement. Because the should_sort field exists only for this purpose, it can be deleted from blk_plug. CC: Shaohua Li Signed-off-by: Jianpeng Ma Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +------------ include/linux/blkdev.h | 1 - 2 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index c973249d68cd..aca5d82ff13c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1548,13 +1548,6 @@ get_rq: if (list_empty(&plug->list)) trace_block_plug(q); else { - if (!plug->should_sort) { - struct request *__rq; - - __rq = list_entry_rq(plug->list.prev); - if (__rq->q != q) - plug->should_sort = 1; - } if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); trace_block_plug(q); @@ -2888,7 +2881,6 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->cb_list); - plug->should_sort = 0; /* * If this is a nested plug, don't actually assign it. It will be @@ -2990,10 +2982,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) list_splice_init(&plug->list, &list); - if (plug->should_sort) { - list_sort(NULL, &list, plug_rq_cmp); - plug->should_sort = 0; - } + list_sort(NULL, &list, plug_rq_cmp); q = NULL; depth = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94bc83011ed..dbe74279f3d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -974,7 +974,6 @@ struct blk_plug { unsigned long magic; /* detect uninitialized use-cases */ struct list_head list; /* requests */ struct list_head cb_list; /* md requires an unplug callback */ - unsigned int should_sort; /* list to be sorted before flushing? 
*/ }; #define BLK_MAX_REQUEST_COUNT 16 -- cgit v1.2.3 From 3a366e614d0837d9fc23f78cdb1a1186ebc3387f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Jan 2013 13:06:33 -0800 Subject: block: add missing block_bio_complete() tracepoint bio completion didn't kick block_bio_complete TP. Only dm was explicitly triggering the TP on IO completion. This makes block_bio_complete TP useless for tracers which want to know about bios, and all other bio based drivers skip generating blktrace completion events. This patch makes all bio completions via bio_endio() generate block_bio_complete TP. * Explicit trace_block_bio_complete() invocation removed from dm and the trace point is unexported. * @q dropped from trace_block_bio_complete(). bios may fly around w/o queue associated. Verifying and accessing the associated queue belongs to TP probes. * blktrace now gets both request and bio completions. Make it ignore bio completions if request completion path is happening. This makes all bio based drivers generate blktrace completion events properly and makes the block_bio_complete TP actually useful. v2: With this change, block_bio_complete TP could be invoked on sg commands which have bio's with %NULL bi_bdev. Update TP assignment code to check whether bio->bi_bdev is %NULL before dereferencing. 
Signed-off-by: Tejun Heo Original-patch-by: Namhyung Kim Cc: Tejun Heo Cc: Steven Rostedt Cc: Alasdair Kergon Cc: dm-devel@redhat.com Cc: Neil Brown Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - drivers/md/dm.c | 1 - drivers/md/raid5.c | 11 +---------- fs/bio.c | 2 ++ include/linux/blktrace_api.h | 1 + include/trace/events/block.h | 8 ++++---- kernel/trace/blktrace.c | 26 +++++++++++++++++++++++--- 7 files changed, 31 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index aca5d82ff13c..4f5aec708be6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,7 +39,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); -EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c72e4d5a9617..650ec2866e34 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -627,7 +627,6 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 19d77a026639..9ab506df42da 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -184,8 +184,6 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -3917,8 +3915,6 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { - trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), - raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4377,8 +4373,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) 
if ( rw == WRITE ) md_write_end(mddev); - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); bio_endio(bi, 0); } } @@ -4755,11 +4749,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) { - trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), - raid_bio, 0); + if (remaining == 0) bio_endio(raid_bio, 0); - } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; diff --git a/fs/bio.c b/fs/bio.c index b96fc6ce4855..bb5768f59b32 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; + trace_block_bio_complete(bio, error); + if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 7c2e030e72f1..0ea61e07a91c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -12,6 +12,7 @@ struct blk_trace { int trace_state; + bool rq_based; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 05c5e61f0a7c..8a168db9a645 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -206,7 +206,6 @@ TRACE_EVENT(block_bio_bounce, /** * block_bio_complete - completed all work on the block operation - * @q: queue holding the block operation * @bio: block operation completed * @error: io error value * @@ -215,9 +214,9 @@ TRACE_EVENT(block_bio_bounce, */ TRACE_EVENT(block_bio_complete, - TP_PROTO(struct request_queue *q, struct bio *bio, int error), + TP_PROTO(struct bio *bio, int error), - TP_ARGS(q, bio, error), + TP_ARGS(bio, error), TP_STRUCT__entry( __field( dev_t, dev ) @@ -228,7 +227,8 @@ TRACE_EVENT(block_bio_complete, ), TP_fast_assign( - __entry->dev = 
bio->bi_bdev->bd_dev; + __entry->dev = bio->bi_bdev ? + bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; __entry->error = error; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741c..190d98fbed27 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { + struct blk_trace *bt = q->blk_trace; + + /* if control ever passes through here, it's a request based driver */ + if (unlikely(bt && !bt->rq_based)) + bt->rq_based = true; + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -774,10 +780,24 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio, - int error) +static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) { + struct request_queue *q; + struct blk_trace *bt; + + if (!bio->bi_bdev) + return; + + q = bdev_get_queue(bio->bi_bdev); + bt = q->blk_trace; + + /* + * Request based drivers will generate both rq and bio completions. + * Ignore bio ones. + */ + if (likely(!bt) || bt->rq_based) + return; + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } -- cgit v1.2.3 From f0059afd3e6e7aa1a0ffc23468b74c43d47660b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Jan 2013 13:06:35 -0800 Subject: buffer: make touch_buffer() an exported function We want to add a trace point to touch_buffer() but macros and inline functions defined in header files can't have tracing points. Move touch_buffer() to fs/buffer.c and make it a proper function. The new exported function is also declared inline. As most uses of touch_buffer() are inside buffer.c with nilfs2 as the only other user, the effect of this change should be negligible. 
Signed-off-by: Tejun Heo Cc: Steven Rostedt Signed-off-by: Jens Axboe --- fs/buffer.c | 6 ++++++ include/linux/buffer_head.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index c017a2dfb909..a8c2dfb68dcd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -53,6 +53,12 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) } EXPORT_SYMBOL(init_buffer); +inline void touch_buffer(struct buffer_head *bh) +{ + mark_page_accessed(bh->b_page); +} +EXPORT_SYMBOL(touch_buffer); + static int sleep_on_buffer(void *word) { io_schedule(); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 458f497738a4..5afc4f94d110 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -126,7 +126,6 @@ BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) -#define touch_buffer(bh) mark_page_accessed(bh->b_page) /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ @@ -142,6 +141,7 @@ BUFFER_FNS(Unwritten, unwritten) void mark_buffer_dirty(struct buffer_head *bh); void init_buffer(struct buffer_head *, bh_end_io_t *, void *); +void touch_buffer(struct buffer_head *bh); void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); int try_to_free_buffers(struct page *); -- cgit v1.2.3 From 686855f5d833178e518d79e7912cdb3268a9fa69 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Feb 2013 18:19:58 +0400 Subject: sched: add wait_for_completion_io[_timeout] The only difference between wait_for_completion[_timeout]() and wait_for_completion_io[_timeout]() is that the latter calls io_schedule_timeout() instead of schedule_timeout() so that the caller is accounted as waiting for IO, not just sleeping. 
These functions can be used for correct iowait time accounting when the completion struct is actually used for waiting for IO (e.g. completion of a bio request in the block layer). Signed-off-by: Vladimir Davydov Acked-by: Ingo Molnar Signed-off-by: Jens Axboe --- include/linux/completion.h | 3 +++ kernel/sched/core.c | 57 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index 51494e6b5548..33f0280fd533 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -77,10 +77,13 @@ static inline void init_completion(struct completion *x) } extern void wait_for_completion(struct completion *); +extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); +extern unsigned long wait_for_completion_io_timeout(struct completion *x, + unsigned long timeout); extern long wait_for_completion_interruptible_timeout( struct completion *x, unsigned long timeout); extern long wait_for_completion_killable_timeout( diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..d6fdcdcbb9b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3267,7 +3267,8 @@ void complete_all(struct completion *x) EXPORT_SYMBOL(complete_all); static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); @@ -3280,7 +3281,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) } __set_current_state(state); spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); + timeout = action(timeout); 
spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); @@ -3291,17 +3292,30 @@ do_wait_for_common(struct completion *x, long timeout, int state) return timeout ?: 1; } -static long __sched -wait_for_common(struct completion *x, long timeout, int state) +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { might_sleep(); spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); + timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); return timeout; } +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + /** * wait_for_completion: - waits for completion of a task * @x: holds the state of this particular completion @@ -3337,6 +3351,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. 
It is not + * interruptible. The caller is accounted as waiting for IO. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + /** * wait_for_completion_interruptible: - waits for completion of a task (w/intr) * @x: holds the state of this particular completion -- cgit v1.2.3