Skip to content

Commit 0048b48

Browse files
Ming Leiaxboe
Ming Lei
authored andcommitted
blk-mq: fix race between timeout and freeing request
Inside timeout handler, blk_mq_tag_to_rq() is called to retrieve the request from one tag. This way is obviously wrong because the request can be freed any time and some fiedds of the request can't be trusted, then kernel oops might be triggered[1]. Currently wrt. blk_mq_tag_to_rq(), the only special case is that the flush request can share same tag with the request cloned from, and the two requests can't be active at the same time, so this patch fixes the above issue by updating tags->rqs[tag] with the active request(either flush rq or the request cloned from) of the tag. Also blk_mq_tag_to_rq() gets much simplified with this patch. Given blk_mq_tag_to_rq() is mainly for drivers and the caller must make sure the request can't be freed, so in bt_for_each() this helper is replaced with tags->rqs[tag]. [1] kernel oops log [ 439.696220] BUG: unable to handle kernel NULL pointer dereference at 0000000000000158^M [ 439.697162] IP: [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M [ 439.700653] PGD 7ef765067 PUD 7ef764067 PMD 0 ^M [ 439.700653] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ^M [ 439.700653] Dumping ftrace buffer:^M [ 439.700653] (ftrace buffer empty)^M [ 439.700653] Modules linked in: nbd ipv6 kvm_intel kvm serio_raw^M [ 439.700653] CPU: 6 PID: 2779 Comm: stress-ng-sigfd Not tainted 4.2.0-rc5-next-20150805+ #265^M [ 439.730500] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011^M [ 439.730500] task: ffff880605308000 ti: ffff88060530c000 task.ti: ffff88060530c000^M [ 439.730500] RIP: 0010:[<ffffffff812d89ba>] [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M [ 439.730500] RSP: 0018:ffff880819203da0 EFLAGS: 00010283^M [ 439.730500] RAX: ffff880811b0e000 RBX: ffff8800bb465f00 RCX: 0000000000000002^M [ 439.730500] RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000^M [ 439.730500] RBP: ffff880819203db0 R08: 0000000000000002 R09: 0000000000000000^M [ 439.730500] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000202^M [ 439.730500] R13: ffff880814104800 R14: 0000000000000002 R15: ffff880811a2ea00^M [ 439.730500] FS: 00007f165b3f5740(0000) GS:ffff880819200000(0000) knlGS:0000000000000000^M [ 439.730500] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b^M [ 439.730500] CR2: 0000000000000158 CR3: 00000007ef766000 CR4: 00000000000006e0^M [ 439.730500] Stack:^M [ 439.730500] 0000000000000008 ffff8808114eed90 ffff880819203e00 ffffffff812dc104^M [ 439.755663] ffff880819203e40 ffffffff812d9f5e 0000020000000000 ffff8808114eed80^M [ 439.755663] Call Trace:^M [ 439.755663] <IRQ> ^M [ 439.755663] [<ffffffff812dc104>] bt_for_each+0x6e/0xc8^M [ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M [ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M [ 439.755663] [<ffffffff812dc1b3>] blk_mq_tag_busy_iter+0x55/0x5e^M [ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M [ 439.755663] [<ffffffff812d8911>] blk_mq_rq_timer+0x5d/0xd4^M [ 439.755663] [<ffffffff810a3e10>] call_timer_fn+0xf7/0x284^M [ 439.755663] [<ffffffff810a3d1e>] ? call_timer_fn+0x5/0x284^M [ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M [ 439.755663] [<ffffffff810a46d6>] run_timer_softirq+0x1ce/0x1f8^M [ 439.755663] [<ffffffff8104c367>] __do_softirq+0x181/0x3a4^M [ 439.755663] [<ffffffff8104c76e>] irq_exit+0x40/0x94^M [ 439.755663] [<ffffffff81031482>] smp_apic_timer_interrupt+0x33/0x3e^M [ 439.755663] [<ffffffff815559a4>] apic_timer_interrupt+0x84/0x90^M [ 439.755663] <EOI> ^M [ 439.755663] [<ffffffff81554350>] ? _raw_spin_unlock_irq+0x32/0x4a^M [ 439.755663] [<ffffffff8106a98b>] finish_task_switch+0xe0/0x163^M [ 439.755663] [<ffffffff8106a94d>] ? finish_task_switch+0xa2/0x163^M [ 439.755663] [<ffffffff81550066>] __schedule+0x469/0x6cd^M [ 439.755663] [<ffffffff8155039b>] schedule+0x82/0x9a^M [ 439.789267] [<ffffffff8119b28b>] signalfd_read+0x186/0x49a^M [ 439.790911] [<ffffffff8106d86a>] ? wake_up_q+0x47/0x47^M [ 439.790911] [<ffffffff811618c2>] __vfs_read+0x28/0x9f^M [ 439.790911] [<ffffffff8117a289>] ? __fget_light+0x4d/0x74^M [ 439.790911] [<ffffffff811620a7>] vfs_read+0x7a/0xc6^M [ 439.790911] [<ffffffff8116292b>] SyS_read+0x49/0x7f^M [ 439.790911] [<ffffffff81554c17>] entry_SYSCALL_64_fastpath+0x12/0x6f^M [ 439.790911] Code: 48 89 e5 e8 a9 b8 e7 ff 5d c3 0f 1f 44 00 00 55 89 f2 48 89 e5 41 54 41 89 f4 53 48 8b 47 60 48 8b 1c d0 48 8b 7b 30 48 8b 53 38 <48> 8b 87 58 01 00 00 48 85 c0 75 09 48 8b 97 88 0c 00 00 eb 10 ^M [ 439.790911] RIP [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M [ 439.790911] RSP <ffff880819203da0>^M [ 439.790911] CR2: 0000000000000158^M [ 439.790911] ---[ end trace d40af58949325661 ]---^M Cc: <[email protected]> Signed-off-by: Ming Lei <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent 596f5aa commit 0048b48

File tree

5 files changed

+35
-18
lines changed

5 files changed

+35
-18
lines changed

block/blk-flush.c

+14-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373

7474
#include "blk.h"
7575
#include "blk-mq.h"
76+
#include "blk-mq-tag.h"
7677

7778
/* FLUSH/FUA sequences */
7879
enum {
@@ -226,7 +227,12 @@ static void flush_end_io(struct request *flush_rq, int error)
226227
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
227228

228229
if (q->mq_ops) {
230+
struct blk_mq_hw_ctx *hctx;
231+
232+
/* release the tag's ownership to the req cloned from */
229233
spin_lock_irqsave(&fq->mq_flush_lock, flags);
234+
hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu);
235+
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
230236
flush_rq->tag = -1;
231237
}
232238

@@ -308,11 +314,18 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
308314

309315
/*
310316
* Borrow tag from the first request since they can't
311-
* be in flight at the same time.
317+
* be in flight at the same time. And acquire the tag's
318+
* ownership for flush req.
312319
*/
313320
if (q->mq_ops) {
321+
struct blk_mq_hw_ctx *hctx;
322+
314323
flush_rq->mq_ctx = first_rq->mq_ctx;
315324
flush_rq->tag = first_rq->tag;
325+
fq->orig_rq = first_rq;
326+
327+
hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu);
328+
blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
316329
}
317330

318331
flush_rq->cmd_type = REQ_TYPE_FS;

block/blk-mq-tag.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
429429
for (bit = find_first_bit(&bm->word, bm->depth);
430430
bit < bm->depth;
431431
bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
432-
rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
432+
rq = hctx->tags->rqs[off + bit];
433433
if (rq->q == hctx->queue)
434434
fn(hctx, rq, data, reserved);
435435
}
@@ -453,7 +453,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags,
453453
for (bit = find_first_bit(&bm->word, bm->depth);
454454
bit < bm->depth;
455455
bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
456-
rq = blk_mq_tag_to_rq(tags, off + bit);
456+
rq = tags->rqs[off + bit];
457457
fn(rq, data, reserved);
458458
}
459459

block/blk-mq-tag.h

+12
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,16 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
8989
__blk_mq_tag_idle(hctx);
9090
}
9191

92+
/*
93+
* This helper should only be used for flush request to share tag
94+
* with the request cloned from, and both the two requests can't be
95+
* in flight at the same time. The caller has to make sure the tag
96+
* can't be freed.
97+
*/
98+
static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
99+
unsigned int tag, struct request *rq)
100+
{
101+
hctx->tags->rqs[tag] = rq;
102+
}
103+
92104
#endif

block/blk-mq.c

+1-15
Original file line numberDiff line numberDiff line change
@@ -559,23 +559,9 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
559559
}
560560
EXPORT_SYMBOL(blk_mq_abort_requeue_list);
561561

562-
static inline bool is_flush_request(struct request *rq,
563-
struct blk_flush_queue *fq, unsigned int tag)
564-
{
565-
return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
566-
fq->flush_rq->tag == tag);
567-
}
568-
569562
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
570563
{
571-
struct request *rq = tags->rqs[tag];
572-
/* mq_ctx of flush rq is always cloned from the corresponding req */
573-
struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
574-
575-
if (!is_flush_request(rq, fq, tag))
576-
return rq;
577-
578-
return fq->flush_rq;
564+
return tags->rqs[tag];
579565
}
580566
EXPORT_SYMBOL(blk_mq_tag_to_rq);
581567

block/blk.h

+6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ struct blk_flush_queue {
2222
struct list_head flush_queue[2];
2323
struct list_head flush_data_in_flight;
2424
struct request *flush_rq;
25+
26+
/*
27+
* flush_rq shares tag with this rq, both can't be active
28+
* at the same time
29+
*/
30+
struct request *orig_rq;
2531
spinlock_t mq_flush_lock;
2632
};
2733

0 commit comments

Comments
 (0)