linux 内核 xfs ail,linux 3.10内核 xfs的一次io异常导致的hung crash

8556395.925003] XFS (sdi): xfs_log_force: error -5returned.

[8556407.425047] INFO: task umount:19286 blocked for more than 1200seconds.----这个默认是120，该环境上是被人为设置1200

[8556407.425653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables thismessage.

[8556407.426050] umount D ffff881fffdd47c0 0 19286 19283 0x00000080------------------------19286是pid，19283是ppid

[8556407.426056] ffff883e39587d80 0000000000000082ffff883e39587fd8 ffff883e39587fd8

[8556407.426066] ffff883e39587fd8 00000000000147c0 ffff881fd3260000 ffff881f860fec80

[8556407.426072] ffff883ed79e1a20 ffff881f860fecc0 ffff881f860fece8 ffff881f860fec90------------这12个64位的值为栈里的值

[8556407.426078] Call Trace:

[8556407.426092] [] schedule+0x29/0x70[8556407.426137] [] xfs_ail_push_all_sync+0xc1/0x110[xfs]

[8556407.426143] [] ? wake_up_atomic_t+0x30/0x30[8556407.426165] [] xfs_unmountfs+0x68/0x190[xfs]

[8556407.426186] [] ? xfs_mru_cache_destroy+0x6b/0x90[xfs]

[8556407.426204] [] xfs_fs_put_super+0x32/0x90[xfs]

[8556407.426211] [] generic_shutdown_super+0x56/0xe0-------已经执行完fsnotify_umount_inodes[8556407.426215] [] kill_block_super+0x27/0x70[8556407.426218] [] deactivate_locked_super+0x49/0x60[8556407.426222] [] deactivate_super+0x46/0x60[8556407.426229] [] mntput_no_expire+0xc5/0x120[8556407.426234] [] SyS_umount+0x15b/0x5b0[8556407.426239] [] system_call_fastpath+0x16/0x1b[8556407.426242] sending NMI to all CPUs:

对应的函数为：

/** Push out all items in the AIL immediately and wait until the AIL is empty.*/

voidxfs_ail_push_all_sync(struct xfs_ail *ailp)

{struct xfs_log_item *lip;

DEFINE_WAIT(wait);

spin_lock(&ailp->xa_lock);while ((lip = xfs_ail_max(ailp)) !=NULL) {

prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE);

ailp->xa_target = lip->li_lsn;

wake_up_process(ailp->xa_task);

spin_unlock(&ailp->xa_lock);

schedule();

spin_lock(&ailp->xa_lock);

}

spin_unlock(&ailp->xa_lock);

finish_wait(&ailp->xa_empty, &wait);

}

排查的时候走了弯路，因为看的是：

crash> dis -l xfs_ail_push_all_sync+0xc1

/usr/src/debug/kernel-3.10.0-327.22.2.el7/linux-3.10.0-327.22.2.el7.x86_64/include/linux/spinlock.h: 293

0xffffffffa07c6cd1 193>: mov %r13,%rdi

所以想当然地认为在第一个获取spinlock的地方，

spin_lock(&ailp->xa_lock);---------以为在这hung住while ((lip = xfs_ail_max(ailp)) != NULL) {

然后查找这把锁：

crash> structsuper_block.s_fs_info ffff883f7fe67800

s_fs_info= 0xffff883f7fe67000crash> mod -s xfs

MODULE NAME SIZE OBJECT FILE

ffffffffa0803d40 xfs939714 /lib/modules/3.10.0-327.22.2.el7.x86_64/kernel/fs/xfs/xfs.ko

crash> struct -xo xfs_mount.m_ail 0xffff883f7fe67000

structxfs_mount {

[ffff883f7fe67010]struct xfs_ail *m_ail;

}struct xfs_mount.m_ail 0xffff883f7fe67000m_ail= 0xffff881f860fec80crash> struct -xo xfs_ail.xa_lock 0xffff881f860fec80

structxfs_ail {

[ffff881f860fecc0] spinlock_t xa_lock;

}

而且这把锁的值是：

crash> struct xfs_ail.xa_lock 0xffff881f860fec80xa_lock={

{

rlock={

raw_lock={

{

head_tail= 2406518640,

tickets={

head= 36720,

tail= 36720}

}

我一看到这个是非0值，想当然认为是被持有了，为了验证我的想法，我甚至随便找了一把锁查看它的值，点背的时候，喝凉水都塞牙，我随手选择的锁是：

crash>p gc_lock

gc_lock= $1 ={

{

rlock={

raw_lock={

{

head_tail= 0,

tickets={

head= 0,

tail= 0}

}

后来仔细看了在没有开启 CONFIG_QUEUED_SPINLOCKS 的情况下，锁的定义才是如此，否则的话，是如下：

#ifndef __GENKSYMS__

typedefstructqspinlock {

atomic_t val;

} arch_spinlock_t;#elsetypedef u32 __ticketpair_t;

typedefstructarch_spinlock {

union {

__ticketpair_t head_tail;struct__raw_tickets {

__ticket_t head, tail;

} tickets;

};

} arch_spinlock_t;#endif

所以要么0，要么1的情况只适合于第一种定义，以前因为业务默认配置的config问题，查的锁相关的crash，大多是第一种定义，但是当时没注意到这个细节，只是简单扫了一眼，浪费我两个小时，所以下盘不稳总会摔。

查看running状态的都没有持有这把锁，(其实按照第二种定义，是正常的，因为此时本就没人持有这把锁，因为head和tail相等)比较奇怪，后来找同事讨论，同事认为是while循环没跳出，所以获取锁的那行是schedule下面那行，还没有执行。

想起来自己曾经写过的博客《https://www.cnblogs.com/10087622blog/p/9558024.html》,应该是如此

voidxfs_ail_push_all_sync(struct xfs_ail *ailp)

{struct xfs_log_item *lip;

DEFINE_WAIT(wait);

spin_lock(&ailp->xa_lock);while ((lip = xfs_ail_max(ailp)) !=NULL) {

prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE);

ailp->xa_target = lip->li_lsn;

wake_up_process(ailp->xa_task);

spin_unlock(&ailp->xa_lock);

schedule();--------------------这个和while循环一直跳不出去，

spin_lock(&ailp->xa_lock);

}

spin_unlock(&ailp->xa_lock);

finish_wait(&ailp->xa_empty, &wait);

}

跳不出去的原因跟io相关，也就是wake_up 相关的内核线程之后，内核线程还是没有完成对应的任务：

crash> struct xfs_ail.xa_task 0xffff881f860fec80xa_task= 0xffff883f8901dc00crash> task_struct.pid 0xffff883f8901dc00pid= 42726crash> set 42726PID:42726COMMAND:"xfsaild/sdi"TASK: ffff883f8901dc00 [THREAD_INFO: ffff881f3d828000]

CPU:17STATE: TASK_INTERRUPTIBLE

crash>bt

PID:42726 TASK: ffff883f8901dc00 CPU: 17 COMMAND: "xfsaild/sdi"#0[ffff881f3d82bcd0] __schedule at ffffffff8163df9b

#1[ffff881f3d82bd38] schedule at ffffffff8163e879

#2[ffff881f3d82bd48] schedule_timeout at ffffffff8163c295

#3[ffff881f3d82bdf8] xfsaild at ffffffffa07c6a4f [xfs]

#4[ffff881f3d82bec8] kthread at ffffffff810a661f

#5 [ffff881f3d82bf50] ret_from_fork at ffffffff81649858

xfsaild 无法完成的原因也是因为IO异常:

附部分相关io异常的message日志：

kernel: sd 5:0:8:0: [sdi] CDB: Read(16) 88 00 00 00 00 00 03 4c b0 48 00 00 00 08 00 00kernel: sd5:0:8:0: [sdi] CDB: Read(16) 88 00 00 00 00 00 15 32 8a 50 00 00 03 30 00 00kernel: blk_update_request: I/O error, dev sdi, sector 355634560kernel: sd5:0:8:0: rejecting I/O to offline device

kernel: XFS (sdi): metadata I/O error: block 0xee173de0 ("xfs_trans_read_buf_map") error 5 numblks 8kernel: XFS (sdi): metadata I/O error: block 0x15da29d90 ("xfs_trans_read_buf_map") error 5 numblks 32kernel: sd5:0:8:0: rejecting I/O to offline device

kernel: XFS (sdi): xfs_imap_to_bp: xfs_trans_read_buf() returned error-5.

kernel: XFS (sdi): metadata I/O error: block 0xeca8a480 ("xfs_trans_read_buf_map") error 5 numblks 8kernel: sd5:0:8:0: rejecting I/O to offline device

kernel: sd5:0:8:0: rejecting I/O to offline device

对应的bug解决可以参照：https://bugzilla.redhat.com/show_bug.cgi?id=1267042