
Analyzing dirty page and IO request handling with a coredump (Part 1)

Original article by cdh. Last modified 2022-01-17 11:57:22.

Because of the article length limit, this write-up is split into two parts. The source analysis is based on kernel 4.19.

From the coredump stack traces we can quickly see that the hung-task panic was triggered because process 20223 (filebeat) was blocked waiting for the write rwsem held by process 1633:

crash> log | grep blocked
[27783793.123905] INFO: task filebeat:20223 blocked for more than 327 seconds.
[27783793.158898] Kernel panic - not syncing: hung_task: blocked tasks
crash> bt 20223
PID: 20223  TASK: ffff8f665c25c000  CPU: 52  COMMAND: "filebeat"
 #0 [ffffa29c2277bbd8] __schedule at ffffffff9e8d4009
 #1 [ffffa29c2277bc68] schedule at ffffffff9e8d4438
 #2 [ffffa29c2277bc88] rwsem_down_read_failed at ffffffff9e8d7b08
 #3 [ffffa29c2277bd10] call_rwsem_down_read_failed at ffffffff9e8c9288
 #4 [ffffa29c2277bd60] down_read at ffffffff9e8d6de0
 #5 [ffffa29c2277bd78] xfs_ilock at ffffffffc0e61d1f  [xfs]
 #6 [ffffa29c2277bda8] xfs_file_buffered_aio_read at ffffffffc0e52f68  [xfs]
 #7 [ffffa29c2277bde8] xfs_file_read_iter at ffffffffc0e53032  [xfs]
 #8 [ffffa29c2277be00] new_sync_read at ffffffff9e2ac9e4
 #9 [ffffa29c2277be90] __vfs_read at ffffffff9e2af739
#10 [ffffa29c2277bea0] vfs_read at ffffffff9e2af7de
#11 [ffffa29c2277bed8] ksys_read at ffffffff9e2afc7c
#12 [ffffa29c2277bf20] __x64_sys_read at ffffffff9e2afd1a
#13 [ffffa29c2277bf30] do_syscall_64 at ffffffff9e004250
#14 [ffffa29c2277bf50] entry_SYSCALL_64_after_hwframe at ffffffff9ea00088


crash> struct inode.i_rwsem 0xffff8f65b42db538 | grep owner
    owner = 0xffff8f4e09d14000
crash> bt 0xffff8f4e09d14000
PID: 1633   TASK: ffff8f4e09d14000  CPU: 47  COMMAND: "java"
 #0 [ffffa29c289a78f8] __schedule at ffffffff9e8d4009
 #1 [ffffa29c289a7988] schedule at ffffffff9e8d4438
 #2 [ffffa29c289a79a8] schedule_timeout at ffffffff9e8d830b
 #3 [ffffa29c289a7a28] io_schedule_timeout at ffffffff9e8d480e
 #4 [ffffa29c289a7a48] balance_dirty_pages at ffffffff9e212a0a
 #5 [ffffa29c289a7be8] balance_dirty_pages_ratelimited at ffffffff9e20fc78
 #6 [ffffa29c289a7c10] iomap_write_actor at ffffffff9e3221cd
 #7 [ffffa29c289a7c80] iomap_apply at ffffffff9e322b5a
 #8 [ffffa29c289a7d18] iomap_file_buffered_write at ffffffff9e322f08
 #9 [ffffa29c289a7d58] xfs_file_buffered_aio_write at ffffffffc0e542d9  [xfs]
#10 [ffffa29c289a7dd8] xfs_file_write_iter at ffffffffc0e545db  [xfs]
#11 [ffffa29c289a7df8] new_sync_write at ffffffff9e2ac1e7
#12 [ffffa29c289a7e88] __vfs_write at ffffffff9e2af909
#13 [ffffa29c289a7e98] vfs_write at ffffffff9e2afad2
#14 [ffffa29c289a7ed8] ksys_write at ffffffff9e2afd9c
#15 [ffffa29c289a7f20] __x64_sys_write at ffffffff9e2afe3a
#16 [ffffa29c289a7f30] do_syscall_64 at ffffffff9e004250
#17 [ffffa29c289a7f50] entry_SYSCALL_64_after_hwframe at ffffffff9ea00088

From the stack we can see that the process was forcibly put to sleep because the system's dirty pages reached the configured ratio. Next we focus on balance_dirty_pages to see how many dirty pages it takes before a writing process is made to sleep for a while. The sleep time is computed dynamically; for details see

commit 143dfe8611a63030ce0c79419dc362f7838be557

static void balance_dirty_pages(struct bdi_writeback *wb,
                                unsigned long pages_dirtied)
{
   ......
      
 for (;;) {
                nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +global_node_page_state(NR_UNSTABLE_NFS);
                gdtc->avail = global_dirtyable_memory();
                gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
                domain_dirty_limits(gdtc);
                if (unlikely(strictlimit)) {
                        ......
                } else {
                        dirty = gdtc->dirty;
                        thresh = gdtc->thresh;
                        bg_thresh = gdtc->bg_thresh;
                }
               ......
               ......
      // When both the global and the process's mem-cgroup dirty counts
      // (dirty = NR_FILE_DIRTY + NR_UNSTABLE_NFS + NR_WRITEBACK pages) are below the watermark
      // (dirty <= (thresh + bg_thresh)/2), the process breaks out of the for loop checking dirty
      // pages and does not keep sleeping and re-checking the dirty page watermark.
     if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
         (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
                  ......
                   break;
                }
                 ......
 // If either the global dirty pages or the process's mem-cgroup dirty pages exceed the watermark
 // (dirty > (thresh + bg_thresh)/2), wake the per-bdi background thread to write back dirty pages
               if (unlikely(!writeback_in_progress(wb)))
                        wb_start_background_writeback(wb);
                 ......
                // If either the global or the mem-cgroup dirty pages exceed the watermark
                // (dirty > (thresh + bg_thresh)/2), put the process to sleep for a while; the sleep
                // time is computed dynamically, see commit 143dfe8611a63030ce0c79419dc362f7838be557
               __set_current_state(TASK_KILLABLE);
                wb->dirty_sleep = now;
                io_schedule_timeout(pause);
 
               .... 
       }
  
  // When both the global and the mem-cgroup dirty counts are below the watermark
  // (dirty <= (thresh + bg_thresh)/2), the process is not throttled and keeps writing, but if the
  // global reclaimable pages (NR_FILE_DIRTY + NR_UNSTABLE_NFS) exceed
  // vm.dirty_background_ratio/100 * available, the per-bdi background writeback thread is still woken
 if (nr_reclaimable > gdtc->bg_thresh)
                wb_start_background_writeback(wb);
}
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
        const unsigned long available_memory = dtc->avail;
        struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
        unsigned long bytes = vm_dirty_bytes;
        unsigned long bg_bytes = dirty_background_bytes;
        /* convert ratios to per-PAGE_SIZE for higher precision */
        unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
        unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
        unsigned long thresh;
        unsigned long bg_thresh;
        struct task_struct *tsk;
        /* gdtc is !NULL iff @dtc is for memcg domain */
        if (gdtc) {
                unsigned long global_avail = gdtc->avail;

                if (bytes)      /* vm.dirty_bytes */
                        ratio = min(DIV_ROUND_UP(bytes, global_avail),
                                    PAGE_SIZE);
                if (bg_bytes)
                        bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
                                       PAGE_SIZE);
                bytes = bg_bytes = 0;
        }
        if (bytes)
                thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
                thresh = (ratio * available_memory) / PAGE_SIZE;
        if (bg_bytes)
                bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
                bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
        if (bg_thresh >= thresh)
                bg_thresh = thresh / 2;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
                bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
                thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
        }
        dtc->thresh = thresh;
        dtc->bg_thresh = bg_thresh;
        /* we should eventually report the domain in the TP */
        if (!gdtc)
                trace_global_dirty_state(bg_thresh, thresh);
}

Based on the function implementation, here is how the dirty page related sysctl settings in this coredump environment take effect in the kernel:

  1. vm.dirty_background_ratio = 10

This setting takes effect when vm.dirty_background_bytes is 0.

When a process writing IO finds that the file-system cache dirty pages exceed vm.dirty_background_ratio% of the currently available memory, it wakes the kernel background threads to flush dirty pages. After kicking off the flush, the process returns immediately rather than waiting for it to finish; the actual writeback is done by the kernel per-bdi flusher threads (each disk partition is associated with one struct bdi).

The per-bdi flusher thread, which also runs periodically every vm.dirty_writeback_centisecs, writes back dirty pages whenever they exceed vm.dirty_background_ratio% of the currently available memory. Note that this ratio is a percentage of available memory (free pages plus reclaimable file pages), not of total system memory. Its valid range is [0, 100].

2. vm.dirty_ratio = 40

This setting takes effect when vm.dirty_bytes is 0.

When a process writing a file finds that the file-system cache dirty pages exceed half of the combined thresholds computed from the currently available memory, vm.dirty_ratio and vm.dirty_background_ratio, i.e. dirty > (thresh + bg_thresh)/2, it not only wakes the kernel per-bdi flusher thread to write back dirty pages but is also put to sleep for a while, slowing down its write rate (a rough userspace sketch of these threshold calculations follows this list).

3. vm.dirty_writeback_centisecs = 500

Wakeup period of the flusher threads (unit: 1/100 s). The kernel per-bdi flusher thread runs every dirty_writeback_centisecs/100 seconds.

4. vm.dirty_expire_centisecs = 3000

Dirty data aging time (unit: 1/100 s). Each time the periodic per-bdi flusher thread runs, it checks whether data has been sitting in the page cache for more than vm.dirty_expire_centisecs/100 seconds since it was written; if so, those aged dirty pages are written back regardless of whether the total number of dirty pages exceeds the vm.dirty_background_ratio threshold.
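
On a live system (rather than a coredump) the same thresholds can be estimated from /proc. The following is only a rough userspace sketch of the kernel logic above: it assumes vm.dirty_bytes and vm.dirty_background_bytes are 0 and ignores totalreserve_pages, which is not exported in /proc/meminfo, so treat the numbers as approximations.

/* Rough userspace estimate of the dirty thresholds described above.
 * Assumptions: vm.dirty_bytes / vm.dirty_background_bytes are 0, and
 * totalreserve_pages is ignored (it is not shown in /proc/meminfo).
 */
#include <stdio.h>
#include <string.h>

static long meminfo_kb(const char *key)
{
        char line[256];
        long val = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 0;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, key, strlen(key))) {
                        sscanf(line + strlen(key), " %ld", &val);
                        break;
                }
        }
        fclose(f);
        return val;
}

static long sysctl_long(const char *path)
{
        long val = 0;
        FILE *f = fopen(path, "r");

        if (f) {
                fscanf(f, "%ld", &val);
                fclose(f);
        }
        return val;
}

int main(void)
{
        /* available ~ NR_FREE_PAGES + NR_INACTIVE_FILE + NR_ACTIVE_FILE (here in kB) */
        long available_kb = meminfo_kb("MemFree:") +
                            meminfo_kb("Inactive(file):") +
                            meminfo_kb("Active(file):");
        long dirty_ratio  = sysctl_long("/proc/sys/vm/dirty_ratio");
        long bg_ratio     = sysctl_long("/proc/sys/vm/dirty_background_ratio");
        long thresh_kb    = dirty_ratio * available_kb / 100;
        long bg_thresh_kb = bg_ratio * available_kb / 100;

        printf("available  %ld kB\n", available_kb);
        printf("bg_thresh  %ld kB\n", bg_thresh_kb);
        printf("thresh     %ld kB\n", thresh_kb);
        printf("freerun    %ld kB  ((thresh + bg_thresh) / 2)\n",
               (thresh_kb + bg_thresh_kb) / 2);
        return 0;
}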

How the kernel computes dirty page counts and thresholds:

// currently dirtyable memory pages; in this environment totalreserve_pages is 326946 pages

available_memory=NR_FREE_PAGES+NR_INACTIVE_FILE+NR_ACTIVE_FILE-totalreserve_pages

// reclaimable dirty pages = FILE DIRTY + UNSTABLE NFS

nr_reclaimable=NR_FILE_DIRTY+NR_UNSTABLE_NFS

dirty = nr_reclaimable+NR_WRITEBACK

unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;

thresh = (ratio * available_memory) / PAGE_SIZE;

which is equivalent to:

thresh=(vm_dirty_ratio *available_memory)/100

unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;

bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

which is equivalent to:

bg_thresh = (dirty_background_ratio* available_memory) / 100

When nr_reclaimable > bg_thresh and dirty <= (thresh + bg_thresh)/2, a writing process is not forced to sleep, but the background writeback thread is woken to write back dirty pages until they drop below bg_thresh.

When dirty > (thresh + bg_thresh)/2, besides waking the per-bdi background writeback thread, the writing process is forced to sleep for 10~200 ms; after it is woken it re-checks the dirty page count and decides whether it needs to sleep again.
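
Putting the two rules above together, the decision in balance_dirty_pages can be reduced to the simplified sketch below. The enum and function names are only for illustration; the memcg (mdtc) checks, the strictlimit path and the dynamic pause calculation from commit 143dfe8611a63030ce0c79419dc362f7838be557 are all omitted.

#include <stdio.h>

/* Simplified sketch of the throttling decision in balance_dirty_pages().
 * All values are page counts; names are illustrative only.
 */
enum dirty_action {
        DIRTY_FREERUN,          /* no throttling, no background writeback needed */
        DIRTY_BACKGROUND_WB,    /* wake the per-bdi flusher but keep writing */
        DIRTY_THROTTLE,         /* wake the per-bdi flusher and sleep 10~200 ms */
};

static enum dirty_action dirty_decision(unsigned long nr_reclaimable, /* FILE_DIRTY + UNSTABLE_NFS */
                                        unsigned long nr_writeback,
                                        unsigned long thresh,
                                        unsigned long bg_thresh)
{
        unsigned long dirty = nr_reclaimable + nr_writeback;
        unsigned long freerun = (thresh + bg_thresh) / 2;   /* dirty_freerun_ceiling() */

        if (dirty > freerun)
                return DIRTY_THROTTLE;
        if (nr_reclaimable > bg_thresh)
                return DIRTY_BACKGROUND_WB;
        return DIRTY_FREERUN;
}

int main(void)
{
        /* page counts taken from the crash output further below */
        unsigned long nr_reclaimable = 3518010 + 0;  /* NR_FILE_DIRTY + NR_UNSTABLE_NFS */
        unsigned long nr_writeback   = 1575;         /* NR_WRITEBACK */
        unsigned long thresh         = 5567177;      /* ~40% of available pages */
        unsigned long bg_thresh      = 1391794;      /* ~10% of available pages */

        /* prints 2 (DIRTY_THROTTLE) for this dump's numbers */
        printf("action = %d\n", dirty_decision(nr_reclaimable, nr_writeback, thresh, bg_thresh));
        return 0;
}

Feeding in the page counts from this dump lands in the throttle branch, which is consistent with the java writer (PID 1633) being parked in io_schedule_timeout in the stack above.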

Available memory at the time of the coredump:

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Returns the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 */
static unsigned long global_dirtyable_memory(void)
{
        unsigned long x;

        x = global_zone_page_state(NR_FREE_PAGES);
        /*
         * Pages reserved for the kernel should not be considered
         * dirtyable, to prevent a situation where reclaim has to
         * clean pages in order to balance the zones.
         */
        x -= min(x, totalreserve_pages);

        x += global_node_page_state(NR_INACTIVE_FILE);
        x += global_node_page_state(NR_ACTIVE_FILE);

        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);

        return x + 1;   /* Ensure that we never return 0 */
}


crash> kmem -V | grep -E 'NR_FREE_PAGES|NR_ZONE_INACTIVE_FILE|NR_ZONE_ACTIVE_FILE'
                  NR_FREE_PAGES: 241708
          NR_ZONE_INACTIVE_FILE: 11481764
            NR_ZONE_ACTIVE_FILE: 2521417

crash> p totalreserve_pages
totalreserve_pages = $3 = 326946
crash>
crash> p vm_highmem_is_dirtyable
vm_highmem_is_dirtyable = $4 = 0
crash>
crash> pd (241708+11481764+2521417-326946-0)*4
$5 = 55671772 //KBytes
crash>

Using the current available memory and the system configuration, compute the dirty page threshold beyond which a writing process is forced to sleep:
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                           unsigned long bg_thresh)
{
        return (thresh + bg_thresh) / 2;
}


bg_thresh = (dirty_background_ratio * available_memory) / 100
          = (10 * 55671772KB) / 100 = 5567177KB
thresh = (vm_dirty_ratio * available_memory) / 100
       = (40 * 55671772KB) / 100 = 22268708KB
(thresh + bg_thresh) / 2 ≈ 13917943KB



Dirty page totals at the time of the coredump:
crash> kmem -V | grep -E 'NR_FILE_DIRTY|NR_UNSTABLE_NFS|NR_WRITEBACK'
                  NR_FILE_DIRTY: 3518010
                   NR_WRITEBACK: 1575
              NR_WRITEBACK_TEMP: 0
                NR_UNSTABLE_NFS: 0

crash> pd (3518010+1575)*4
$2 = 14078340 //KBytes
crash>
At the time of the crash the dirty page total (14078340 KB) was above the forced-sleep ceiling
(thresh + bg_thresh)/2 ≈ 13917943 KB computed above, and background dirty page writeback had already been
triggered, so the condition for putting writing processes to sleep to slow down IO was satisfied when the panic happened.

Since the time a writing process spends looping in sleep depends on how fast the per-bdi background writeback thread flushes dirty pages and on how many dirty pages there are, the next step is to analyze what the corresponding per-bdi background writeback thread was doing when the system panicked.

Since the writing process was forced to sleep because the system's dirty pages reached the threshold, the background thread should already have been woken to write back IO. Each disk partition corresponds to one bdi. From the stack of the java process (PID 1633) blocked in IO wait due to the dirty page limit we find struct bdi_writeback ffff8f7c93eb4000, and from that address we locate the IO writeback kworker for that disk: PID 22335.

crash> whatis balance_dirty_pages
void balance_dirty_pages(struct bdi_writeback *, unsigned long);
crash> search -t ffff8f7c93eb4000 | grep kworker
PID: 22335  TASK: ffff8f65f6d08000  CPU: 64  COMMAND: "kworker/u162:4"
crash>

crash> bt 22335
PID: 22335  TASK: ffff8f65f6d08000  CPU: 64  COMMAND: "kworker/u162:4"
 #0 [ffffa29c434b3808] __schedule at ffffffff9e8d4009
 #1 [ffffa29c434b3898] schedule at ffffffff9e8d4438
 #2 [ffffa29c434b38b8] io_schedule at ffffffff9e8d4856
 #3 [ffffa29c434b38d0] wbt_wait at ffffffff9e42b643
 #4 [ffffa29c434b3978] rq_qos_throttle at ffffffff9e402d86
 #5 [ffffa29c434b39a0] blk_mq_make_request at ffffffff9e3f3233
 #6 [ffffa29c434b3a28] generic_make_request at ffffffff9e3e50ea
 #7 [ffffa29c434b3a88] submit_bio at ffffffff9e3e53c1
 #8 [ffffa29c434b3af0] xfs_submit_ioend at ffffffffc0e479c9  [xfs]
 #9 [ffffa29c434b3b20] xfs_vm_writepages at ffffffffc0e47c0f  [xfs]
#10 [ffffa29c434b3b88] do_writepages at ffffffff9e21193f
#11 [ffffa29c434b3ba8] __writeback_single_inode at ffffffff9e2e19f5
#12 [ffffa29c434b3bf0] writeback_sb_inodes at ffffffff9e2e22e9
#13 [ffffa29c434b3cd0] __writeback_inodes_wb at ffffffff9e2e269c
#14 [ffffa29c434b3d18] wb_writeback at ffffffff9e2e293c
#15 [ffffa29c434b3dc0] wb_workfn at ffffffff9e2e3136
#16 [ffffa29c434b3e60] process_one_work at ffffffff9e0accf4
#17 [ffffa29c434b3ea8] worker_thread at ffffffff9e0ad99f
#18 [ffffa29c434b3f08] kthread at ffffffff9e0b2f95
#19 [ffffa29c434b3f50] ret_from_fork at ffffffff9ea00215
crash>

crash> task_struct.sched_info ffff8f65f6d08000 | grep last_arrival
    last_arrival = 27783787184493456,
crash>
crash> runq | grep "CPU 64"
CPU 64 RUNQUEUE: ffff8f7c9f821640
crash> rq.clock ffff8f7c9f821640
  clock = 27783793437141945
crash>

crash> pd (27783793437141945-27783787184493456)
$8 = 6252648489  // nanoseconds (about 6.25 s)
crash>

From the stack of PID 22335 we can see that it entered IO wait because the number of IO requests in the request queue still waiting to complete exceeded the limit. Comparing rq.clock of the CPU the task was on with the timestamp of the last time the task was scheduled before the panic shows that the task had been in D state for nearly 6 seconds.

Why would the thread responsible for writing back dirty pages stay asleep for more than 6 seconds without being woken, just because it could not get a request? Let's dig into the possible reasons. Combining the stack of the dirty page writeback kworker (PID 22335) with the relevant function code, we can recover the argument values passed to __wbt_wait:

static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
        ....
        rq_qos_throttle(q, bio, NULL);
        trace_block_getrq(q, bio, bio->bi_opf);
        rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
        ......
}
void rq_qos_throttle(struct request_queue *q, struct bio *bio,
                     spinlock_t *lock)
{
        struct rq_qos *rqos;
        for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
                if (rqos->ops->throttle)//wbt_wait
                        rqos->ops->throttle(rqos, bio, lock);
        }
}
//rdi=rbx = struct rq_qos ffff8f648f036258
//rsi=r13= struct bio ffff8f7c9466f000
static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
        return container_of(rqos, struct rq_wb, rqos);
}
static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
{
       //struct rq_wb 0xffff8f648f036200
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags;
       // for a read request, flags is WBT_READ here
        flags = bio_to_wbt_flags(rwb, bio);
        // read IO and direct write IO return here directly and are not throttled
        if (!(flags & WBT_TRACKED)) {
                if (flags & WBT_READ)
                        wb_timestamp(rwb, &rwb->last_issue);
                return;
        }
         //rwb= 0xffff8f648f036200
         //flags = 0     bio->bi_opf=0x100001
        __wbt_wait(rwb, flags, bio->bi_opf, lock);
        if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
}
static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
        switch (bio_op(bio)) {
        case REQ_OP_WRITE:
                /*
                 * Don't throttle WRITE_ODIRECT
                 */
                if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
                    (REQ_SYNC | REQ_IDLE))
                        return false;
                /* fallthrough */
// no break in the switch: a buffered REQ_OP_WRITE falls through into the REQ_OP_DISCARD branch
        case REQ_OP_DISCARD:
                return true;
        default:
                return false;
        }
}
static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
        enum wbt_flags flags = 0;
        if (!rwb_enabled(rwb))
                return 0;
        if (bio_op(bio) == REQ_OP_READ) {
                flags = WBT_READ;
        } else if (wbt_should_throttle(rwb, bio)) {
                if (current_is_kswapd())
                        flags |= WBT_KSWAPD;
                if (bio_op(bio) == REQ_OP_DISCARD)
                        flags |= WBT_DISCARD;
                flags |= WBT_TRACKED;
        }
        return flags;
}
crash> struct rq_wb.rqos -xo
struct rq_wb {
  [0x58] struct rq_qos rqos;
}
crash>
// 0xffff8f648f036258 is the address of the rqos member inside rq_wb; compute the rq_wb address from it
crash> px (0xffff8f648f036258-0x58)
$29 = 0xffff8f648f036200 //struct rq_wb
crash>
crash> struct bio.bi_opf ffff8f7c9466f000 -x
  bi_opf = 0x100001
crash>
static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
        return container_of(rqos, struct rq_wb, rqos);
}
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
                       unsigned long rw, spinlock_t *lock)
        __releases(lock)
        __acquires(lock)
{
        struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
        struct wbt_wait_data data = {
                .wq = {
                        .func   = wbt_wake_function,
                        .entry  = LIST_HEAD_INIT(data.wq.entry),
                },
                .task = current,
                .rwb = rwb,
                .rqw = rqw,
                .rw = rw,
        };
        bool has_sleeper;
        has_sleeper = wq_has_sleeper(&rqw->wait);
        if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
                return;
        prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
        do {
                if (data.got_token)
                        break;
                if (!has_sleeper &&
                    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
                        finish_wait(&rqw->wait, &data.wq);
                        /*
                         * We raced with wbt_wake_function() getting a token,
                         * which means we now have two. Put our local token
                         * and wake anyone else potentially waiting for one.
                         */
                        if (data.got_token)
                                wbt_rqw_done(rwb, rqw, wb_acct);
                        break;
                }
                if (lock) {
                        spin_unlock_irq(lock);
                        io_schedule();
                        spin_lock_irq(lock);
                } else
                        io_schedule();
                has_sleeper = false;
        } while (1);
        finish_wait(&rqw->wait, &data.wq);
}
crash> bt 22335
PID: 22335  TASK: ffff8f65f6d08000  CPU: 64  COMMAND: "kworker/u162:4"
 #0 [ffffa29c434b3808] __schedule at ffffffff9e8d4009
 #1 [ffffa29c434b3898] schedule at ffffffff9e8d4438
 #2 [ffffa29c434b38b8] io_schedule at ffffffff9e8d4856
 #3 [ffffa29c434b38d0] wbt_wait at ffffffff9e42b643
 #4 [ffffa29c434b3978] rq_qos_throttle at ffffffff9e402d86
 #5 [ffffa29c434b39a0] blk_mq_make_request at ffffffff9e3f3233
 #6 [ffffa29c434b3a28] generic_make_request at ffffffff9e3e50ea
 #7 [ffffa29c434b3a88] submit_bio at ffffffff9e3e53c1
 #8 [ffffa29c434b3af0] xfs_submit_ioend at ffffffffc0e479c9  [xfs]
 #9 [ffffa29c434b3b20] xfs_vm_writepages at ffffffffc0e47c0f  [xfs]
#10 [ffffa29c434b3b88] do_writepages at ffffffff9e21193f
#11 [ffffa29c434b3ba8] __writeback_single_inode at ffffffff9e2e19f5
#12 [ffffa29c434b3bf0] writeback_sb_inodes at ffffffff9e2e22e9
#13 [ffffa29c434b3cd0] __writeback_inodes_wb at ffffffff9e2e269c
#14 [ffffa29c434b3d18] wb_writeback at ffffffff9e2e293c
#15 [ffffa29c434b3dc0] wb_workfn at ffffffff9e2e3136
#16 [ffffa29c434b3e60] process_one_work at ffffffff9e0accf4
#17 [ffffa29c434b3ea8] worker_thread at ffffffff9e0ad99f
#18 [ffffa29c434b3f08] kthread at ffffffff9e0b2f95
#19 [ffffa29c434b3f50] ret_from_fork at ffffffff9ea00215
crash> dis ffffffff9e402d86 -r
0xffffffff9e402d50 <rq_qos_throttle>:   nopl   0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff9e402d55 <rq_qos_throttle+5>: push   %rbp
0xffffffff9e402d56 <rq_qos_throttle+6>: mov    %rsp,%rbp
0xffffffff9e402d59 <rq_qos_throttle+9>: push   %r13
0xffffffff9e402d5b <rq_qos_throttle+11>:        mov    %rsi,%r13
0xffffffff9e402d5e <rq_qos_throttle+14>:        push   %r12
0xffffffff9e402d60 <rq_qos_throttle+16>:        mov    %rdx,%r12
0xffffffff9e402d63 <rq_qos_throttle+19>:        push   %rbx
0xffffffff9e402d64 <rq_qos_throttle+20>:        mov    0x38(%rdi),%rbx
0xffffffff9e402d68 <rq_qos_throttle+24>:        test   %rbx,%rbx
0xffffffff9e402d6b <rq_qos_throttle+27>:        je     0xffffffff9e402d8f <rq_qos_throttle+63>
0xffffffff9e402d6d <rq_qos_throttle+29>:        mov    (%rbx),%rax
0xffffffff9e402d70 <rq_qos_throttle+32>:        mov    (%rax),%rax
0xffffffff9e402d73 <rq_qos_throttle+35>:        test   %rax,%rax
0xffffffff9e402d76 <rq_qos_throttle+38>:        je     0xffffffff9e402d86 <rq_qos_throttle+54>
0xffffffff9e402d78 <rq_qos_throttle+40>:        mov    %r12,%rdx//lock
0xffffffff9e402d7b <rq_qos_throttle+43>:        mov    %r13,%rsi//struct bio
0xffffffff9e402d7e <rq_qos_throttle+46>:        mov    %rbx,%rdi//struct rq_qos
0xffffffff9e402d81 <rq_qos_throttle+49>:        callq  0xffffffff9ec03000 <__entry_trampoline_end>
0xffffffff9e402d86 <rq_qos_throttle+54>:        mov    0x18(%rbx),%rbx

crash> whatis wbt_wait
void wbt_wait(struct rq_qos *, struct bio *, spinlock_t *);
crash>

crash> dis wbt_wait | head -15
0xffffffff9e42b3c0 <wbt_wait>:  nopl   0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff9e42b3c5 <wbt_wait+5>:        push   %rbp
0xffffffff9e42b3c6 <wbt_wait+6>:        mov    %rsp,%rbp
0xffffffff9e42b3c9 <wbt_wait+9>:        push   %r15
0xffffffff9e42b3cb <wbt_wait+11>:       mov    %rsi,%r15
0xffffffff9e42b3ce <wbt_wait+14>:       lea    0x10(%rsi),%rsi
0xffffffff9e42b3d2 <wbt_wait+18>:       push   %r14
0xffffffff9e42b3d4 <wbt_wait+20>:       push   %r13 // arg 2: struct bio ffff8f7c9466f000
0xffffffff9e42b3d6 <wbt_wait+22>:       lea    -0x58(%rdi),%r13
0xffffffff9e42b3da <wbt_wait+26>:       push   %r12 // arg 3: lock = 0000000000000000
0xffffffff9e42b3dc <wbt_wait+28>:       mov    %rdi,%r12
0xffffffff9e42b3df <wbt_wait+31>:       mov    %r13,%rdi
0xffffffff9e42b3e2 <wbt_wait+34>:       push   %rbx // arg 1: struct rq_qos ffff8f648f036258
0xffffffff9e42b3e3 <wbt_wait+35>:       mov    %rdx,%rbx
0xffffffff9e42b3e6 <wbt_wait+38>:       sub    $0x70,%rsp


crash> bt 22335 -f | grep wbt_wait -A12
 #3 [ffffa29c434b38d0] wbt_wait at ffffffff9e42b643
    ffffa29c434b38d8: ffff8f65f6d08000 8f03620000100000
    ffffa29c434b38e8: 0000000100000000 0000000000000001
    ffffa29c434b38f8: 0000000000000000 ffffffff9e42b2c0
    ffffa29c434b3908: ffffa29c24fe3ac8 ffff8f648f036288
    ffffa29c434b3918: ffff8f65f6d08000 ffff8f648f036200
    ffffa29c434b3928: ffff8f648f036280 0000000000100001
    ffffa29c434b3938: 0000000000000000 a4a8e8b4f7058000
    ffffa29c434b3948: ffff8f648f036258 0000000000000000
    ffffa29c434b3958: ffff8f7c9466f000 ffffa29c434b3de8
    ffffa29c434b3968: ffff8f7c93eb4000 ffffa29c434b3998
    ffffa29c434b3978: ffffffff9e402d86
 #4 [ffffa29c434b3978] rq_qos_throttle at ffffffff9e402d86
crash>


rdi=rbx = struct rq_qos ffff8f648f036258
rsi=r13= struct bio ffff8f7c9466f000
crash> struct rq_wb.rqos -xo
struct rq_wb {
  [0x58] struct rq_qos rqos;
}
crash>
// 0xffff8f648f036258 is the address of the struct rq_qos rqos member inside rq_wb; compute the rq_wb address from it
crash> px (0xffff8f648f036258-0x58)
$29 = 0xffff8f648f036200 //struct rq_wb
crash>
crash> struct rq_qos.q ffff8f648f036258
  q = 0xffff8f64792d1c80
crash>
crash> struct bio.bi_opf ffff8f7c9466f000 -x
  bi_opf = 0x100001
crash>

Having obtained the rq_wb address from rq_qos:

From the struct rq_wait information we can see that the write IO wait queue still has 4 requests whose IO has not completed. Combined with the implementation of wbt_rqw_done, this confirms why the per-bdi writeback thread was blocked for more than 6 seconds waiting for a write request: the previously allocated write requests had not all finished writing to disk. Read IO and direct write IO are not counted in inflight:

static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
{
       //struct rq_wb 0xffff8f648f036200
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags;
       // for a read request, flags is WBT_READ here
        flags = bio_to_wbt_flags(rwb, bio);
        // read requests and direct write IO return here directly and are not throttled
        if (!(flags & WBT_TRACKED)) {
                if (flags & WBT_READ)
                        wb_timestamp(rwb, &rwb->last_issue);
                return;
        }
         //rwb= 0xffff8f648f036200
         //flags = 0     bio->bi_opf=0x100001
        __wbt_wait(rwb, flags, bio->bi_opf, lock);
        if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
                                          enum wbt_flags wb_acct)
{
        if (wb_acct & WBT_KSWAPD)
                return &rwb->rq_wait[WBT_RWQ_KSWAPD];
        else if (wb_acct & WBT_DISCARD)
                return &rwb->rq_wait[WBT_RWQ_DISCARD];

        return &rwb->rq_wait[WBT_RWQ_BG];
}


crash> struct rq_wb.rq_wait 0xffff8f648f036200 -xo
struct rq_wb {
  [ffff8f648f036280] struct rq_wait rq_wait[3];
}

crash> px &((struct rq_wait *)0xffff8f648f036280)[0]
$10 = (struct rq_wait *) 0xffff8f648f036280
crash>

static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
                       unsigned long rw, spinlock_t *lock)
        __releases(lock)
        __acquires(lock)
{
        struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
        struct wbt_wait_data data = {
                .wq = {
                        .func   = wbt_wake_function,
                        .entry  = LIST_HEAD_INIT(data.wq.entry),
                },
                .task = current,
                .rwb = rwb,
                .rqw = rqw,
                .rw = rw,
        };
        bool has_sleeper;
        has_sleeper = wq_has_sleeper(&rqw->wait);
        // If some process is already sleeping in the wait queue because it could not get a request,
        // fall through to the sleep path below. If no one is sleeping yet, check whether the number of
        // unreleased requests exceeds the limit; if it does not, return right away and take a request.
        if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
                return;
        prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); // join the wait queue
        do {
                if (data.got_token)
                        break;
                if (!has_sleeper &&
                    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
                        finish_wait(&rqw->wait, &data.wq);
                        /*
                         * We raced with wbt_wake_function() getting a token,
                         * which means we now have two. Put our local token
                         * and wake anyone else potentially waiting for one.
                         */
                        if (data.got_token)
                                wbt_rqw_done(rwb, rqw, wb_acct);
                        break;
                }
                if (lock) {
                        spin_unlock_irq(lock);
                        io_schedule();
                        spin_lock_irq(lock);
                } else
                        io_schedule();
                has_sleeper = false;
        } while (1);
        finish_wait(&rqw->wait, &data.wq);
}
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
{       // inflight counts how many non-direct-IO write requests have not yet been released
        return atomic_inc_below(&rq_wait->inflight, limit);
}
static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
        unsigned int cur = atomic_read(v);
        for (;;) {
                unsigned int old;
                if (cur >= below)
                        return false;
                old = atomic_cmpxchg(v, cur, cur + 1);
                if (old == cur)
                        break;
                cur = old;
        }
        return true;
}
static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
        unsigned int limit;
        /*
         * If we got disabled, just return UINT_MAX. This ensures that
         * we'll properly inc a new IO, and dec+wakeup at the end.
         */
        if (!rwb_enabled(rwb))
                return UINT_MAX;
        if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
                return rwb->wb_background;
        if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
                limit = rwb->rq_depth.max_depth;
        else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
                /*
                 * If less than 100ms since we completed unrelated IO,
                 * limit us to half the depth for background writeback.
                 */
                limit = rwb->wb_background;
        } else
                limit = rwb->wb_normal;
        return limit;
}
static bool wb_recent_wait(struct rq_wb *rwb)
{
        struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
        return time_before(jiffies, wb->dirty_sleep + HZ);
}
static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
                         enum wbt_flags wb_acct)
{
        int inflight, limit;
       // each completed non-direct-IO write request decrements inflight by 1 here
        inflight = atomic_dec_return(&rqw->inflight);
        /*
         * wbt got disabled with IO in flight. Wake up any potential
         * waiters, we don't have to do more than that.
         */
        if (unlikely(!rwb_enabled(rwb))) {
                rwb_wake_all(rwb);
                return;
        }
        /*
         * For discards, our limit is always the background. For writes, if
         * the device does write back caching, drop further down before we
         * wake people up.
         */
        if (wb_acct & WBT_DISCARD)
                limit = rwb->wb_background;
 // With cgroup writeback support, wb_recent_wait() always returns 0 here: each cgroup has its own
 // per-partition writeback structure, so backing_dev_info.wb is no longer used and
 // backing_dev_info.wb.dirty_sleep keeps its boot-time initial value of jiffies - 300*HZ.
        else if (rwb->wc && !wb_recent_wait(rwb))
                limit = 0;
        else
                limit = rwb->wb_normal;
        /*
         * Don't wake anyone up if we are above the normal limit.
         */
         // limit is 0, so processes waiting for request resources are only woken after all
         // previously allocated requests have completed their IO
        if (inflight && inflight >= limit)
                return;
        if (wq_has_sleeper(&rqw->wait)) {
                int diff = limit - inflight;
                if (!inflight || diff >= rwb->wb_background / 2)
                        // wake up every process waiting for request resources
                        wake_up_all(&rqw->wait);
        }
}

Inspecting the write IO wait queue shows 4 requests whose IO writes have not completed. Combined with the
wbt_rqw_done implementation, this confirms that the per-bdi writeback thread was blocked in the wait queue for
more than 6 seconds because it could not get a write request: the previously allocated write requests had not
all been written to disk. Read IO and direct write IO are not counted in inflight:

crash> struct rq_wait.inflight 0xffff8f648f036280
  inflight = {
    counter = 4
  }
crash>
static struct rq_qos_ops wbt_rqos_ops = {
        .throttle = wbt_wait,
        .issue = wbt_issue,
        .track = wbt_track,
        .requeue = wbt_requeue,
        .done = wbt_done,
        .cleanup = wbt_cleanup,
        .exit = wbt_exit,
};


From the code above we can see that once the number of allocated requests whose non-direct write IO has not
completed exceeds the limit, processes that subsequently ask for a request are put to sleep. The limit differs
depending on the IO priority and on whether the writer is kswapd, the per-bdi writeback thread, or a process
writing IO directly, but in every case the number of write requests is capped at a maximum of 24.

crash> struct rq_wb.wb_background,wb_normal 0xffff8f648f036200
  wb_background = 6
  wb_normal = 12
crash> struct rq_wb 0xffff8f648f036200 | grep max_depth
    max_depth = 24,
crash>
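
These three values are consistent with the usual wbt depth scaling, where the normal and background depths are
derived from max_depth. The sketch below is modelled on calc_wb_limits() in block/blk-wbt.c; the exact rounding
is an assumption rather than something verified against this kernel, but it matches the values in this dump
(max_depth = 24 -> wb_normal = 12, wb_background = 6).

/* Sketch of how the wbt write depths relate to max_depth
 * (modelled on calc_wb_limits() in block/blk-wbt.c; rounding is assumed).
 */
static void calc_wb_limits_sketch(unsigned int max_depth,
                                  unsigned int *wb_normal,
                                  unsigned int *wb_background)
{
        *wb_normal = (max_depth + 1) / 2;       /* 24 -> 12 */
        *wb_background = (max_depth + 1) / 4;   /* 24 -> 6  */
}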

From the __wbt_wait implementation we can see that processes waiting for a request sleep on the request wait queue until they are woken up. Let's find every process in that wait queue that entered D state because the write request limit was exceeded:

static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
                       unsigned long rw, spinlock_t *lock)
        __releases(lock)
        __acquires(lock)
{
        struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
        struct wbt_wait_data data = {
                .wq = {
                        .func   = wbt_wake_function,
                        .entry  = LIST_HEAD_INIT(data.wq.entry),
                },
                .task = current,
                .rwb = rwb,
                .rqw = rqw,
                .rw = rw,
        };
       ......
       ......
        prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
       ......
}
void
prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
        unsigned long flags;
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&wq_head->lock, flags);
        if (list_empty(&wq_entry->entry))
                __add_wait_queue_entry_tail(wq_head, wq_entry);
        set_current_state(state);
        spin_unlock_irqrestore(&wq_head->lock, flags);
}
The wait_queue_entry.entry member of struct wbt_wait_data is added to the list whose head is the rq_wait.wait.head member of struct rq_wb.
crash> struct wbt_wait_data.wq -xo
struct wbt_wait_data {
   [0x0] struct wait_queue_entry wq;
}
crash>
crash> struct wait_queue_entry.entry -xo
struct wait_queue_entry {
  [0x18] struct list_head entry;
}
crash>
crash> struct rq_wait.wait ffff8f648f036280 -xo
struct rq_wait {
  [ffff8f648f036280] wait_queue_head_t wait;
}
crash> wait_queue_head_t.head ffff8f648f036280 -xo
typedef struct wait_queue_head {
  [ffff8f648f036288] struct list_head head;
} wait_queue_head_t;
crash> wait_queue_head_t.head ffff8f648f036280
  head = {
    next = 0xffffa29c434b3908,
    prev = 0xffffa29c27617af8
  }
crash>
The wait_queue_entry.entry members are chained together with wait_queue_head_t.head as the list head, so
starting from the head address we can walk the list and dump every wait_queue_entry on it:
list wait_queue_entry.entry -s wait_queue_entry -H ffff8f648f036288 -x
Since wait_queue_entry is the first member of wbt_wait_data, the two share the same address:
crash> struct wbt_wait_data.wq -xo
struct wbt_wait_data {
   [0x0] struct wait_queue_entry wq;
}
crash>
Therefore we can retrieve every wbt_wait_data on the list:
crash> list wait_queue_entry.entry -s wbt_wait_data -H ffff8f648f036288 -x
crash> wbt_wait_data.task -xo
struct wbt_wait_data {
  [0x28] struct task_struct *task;
}
crash>
List the buffered (non-direct) write IO processes currently blocked on the wait queue, waiting for earlier holders of write requests to finish their non-direct write IO and release a request:
crash> list wait_queue_entry.entry -s wbt_wait_data.task -H ffff8f648f036288 -x
ffffa29c434b38f0
  task = 0xffff8f65f6d08000
ffffa29c24fe3ab0
  task = 0xffff8f77e68cc000
ffffa29c39f53960
  task = 0xffff8f4df7894000
ffffa29c3fa7bae0
  task = 0xffff8f6593e24000
ffffa29c27617ae0
  task = 0xffff8f4de3fd0000
crash>
crash> task_struct.comm 0xffff8f65f6d08000
  comm = "kworker/u162:4\000"
crash>

The analysis so far shows that, up to the reboot, the per-bdi writeback kworker had been waiting for a request for over 6 seconds; those 6 seconds were spent waiting for a previously allocated batch of requests to finish their IO and release request resources. A request is made up of one or more bios.

For an io request to actually complete, it still has to be dispatched by the IO scheduler. The system's data disk currently uses the mq-deadline scheduler, which favours read IO; a large volume of read requests can therefore delay write requests from reaching the disk.

crash> struct rq_wb.rqos 0xffff8f648f036200
  rqos = {
    ops = 0xffffffff9f780600,
    q = 0xffff8f64792d1c80,
    id = RQ_QOS_WBT,
    next = 0xffff8f6478d8b360,
    debugfs_dir = 0x0
  }
  
  crash> dev -d | grep ffff8f64792d1c80
    8 ffff8f7c93351000   sdm        ffff8f64792d1c80   21923   184 21739 N/A(MQ)
crash>
  
crash> struct request_queue.elevator 0xffff8f64792d1c80
  elevator = 0xffff8f648e321400
crash>
crash> struct elevator_queue.type 0xffff8f648e321400
  type = 0xffffffff9f77f720
crash>
crash> struct elevator_type.elevator_name 0xffffffff9f77f720
  elevator_name = "mq-deadline\000\000\000\000"
crash>
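
On a live system, the mq-deadline behaviour described above (reads being favoured over writes) is controlled by
a few sysfs tunables. The following is a minimal sketch that dumps them; it assumes the standard mq-deadline
attribute names (read_expire, write_expire, writes_starved, fifo_batch, front_merges) and uses the sdm device
identified above.

/* Minimal sketch: print the mq-deadline tunables of one disk via sysfs.
 * The device name (sdm) comes from the dump above; the attribute names
 * assume the standard mq-deadline sysfs layout.
 */
#include <stdio.h>

int main(void)
{
        static const char *attrs[] = {
                "read_expire",    /* read FIFO deadline, ms */
                "write_expire",   /* write FIFO deadline, ms */
                "writes_starved", /* read batches allowed before a write batch must run */
                "fifo_batch",
                "front_merges",
        };
        char path[128], buf[64];

        for (unsigned int i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/block/sdm/queue/iosched/%s", attrs[i]);
                f = fopen(path, "r");
                if (!f)
                        continue;
                if (fgets(buf, sizeof(buf), f))
                        printf("%-16s %s", attrs[i], buf);
                fclose(f);
        }
        return 0;
}

Among these, writes_starved is the knob most relevant here: it bounds how many read batches may be dispatched before a pending write batch gets its turn, which limits how long in-flight writes can be deferred by read traffic.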

To be continued...

Original-work statement: this article was published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission.

For infringement concerns, please contact cloudcommunity@tencent.com for removal.

