内存管理基础学习笔记 - 4.2 缺页中断处理 - do_page_fault

1. 前言

本专题我们开始学习内存管理部分,本文为缺页中断处理相关学习笔记。本文主要参考了《奔跑吧, Linux内核》、ULA、ULK的相关内容。
前面概述部分主要介绍了arm64缺页中断的底层逻辑,本文主要以数据异常的do_page_fault为例来说明缺页的详细处理过程。

2. do_page_fault

do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs)
    |--struct mm_struct *mm = current->mm;
    |--初始化vm_flags和mm_flags
    |  //如果由kprobe处理这个page fault直接返回
    |--if (kprobe_page_fault(regs, esr))
    |      return 0;
    |  //page fault被禁用或处于中断(原子)上下文,或当前为内核线程(mm==NULL,没有用户地址空间),则跳转到no_context处理
    |--if (faulthandler_disabled() || !mm)
    |      goto no_context;
    |--设置vm_flags和mm_flags标记
    |--获取mm->mmap_lock锁
    |--fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs)
    \--根据返回值fault做相应处理
    
  1. 初始化vm_flags为VM_ACCESS_FLAGS,初始化mm_flags为FAULT_FLAG_DEFAULT,vm_flags将用于与addr所在vma的标记进行比较,以确定是否可以访问addr所在的vma

  2. kprobe_page_fault:如果这个page fault已由kprobe处理(kprobe_page_fault返回true),则do_page_fault直接返回0

  3. faulthandler_disabled() || !mm:page fault被禁用或处于中断(原子)上下文,或当前为内核线程(mm==NULL,没有用户地址空间)时,跳转到no_context处理,即排除了中断上下文和内核线程的情况

关于task_struct.mm与active_mm的区别可参考:
https://01.org/linuxgraphics/gfx-docs/drm/vm/active_mm.html
http://t.zoukankan.com/linhaostudy-p-9904846.html

  4. 设置vm_flags和mm_flags标记:根据是否是用户模式、是否是用户指令异常、是否是数据写入异常,分别设置vm_flags和mm_flags标记

  5. 获取mm->mmap_lock锁:检查进程地址空间之前需要先加锁保护,即获取进程的mmap_lock(旧版内核中名为mmap_sem)读写信号量的读锁,以便查询addr是否位于当前进程的地址空间。这里先以trylock的方式尝试获取;如果获取失败,并且异常来自内核态、异常指令地址又不在异常表中,则认为是内核bug,直接跳转到no_context,从而避免因此引起死锁

  6. __do_page_fault:执行缺页异常处理,它的返回值通过vm_fault_t来保存,主要类型定义如下:

#include/linux/mm_types.h
/**
 * enum vm_fault_reason - Page fault handlers return a bitmask of
 * these values to tell the core VM what happened when handling the
 * fault. Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * @VM_FAULT_OOM:               Out Of Memory
 * @VM_FAULT_SIGBUS:            Bad access
 * @VM_FAULT_MAJOR:             Page read from storage
 * @VM_FAULT_WRITE:             Special case for get_user_pages
 * @VM_FAULT_HWPOISON:          Hit poisoned small page
 * @VM_FAULT_HWPOISON_LARGE:    Hit poisoned large page. Index encoded
 *                              in upper bits
 * @VM_FAULT_SIGSEGV:           segmentation fault
 * @VM_FAULT_NOPAGE:            ->fault installed the pte, not return page
 * @VM_FAULT_LOCKED:            ->fault locked the returned page
 * @VM_FAULT_RETRY:             ->fault blocked, must retry
 * @VM_FAULT_FALLBACK:          huge page fault failed, fall back to small
 * @VM_FAULT_DONE_COW:          ->fault has fully handled COW
 * @VM_FAULT_NEEDDSYNC:         ->fault did not modify page tables and needs
 *                              fsync() to complete (for synchronous page faults
 *                              in DAX)
 * @VM_FAULT_HINDEX_MASK:       mask HINDEX value
 *
 */
enum vm_fault_reason {
        VM_FAULT_OOM            = (__force vm_fault_t)0x000001,
        VM_FAULT_SIGBUS         = (__force vm_fault_t)0x000002,
        VM_FAULT_MAJOR          = (__force vm_fault_t)0x000004,
        VM_FAULT_WRITE          = (__force vm_fault_t)0x000008,
        VM_FAULT_HWPOISON       = (__force vm_fault_t)0x000010,
        VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
        VM_FAULT_SIGSEGV        = (__force vm_fault_t)0x000040,
        VM_FAULT_NOPAGE         = (__force vm_fault_t)0x000100,
        VM_FAULT_LOCKED         = (__force vm_fault_t)0x000200,
        VM_FAULT_RETRY          = (__force vm_fault_t)0x000400,
        VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
        VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
        VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
        VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
};

|--设置vm_flags和mm_flags标记

设置vm_flags和mm_flags标记
    |--if (user_mode(regs))
    |      mm_flags |= FAULT_FLAG_USER;
    |--if (is_el0_instruction_abort(esr))
    |      vm_flags = VM_EXEC
    |      mm_flags |= FAULT_FLAG_INSTRUCTION
    |--else if (is_write_abort(esr))
    |      vm_flags = VM_WRITE
    |      mm_flags |= FAULT_FLAG_WRITE
    \--if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs))
           die_kernel_fault(...

主要根据是否是用户模式、是否是指令异常、是否是写入异常等分别设置vm_flags和mm_flags标志

  1. user_mode(regs):判断如果是用户态(通过((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t),则 mm_flags |= FAULT_FLAG_USER

  2. is_el0_instruction_abort(esr):如果是用户态取指令导致的异常,则将vm_flags设置为VM_EXEC(即要求addr所在的vma必须具有可执行权限),并设置FAULT_FLAG_INSTRUCTION标志

  3. is_write_abort(esr):如果是写入导致的异常错误,设置VM_WRITE标志

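  4. is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs):如果addr为用户空间地址,且权限异常发生在el1(即内核态访问用户空间地址时出现权限错误),则调用die_kernel_fault进行处理。以5.10内核为例,主要区分了三种情况:(1)regs->orig_addr_limit == KERNEL_DS,即在fs=KERNEL_DS的情况下访问用户内存;(2)is_el1_instruction_abort(esr),即内核试图执行用户空间的指令;(3)异常指令地址不在异常表中(search_exception_tables(regs->pc)查找失败),即在uaccess例程之外访问用户内存。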

|--__do_page_fault

__do_page_fault(mm, addr, mm_flags, vm_flags, regs)
    |  //查找vma
    |--vma = find_vma(mm, addr)
    |  //如果查找不到vma,说明addr还没有在进程地址空间,即不在进程的任何vma
    |--if (unlikely(!vma)) 
    |      return VM_FAULT_BADMAP 
    |  //尝试将vma扩展到addr
    |--if (unlikely(vma->vm_start > addr))
    |      expand_stack(vma, addr)
    |  //将根据异常类型更新的vm_flags与addr所在线性区标记进行比较,如果不匹配,则返回错误 
    |--if (!(vma->vm_flags & vm_flags))
    |      return VM_FAULT_BADACCESS
    \--handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs)

handle_mm_fault(vma, address, flags, regs)
    |-- __set_current_state(TASK_RUNNING)
    |--count_vm_event(PGFAULT)
    |--count_memcg_event_mm(vma->vm_mm, PGFAULT)
    |--check_sync_rss_stat(current)
    |  //检查体系结构相关的vma访问权限,不允许访问则返回VM_FAULT_SIGSEGV
    |--if (!arch_vma_access_permitted(vma, flags&FAULT_FLAG_WRITE,
    |                     flags&FAULT_FLAG_INSTRUCTION,flags&FAULT_FLAG_REMOTE))
    |      return VM_FAULT_SIGSEGV
    |--ret = __handle_mm_fault(vma, address, flags)
    |  //计数缺页异常的页面,会区分主缺页和次缺页
    \--mm_account_fault(regs, address, flags, ret) 

执行handle_mm_fault(请求调页)分配一个新的页框。如果成功,返回值中不包含任何错误位:若置位了VM_FAULT_MAJOR,表示主缺页(需要从存储设备读取页面);否则为次缺页(早期内核中的VM_FAULT_MINOR在上面列出的vm_fault_reason中已不存在,次缺页对应的返回值为0)

__handle_mm_fault(vma, address, flags)
    |--初始化struct vm_fault vmf
    |--struct mm_struct *mm = vma->vm_mm //获取进程的内存描述符
    |--pgd = pgd_offset(mm, address) //初始化缺页异常地址对应的pgd页表项
    |--p4d = p4d_alloc(mm, pgd, address)
    |--vmf.pud = pud_alloc(mm, p4d, address)
    |--vmf.pmd = pmd_alloc(mm, vmf.pud, address)
    \--handle_pte_fault(&vmf) 

__handle_mm_fault是缺页处理的核心函数,其中关于巨页相关的处理省略

  1. 初始化struct vm_fault vmf:主要初始化了vma, address(页对齐),flags,pgoff(address在vma的偏移),gfp_mask

  2. handle_pte_fault:主要采用vm_fault数据结构来管理很多参数,它主要通过vma首先判断addr所对应的pte是否为空,如果为空则进一步判断是匿名映射还是文件映射,如果不为空,则进一步根据pte页表项,触发页面换入、写时复制等。详细的分析流程见:内存管理基础学习笔记 - 4.3 缺页中断处理 - handle_pte_fault

|--根据返回值fault做相应处理

根据返回值fault做相应处理
    |  //如果有信号pending,则优先处理信号
    |--if (fault_signal_pending(fault, regs))
    |      return 0;
    |  // retry处理
    |--if (fault & VM_FAULT_RETRY)
    |      if (mm_flags & FAULT_FLAG_ALLOW_RETRY)
    |          mm_flags |= FAULT_FLAG_TRIED;
    |          goto retry;
    |  //正常情况的处理,Handle the "normal" (no error) case first
    |--if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
    |      VM_FAULT_BADACCESS)))) 
    |      return 0
    |  //如果是kernel模式,直接退出
    |--if (!user_mode(regs)) 
    |      goto no_context
    |  //缺页处理过程中无法分配内存
    |  //执行OOM killer, 并返回用户空间,用户空间会重新触发page fault
    |--if (fault & VM_FAULT_OOM)
    |      pagefault_out_of_memory()
    |      return 0
    |--inf = esr_to_fault_info(esr)
    |--set_thread_esr(addr, esr)
    |  //系统中有内存,但是发生了无法处理的错误,由内核发送信号来终止发生异常的进程
    |--if (fault & VM_FAULT_SIGBUS)
    |      arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,inf->name) //发送SIGBUS给当前进程
    |--else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON))
    |      arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,inf->name)
    |  //Something tried to access memory that isn't in our memory map
    \--else 
           arm64_force_sig_fault(SIGSEGV,fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
           				(void __user *)addr,inf->name);//发送SIGSEGV给当前进程

参考文档

  1. 奔跑吧,Linux内核

版权声明:本文为jasonactions原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。