1. 前言
本专题我们开始学习内存管理部分,本文为缺页中断处理相关学习笔记。本文主要参考了《奔跑吧, Linux内核》、ULA、ULK的相关内容。
前面概述部分主要介绍了arm64缺页中断的底层逻辑,本文主要以数据异常的do_page_fault为例来说明缺页的详细处理过程。
2. do_page_fault
do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs)
|--struct mm_struct *mm = current->mm;
|--初始化vm_flags和mm_flags
| //如果由kprobe处理这个page fault直接返回
|--if (kprobe_page_fault(regs, esr))
| return 0;
| //page fault禁用或中断上下文,或内核空间(mm==NULL)直接返回
|--if (faulthandler_disabled() || !mm)
| goto no_context;
|--设置vm_flags和mm_flags标记
|--获取mm->mmap_lock锁
|--fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs)
\--根据返回值fault做相应处理
初始化vm_flags为VM_ACCESS_FLAGS,初始化mm_flags为FAULT_FLAG_DEFAULT,vm_flags将用于与addr所在vma的标记进行比较,以确定是否可以访问addr所在的vma
kprobe_page_fault:如果这个page fault已由kprobe处理(kprobe_page_fault返回true),则do_page_fault直接返回0
faulthandler_disabled() || !mm:如果缺页处理被禁用或处于原子上下文(如中断上下文),或者当前是没有用户地址空间的内核线程(mm==NULL),则跳转到no_context,按内核态异常处理,而不再走后面的用户地址空间缺页流程,即排除了中断上下文和内核线程
关于task_struct.mm与active_mm的区别可参考:
https://01.org/linuxgraphics/gfx-docs/drm/vm/active_mm.html
http://t.zoukankan.com/linhaostudy-p-9904846.html
设置vm_flags和mm_flags标记:根据是否是用户模式、是否是用户指令异常、是否是数据写入异常,分别设置vm_flags和mm_flags标记
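获取mm->mmap_lock锁:在查询addr是否位于当前进程地址空间之前,需要先以读者身份获取mm->mmap_lock(旧版内核中名为mmap_sem的读写信号量)来保护vma。这里先用mmap_read_trylock尝试获取:如果获取失败,且异常发生在内核态、异常指令又不在异常表(exception table)中,说明是内核bug,直接跳转到no_context,避免在已持有该锁的情况下再次获取而引起死锁;否则再调用mmap_read_lock睡眠等待锁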
__do_page_fault:执行缺页异常处理,它的返回值用vm_fault_t类型保存,主要取值定义如下:
// include/linux/mm_types.h
/**
 * enum vm_fault_reason - Page fault handlers return a bitmask of
 * these values to tell the core VM what happened when handling the
 * fault. Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * Every enumerator listed below is a distinct single bit, so several
 * reasons can be OR-ed into one return value. The exception is
 * VM_FAULT_HINDEX_MASK, which spans four bits (0x0f0000). The value
 * 0x000080 is not assigned to any enumerator in this listing.
 *
 * @VM_FAULT_OOM:		Out Of Memory
 * @VM_FAULT_SIGBUS:		Bad access
 * @VM_FAULT_MAJOR:		Page read from storage
 * @VM_FAULT_WRITE:		Special case for get_user_pages
 * @VM_FAULT_HWPOISON:		Hit poisoned small page
 * @VM_FAULT_HWPOISON_LARGE:	Hit poisoned large page. Index encoded
 *				in upper bits
 * @VM_FAULT_SIGSEGV:		segmentation fault
 * @VM_FAULT_NOPAGE:		->fault installed the pte, not return page
 * @VM_FAULT_LOCKED:		->fault locked the returned page
 * @VM_FAULT_RETRY:		->fault blocked, must retry
 * @VM_FAULT_FALLBACK:		huge page fault failed, fall back to small
 * @VM_FAULT_DONE_COW:		->fault has fully handled COW
 * @VM_FAULT_NEEDDSYNC:		->fault did not modify page tables and needs
 *				fsync() to complete (for synchronous page faults
 *				in DAX)
 * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
 *
 */
enum vm_fault_reason {
VM_FAULT_OOM = (__force vm_fault_t)0x000001,
VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002,
VM_FAULT_MAJOR = (__force vm_fault_t)0x000004,
VM_FAULT_WRITE = (__force vm_fault_t)0x000008,
VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010,
VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040,
VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100,
VM_FAULT_LOCKED = (__force vm_fault_t)0x000200,
VM_FAULT_RETRY = (__force vm_fault_t)0x000400,
VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
|- -设置vm_flags和mm_flags标记
设置vm_flags和mm_flags标记
|--if (user_mode(regs))
| mm_flags |= FAULT_FLAG_USER;
|--if (is_el0_instruction_abort(esr))
| vm_flags = VM_EXEC
| mm_flags |= FAULT_FLAG_INSTRUCTION
|--else if (is_write_abort(esr))
| vm_flags = VM_WRITE
| mm_flags |= FAULT_FLAG_WRITE
\--if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs))
die_kernel_fault(...)
主要根据是否是用户模式、是否是指令异常、是否是写入异常等分别设置vm_flags和mm_flags标志
user_mode(regs):判断如果是用户态(通过((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t),则 mm_flags |= FAULT_FLAG_USER
is_el0_instruction_abort(esr):如果是用户态取指令触发的异常,则要求addr所在的vma具有可执行权限,因此将vm_flags设为VM_EXEC,并在mm_flags中置位FAULT_FLAG_INSTRUCTION
is_write_abort(esr):如果是写入导致的异常错误,设置VM_WRITE标志
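is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs):如果addr为用户空间地址,且权限异常发生在el1(内核态访问用户地址),则调用die_kernel_fault进行处理,主要区分了三种情况:
(1) 异常发生时addr_limit为KERNEL_DS(regs->orig_addr_limit == KERNEL_DS),即在fs=KERNEL_DS的情况下访问了用户内存;
(2) is_el1_instruction_abort(esr)成立,即内核试图执行用户空间的代码;
(3) 异常指令不在异常表中(search_exception_tables(regs->pc)查找失败),即内核在uaccess例程之外访问了用户内存。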
|- -__do_page_fault
__do_page_fault(mm, addr, mm_flags, vm_flags, regs)
| //查找vma
|--vma = find_vma(mm, addr)
| //如果查找不到vma,说明addr还没有在进程地址空间,即不在进程的任何vma
|--if (unlikely(!vma))
| return VM_FAULT_BADMAP
| //addr低于vma的起始地址:只有当vma是可向下增长的栈(VM_GROWSDOWN)时才尝试将vma扩展到addr,否则或扩展失败时返回VM_FAULT_BADMAP
|--if (unlikely(vma->vm_start > addr))
| expand_stack(vma, addr)
| //将根据异常类型更新的vm_flags与addr所在线性区标记进行比较,如果不匹配,则返回错误
|--if (!(vma->vm_flags & vm_flags))
| return VM_FAULT_BADACCESS
|--handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs)
handle_mm_fault(vma, address, flags, regs)
|-- __set_current_state(TASK_RUNNING)
|--count_vm_event(PGFAULT)
|--count_memcg_event_mm(vma->vm_mm, PGFAULT)
|--check_sync_rss_stat(current)
|--if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
| flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE))
| return VM_FAULT_SIGSEGV
|--ret = __handle_mm_fault(vma, address, flags)
| //计数缺页异常的页面,会区分主缺页和次缺页
\--mm_account_fault(regs, address, flags, ret)
执行handle_mm_fault(请求调页)分配一个新的页框。如果成功,返回值中不包含任何错误位;其中若需要从存储设备读入页面(主缺页),返回值中会置位VM_FAULT_MAJOR,否则为次缺页。注意:旧版内核(如ULK所述)中的VM_FAULT_MINOR在上面列出的vm_fault_reason中已不存在
__handle_mm_fault(vma, address, flags)
|--初始化struct vm_fault vmf
|--struct mm_struct *mm = vma->vm_mm //获取进程的内存描述符
|--pgd = pgd_offset(mm, address) //初始化缺页异常地址对应的pgd页表项
|--p4d = p4d_alloc(mm, pgd, address)
|--vmf.pud = pud_alloc(mm, p4d, address)
|--vmf.pmd = pmd_alloc(mm, vmf.pud, address)
\--handle_pte_fault(&vmf)
__handle_mm_fault是缺页处理的核心函数,其中关于巨页相关的处理省略
初始化struct vm_fault vmf:主要初始化了vma, address(页对齐),flags,pgoff(address在vma的偏移),gfp_mask
handle_pte_fault:通过vm_fault数据结构传递所需的各项参数。它首先判断addr所对应的pte是否为空:如果为空,则进一步根据vma判断是匿名映射还是文件映射并分别处理;如果不为空,则根据pte页表项的内容触发页面换入、写时复制等处理。详细的分析流程见:内存管理基础学习笔记 - 4.3 缺页中断处理 - handle_pte_fault
|- -根据返回值fault做相应处理
根据返回值fault做相应处理
| //如果有信号pending,则优先处理信号
|--if (fault_signal_pending(fault, regs))
| return 0;
| // retry处理
|--if (fault & VM_FAULT_RETRY)
| if (mm_flags & FAULT_FLAG_ALLOW_RETRY)
| mm_flags |= FAULT_FLAG_TRIED;
| goto retry;
| //正常情况的处理,Handle the "normal" (no error) case first
|--if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
| VM_FAULT_BADACCESS))))
| return 0;
| //如果异常发生在内核态,跳转到no_context,由__do_kernel_fault尝试通过异常表修复,无法修复则报告内核错误
|--if (!user_mode(regs))
| goto no_context
| //缺页处理过程中无法分配内存
| //执行OOM killer, 并返回用户空间,用户空间会重新触发page fault
|--if (fault & VM_FAULT_OOM)
| pagefault_out_of_memory()
|--inf = esr_to_fault_info(esr)
|--set_thread_esr(addr, esr)
| //系统中有内存,但是发生了无法修复的缺页错误,向当前进程发送信号来终止发生异常的进程
|--if (fault & VM_FAULT_SIGBUS)
| arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr,inf->name) //发送SIGBUS给当前进程
|--else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON))
| arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb,inf->name)
| //Something tried to access memory that isn't in our memory map
\--else
arm64_force_sig_fault(SIGSEGV,fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
(void __user *)addr,inf->name);//发送SIGSEGV给当前进程
参考文档
- 奔跑吧,Linux内核