缺页中断处理
do_page_fault
->__do_page_fault
->handle_mm_fault
->__handle_mm_fault
->handle_pte_fault
->do_fault //文件页 mmap方式访问
->do_anonymous_page //匿名页
->do_swap_page //在swap中
->do_wp_page //写时复制
do_page_fault处理流程
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
int fault, sig, code;
unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
mm = tsk->mm;
/* Enable interrupts if they were enabled in the parent context. */
if (interrupts_enabled(regs))
local_irq_enable();
/*
* If we're in an interrupt or have no user context, we must not take
* the fault.
*/
/*
* 这里要区分内核线程和用户线程的内核态
* 内核线程不会出现缺页异常;
* 用户线程的内核态可能会出现缺页异常,比如copy_to_user等
*/
if (in_atomic() || !mm) /* mm = NULL 表示为单纯的内核线程 */
goto no_context;
/* 设置用户态标志 */
if (user_mode(regs))
mm_flags |= FAULT_FLAG_USER;
/* 设置 写 执行标志 */
if (esr & ESR_LNX_EXEC) {
vm_flags = VM_EXEC;
} else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) {
vm_flags = VM_WRITE;
mm_flags |= FAULT_FLAG_WRITE;
}
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
* we can bug out early if this is from code which shouldn't.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
if (!user_mode(regs) && !search_exception_tables(regs->pc))
goto no_context;
retry:
down_read(&mm->mmap_sem); /* 睡眠等待锁释放 */
} else {
/*
* The above down_read_trylock() might have succeeded in which
* case, we'll have missed the might_sleep() from down_read().
*/
might_sleep();
#ifdef CONFIG_DEBUG_VM
if (!user_mode(regs) && !search_exception_tables(regs->pc))
goto no_context;
#endif
}
fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
/*
* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_sem because it
* would already be released in __lock_page_or_retry in mm/filemap.c.
*/
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
return 0;
/*
* Major/minor page fault accounting is only done on the initial
* attempt. If we go through a retry, it is extremely likely that the
* page will be found in page cache at that point.
*/
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
addr);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
addr);
}
if (fault & VM_FAULT_RETRY) {
/*
* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
* starvation.
*/
mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
goto retry;
}
}
/* 释放信号量 */
up_read(&mm->mmap_sem);
/*
* Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
*/
/* 没有异常,正常处理完成 */
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
VM_FAULT_BADACCESS))))
return 0;
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return to
* userspace (which will retry the fault, or kill us if we got
* oom-killed).
*/
pagefault_out_of_memory(); /* 进行OOM处理,然后返回 */
return 0;
}
/* 处理信号错误 */
if (fault & VM_FAULT_SIGBUS) {
/*
* We had some memory, but were unable to successfully fix up
* this page fault.
*/
sig = SIGBUS;
code = BUS_ADRERR;
} else {
/*
* Something tried to access memory that isn't in our memory
* map.
*/
sig = SIGSEGV;
code = fault == VM_FAULT_BADACCESS ?
SEGV_ACCERR : SEGV_MAPERR;
}
/* 用户模式下错误处理,通过给用户进程发信号:SIGBUS/SIGSEGV */
__do_user_fault(tsk, addr, esr, sig, code, regs);
return 0;
no_context:
__do_kernel_fault(mm, addr, esr, regs);
return 0;
}
__do_page_fault
static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags,
struct task_struct *tsk)
{
struct vm_area_struct *vma;
int fault;
/* 在缺页中断处理中,地址必须是映射过的,即可以找到vma */
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
if (unlikely(!vma))
goto out;
if (unlikely(vma->vm_start > addr))
goto check_stack;
/*
* Ok, we have a good vm_area for this memory access, so we can handle
* it.
*/
good_area:
/*
* Check that the permissions on the VMA allow for the fault which
* occurred. If we encountered a write or exec fault, we must have
* appropriate permissions, otherwise we allow any permission.
*/
/*
* 检查权限是否匹配
*/
if (!(vma->vm_flags & vm_flags)) {
fault = VM_FAULT_BADACCESS;
goto out;
}
return handle_mm_fault(mm, vma, addr & PAGE_MASK, mm_flags);
check_stack:
if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
goto good_area;
out:
return fault;
}
handle_mm_fault
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
int ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
ret = __handle_mm_fault(mm, vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
return ret;
}
__handle_mm_fault
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address); /* 获取当前address在当前进程页表项PGD页面目录项 */
pud = pud_alloc(mm, pgd, address); /* 获取当前address在当前进程对应PUD页表目录项 */
if (!pud)
return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address);
......
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address); /* 根据address从pmd中获取pte指针 */
/* */
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
handle_pte_fault
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
* so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
* The code below just needs a consistent view for the ifs and
* we later double check anyway with the ptl lock held. So here
* a barrier will do.
*/
entry = *pte;
barrier();
if (!pte_present(entry)) { /* pte对应的物理页不在内存中 */
if (pte_none(entry)) { /* pte页表项内容为空,同时pte对应物理页面也不存在 */
if (vma->vm_ops) {
/* vm_ops->fault可以区分是匿名也还还是文件页!!!! */
if (likely(vma->vm_ops->fault))
return do_fault(mm, vma, address, pte,
pmd, flags, entry); /* 文件缺页异常 */
}
return do_anonymous_page(mm, vma, address,
pte, pmd, flags); /* 匿名页缺页异常 */
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry); /* pte非空,但是页面不在内存中,即在swap中 */
}
/* 下面都是物理页面存在的情况 */
if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) { /* 对只读属性的页面产生写异常,触发写时复制缺页中断 。硬件的PTE表可写 & 软件PTE表不可写 */
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte); /* pte内容发生变化,需要把新的内容写入pte页表项中,并且刷新TLB和cache */
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}
文件页(mmap访问)缺页中断的处理
do_read_fault
->__do_fault
->filemap_fault
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
struct page *fault_page;
spinlock_t *ptl;
pte_t *pte;
int ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
goto unlock_out;
pte_unmap_unlock(pte, ptl);
}
/* 创建page cache的页面实际操作 */
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
page_cache_release(fault_page);
return ret;
}
/* 生成新的PTE Entry设置到硬件页表项中 */
do_set_pte(vma, address, fault_page, pte, false, false);
unlock_page(fault_page);
unlock_out:
pte_unmap_unlock(pte, ptl);
return ret;
}
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
pgoff_t pgoff, unsigned int flags,
struct page *cow_page, struct page **page)
{
struct vm_fault vmf;
int ret;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
if (!vmf.page)
goto out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
page_cache_release(vmf.page);
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
out:
*page = vmf.page;
return ret;
}
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int error;
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
loff_t size;
int ret = 0;
/* 这里需要注意,如果文件页的大小被改变了,触发异常信号处理 */
size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
if (offset >= size >> PAGE_CACHE_SHIFT)
return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
*/
/* 不会设置mark_page_accessed */
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
/*
* We found the page, so try async readahead before
* waiting for the lock.
*/
/* 尝试触发readahead */
do_async_mmap_readahead(vma, ra, file, page, offset);
} else if (!page) {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
if (!page) /* 仍然失败 */
goto no_cached_page;
}
/*
* 查找成功流程,先锁住该页,再判断是否是最新。
* 如果设置了Uptodate,则查找成功;如果未设置Uptodate,则触发再次读,然后wait_on_page_locked
*/
/* 锁住该页 */
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
page_cache_release(page);
return ret | VM_FAULT_RETRY;
}
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
VM_BUG_ON_PAGE(page->index != offset, page);
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
/* 不是最新页,与后备处理器不一致.什么页面是清除uptodata标志的???? */
if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
/* 页面是最新的 */
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
}
vmf->page = page;
return ret | VM_FAULT_LOCKED;
no_cached_page:
/*
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
error = page_cache_read(file, offset);
/*
* The page we want has now been added to the page cache.
* In the unlikely event that someone removed it in the
* meantime, we'll just come back here and read it again.
*/
if (error >= 0)
goto retry_find;
/*
* An error return from page_cache_read can result if the
* system is low on memory, or a problem occurs while trying
* to schedule I/O.
*/
if (error == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
ClearPageError(page);
error = mapping->a_ops->readpage(file, page); /* 再次读该页 */
if (!error) {
wait_on_page_locked(page); /* 等待页解锁,这个地方会睡眠??? 除非上面是一个同步读,保证这里会成功 ????*/
if (!PageUptodate(page)) /* 再次判断页是否是最新的 */
error = -EIO;
}
page_cache_release(page); /* count-- */
/* 读取成功的情况下,再次查找该页 */
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
/* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
return VM_FAULT_SIGBUS;
}
匿名页缺页中断的处理
页面会设置为PG_SwapBacked、PG_uptodate、PG_Active。
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
{
struct mem_cgroup *memcg;
struct page *page;
spinlock_t *ptl;
pte_t entry;
pte_unmap(page_table);
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGSEGV;
/* Use the zero-page for reads */
/* 如果是分配只读属性的页面,使用一个zeroed的全局页面empty_zero_page */
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* 为vma准备反向映射条件
* 检查此vma能与前后的vma进行合并吗,如果可以,则使用能够合并的那个vma的anon_vma,如果不能够合并,则申请一个空闲的anon_vma
* 新建一个anon_vma_chain
* 将avc->anon_vma指向获得的vma(这个vma可能是新申请的空闲的anon_vma,也可能是获取到的可以合并的vma的anon_vma),avc->vma指向vma,并把avc加入到vma的anon_vma_chain中
*/
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
/*
* 如果页面是可写的,分配掩码是__GFP_MOVABLE|__GFP_WAIT|__GFP_IO|__GFP_FS|__GFP_HARDWALL|__GFP_HIGHMEM。
* 最终调用alloc_pages,优先使用高端内存
*/
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
* the set_pte_at() write.
*/
/* 默认设置页面PG_uptodate标志 */
__SetPageUptodate(page);
if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto release;
/* 增加系统中匿名页面统计计数,计数类型是MM_ANONPAGES */
inc_mm_counter_fast(mm, MM_ANONPAGES);
/* 逆向映射相关,设置PG_SwapBacked标志,表示可以swap;_mapcount=0 */
page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
/* 一般设置为SetPageActive,等待满足一定数量后加入active LRU */
lru_cache_add_active_or_unevictable(page, vma);
setpte:
/* 将entry设置到PTE硬件中 */
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg);
page_cache_release(page);
goto unlock;
oom_free_page:
page_cache_release(page);
oom:
return VM_FAULT_OOM;
}
Swap页缺页中断的处理
写时复制页缺页中断的处理
版权声明:本文为weixin_42318651原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。