Home Kernel userfaultfd
Post
Cancel

Kernel userfaultfd

userfaultfd只支持匿名页、hugetlb、共享内存;

一、软件流程

1、初始化

调用__NR_userfaultfd syscall进行初始化:该syscall建立匿名inode文件,初始化file->private_data,并向用户态返回文件fd。

  1. 用户态:

    1
    
    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    
  2. 内核态:

    1
    2
    3
    
    d = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,  
    	              O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));  
       
    

2、设置监视区

ioctl的UFFDIO_REGISTER选项注册监视区域;

  1. 用户态:

    1
    2
    3
    4
    5
    6
    
    uffdio_register.range.start = (unsigned long)start;  
    uffdio_register.range.len = size;  
    uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING  
        | UFFDIO_REGISTER_MODE_USWAP;;  
    if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)  
        errExit("ioctl-UFFDIO_REGISTER");  
    

  2. 内核态:这里会对监控的区域进行拆分与合并:从非监控区域中拆分出来,合并到已经监控的区域,并新增vm_flags;

    1
    2
    3
    4
    5
    
    //(1)flag的对应关系:
    if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)  
        vm_flags |= VM_UFFD_MISSING;  
    if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {  
        vm_flags |= VM_UFFD_WP;  
    
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    
    new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;  
    //(2)将新增的监控区域与已有的监控区域做合并
    prev = vma_merge(mm, prev, start, vma_end, new_flags,  
                     vma->anon_vma, vma->vm_file, vma->vm_pgoff,  
                     vma_policy(vma),  
                     ((struct vm_userfaultfd_ctx){ ctx }));  
    if (prev) {  
        vma = prev;  
        goto next;  
    }  
    //(3)如果无法合并,可能需要拆分
    if (vma->vm_start < start) {  
        ret = split_vma(mm, vma, start, 1);  
        if (ret)  
            break;  
    }  
    if (vma->vm_end > end) {  
        ret = split_vma(mm, vma, end, 0);  
        if (ret)  
            break;  
    }  
    next:  
    /* 
    * In the vma_merge() successful mprotect-like case 8: 
    * the next vma was merged into the current one and 
    * the current one has not been updated yet. 
    */  
    vma->vm_flags = new_flags;  //+USERFAULT_FD_FLAG: VM_UFFD_MISSING
    vma->vm_userfaultfd_ctx.ctx = ctx; 
    

3、poll event

用户态poll函数轮询uffd,并对轮询到的UFFD_EVENT_PAGEFAULT事件(event)用拷贝(ioctl的UFFDIO_COPY选项)进行处理。

  1. UFFD_EVENT_PAGEFAULT事件(event):page fault流程中判断是否是userfaultfd_missing,如果是的话,执行userfault_msg构造消息,唤醒用户态poll进程并发送消息。

    1
    2
    3
    4
    
    /*
     * Report whether VM_UFFD_MISSING is set in vma->vm_flags.
     *
     * The registration excerpt earlier in this article ORs VM_UFFD_MISSING
     * into vm_flags when UFFDIO_REGISTER_MODE_MISSING is requested, so a
     * true result means the VMA was registered for missing-page faults.
     * The masked flag value is converted to bool by the return type.
     */
    static inline bool userfaultfd_missing(struct vm_area_struct *vma)  
    {  
        return vma->vm_flags & VM_UFFD_MISSING;  
    }  
    
  2. 用户态申请Page,并通过UFFDIO_COPY选项传入内核;内核执行mcopy_atomic,完成补页(mfill_atomic_pte)和用户态page内容的拷贝。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    
    dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);  
    if (unlikely(!dst_pmd)) {  
        err = -ENOMEM;  
        break;  
    }  
         
    dst_pmdval = pmd_read_atomic(dst_pmd);  
    /* 
     * If the dst_pmd is mapped as THP don't 
    	 * override it and just be strict. 
    	 */  
    if (unlikely(pmd_trans_huge(dst_pmdval))) {  
    	err = -EEXIST;  
    	break;  
    }  
    if (unlikely(pmd_none(dst_pmdval)) &&  
    	unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {  
    	err = -ENOMEM;  
    	break;  
    }  
    /* If an huge pmd materialized from under us fail */  
    if (unlikely(pmd_trans_huge(*dst_pmd))) {  
    	err = -EFAULT;  
    	break;  
    }  
         
    BUG_ON(pmd_none(*dst_pmd));  
    BUG_ON(pmd_trans_huge(*dst_pmd));  
         
    err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,  
    			   src_addr, &page, zeropage);  
    
    1
    2
    3
    4
    5
    
    //2)用户态传入的page内容copy
    page_kaddr = kmap(page);  
    err = copy_from_user(page_kaddr,  
              (const void __user *) src_addr,  
    		 PAGE_SIZE);  
    

二、userfaultfd waitQ

https://blog.csdn.net/u012218309/article/details/81148083 linux等待队列 wait_queue的使用

http://gityuan.com/2018/12/02/linux-wait-queue/ 源码解读Linux等待队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Context object for one userfaultfd: it is the ctx handed to
 * anon_inode_getfd() for the "[userfaultfd]" file and stored in
 * vma->vm_userfaultfd_ctx.ctx for every registered VMA.
 *
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not yet read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd, used to wake up poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* refile sequence counter, protected by the fault_pending_wqh lock */
	struct seqcount refile_seq;
	/* reference count of the pseudo fd */
	atomic_t refcount;
	/* flags passed to the userfaultfd syscall */
	unsigned int flags;
	/* features requested by userspace */
	unsigned int features;
	/* current state of the state machine */
	enum userfaultfd_state state;
	/* released flag; NOTE(review): presumably set once the fd is released - confirm */
	bool released;
	/* memory mappings are changing because of a non-cooperative event */
	bool mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};
  1. wait_queue_head_t fault_pending_wqh => pending (i.e. not read) userfaults

    未读取页错误等待队列:线程触发页错误异常以后,在此队列上等待userfaultfd读取页错误事件;

    1)fault_pending_wqh

    1
    2
    
    handle_userfault 
        ->| __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
    

    uwq的结构体内容:

    1
    2
    3
    4
    5
    6
    7
    8
    
    ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
    ...
    init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
    uwq.wq.private = current;
    uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
                            ctx->features);
    uwq.ctx = ctx;
    uwq.waken = false;
    

    2)struct seqcount refile_seq

    顺序计数(seqcount),本身由fault_pending_wqh的锁保护,用来标记等待项在fault_pending_wqh和fault_wqh两个等待队列之间的迁移(refile);

  2. wait_queue_head_t fault_wqh => userfaults

    已读取页错误的等待队列,userfaultfd已读取页错误事件,还没有唤醒触发页错误异常的线程;

  3. wait_queue_head_t fd_wqh => the pseudo fd to wakeup poll/read

    文件描述符等待队列,userfaultfd等待事件发生;

  4. wait_queue_head_t event_wqh => events Q

    事件等待队列,等待userfaultfd读取事件;

三、userfaultfd优化:

1)userfaultfd文档有推荐PROT_NONE + SIGSEGV的方法。优点:1、不需要socket/poll通信;2、不走handle_userfault的流程。缺点:1、SIGSEGV信号被占用;2、产生vma碎片。

PS: 内核高精度时延测量方法: asm volatile("mrs %0, cntvct_el0" : "=r" (ed) :: "memory"); 1、获取ARM的cntvct_el0累加寄存器,计算路径上的时延差值; 2、定时器的时钟源频率通过CNTFRQ_EL0寄存器获取,为50MHz,即每个计数对应20ns;

This post is licensed under CC BY 4.0 by the author.