userfaultfd只支持匿名页、hugetlb和共享内存;
一、软件流程
1 初始化
调用__NR_userfaultfd syscall初始化:该syscall创建匿名inode文件,初始化file->private_data,并向用户态返回文件描述符fd。
-
用户态:
1
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-
内核态:
1 2 3
d = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
2、设置监视区
ioctl的UFFDIO_REGISTER选项注册监视区域;
-
用户态:
1 2 3 4 5 6
uffdio_register.range.start = (unsigned long)start; uffdio_register.range.len = size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_USWAP;; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) errExit("ioctl-UFFDIO_REGISTER");
2.内核态:这里会对被监控区域所在的vma做合并与拆分:先尝试与相邻的已监控区域合并,无法合并时再从非监控区域中拆分出来,并为其新增vm_flags;
1 2 3 4 5
//(1)flag的对应关系: if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { vm_flags |= VM_UFFD_WP;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; //(2)将新增的监控区域与已有的监控区域做合并 prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx })); if (prev) { vma = prev; goto next; } //(3)如果无法合并,可能需要拆分 if (vma->vm_start < start) { ret = split_vma(mm, vma, start, 1); if (ret) break; } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; } next: /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ vma->vm_flags = new_flags; //+USERFAULT_FD_FLAG: VM_UFFD_MISSING vma->vm_userfaultfd_ctx.ctx = ctx;
3、poll event
用户态poll函数轮询uffd,并对轮询到的UFFD_EVENT_PAGEFAULT事件(event)用拷贝(ioctl的UFFDIO_COPY选项)进行处理。
-
UFFD_EVENT_PAGEFAULT事件(event):page fault流程中通过userfaultfd_missing()判断该vma是否设置了VM_UFFD_MISSING,如果是,则进入handle_userfault,用userfault_msg构造消息,并唤醒用户态poll进程把消息发送给它。
1 2 3 4
/*
 * Report whether @vma has the VM_UFFD_MISSING bit set in its vm_flags,
 * i.e. whether it was registered with UFFDIO_REGISTER_MODE_MISSING.
 */
static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
	const bool missing_tracked = vma->vm_flags & VM_UFFD_MISSING;

	return missing_tracked;
}
-
用户态申请page后,通过ioctl的UFFDIO_COPY选项将其传入内核;内核执行mcopy_atomic,由mfill_atomic_pte完成补页,并拷贝用户态page的内容。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmd_read_atomic(dst_pmd); /* * If the dst_pmd is mapped as THP don't * override it and just be strict. */ if (unlikely(pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } /* If an huge pmd materialized from under us fail */ if (unlikely(pmd_trans_huge(*dst_pmd))) { err = -EFAULT; break; } BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, &page, zeropage);
1 2 3 4 5
//2)用户态传入的page内容copy page_kaddr = kmap(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE);
二、userfaultfd waitQ
https://blog.csdn.net/u012218309/article/details/81148083 linux等待队列 wait_queue的使用
http://gityuan.com/2018/12/02/linux-wait-queue/ 源码解读Linux等待队列
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Context attached to a userfaultfd (reached from a vma through
 * vm_userfaultfd_ctx.ctx).
 *
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 */
struct userfaultfd_ctx {
/* waitqueue head for the pending (i.e. not read) userfaults */
wait_queue_head_t fault_pending_wqh;
/* waitqueue head for the userfaults */
wait_queue_head_t fault_wqh;
/* waitqueue head for the pseudo fd to wakeup poll/read */
wait_queue_head_t fd_wqh;
/* waitqueue head for events */
wait_queue_head_t event_wqh;
/* a refile sequence protected by fault_pending_wqh lock */
struct seqcount refile_seq;
/* pseudo fd refcounting */
atomic_t refcount;
/* userfaultfd syscall flags */
unsigned int flags;
/* features requested from the userspace */
unsigned int features;
/* state machine */
enum userfaultfd_state state;
/* released */
bool released;
/* memory mappings are changing because of non-cooperative event */
bool mmap_changing;
/* mm with one or more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
-
wait_queue_head_t fault_pending_wqh => pending (i.e. not read) userfaults
未读取页错误等待队列:线程触发页错误异常以后,在此队列上等待userfaultfd读取页错误事件;
1)fault_pending_wqh
1 2
handle_userfault ->| __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
uwq的结构体内容:
1 2 3 4 5 6 7 8
ctx = vmf->vma->vm_userfaultfd_ctx.ctx; ... init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false;
2)struct seqcount refile_seq
顺序计数(seqcount),本身由fault_pending_wqh的锁保护,用于标记页错误的refile过程(推测是指等待项从fault_pending_wqh转移到fault_wqh,待对照内核源码确认);
-
wait_queue_head_t fault_wqh => userfaults
已读取页错误的等待队列,userfaultfd已读取页错误事件,还没有唤醒触发页错误异常的线程;
-
wait_queue_head_t fd_wqh => the pseudo fd to wakeup poll/read
文件描述符等待队列,userfaultfd等待事件发生;
-
wait_queue_head_t event_wqh => events Q
事件等待队列,等待userfaultfd读取事件;
三、userfaultfd优化:
1)userfaultfd文档有推荐PROT_NONE + SIGSEGV的方法 优点: 1、不需要socket/poll通信; 2、不走handle_userfault的流程。 缺点: 1、SIGSEGV信号占用; 2、vma碎片
PS: 内核高精度时延测量方法: asm volatile("mrs %0, cntvct_el0" : "=r" (ed) :: "memory"); 1、 读取ARM的cntvct_el0计数寄存器,用路径两端读数的差值计算时延; 2、 定时器时钟源的频率通过CNTFRQ_EL0寄存器获取,为50MHz,即每个计数周期20ns;