linux进程地址空间(1) fork/clone/vfork详解(2)

发布时间:2017-3-29 11:14:33
本篇文章接上一篇,继续详解 Linux 进程地址空间中 fork/clone/vfork 的实现,重点分析 dup_mmap、copy_page_range、copy_pte_range 和 copy_one_pte 等内核函数,对进程地址空间复制与写时复制(COW)感兴趣的同学可以参考一下。

接上一篇,dup_mmap函数源码如下: static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) {          struct vm_area_struct *mpnt, *tmp, **pprev;          struct rb_node **rb_link, *rb_parent;          int retval;          unsigned long charge;          struct mempolicy *pol;          down_write(&oldmm->mmap_sem);          flush_cache_dup_mm(oldmm);          /*           * Not linked in yet - no deadlock potential:           */          down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);          mm->locked_vm = 0;          mm->mmap = NULL;          mm->mmap_cache = NULL;          mm->free_area_cache = oldmm->mmap_base;          mm->cached_hole_size = ~0UL;          mm->map_count = 0;          cpumask_clear(mm_cpumask(mm));          mm->mm_rb = RB_ROOT;          rb_link = &mm->mm_rb.rb_node;          rb_parent = NULL;          pprev = &mm->mmap;          retval = ksm_fork(mm, oldmm);          if (retval)                    goto out;     /*遍历父进程的每个vma,准备复制*/          for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {                    struct file *file;         /*不让拷贝的vma,跳过*/                    if (mpnt->vm_flags & VM_DONTCOPY) {                             long pages = vma_pages(mpnt);                             mm->total_vm -= pages;                             vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,                                                                            -pages);                             continue;                    }                    charge = 0;                    if (mpnt->vm_flags & VM_ACCOUNT) {                             unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;                             if (security_vm_enough_memory(len))                                      goto fail_nomem;                             charge = len;                    }              /*注意,这里从slab创建vma,这是给新进程用的*/                    tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);                    if (!tmp)              
               goto fail_nomem;         /*把父进程的vma内容拷贝到这个vma*/                    *tmp = *mpnt;                    pol = mpol_dup(vma_policy(mpnt));                    retval = PTR_ERR(pol);                    if (IS_ERR(pol))                             goto fail_nomem_policy;                    vma_set_policy(tmp, pol);         /*不锁定*/                    tmp->vm_flags &= ~VM_LOCKED;         /*回指自己(新进程)的mm*/                    tmp->vm_mm = mm;                    tmp->vm_next = NULL;                    anon_vma_link(tmp);         /*看看这个vma是不是文件映射的*/                    file = tmp->vm_file;                    if (file) {                             struct inode *inode = file->f_path.dentry->d_inode;                             struct address_space *mapping = file->f_mapping;                             get_file(file);                             if (tmp->vm_flags & VM_DENYWRITE)                                      atomic_dec(&inode->i_writecount);                             spin_lock(&mapping->i_mmap_lock);                             if (tmp->vm_flags & VM_SHARED)                                      mapping->i_mmap_writable++;                             tmp->vm_truncate_count = mpnt->vm_truncate_count;                             flush_dcache_mmap_lock(mapping);                             /* insert tmp into the share list, just after mpnt */                             vma_prio_tree_add(tmp, mpnt);                             flush_dcache_mmap_unlock(mapping);                             spin_unlock(&mapping->i_mmap_lock);                    }                    /*                     * Clear hugetlb-related page reserves for children. This only                     * affects MAP_PRIVATE mappings. 
Faults generated by the child                     * are not guaranteed to succeed, even if read-only                     */                    if (is_vm_hugetlb_page(tmp))                             reset_vma_resv_huge_pages(tmp);                    /*                     * Link in the new vma and copy the page table entries.                     */                    /*更新新进程的vma链表节点指针为这个tmp的vma,并且指定好pprev准备下一个节点*/                    *pprev = tmp;                    pprev = &tmp->vm_next;         /*插入红黑树*/                    __vma_link_rb(mm, tmp, rb_link, rb_parent);                    rb_link = &tmp->vm_rb.rb_right;                    rb_parent = &tmp->vm_rb;         /*更新新进程的mm的vma个数*/                    mm->map_count++;         /*参数分别为: 子进程的mm、父进程的mm、父进程的某个vma           复制mpnt地址空间部分的页表项到新进程的mm*/                    retval = copy_page_range(mm, oldmm, mpnt);                    if (tmp->vm_ops && tmp->vm_ops->open)                             tmp->vm_ops->open(tmp);                    if (retval)                             goto out;          }          /* a new mm has just been created */          arch_dup_mmap(oldmm, mm);          retval = 0; out:          up_write(&mm->mmap_sem);          flush_tlb_mm(oldmm);          up_write(&oldmm->mmap_sem);          return retval; fail_nomem_policy:          kmem_cache_free(vm_area_cachep, tmp); fail_nomem:          retval = -ENOMEM;          vm_unacct_memory(charge);          goto out; } 对于内核源码的很多部分,一时间可能很难完全搞清搞透传,但重在理解它的意图和把握一个脉络,很多结构体成员众多,短时间内很难全部搞清,重点不要放在这里; 这里多数加了注释的就是我目前感觉比较有用的,或者说能看懂的,它们也比较清晰,for循环拷贝了父进程的所有vma给子进程,子进程又做了相应初始化,现在重点看下函数copy_page_range,它的功能是把父进程的页映射关系也复制给子进程,这里涉及了内存页表的知识,不熟悉的或忘了的可从这篇文章往前找到arm-linux内存页表创建(,里边描述的比较清楚,也可以直接看下边copy_page_range: int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,                    struct vm_area_struct *vma) {          pgd_t *src_pgd, *dst_pgd;          unsigned long next;     /*注意参数vma是父进程的vma,所以这里的addr和end也是这个父进程的vma的首尾地址*/          unsigned long addr = 
vma->vm_start;          unsigned long end = vma->vm_end;          int ret;          /*           * Don't copy ptes where a page fault will fill them correctly.           * Fork becomes much lighter when there are big shared or private           * readonly mappings. The tradeoff is that copy_page_range is more           * efficient than faulting.           */          if (!(vma->vm_flags &          (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {                    if (!vma->anon_vma)                             return 0;          }          if (is_vm_hugetlb_page(vma))                    return copy_hugetlb_page_range(dst_mm, src_mm, vma);          if (unlikely(is_pfn_mapping(vma))) {                    /*                     * We do not free on error cases below as remove_vma                     * gets called on error from higher level routine                     */                    ret = track_pfn_vma_copy(vma);                    if (ret)                             return ret;          }          /*           * We need to invalidate the secondary MMU mappings only when           * there could be a permission downgrade on the ptes of the           * parent mm. And a permission downgrade will only happen if           * is_cow_mapping() returns true.           
*/          if (is_cow_mapping(vma->vm_flags))         /*空函数*/                    mmu_notifier_invalidate_range_start(src_mm, addr, end);          ret = 0;     /*先后是子进程、父进程的mm的页表,注意父子进程的pgd是不同的(即mm->pgd)*/          dst_pgd = pgd_offset(dst_mm, addr);          src_pgd = pgd_offset(src_mm, addr);     /*循环次数未知,得看addr和end相差多少个2MB*/          do {         /*对于arm,2MB为单位,一段一段来*/                    next = pgd_addr_end(addr, end);         /*对于arm,底下的if默认为0不会进入*/                    if (pgd_none_or_clear_bad(src_pgd))                             continue;                    if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,                                                    vma, addr, next))) {                             ret = -ENOMEM;                             break;                    }          } while (dst_pgd++, src_pgd++, addr = next, addr != end);            if (is_cow_mapping(vma->vm_flags))         /*空函数*/                    mmu_notifier_invalidate_range_end(src_mm,                                                           vma->vm_start, end);          return ret; } 这个函数的重点首先看两个变量addr和end,分别是这个父进程的这个vma线性区的起始和结尾虚拟地址,后面就是把这个区间的虚拟物理映射复制到子进程的vma线性区,先看下边这个: /*先后是子进程、父进程的mm的页表,注意父子进程的pgd是不同的(即mm->pgd)*/          dst_pgd = pgd_offset(dst_mm, addr);          src_pgd = pgd_offset(src_mm, addr); 如上面注释所说,是父子各自mm的pgd成员,即各自进程的一级页表,然后是下面的循环: /*循环次数未知,得看addr和end相差多少个2MB*/ do {         /*对于arm,2MB为单位,一段一段来*/                    next = pgd_addr_end(addr, end);         /*对于arm,底下的if默认为0不会进入*/                    if (pgd_none_or_clear_bad(src_pgd))                             continue;                    if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,                                                    vma, addr, next))) {                             ret = -ENOMEM;                             break;                    }          } while (dst_pgd++, src_pgd++, addr = next, addr != end); 
熟悉内存页表的肯定理解这是在干什么,这就是从addr到end这个区间,以2MB为单位不断调用函数copy_pud_range,所以说循环次数未知;对于函数copy_pud_range,它就是实际的拷贝映射关系,linux四级映射在arm上结合为两级映射,所以接下来调用的函数copy_pud_range和copy_pmd_range实际上相当于重复执行一遍,对结果没有影响,直到函数copy_pte_range,这时函数copy_pte_range的参数依然是父子mm、父子pgd、父vma、vma的首尾地址,源码如下: static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,                    pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,                    unsigned long addr, unsigned long end) {          pte_t *orig_src_pte, *orig_dst_pte;          pte_t *src_pte, *dst_pte;          spinlock_t *src_ptl, *dst_ptl;          int progress = 0;          int rss[2]; again:          rss[1] = rss[0] = 0;     /*这就是给子进程的创建二级页表,再次证明一级页表常驻内存,二级页表要靠分配*/          dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);          if (!dst_pte)                    return -ENOMEM;     /*得出对于当前的虚拟地址,父进程的二级页表条目*/          src_pte = pte_offset_map_nested(src_pmd, addr);          src_ptl = pte_lockptr(src_mm, src_pmd);          spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);          orig_src_pte = src_pte;          orig_dst_pte = dst_pte;          arch_enter_lazy_mmu_mode();     /*以addr与end差值为2MB的正常情况下,将循环2MB/4KB=512次,       每次copy_one_pte将把父进程的二级页表映射内容拷贝给子进程二级页表条目,对应复制4KB空间*/          do {                    /*                     * We are holding two locks at this point - either of them                     * could generate latencies in another task on another CPU.                     
*/                    if (progress >= 32) {                             progress = 0;                             if (need_resched() ||                                 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))                                      break;                    }         /*父进程二级页表映射内容不存在时,进行下一次循环*/                    if (pte_none(*src_pte)) {                             progress++;                             continue;                    }                    copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);                    progress += 8;          } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);          arch_leave_lazy_mmu_mode();          spin_unlock(src_ptl);          pte_unmap_nested(orig_src_pte);          add_mm_rss(dst_mm, rss[0], rss[1]);          pte_unmap_unlock(orig_dst_pte, dst_ptl);          cond_resched();          if (addr != end)                    goto again;          return 0; } 首先看如下片段: /*这就是给子进程的创建二级页表,再次证明一级页表常驻内存,二级页表要靠分配*/          dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 这就是在给子进程创建一个二级页表,然后看下面的片段; /*得出对于当前的虚拟地址,父进程的二级页表条目*/          src_pte = pte_offset_map_nested(src_pmd, addr); 这是给后面的把父进程的二级页表条目加上写保护属性做准备,先把父进程的二级页表条目获取到,在下一步将增加写保护的属性值;接下来是循环调用函数copy_one_pte,之所以会序号512次,是因为从上面调用到这里,addr和end都是2MB的间隔,这里每次调用copy_one_pte写一个二级页表条目,对应4KB,所以需要调用512次,函数copy_one_pte源码如下: static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,                    pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,                    unsigned long addr, int *rss) {          unsigned long vm_flags = vma->vm_flags;          pte_t pte = *src_pte;          struct page *page;          /* pte contains position in swap or file, so copy. 
*/ /*(!pte_present(pte))为1,说明父进程的这个二级页表映射的内容,不在物理内存*/          if (unlikely(!pte_present(pte))) {  /*如果是因为被交换到磁盘(外存),那么把old_pte在swap file中的入口地址,将old_pte复制到内存中*/                    if (!pte_file(pte)) {                             swp_entry_t entry = pte_to_swp_entry(pte);                             swap_duplicate(entry);                             /* make sure dst_mm is on swapoff's mmlist. */                             if (unlikely(list_empty(&dst_mm->mmlist))) {                                      spin_lock(&mmlist_lock);                                      if (list_empty(&dst_mm->mmlist))                                                list_add(&dst_mm->mmlist,                                                          &src_mm->mmlist);                                      spin_unlock(&mmlist_lock);                             }                             if (is_write_migration_entry(entry) &&                                                is_cow_mapping(vm_flags)) {                                      /*                                       * COW mappings require pages in both parent                                       * and child to be set to read.                                       
*/                                      make_migration_entry_read(&entry);                                      pte = swp_entry_to_pte(entry);                                      set_pte_at(src_mm, addr, src_pte, pte);                             }                    }                    goto out_set_pte;          }          /*           * If it's a COW mapping, write protect it both           * in the parent and the child           */          /*这一步很重要,这是在fork时,当子进程拷贝父进程的页表时,将这一页置为写保护,导致父子任何一方再要改动这页内容时不能写入,将触发COW*/          if (is_cow_mapping(vm_flags)) {         /*父进程的二级页表的该页条目设置为写保护*/                    ptep_set_wrprotect(src_mm, addr, src_pte);         /*子进程的二级页表的该页条目也设置为写保护*/                    pte = pte_wrprotect(pte);          }          /*           * If it's a shared mapping, mark it clean in           * the child           */          if (vm_flags & VM_SHARED)                    pte = pte_mkclean(pte);          pte = pte_mkold(pte);     /*根据二级页表映射的内容,找出是哪一物理页,并返回其页描述符,       如果是零页(zero_pfn)返回NULL*/          page = vm_normal_page(vma, addr, pte);     /*找到该物理页描述符的目的是,让其成员_count和_mapcount均加1,意即该页的使用进程个数*/          if (page) {                    get_page(page);                    page_dup_rmap(page);                    rss[PageAnon(page)]++;          } out_set_pte:          set_pte_at(dst_mm, addr, dst_pte, pte); } 这个函数直到最后才调用set_pte_at实际的写子进程的二级页表条目,前面主要完成以下功能: 1、  置该物理页对于父子进程的二级页表条目的属性均为写保护: /*这一步很重要,这是在fork时,当子进程拷贝父进程的页表时,  将这一页置为写保护,导致父子任何一方再要改动这页内容时不能写入,将触发COW*/ if (is_cow_mapping(vm_flags)) {         /*父进程的二级页表的该页条目设置为写保护*/            ptep_set_wrprotect(src_mm, addr, src_pte);         /*子进程的二级页表的该页条目也设置为写保护*/            pte = pte_wrprotect(pte); } 2、  更新该页的页描述符的一些成员: /*根据二级页表映射的内容,找出是哪一物理页,并返回其页描述符,       如果是零页(zero_pfn)返回NULL*/ page = vm_normal_page(vma, addr, pte);     /*找到该物理页描述符的目的是,让其成员_count和_mapcount均加1,意即该页的使用进程个数*/ if (page) {            get_page(page);            page_dup_rmap(page);            rss[PageAnon(page)]++; } 
这里还需要看下函数vm_normal_page,它的功能是:根据二级页表映射的内容,找出是哪一物理页,并返回其页描述符,如果是零页(zero_pfn)返回NULL;也许会奇怪为什么还有可能是零页,后面就会发现原因和应用场合;          3、写子进程的二级页表:set_pte_at(dst_mm, addr, dst_pte, pte);   上面就是在函数dup_mmap中,for循环调用函数copy_page_range,把父进程的所有vma内的物理页映射都拷贝给子进程的所有vma中的全过程,回到函数dup_mmap,还应注意些小细节,诸如子进程的mm的插入红黑树和vma双向链表、更新vma个数(mm的map_count成员)等等,有了印象后对后面其他内容的熟悉有好处,更有助于全面理解进程地址空间。 细心的你会发现,父子进程的每个vma的起始结尾虚拟地址值,都是一样的,同时,这些虚拟地址空间对应的物理地址也都是一样的;只是,父子任何一方试图写这些物理页时,MMU会阻止写操作,就会触发写时复制COW的缺页异常,因为这些物理页是写保护的。 所以,有没有CLONE_VM标志的fork/vfork/clone的区别在于,有CLONE_VM标志的情况下,子进程没有自己的mm也没有自己的vma(直接共享父进程的),而没有CLONE_VM标志的情况下,子进程是有自己的mm和vma的;但无论哪种情况,子进程的虚拟地址空间最初都没有额外分配物理页,而是与父进程映射到相同的物理页,这样做的好处是节省了物理内存,其实就如ULK所讲,在运行一段时间后(经过若干次COW),父子进程就会有完全不一样的地址空间了。

上一篇:oracle 闪回
下一篇:Reverse Integer