博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Linux内核源代码情景分析-fork()
阅读量:6403 次
发布时间:2019-06-23

本文共 22478 字,大约阅读时间需要 74 分钟。

    父进程fork子进程:

    child = fork()

    fork经过系统调用。来到了sys_fork。具体过程请參考。

/*
 * fork() system-call entry point.
 *
 * Hands the caller's saved register frame to do_fork() with SIGCHLD as
 * the only flag value, the caller's own user stack pointer (regs.esp)
 * as the child's stack, and a stack size of 0.
 *
 * Returns whatever do_fork() returns.
 */
asmlinkage int sys_fork(struct pt_regs regs)
{
	unsigned long user_sp = regs.esp;

	return do_fork(SIGCHLD, user_sp, &regs, 0);
}
int do_fork(unsigned long clone_flags, unsigned long stack_start, //stack_start为用户空间堆栈指针	    struct pt_regs *regs, unsigned long stack_size){	int retval = -ENOMEM;	struct task_struct *p;	DECLARE_MUTEX_LOCKED(sem);	if (clone_flags & CLONE_PID) {		/* This is only allowed from the boot up thread */		if (current->pid)			return -EPERM;	}		current->vfork_sem = &sem;//假设clone_flags中CLONE_VFORK位置1,这个信号量用于up(&sem)。使父进程唤醒	p = alloc_task_struct();//为子进程分配两个连续的物理页面,低端用作子进程的task_struct结构,高端则用作其系统空间堆栈	if (!p)		goto fork_out;	*p = *current;//父进程的整个task_struct就被拷贝到了子进程的数据结构	retval = -EAGAIN;	if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)		goto bad_fork_free;	atomic_inc(&p->user->__count);	atomic_inc(&p->user->processes);	/*	 * Counter increases are protected by	 * the kernel lock so nr_threads can't	 * increase under us (but it may decrease).	 */	if (nr_threads >= max_threads)		goto bad_fork_cleanup_count;		get_exec_domain(p->exec_domain);	if (p->binfmt && p->binfmt->module)		__MOD_INC_USE_COUNT(p->binfmt->module);	p->did_exec = 0;	p->swappable = 0;	p->state = TASK_UNINTERRUPTIBLE;//不可中断等待状态	copy_flags(clone_flags, p);//将參数clone_flags中的标志位略加补充和变换,然后写入p->flags	p->pid = get_pid(clone_flags);//获取进程pid	p->run_list.next = NULL;	p->run_list.prev = NULL;	if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {		p->p_opptr = current;		if (!(p->ptrace & PT_PTRACED))			p->p_pptr = current;	}	p->p_cptr = NULL;	init_waitqueue_head(&p->wait_chldexit);	p->vfork_sem = NULL;	spin_lock_init(&p->alloc_lock);	p->sigpending = 0;	init_sigpending(&p->pending);	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;	init_timer(&p->real_timer);	p->real_timer.data = (unsigned long) p;	p->leader = 0;		/* session leadership doesn't inherit */	p->tty_old_pgrp = 0;	p->times.tms_utime = p->times.tms_stime = 0;	p->times.tms_cutime = p->times.tms_cstime = 0;#ifdef CONFIG_SMP	{		int i;		p->has_cpu = 0;		p->processor = 
current->processor;		/* ??

should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; spin_lock_init(&p->sigmask_lock); } #endif p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p))//有条件地复制已打开文件的控制结构files_struct,这样的复制仅仅有在clone_flags中CLONE_FILES标志位为0时才真正进行。否则就仅仅是共享父进程的指针 goto bad_fork_cleanup; if (copy_fs(clone_flags, p))//有条件地拷贝文件系统相关结构files_structfs_struct,这样的复制仅仅有在clone_flags中CLONE_FS标志位为0时才真正进行。否则就仅仅是共享父进程的指针 goto bad_fork_cleanup_files; if (copy_sighand(clone_flags, p))//有条件地复制信号处理相关结构signal_struct。这样的复制仅仅有在clone_flags中CLONE_SIGHAND标志位为0时才真正进行。否则就仅仅是共享父进程的指针 goto bad_fork_cleanup_fs; if (copy_mm(clone_flags, p))//有条件地复制内存管理相关结构mm_struct及其下属的vm_area_struct,这样的复制仅仅有在clone_flags中CLONE_VM标志位为0时才真正进行。否则就仅仅是共享父进程的指针 goto bad_fork_cleanup_sighand; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);//实际上却仅仅是复制父进程的系统空间堆栈 if (retval) goto bad_fork_cleanup_sighand; p->semundo = NULL; /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ p->parent_exec_id = p->self_exec_id; /* ok, now we should be set up.. */ p->swappable = 1; p->exit_signal = clone_flags & CSIGNAL;//本进程运行exit()时应向父进程发出的信号,CSIGNAL p->pdeath_signal = 0; /* * "share" dynamic priority between parent and child, thus the * total amount of dynamic priorities in the system doesnt change, * more scheduling fairness. This is only important in the first * timeslice, on the long run the scheduling behaviour is unchanged. */ p->counter = (current->counter + 1) >> 1; current->counter >>= 1;//task_struct结构中counter字段的值就是进程的运行时间配额,这里将父进程的时间配额分成两半,让父、子进程各有原值的一半。 if (!current->counter) current->need_resched = 1; /* * Ok, add it to the run-queues and make it * visible to the rest of the system. * * Let it rip! 
*/ retval = p->pid; p->tgid = retval; INIT_LIST_HEAD(&p->thread_group); write_lock_irq(&tasklist_lock); if (clone_flags & CLONE_THREAD) { p->tgid = current->tgid; list_add(&p->thread_group, ¤t->thread_group); } SET_LINKS(p);//将子进程的task_struct结构链入内核的进程队列 hash_pid(p);//将其链入按其pid计算得的杂凑队列 nr_threads++;//进程数加1 write_unlock_irq(&tasklist_lock); if (p->ptrace & PT_PTRACED) send_sig(SIGSTOP, p, 1); wake_up_process(p); //将子进程"唤醒",也就是将其挂入可运行进程队列等待调用 ++total_forks; fork_out: if ((clone_flags & CLONE_VFORK) && (retval > 0))//假设clone_flags中CLONE_VFORK位置1 down(&sem);//让父进程在一个信号量上运行一次down()操作。以达到扣留父进程的目的 return retval;//返回p->pid,也就是子进程的pid bad_fork_cleanup_sighand: exit_sighand(p); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup: put_exec_domain(p->exec_domain); if (p->binfmt && p->binfmt->module) __MOD_DEC_USE_COUNT(p->binfmt->module); bad_fork_cleanup_count: atomic_dec(&p->user->processes); free_uid(p->user); bad_fork_free: free_task_struct(p); goto fork_out; }

    其中regs是指向父进程系统空间堆栈中pt_regs结构的指针,stack_start为用户空间堆栈指针。

    alloc_task_struct为子进程分配两个连续的物理页面,低端用作子进程的task_struct结构,高端则用作其系统空间堆栈,代码例如以下:

/*
 * Allocate the memory for a new task: an order-1 request to
 * __get_free_pages(), with the result cast to a task_struct pointer.
 */
#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1))

    copy_flags,将參数clone_flags中的标志位略加补充和变换,然后写入p->flags。

/*
 * Compute the child's p->flags from the inherited value and clone_flags.
 *
 * PF_SUPERPRIV, PF_USEDFPU and PF_VFORK are dropped, PF_FORKNOEXEC is
 * always set, and PF_VFORK is set again only when CLONE_VFORK was
 * requested.  Unless CLONE_PTRACE is given, p->ptrace is reset to 0.
 */
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long inherited = p->flags & ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
	unsigned long child_flags = inherited | PF_FORKNOEXEC;

	if (clone_flags & CLONE_VFORK)
		child_flags |= PF_VFORK;

	if (!(clone_flags & CLONE_PTRACE))
		p->ptrace = 0;

	p->flags = child_flags;
}

    对于fork来说,clone_flags仅为SIGCHLD,不含任何CLONE_*共享标志,因此copy_files、copy_fs、copy_sighand、copy_mm都要真正进行复制。

    copy_files。代码例如以下:

static int copy_files(unsigned long clone_flags, struct task_struct * tsk){	struct files_struct *oldf, *newf;	struct file **old_fds, **new_fds;	int open_files, nfds, size, i, error = 0;	/*	 * A background process may not have any files ...	 */	oldf = current->files;	if (!oldf)		goto out;	if (clone_flags & CLONE_FILES) {//clone_flags中CLONE_FILES标志位为1		atomic_inc(&oldf->count);//仅仅是添加计数		goto out;	}	tsk->files = NULL;	error = -ENOMEM;	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);	if (!newf) 		goto out;	atomic_set(&newf->count, 1);	newf->file_lock	    = RW_LOCK_UNLOCKED;	newf->next_fd	    = 0;	newf->max_fds	    = NR_OPEN_DEFAULT;	newf->max_fdset	    = __FD_SETSIZE;	newf->close_on_exec = &newf->close_on_exec_init;	newf->open_fds	    = &newf->open_fds_init;	newf->fd	    = &newf->fd_array[0];	/* We don't yet have the oldf readlock, but even if the old           fdset gets grown now, we'll only copy up to "size" fds */	size = oldf->max_fdset;	if (size > __FD_SETSIZE) {		newf->max_fdset = 0;		write_lock(&newf->file_lock);		error = expand_fdset(newf, size);		write_unlock(&newf->file_lock);		if (error)			goto out_release;	}	read_lock(&oldf->file_lock);	open_files = count_open_files(oldf, size);	/*	 * Check whether we need to allocate a larger fd array.	 * Note: we're not a clone task, so the open count won't	 * change.	 
*/	nfds = NR_OPEN_DEFAULT;	if (open_files > nfds) {		read_unlock(&oldf->file_lock);		newf->max_fds = 0;		write_lock(&newf->file_lock);		error = expand_fd_array(newf, open_files);		write_unlock(&newf->file_lock);		if (error) 			goto out_release;		nfds = newf->max_fds;		read_lock(&oldf->file_lock);	}	old_fds = oldf->fd;	new_fds = newf->fd;	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);	for (i = open_files; i != 0; i--) {		struct file *f = *old_fds++;		if (f)			get_file(f);		*new_fds++ = f;	}	read_unlock(&oldf->file_lock);	/* compute the remainder to be cleared */	size = (newf->max_fds - open_files) * sizeof(struct file *);	/* This is long word aligned thus could use a optimized version */ 	memset(new_fds, 0, size); 	if (newf->max_fdset > open_files) {		int left = (newf->max_fdset-open_files)/8;		int start = open_files / (8 * sizeof(unsigned long));				memset(&newf->open_fds->fds_bits[start], 0, left);		memset(&newf->close_on_exec->fds_bits[start], 0, left);	}	tsk->files = newf;	error = 0;out:	return error;out_release:	free_fdset (newf->close_on_exec, newf->max_fdset);	free_fdset (newf->open_fds, newf->max_fdset);	kmem_cache_free(files_cachep, newf);	goto out;}

    待我们学习了文件系统后再细致分析。

    copy_fs。代码例如以下:

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk){	if (clone_flags & CLONE_FS) {//clone_flags中CLONE_FS标志位为1		atomic_inc(current->fs->count);//仅仅是添加计数		return 0;	}	tsk->fs = __copy_fs_struct(current->fs);	if (!tsk->fs)		return -1;	return 0;}
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old){	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);	/* We don't need to lock fs - think why ;-) */	if (fs) {		atomic_set(&fs->count, 1);		fs->lock = RW_LOCK_UNLOCKED;		fs->umask = old->umask;		read_lock(&old->lock);		fs->rootmnt = mntget(old->rootmnt);		fs->root = dget(old->root);		fs->pwdmnt = mntget(old->pwdmnt);		fs->pwd = dget(old->pwd);		if (old->altroot) {			fs->altrootmnt = mntget(old->altrootmnt);			fs->altroot = dget(old->altroot);		} else {			fs->altrootmnt = NULL;			fs->altroot = NULL;		}			read_unlock(&old->lock);	}	return fs;}
    我们看到,在这里要复制的是fs_struct数据结构,而不复制更深层的数据结构。对于更深层的数据结构,只是通过mntget()和dget()递增相应数据结构中的共享计数。

    copy_sighand。代码例如以下:

static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk){	struct signal_struct *sig;	if (clone_flags & CLONE_SIGHAND) {//假设clone_flags中CLONE_SIGHAND标志位为1		atomic_inc(current->sig->count);//添加计数		return 0;	}	sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);	tsk->sig = sig;	if (!sig)		return -1;	spin_lock_init(&sig->siglock);	atomic_set(&sig->count, 1);	memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));	return 0;}
/*
 * Per-task signal-handler table.  copy_sighand() either shares it
 * between tasks (CLONE_SIGHAND) or allocates a private copy.
 */
struct signal_struct {
	atomic_t		count;		/* reference count: set to 1 on creation, raised when shared */
	struct k_sigaction	action[_NSIG];	/* handler table, _NSIG entries; copied as a whole on fork */
	spinlock_t		siglock;	/* spinlock, initialised by copy_sighand() */
};

    
copy_mm,代码例如以下:

static int copy_mm(unsigned long clone_flags, struct task_struct * tsk){	struct mm_struct * mm, *oldmm;	int retval;	tsk->min_flt = tsk->maj_flt = 0;	tsk->cmin_flt = tsk->cmaj_flt = 0;	tsk->nswap = tsk->cnswap = 0;	tsk->mm = NULL;	tsk->active_mm = NULL;	/*	 * Are we cloning a kernel thread?	 *	 * We need to steal a active VM for that..	 */	oldmm = current->mm;	if (!oldmm)//假设是内核线程,那么oldmm为null,直接返回		return 0;	if (clone_flags & CLONE_VM) {//假设clone_flags中CLONE_VM标志位为1		atomic_inc(&oldmm->mm_users);//添加mm_users计数		mm = oldmm;		goto good_mm;	}	retval = -ENOMEM;//clone_flags中CLONE_VM标志位为0	mm = allocate_mm();//分配mm_struct	if (!mm)		goto fail_nomem;	/* Copy the current MM stuff.. */	memcpy(mm, oldmm, sizeof(*mm));	if (!mm_init(mm))//初始化mm_struct		goto fail_nomem;	down(&oldmm->mmap_sem);	retval = dup_mmap(mm);//vm_area_struct数据结构和页面映射表的复制	up(&oldmm->mmap_sem);	/*	 * Add it to the mmlist after the parent.	 *	 * Doing it this way means that we can order	 * the list, and fork() won't mess up the	 * ordering significantly.	 */	spin_lock(&mmlist_lock);	list_add(&mm->mmlist, &oldmm->mmlist);	spin_unlock(&mmlist_lock);	if (retval)		goto free_pt;	/*	 * child gets a private LDT (if there was an LDT in the parent)	 */	copy_segments(tsk, mm);//对ldt来说。我们不关心	if (init_new_context(tsk,mm))//空语句		goto free_pt;good_mm:	tsk->mm = mm;	tsk->active_mm = mm;	return 0;free_pt:	mmput(mm);fail_nomem:	return retval;}
    显然,对mm_struct的复制也仅仅是在clone_flags中CLONE_VM标志位为0时才真正进行,否则就仅仅是通过已经复制的指针共享父进程的用户空间。

对mm_struct的复制就不仅仅是局限于这个数据结构本身了,也包含了对更深层数据结构的复制。

当中最重要的是vm_area_struct数据结构和页面映射表的复制,这是由dup_mmap()复制的。

    allocate_mm,分配mm_struct。代码例如以下:

/* Allocate one object from the mm_cachep cache; copy_mm() uses the result as a struct mm_struct. */
#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))

    mm_init。初始化mm_struct。代码例如以下:

static struct mm_struct * mm_init(struct mm_struct * mm){	atomic_set(&mm->mm_users, 1);	atomic_set(&mm->mm_count, 1);	init_MUTEX(&mm->mmap_sem);	mm->page_table_lock = SPIN_LOCK_UNLOCKED;	mm->pgd = pgd_alloc();//指向新分配的页文件夹表	if (mm->pgd)		return mm;	free_mm(mm);	return NULL;}

   dup_mmap是vm_area_struct数据结构和页面映射表的复制。代码例如以下:

static inline int dup_mmap(struct mm_struct * mm){	struct vm_area_struct * mpnt, *tmp, **pprev;	int retval;	flush_cache_mm(current->mm);	mm->locked_vm = 0;	mm->mmap = NULL;	mm->mmap_avl = NULL;	mm->mmap_cache = NULL;	mm->map_count = 0;	mm->cpu_vm_mask = 0;	mm->swap_cnt = 0;	mm->swap_address = 0;	pprev = &mm->mmap;	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {//对于父进程的全部虚拟空间进行轮询		struct file *file;		retval = -ENOMEM;		if(mpnt->vm_flags & VM_DONTCOPY)			continue;		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);//分配子进程的vm_struct		if (!tmp)			goto fail_nomem;		*tmp = *mpnt;//父进程的vm_struct复制给子进程vm_struct		tmp->vm_flags &= ~VM_LOCKED;		tmp->vm_mm = mm;		mm->map_count++;//虚拟空间数加1		tmp->vm_next = NULL;		file = tmp->vm_file;		if (file) {//假设为null			struct inode *inode = file->f_dentry->d_inode;			get_file(file);			if (tmp->vm_flags & VM_DENYWRITE)				atomic_dec(&inode->i_writecount);      			/* insert tmp into the share list, just after mpnt */			spin_lock(&inode->i_mapping->i_shared_lock);			if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)				mpnt->vm_next_share->vm_pprev_share =					&tmp->vm_next_share;			mpnt->vm_next_share = tmp;			tmp->vm_pprev_share = &mpnt->vm_next_share;			spin_unlock(&inode->i_mapping->i_shared_lock);		}		/* Copy the pages, but defer checking for errors */		retval = copy_page_range(mm, current->mm, tmp);//复制虚拟空间对应的页文件夹表项和页表项		if (!retval && tmp->vm_ops && tmp->vm_ops->open)			tmp->vm_ops->open(tmp);		/*		 * Link in the new vma even if an error occurred,		 * so that exit_mmap() can clean up the mess.		 */		*pprev = tmp;//下一个虚拟空间		pprev = &tmp->vm_next;		if (retval)			goto fail_nomem;	}	retval = 0;	if (mm->map_count >= AVL_MIN_MAP_COUNT)//当虚拟空间数大于AVL_MIN_MAP_COUNT		build_mmap_avl(mm);//形成avl树,方便查找fail_nomem:	flush_tlb_mm(current->mm);	return retval;}

    copy_page_range。代码例如以下:

int copy_page_range(struct mm_struct *dst, struct mm_struct *src,			struct vm_area_struct *vma){	pgd_t * src_pgd, * dst_pgd;	unsigned long address = vma->vm_start;	unsigned long end = vma->vm_end;	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;//可写,而又不是共享	src_pgd = pgd_offset(src, address)-1;	dst_pgd = pgd_offset(dst, address)-1;		for (;;) {		pmd_t * src_pmd, * dst_pmd;		src_pgd++; dst_pgd++;				/* copy_pmd_range */				if (pgd_none(*src_pgd))			goto skip_copy_pmd_range;		if (pgd_bad(*src_pgd)) {			pgd_ERROR(*src_pgd);			pgd_clear(src_pgd);skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;			if (!address || (address >= end))				goto out;			continue;		}		if (pgd_none(*dst_pgd)) {			if (!pmd_alloc(dst_pgd, 0))				goto nomem;		}				src_pmd = pmd_offset(src_pgd, address);		dst_pmd = pmd_offset(dst_pgd, address);		do {			pte_t * src_pte, * dst_pte;					/* copy_pte_range */					if (pmd_none(*src_pmd))				goto skip_copy_pte_range;			if (pmd_bad(*src_pmd)) {				pmd_ERROR(*src_pmd);				pmd_clear(src_pmd);skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;				if (address >= end)					goto out;				goto cont_copy_pmd_range;			}			if (pmd_none(*dst_pmd)) {				if (!pte_alloc(dst_pmd, 0))					goto nomem;			}						src_pte = pte_offset(src_pmd, address);			dst_pte = pte_offset(dst_pmd, address);						do {				pte_t pte = *src_pte;				struct page *ptepage;								/* copy_one_pte */				if (pte_none(pte)) //第一种情况					goto cont_copy_pte_range_noset;				if (!pte_present(pte)) { //另外一种情况					swap_duplicate(pte_to_swp_entry(pte));					goto cont_copy_pte_range;				}				ptepage = pte_page(pte);//得到页表项所指的页面				if ((!VALID_PAGE(ptepage)) ||  //第三种情况				    PageReserved(ptepage))					goto cont_copy_pte_range;				/* If it's a COW mapping, write protect it both in the parent and the child */				if (cow) {//第四种情况					ptep_set_wrprotect(src_pte);//改成仅仅读					pte = *src_pte;				}				/* If it's a shared mapping, mark it clean in the child */		
		if (vma->vm_flags & VM_SHARED)					pte = pte_mkclean(pte);				pte = pte_mkold(pte);				get_page(ptepage);//添加页面使用计数                                //cow为0时,仅仅读页面。第五种情况cont_copy_pte_range:		set_pte(dst_pte, pte);//将此表项拷贝到子进程的页表项cont_copy_pte_range_noset:	address += PAGE_SIZE;				if (address >= end)					goto out;				src_pte++;				dst_pte++;			} while ((unsigned long)src_pte & PTE_TABLE_MASK);		cont_copy_pmd_range:	src_pmd++;			dst_pmd++;		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);	}out:	return 0;nomem:	return -ENOMEM;}
    最外层是对页目录表项的循环,中间是对中间目录项的循环,最内层是对页表项的循环。我们把注意力放在最内层循环,也就是对页表项的循环上。

    循环中检查父进程一个页表中的每个表项,依据表项的内容决定具体的操作。

而表项的内容,则无非是以下这么一些可能:

    1、表项的内容为全0。所以pte_none()返回1。说明该页面的映射尚未建立,或者说是个“空洞”,因此不须要做不论什么事。

  

    2、表项的最低位,即_PAGE_PRESENT标志位为0,所以pte_present()返回0。

说明映射已建立,但是该页面目前不在内存中,已经被调出到交换设备上。此时表项的内容指向“盘上页面”的位置,而现在该盘上页面多了一个“用户”,所以要通过swap_duplicate()递增它的共享计数,然后转到cont_copy_pte_range,将此表项复制到子进程的页表项中。

    3、映射已建立。可是物理页面不是一个有效的内存页面。所以VALID_PAGE()返回0。

读者能够回想一下。我们曾经讲过有些物理页面在外设接口卡上,对应的地址为“总线地址”。而并非内存页面。

这样的页面,就转到cont_copy_pte_range将此表项拷贝到子进程的页表项。

4、须要从父进程复制的可写页面。

本来,此时应该分配一个空暇的内存页面。再从父进程的页面把内容复制过来,并为之建立映射。

显然,这个代价是不小的。然而,对这么辛辛苦苦复制下来的页面,子进程是否一定会用呢?特别是会有写访问吗?如果只是读访问,则只要父进程从此不再写这个页面,就完全可以通过复制指针来共享这个页面,那不知要省事多少了。所以,Linux内核采用了一种称为“copy on write”的技术,先通过复制页表项暂时共享这个页面,到子进程(或父进程)真的要写这个页面时,再分配页面并复制内容。

变量cow是"copy on write"的缩写。可写。而又不是共享。

实际上。对于绝大多数的可写虚拟区间,cow都是1。在通过复制页表项临时共享一个页表项时要做两件重要的事情,首先将父进程的页表项改成写保护(仅仅读)。然后把已经改成写保护的表项设置到子进程的页表项。

这样一来,相应的页面在两个进程中都变成“只读”了。无论是父进程还是子进程企图写入该页面时,都会引起一次页面异常。而页面异常处理程序对此的反应则是另行分配一个物理页面,并把内容真正地复制到新的物理页面中,让父、子进程各自拥有自己的物理页面,然后将两个页表中相应的表项改成可写。但是copy_on_write只有在父、子进程各自拥有自己的页表时才能实现。当CLONE_VM标志位为1时,由于父、子进程通过指针共享用户空间,copy_on_write就用不上了。

此时,父、子进程是在真正的意义上共享用户空间。父进程写入其用户空间的内容同一时候也“写入”子进程的用户空间。

 

5、父进程的仅仅读页面。这样的页面本来就不须要复制。因而能够复制页表项共享物理页面。

返回到do_fork。继续运行copy_thread。代码例如以下:

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,	unsigned long unused,	struct task_struct * p, struct pt_regs * regs){	struct pt_regs * childregs;	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;//指向了子进程系统空间堆栈中的pt_regs结构	struct_cpy(childregs, regs);//把当前进程系统空间堆栈中的pt_regs结构复制过去	childregs->eax = 0;//子进程系统空间堆栈中的pt_regs结构eax置成0	childregs->esp = esp;//子进程系统空间堆栈中的pt_regs结构esp置成这里的參数esp,在fork中,则来自调用do_fork()前夕的regs.esp,所以实际上并没有改变	p->thread.esp = (unsigned long) childregs;//子进程系统空间堆栈中pt_regs结构的起始地址	p->thread.esp0 = (unsigned long) (childregs+1);//指向子进程的系统空间堆栈的顶端	p->thread.eip = (unsigned long) ret_from_fork;	savesegment(fs,p->thread.fs);	savesegment(gs,p->thread.gs);	unlazy_fpu(current);	struct_cpy(&p->thread.i387, ¤t->thread.i387);	return 0;}
    最后形成例如以下图:

    二、clone和vfork

    clone的用户态接口是:int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg)。

    我们看下这clone、fork、vfork几个系统调用的差别:
asmlinkage int sys_fork(struct pt_regs regs){	return do_fork(SIGCHLD, regs.esp, &regs, 0);}asmlinkage int sys_clone(struct pt_regs regs){	unsigned long clone_flags;	unsigned long newsp;	clone_flags = regs.ebx;//就是用户态的flags	newsp = regs.ecx;//就是用户态的child_stack	if (!newsp)		newsp = regs.esp;	return do_fork(clone_flags, newsp, &regs, 0);}/* * This is trivial, and on the face of it looks like it * could equally well be done in user mode. * * Not so, for quite unobvious reasons - register pressure. * In user mode vfork() cannot have a stack frame, and if * done by calling the "clone()" system call directly, you * do not have enough call-clobbered registers to hold all * the information you need. */asmlinkage int sys_vfork(struct pt_regs regs){	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);//主要差别是有两个标志位CLONE_VFORK,CLONE_VM}
    如果完全没有用户空间,就称为“内核线程”;而如果共享用户空间,则称为“用户线程”。
    那么vfork出来的是用户线程,共享用户空间。copy_mm中代码例如以下:
if (clone_flags & CLONE_VM) {
	/* CLONE_VM set: share the parent's mm, just raise mm_users. */
	atomic_inc(&oldmm->mm_users);
	mm = oldmm;
	goto good_mm;
}
    
    vfork和fork另一个差别就是CLONE_VFORK标志位,体如今代码中,do_fork的最后:
fork_out:	if ((clone_flags & CLONE_VFORK) && (retval > 0))//假设clone_flags中CLONE_VFORK位置1		down(&sem);//让父进程在一个信号量上运行一次down()操作,以达到扣留父进程的目的	return retval;
   当调用do_fork的参数中CLONE_VFORK标志位为1时,一定要保证让子进程先运行,一直到子进程通过系统调用execve执行一个新的可执行程序,或者通过系统调用exit()退出系统时,才能恢复父进程的运行。为什么呢?在创建子进程时,如果CLONE_VM为1,只是简单地复制父进程的task_struct结构中指向其mm_struct结构的指针来共享用户空间。

此时,父、子进程是在真正的意义上共享用户空间,父进程写入其用户空间的内容同时也“写入”了子进程的用户空间。因此绝不能让两个进程都回到用户空间并发地运行;否则,必然是两个进程最终都乱来一气,或者因非法越界访问而死亡。解决的办法只能是“扣留”其中一个进程,而只让另一个进程回到用户空间,直到两个进程不再共享它们的用户空间,或者其中一个进程消亡为止。

所以才有了上面的操作。让父进程在一个信号量上运行一次down()操作,以达到扣留父进程的目的。

那么谁来运行up操作呢?
子进程在通过execve运行一个新的可运行程序时会做这件事,此外,子进程在通过exit()退出系统时也会做这件事。

代码例如以下:

/*
 * If the current task still carries PF_VFORK, clear the flag and do an
 * up() on the vfork semaphore reached through its p_opptr pointer.
 * Tasks without PF_VFORK are left untouched.
 */
void mm_release(void)
{
	struct task_struct *self = current;

	if (!(self->flags & PF_VFORK))
		return;

	/* notify parent sleeping on vfork() */
	self->flags &= ~PF_VFORK;
	up(self->p_opptr->vfork_sem);
}
    三、
内核线程
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags){	long retval, d0;	__asm__ __volatile__(		"movl %%esp,%%esi\n\t"   //系统调用前的堆栈指针赋值给esi		"int $0x80\n\t"				"cmpl %%esp,%%esi\n\t"	//系统调用后的堆栈指针和系统调用前的堆栈指针相比,假设不同就是子进程,假设同样就是父进程		"je 1f\n\t"        //跳到父进程						"movl %4,%%eax\n\t"//把參数arg压入堆栈,作为參数		"pushl %%eax\n\t"				"call *%5\n\t" //call fn				"movl %3,%0\n\t" //eax为_NR_exit			"int $0x80\n" //运行exit系统调用		"1:\t"		:"=&a" (retval), "=&S" (d0)		:"0" (__NR_clone), "i" (__NR_exit),//eax为_NR_clone		 "r" (arg), "r" (fn), 		 "b" (flags | CLONE_VM)//ebx为flags | CLONE_VM		: "memory");	return retval;}
 
刚開始eax为_NR_clone。ebx为flags | CLONE_VM,然后调用int 0x80系统调用。那么就进入了sys_clone,代码例如以下:
/*
 * clone() system-call entry point, shown here as reached from
 * kernel_thread(): flags come from ebx and the child's stack pointer
 * from ecx.  A zero stack pointer means "use the caller's current esp".
 *
 * Returns whatever do_fork() returns.
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
	unsigned long clone_flags;
	unsigned long newsp;

	clone_flags = regs.ebx;		/* flags | CLONE_VM when called from kernel_thread() */
	/*
	 * The article treats ecx as 0 on the kernel_thread() path.
	 * NOTE(review): kernel_thread() does not load ecx explicitly - confirm.
	 */
	newsp = regs.ecx;
	if (!newsp)
		newsp = regs.esp;
	return do_fork(clone_flags, newsp, &regs, 0);
}
    当调用kernel_thread的进程本身是内核线程时,current->mm为NULL,copy_mm直接返回,新创建的内核线程的mm指针也就保持为NULL。copy_mm中的相关代码如下:
oldmm = current->mm;
if (!oldmm)
	return 0;	/* kernel thread: oldmm is NULL, return at once */

    最后附上,全部标志位的作用:

/*
 * Flag bits accepted by do_fork() in clone_flags.  The bits covered by
 * CSIGNAL carry the signal number; the remaining bits select what the
 * child shares with its creator.
 */
#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PID	0x00001000	/* set if pid shared */
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD	0x00010000	/* Same thread group? */

#define CLONE_SIGNAL	(CLONE_SIGHAND | CLONE_THREAD)

版权声明:本文为博主原创文章,未经同意,不得转载。

你可能感兴趣的文章
jQuery打造智能提示插件二(可编辑下拉框)
查看>>
[Python] Python 之 function, unbound method 和 bound method
查看>>
希尔排序
查看>>
改变随机数中一些值的概率
查看>>
Spark分析之SparkContext启动过程分析
查看>>
2014电子商务安全技术峰会(含全议题下载)
查看>>
东大OJ-5到100000000之间的回文质数
查看>>
linux C 快速排序法
查看>>
模仿与创新
查看>>
Python用subprocess的Popen来调用系统命令
查看>>
Java NIO与IO的差别和比較
查看>>
.NET源代码的内部排序实现
查看>>
解决Strict Standards: Only variables should be passed by reference
查看>>
解决JBoss只能通过localhost(127.0.0.1)而不能通过IP访问
查看>>
MS SQL处理双引号(DoubleQuote)函数
查看>>
[智能架构系列]什么是Buddy智能开发框架
查看>>
三十一、关于android camera setParameters出错
查看>>
【收藏】QCIF、 CIF、2CIF、DCIF、D1(4CIF)格式介绍
查看>>
hdu 3836 Equivalent Sets (tarjan缩点)
查看>>
一些iOS高效开源类库(转)
查看>>