Process Creation
Creating a process on Linux is straightforward.
fork()  ---\                 /---> sys_fork()  ---\
vfork() ----+---> swi ------+----> sys_vfork() ----+---> _do_fork()
clone() ---/                 \---> sys_clone() ---/
pthread_create()  (userspace wrapper around clone())
kthread_create()  (in-kernel path, via kernel_thread())
Every userspace API related to process creation enters the kernel through a system call and ends up executing _do_fork().
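As a quick reference, here is a minimal userspace sketch of the first path above; fork() (which glibc typically implements on top of the clone system call) traps into the kernel and eventually runs _do_fork():

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();	/* syscall -> _do_fork() -> copy_process() */

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {
		/* child: the freshly copied task */
		printf("child  pid=%d ppid=%d\n", getpid(), getppid());
		_exit(0);
	}
	/* parent: fork() returned the child's pid */
	printf("parent pid=%d child=%d\n", getpid(), pid);
	waitpid(pid, NULL, 0);
	return 0;
}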
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
...
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls);
if (!IS_ERR(p)) {
...
struct pid *pid;
...
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
...
wake_up_new_task(p);
...
put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
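Note that the nr handed back to userspace comes from pid_vnr(), i.e. it is the new task's pid as seen from the caller's pid namespace, which is not necessarily the global pid. A rough sketch that makes the difference visible (hypothetical demo; creating the new pid namespace needs CAP_SYS_ADMIN):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* children created from now on land in a fresh pid namespace */
	if (unshare(CLONE_NEWPID) < 0) {
		perror("unshare(CLONE_NEWPID)");
		return 1;
	}

	pid_t child = fork();
	if (child < 0) {
		perror("fork");
		return 1;
	}
	if (child == 0) {
		/* inside the new namespace the child is its init: getpid() == 1 */
		printf("child sees itself as pid %d\n", getpid());
		_exit(0);
	}
	/* the parent got the pid that is valid in its own namespace (pid_vnr()) */
	printf("parent sees the child as pid %d\n", child);
	waitpid(child, NULL, 0);
	return 0;
}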
This function does three things:
1) it copies the current process to create a new process,
2) it allocates the new process's process id (pid), and
3) it puts the new process on a runqueue so it can start running.
Let's look at each of these in more detail.
copy_process() - copy the parent to create a new task
As far as I know this is the longest function in the kernel; copying a task is simply that much work, with that many details to take care of:
1) check that clone_flags was passed as a valid combination
2) duplicate the task_struct
3) check max_threads
4) initialize the cgroup/cpuset-related fields
5) initialize the scheduling-related fields
6) copy files, fs, sighand, signal, mm, ns, io
7) allocate and initialize the pid
8) thread-related initialization
9) everything else
Checking the clone_flags combination
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace,
unsigned long tls)
{
int retval;
struct task_struct *p;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
/*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group with the forking task.
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
return ERR_PTR(-EINVAL);
}
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current);
if (!p)
goto fork_out;
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free;
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqcount_init(&p->vtime_seqcount);
p->vtime_snap = 0;
p->vtime_snap_whence = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cpu_timers_init(p);
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_threadgroup_lock;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
p->hardirqs_enabled = 0;
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
		/* allocate a new struct pid */
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_io;
}
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
p->sas_ss_sp = p->sas_ss_size = 0;
/*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
clear_all_latency_tracing(p);
/* ok, now we should be set up.. */
	/* returns the global pid - the value that is unique in the root (init) pid namespace */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
	/* if this copy_process() call is not creating a thread.. */
} else {
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
p->exit_signal = (clone_flags & CSIGNAL);
		/* naturally, the newly created task becomes its own thread-group leader */
p->group_leader = p;
		/* and the thread group id is simply the leader's pid */
p->tgid = p->pid;
}
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p);
if (retval)
goto bad_fork_free_pid;
/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
	spin_lock(&current->sighand->siglock);
/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);
/*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
* it's process group.
* A fatal signal pending means that current will exit, so the new
* thread can't slip out of an OOM kill (or normal SIGKILL).
*/
recalc_sigpending();
if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
/* p->pids[PIDTYPE_PID] = pid */
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
			/* p->pids[PIDTYPE_PGID] = task_pgrp(current), the parent's process group */
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			/* p->pids[PIDTYPE_SID] = task_session(current), the parent's session */
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			atomic_inc(&current->signal->sigcnt);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
	spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
threadgroup_change_end(current);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
return p;
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
mmput(p->mm);
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
exit_fs(p); /* blocking */
bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
threadgroup_change_end(current);
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
free_task(p);
fork_out:
return ERR_PTR(retval);
}
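The flag-combination checks at the top of copy_process() are directly observable from userspace: for example, CLONE_THREAD without CLONE_SIGHAND is rejected and clone(2) fails with EINVAL. A rough sketch using the glibc clone() wrapper (assumes a downward-growing stack):

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int child_fn(void *arg)
{
	(void)arg;
	return 0;
}

int main(void)
{
	size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);

	if (!stack)
		return 1;

	/* CLONE_THREAD without CLONE_SIGHAND: copy_process() returns -EINVAL,
	 * so this call is expected to fail. */
	int ret = clone(child_fn, stack + stack_size,
			CLONE_THREAD | CLONE_VM, NULL);
	if (ret < 0)
		printf("clone(CLONE_THREAD without CLONE_SIGHAND): %s\n",
		       strerror(errno));

	free(stack);
	return 0;
}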
set_task_cpu()
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
...
#ifdef CONFIG_LOCKDEP
...
#endif
#endif
...
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
		...
}
__set_task_cpu(p, new_cpu);
}
- If the cpu number recorded in task @p's thread_info differs from @new_cpu:
- if @p is a normal (fair-class) task, its load is removed from the cfs_rq it currently belongs to, since @p is about to migrate to @new_cpu
- remove_entity_load_avg() does this removal
- __set_task_cpu() then picks the cfs_rq/rt_rq that @p will be enqueued on based on @new_cpu, and records @new_cpu in @p's thread_info and task_struct.
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
set_task_rq(p, cpu);
#ifdef CONFIG_SMP
smp_wmb();
task_thread_info(p)->cpu = cpu;
p->wake_cpu = cpu;
#endif
}
- set_task_rq() picks the cfs_rq/rt_rq that task @p will be enqueued on, and sets up its relationship to the parent scheduling entity.
- The task will later be enqueued onto the runqueue chosen here.
- thread_info->cpu of @p is set to the cpu the task will be enqueued on.
- Where is this used? task_cpu() reads it - see the snippet right below.
- wake_cpu is set to @cpu, the cpu @p will preferably run on when it wakes up later; it is not guaranteed to be enqueued there, but this cpu is the preferred target.
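To answer the question above: thread_info->cpu is what task_cpu() returns, and the scheduler uses task_cpu() everywhere it needs the runqueue a task currently belongs to. In kernels of this generation (before THREAD_INFO_IN_TASK) it is roughly:

static inline unsigned int task_cpu(const struct task_struct *p)
{
	return task_thread_info(p)->cpu;
}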
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
...
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
#endif
}
- set_task_rq_fair() is the hard part.
- set_task_rq_fair() was authored by byungchul.park of LGE:
- commit ad936d8658fd348338cb7d42c577dac77892b074
- entity load aging on attaching
- a9280514bf1e54775b8d7cd93d87c05c2b5273e6
- If group scheduling is enabled:
- the cfs_rq/rt_rq the task will be enqueued on are set up, and
- its relationship to the parent in the scheduling-entity hierarchy is set up; the task group the task belongs to becomes the parent in that hierarchy.
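For context, the tg->cfs_rq[cpu] / tg->se[cpu] indexing in set_task_rq() works because a task group keeps one cfs_rq and one group scheduling entity per cpu. A trimmed sketch of struct task_group (fields as I recall them from kernel/sched/sched.h of this era):

struct task_group {
	struct cgroup_subsys_state css;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* one group scheduling entity and one cfs_rq per cpu */
	struct sched_entity **se;
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;
#endif
	...
};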
Initializing the scheduling-related fields - sched_fork()
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
__sched_fork(clone_flags, p);
p->state = TASK_RUNNING;
p->prio = current->normal_prio;
if (dl_prio(p->prio)) {
put_cpu();
return -EAGAIN;
} else if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
} else {
p->sched_class = &fair_sched_class;
}
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
set_task_cpu(p, cpu);
#if defined(CONFIG_SMP)
p->on_cpu = 0;
#endif
init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
	put_cpu();
	return 0;
}
- __sched_fork() initializes the task's sched_entity and sched_rt_entity (and, as shown below, its sched_dl_entity).
- The task's state is set to TASK_RUNNING.
- The task's prio field is set to the parent's normal_prio. The parent's current prio is deliberately not used so that a boosted priority is not inherited by the child.
- The task's scheduling class is chosen from the range the prio value falls in: 0-99 is RT, 100-139 is fair (CFS), and a negative value means deadline, in which case sched_fork() bails out with -EAGAIN, since a deadline task's bandwidth cannot simply be duplicated for the child. (See the two predicates sketched below.)
- The chosen scheduling class then runs its own initialization through its task_fork() callback.
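The class selection boils down to two small predicates on the prio value (paraphrased from include/linux/sched/rt.h and include/linux/sched/deadline.h of this era; MAX_RT_PRIO is 100 and MAX_DL_PRIO is 0):

static inline int rt_prio(int prio)
{
	if (unlikely(prio < MAX_RT_PRIO))	/* 0..99: realtime */
		return 1;
	return 0;
}

static inline int dl_prio(int prio)
{
	if (unlikely(prio < MAX_DL_PRIO))	/* negative: deadline */
		return 1;
	return 0;
}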
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0; |
p->se.on_rq = 0; |
p->se.exec_start = 0; |
p->se.sum_exec_runtime = 0; |
p->se.prev_sum_exec_runtime = 0; |---- (1)
p->se.nr_migrations = 0; |
p->se.vruntime = 0; |
INIT_LIST_HEAD(&p->se.group_node); |
#ifdef CONFIG_FAIR_GROUP_SCHED |
p->se.cfs_rq = NULL; |
#endif
#ifdef CONFIG_SCHEDSTATS
...
#endif
RB_CLEAR_NODE(&p->dl.rb_node); |
init_dl_task_timer(&p->dl); |---- (2)
__dl_clear_params(p); |
INIT_LIST_HEAD(&p->rt.run_list); |
p->rt.timeout = 0; |
p->rt.time_slice = sched_rr_timeslice; |---- (3)
p->rt.on_rq = 0; |
p->rt.on_list = 0; |
#ifdef CONFIG_PREEMPT_NOTIFIERS
...
#endif
#ifdef CONFIG_NUMA_BALANCING
...
#endif /* CONFIG_NUMA_BALANCING */
}
Code block (1) initializes the related fields to 0 because the task has not yet been enqueued on a runqueue and its sched_entity has not yet been inserted into the red-black tree.
Code block (2) initializes the task's sched_dl_entity fields: the deadline rb-tree node is cleared, the dl timer is set up, and the deadline parameters are cleared.
Code block (3) initializes the task's sched_rt_entity fields: the run list, the timeout, and a fresh round-robin time slice (sched_rr_timeslice).