/*
 * This is the structure pointed to by thread.sp for an inactive task. The
 * order of the fields must match the code in __switch_to_asm().
 */
struct inactive_task_frame {
#ifdef CONFIG_X86_64
    unsigned long r15;
    unsigned long r14;
    unsigned long r13;
    unsigned long r12;
#else
    unsigned long flags;
    unsigned long si;
    unsigned long di;
#endif
    unsigned long bx;

    /*
     * These two fields must be together. They form a stack frame header,
     * needed by get_frame_pointer().
     */
    unsigned long bp;
    unsigned long ret_addr;
};
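For context, this frame is what copy_thread() prepares when a new task is created, so that the first __switch_to_asm() into the task "returns" to ret_from_fork_asm below. A lightly abridged sketch of the relevant lines from arch/x86/kernel/process.c in recent kernels (details vary by kernel version):

    /* abridged from copy_thread(), arch/x86/kernel/process.c */
    childregs = task_pt_regs(p);
    fork_frame = container_of(childregs, struct fork_frame, regs);
    frame = &fork_frame->frame;

    frame->bp = encode_frame_pointer(childregs);
    frame->ret_addr = (unsigned long) ret_from_fork_asm;
    p->thread.sp = (unsigned long) fork_frame;

    if (unlikely(args->fn)) {
        /* kernel thread: kthread_frame_init() stashes fn in frame->bx
         * and fn_arg in frame->r12, exactly the registers that
         * ret_from_fork_asm below expects */
        kthread_frame_init(frame, args->fn, args->fn_arg);
        return 0;
    }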
/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
    /*
     * This is the start of the kernel stack; even through there's a
     * register set at the top, the regset isn't necessarily coherent
     * (consider kthreads) and one cannot unwind further.
     *
     * This ensures stack unwinds of kernel threads terminate in a known
     * good state.
     */
    UNWIND_HINT_END_OF_STACK
    ANNOTATE_NOENDBR // copy_thread
    CALL_DEPTH_ACCOUNT

    movq %rax, %rdi /* prev */    // first argument: the previous task we switched from
    movq %rsp, %rsi /* regs */    // second argument: the general-purpose register set
    movq %rbx, %rdx /* fn */      // third argument: the kernel thread function (NULL for a user thread)
    movq %r12, %rcx /* fn_arg */  // fourth argument: the kernel thread function's argument
    call ret_from_fork

    /*
     * Set the stack state to what is expected for the target function
     * -- at this point the register set should be a valid user set
     * and unwind should work normally.
     */
    UNWIND_HINT_REGS
    jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(ret_from_fork_asm)
.popsection
__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
                             int (*fn)(void *), void *fn_arg)
{
    schedule_tail(prev);

    /* Is this a kernel thread? */
    if (unlikely(fn)) {
        fn(fn_arg);
        /*
         * A kernel thread is allowed to return here after successfully
         * calling kernel_execve(). Exit to userspace to complete the
         * execve() syscall.
         */
        regs->ax = 0;
    }

    syscall_exit_to_user_mode(regs);
}
static void create_kthread(struct kthread_create_info *create)
{
    int pid;

#ifdef CONFIG_NUMA
    current->pref_node_fork = create->node;
#endif
    /* We want our own signal handler (we take no signals by default). */
    pid = kernel_thread(kthread, create, create->full_name,
                        CLONE_FS | CLONE_FILES | SIGCHLD);
    if (pid < 0) { // creation failed: reclaim resources and wake up the waiter on the kthread-creation completion
        /* Release the structure when caller killed by a fatal signal. */
        struct completion *done = xchg(&create->done, NULL);

        kfree(create->full_name);
        if (!done) {
            kfree(create);
            return;
        }
        create->result = ERR_PTR(pid);
        complete(done);
    }
}
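The create argument passed around above is a struct kthread_create_info; for reference, its definition from kernel/kthread.c in recent kernels (exact fields may differ across versions):

struct kthread_create_info
{
    /* Information passed to kthread() from kthreadd. */
    char *full_name;
    int (*threadfn)(void *data);
    void *data;
    int node;

    /* Result passed back to kthread_create() from kthreadd. */
    struct task_struct *result;
    struct completion *done;

    struct list_head list;
};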
The common entry function of kernel threads: kthread()
kthread() is the entry function shared by all kernel threads.
kernel/kthread.c
static int kthread(void *_create)
{
    static const struct sched_param param = { .sched_priority = 0 };
    /* Copy data: it's on kthread's stack */
    struct kthread_create_info *create = _create;
    int (*threadfn)(void *data) = create->threadfn; // most important: the kernel thread's entry function
    void *data = create->data;
    struct completion *done;
    struct kthread *self;
    int ret;

    self = to_kthread(current);

    // If the creator was killed by a fatal signal, free the kthread-creation info structure
    /* Release the structure when caller killed by a fatal signal. */
    done = xchg(&create->done, NULL);
    if (!done) {
        kfree(create->full_name);
        kfree(create);
        kthread_exit(-EINTR);
    }

    self->full_name = create->full_name;
    self->threadfn = threadfn;
    self->data = data;

    /*
     * The new thread inherited kthreadd's priority and CPU mask. Reset
     * back to default in case they have been changed.
     */
    sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
    set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));

    /* OK, tell user we're spawned, wait for stop or wakeup */
    __set_current_state(TASK_UNINTERRUPTIBLE);
    create->result = current; // set result to current: tell the creator we have spawned and are waiting to be stopped or woken up
    /*
     * Thread is going to call schedule(), do not preempt it,
     * or the creator may spend more time in wait_task_inactive().
     */
    preempt_disable();
    complete(done); // wake up the waiter on the kthread-creation completion
    schedule_preempt_disabled();
    preempt_enable();

    // If the KTHREAD_SHOULD_STOP flag is not set, the thread starts running here
    ret = -EINTR;
    if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
        cgroup_kthread_ready(); // this kthread has finished initialization; if it should stay in the root group, the creator should have set PF_NO_SETAFFINITY
        __kthread_parkme(self); // a set KTHREAD_SHOULD_PARK flag likewise gets the kernel thread scheduled out
        ret = threadfn(data); // call the kernel thread's specific entry function
    }
    kthread_exit(ret);
}
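To see how threadfn and the KTHREAD_SHOULD_STOP flag fit together from a caller's point of view, here is a minimal, hypothetical sketch using the standard kthread API (my_worker and its loop body are illustrative only):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *worker;

/* hypothetical thread function: loops until kthread_stop() is called */
static int my_worker(void *data)
{
    while (!kthread_should_stop()) {
        /* ... do periodic work on data ... */
        schedule_timeout_interruptible(HZ);
    }
    return 0; /* becomes the return value of kthread_stop() */
}

static int start_worker(void)
{
    /* kthread_run() = kthread_create() + wake_up_process() */
    worker = kthread_run(my_worker, NULL, "my_worker");
    return PTR_ERR_OR_ZERO(worker);
}

static void stop_worker(void)
{
    kthread_stop(worker); /* sets KTHREAD_SHOULD_STOP and waits for the thread to exit */
}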
int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
                            const struct cpumask *cpus)
{
    // Build a local instance of struct multi_stop_data
    struct multi_stop_data msdata = {
        .fn = fn,
        .data = data,
        .num_threads = num_online_cpus(),
        .active_cpus = cpus,
    };

    lockdep_assert_cpus_held();

    // cpu_stop_init() sets this global flag to true; if it is still false,
    // the stop tasks have not finished initializing yet
    if (!stop_machine_initialized) {
        /*
         * Handle the case where stop_machine() is called
         * early in boot before stop_machine() has been
         * initialized.
         */
        unsigned long flags;
        int ret;

        WARN_ON_ONCE(msdata.num_threads != 1);

        // In that case, simply invoke the supplied callback directly
        local_irq_save(flags);
        hard_irq_disable();
        ret = (*fn)(data);
        local_irq_restore(flags);
        // Return right away without queuing work to the CPU stop tasks,
        // since they are not ready yet
        return ret;
    }

    /* Set the initial state and stop all online cpus. */
    // i.e. reset the state machine, then run the stop task on every online CPU
    set_state(&msdata, MULTI_STOP_PREPARE);
    return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}
int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
    int ret;

    /* No CPUs can come up or down during this. */
    cpus_read_lock();
    ret = stop_machine_cpuslocked(fn, data, cpus);
    cpus_read_unlock();
    return ret;
}
Note the multi_cpu_stop() function; it is expanded on later.
Judging from the call path of stop_machine(), what it does is not complicated:
it wraps the function fn to be called, together with its argument data, into a work item and queues it on the work list of each per-CPU stop task.
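A minimal caller sketch, assuming a hypothetical callback my_sync_fn; per the multi_cpu_stop() logic below, with cpus == NULL the callback runs only on the first online CPU while every other CPU spins with hard interrupts disabled:

#include <linux/stop_machine.h>

/* hypothetical callback: runs with hard IRQs off everywhere, so keep it short */
static int my_sync_fn(void *arg)
{
    int *val = arg;

    *val += 1;
    return 0; /* propagated back as stop_machine()'s return value */
}

static int example(void)
{
    int val = 0;

    /* NULL cpus: my_sync_fn runs on the first online CPU only */
    return stop_machine(my_sync_fn, &val, NULL);
}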
/* This controls the threads on each CPU. */
enum multi_stop_state {
    /* Dummy starting state for thread. */
    MULTI_STOP_NONE,
    /* Awaiting everyone to be scheduled. */ // i.e. wait until the stop thread has been scheduled on every CPU
    MULTI_STOP_PREPARE,
    /* Disable interrupts. */
    MULTI_STOP_DISABLE_IRQ,
    /* Run the function */
    MULTI_STOP_RUN,
    /* Exit */ // the stop tasks exit
    MULTI_STOP_EXIT,
};
struct multi_stop_data {
    cpu_stop_fn_t fn;
    void *data;
    /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
    unsigned int num_threads;
    const struct cpumask *active_cpus;

    enum multi_stop_state state; // records which stage the stop threads are in
    atomic_t thread_ack;
};
Before going through multi_cpu_stop() in detail, first look at ack_state():
it is called on every CPU;
each call decrements the count of stop tasks that still have to acknowledge the current state;
when that count drops to zero, it advances the state machine to the next state, pushing the whole stop event forward.
multi_cpu_stop() then drives the overall state machine and the invocation of the stop work.
kernel/stop_machine.c
/* Last one to ack a state moves to the next state. */
static void ack_state(struct multi_stop_data *msdata)
{
    if (atomic_dec_and_test(&msdata->thread_ack))
        set_state(msdata, msdata->state + 1);
}
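set_state(), which ack_state() uses to advance the machine, resets the ack counter to num_threads before publishing the new state, so every stop thread must ack again at each stage; for reference, from the same kernel/stop_machine.c:

/* Set the state, reinitializing the ack counter for the new state. */
static void set_state(struct multi_stop_data *msdata,
                      enum multi_stop_state newstate)
{
    /* Reset ack counter. */
    atomic_set(&msdata->thread_ack, msdata->num_threads);
    smp_wmb();
    WRITE_ONCE(msdata->state, newstate);
}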
/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
    struct multi_stop_data *msdata = data; // the local variable on stop_machine_cpuslocked()'s stack
    enum multi_stop_state newstate, curstate = MULTI_STOP_NONE; // the local state starts out as NONE
    int cpu = smp_processor_id(), err = 0;
    const struct cpumask *cpumask;
    unsigned long flags;
    bool is_active;

    /*
     * When called from stop_machine_from_inactive_cpu(), irq might
     * already be disabled. Save the state and restore it on exit.
     */
    local_save_flags(flags);

    if (!msdata->active_cpus) {
        cpumask = cpu_online_mask;
        is_active = cpu == cpumask_first(cpumask);
    } else {
        cpumask = msdata->active_cpus;
        is_active = cpumask_test_cpu(cpu, cpumask);
    }

    /* Simple state machine */
    do {
        /* Chill out and ensure we re-read multi_stop_state. */
        stop_machine_yield(cpumask);
        newstate = READ_ONCE(msdata->state); // read the global state
        if (newstate != curstate) { // the local state differs from the global state
            curstate = newstate; // update the local state to match the global one
            switch (curstate) {
            case MULTI_STOP_DISABLE_IRQ: // every CPU passes through here to disable hard interrupts
                local_irq_disable();
                hard_irq_disable();
                break;
            case MULTI_STOP_RUN: // active CPUs pass through here to run the work callback
                if (is_active)
                    err = msdata->fn(msdata->data);
                break;
            default:
                break;
            }
            ack_state(msdata); // once every CPU has reached the same state, advance the global state to the next stage
        } else if (curstate > MULTI_STOP_PREPARE) { // local state matches the global one and we are past the prepare stage
            /*
             * At this stage all other CPUs we depend on must spin
             * in the same loop. Any reason for hard-lockup should
             * be detected and reported on their side.
             */
            touch_nmi_watchdog(); // feed the NMI watchdog to avoid a hard lockup
        }
        rcu_momentary_dyntick_idle();
    } while (curstate != MULTI_STOP_EXIT); // every CPU has finished the work callback; leave the loop

    local_irq_restore(flags);
    return err;
}
void stop_machine_park(int cpu)
{
    struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

    /*
     * Lockless. cpu_stopper_thread() will take stopper->lock and flush
     * the pending works before it parks, until then it is fine to queue
     * the new works.
     */
    stopper->enabled = false; // this flag prevents new work from being queued on the work list
    kthread_park(stopper->thread);
}