- When returning to user space from a system call. (Preemption while a system call is executing is kernel preemption; what is meant here is preemption while user-mode code is running, e.g. user program A being switched out for user program B.)
- When returning to user space from an interrupt handler.
- A process switch happens only when the CPU is yielded voluntarily or when the process returns to user mode (e.g. on system call return).
- Once a process has entered kernel mode (e.g. via a system call), even if its time slice runs out (say a timer interrupt fires during the system call and the accounting finds the slice exhausted), the switch is not performed immediately; it is deferred until the process returns to user mode (e.g. when the system call returns).
- When an interrupt handler finishes, before it returns to kernel space.
- When kernel code becomes preemptible again (for example, when `preempt_enable()` is called; see the sketch after this list).
- If a task in the kernel blocks (which likewise results in a call to `schedule()`).
- Apart from the cases above, if a task in the kernel explicitly calls `schedule()`.
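- To make the `preempt_enable()` preemption point concrete, here is a minimal kernel-module-style sketch. It is not taken from the kernel source: `touch_per_cpu_state()` and the log message are made up for illustration, while `preempt_disable()`, `preempt_enable()` and `smp_processor_id()` are the real APIs.

```
#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/printk.h>

/* Hypothetical helper: do a little work that must stay on one CPU. */
static void touch_per_cpu_state(void)
{
	preempt_disable();	/* preempt_count++: no kernel preemption, no CPU migration */

	pr_info("running on CPU %d\n", smp_processor_id());

	preempt_enable();	/* preempt_count--: if TIF_NEED_RESCHED was set in the
				 * meantime, __preempt_schedule() runs right here */
}
```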
- `preemptible()`
- include/linux/preempt.h

```
...
#ifdef CONFIG_PREEMPT_COUNT

#define preempt_disable() \
do { \
	preempt_count_inc(); \
	barrier(); \
} while (0)
...
#define preemptible()	(preempt_count() == 0 && !irqs_disabled())

#ifdef CONFIG_PREEMPT
#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)
...
#else /* !CONFIG_PREEMPT */
#define preempt_enable() \
do { \
	barrier(); \
	preempt_count_dec(); \
} while (0)
...
#endif /* CONFIG_PREEMPT */
...
#else /* !CONFIG_PREEMPT_COUNT */
/*
 * Even if we don't have any preemption, we need preempt disable/enable
 * to be barriers, so that we don't have things like get_user/put_user
 * that can cause faults and scheduling migrate into our preempt-protected
 * region.
 */
#define preempt_disable()	barrier()
...
#define preempt_enable()	barrier()
...
#define preemptible()	0

#endif /* CONFIG_PREEMPT_COUNT */
...
```
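- The nesting behaviour of these macros can be modelled in a few lines of ordinary user-space C. The toy program below is not kernel code: `model_preempt_count`, `model_need_resched` and the `model_*` helpers merely imitate the `CONFIG_PREEMPT` definitions above, to show that only the outermost `preempt_enable()` is a preemption point.

```
#include <stdbool.h>
#include <stdio.h>

/* Toy user-space model of the CONFIG_PREEMPT macros above (not kernel code).
 * model_preempt_count stands in for preempt_count, and model_need_resched
 * pretends TIF_NEED_RESCHED has been set by a timer interrupt. */
static int  model_preempt_count;
static bool model_need_resched = true;

static void model_preempt_disable(void)
{
	model_preempt_count++;
}

static void model_preempt_enable(void)
{
	/* Only the outermost enable, which drops the count back to 0,
	 * is a preemption point. */
	if (--model_preempt_count == 0 && model_need_resched)
		printf("count hit 0: would call __preempt_schedule()\n");
	else
		printf("still nested (count=%d): no preemption\n", model_preempt_count);
}

int main(void)
{
	model_preempt_disable();	/* outer critical section  (count 0 -> 1) */
	model_preempt_disable();	/* nested critical section (count 1 -> 2) */
	model_preempt_enable();		/* count 2 -> 1: nothing happens        */
	model_preempt_enable();		/* count 1 -> 0: preemption point       */
	return 0;
}
```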
- The implementation of `__preempt_schedule()` is overridden on the x86 platform; other platforms use the generic macro, which expands to `preempt_schedule()`.
- Before kernel preemption, `preemptible()` is checked to decide whether preemption is allowed.
- kernel/sched/core.c

```
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(!preemptible()))
		return;

	preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
...
```
- `preempt_count_add()` and `preempt_count_sub()` call the overridable `__preempt_count_add()` and `__preempt_count_sub()` (see the call-chain sketch after the excerpt below).
- Currently only x86 overrides the implementations of `__preempt_count_add()` and `__preempt_count_sub()`.
- Non-debug versions of `preempt_count_add()` and `preempt_count_sub()`
- include/linux/preempt.h

```
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
...
#else
#define preempt_count_add(val)	__preempt_count_add(val)
#define preempt_count_sub(val)	__preempt_count_sub(val)
...
#endif
...
```
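- As promised above, a rough sketch of the call chain in a non-debug build. `preempt_count_inc()` is defined in include/linux/preempt.h as `preempt_count_add(1)`; the rest follows from the excerpt just shown. This is an orientation aid, not a verbatim kernel quote.

```
/*
 * Rough call chain, non-debug build (sketch, not a verbatim kernel quote):
 *
 *   preempt_disable()
 *     -> preempt_count_inc()          - defined as preempt_count_add(1)
 *       -> preempt_count_add(1)       - mapped to __preempt_count_add(1) by the #else branch above
 *         -> __preempt_count_add(1)   - x86 override, or the asm-generic version shown later
 */
```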
- Debug versions of `preempt_count_add()` and `preempt_count_sub()`
- kernel/sched/core.c

```
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
				defined(CONFIG_PREEMPT_TRACER))

void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val) {
		unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
		current->preempt_disable_ip = ip;
#endif
		trace_preempt_off(CALLER_ADDR0, ip);
	}
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);

void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
	__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
...
```
- x86's `thread_info` structure has no `preempt_count` member; the count is stored in the Per-CPU variable `__preempt_count` instead.
- arch/x86/include/asm/thread_info.h

```
struct thread_info {
	struct task_struct	*task;		/* main task structure */
	__u32			flags;		/* low level flags */
	__u32			status;		/* thread synchronous flags */
	__u32			cpu;		/* current CPU */
	mm_segment_t		addr_limit;
	unsigned int		sig_on_uaccess_error:1;
	unsigned int		uaccess_err:1;	/* uaccess failed */
};
```
- x86's overridden `preempt_count`-related implementation (a sketch of the dec-and-test helper follows the excerpt).
- arch/x86/include/asm/preempt.h

```
DECLARE_PER_CPU(int, __preempt_count);
...
/*
 * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
 * that think a non-zero value indicates we cannot preempt.
 */
static __always_inline int preempt_count(void)
{	/* return the preemption count */
	return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}

static __always_inline void preempt_count_set(int pc)
{	/* set the preemption count */
	raw_cpu_write_4(__preempt_count, pc);
}
...
/*
 * The various preempt_count add/sub methods
 */
static __always_inline void __preempt_count_add(int val)
{
	raw_cpu_add_4(__preempt_count, val);
}

static __always_inline void __preempt_count_sub(int val)
{
	raw_cpu_add_4(__preempt_count, -val);
}
...
```
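- As mentioned above, a sketch of the dec-and-test helper. The real `__preempt_count_dec_and_test()` in this header is generated from an inline-asm helper macro; the version below is only an approximation of its behaviour, built from the accessors already shown. The key point is that x86 keeps `PREEMPT_NEED_RESCHED` *inverted* inside `__preempt_count`, so the whole word can only reach 0 when the nesting count is 0 and a reschedule has been requested, letting `preempt_enable()` test both conditions at once.

```
/*
 * Approximation (for illustration only) of x86's __preempt_count_dec_and_test().
 * The real helper does the decrement and the zero test in a single instruction.
 * Because PREEMPT_NEED_RESCHED is stored inverted in __preempt_count, the value
 * only drops to 0 when the nesting count is 0 AND a reschedule is wanted.
 */
static __always_inline bool __preempt_count_dec_and_test_sketch(void)
{
	raw_cpu_add_4(__preempt_count, -1);
	return raw_cpu_read_4(__preempt_count) == 0;
}
```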
- ARM's `thread_info` structure does have a `preempt_count` member, unlike x86. (A small user-space check of the `current_thread_info()` stack-pointer masking follows the excerpt.)
- arch/arm/include/asm/thread_info.h

```
/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
struct thread_info {
	unsigned long		flags;		/* low level flags */
	int			preempt_count;	/* 0 => preemptable, <0 => bug */
	mm_segment_t		addr_limit;	/* address limit */
	struct task_struct	*task;		/* main task structure */
	__u32			cpu;		/* cpu */
	__u32			cpu_domain;	/* cpu domain */
	struct cpu_context_save	cpu_context;	/* cpu context */
	__u32			syscall;	/* syscall number */
	__u8			used_cp[16];	/* thread used copro */
	unsigned long		tp_value[2];	/* TLS registers */
#ifdef CONFIG_CRUNCH
	struct crunch_state	crunchstate;
#endif
	union fp_state		fpstate __attribute__((aligned(8)));
	union vfp_state		vfpstate;
#ifdef CONFIG_ARM_THUMBEE
	unsigned long		thumbee_state;	/* ThumbEE Handler Base register */
#endif
};
...
/*
 * how to get the current stack pointer in C
 */
register unsigned long current_stack_pointer asm ("sp");

/*
 * how to get the thread information struct from C
 */
static inline struct thread_info *current_thread_info(void) __attribute_const__;

static inline struct thread_info *current_thread_info(void)
{
	return (struct thread_info *)
		(current_stack_pointer & ~(THREAD_SIZE - 1));
}
...
```
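- A quick arithmetic check of the stack-pointer masking in `current_thread_info()`, as ordinary user-space C. `THREAD_SIZE` = 8 KiB is an assumption (the usual ARM value) and the `sp` value is just an example; the point is that clearing the low bits yields the base of the kernel stack, where `thread_info` sits.

```
#include <stdio.h>

/* User-space arithmetic check of the sp-masking trick used by ARM's
 * current_thread_info(). THREAD_SIZE = 8 KiB is an assumption (the usual
 * ARM value); the sp value is just an example. */
#define THREAD_SIZE 8192UL

int main(void)
{
	unsigned long sp = 0xC1234ABCUL;		/* example kernel stack pointer */
	unsigned long ti = sp & ~(THREAD_SIZE - 1);	/* base of the kernel stack */

	printf("sp = %#lx -> thread_info at %#lx\n", sp, ti);	/* prints 0xc1234000 */
	return 0;
}
```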
- ARM uses the generic `__preempt_count_add()` and `__preempt_count_sub()` implementations.
- include/asm-generic/preempt.h

```
static __always_inline int preempt_count(void)
{	/* return the preemption count */
	return current_thread_info()->preempt_count;
}

static __always_inline int *preempt_count_ptr(void)
{
	return &current_thread_info()->preempt_count;
}

static __always_inline void preempt_count_set(int pc)
{	/* set the preemption count */
	*preempt_count_ptr() = pc;
}
...
/*
 * The various preempt_count add/sub methods
 */
static __always_inline void __preempt_count_add(int val)
{
	*preempt_count_ptr() += val;
}

static __always_inline void __preempt_count_sub(int val)
{
	*preempt_count_ptr() -= val;
}
...
```
- `preempt_count_ptr()` is called only from the following functions; since x86's `thread_info` has no `preempt_count` member, x86 provides its own implementations of these as well:
  - `preempt_count_set()`
  - `__preempt_count_add()`
  - `__preempt_count_sub()`
  - `__preempt_count_dec_and_test()`
- Kernel preemption may also take place after a hardware interrupt request has been handled.
- If the processor returns to kernel mode after handling the interrupt (a return to user mode is not affected), the architecture-specific assembly checks:
  - whether the preemption count is 0, i.e. whether preemption is allowed,
  - and whether the reschedule flag is set.
- If both conditions hold, the scheduler is invoked via `preempt_schedule_irq()`. The essential difference between `preempt_schedule_irq()` and `preempt_schedule()` is that `preempt_schedule_irq()` must be called with interrupts disabled, which prevents interrupts from causing recursive calls.
- Of course, before `preempt_schedule_irq()` calls `schedule()` it must call `local_irq_enable()` to enable interrupts, and once `schedule()` returns it must call `local_irq_disable()` to restore the interrupt state that was in effect before the call.
- arch/x86/entry/entry_64.S

```
/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)			/* interrupt entry points */
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)

/*
 * The interrupt stubs push (~vector+0x80) onto the stack and
 * then jump to common_interrupt.
 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
	interrupt do_IRQ			/* entry into the C interrupt handler */
	/* 0(%rsp): old RSP */
ret_from_intr:					/* note: control falls through here once do_IRQ returns */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl	PER_CPU_VAR(irq_count)		/* decrement the Per-CPU irq_count */

	/* Restore saved previous stack */
	popq	%rsp

	testb	$3, CS(%rsp)
	jz	retint_kernel			/* zero: the interrupt hit kernel space, go to retint_kernel */

	/* Interrupt came from user space */
GLOBAL(retint_user)				/* non-zero: the interrupt hit user space */
	mov	%rsp,%rdi
	call	prepare_exit_to_usermode
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp	restore_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
	jnc	1f				/* CF holds the saved IF bit; CF=0 means the interrupted code had interrupts disabled, so jump forward to 1 and skip preemption; otherwise fall through */
0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	1f				/* non-zero means preemption is disabled, so jump forward to 1 and skip; otherwise preempt */
	call	preempt_schedule_irq		/* call preempt_schedule_irq() with interrupts disabled */
	jmp	0b
1:
#endif
```
- arch/powerpc/kernel/entry_32.S

```
	.globl	ret_from_except
ret_from_except:
	/* Hard-disable interrupts so that current_thread_info()->flags
	 * can't change between when we test it and when we return
	 * from the interrupt. */
	/* Note: We don't bother telling lockdep about it */
	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
	SYNC			/* Some chip revs have problems here... */
	MTMSRD(r10)		/* disable interrupts */

	lwz	r3,_MSR(r1)	/* Returning to user mode? */
	andi.	r0,r3,MSR_PR	/* AND r3 with the immediate MSR_PR; the result goes to r0 and sets the condition register */
	beq	resume_kernel	/* result is zero: the interrupt returns to kernel space, branch to resume_kernel */

user_exc_return:		/* r10 contains MSR_KERNEL here */
	/* Check current_thread_info()->flags */
	CURRENT_THREAD_INFO(r9, r1)
	lwz	r9,TI_FLAGS(r9)
	andi.	r0,r9,_TIF_USER_WORK_MASK
	bne	do_work

restore_user:
#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
	/* Check whether this process has its own DBCR0 value.  The internal
	   debug mode bit tells us that dbcr0 should be loaded. */
	lwz	r0,THREAD+THREAD_DBCR0(r2)
	andis.	r10,r0,DBCR0_IDM@h
	bnel-	load_dbcr0
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	CURRENT_THREAD_INFO(r9, r1)
	ACCOUNT_CPU_USER_EXIT(r9, r10, r11)
#endif

	b	restore		/* restore registers and return to user space */

/* N.B. the only way to get here is from the beq following ret_from_except. */
resume_kernel:
	/* check current_thread_info, _TIF_EMULATE_STACK_STORE */
	CURRENT_THREAD_INFO(r9, r1)	/* get the current task's thread_info */
	lwz	r8,TI_FLAGS(r9)		/* load TI_FLAGS from thread_info into r8 */
	andis.	r0,r8,_TIF_EMULATE_STACK_STORE@h	/* note: the andis. result goes into r0 */
	beq+	1f			/* branch forward to label 1 */

	addi	r8,r1,INT_FRAME_SIZE	/* Get the kprobed function entry */

	lwz	r3,GPR1(r1)
	subi	r3,r3,INT_FRAME_SIZE	/* dst: Allocate a trampoline exception frame */
	mr	r4,r1			/* src:  current exception frame */
	mr	r1,r3			/* Reroute the trampoline frame to r1 */

	/* Copy from the original to the trampoline. */
	li	r5,INT_FRAME_SIZE/4	/* size: INT_FRAME_SIZE */
	li	r6,0			/* start offset: 0 */
	mtctr	r5
2:	lwzx	r0,r6,r4
	stwx	r0,r6,r3
	addi	r6,r6,4
	bdnz	2b

	/* Do real store operation to complete stwu */
	lwz	r5,GPR1(r1)
	stw	r8,0(r5)

	/* Clear _TIF_EMULATE_STACK_STORE flag */
	lis	r11,_TIF_EMULATE_STACK_STORE@h
	addi	r5,r9,TI_FLAGS
0:	lwarx	r8,0,r5
	andc	r8,r8,r11
#ifdef CONFIG_IBM405_ERR77
	dcbt	0,r5
#endif
	stwcx.	r8,0,r5
	bne-	0b
1:

#ifdef CONFIG_PREEMPT
	/* check current_thread_info->preempt_count */
	lwz	r0,TI_PREEMPT(r9)	/* load the preempt count from the current thread_info into r0 */
	cmpwi	0,r0,0			/* if non-zero, just restore regs and return */
	bne	restore			/* non-zero: preemption is currently disabled, restore registers and return to the interrupted code */
	andi.	r8,r8,_TIF_NEED_RESCHED	/* check whether TI_FLAGS has the reschedule bit set; result in r8 */
	beq+	restore			/* zero: no process has asked to be rescheduled, restore registers and return; otherwise fall through */
	lwz	r3,_MSR(r1)		/* load the saved MSR into r3; below we check whether interrupts were disabled, and if so we do not schedule */
	andi.	r0,r3,MSR_EE		/* interrupts off? */
	beq	restore			/* don't schedule if so */
#ifdef CONFIG_TRACE_IRQFLAGS
	/* Lockdep thinks irqs are enabled, we need to call
	 * preempt_schedule_irq with IRQs off, so we inform lockdep
	 * now that we -did- turn them off already
	 */
	bl	trace_hardirqs_off
#endif
1:	bl	preempt_schedule_irq	/* call preempt_schedule_irq() with interrupts disabled */
	CURRENT_THREAD_INFO(r9, r1)	/* get the current task's thread_info */
	lwz	r3,TI_FLAGS(r9)		/* load TI_FLAGS from thread_info into r3 */
	andi.	r0,r3,_TIF_NEED_RESCHED	/* check whether the reschedule bit is set; result in r0 */
	bne-	1b			/* non-zero: the reschedule bit is set again, branch back to 1 and reschedule once more */
#ifdef CONFIG_TRACE_IRQFLAGS
	/* And now, to properly rebalance the above, we tell lockdep they
	 * are being turned back on, which will happen when we return
	 */
	bl	trace_hardirqs_on
#endif
#endif /* CONFIG_PREEMPT */
```
- Finally, let's look at the `preempt_schedule_irq()` function.
- kernel/sched/core.c

```
/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
	enum ctx_state prev_state;

	/* Catch callers which need to be fixed */
	/*
	 * If the preempt count is non-zero or interrupts are enabled, crash
	 * immediately. This function is only called at the very end of an
	 * interrupt return, still in interrupt context; if either condition
	 * is violated, something has already gone badly wrong.
	 */
	BUG_ON(preempt_count() || !irqs_disabled());

	prev_state = exception_enter();

	do {
		preempt_disable();	/* disable preemption first */
		local_irq_enable();	/* interrupts must be enabled just before schedule() */
		__schedule(true);
		local_irq_disable();	/* disable interrupts again as soon as schedule() is done */
		sched_preempt_enable_no_resched();	/* re-enable preemption without rescheduling */
	} while (need_resched());

	exception_exit(prev_state);
}
```