概述
转载:深入解读Linux进程调度Schedule
转载原因:通俗易懂,逻辑清晰。
6. 时钟中断(Timer Interrupt)
时钟中断是系统中调度和抢占的驱动因素,在时钟中断中会进行进程运行时间的更新等,并更新调度标志,以决定是否进行调度。下面以Powerpc FSL Booke架构芯片ppce500为例来看具体代码,其他架构类似,设计思想相同。
6.1 时钟中断的注册
首先在系统最开始的启动阶段注册中断处理函数,这个过程发生在start_kernel执行之前的汇编初始化部分,在系统初始化完成后时钟中断发生时执行中断回调函数。
IBM的PowerPC架构的内核启动入口head文件在arch/powerpc/kernel/下,其中e500架构的内核入口文件为head_fsl_booke.S,其中定义了中断向量列表:
interrupt_base:
/* Critical Input Interrupt */
CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
......
/* Decrementer Interrupt */
DECREMENTER_EXCEPTION
......
时钟中断的定义为DECREMENTER_EXCEPTION,实际展开过程在arch/powerpc/kernel/head_booke.h头文件中:
/*
 * Body of the decrementer (timer) exception vector.
 *
 * Every line except the last ends in a line-continuation backslash so that
 * the whole instruction sequence belongs to the macro; without them the
 * macro would expand to nothing.  The sequence writes TSR_DIS to SPRN_TSR,
 * loads r1 + STACK_FRAME_OVERHEAD into r3 (presumably the pt_regs argument
 * of the handler) and dispatches to timer_interrupt via EXC_XFER_LITE.
 */
#define DECREMENTER_EXCEPTION						      \
	START_EXCEPTION(Decrementer)					      \
	NORMAL_EXCEPTION_PROLOG(DECREMENTER);				      \
	lis     r0,TSR_DIS@h;           /* Setup the DEC interrupt mask */    \
	mtspr   SPRN_TSR,r0;            /* Clear the DEC interrupt */	      \
	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
	EXC_XFER_LITE(0x0900, timer_interrupt)
DECREMENTER_EXCEPTION -> EXC_XFER_LITE
/*
 * Dispatch exception number n to handler hdlr using the "lite" transfer:
 * trap value n+1, MSR_KERNEL, no MSR copy (NOCOPY), entering through
 * transfer_to_handler and leaving through ret_from_except.
 * The backslashes keep the two-line expansion inside the macro.
 */
#define EXC_XFER_LITE(n, hdlr)						      \
	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
			  ret_from_except)
EXC_XFER_LITE -> EXC_XFER_TEMPLATE
/*
 * Common exception transfer sequence.
 *
 *   hdlr   - handler address, emitted as a .long after the branch
 *   trap   - trap value stored into the _TRAP slot addressed via r11
 *   msr    - MSR value built in r10 from its high and low halves
 *   copyee - macro invoked as copyee(r10, r9)
 *   tfer   - transfer routine that is branched to with "bl"
 *   ret    - return path address, emitted as a second .long
 *
 * Every line except the last carries a line-continuation backslash so the
 * full sequence is part of the macro body.
 */
#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret)	\
	li	r10,trap;					\
	stw	r10,_TRAP(r11);					\
	lis	r10,msr@h;					\
	ori	r10,r10,msr@l;					\
	copyee(r10, r9);					\
	bl	tfer;						\
	.long	hdlr;						\
	.long	ret
再来看timer_interrupt函数:
/*
 * timer_interrupt - gets called when the decrementer overflows,
 * with interrupts disabled.
 *
 * @regs: register frame passed in by the exception entry code; it is
 *        installed as the current irq regs while the handler runs.
 *
 * Re-arms the decrementer, returns early on an offline CPU, and otherwise
 * runs __timer_interrupt() bracketed by irq_enter()/irq_exit().
 */
void timer_interrupt(struct pt_regs * regs)
{
struct pt_regs *old_regs;
/* Per-CPU slot holding the next programmed timebase deadline. */
u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
set_dec(DECREMENTER_MAX);
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
 * decrementers_next_tb as MAX to make sure __check_irq_replay
 * don't replay timer interrupt when return, otherwise we'll trap
 * here infinitely :(
 */
if (!cpu_online(smp_processor_id())) {
*next_tb = ~(u64)0;
return;
}
/* Conditionally hard-enable interrupts now that the DEC has been
 * bumped to its maximum value
 */
may_hard_irq_enable();
#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
/* 32-bit PowerMac only: if ppc_n_lost_interrupts is nonzero, run do_IRQ() first. */
if (atomic_read(&ppc_n_lost_interrupts) != 0)
do_IRQ(regs);
#endif
/* Publish regs for the duration of the handler; the previous value is restored below. */
old_regs = set_irq_regs(regs);
irq_enter();
__timer_interrupt();
irq_exit();
set_irq_regs(old_regs);
}
timer_interrupt() -> __timer_interrupt()函数
/*
 * Core of the decrementer interrupt.
 *
 * Runs any pending irq work, then compares the current timebase with the
 * per-CPU deadline *next_tb: if the deadline has been reached the clock
 * event handler is invoked, otherwise the decrementer is re-armed for the
 * remaining time.  Per-CPU counters in irq_stat record which path was taken.
 */
static void __timer_interrupt(void)
{
struct pt_regs *regs = get_irq_regs();
u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
struct clock_event_device *evt = this_cpu_ptr(&decrementers);
u64 now;
trace_timer_interrupt_entry(regs);
/* Drain irq work that was queued before this interrupt. */
if (test_irq_work_pending()) {
clear_irq_work_pending();
irq_work_run();
}
now = get_tb_or_rtc();
if (now >= *next_tb) {
/* Deadline reached: reset it to the all-ones value, then call the handler if one is set. */
*next_tb = ~(u64)0;
if (evt->event_handler)
evt->event_handler(evt);
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
/* Not due yet: 'now' is reused to hold the time left until the deadline. */
now = *next_tb - now;
/* Only program the decrementer when the remainder fits within DECREMENTER_MAX. */
if (now <= DECREMENTER_MAX)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
__this_cpu_inc(irq_stat.timer_irqs_others);
}
#ifdef CONFIG_PPC64
/* collect purr register values often, for accurate calculations */
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
struct cpu_usage *cu = this_cpu_ptr(&cpu_usage_array);
cu->current_tb = mfspr(SPRN_PURR);
}
#endif
trace_timer_interrupt_exit(regs);
}
在__timer_interrupt函数中执行了evt->event_handler函数调用,此处event_handler是什么,究竟是怎么注册的呢?
答案:tick_handle_periodic的注册和执行流程如下:
start_kernel->time_init->init_decrementer_clockevent->register_decrementer_clockevent->clockevents_register_device->tick_check_new_device->tick_setup_periodic->tick_set_periodic_handler->tick_handle_periodic->tick_periodic->update_process_times->scheduler_tick
tick_handle_periodic 该函数实际上为中断事件真正的处理过程,前面的interrupt handler仅仅是为中断做一些准备工作,如完成寄存器等相关信息的保存等,做好了入口工作,而下面的event_handler则完成了中断事件实际想做的事情,其函数定义如下:
/*
 * Event handler for periodic ticks
 *
 * @dev: the clock event device that raised the tick.
 *
 * Performs one periodic tick for the current CPU.  If the device is in
 * oneshot state it is then reprogrammed one tick_period further on; the
 * loop repeats until clockevents_program_event() returns zero.
 */
void tick_handle_periodic(struct clock_event_device *dev)
{
int cpu = smp_processor_id();
/* Start from the expiry the device was last programmed for. */
ktime_t next = dev->next_event;
tick_periodic(cpu);
#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
/*
 * The cpu might have transitioned to HIGHRES or NOHZ mode via
 * update_process_times() -> run_local_timers() ->
 * hrtimer_run_queues().
 */
if (dev->event_handler != tick_handle_periodic)
return;
#endif
/* Nothing to reprogram unless the device is in oneshot state. */
if (!clockevent_state_oneshot(dev))
return;
for (;;) {
/*
 * Setup the next period for devices, which do not have
 * periodic mode:
 */
next = ktime_add(next, tick_period);
/* A zero return ends the loop (presumably: event programmed successfully). */
if (!clockevents_program_event(dev, next, false))
return;
/*
 * Have to be careful here. If we're in oneshot mode,
 * before we call tick_periodic() in a loop, we need
 * to be sure we're using a real hardware clocksource.
 * Otherwise we could get trapped in an infinite
 * loop, as the tick_periodic() increments jiffies,
 * which then will increment time, possibly causing
 * the loop to trigger again and again.
 */
if (timekeeping_valid_for_hres())
tick_periodic(cpu);
}
}
start_kernel->time_init->init_decrementer_clockevent->register_decrementer_clockevent->clockevents_register_device->tick_check_new_device->tick_setup_periodic->tick_set_periodic_handler->tick_handle_periodic->tick_periodic->update_process_times->scheduler_tick
update_process_times函数里:-> scheduler_tick()
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 *
 * Besides the accounting, this runs the local timers, the RCU callback
 * check, the irq-work tick (when in hard irq context), the scheduler tick
 * and, if CONFIG_POSIX_TIMERS is enabled, the POSIX CPU timers.
 */
void update_process_times(int user_tick)
{
struct task_struct *p = current;
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
/* Only tick irq work when actually running in hard irq context. */
if (in_irq())
irq_work_tick();
#endif
/* Hand the tick to the scheduler (the path this article follows). */
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers(p);
}
start_kernel->time_init->init_decrementer_clockevent->register_decrementer_clockevent->clockevents_register_device->tick_check_new_device->tick_setup_periodic->tick_set_periodic_handler->tick_handle_periodic->tick_periodic->update_process_times->scheduler_tick
scheduler_tick函数里:curr->sched_class->task_tick(rq, curr, 0);
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * Under the runqueue lock it refreshes the runqueue clock, forwards the
 * tick to the scheduling class of the currently running task and updates
 * the load and PSI bookkeeping.  After dropping the lock it runs the perf
 * tick and, on SMP, records the idle state and triggers load balancing.
 */
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
sched_clock_tick();
rq_lock(rq, &rf);
update_rq_clock(rq);
/* Per-class tick hook, e.g. task_tick_rt for the RT class; third argument (queued) is 0 here. */
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);
rq_unlock(rq, &rf);
perf_event_task_tick();
#ifdef CONFIG_SMP
/* Remember whether this CPU is idle, then let the load balancer decide what to do. */
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
}
可以看到在scheduler_tick中又调用了调度类的task_tick函数接口,如果当前采用CFS调度策略则执行fair_sched_class->task_tick,同样的在rt_sched_class中实现为task_tick_rt,实现如下:
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
watchdog(rq, p);
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
if (p->policy != SCHED_RR)
return;
if (--p->rt.time_slice)
return;
p->rt.time_slice = sched_rr_timeslice;
/*
* Requeue to the end of queue if we (and all of our ancestors) are not
* the only element on the queue
*/
for_each_sched_rt_entity(rt_se) {
if (rt_se->run_list.prev != rt_se->run_list.next) {
requeue_task_rt(rq, p, 0);
resched_curr(rq);
return;
}
}
}
可以看到,如果当前时间片还未用完,则直接返回。代码片段:
if (--p->rt.time_slice)
return;
否则将进程实时时间片设置为sched_rr_timeslice,并且将调度实体的进程放置到调度队列rq的末尾,调用resched_curr设置调度信息后返回,这里实际上是实时调度的RR(Round Robin)思想。
现在又有新的问题,设置了进程的调度标志TIF_NEED_RESCHED之后,实际的调度何时发生呢?
一般调度的入口分为四个:
- 中断返回(用户态和内核态);
- 系统调用返回用户空间;
- 进程主动放弃cpu,执行调度;
- 信号处理完成后返回内核空间;
时钟中断返回导致进程调度为第1种,此处以ppce500为例来看调度如何发生:
各种异常返回的入口RET_FROM_EXC_LEVEL,调用user_exc_return而进入do_work
而do_work作为总的入口点进入执行过程:
do_work: /* r10 contains MSR_KERNEL here */
/* Test the need-resched bit in r9 (presumably the thread-info flags). */
andi. r0,r9,_TIF_NEED_RESCHED
/* Bit clear: no reschedule requested, branch to do_user_signal. */
beq do_user_signal
可以看到,如果未设置调度标志(_TIF_NEED_RESCHED),则跳转到do_user_signal:在其中调用do_notify_resume处理信号等待处理的工作,然后跳转到recheck重新检查标志,最终经restore_user返回用户态。
/*
 * Path taken when no reschedule is requested: enable interrupts, make sure
 * r13-r31 are saved in the exception frame, call do_notify_resume and then
 * re-examine the flags at recheck.
 */
do_user_signal: /* r10 contains MSR_KERNEL here */
ori r10,r10,MSR_EE
SYNC
MTMSRD(r10) /* hard-enable interrupts */
/* save r13-r31 in the exception frame, if not already done */
lwz r3,_TRAP(r1)
/* Low bit of the trap value set means the non-volatile GPRs still need saving. */
andi. r0,r3,1
beq 2f
SAVE_NVGPRS(r1)
/* Clear the low bit and store the trap value back, marking the registers as saved. */
rlwinm r3,r3,0,0,30
stw r3,_TRAP(r1)
/* r3 = r1 + STACK_FRAME_OVERHEAD, r4 = copy of r9: arguments for do_notify_resume. */
2: addi r3,r1,STACK_FRAME_OVERHEAD
mr r4,r9
bl do_notify_resume
REST_NVGPRS(r1)
b recheck
如果设置了调度标志,则调用do_resched。在entry_32.S中可以看到,函数do_resched中调用了schedule函数来执行调度:
/* Reschedule path: enable interrupts, then call schedule(). */
do_resched: /* r10 contains MSR_KERNEL here */
/* Note: We don't need to inform lockdep that we are enabling
 * interrupts here. As far as it knows, they are already enabled
 */
ori r10,r10,MSR_EE
SYNC
MTMSRD(r10) /* hard-enable interrupts */
bl schedule
定义在entry_32.S的recheck函数:
/*
 * Re-examine the thread-info flags with interrupts disabled: go back to
 * do_resched if a reschedule is requested, and branch to restore_user once
 * none of the bits in _TIF_USER_WORK_MASK is set.
 */
recheck:
/* Note: And we don't tell it we are disabling them again
 * neither. Those disable/enable cycles used to peek at
 * TI_FLAGS aren't advertised.
 */
LOAD_MSR_KERNEL(r10,MSR_KERNEL)
SYNC
MTMSRD(r10) /* disable interrupts */
/* Reload the current flags word into r9. */
CURRENT_THREAD_INFO(r9, r1)
lwz r9,TI_FLAGS(r9)
andi. r0,r9,_TIF_NEED_RESCHED
bne- do_resched
andi. r0,r9,_TIF_USER_WORK_MASK
beq restore_user
6.2 时钟中断的执行过程
在前面的中断向量定义中可以看到有一个处理过程为bl tfer;这里的tfer为transfer_to_handler或者transfer_to_handler_full,在时钟中断中为transfer_to_handler,主要做了一些中断处理函数调用之前的准备处理过程,然后跳转到中断执行过程hdlr,最后进入ret执行,ret对应函数ret_from_except或者ret_from_except_full,在时钟中断中对应为ret_from_except,进而调用resume_kernel后进入preempt_schedule_irq执行调度过程:
/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 *
 * Each loop iteration disables preemption, enables interrupts, runs
 * __schedule(true), disables interrupts again and re-enables preemption
 * without triggering another reschedule; the loop repeats for as long as
 * need_resched() is true.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
/* Catch callers which need to be fixed */
BUG_ON(preempt_count() || !irqs_disabled());
/* The saved context state is handed back to exception_exit() at the end. */
prev_state = exception_enter();
do {
preempt_disable();
local_irq_enable();
__schedule(true);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
}
接下来看看函数preempt_disable和local_irq_disable的实现。preempt_disable所操作的抢占计数preempt_count通过下面的preempt_count_ptr获取:
/*
 * Return a pointer to the preempt_count field of the current thread_info.
 * The result is volatile-qualified, so accesses through it are not cached
 * by the compiler.
 */
static __always_inline volatile int *preempt_count_ptr(void)
{
	return &current_thread_info()->preempt_count;
}
其实关闭抢占只是将当前进程状态信息preempt_count的值加1,在此调用之后又有barrier()操作,防止编译器优化和内存访问顺序问题,达到同步的目的。
/*
 * Wrap the arch provided IRQ routines to provide appropriate checks.
 *
 * The raw_* macros map directly onto the arch_* primitives; the variants
 * taking a flags argument first verify with typecheck() that it is an
 * unsigned long.  The local_irq_* / safe_halt macros are thin wrappers
 * around the raw_* ones.
 *
 * Multi-line bodies carry a line-continuation backslash on every line but
 * the last so that the whole do { } while (0) / ({ }) body is part of the
 * macro.
 */
#define raw_local_irq_disable()		arch_local_irq_disable()
#define raw_local_irq_enable()		arch_local_irq_enable()
#define raw_local_irq_save(flags)			\
	do {						\
		typecheck(unsigned long, flags);	\
		flags = arch_local_irq_save();		\
	} while (0)
#define raw_local_irq_restore(flags)			\
	do {						\
		typecheck(unsigned long, flags);	\
		arch_local_irq_restore(flags);		\
	} while (0)
#define raw_local_save_flags(flags)			\
	do {						\
		typecheck(unsigned long, flags);	\
		flags = arch_local_save_flags();	\
	} while (0)
#define raw_irqs_disabled_flags(flags)			\
	({						\
		typecheck(unsigned long, flags);	\
		arch_irqs_disabled_flags(flags);	\
	})
#define raw_irqs_disabled()		(arch_irqs_disabled())
#define raw_safe_halt()			arch_safe_halt()

#define local_irq_enable()	do { raw_local_irq_enable(); } while (0)
#define local_irq_disable()	do { raw_local_irq_disable(); } while (0)
#define local_irq_save(flags)				\
	do {						\
		raw_local_irq_save(flags);		\
	} while (0)
#define local_irq_restore(flags) do { raw_local_irq_restore(flags); } while (0)
#define safe_halt()		do { raw_safe_halt(); } while (0)
跟架构相关的irq操作定义如下:
/*
 * Put back an interrupt state previously captured in @flags.
 * BookE builds hand the value to a single "wrtee" instruction; every other
 * build passes it to mtmsr().
 */
static inline void arch_local_irq_restore(unsigned long flags)
{
#ifndef CONFIG_BOOKE
	mtmsr(flags);
#else
	asm volatile("wrtee %0" : : "r" (flags) : "memory");
#endif
}
时钟中断属于硬件中断,Linux系统不支持中断嵌套,所以在中断发生时又会禁止本地中断(local_irq_disable)
arch_local_irq_xxx等函数
/*
 * Disable local interrupts and return the flags as they were beforehand,
 * so the caller can later hand them to arch_local_irq_restore().
 */
static inline unsigned long arch_local_irq_save(void)
{
	unsigned long saved = arch_local_save_flags();

#ifndef CONFIG_BOOKE
	/* Write the saved value back with the MSR_EE bit cleared. */
	SET_MSR_EE(saved & ~MSR_EE);
#else
	asm volatile("wrteei 0" : : : "memory");
#endif

	return saved;
}
/*
 * Disable local interrupts without reporting the previous state.
 * Non-BookE builds reuse arch_local_irq_save() and drop its result.
 */
static inline void arch_local_irq_disable(void)
{
#ifndef CONFIG_BOOKE
	arch_local_irq_save();
#else
	asm volatile("wrteei 0" : : : "memory");
#endif
}
/*
 * Enable local interrupts.
 * BookE builds use "wrteei 1"; other builds read the MSR and write it back
 * with the MSR_EE bit set.
 */
static inline void arch_local_irq_enable(void)
{
#ifndef CONFIG_BOOKE
	unsigned long cur_msr = mfmsr();

	SET_MSR_EE(cur_msr | MSR_EE);
#else
	asm volatile("wrteei 1" : : : "memory");
#endif
}
/*
 * Tell whether a saved flags value describes a state with interrupts off,
 * i.e. whether its MSR_EE bit is clear.
 */
static inline bool arch_irqs_disabled_flags(unsigned long flags)
{
	return !(flags & MSR_EE);
}
/*
 * Tell whether interrupts are disabled right now, by sampling the current
 * flags and testing them with arch_irqs_disabled_flags().
 */
static inline bool arch_irqs_disabled(void)
{
	unsigned long cur_flags = arch_local_save_flags();

	return arch_irqs_disabled_flags(cur_flags);
}
/* In this configuration hard_irq_disable() is simply an alias for arch_local_irq_disable(). */
#define hard_irq_disable() arch_local_irq_disable()
最后
以上就是愉快月饼为你收集整理的[收藏]时钟中断(Timer Interrupt)与 Linux内核调度的全部内容,希望文章能够帮你解决[收藏]时钟中断(Timer Interrupt)与 Linux内核调度所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复