Linux时间子系统(二) -- clocksource, timekeeper

159 阅读 0 评论 105 点赞

我是靠谱客的博主清爽眼睛，这篇文章主要介绍Linux时间子系统(二) -- clocksource, timekeeper，现在分享给大家，希望可以做个参考。

转载请标明出处floater的csdn blog，http://blog.csdn.net/flaoter

从本节开始使用的内核代码版本是3.18.12，使用的定时器硬件是ARM Generic Timer。

1 clocksource

在上一节中介绍过clocksource提供了一个单调增加的计时器，它的底层硬件在arm平台上对应的就是上一节中的System Counter。

1.1 数据结构

/*
 * Describes one clocksource: a counter that can be read and whose cycle
 * count is converted to nanoseconds with mult/shift.
 */
struct clocksource {
    /*
     * Hotpath data, fits in a single cache line when the
     * clocksource itself is cacheline aligned.
     */
    cycle_t (*read)(struct clocksource *cs);   // reads the clocksource's current counter value; the return type cycle_t is a 64-bit unsigned integer
    cycle_t mask;  // if the clocksource does not provide a full 64-bit count, mask selects the valid bits
    u32 mult;   // multiplier for the cycles-to-nanoseconds conversion (see clocksource_cyc2ns)
    u32 shift;  // right shift applied after multiplying by mult
    u64 max_idle_ns;  // maximum idle time
    u32 maxadj;   // maximum adjustment of mult; here it is 11% of mult
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
    struct arch_clocksource_data archdata;
#endif

    const char *name;
    struct list_head list;
    int rating;   // quality of the clocksource; a larger value means higher precision
    int (*enable)(struct clocksource *cs);
    void (*disable)(struct clocksource *cs);
    unsigned long flags;
    void (*suspend)(struct clocksource *cs);
    void (*resume)(struct clocksource *cs);

    /* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
    /* Watchdog related data, used by the framework */
    struct list_head wd_list;
    cycle_t cs_last;
    cycle_t wd_last;
#endif
    struct module *owner;
} ____cacheline_aligned;

• mult和shift
clocksource中得到的是一个cycle值，需要通过这两个成员变量转换成纳秒值。
ns = (cycles/F) * NSEC_PER_SEC;
上述公式计算会引入浮点数运算，kernel中应避免浮点运算，所以使用下面函数中乘法并右移的方法进行计算，但是此方法会损失一定的精度。

/*
 * Converts a cycle count to nanoseconds as (cycles * mult) >> shift,
 * i.e. an integer multiply followed by a right shift instead of a
 * division or floating-point operation.
 */
static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
    /* The product is formed in 64 bits before it is shifted. */
    return ((u64) cycles * mult) >> shift;
}

mult值不能太大，如果太大的话，cycles * mult的结果会超出64bit从而导致溢出。cycles的最大值是ULLONG_MAX/(mult+maxadj)。最大idle时间max_idle_ns可以通过最大cycles，mult, shift和maxadj得出。
当kernel配置成NO_HZ的时候，系统就不存在周期性的tick了，但是由于counter value和纳秒的转换限制，系统的idle时间不能超过max_idle_ns。

•rating
代表时钟源的精度，数值越大时钟精度越高。

rating	定义
1–99	不适合于用作实际的时钟源，只用于启动过程或用于测试
100–199	基本可用，可用作真实的时钟源，但不推荐
200–299	精度较好，可用作真实的时钟源
300–399	很好，精确的时钟源
400–499	理想的时钟源，如有可能就必须选择它作为时钟源

1.2 clocksource的建立过程

(1) jiffies clocksource的注册(后面会提到这里并不是时钟源注册的第一步)
所有的linux kernel都有如下clocksource_jiffies的定义，可见jiffies的rating是1，最低精度的clocksource。

/*
 * The jiffies-based clocksource. Its rating of 1 is the lowest valid
 * rating, and its counter is masked to 32 bits.
 */
static struct clocksource clocksource_jiffies = {
    .name       = "jiffies",
    .rating     = 1, /* lowest valid rating*/
    .read       = jiffies_read,
    .mask       = 0xffffffff, /*32bits*/
    .mult       = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
    .shift      = JIFFIES_SHIFT,
};

在core_initcall阶段clocksource_jiffies会被注册。

/*
 * Registers clocksource_jiffies and returns the result of
 * clocksource_register(). Hooked in as a core_initcall.
 */
static int __init init_jiffies_clocksource(void)
{
    return clocksource_register(&clocksource_jiffies);   // register clocksource_jiffies on clocksource_list
}
core_initcall(init_jiffies_clocksource);

在此处也将clocksource的注册函数进行展开，说明clocksource的注册方法。

/*
 * Registers a clocksource whose mult/shift are already filled in.
 *
 * Computes cs->maxadj and cs->max_idle_ns, then, while holding
 * clocksource_mutex, enqueues the clocksource, hands it to the
 * watchdog and re-runs the clocksource selection.
 *
 * Always returns 0.
 */
int clocksource_register(struct clocksource *cs)
{
    /* calculate max adjustment for given mult/shift */
    cs->maxadj = clocksource_max_adjustment(cs);
    /* If mult + maxadj wraps around, the sum ends up smaller than mult. */
    WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
        "Clocksource %s might overflow on 11%% adjustment\n",
        cs->name);

    /* calculate max idle time permitted for this clocksource */
    cs->max_idle_ns = clocksource_max_deferment(cs);

    mutex_lock(&clocksource_mutex);
    clocksource_enqueue(cs); // add to the clocksource_list queue; a clocksource with a larger rating is inserted closer to the front
    clocksource_enqueue_watchdog(cs);  // also add it to the watchdog list, watchdog_list
    clocksource_select(); // select the best clocksource and notify timekeeping
    mutex_unlock(&clocksource_mutex);
    return 0;
}

(2) 当所有clocksource注册完成后，选择系统使用的clocksource
选择最终要使用的clocksource的函数clocksource_done_booting是在fs_initcall阶段调用的。

/*
 * Runs as an fs_initcall. Under clocksource_mutex it resets
 * curr_clocksource to the default clock, sets finished_booting, runs
 * the watchdog pass and then re-selects the clocksource.
 *
 * Always returns 0.
 */
static int __init clocksource_done_booting(void)
{
    mutex_lock(&clocksource_mutex);
    curr_clocksource = clocksource_default_clock();   // the default clock, i.e. clocksource_jiffies
    finished_booting = 1;
    /*
     * Run the watchdog first to eliminate unstable clock sources
     */
    __clocksource_watchdog_kthread();
    clocksource_select();  // pick the best one; if only jiffies has been registered so far, the best clocksource is jiffies
    mutex_unlock(&clocksource_mutex);
    return 0;
}
fs_initcall(clocksource_done_booting);

可见，如果没有其他clocksource进行注册的话，系统将要使用的时钟源就会是clocksource_jiffies。但是在我使用的SOC上之前已经提过了还会有ARM Generic Timer，它的System Counter会被注册成时钟源，它的注册过程会是怎样呢？

(0) arm clocksource注册
在kernel的C语言函数入口start_kernel中会调用time_init，time_init会调用clocksource_of_init对平台中其他可用时钟源进行注册。因为time_init的调用顺序比第1步中的core_initcall还要早，所以在此处添加了第0步。

clocksource_of_init函数会对段__clksrc_of_table进行解析，遍历此段中定义的时钟源，并进行注册。

/*
 * Walks the device-tree nodes that match an entry in __clksrc_of_table.
 * For every available node it calls the init function stored in the
 * matching entry's .data and counts it. If no init function was called
 * at all, a critical message is logged.
 */
void __init clocksource_of_init(void)
{
    struct device_node *np;
    const struct of_device_id *match;
    of_init_fn_1 init_func;
    unsigned clocksources = 0;

    for_each_matching_node_and_match(np, __clksrc_of_table, &match) {
        /* Skip nodes that are not available. */
        if (!of_device_is_available(np))
            continue;

        init_func = match->data;
        init_func(np);
        clocksources++;
    }
    if (!clocksources)
        pr_crit("%s: no matching clocksources found\n", __func__);
}

关于段定义用到的方法如下，

/*
 * Declares a clocksource table entry: expands to OF_DECLARE_1 with the
 * table name "clksrc" and the given name, compatible string and init
 * function.
 */
#define CLOCKSOURCE_OF_DECLARE(name, compat, fn) \
    OF_DECLARE_1(clksrc, name, compat, fn)

/*
 * Defines a static const struct of_device_id named __of_table_<name>,
 * placed in section __<table>_of_table, with .compatible = compat and
 * .data = fn.
 *
 * Both arms of the conditional yield fn; the comparison against
 * (fn_type)NULL presumably exists only so the compiler checks fn's type.
 */
#define _OF_DECLARE(table, name, compat, fn, fn_type)           \
    static const struct of_device_id __of_table_##name      \
        __used __section(__##table##_of_table)          \
         = { .compatible = compat,              \
             .data = (fn == (fn_type)NULL) ? fn : fn  }

在arm_arch_timer.c文件中有对armv8_arch_timer的定义如下，可知armv8_arch_timer已经定义在了__clksrc_of_table段中，在遍历过程中会被遍历到，它的注册函数arch_timer_init会被执行。

/* Declares the "arm,armv8-timer" compatible with arch_timer_init as its init function. */
CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_init);

当然，在一般SOC平台中应该不止有arm timer的定义，但本文不关注其它timer的定义。

arch_timer_init函数实现的功能较多，此处为了描述clocksource的注册，只对它调用的arch_counter_register进行解析，其余部分将在后面介绍clock_event时详细介绍。
arch_timer_init -> arch_timer_common_init -> arch_counter_register

/*
 * Clocksource backed by the architected counter: rating 400, a 56-bit
 * mask, and flagged as continuous. The name may be replaced by
 * arch_counter_register() below.
 */
static struct clocksource clocksource_counter = {
    .name   = "arch_sys_counter",
    .rating = 400,
    .read   = arch_counter_read,
    .mask   = CLOCKSOURCE_MASK(56),
    .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
 * Chooses the counter read routine according to @type, registers
 * clocksource_counter at arch_timer_rate, initialises the timecounter
 * from the current count and registers the counter as sched_clock.
 */
static void __init arch_counter_register(unsigned type)
{
    u64 start_count;

    /* Register the CP15 based counter if we have one */
    if (type & ARCH_CP15_TIMER) {   // accessed via CP15
        if (IS_ENABLED(CONFIG_ARM64) || arch_timer_use_virtual)   // virtualization case; not considered here
            arch_timer_read_counter = arch_counter_get_cntvct;
        else
            arch_timer_read_counter = arch_counter_get_cntpct;
    } else {  // accessed via memory-mapped registers
        arch_timer_read_counter = arch_counter_get_cntvct_mem;

        /* If the clocksource name is "arch_sys_counter" the
         * VDSO will attempt to read the CP15-based counter.
         * Ensure this does not happen when CP15-based
         * counter is not available.
         */
        clocksource_counter.name = "arch_mem_counter";
    }

    start_count = arch_timer_read_counter();
    clocksource_register_hz(&clocksource_counter, arch_timer_rate);  // register the new clocksource -- clocksource_counter
    /* Reuse the mult/shift that registration computed for the clocksource. */
    cyclecounter.mult = clocksource_counter.mult;
    cyclecounter.shift = clocksource_counter.shift;
    timecounter_init(&timecounter, &cyclecounter, start_count);

    /* 56 bits minimum, so we assume worst case rollover */
    sched_clock_register(arch_timer_read_counter, 56, arch_timer_rate);
}

clocksource_register_hz -> __clocksource_register_scale，__clocksource_register_scale与jiffies注册的函数clocksource_register类似。

/*
 * Registers a clocksource given its frequency as @freq with scale factor
 * @scale. mult, shift and max_idle_ns are computed first; the
 * clocksource is then enqueued, handed to the watchdog and the selection
 * is re-run, all under clocksource_mutex.
 *
 * Always returns 0.
 */
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)

{

    /* Initialize mult/shift and max_idle_ns */
    __clocksource_updatefreq_scale(cs, scale, freq);  // compute this clocksource's mult, shift and max_idle_ns

    /* Add clocksource to the clocksource list */
    mutex_lock(&clocksource_mutex);
    clocksource_enqueue(cs);  // add to the clocksource_list queue; a clocksource with a larger rating is inserted closer to the front
    clocksource_enqueue_watchdog(cs);  // also add it to the watchdog list, watchdog_list
    clocksource_select(); // select the best clocksource and notify timekeeping
    mutex_unlock(&clocksource_mutex);
    return 0;
}

因此，最终系统使用的时钟源将是arch_sys_counter。

2 timekeeper

2.1 timeline

Linux内核提供并管理着多个种类的时间，分别是：
•RTC时间
又叫CMOS时间，通常由一个专门的计时硬件来实现，软件可以读取该硬件来获得年月日、时分秒等时间信息。硬件时间不管系统是否上电，RTC中的时间信息都不会丢失，计时会一直持续进行，硬件上通常使用一个后备电池对RTC硬件进行单独的供电。内核和用户空间通过驱动程序访问RTC硬件来获取或设置时间信息。
•xtime (wall time)
xtime实际上是一个内存中的变量，它的访问速度非常快，内核大部分时间都是使用xtime来获得当前时间信息，xtime精度取决于用于对其计时的clocksource，可以达到纳秒级别。xtime记录的是自1970年1月1日0时(UTC)到当前时刻所经历的纳秒数。
•monotonic time
该时间自系统开机后就一直单调地增加，它不像xtime可以因用户的调整时间而产生跳变，不过该时间不计算系统休眠的时间，也就是说，系统休眠时，monotonic时间不会递增。
•raw monotonic time
该时间与monotonic时间类似，也是单调递增的时间，唯一的不同是raw monotonic time不会受到NTP时间调整的影响，它代表着系统独立时钟硬件对时间的统计。
•boot time
与monotonic时间相同，不过会累加上系统休眠的时间，它代表着系统上电后的总时间。