linux 时间相关的一些总结

110 阅读 0 评论 73 点赞

我是靠谱客的博主烂漫大山，这篇文章主要介绍linux 时间相关的一些总结，现在分享给大家，希望可以做个参考。

仅作为内核代码中时间管理模块的笔记，3.10内核,很乱，不喜勿喷。

先有time，后有timer。

常用的time结构有哪些？除了大名鼎鼎的jiffies和jiffies64之外，还有常用的一些结构如下：

复制代码

1
ktime_t 经常用在timer中，

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Kernel time value. The primary representation is the signed 64-bit
 * scalar tv64; ktime_to_ns() as quoted in these notes returns tv64 as-is.
 */
union ktime {
    s64    tv64;
/*
 * Only when BITS_PER_LONG != 64 and CONFIG_KTIME_SCALAR is not defined:
 * the same storage is also exposed as a pair of 32-bit sec/nsec fields.
 */
#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
    struct {
/*
 * Field order is swapped by endianness -- presumably so that sec always
 * overlays the upper half of tv64; confirm against the kernel header.
 */
# ifdef __BIG_ENDIAN
    s32    sec, nsec;
# else
    s32    nsec, sec;
# endif
    } tv;
#endif
};

typedef union ktime ktime_t;        /* Kill this */

经常用在fs中的timespec，低一点精度的timeval，以及时区结构timezone。主要用来做时间戳等。

复制代码

1
2
3
4
/* Time value split into whole seconds plus a nanoseconds field. */
struct timespec {
    __kernel_time_t    tv_sec;            /* seconds */
    long        tv_nsec;        /* nanoseconds */
};

/* Time value split into whole seconds plus a microseconds field. */
struct timeval {

__kernel_time_t tv_sec; /* seconds */

__kernel_suseconds_t tv_usec; /* microseconds */

};

/* Time zone description: offset west of Greenwich plus a DST correction type. */
struct timezone {

int tz_minuteswest; /* minutes west of Greenwich */

int tz_dsttime; /* type of dst correction */

};

复制代码

这些结构之间的常用转换函数：

复制代码

/*
 * Convert a timespec to ktime_t format.
 *
 * @ts: source time, passed by value
 *
 * Returns the result of ktime_set(ts.tv_sec, ts.tv_nsec).
 */
static inline ktime_t timespec_to_ktime(struct timespec ts)
{
    return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/*
 * Convert a timespec64 to ktime_t format.
 *
 * @ts: source time, passed by value
 *
 * Returns the result of ktime_set(ts.tv_sec, ts.tv_nsec).
 */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
    return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/*
 * Convert a timeval to ktime_t format.
 *
 * @tv: source time, passed by value
 *
 * The microseconds field is multiplied by NSEC_PER_USEC before being
 * handed to ktime_set() together with tv.tv_sec.
 */
static inline ktime_t timeval_to_ktime(struct timeval tv)
{
    return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
}

/* Map the ktime_t to timespec conversion to ns_to_timespec function */
#define ktime_to_timespec(kt)        ns_to_timespec((kt).tv64)

/* Map the ktime_t to timespec64 conversion to ns_to_timespec64 function */
#define ktime_to_timespec64(kt)        ns_to_timespec64((kt).tv64)

/* Map the ktime_t to timeval conversion to ns_to_timeval function */
#define ktime_to_timeval(kt)        ns_to_timeval((kt).tv64)

/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
#define ktime_to_ns(kt)            ((kt).tv64)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Convert a timespec to ktime_t format.
 *
 * @ts: source time, passed by value
 *
 * Returns the result of ktime_set(ts.tv_sec, ts.tv_nsec).
 */
static inline ktime_t timespec_to_ktime(struct timespec ts)
{
    return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/*
 * Convert a timespec64 to ktime_t format.
 *
 * @ts: source time, passed by value
 *
 * Returns the result of ktime_set(ts.tv_sec, ts.tv_nsec).
 */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
    return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/*
 * Convert a timeval to ktime_t format.
 *
 * @tv: source time, passed by value
 *
 * The microseconds field is multiplied by NSEC_PER_USEC before being
 * handed to ktime_set() together with tv.tv_sec.
 */
static inline ktime_t timeval_to_ktime(struct timeval tv)
{
    return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
}

/* Map the ktime_t to timespec conversion to ns_to_timespec function */
#define ktime_to_timespec(kt)        ns_to_timespec((kt).tv64)

/* Map the ktime_t to timespec64 conversion to ns_to_timespec64 function */
#define ktime_to_timespec64(kt)        ns_to_timespec64((kt).tv64)

/* Map the ktime_t to timeval conversion to ns_to_timeval function */
#define ktime_to_timeval(kt)        ns_to_timeval((kt).tv64)

/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
#define ktime_to_ns(kt)            ((kt).tv64)

比如有时候自己不想那么高精度的时间戳怎么办呢？内核还提供了这个函数，取到秒级，最方便的是这个函数还被导出了，很好用。

复制代码

1
2
3
4
5
6
7
/*
 * Return the current time with one-second granularity.
 *
 * Reads the xtime_sec field of the global timekeeper with a plain load;
 * no seqcount retry loop is used here, unlike current_kernel_time() as
 * quoted in these notes.
 */
unsigned long get_seconds(void)
{
    struct timekeeper *tk = &timekeeper;

    return tk->xtime_sec;
}
/* Exported so that code outside this file can call it. */
EXPORT_SYMBOL(get_seconds);

还有个有趣的问题是，这个时间的维护，精度要更高的话，就需要用顺序锁去读取 timekeeper 变量。

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * Return a snapshot of the timekeeper's xtime as a struct timespec.
 *
 * The value is read inside a timekeeper_seq read section and the read is
 * repeated whenever read_seqcount_retry() reports that the section must
 * be retried, so the returned snapshot comes from a single pass that was
 * not flagged for retry.
 */
struct timespec current_kernel_time(void)
{
    struct timekeeper *tk = &timekeeper;
    struct timespec64 now;
    unsigned long seq;

    do {
        seq = read_seqcount_begin(&timekeeper_seq);

        now = tk_xtime(tk);
    } while (read_seqcount_retry(&timekeeper_seq, seq));

    /* The snapshot is taken as timespec64 and narrowed only on return. */
    return timespec64_to_timespec(now);
}
EXPORT_SYMBOL(current_kernel_time);

好了，time除了用来做时间戳之外，另外一个大的应用就是timer的超时时间了。在描述timer之前，有必要描述linux 关于时间管理的几个大的概念，

低精度的timer定义：

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
crash> tvec_base
struct tvec_base {
    spinlock_t lock;
    struct timer_list *running_timer;
    unsigned long timer_jiffies;
    unsigned long next_timer;
    unsigned long active_timers;
    struct tvec_root tv1;
    struct tvec tv2;
    struct tvec tv3;
    struct tvec tv4;
    struct tvec tv5;
    unsigned long all_timers;
}

低精度定时器结构：

复制代码

1
2
3
4
5
6
7
8
9
10
11
struct timer_list {---------------------低精度定时器结构，
    struct list_head entry;-------------用这个挂入到时间轮的链表中，与高精度的rb_node类比
    unsigned long expires;--------------超期时间
    struct tvec_base *base;-------------指向某个cpu的 tvec_base
    void (*function)(unsigned long);----回调
    unsigned long data;
    int slack;
    int start_pid;
    void *start_site;
    char start_comm[16];
}

常用的配套函数有：add_timer,mod_timer,add_timer_on(指定cpu添加timer)，del_timer，DEFINE_TIMER，setup_timer等，这些在协议栈代码里面非常常见，一般用来等待超时。既然是超时，那么对时间精度要求就不那么高了，所以实现的时候，用了著名的定时器轮。

add_timer的流程和mod_timer的流程差不多，先判断该timer是不是pending。pending的意思是该timer已经挂入定时器轮、正在等待到期执行，它的特征就是该timer的 entry的next不为NULL（从定时器轮上摘除并清除pending状态后，entry的next会被置为NULL）：

复制代码

1
2
3
4
/*
 * Report whether @timer is pending.
 *
 * Returns non-zero when timer->entry.next is non-NULL, zero otherwise.
 * detach_timer(), as quoted in these notes, resets entry.next to NULL when
 * it is asked to clear the pending state, so a non-NULL next presumably
 * means the timer is still linked on a timer list.
 */
static inline int timer_pending(const struct timer_list * timer)
{
    return timer->entry.next != NULL;
}

一句话总结：正等待被调度执行的定时器对象就是pending的。如果一个定时器是pending的，那么它肯定在定时器轮上；反之，不是pending的定时器不在定时器轮上（尚未加入、已被删除，或者已被摘下正在执行回调）。

接下来，自然要先从原来的位置摘除，

复制代码

1
2
3
4
5
6
7
8
9
10
11
/*
 * Unlink @timer from the list it is currently queued on.
 *
 * @timer:         timer to unlink
 * @clear_pending: when true, reset entry.next to NULL so that
 *                 timer_pending() reports the timer as not pending
 *
 * NOTE(review): the unlink below is unconditional. It passes entry->prev
 * and entry->next straight to __list_del(), so this presumably must only
 * be called for a timer that is actually queued (entry.next != NULL);
 * it is not a no-op for a timer that was never added. Confirm against the
 * callers in kernel/timer.c.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
    struct list_head *entry = &timer->entry;

    debug_deactivate(timer);

    /* Remove entry from its list by handing its two neighbours to __list_del(). */
    __list_del(entry->prev, entry->next);
    if (clear_pending)
        entry->next = NULL;
    /* prev is always overwritten with the LIST_POISON2 marker. */
    entry->prev = LIST_POISON2;
}

然后根据这个定时器的超时时间，加入到定时器轮中对应的vec中，主要改动两个，一个是timer的base，还有一个是timer的entry的所处的位置。

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
crash> p tvec_bases:0
per_cpu(tvec_bases, 0) = $30 = (struct tvec_base *) 0xffffffff81ea71c0 <boot_tvec_bases>
crash> tvec_bases
PER-CPU DATA TYPE:
  struct tvec_base *tvec_bases;
PER-CPU ADDRESSES:
  [0]: ffff8827dca13948
  [1]: ffff8827dca53948
  [2]: ffff8827dca93948
  [3]: ffff8827dcad3948
  [4]: ffff8827dcb13948
  [5]: ffff8827dcb53948
  [6]: ffff8827dcb93948
。。。。

这里还有一个细节，就是timer的base，由于这个是一个指针，所以至少是4字节对齐的，也就是后面两位肯定为0，被用来做标记了，当从timer中取这个base指针的时候，就需要将这两位处理掉（屏蔽掉标志位），不能直接用来解引用，否则会出现访问错误。

由于低精度的定时器是以jiffies来作为最低精度的，所以精度有限制，但随着硬件以及多媒体发展的实时性较高的要求，后来，又引入了高精度定时器。它是以纳秒为精度的。高精度定时器结构如下：

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
crash> hrtimer
struct hrtimer {
    struct timerqueue_node node;---------------------------用来插入到红黑树中
    ktime_t _softexpires;----------------------------------超期的时间
    enum hrtimer_restart (*function)(struct hrtimer *);----回调函数，肯定都有，不过它的返回值只有两个
    struct hrtimer_clock_base *base;-----------------------和低精度定时器类似，也有指向一个percpu的base的一个指针，不过base结构与低精度定时器timer_list不同
    unsigned long state;
    int start_pid;
    void *start_site;
    char start_comm[16];
}
SIZE: 96

它指向的base是percpu的 hrtimer_bases，注意和低精度定时器的base相区别，因为低精度的base是percpu的 tvec_base

而高精度定时器的索引，也不是低精度那个vec管理，而是红黑树来管理的。

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
crash> timerqueue_head
struct timerqueue_head {
    struct rb_root head;
    struct timerqueue_node *next;
}
SIZE: 16
crash> hrtimer_clock_base
struct hrtimer_clock_base {
    struct hrtimer_cpu_base *cpu_base;
    int index;
    clockid_t clockid;
    struct timerqueue_head active;------------管理同类型的hrtimer的红黑树封装
    ktime_t resolution;
    ktime_t (*get_time)(void);
    ktime_t rh_reserved_softirq_time;
    ktime_t offset;
}

crash> hrtimer_cpu_base
struct hrtimer_cpu_base {
    raw_spinlock_t lock;
    unsigned int active_bases;
    unsigned int clock_was_set;
    ktime_t expires_next;
    int hres_active;
    int hang_detected;
    unsigned long nr_events;
    unsigned long nr_retries;
    unsigned long nr_hangs;
    ktime_t max_hang_time;
    struct hrtimer_clock_base clock_base[4];-------------它的地位，和时间轮的vec相当，是用来管理timer的，通过clockid来分类

     int cpu;

复制代码

1
    int in_hrtirq; 
}

相应的percpu管理结构，与低精度的tvec_base相对比：

复制代码

crash> hrtimer_bases------------整个hrtimer_interrupt都是以这个变量为基础
PER-CPU DATA TYPE:
  struct hrtimer_cpu_base hrtimer_bases;
PER-CPU ADDRESSES:
  [0]: ffff8827dca13960
  [1]: ffff8827dca53960
  [2]: ffff8827dca93960
  [3]: ffff8827dcad3960
  [4]: ffff8827dcb13960
  [5]: ffff8827dcb53960
  [6]: ffff8827dcb93960
  [7]: ffff8827dcbd3960
  [8]: ffff8827dcc13960
  [9]: ffff8827dcc53960
  [10]: ffff8827dcc93960
。。。。

crash> p hrtimer_bases:0
per_cpu(hrtimer_bases, 0) = $16 = {
  lock = {
    raw_lock = {
      val = {
        counter = 0
      }
    }
  },
  active_bases = 3,
  clock_was_set = 6,
  expires_next = {
    tv64 = 558945095814132
  },
  hres_active = 1,
  hang_detected = 0,
  nr_events = 2303159495,
  nr_retries = 5938805,
  nr_hangs = 5,
  max_hang_time = {
    tv64 = 21681
  },
  clock_base = {{
      cpu_base = 0xffff8827dca13960,
      index = 0,
      clockid = 1,
      active = {
        head = {
          rb_node = 0xffff881677e57e88
        },
        next = 0xffffe8d01d20f220
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f0670 <ktime_get>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 0
      }
    }, {
      cpu_base = 0xffff8827dca13960,
      index = 1,
      clockid = 0,
      active = {
        head = {
          rb_node = 0xffff881c433fbd38
        },
        next = 0xffff884a744a7d38
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f0ad0 <ktime_get_real>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 1540819482621868102
      }
    }, {
      cpu_base = 0xffff8827dca13960,
      index = 2,
      clockid = 7,
      active = {
        head = {
          rb_node = 0x0
        },
        next = 0x0
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f0c40 <ktime_get_boottime>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 0
      }
    }, {
      cpu_base = 0xffff8827dca13960,
      index = 3,
      clockid = 11,
      active = {
        head = {
          rb_node = 0x0
        },
        next = 0x0
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f08f0 <ktime_get_clocktai>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 1540819482621868102
      }
    }},
  cpu = 0,
  in_hrtirq = 0
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
crash> hrtimer_bases------------整个hrtimer_interrupt都是以这个变量为基础
PER-CPU DATA TYPE:
  struct hrtimer_cpu_base hrtimer_bases;
PER-CPU ADDRESSES:
  [0]: ffff8827dca13960
  [1]: ffff8827dca53960
  [2]: ffff8827dca93960
  [3]: ffff8827dcad3960
  [4]: ffff8827dcb13960
  [5]: ffff8827dcb53960
  [6]: ffff8827dcb93960
  [7]: ffff8827dcbd3960
  [8]: ffff8827dcc13960
  [9]: ffff8827dcc53960
  [10]: ffff8827dcc93960
。。。。
 

crash> p hrtimer_bases:0
per_cpu(hrtimer_bases, 0) = $16 = {
  lock = {
    raw_lock = {
      val = {
        counter = 0
      }
    }
  },
  active_bases = 3,
  clock_was_set = 6,
  expires_next = {
    tv64 = 558945095814132
  },
  hres_active = 1,
  hang_detected = 0,
  nr_events = 2303159495,
  nr_retries = 5938805,
  nr_hangs = 5,
  max_hang_time = {
    tv64 = 21681
  },
  clock_base = {{
      cpu_base = 0xffff8827dca13960,
      index = 0,
      clockid = 1,
      active = {
        head = {
          rb_node = 0xffff881677e57e88
        },
        next = 0xffffe8d01d20f220
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f0670 <ktime_get>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 0
      }
    }, {
      cpu_base = 0xffff8827dca13960,
      index = 1,
      clockid = 0,
      active = {
        head = {
          rb_node = 0xffff881c433fbd38
        },
        next = 0xffff884a744a7d38
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f0ad0 <ktime_get_real>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 1540819482621868102
      }
    }, {
      cpu_base = 0xffff8827dca13960,
      index = 2,
      clockid = 7,
      active = {
        head = {
          rb_node = 0x0
        },
        next = 0x0
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f0c40 <ktime_get_boottime>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 0
      }
    }, {
      cpu_base = 0xffff8827dca13960,
      index = 3,
      clockid = 11,
      active = {
        head = {
          rb_node = 0x0
        },
        next = 0x0
      },
      resolution = {
        tv64 = 1
      },
      get_time = 0xffffffff810f08f0 <ktime_get_clocktai>,
      rh_reserved_softirq_time = {
        tv64 = 0
      },
      offset = {
        tv64 = 1540819482621868102
      }
    }},
  cpu = 0,
  in_hrtirq = 0
}

两类定时器模块的初始化，在start_kernel中，

复制代码

1
2
3
4
5
6
7
asmlinkage void __init start_kernel(void)
{
。。。。    
    init_timers();//定时器模块初始化
    hrtimers_init();//高精度定时器模块初始化
。。。。
}

对比了两类定时器的定义，从定时器的执行再来对比一下，会加深印象。

对于低精度来说，

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * Boot-time initialisation of the low-resolution timer subsystem.
 *
 * Steps, in order: build-time alignment check, direct CPU_UP_PREPARE
 * notification for the calling CPU, timer statistics init, registration of
 * the CPU notifier, and installation of run_timer_softirq as the handler
 * for TIMER_SOFTIRQ.
 */
void __init init_timers(void)
{
    int err;

    /* ensure there are enough low bits for flags in timer->base pointer */
    BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);

    /* Invoke the notifier callback by hand for the CPU we are running on. */
    err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
                   (void *)(long)smp_processor_id());
    init_timer_stats();

    /* Anything other than NOTIFY_OK from the call above is fatal. */
    BUG_ON(err != NOTIFY_OK);
    register_cpu_notifier(&timers_nb);
    open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

在收到TIMER_SOFTIRQ 之后，run_timer_softirq-->__run_timers,这个函数会执行所有到期的定时器的回调函数。__run_timers 在从时间轮上摘取到期定时器时持有 base->lock 这把自旋锁，但在真正调用回调之前会先释放该锁，回调返回后再重新获取；不过回调运行在软中断上下文中，不能睡眠，所以也要求执行函数不能耗时太多。

对于高精度定时器来说，由于有两种模式，所以需要单独说明调用流程，

如果是处于低分辨率模式，则会在周期性的 update_process_times-->run_local_timers-->hrtimer_run_queues-->__hrtimer_run_queues 来把这些高精度定时器回调来执行；

update_process_times 调用 run_local_timers 来触发TIMER_SOFTIRQ软中断，run_timer_softirq负责调用__run_timers处理 TIMER_SOFTIRQ软中断。

run_local_timers 除了触发软中断，还调用 hrtimer_run_queues();看能否从低分辨率定时器切换到高分辨率。

run_local_timers | -->hrtimer_run_queues 负责分辨率切换--->hrtimer_switch_to_hres-->tick_setup_sched_timer

|-->raise_softirq(TIMER_SOFTIRQ)

如果是处于高精度模式，则虽然周期性的 update_process_times-->run_local_timers-->hrtimer_run_queues 会执行，但不会调用 __hrtimer_run_queues ，而是在 hrtimer_interrupt 函数中调用 __hrtimer_run_queues-->__run_hrtimer 来完成定时器的调用。

调用链如下：

hrtimer_interrupt-->__hrtimer_run_queues-->__run_hrtimer-->执行回调。

这点和网上的不一致，因为网上大多是2.6的内核描述，其实在哪处理不是很关键，主要是理解数据结构和调用。

总结一下：

每个cpu有一个tvec_base结构；
tvec_base结构管理着5个不同超时时间的数组，它采用的基准时间是jiffies。
加入时间轮的时候，通过timer_list的超时时间，来指定它vec，
时间轮按到期时间进行处理：第一级vec（tv1）的256个槽处理完一圈之后，会从第二级vec（tv2）中取出一个槽，把其中的定时器重新分散（cascade）填充到第一级的256个槽中，
它通过__run_timers来执行所有到期的低精度定时器

每个cpu有一个hrtimer_cpu_base结构；
hrtimer_cpu_base结构管理着4种不同的时间基准系统的hrtimer（对应clock_base[4]），分别是：单调时间（CLOCK_MONOTONIC）、实时时间（CLOCK_REALTIME）、启动时间（CLOCK_BOOTTIME）和TAI时间（CLOCK_TAI）；它的基准时间是纳秒。
每种时间基准系统通过它的active字段（内嵌的timerqueue_head结构），管理它们各自的红黑树；
红黑树上，按到期时间进行排序，最先到期的hrtimer位于最左下的节点，并被记录在active.next字段中；
4种时间基准的最先到期时间可能不同，所以，它们之中最先到期的时间被记录在hrtimer_cpu_base的expires_next字段中。

有一点需要注意，高精度定时器要生效，意味着我们要有高精度的时钟源，那么当没有这么高精度的时钟源的时候，高精度定时器的运转，则精度会降低。

说到时钟源：在我的机器上，3.10的内核，封装了一个结构，叫clocksource如下：

复制代码

crash> list clocksource.list -H clocksource_list
ffffffff81a273c0
ffffffff81a2bb40
ffffffff81aebb80
ffffffff81eb5980
ffffffff81a52c40
crash> clocksource ffffffff81a273c0
struct clocksource {
  read = 0xffffffff81032e20 <read_tsc>,-----------这个成员的位置放到第一个，因为它最频繁使用，和2.6.18系列版本不一样，大家定义结构的时候把最常使用的放前面，便于cache命中
  cycle_last = 2592996216546832,
  mask = 18446744073709551615,
  mult = 4194304,
  shift = 23,
  max_idle_ns = 428122390528,
  maxadj = 461373,
  archdata = {
    vclock_mode = 1
  },
  name = 0xffffffff819217b4 "tsc",
  list = {
    next = 0xffffffff81a2bb78 <clocksource_hpet+56>,
    prev = 0xffffffff81a52c30 <clocksource_list>
  },
  rating = 300,----------------精度最高
  enable = 0x0,
  disable = 0x0,
  flags = 35,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}
crash> clocksource ffffffff81a2bb40
struct clocksource {
  read = 0xffffffff81062430 <read_hpet>,
  cycle_last = 103666886,
  mask = 4294967295,
  mult = 2796202783,
  shift = 26,
  max_idle_ns = 69681373356,
  maxadj = 307582306,
  archdata = {
    vclock_mode = 2
  },
  name = 0xffffffff818ff927 "hpet",
  list = {
    next = 0xffffffff81aebbb8 <clocksource_acpi_pm+56>,
    prev = 0xffffffff81a273f8 <clocksource_tsc+56>
  },
  rating = 250,
  enable = 0x0,
  disable = 0x0,
  flags = 33,
  suspend = 0x0,
  resume = 0xffffffff810619e0 <hpet_resume_counter>,
  owner = 0x0
}
crash> clocksource ffffffff81aebb80
struct clocksource {
  read = 0xffffffff8153cb10 <acpi_pm_read>,
  cycle_last = 0,
  mask = 16777215,
  mult = 2343484437,
  shift = 23,
  max_idle_ns = 3649976793,
  maxadj = 257783288,
  archdata = {
    vclock_mode = 0
  },
  name = 0xffffffff8191e0b6 "acpi_pm",
  list = {
    next = 0xffffffff81eb59b8 <refined_jiffies+56>,
    prev = 0xffffffff81a2bb78 <clocksource_hpet+56>
  },
  rating = 200,
  enable = 0x0,
  disable = 0x0,
  flags = 33,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}
crash> clocksource ffffffff81eb5980
struct clocksource {
  read = 0xffffffff810f3290 <jiffies_read>,
  cycle_last = 0,
  mask = 4294967295,
  mult = 255961088,
  shift = 8,
  max_idle_ns = 3344197395684985,
  maxadj = 28155719,
  archdata = {
    vclock_mode = 0
  },
  name = 0xffffffff8191e0fb "refined-jiffies",
  list = {
    next = 0xffffffff81a52c78 <clocksource_jiffies+56>,
    prev = 0xffffffff81aebbb8 <clocksource_acpi_pm+56>
  },
  rating = 2,------------------------精度最低
  enable = 0x0,
  disable = 0x0,
  flags = 0,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}
crash> clocksource ffffffff81a52c40
struct clocksource {
  read = 0xffffffff810f3290 <jiffies_read>,
  cycle_last = 4294669298,
  mask = 4294967295,
  mult = 256000000,
  shift = 8,
  max_idle_ns = 3344705780981250,
  maxadj = 28160000,
  archdata = {
    vclock_mode = 0
  },
  name = 0xffffffff8191e103 "jiffies",
  list = {
    next = 0xffffffff81a52c30 <clocksource_list>,
    prev = 0xffffffff81eb59b8 <refined_jiffies+56>
  },
  rating = 1,
  enable = 0x0,
  disable = 0x0,
  flags = 0,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
crash> list clocksource.list -H clocksource_list
ffffffff81a273c0
ffffffff81a2bb40
ffffffff81aebb80
ffffffff81eb5980
ffffffff81a52c40
crash> clocksource ffffffff81a273c0
struct clocksource {
  read = 0xffffffff81032e20 <read_tsc>,-----------这个成员的位置放到第一个，因为它最频繁使用，和2.6.18系列版本不一样，大家定义结构的时候把最常使用的放前面，便于cache命中
  cycle_last = 2592996216546832,
  mask = 18446744073709551615,
  mult = 4194304,
  shift = 23,
  max_idle_ns = 428122390528,
  maxadj = 461373,
  archdata = {
    vclock_mode = 1
  },
  name = 0xffffffff819217b4 "tsc",
  list = {
    next = 0xffffffff81a2bb78 <clocksource_hpet+56>,
    prev = 0xffffffff81a52c30 <clocksource_list>
  },
  rating = 300,----------------精度最高
  enable = 0x0,
  disable = 0x0,
  flags = 35,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}
crash> clocksource ffffffff81a2bb40
struct clocksource {
  read = 0xffffffff81062430 <read_hpet>,
  cycle_last = 103666886,
  mask = 4294967295,
  mult = 2796202783,
  shift = 26,
  max_idle_ns = 69681373356,
  maxadj = 307582306,
  archdata = {
    vclock_mode = 2
  },
  name = 0xffffffff818ff927 "hpet",
  list = {
    next = 0xffffffff81aebbb8 <clocksource_acpi_pm+56>,
    prev = 0xffffffff81a273f8 <clocksource_tsc+56>
  },
  rating = 250,
  enable = 0x0,
  disable = 0x0,
  flags = 33,
  suspend = 0x0,
  resume = 0xffffffff810619e0 <hpet_resume_counter>,
  owner = 0x0
}
crash> clocksource ffffffff81aebb80
struct clocksource {
  read = 0xffffffff8153cb10 <acpi_pm_read>,
  cycle_last = 0,
  mask = 16777215,
  mult = 2343484437,
  shift = 23,
  max_idle_ns = 3649976793,
  maxadj = 257783288,
  archdata = {
    vclock_mode = 0
  },
  name = 0xffffffff8191e0b6 "acpi_pm",
  list = {
    next = 0xffffffff81eb59b8 <refined_jiffies+56>,
    prev = 0xffffffff81a2bb78 <clocksource_hpet+56>
  },
  rating = 200,
  enable = 0x0,
  disable = 0x0,
  flags = 33,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}
crash> clocksource ffffffff81eb5980
struct clocksource {
  read = 0xffffffff810f3290 <jiffies_read>,
  cycle_last = 0,
  mask = 4294967295,
  mult = 255961088,
  shift = 8,
  max_idle_ns = 3344197395684985,
  maxadj = 28155719,
  archdata = {
    vclock_mode = 0
  },
  name = 0xffffffff8191e0fb "refined-jiffies",
  list = {
    next = 0xffffffff81a52c78 <clocksource_jiffies+56>,
    prev = 0xffffffff81aebbb8 <clocksource_acpi_pm+56>
  },
  rating = 2,------------------------精度最低
  enable = 0x0,
  disable = 0x0,
  flags = 0,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}
crash> clocksource ffffffff81a52c40
struct clocksource {
  read = 0xffffffff810f3290 <jiffies_read>,
  cycle_last = 4294669298,
  mask = 4294967295,
  mult = 256000000,
  shift = 8,
  max_idle_ns = 3344705780981250,
  maxadj = 28160000,
  archdata = {
    vclock_mode = 0
  },
  name = 0xffffffff8191e103 "jiffies",
  list = {
    next = 0xffffffff81a52c30 <clocksource_list>,
    prev = 0xffffffff81eb59b8 <refined_jiffies+56>
  },
  rating = 1,
  enable = 0x0,
  disable = 0x0,
  flags = 0,
  suspend = 0x0,
  resume = 0x0,
  owner = 0x0
}

用户可以通过手工来切换clocksource，比如我的环境上有tsc，hpet，acpi_pm三个可用的clocksource（这个比crash中列的少一些）

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
cat /sys/devices/system/clocksource/clocksource0/available_clocksource
tsc hpet acpi_pm
[root@localhost ~]# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
tsc
[root@localhost ~]# cat /sys/devices/system/clocksource/clocksource0/unbind_clocksource
cat: /sys/devices/system/clocksource/clocksource0/unbind_clocksource: Permission denied
[root@localhost ~]# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
tsc
[root@localhost ~]# ls -alrt /sys/devices/system/clocksource/clocksource0/current_clocksource
-rw-r--r-- 1 root root 4096 Oct 30 09:52 /sys/devices/system/clocksource/clocksource0/current_clocksource
[root@localhost ~]# echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource
[root@localhost ~]# cat /sys/devices/system/clocksource/clocksource0/current_clocksource
hpet
[root@localhost ~]# echo tsc > /sys/devices/system/clocksource/clocksource0/current_clocksource
切换之后会有打印，有时候也可以在message中看到内核自动切换的打印。

[44890.290544] Switched to clocksource hpet
[44902.121090] Switched to clocksource tsc

介绍完时钟源的定义和使用，有必要介绍下一个重要概念，时钟事件设备。

时钟事件设备允许注册一个事件，在未来一个指定的时间点上发生，但与定时器实现相比，它只能存储一个事件。

举一个clock_event_device 的例子：

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
clock_event_device ffff8827dcf11140
struct clock_event_device {
  event_handler = 0xffffffff810b9890 <hrtimer_interrupt>,
  set_next_event = 0xffffffff81053df0 <lapic_next_deadline>,
  set_next_ktime = 0x0,
  next_event = {
    tv64 = 62900678796701
  },
  max_delta_ns = 2199023255551,
  min_delta_ns = 1000,
  mult = 8388608,
  shift = 27,
  mode = CLOCK_EVT_MODE_ONESHOT,
  features = 2,-------------------------------------------属性，为2说明是oneshot模式
  retries = 19117,
  broadcast = 0xffffffff81053e30 <lapic_timer_broadcast>,
  set_mode = 0xffffffff81054620 <lapic_timer_setup>,
  suspend = 0x0,
  resume = 0x0,
  min_delta_ticks = 15,
  max_delta_ticks = 18446744073709551615,
  name = 0xffffffff818fefdd "lapic",------------------事件设备的名称
  rating = 150,
  irq = -1,
  bound_on = 0,
  cpumask = 0xffffffff816e7c60 <cpu_bit_bitmap+26240>,
  list = {
    next = 0xffff8857bc2d11d8,
    prev = 0xffff8827dcf511d8
  },
  owner = 0x0
}

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Clock event features
 *
 * Each value below is a distinct single-bit mask. The "features = 2" shown
 * in the clock_event_device dump in these notes matches
 * CLOCK_EVT_FEAT_ONESHOT.
 */
#define CLOCK_EVT_FEAT_PERIODIC        0x000001
#define CLOCK_EVT_FEAT_ONESHOT        0x000002
#define CLOCK_EVT_FEAT_KTIME        0x000004
/*
 * x86(64) specific misfeatures:
 *
 * - Clockevent source stops in C3 State and needs broadcast support.
 * - Local APIC timer is used as a dummy device.
 */
#define CLOCK_EVT_FEAT_C3STOP        0x000008
#define CLOCK_EVT_FEAT_DUMMY        0x000010

/*
 * Core shall set the interrupt affinity dynamically in broadcast mode
 */
#define CLOCK_EVT_FEAT_DYNIRQ        0x000020

/*
 * Clockevent device is based on a hrtimer for broadcast
 */
#define CLOCK_EVT_FEAT_HRTIMER        0x000080

每个时钟硬件设备注册一个时钟设备tick_device 和一个时钟事件设备。

/*
 * Tick device: a pointer to a clock event device paired with the
 * tick_device_mode it is operated in.
 */
struct tick_device {

struct clock_event_device *evtdev;

enum tick_device_mode mode;

};

可以看出，时钟设备就是时钟事件设备的简单封装。

为了精度，系统兼容了两套定时器，一套是时间轮的低精度定时器，另一套是高精度的hrtimer。即使底层时钟事件设备只提供了低分辨率，内核也会在周期性时钟路径中（即前文的 run_local_timers）调用 hrtimer_run_queues 来处理高分辨率定时器队列。这使得可以使用现存的框架，而无需关注时钟的分辨率。

为了节能，系统又引入了tickless模型，也就是nohz模型，其实就是将原来周期性的tick，变为按需触发，对于需要模拟tick的周期性函数，则由相应的cpu来完成，其他cpu如果没事可以休息。

nohz_mode目前包含三种模式，一种是未开启nohz，一种是系统工作于低分辨率模式下的动态时钟，一种是系统工作于高精度模式下的动态时钟。

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
struct tick_sched {
    struct hrtimer            sched_timer;---用于高分辨率模式下，模拟周期时钟的一个timer
    unsigned long            check_clocks;
    enum tick_nohz_mode        nohz_mode;---包含三种模式，
    ktime_t                last_tick;
    ktime_t                next_tick;
    int                inidle;
    int                tick_stopped;
    unsigned long            idle_jiffies;
    unsigned long            idle_calls;
    unsigned long            idle_sleeps;
    int                idle_active;
    ktime_t                idle_entrytime;
    ktime_t                idle_waketime;
    ktime_t                idle_exittime;
    ktime_t                idle_sleeptime;
    ktime_t                iowait_sleeptime;
    ktime_t                sleep_length;
    unsigned long            last_jiffies;
    u64                next_timer;
    ktime_t                idle_expires;
    int                do_timer_last;
};

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
crash> tick_sched
struct tick_sched {
    struct hrtimer sched_timer;
    unsigned long check_clocks;
    enum tick_nohz_mode nohz_mode;
    ktime_t last_tick;
    ktime_t next_tick;
    int inidle;
    int tick_stopped;
    unsigned long idle_jiffies;
    unsigned long idle_calls;
    unsigned long idle_sleeps;
    int idle_active;
    ktime_t idle_entrytime;
    ktime_t idle_waketime;
    ktime_t idle_exittime;
    ktime_t idle_sleeptime;
    ktime_t iowait_sleeptime;
    ktime_t sleep_length;
    unsigned long last_jiffies;
    u64 next_timer;
    ktime_t idle_expires;
    int do_timer_last;
}

tick_sched 中收集的统计信息通过/proc/timer_list 导出到用户层。

复制代码

crash> tick_cpu_sched
PER-CPU DATA TYPE:
  struct tick_sched tick_cpu_sched;
PER-CPU ADDRESSES:
  [0]: ffff8827dca13f20
  [1]: ffff8827dca53f20
  [2]: ffff8827dca93f20
  [3]: ffff8827dcad3f20
。。。。。。。。。。。。。

crash> p tick_cpu_sched:0
per_cpu(tick_cpu_sched, 0) = $18 = {
  sched_timer = {
    node = {
      node = {
        __rb_parent_color = 18446612303169076872,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = {
        tv64 = 579956705000000
      }
    },
    _softexpires = {
      tv64 = 579956705000000
    },
    function = 0xffffffff810f9170 <tick_sched_timer>,
    base = 0xffff8827dca139a0,
    state = 1,
    start_pid = 0,
    start_site = 0xffffffff810f95c2 <tick_nohz_stop_sched_tick+690>,
    start_comm = "swapper/0000000000000"
  },
  check_clocks = 1,
  nohz_mode = NOHZ_MODE_HIGHRES,
  last_tick = {
    tv64 = 579956549000000
  },
  next_tick = {
    tv64 = 579956705000000
  },
  inidle = 1,
  tick_stopped = 1,
  idle_jiffies = 4874623845,
  idle_calls = 2616662184,
  idle_sleeps = 2409217702,
  idle_active = 1,
  idle_entrytime = {
    tv64 = 579956548218826
  },
  idle_waketime = {
    tv64 = 579956548210360
  },
  idle_exittime = {
    tv64 = 579956548213511
  },
  idle_sleeptime = {
    tv64 = 548323643001010
  },
  iowait_sleeptime = {
    tv64 = 53588522970
  },
  sleep_length = {
    tv64 = 515114
  },
  last_jiffies = 4874623845,
  next_timer = 579956705000000,
  idle_expires = {
    tv64 = 579956705000000
  },
  do_timer_last = 0
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
crash> tick_cpu_sched
PER-CPU DATA TYPE:
  struct tick_sched tick_cpu_sched;
PER-CPU ADDRESSES:
  [0]: ffff8827dca13f20
  [1]: ffff8827dca53f20
  [2]: ffff8827dca93f20
  [3]: ffff8827dcad3f20
。。。。。。。。。。。。。

crash> p tick_cpu_sched:0
per_cpu(tick_cpu_sched, 0) = $18 = {
  sched_timer = {
    node = {
      node = {
        __rb_parent_color = 18446612303169076872,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = {
        tv64 = 579956705000000
      }
    },
    _softexpires = {
      tv64 = 579956705000000
    },
    function = 0xffffffff810f9170 <tick_sched_timer>,
    base = 0xffff8827dca139a0,
    state = 1,
    start_pid = 0,
    start_site = 0xffffffff810f95c2 <tick_nohz_stop_sched_tick+690>,
    start_comm = "swapper/0000000000000"
  },
  check_clocks = 1,
  nohz_mode = NOHZ_MODE_HIGHRES,
  last_tick = {
    tv64 = 579956549000000
  },
  next_tick = {
    tv64 = 579956705000000
  },
  inidle = 1,
  tick_stopped = 1,
  idle_jiffies = 4874623845,
  idle_calls = 2616662184,
  idle_sleeps = 2409217702,
  idle_active = 1,
  idle_entrytime = {
    tv64 = 579956548218826
  },
  idle_waketime = {
    tv64 = 579956548210360
  },
  idle_exittime = {
    tv64 = 579956548213511
  },
  idle_sleeptime = {
    tv64 = 548323643001010
  },
  iowait_sleeptime = {
    tv64 = 53588522970
  },
  sleep_length = {
    tv64 = 515114
  },
  last_jiffies = 4874623845,
  next_timer = 579956705000000,
  idle_expires = {
    tv64 = 579956705000000
  },
  do_timer_last = 0
}

对时钟的禁用是按cpu指定的，一般来说，所有cpu都空闲的概率还是比较低的。

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
crash> p tick_next_period
tick_next_period = $19 = {
  tv64 = 580792669000000
}
crash> p tick_next_period
tick_next_period = $20 = {
  tv64 = 580794124000000
}
crash> p tick_next_period
tick_next_period = $21 = {
  tv64 = 580795263000000
}
crash> p tick_next_period
tick_next_period = $22 = {
  tv64 = 580796247000000
}
crash> p last_jiffies_update
last_jiffies_update = $23 = {
  tv64 = 580801981000000
}
crash> p last_jiffies_update
last_jiffies_update = $24 = {
  tv64 = 580802792000000
}
crash> p last_jiffies_update
last_jiffies_update = $25 = {
  tv64 = 580803530000000
}

时间相关系统调用及外部设置：

adjtimex 系统调用，NTP设置，

内核的工作模式：

没有动态时钟的低分辨率系统，总是用周期时钟。这时不会支持单触发模式
启用了动态时钟的低分辨率系统，将以单触发模式使用时钟设备
高分辨率系统总是用单触发模式，无论是否启用了动态时钟特性
高分辨率时钟系统，每个cpu会使用一个hrtimer来模拟周期时钟，提供tick，毕竟精度高的要模拟精度低的比较容易，同时又能纳入自己的高分辨率框架。模拟的函数为：tick_sched_timer

非广播时最终的处理函数：

高分辨率动态时钟：hrtimer_interrupt

高分辨率周期时钟：hrtimer_interrupt

低分辨率动态时钟：tick_nohz_handler

低分辨率周期时钟：tick_handle_periodic

广播时最终的处理函数：

高分辨率动态时钟：tick_handle_oneshot_broadcast

高分辨率周期时钟：tick_handle_oneshot_broadcast

低分辨率动态时钟：tick_handle_oneshot_broadcast

低分辨率周期时钟：tick_handle_periodic_broadcast

参考资料：

linux 3.10内核源码
原文：https://blog.csdn.net/goodluckwhh/article/details/9048565