Overview
update_load_avg
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct sched_entity *se, int flags)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);	/* the cfs_rq this se is queued on */
	u64 now = cfs_rq_clock_task(cfs_rq);
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);
	int decayed;

	/*
	 * Track task load average for carrying it to new CPU after migrated, and
	 * track group sched_entity load average for task_h_load calc in migration
	 */
	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
		__update_load_avg_se(now, cpu, cfs_rq, se);

	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
	decayed |= propagate_entity_load_avg(se);

	if (decayed && (flags & UPDATE_TG))
		update_tg_load_avg(cfs_rq, 0);
}
__update_load_avg_se
Updates the entity's load average; the core work is done by calling ___update_load_avg().
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (___update_load_avg(now, cpu, &se->avg,
			       se->on_rq * scale_load_down(se->load.weight),
			       cfs_rq->curr == se, NULL, 0, 0)) {
		trace_sched_load_se(se);
		cfs_se_util_change(&se->avg);

#ifdef UTIL_EST_DEBUG
		/*
		 * Trace utilization only for actual tasks.
		 *
		 * These trace events are mostly useful to get easier to
		 * read plots for the estimated utilization, where we can
		 * compare it with the actual grow/decrease of the original
		 * PELT signal.
		 * Let's keep them disabled by default in "production kernels".
		 */
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			trace_sched_util_est_task(tsk, &se->avg);

			/* Trace utilization only for top level CFS RQ */
			cfs_rq = &(task_rq(tsk)->cfs);
			trace_sched_util_est_cpu(cpu, cfs_rq);
		}
#endif /* UTIL_EST_DEBUG */

		return 1;
	}

	return 0;
}
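Note the weight argument above: se->on_rq * scale_load_down(se->load.weight), so a dequeued entity contributes weight 0 and its signal only decays. Below is a minimal user-space sketch of how that argument is formed, assuming the 64-bit increased-load-resolution configuration where scale_load_down() drops SCHED_FIXEDPOINT_SHIFT bits; it is purely illustrative, not kernel code.

/*
 * Illustration (not kernel code) of the 'weight' argument passed to
 * ___update_load_avg(), assuming scale_load_down() shifts out
 * SCHED_FIXEDPOINT_SHIFT bits. With on_rq == 0 the weight collapses to 0.
 */
#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10
#define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)

int main(void)
{
	unsigned long load_weight = 1024UL << SCHED_FIXEDPOINT_SHIFT; /* nice-0 weight, scaled up */
	int on_rq = 1;

	unsigned long weight = on_rq * scale_load_down(load_weight);
	printf("weight passed to ___update_load_avg(): %lu\n", weight); /* prints 1024 */
	return 0;
}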
update_cfs_rq_load_avg
In the end this also reaches ___update_load_avg().
/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_task()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 * post_init_entity_util_avg().
 *
 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
 *
 * Returns true if the load decayed or we removed load.
 *
 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 * call update_tg_load_avg() when this function returns true.
 */
/* compute the cfs_rq's load */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
	struct sched_avg *sa = &cfs_rq->avg;
	int decayed, removed_load = 0, removed_util = 0;
	bool update_freq = false;

	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
		sub_positive(&sa->load_avg, r);
		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
		removed_load = 1;
		set_tg_cfs_propagate(cfs_rq);
	}

	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
		sub_positive(&sa->util_avg, r);
		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
		removed_util = 1;
		set_tg_cfs_propagate(cfs_rq);
	}

	decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);

#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

#ifdef CONFIG_SCHED_WALT
	if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util))
		update_freq = true;
#endif

	if (update_freq || decayed || removed_util)
		cfs_rq_util_change(cfs_rq);

	return decayed || removed_load;
}
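The two atomic_long_xchg() blocks above drain the load/util that departing entities published for removal (for example on migration). Here is a minimal user-space sketch of that drain pattern, with the clamp-at-zero subtraction mirroring sub_positive(); the names and values are illustrative, not kernel code.

/*
 * Sketch of the "removed load" drain pattern: migrating entities add their
 * contribution to an atomic counter, and the periodic updater drains it
 * with an atomic exchange, clamping the running sum at zero.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long removed_load_avg;	/* written on removal, drained by updater */
static unsigned long load_avg = 2048;	/* current cfs_rq-style running average */

/* Clamp-at-zero subtraction, mirroring the kernel's sub_positive(). */
static void sub_positive(unsigned long *ptr, unsigned long val)
{
	*ptr = (*ptr > val) ? *ptr - val : 0;
}

/* Called when an entity leaves: publish its contribution for later removal. */
static void remove_entity_load(long contrib)
{
	atomic_fetch_add(&removed_load_avg, contrib);
}

/* Periodic update: drain whatever accumulated since the last pass. */
static int drain_removed_load(void)
{
	long r = atomic_exchange(&removed_load_avg, 0);

	if (!r)
		return 0;
	sub_positive(&load_avg, (unsigned long)r);
	return 1;	/* load changed; the caller would then re-propagate */
}

int main(void)
{
	remove_entity_load(1500);
	remove_entity_load(900);	/* total exceeds load_avg: clamps at 0 */
	printf("removed=%d load_avg=%lu\n", drain_removed_load(), load_avg);
	return 0;
}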
This again reaches the same core function:
static int
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
{
	int ret;

	/* the common core shared with the per-entity path */
	ret = ___update_load_avg(now, cpu, &cfs_rq->avg,
				 scale_load_down(cfs_rq->load.weight),
				 cfs_rq->curr != NULL, cfs_rq, 0, 0);

	trace_sched_load_cfs_rq(cfs_rq);

	return ret;
}
___update_load_avg
Earlier kernel versions had no accumulate_sum() helper; that work was done directly inside this function. For a detailed walkthrough of accumulate_sum() see https://blog.csdn.net/feifei_csdn/article/details/103814876.
Its main job is to decay the elapsed time geometrically, as sketched below.
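For intuition, here is a floating-point sketch of that decay. The kernel performs it in 32.32 fixed point with lookup tables rather than pow(), so the converged value differs slightly from LOAD_AVG_MAX; variable names and the loop are purely illustrative.

/*
 * Floating-point sketch of the decay performed by accumulate_sum().
 * y is chosen so that y^32 = 0.5: load from ~32ms ago counts half.
 */
#include <math.h>
#include <stdio.h>

#define PELT_Y	pow(0.5, 1.0 / 32.0)

/* Decay the old sum over 'periods' full 1024us windows, then add the new contribution. */
static double accumulate(double sum, unsigned int periods, double new_contrib)
{
	return sum * pow(PELT_Y, periods) + new_contrib;
}

int main(void)
{
	double sum = 0.0;

	/* A task runnable for the whole of each 1024us window contributes 1024 per period. */
	for (int i = 0; i < 345; i++)
		sum = accumulate(sum, 1, 1024.0);

	/* Approaches 1024 / (1 - y), i.e. roughly LOAD_AVG_MAX (47742 in the kernel). */
	printf("sum after 345 periods: %.0f\n", sum);
	return 0;
}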
/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series. To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 *   [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *         p0            p1           p2
 *        (now)      (~1ms ago)   (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ...   [re-labeling u_i --> u_{i+1}]
 */
/* For the se path, the fourth argument is weight = se->on_rq * scale_load_down(se->load.weight). */
static __always_inline int
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
		   unsigned long weight, int running, struct cfs_rq *cfs_rq,
		   int irq, int last_accum)
{
	u64 delta;
	u32 ret;

	delta = now - sa->last_update_time;
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_update_time = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	/* (1) shrink the time unit from ns to (approximately) us */
	delta >>= 10;
	if (!delta)
		return 0;

	sa->last_update_time += delta << 10;

	/*
	 * running is a subset of runnable (weight) so running can't be set if
	 * runnable is clear. But there are some corner cases where the current
	 * se has been already dequeued but cfs_rq->curr still points to it.
	 * This means that weight will be 0 but not running for a sched_entity
	 * but also for a cfs_rq if the latter becomes idle. As an example,
	 * this happens during idle_balance() which calls
	 * update_blocked_averages()
	 */
	if (!weight)
		running = 0;

	/*
	 * Now we know we crossed measurement unit boundaries. The *_avg
	 * accrues by two steps:
	 *
	 * Step 1: accumulate *_sum since last_update_time. If we haven't
	 * crossed period boundaries, finish.
	 */
	ret = accumulate_sum(delta, cpu, sa, weight, running, cfs_rq);
	if (!ret) {
		if (!irq || (irq && !last_accum))
			return 0;
	} else if (irq == 1)
		return 1;

	/*
	 * Step 2: update *_avg.
	 */
	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
	if (cfs_rq) {
		cfs_rq->runnable_load_avg =
			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
	}
	sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);

	return 1;
}
In summary (decay_time denotes the decayed accumulation of runnable/running time, freq_scale the frequency scaling factor, scale_cpu the CPU-capacity scaling factor):

sa->load_sum = weight * decay_time * freq_scale
sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib)
sa->util_sum = decay_time * freq_scale * scale_cpu
sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib)
cfs_rq->runnable_load_sum = weight * decay_time * freq_scale
cfs_rq->runnable_load_avg = div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib)
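A worked example of these divisions, assuming a task that has been runnable and running continuously at full frequency and capacity, so both sums have saturated (the concrete numbers are illustrative, not taken from the kernel):

/*
 * With saturated sums, load_avg converges to the entity's weight and
 * util_avg to SCHED_CAPACITY_SCALE (full CPU capacity).
 */
#include <stdio.h>

#define LOAD_AVG_MAX	47742	/* maximum decayed sum of full 1024us segments */

int main(void)
{
	unsigned long weight = 1024;		/* nice-0 weight after scale_load_down() */
	unsigned int period_contrib = 512;	/* partial current period, always < 1024 */
	unsigned long divider = LOAD_AVG_MAX - 1024 + period_contrib;

	/* Saturated sums: every past segment contributed its full length. */
	unsigned long long load_sum = (unsigned long long)weight * divider;
	unsigned long long util_sum = 1024ULL * divider;

	printf("load_avg = %llu\n", load_sum / divider);	/* 1024 == weight */
	printf("util_avg = %llu\n", util_sum / divider);	/* 1024 == SCHED_CAPACITY_SCALE */
	return 0;
}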