
Process Scheduling in Linux (6)

From here on, we turn to the policies related to load balancing.

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;

    sched_clock_tick();

    spin_lock(&rq->lock);
    update_rq_clock(rq);
    update_cpu_load(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
    rq->idle_at_tick = idle_cpu(cpu);    /* is this cpu's runqueue empty (idle task only)? */
    trigger_load_balance(rq, cpu);
#endif
}

So at the end of every timer-tick handler, the kernel checks whether a round of load balancing is due. Step into trigger_load_balance; the name alone gives a good hint of what it does.

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    /*
     * If we were in the nohz mode recently and busy at the current
     * scheduler tick, then check if we need to nominate new idle
     * load balancer.
     */
    if (rq->in_nohz_recently && !rq->idle_at_tick) {
        rq->in_nohz_recently = 0;

        if (atomic_read(&nohz.load_balancer) == cpu) {
            cpumask_clear_cpu(cpu, nohz.cpu_mask);
            atomic_set(&nohz.load_balancer, -1);
        }

        if (atomic_read(&nohz.load_balancer) == -1) {
            /*
             * simple selection for now: Nominate the
             * first cpu in the nohz list to be the next
             * ilb owner.
             *
             * TBD: Traverse the sched domains and nominate
             * the nearest cpu in the nohz.cpu_mask.
             */
            int ilb = cpumask_first(nohz.cpu_mask);

            if (ilb < nr_cpu_ids)
                resched_cpu(ilb);
        }
    }

    /*
     * If this cpu is idle and doing idle load balancing for all the
     * cpus with ticks stopped, is it time for that to stop?
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
        cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
        resched_cpu(cpu);
        return;
    }

    /*
     * If this cpu is idle and the idle load balancing is done by
     * someone else, then no need raise the SCHED_SOFTIRQ
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
        cpumask_test_cpu(cpu, nohz.cpu_mask))
        return;
#endif
    if (time_after_eq(jiffies, rq->next_balance))
        raise_softirq(SCHED_SOFTIRQ);
}

Setting the CONFIG_NO_HZ parts aside, this function just checks whether jiffies has caught up with rq->next_balance; if so, raise_softirq posts a softirq. Posting is simple: it sets the bit corresponding to SCHED_SOFTIRQ in a pending mask; when softirqs are later processed, each set bit gets its handler invoked. The toy model below illustrates the pattern.
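This is purely illustrative userspace C — the bitmask, table, and names are invented, not the kernel's actual softirq machinery:

#include <stdio.h>

#define SCHED_SOFTIRQ 7                 /* pretend bit index */

static unsigned long pending;           /* the kernel keeps one mask per cpu */
static void (*handlers[32])(void);      /* filled in by open_softirq() */

static void raise_softirq_toy(int nr)
{
    pending |= 1UL << nr;               /* just set the bit; nothing runs yet */
}

static void do_softirq_toy(void)        /* called later, e.g. on irq exit */
{
    for (int nr = 0; nr < 32; nr++) {
        if (!(pending & (1UL << nr)))
            continue;
        pending &= ~(1UL << nr);
        handlers[nr]();                 /* run_rebalance_domains() in our case */
    }
}

static void run_rebalance_toy(void) { puts("rebalance!"); }

int main(void)
{
    handlers[SCHED_SOFTIRQ] = run_rebalance_toy;   /* ~ open_softirq() */
    raise_softirq_toy(SCHED_SOFTIRQ);
    do_softirq_toy();
    return 0;
}

The real implementation adds per-cpu state, ksoftirqd, and locking, but the set-a-bit-then-dispatch idea is the same.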

Searching the source with cscope finds where the handler is registered:

#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

Ha — this softirq is set up specifically for SMP. Following that thread, look at the implementation of run_rebalance_domains.

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ?
                        CPU_IDLE : CPU_NOT_IDLE;

    rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
    /*
     * If this cpu is the owner for idle load balancing, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped.
     */
    if (this_rq->idle_at_tick &&
        atomic_read(&nohz.load_balancer) == this_cpu) {
        struct rq *rq;
        int balance_cpu;

        for_each_cpu(balance_cpu, nohz.cpu_mask) {
            if (balance_cpu == this_cpu)
                continue;

            /*
             * If this cpu gets work to do, stop the load balancing
             * work being done for other cpus. Next load
             * balancing owner will pick it up.
             */
            if (need_resched())
                break;

            rebalance_domains(balance_cpu, CPU_IDLE);

            rq = cpu_rq(balance_cpu);
            if (time_after(this_rq->next_balance, rq->next_balance))
                this_rq->next_balance = rq->next_balance;
        }
    }
#endif
}

Ignoring CONFIG_NO_HZ again, the function simply derives the idle parameter from the current cpu's state (running the idle task or something else) and calls rebalance_domains.

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;
    cpumask_var_t tmp;

    /* Fails alloc? Rebalancing probably not a priority right now. */
    if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
        return;

    for_each_domain(cpu, sd) {    /* for this cpu's sched domain and each of its parents */
        if (!(sd->flags & SD_LOAD_BALANCE))    /* the domain has explicitly opted out
                of load balancing: skip it */
            continue;

        interval = sd->balance_interval;    /* this domain's balancing period */
        if (idle != CPU_IDLE)
            interval *= sd->busy_factor;    /* stretch the period when this cpu is busy */

        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);    /* convert milliseconds to jiffies */
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10)    /* and clamp it */
            interval = HZ*NR_CPUS/10;

        need_serialize = sd->flags & SD_SERIALIZE;

        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {    /* balancing
                really is due now */
            if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */
                idle = CPU_NOT_IDLE;    /* the comment says it all */
            }
            sd->last_balance = jiffies;    /* record when we last balanced */
        }
        if (need_serialize)
            spin_unlock(&balancing);
out:
        if (time_after(next_balance, sd->last_balance + interval)) {    /* set the
                time of the next balancing pass */
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!balance)
            break;
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;

    free_cpumask_var(tmp);
}

The heart of it all is load_balance.
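Before opening it up, a quick numeric check of the interval arithmetic in rebalance_domains; every value below is a hypothetical stand-in for the real domain tunables:

#include <stdio.h>

int main(void)
{
    /* assumed tuning: HZ=1000, NR_CPUS=8,
     * sd->balance_interval = 64 ms, sd->busy_factor = 32 */
    unsigned long hz = 1000, nr_cpus = 8;
    unsigned long interval = 64;

    interval *= 32;                       /* cpu was busy: stretch the period */
    /* msecs_to_jiffies: at HZ=1000, 1 ms == 1 jiffy, so 2048 jiffies */
    if (interval > hz * nr_cpus / 10)     /* clamp at HZ*NR_CPUS/10 */
        interval = hz * nr_cpus / 10;

    printf("next balance in %lu jiffies (%.1fs)\n",
           interval, (double)interval / hz);   /* 800 jiffies = 0.8s */
    return 0;
}

So a busy cpu in this hypothetical domain would attempt a balance pass at most every 0.8 s, while an idle one (no busy_factor applied) would try every 64 ms.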

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance, struct cpumask *cpus)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;

    cpumask_setall(cpus);    /* start with every cpu set in the mask */

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */
    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))    /* on a plain SMP layout,
            SD_SHARE_CPUPOWER never appears */
        sd_idle = 1;

    schedstat_inc(sd, lb_count[idle]);    /* update statistics */

redo:
    update_shares(sd);    /* refresh the shares of every task group taking part
            in this domain */
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance);    /* find the busiest sched group in this domain */

    if (*balance == 0)
        goto out_balanced;

    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;

out:
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}

Here update_shares deserves a closer look.

static void update_shares(struct sched_domain *sd)
{
    u64 now = cpu_clock(raw_smp_processor_id());
    s64 elapsed = now - sd->last_update;

    if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
        /* one more check (rate limiting) before deciding that every task
         * group in this domain really needs its shares refreshed */
        sd->last_update = now;
        walk_tg_tree(tg_nop, tg_shares_up, sd);
    }
}

If the domain's task groups really do need their shares refreshed, walk_tg_tree carries out the update. tg_nop and tg_shares_up are two function pointers: here tg_nop does nothing, while tg_shares_up performs the actual update.

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
    struct task_group *parent, *child;
    int ret;

    rcu_read_lock();
    parent = &root_task_group;
down:
    ret = (*down)(parent, data);
    if (ret)
        goto out_unlock;
    list_for_each_entry_rcu(child, &parent->children, siblings) {
        parent = child;
        goto down;

up:
        continue;
    }
    ret = (*up)(parent, data);
    if (ret)
        goto out_unlock;

    child = parent;
    parent = parent->parent;
    if (parent)
        goto up;

out_unlock:
    rcu_read_unlock();

    return ret;
}
The code is not easy to read; it helps to draw the tree on paper and trace through it by hand. In effect it walks the tree bottom-up and left-to-right, updating each group's shares in turn; the actual update lives in tg_shares_up. The sketch below replays the same visit order in plain C.
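If paper is not handy, this recursive userspace sketch (toy types, not the kernel's) produces exactly the down/up order that walk_tg_tree's goto loop produces:

#include <stdio.h>

struct tg {                                /* toy stand-in for struct task_group */
    const char *name;
    struct tg **children;                  /* NULL-terminated array, not a list */
};

/* down() on first entry, children left to right, up() when leaving for the
 * final time. The kernel's goto version avoids recursion but walks the tree
 * in exactly this order. */
static void walk(struct tg *node, void (*down)(struct tg *),
                 void (*up)(struct tg *))
{
    down(node);
    for (struct tg **c = node->children; c && *c; c++)
        walk(*c, down, up);
    up(node);
}

static void d(struct tg *t) { printf("down(%s)\n", t->name); }
static void u(struct tg *t) { printf("up(%s)\n",   t->name); }

int main(void)
{
    struct tg a = { "A", NULL }, b = { "B", NULL };
    struct tg *kids[] = { &a, &b, NULL };
    struct tg root = { "root", kids };

    walk(&root, d, u);   /* down(root) down(A) up(A) down(B) up(B) up(root) */
    return 0;
}

The up() calls fire bottom-up, children before their parent, which is precisely the property tg_shares_up relies on.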

The kernel's own comment on the update function also spells out the traversal order just described:

/*
 * Re-compute the task group their per cpu shares over the given domain.
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
    unsigned long weight, rq_weight = 0;
    unsigned long shares = 0;
    struct sched_domain *sd = data;
    int i;

    for_each_cpu(i, sched_domain_span(sd)) {    /* for every cpu in this domain */
        /*
         * If there are currently no tasks on the cpu pretend there
         * is one of average load so that when a new task gets to
         * run here it will not get delayed by group starvation.
         */
        weight = tg->cfs_rq[i]->load.weight;    /* sum up this group's runqueue loads
                across the domain's cpus */
        if (!weight)    /* no load on this cpu: some will be handed over; note the
                link with the kernel comment above */
            weight = NICE_0_LOAD;

        tg->cfs_rq[i]->rq_weight = weight;    /* note: this is the cfs_rq's rq_weight */
        rq_weight += weight;    /* running total of rq_weight */
        shares += tg->cfs_rq[i]->shares;    /* sum up the group's shares across the
                domain's cpus as well */
    }

    /* a couple of corrections */
    if ((!shares && rq_weight) || shares > tg->shares)
        shares = tg->shares;

    if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))    /* on plain SMP this
            condition holds */
        shares = tg->shares;    /* so shares becomes simply the group's shares value */

    for_each_cpu(i, sched_domain_span(sd))    /* with the tally done, loop once more —
            this time applying the updates */
        update_group_shares_cpu(tg, i, shares, rq_weight);

    return 0;
}
Now look at update_group_shares_cpu:

/*
 * Calculate and set the cpu's group shares.
 */
/* Note the parameters: tg is the group just reached in the tree walk; cpu is
 * cpu i of the domain; sd_shares is the group's shares value; sd_rq_weight is
 * the sum of the group's runqueue loads across the domain's cpus. */
static void
update_group_shares_cpu(struct task_group *tg, int cpu,
            unsigned long sd_shares, unsigned long sd_rq_weight)
{
    unsigned long shares;
    unsigned long rq_weight;

    if (!tg->se[cpu])
        return;

    rq_weight = tg->cfs_rq[cpu]->rq_weight;

    /*
     *            sd_shares * rq_weight
     * shares = -------------------------
     *               Sum rq_weight
     */
    shares = (sd_shares * rq_weight) / sd_rq_weight;    /* the formula comment
            above says it clearly */
    shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);    /* clamp
            into range */

    if (abs(shares - tg->se[cpu]->load.weight) >
            sysctl_sched_shares_thresh) {    /* to avoid churning, only apply the
            update when the change exceeds a tunable threshold */
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        spin_lock_irqsave(&rq->lock, flags);
        tg->cfs_rq[cpu]->shares = shares;

        __set_se_shares(tg->se[cpu], shares);    /* the shares value ultimately
                lands in the "se" (sched entity) */
        spin_unlock_irqrestore(&rq->lock, flags);
    }
}

Note what the formula means: the cpus inside the group split the group's shares among themselves, in proportion to load — the cpu carrying the larger fraction of the group's total load gets the larger slice of shares, hence higher weight and more runtime when tasks are picked (this ties back to the pick_next_task code covered earlier). A small worked example follows, and then __set_se_shares.
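A quick worked example of the split, with invented numbers:

#include <stdio.h>

int main(void)
{
    /* hypothetical two-cpu domain: the group has tg->shares = 1024,
     * runqueue weight 3072 on cpu0 and 1024 on cpu1 */
    unsigned long sd_shares = 1024;
    unsigned long rq_weight[2] = { 3072, 1024 };
    unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];   /* 4096 */

    for (int cpu = 0; cpu < 2; cpu++)
        printf("cpu%d gets %lu shares\n", cpu,
               sd_shares * rq_weight[cpu] / sd_rq_weight);
    /* cpu0 gets 768, cpu1 gets 256: the busier cpu's entity ends up
     * with three times the weight, matching its share of the load */
    return 0;
}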

static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
    struct cfs_rq *cfs_rq = se->cfs_rq;
    int on_rq;

    on_rq = se->on_rq;
    if (on_rq)
        dequeue_entity(cfs_rq, se, 0);

    se->load.weight = shares;
    se->load.inv_weight = 0;

    if (on_rq)
        enqueue_entity(cfs_rq, se, 0);
}

Easy to follow: take the entity off the runqueue, update its weight, put it back. That completes the analysis of update_shares. Note when it runs: a balance pass has already been deemed necessary, but nothing has yet been decided about how to balance. Refreshing the load picture of every group in the domain first helps the upcoming selection of a sched group and of tasks. Now return to load_balance and continue downward.

redo:
    update_shares(sd);    /* refresh the shares of every task group taking part
            in this domain */
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance);    /* find the busiest sched group in this domain */

    if (*balance == 0)
        goto out_balanced;

    if (!group) {    /* nobody is busy enough: no balancing needed */
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);    /* find the
            busiest runqueue in that group, i.e. the busiest cpu */
    if (!busiest) {    /* no cpu meets the bar: nothing to do either */
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);    /* update statistics */

    ld_moved = 0;    /* flag: did we move any tasks? */
    if (busiest->nr_running > 1) {    /* only consider moving tasks when more than
            one is runnable; as the comment notes, if there is no more than one,
            moving it away would leave zero — still unbalanced — so don't bother */
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,    /* guided by the
                imbalance argument, pick tasks off the busiest queue and move
                them to this_rq */
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }
Look first at find_busiest_group; it is fairly long.

/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
           unsigned long *imbalance, enum cpu_idle_type idle,
           int *sd_idle, const struct cpumask *cpus, int *balance)
{
    struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
    unsigned long max_load, avg_load, total_load, this_load, total_pwr;
    unsigned long max_pull;
    unsigned long busiest_load_per_task, busiest_nr_running;
    unsigned long this_load_per_task, this_nr_running;
    int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    int power_savings_balance = 1;
    unsigned long leader_nr_running = 0, min_load_per_task = 0;
    unsigned long min_nr_running = ULONG_MAX;
    struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

    max_load = this_load = total_load = total_pwr = 0;
    busiest_load_per_task = busiest_nr_running = 0;
    this_load_per_task = this_nr_running = 0;

    /* first derive load_idx from the idle argument passed in; it is a key
     * index below when hunting for the busiest sched group (not task group) */
    if (idle == CPU_NOT_IDLE)
        load_idx = sd->busy_idx;        /* busy_idx defaults to 3 */
    else if (idle == CPU_NEWLY_IDLE)
        load_idx = sd->newidle_idx;     /* newidle_idx is 2 */
    else
        load_idx = sd->idle_idx;        /* idle_idx is 1 */

    /* everything from here to while (group != sd->groups) is one big loop that
     * visits every sched group in the domain to find the busiest one; the group
     * containing this_cpu does not compete against the others */
    do {
        unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
        int local_group;
        int i;
        int __group_imb = 0;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long sum_nr_running, sum_weighted_load;
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;

        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group));    /* set local_group if
                this_cpu turns out to belong to the current group */

        if (local_group)
            balance_cpu = cpumask_first(sched_group_cpus(group));    /* when
                handling the "local group", tentatively make its first cpu
                the balance_cpu */

        /* Tally up the load of all CPUs in the group */
        sum_weighted_load = sum_nr_running = avg_load = 0;
        sum_avg_load_per_task = avg_load_per_task = 0;

        max_cpu_load = 0;
        min_cpu_load = ~0UL;

        for_each_cpu_and(i, sched_group_cpus(group), cpus) {    /* for every
                cpu in the group */
            struct rq *rq = cpu_rq(i);

            if (*sd_idle && rq->nr_running)
                *sd_idle = 0;

            /* Bias balancing toward cpus of our domain */
            if (local_group) {    /* local group, cpu i is idle, and balance_cpu
                    has not been corrected yet this loop */
                if (idle_cpu(i) && !first_idle_cpu) {
                    first_idle_cpu = 1;
                    balance_cpu = i;    /* so the logic is: if the local group
                        has an idle cpu, the first idle one becomes balance_cpu;
                        otherwise the group's first cpu keeps the job */
                }

                load = target_load(i, load_idx);    /* accumulate the group's
                        load, weighted according to the load_idx chosen above */
            } else {    /* not the local group */
                load = source_load(i, load_idx);    /* same as above */
                if (load > max_cpu_load)    /* track the largest and smallest
                        per-cpu load seen so far */
                    max_cpu_load = load;
                if (min_cpu_load > load)
                    min_cpu_load = load;
            }

            avg_load += load;    /* sum of the loads computed via load_idx */
            sum_nr_running += rq->nr_running;    /* total runnable tasks across
                    the group's cpus */
            sum_weighted_load += weighted_cpuload(i);    /* the group's *current*
                    total load; unlike avg_load, no history is mixed in via
                    load_idx */
            sum_avg_load_per_task += cpu_avg_load_per_task(i);    /* average task
                    load on cpu i */
        }

        /*
         * First idle cpu or the first cpu(busiest) in this sched group
         * is eligible for doing load balancing at this and above
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
        if (idle != CPU_NEWLY_IDLE && local_group &&
            balance_cpu != this_cpu && balance) {
            *balance = 0;
            goto ret;
        }

        total_load += avg_load;    /* the domain's total load */
        total_pwr += group->__cpu_power;    /* (I have not yet pinned down exactly
                what __cpu_power is) */

        /* Adjust by relative CPU power of the group */
        avg_load = sg_div_cpu_power(group,
                avg_load * SCHED_LOAD_SCALE);    /* derive the group's final
                avg_load from the raw sum and its power */

        /*
         * Consider the group unbalanced when the imbalance is larger
         * than the average weight of two tasks.
         *
         * APZ: with cgroup the avg task weight can vary wildly and
         *      might not be a suitable number - should we keep a
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
        avg_load_per_task = sg_div_cpu_power(group,
                sum_avg_load_per_task * SCHED_LOAD_SCALE);    /* likewise correct
                the group's avg_load_per_task */

        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)    /* if the gap
                between the group's most and least loaded cpus exceeds twice the
                per-task average, set __group_imb (imbalance); its use appears
                below */
            __group_imb = 1;

        group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

        if (local_group) {    /* for the local group only the this_* variables
                are updated; busiest is never pointed at it */
            this_load = avg_load;
            this = group;
            this_nr_running = sum_nr_running;
            this_load_per_task = sum_weighted_load;
        } else if (avg_load > max_load &&
               (sum_nr_running > group_capacity || __group_imb)) {    /* the group
                is internally imbalanced, or it holds more tasks than its
                capacity, and its average load tops the largest seen so far */
            max_load = avg_load;    /* new maximum */
            busiest = group;    /* repoint busiest */
            busiest_nr_running = sum_nr_running;    /* task count of the busiest group */
            busiest_load_per_task = sum_weighted_load;
            group_imb = __group_imb;
        }

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        /* the SMT/MC power-savings path is not analyzed in this walkthrough, so
         * skim past this long stretch to the end of the do-while loop */
        /*
         * Busy processors will not participate in power savings
         * balance.
         */
        if (idle == CPU_NOT_IDLE ||
                !(sd->flags & SD_POWERSAVINGS_BALANCE))
            goto group_next;

        /*
         * If the local group is idle or completely loaded
         * no need to do power savings balance at this domain
         */
        if (local_group && (this_nr_running >= group_capacity ||
                    !this_nr_running))
            power_savings_balance = 0;

        /*
         * If a group is already running at full capacity or idle,
         * don't include that group in power savings calculations
         */
        if (!power_savings_balance || sum_nr_running >= group_capacity
            || !sum_nr_running)
            goto group_next;

        /*
         * Calculate the group which has the least non-idle load.
         * This is the group from where we need to pick up the load
         * for saving power
         */
        if ((sum_nr_running < min_nr_running) ||
            (sum_nr_running == min_nr_running &&
             cpumask_first(sched_group_cpus(group)) <
             cpumask_first(sched_group_cpus(group_min)))) {
            group_min = group;
            min_nr_running = sum_nr_running;
            min_load_per_task = sum_weighted_load /
                        sum_nr_running;
        }

        /*
         * Calculate the group which is almost near its
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
        if (sum_nr_running <= group_capacity - 1) {
            if (sum_nr_running > leader_nr_running ||
                (sum_nr_running == leader_nr_running &&
                 cpumask_first(sched_group_cpus(group)) >
                 cpumask_first(sched_group_cpus(group_leader)))) {
                group_leader = group;
                leader_nr_running = sum_nr_running;
            }
        }
group_next:
#endif
        group = group->next;
    } while (group != sd->groups);    /* at this point every group in the domain
            has been visited; if any qualified as busiest, busiest points at it */

    if (!busiest || this_load >= max_load || busiest_nr_running == 0)    /* nothing
            qualified, or the local group is even busier than the winner, or the
            busiest group has no tasks left: no balancing needed */
        goto out_balanced;

    avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

    if (this_load >= avg_load ||
            100*max_load <= sd->imbalance_pct*this_load)    /* two more no-balance
            cases: the local group's load is above the domain average, or the
            ratio of max_load to our load is under the imbalance_pct threshold */
        goto out_balanced;

    busiest_load_per_task /= busiest_nr_running;    /* average load per task in
            the busiest group */
    if (group_imb)    /* if that group's max/min per-cpu gap exceeded twice its
            per-task average, rein the figure in */
        busiest_load_per_task = min(busiest_load_per_task, avg_load);

    /*
     * We're trying to get all the cpus to the average_load, so we don't
     * want to push ourselves above the average load, nor do we wish to
     * reduce the max loaded cpu below the average load, as either of these
     * actions would just result in more rebalancing later, and ping-pong
     * tasks around. Thus we look for the minimum possible imbalance.
     * Negative imbalances (*we* are more loaded than anyone else) will
     * be counted as no imbalance for these purposes -- we can't fix that
     * by pulling tasks to us. Be careful of negative numbers as they'll
     * appear as very large values with unsigned longs.
     */
    if (max_load <= busiest_load_per_task)
        goto out_balanced;

    /*
     * In the presence of smp nice balancing, certain scenarios can have
     * max load less than avg load(as we skip the groups at or below
     * its cpu_power, while calculating max_load..)
     */
    if (max_load < avg_load) {
        *imbalance = 0;
        goto small_imbalance;
    }

    /* Don't want to pull so many tasks that a group would go idle */
    max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

    /* How much load to actually move to equalise the imbalance */
    *imbalance = min(max_pull * busiest->__cpu_power,
                (avg_load - this_load) * this->__cpu_power)
            / SCHED_LOAD_SCALE;    /* compute how much load needs moving; what
            follows is fine-grained tuning whose rationale is hard to state
            cleanly, so it is not analyzed further here */

    /*
     * if *imbalance is less than the average load per runnable task
     * there is no gaurantee that any tasks will be moved so we'll have
     * a think about bumping its value to force at least one task to be
     * moved
     */
    if (*imbalance < busiest_load_per_task) {
        unsigned long tmp, pwr_now, pwr_move;
        unsigned int imbn;

small_imbalance:
        pwr_move = pwr_now = 0;
        imbn = 2;
        if (this_nr_running) {
            this_load_per_task /= this_nr_running;
            if (busiest_load_per_task > this_load_per_task)
                imbn = 1;
        } else
            this_load_per_task = cpu_avg_load_per_task(this_cpu);

        if (max_load - this_load + busiest_load_per_task >=
                    busiest_load_per_task * imbn) {
            *imbalance = busiest_load_per_task;
            return busiest;
        }

        /*
         * OK, we don't have enough imbalance to justify moving tasks,
         * however we may be able to increase total CPU power used by
         * moving them.
         */

        pwr_now += busiest->__cpu_power *
                min(busiest_load_per_task, max_load);
        pwr_now += this->__cpu_power *
                min(this_load_per_task, this_load);
        pwr_now /= SCHED_LOAD_SCALE;

        /* Amount of load we'd subtract */
        tmp = sg_div_cpu_power(busiest,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        if (max_load > tmp)
            pwr_move += busiest->__cpu_power *
                min(busiest_load_per_task, max_load - tmp);

        /* Amount of load we'd add */
        if (max_load * busiest->__cpu_power <
                busiest_load_per_task * SCHED_LOAD_SCALE)
            tmp = sg_div_cpu_power(this,
                    max_load * busiest->__cpu_power);
        else
            tmp = sg_div_cpu_power(this,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        pwr_move += this->__cpu_power *
                min(this_load_per_task, this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;

        /* Move if we gain throughput */
        if (pwr_move > pwr_now)
            *imbalance = busiest_load_per_task;
    }

    return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
        goto ret;

    if (this == group_leader && group_leader != group_min) {
        *imbalance = min_load_per_task;
        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
            cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
                cpumask_first(sched_group_cpus(group_leader));
        }
        return group_min;
    }
#endif
ret:
    *imbalance = 0;
    return NULL;
}
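Before moving on, here is the main imbalance computation above run with invented numbers — two single-cpu groups of equal __cpu_power, so the domain average is just the mean:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    /* hypothetical domain: busiest group carries 3072 of weighted load,
     * ours carries 1024; both have __cpu_power = 1024 */
    unsigned long this_load = 1024, max_load = 3072;
    unsigned long avg_load = (this_load + max_load) / 2;   /* 2048 */
    unsigned long busiest_load_per_task = 1024;            /* three nice-0 tasks */
    unsigned long power = 1024;

    unsigned long max_pull = min_ul(max_load - avg_load,
                                    max_load - busiest_load_per_task);
    unsigned long imbalance = min_ul(max_pull * power,
                                     (avg_load - this_load) * power)
                                    / SCHED_LOAD_SCALE;

    printf("move %lu of weighted load\n", imbalance);   /* 1024 */
    return 0;
}

Moving 1024 of weighted load — one nice-0 task — leaves both runqueues at 2048, exactly the average the kernel comment says the code aims for.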
Following load_balance's call path, the next stop is find_busiest_queue, which is much easier to digest.

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
           unsigned long imbalance, const struct cpumask *cpus)
{
    struct rq *busiest = NULL, *rq;
    unsigned long max_load = 0;
    int i;

    for_each_cpu(i, sched_group_cpus(group)) {
        unsigned long wl;

        if (!cpumask_test_cpu(i, cpus))    /* this cpu is not eligible in the
                current pass */
            continue;

        rq = cpu_rq(i);
        wl = weighted_cpuload(i);

        if (rq->nr_running == 1 && wl > imbalance)    /* a single task whose load
                already exceeds what we want to move */
            continue;

        if (wl > max_load) {    /* track the maximum and the busiest-queue pointer */
            max_load = wl;
            busiest = rq;
        }
    }

    return busiest;
}
Returning once more to load_balance's call path — now the actual move can finally happen.

    busiest = find_busiest_queue(group, idle, imbalance, cpus);    /* find the
            busiest runqueue in the group, i.e. the busiest cpu */
    if (!busiest) {    /* no cpu meets the bar: no balancing needed */
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);    /* update statistics */

    ld_moved = 0;    /* flag: did we move any tasks? */
    if (busiest->nr_running > 1) {    /* only consider moving tasks when more than
            one is runnable; as the comment notes, moving away the only task
            leaves zero — still unbalanced — so don't bother */
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);    /* lock both queues at once; to avoid
                deadlock they are always taken in order of ascending pointer
                address */
        ld_moved = move_tasks(this_rq, this_cpu, busiest,    /* guided by the
                imbalance argument, pick tasks off the busiest queue and move
                them to this_rq */
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }
With both queues locked, execution enters move_tasks.

/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
              unsigned long max_load_move,
              struct sched_domain *sd, enum cpu_idle_type idle,
              int *all_pinned)
{
    const struct sched_class *class = sched_class_highest;
    unsigned long total_load_moved = 0;
    int this_best_prio = this_rq->curr->prio;

    do {
        total_load_moved +=
            class->load_balance(this_rq, this_cpu, busiest,
                        max_load_move - total_load_moved,
                        sd, idle, all_pinned, &this_best_prio);
        class = class->next;

        if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
            break;
    } while (class && max_load_move > total_load_moved);

    return total_load_moved > 0;
}
The function is essentially one do-while loop. class starts out as sched_class_highest, and sched.c contains:

#define sched_class_highest (&rt_sched_class)
In other words, the first pass of the loop calls the load_balance hook of rt_sched_class. Searching sched_rt.c turns up:

.load_balance = load_balance_rt,

static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move,
        struct sched_domain *sd, enum cpu_idle_type idle,
        int *all_pinned, int *this_best_prio)
{
    /* don't touch RT tasks */
    return 0;
}
So this is an empty stub: load balancing never migrates rt-class tasks. The loop therefore advances to the next scheduling class — the CFS class — and runs its load_balance hook, shown after the sketch below.
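The dispatch here is nothing exotic: the scheduling classes form a singly linked list of method tables, and move_tasks walks it until enough load has moved. A toy model (all types and numbers invented, not the kernel's):

#include <stdio.h>

struct sched_class_toy {
    const char *name;
    const struct sched_class_toy *next;
    unsigned long (*load_balance)(unsigned long max_load_move);
};

static unsigned long lb_rt(unsigned long max)   { (void)max; return 0; }  /* never moves rt */
static unsigned long lb_fair(unsigned long max) { return max; }           /* pretends to move it all */
static unsigned long lb_idle(unsigned long max) { (void)max; return 0; }

static const struct sched_class_toy idle_c = { "idle", NULL,    lb_idle };
static const struct sched_class_toy fair_c = { "fair", &idle_c, lb_fair };
static const struct sched_class_toy rt_c   = { "rt",   &fair_c, lb_rt   };

int main(void)
{
    unsigned long moved = 0, want = 100;

    for (const struct sched_class_toy *c = &rt_c;
         c && moved < want; c = c->next)
        moved += c->load_balance(want - moved);

    printf("moved %lu\n", moved);   /* rt contributes 0, fair the rest */
    return 0;
}

Exactly as load_balance_rt above returns 0, the rt class contributes nothing here, so the fair class ends up doing the work.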


#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move,
          struct sched_domain *sd, enum cpu_idle_type idle,
          int *all_pinned, int *this_best_prio)
{
    long rem_load_move = max_load_move;    /* rem_load_move = remaining load to move */
    int busiest_cpu = cpu_of(busiest);    /* the cpu owning the busiest queue */
    struct task_group *tg;

    rcu_read_lock();
    update_h_load(busiest_cpu);    /* refresh first */

    list_for_each_entry_rcu(tg, &task_groups, list) {
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;
        u64 rem_load, moved_load;

        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1);

        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]);

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load;
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;
}

update_h_load closely resembles the shares-updating function we saw earlier:

static void update_h_load(long cpu)
{
    walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
As noted earlier, tg_nop is a no-op; look at tg_load_down:

/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
static int tg_load_down(struct task_group *tg, void *data)    /* the comment above
        says it reasonably well */
{
    unsigned long load;
    long cpu = (long)data;

    if (!tg->parent) {
        load = cpu_rq(cpu)->load.weight;
    } else {
        load = tg->parent->cfs_rq[cpu]->h_load;    /* the load the parent level
                must move */
        load *= tg->cfs_rq[cpu]->shares;    /* this value is effectively this
                level's load weight */
        load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
    }

    tg->cfs_rq[cpu]->h_load = load;

    return 0;
}

Worked out, this says: the load this group must move = (the parent group's movable load × this group's shares) ÷ the parent group's load weight. Put plainly, the amount is apportioned by load ratio. Once each group's movable load on this queue is known, actual tasks can be picked from each group. Back to load_balance_fair — a small numeric example first.
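With invented numbers, the apportioning looks like this:

#include <stdio.h>

int main(void)
{
    /* hypothetical hierarchy on one cpu: the root rq carries a load
     * weight of 2048; a child group has shares 512 on this cpu and its
     * parent cfs_rq's load weight is 2048 */
    unsigned long root_h_load   = 2048;   /* the !tg->parent case */
    unsigned long child_shares  = 512;
    unsigned long parent_weight = 2048;

    unsigned long h_load = root_h_load * child_shares / (parent_weight + 1);
    printf("child h_load = %lu\n", h_load);   /* ~511: a quarter of the load,
                                                 matching its quarter of the weight */
    return 0;
}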

    update_h_load(busiest_cpu);    /* refresh first */

    list_for_each_entry_rcu(tg, &task_groups, list) {    /* for each group's
            runqueue on this cpu */
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;    /* the movable
                load update_h_load just computed for this group */
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;    /* the
                group's load */
        u64 rem_load, moved_load;

        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1);    /* rem_load =
                rem_load_move * busiest_weight / (busiest_h_load + 1) */

        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]);    /* the real action happens here */

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load;    /* one group done: feed the result back
                and see whether tasks must still be moved from the next group */
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;
__load_balance_fair looks like this:

static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move, struct sched_domain *sd,
        enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
        struct cfs_rq *cfs_rq)
{
    struct rq_iterator cfs_rq_iterator;

    cfs_rq_iterator.start = load_balance_start_fair;
    cfs_rq_iterator.next = load_balance_next_fair;
    cfs_rq_iterator.arg = cfs_rq;

    return balance_tasks(this_rq, this_cpu, busiest,
            max_load_move, sd, idle, all_pinned,
            this_best_prio, &cfs_rq_iterator);
}
From there we still need to go into balance_tasks:

static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move, struct sched_domain *sd,
          enum cpu_idle_type idle, int *all_pinned,
          int *this_best_prio, struct rq_iterator *iterator)
{
    int loops = 0, pulled = 0, pinned = 0;
    struct task_struct *p;
    long rem_load_move = max_load_move;

    if (max_load_move == 0)
        goto out;

    pinned = 1;

    /*
     * Start the load-balancing iterator:
     */
    p = iterator->start(iterator->arg);
next:
    if (!p || loops++ > sysctl_sched_nr_migrate)
        goto out;

    if ((p->se.load.weight >> 1) > rem_load_move ||
        !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
        /* skip this task if its weight exceeds twice the load still to be
         * moved, and likewise if it cannot be migrated at all */
        p = iterator->next(iterator->arg);
        goto next;
    }

    pull_task(busiest, p, this_rq, this_cpu);    /* ok to move: pull p onto
            this_cpu's this_rq */
    pulled++;    /* one more task moved */
    rem_load_move -= p->se.load.weight;    /* that much less load left to move */

    /*
     * We only want to steal up to the prescribed amount of weighted load.
     */
    if (rem_load_move > 0) {
        if (p->prio < *this_best_prio)
            *this_best_prio = p->prio;
        p = iterator->next(iterator->arg);
        goto next;
    }
out:
    /*
     * Right now, this is one of only two places pull_task() is called,
     * so we can safely collect pull_task() stats here rather than
     * inside pull_task().
     */
    schedstat_add(sd, lb_gained[idle], pulled);    /* statistics */

    if (all_pinned)
        *all_pinned = pinned;

    return max_load_move - rem_load_move;
}

A quick worked number on the skip rule follows; after it, can_migrate_task.
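The (p->se.load.weight >> 1) > rem_load_move test skips any task whose weight is more than double what still needs moving. With invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned long rem_load_move = 400;          /* still want to move this much */
    unsigned long weights[] = { 1024, 335 };    /* a nice-0 and a nice-5 task */

    for (int i = 0; i < 2; i++)
        printf("weight %4lu: %s\n", weights[i],
               (weights[i] >> 1) > rem_load_move
               ? "skipped (would overshoot too far)"
               : "eligible to pull");
    return 0;
}

The nice-0 task (half-weight 512 > 400) is passed over, while the lighter nice-5 task (half-weight 167) is a candidate — overshooting by less than half a task's weight is tolerated, more is not.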

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
             struct sched_domain *sd, enum cpu_idle_type idle,
             int *all_pinned)
{
    /*
     * We do not migrate tasks that are:
     * 1) running (obviously), or
     * 2) cannot be migrated to this CPU due to cpus_allowed, or
     * 3) are cache-hot on their current CPU.
     */
    if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
        schedstat_inc(p, se.nr_failed_migrations_affine);
        return 0;
    }
    *all_pinned = 0;

    if (task_running(rq, p)) {
        schedstat_inc(p, se.nr_failed_migrations_running);
        return 0;
    }

    /*
     * Aggressive migration if:
     * 1) task is cache cold, or
     * 2) too many balance attempts have failed.
     */
    if (!task_hot(p, rq->clock, sd) ||
            sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
        if (task_hot(p, rq->clock, sd)) {
            schedstat_inc(sd, lb_hot_gained[idle]);
            schedstat_inc(p, se.nr_forced_migrations);
        }
#endif
        return 1;
    }

    if (task_hot(p, rq->clock, sd)) {
        schedstat_inc(p, se.nr_failed_migrations_hot);
        return 0;
    }
    return 1;
}
The kernel's comments here are unusually thorough, so no further explanation is needed. Moving on, let's look at pull_task.

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
              struct rq *this_rq, int this_cpu)    /* the comment above says it all */
{
    deactivate_task(src_rq, p, 0);    /* take p off the source queue */
    set_task_cpu(p, this_cpu);    /* repoint p's fields at this_cpu; it is not on
            the new runqueue yet */
    activate_task(this_rq, p, 0);    /* the final step: put p on this_rq */
    /*
     * Note that idle threads have a prio of MAX_PRIO, for this test
     * to be always true for them.
     */
    check_preempt_curr(this_rq, p, 0);
}
The set_task_cpu function:


void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
    int old_cpu = task_cpu(p);
    struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
    struct cfs_rq *old_cfsrq = task_cfs_rq(p),
              *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
    u64 clock_offset;

    clock_offset = old_rq->clock - new_rq->clock;

    trace_sched_migrate_task(p, task_cpu(p), new_cpu);

#ifdef CONFIG_SCHEDSTATS
    if (p->se.wait_start)
        p->se.wait_start -= clock_offset;
    if (p->se.sleep_start)
        p->se.sleep_start -= clock_offset;
    if (p->se.block_start)
        p->se.block_start -= clock_offset;
    if (old_cpu != new_cpu) {
        schedstat_inc(p, se.nr_migrations);
        if (task_hot(p, old_rq->clock, NULL))
            schedstat_inc(p, se.nr_forced2_migrations);
    }
#endif
    p->se.vruntime -= old_cfsrq->min_vruntime -
                     new_cfsrq->min_vruntime;

    __set_task_cpu(p, new_cpu);
}
__set_task_cpu:

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
    set_task_rq(p, cpu);
#ifdef CONFIG_SMP
    /*
     * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
     * successfuly executed on another CPU. We must ensure that updates of
     * per-task data have been completed by this moment.
     */
    smp_wmb();
    task_thread_info(p)->cpu = cpu;
#endif
}
set_task_rq:

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
    p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
    p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
    p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
    p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

So a task stays in whatever group it belonged to; moving it merely places it on that group's runqueue on another cpu. That completes everything move_tasks sets in motion: using the per-group movable-load amounts computed beforehand, it picks tasks out of each group in turn and migrates them away. Back in load_balance, the picture so far is: we found the busiest sched group in the domain, then the busiest cpu in that group, and move_tasks migrated a suitable slice of each task group's tasks from that queue over to this_cpu. All that remains is a final inspection of how well that work went.

    if (!ld_moved) {    /* no tasks were moved */
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
            /* more failures than cache_nice_tries+2 (judging by the name,
             * that tunable exists to preserve cache hotness) */
            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {    /* check why:
                    is the task simply not allowed onto this_cpu? */
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)    /* as a last resort, wake migration_thread to
                    move tasks synchronously */
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;    /* retune the balancing period */
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }
What is migration_thread? It turns out every cpu has a migration_thread kernel thread bound to it for exactly this situation; the binding is done simply by setting the cpu mask in the thread's task_struct, which also explains the "can't be moved to this_cpu" case in the code above. What does the thread do? sched.c contains the following function, which the thread executes once forked (a sketch of the creation-and-binding pattern comes first):
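A minimal sketch of that pattern, using the generic kthread API; the function name start_migration_thread and the surrounding details are assumptions for illustration, simplified from the kernel's actual CPU-hotplug callback:

#include <linux/kthread.h>
#include <linux/err.h>

/* assumed helper name; sketch of the usual per-cpu kthread pattern */
static int start_migration_thread(int cpu)
{
    struct task_struct *p;

    p = kthread_create(migration_thread, (void *)(long)cpu,
               "migration/%d", cpu);
    if (IS_ERR(p))
        return PTR_ERR(p);

    kthread_bind(p, cpu);    /* fix its cpus_allowed to this one cpu */
    cpu_rq(cpu)->migration_thread = p;
    wake_up_process(p);
    return 0;
}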

/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 */
static int migration_thread(void *data)
{
    int cpu = (long)data;
    struct rq *rq;

    rq = cpu_rq(cpu);
    BUG_ON(rq->migration_thread != current);

    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        struct migration_req *req;
        struct list_head *head;

        spin_lock_irq(&rq->lock);

        if (cpu_is_offline(cpu)) {
            spin_unlock_irq(&rq->lock);
            goto wait_to_die;
        }

        if (rq->active_balance) {
            active_load_balance(rq, cpu);
            rq->active_balance = 0;
        }

        head = &rq->migration_queue;

        if (list_empty(head)) {
            spin_unlock_irq(&rq->lock);
            schedule();
            set_current_state(TASK_INTERRUPTIBLE);
            continue;
        }
        req = list_entry(head->next, struct migration_req, list);
        list_del_init(head->next);

        spin_unlock(&rq->lock);
        __migrate_task(req->task, cpu, req->dest_cpu);
        local_irq_enable();

        complete(&req->done);
    }
    __set_current_state(TASK_RUNNING);
    return 0;

wait_to_die:
    /* Wait for kthread_stop */
    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        schedule();
        set_current_state(TASK_INTERRUPTIBLE);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
}
In our scenario, execution reaches active_load_balance:

/*
 * active_load_balance is run by migration threads. It pushes running tasks
 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
 * running on each physical CPU where possible, and avoids physical /
 * logical imbalances.
 *
 * Called with busiest_rq locked.
 */
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
    int target_cpu = busiest_rq->push_cpu;
    struct sched_domain *sd;
    struct rq *target_rq;

    /* Is there any task to move? */
    if (busiest_rq->nr_running <= 1)
        return;

    target_rq = cpu_rq(target_cpu);

    /*
     * This condition is "impossible", if it occurs
     * we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-cpu setup.
     */
    BUG_ON(busiest_rq == target_rq);

    /* move a task from busiest_rq to target_rq */
    double_lock_balance(busiest_rq, target_rq);
    update_rq_clock(busiest_rq);
    update_rq_clock(target_rq);

    /* Search for an sd spanning us and the target CPU. */
    for_each_domain(target_cpu, sd) {
        if ((sd->flags & SD_LOAD_BALANCE) &&
            cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                break;
    }

    if (likely(sd)) {
        schedstat_inc(sd, alb_count);

        if (move_one_task(target_rq, target_cpu, busiest_rq,
                  sd, CPU_IDLE))    /* note: move_one_task — only a single task is
                moved, a deliberately lighter touch, since we only got here
                because the normal path was stuck */
            schedstat_inc(sd, alb_pushed);
        else
            schedstat_inc(sd, alb_failed);
    }
    double_unlock_balance(busiest_rq, target_rq);
}

Before the thread was woken, push_cpu had already been set to this_cpu back in load_balance. In other words, a move that could not be made then is retried later, but the destination cpu stays the same. Beyond that, migration_thread also checks whether migration requests have been queued on its runqueue and, if so, carries them out. How do tasks end up on that queue? Chasing it with cscope leads to exec — the requests are queued during the sys_execve system call.

Reposted from: https://www.cnblogs.com/yangce/archive/2012/04/29/2910096.html

