
Process Scheduling in Linux (6)

From here on, we turn to the policies related to load balancing.

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;

    sched_clock_tick();

    spin_lock(&rq->lock);
    update_rq_clock(rq);
    update_cpu_load(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
    rq->idle_at_tick = idle_cpu(cpu);    /* is this cpu's runqueue empty (idle task only)? */
    trigger_load_balance(rq, cpu);
#endif
}

So at the end of every timer-tick handler, the kernel checks whether a round of load balancing is due. Step into trigger_load_balance; the name alone gives a good hint of what it does.

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    /*
     * If we were in the nohz mode recently and busy at the current
     * scheduler tick, then check if we need to nominate new idle
     * load balancer.
     */
    if (rq->in_nohz_recently && !rq->idle_at_tick) {
        rq->in_nohz_recently = 0;

        if (atomic_read(&nohz.load_balancer) == cpu) {
            cpumask_clear_cpu(cpu, nohz.cpu_mask);
            atomic_set(&nohz.load_balancer, -1);
        }

        if (atomic_read(&nohz.load_balancer) == -1) {
            /*
             * simple selection for now: Nominate the
             * first cpu in the nohz list to be the next
             * ilb owner.
             *
             * TBD: Traverse the sched domains and nominate
             * the nearest cpu in the nohz.cpu_mask.
             */
            int ilb = cpumask_first(nohz.cpu_mask);

            if (ilb < nr_cpu_ids)
                resched_cpu(ilb);
        }
    }

    /*
     * If this cpu is idle and doing idle load balancing for all the
     * cpus with ticks stopped, is it time for that to stop?
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
        cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
        resched_cpu(cpu);
        return;
    }

    /*
     * If this cpu is idle and the idle load balancing is done by
     * someone else, then no need raise the SCHED_SOFTIRQ
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
        cpumask_test_cpu(cpu, nohz.cpu_mask))
        return;
#endif
    if (time_after_eq(jiffies, rq->next_balance))
        raise_softirq(SCHED_SOFTIRQ);
}

Setting the CONFIG_NO_HZ parts aside, this function just checks whether jiffies has caught up with rq->next_balance; if so, raise_softirq posts a softirq. Posting is simple: it sets the bit corresponding to SCHED_SOFTIRQ in a pending mask; when softirqs are later processed, each set bit gets its handler invoked. The toy model below illustrates the pattern.
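This is purely illustrative userspace C — the bitmask, table, and names are invented, not the kernel's actual softirq machinery:

#include <stdio.h>

#define SCHED_SOFTIRQ 7                 /* pretend bit index */

static unsigned long pending;           /* the kernel keeps one mask per cpu */
static void (*handlers[32])(void);      /* filled in by open_softirq() */

static void raise_softirq_toy(int nr)
{
    pending |= 1UL << nr;               /* just set the bit; nothing runs yet */
}

static void do_softirq_toy(void)        /* called later, e.g. on irq exit */
{
    for (int nr = 0; nr < 32; nr++) {
        if (!(pending & (1UL << nr)))
            continue;
        pending &= ~(1UL << nr);
        handlers[nr]();                 /* run_rebalance_domains() in our case */
    }
}

static void run_rebalance_toy(void) { puts("rebalance!"); }

int main(void)
{
    handlers[SCHED_SOFTIRQ] = run_rebalance_toy;   /* ~ open_softirq() */
    raise_softirq_toy(SCHED_SOFTIRQ);
    do_softirq_toy();
    return 0;
}

The real implementation adds per-cpu state, ksoftirqd, and locking, but the set-a-bit-then-dispatch idea is the same.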

Searching the source with cscope finds where the handler is registered:

#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

Ha — this softirq is set up specifically for SMP. Following that thread, look at the implementation of run_rebalance_domains.

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ?
                        CPU_IDLE : CPU_NOT_IDLE;

    rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
    /*
     * If this cpu is the owner for idle load balancing, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped.
     */
    if (this_rq->idle_at_tick &&
        atomic_read(&nohz.load_balancer) == this_cpu) {
        struct rq *rq;
        int balance_cpu;

        for_each_cpu(balance_cpu, nohz.cpu_mask) {
            if (balance_cpu == this_cpu)
                continue;

            /*
             * If this cpu gets work to do, stop the load balancing
             * work being done for other cpus. Next load
             * balancing owner will pick it up.
             */
            if (need_resched())
                break;

            rebalance_domains(balance_cpu, CPU_IDLE);

            rq = cpu_rq(balance_cpu);
            if (time_after(this_rq->next_balance, rq->next_balance))
                this_rq->next_balance = rq->next_balance;
        }
    }
#endif
}

Ignoring CONFIG_NO_HZ again, the function simply derives the idle parameter from the current cpu's state (running the idle task or something else) and calls rebalance_domains.

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;
    cpumask_var_t tmp;

    /* Fails alloc? Rebalancing probably not a priority right now. */
    if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
        return;

    for_each_domain(cpu, sd) {    /* for this cpu's sched domain and each of its parents */
        if (!(sd->flags & SD_LOAD_BALANCE))    /* the domain has explicitly opted out
                of load balancing: skip it */
            continue;

        interval = sd->balance_interval;    /* this domain's balancing period */
        if (idle != CPU_IDLE)
            interval *= sd->busy_factor;    /* stretch the period when this cpu is busy */

        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);    /* convert milliseconds to jiffies */
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10)    /* and clamp it */
            interval = HZ*NR_CPUS/10;

        need_serialize = sd->flags & SD_SERIALIZE;

        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {    /* balancing
                really is due now */
            if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */
                idle = CPU_NOT_IDLE;    /* the comment says it all */
            }
            sd->last_balance = jiffies;    /* record when we last balanced */
        }
        if (need_serialize)
            spin_unlock(&balancing);
out:
        if (time_after(next_balance, sd->last_balance + interval)) {    /* set the
                time of the next balancing pass */
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!balance)
            break;
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;

    free_cpumask_var(tmp);
}

The heart of it all is load_balance.
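Before opening it up, a quick numeric check of the interval arithmetic in rebalance_domains; every value below is a hypothetical stand-in for the real domain tunables:

#include <stdio.h>

int main(void)
{
    /* assumed tuning: HZ=1000, NR_CPUS=8,
     * sd->balance_interval = 64 ms, sd->busy_factor = 32 */
    unsigned long hz = 1000, nr_cpus = 8;
    unsigned long interval = 64;

    interval *= 32;                       /* cpu was busy: stretch the period */
    /* msecs_to_jiffies: at HZ=1000, 1 ms == 1 jiffy, so 2048 jiffies */
    if (interval > hz * nr_cpus / 10)     /* clamp at HZ*NR_CPUS/10 */
        interval = hz * nr_cpus / 10;

    printf("next balance in %lu jiffies (%.1fs)\n",
           interval, (double)interval / hz);   /* 800 jiffies = 0.8s */
    return 0;
}

So a busy cpu in this hypothetical domain would attempt a balance pass at most every 0.8 s, while an idle one (no busy_factor applied) would try every 64 ms.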

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance, struct cpumask *cpus)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;

    cpumask_setall(cpus);    /* start with every cpu set in the mask */

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */
    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))    /* on a plain SMP layout,
            SD_SHARE_CPUPOWER never appears */
        sd_idle = 1;

    schedstat_inc(sd, lb_count[idle]);    /* update statistics */

redo:
    update_shares(sd);    /* refresh the shares of every task group taking part
            in this domain */
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance);    /* find the busiest sched group in this domain */

    if (*balance == 0)
        goto out_balanced;

    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;

out:
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}

Here update_shares deserves a closer look.

static void update_shares(struct sched_domain *sd)
{
    u64 now = cpu_clock(raw_smp_processor_id());
    s64 elapsed = now - sd->last_update;

    if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
        /* one more check (rate limiting) before deciding that every task
         * group in this domain really needs its shares refreshed */
        sd->last_update = now;
        walk_tg_tree(tg_nop, tg_shares_up, sd);
    }
}

If the domain's task groups really do need their shares refreshed, walk_tg_tree carries out the update. tg_nop and tg_shares_up are two function pointers: here tg_nop does nothing, while tg_shares_up performs the actual update.

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
    struct task_group *parent, *child;
    int ret;

    rcu_read_lock();
    parent = &root_task_group;
down:
    ret = (*down)(parent, data);
    if (ret)
        goto out_unlock;
    list_for_each_entry_rcu(child, &parent->children, siblings) {
        parent = child;
        goto down;

up:
        continue;
    }
    ret = (*up)(parent, data);
    if (ret)
        goto out_unlock;

    child = parent;
    parent = parent->parent;
    if (parent)
        goto up;

out_unlock:
    rcu_read_unlock();

    return ret;
}
The code is not easy to read; it helps to draw the tree on paper and trace through it by hand. In effect it walks the tree bottom-up and left-to-right, updating each group's shares in turn; the actual update lives in tg_shares_up. The sketch below replays the same visit order in plain C.
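If paper is not handy, this recursive userspace sketch (toy types, not the kernel's) produces exactly the down/up order that walk_tg_tree's goto loop produces:

#include <stdio.h>

struct tg {                                /* toy stand-in for struct task_group */
    const char *name;
    struct tg **children;                  /* NULL-terminated array, not a list */
};

/* down() on first entry, children left to right, up() when leaving for the
 * final time. The kernel's goto version avoids recursion but walks the tree
 * in exactly this order. */
static void walk(struct tg *node, void (*down)(struct tg *),
                 void (*up)(struct tg *))
{
    down(node);
    for (struct tg **c = node->children; c && *c; c++)
        walk(*c, down, up);
    up(node);
}

static void d(struct tg *t) { printf("down(%s)\n", t->name); }
static void u(struct tg *t) { printf("up(%s)\n",   t->name); }

int main(void)
{
    struct tg a = { "A", NULL }, b = { "B", NULL };
    struct tg *kids[] = { &a, &b, NULL };
    struct tg root = { "root", kids };

    walk(&root, d, u);   /* down(root) down(A) up(A) down(B) up(B) up(root) */
    return 0;
}

The up() calls fire bottom-up, children before their parent, which is precisely the property tg_shares_up relies on.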

The kernel's own comment on the update function also spells out the traversal order just described:

/*
 * Re-compute the task group their per cpu shares over the given domain.
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
    unsigned long weight, rq_weight = 0;
    unsigned long shares = 0;
    struct sched_domain *sd = data;
    int i;

    for_each_cpu(i, sched_domain_span(sd)) {    /* for every cpu in this domain */
        /*
         * If there are currently no tasks on the cpu pretend there
         * is one of average load so that when a new task gets to
         * run here it will not get delayed by group starvation.
         */
        weight = tg->cfs_rq[i]->load.weight;    /* sum up this group's runqueue loads
                across the domain's cpus */
        if (!weight)    /* no load on this cpu: some will be handed over; note the
                link with the kernel comment above */
            weight = NICE_0_LOAD;

        tg->cfs_rq[i]->rq_weight = weight;    /* note: this is the cfs_rq's rq_weight */
        rq_weight += weight;    /* running total of rq_weight */
        shares += tg->cfs_rq[i]->shares;    /* sum up the group's shares across the
                domain's cpus as well */
    }

    /* a couple of corrections */
    if ((!shares && rq_weight) || shares > tg->shares)
        shares = tg->shares;

    if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))    /* on plain SMP this
            condition holds */
        shares = tg->shares;    /* so shares becomes simply the group's shares value */

    for_each_cpu(i, sched_domain_span(sd))    /* with the tally done, loop once more —
            this time applying the updates */
        update_group_shares_cpu(tg, i, shares, rq_weight);

    return 0;
}
Now look at update_group_shares_cpu:

/*
 * Calculate and set the cpu's group shares.
 */
/* Note the parameters: tg is the group just reached in the tree walk; cpu is
 * cpu i of the domain; sd_shares is the group's shares value; sd_rq_weight is
 * the sum of the group's runqueue loads across the domain's cpus. */
static void
update_group_shares_cpu(struct task_group *tg, int cpu,
            unsigned long sd_shares, unsigned long sd_rq_weight)
{
    unsigned long shares;
    unsigned long rq_weight;

    if (!tg->se[cpu])
        return;

    rq_weight = tg->cfs_rq[cpu]->rq_weight;

    /*
     *            sd_shares * rq_weight
     * shares = -------------------------
     *               Sum rq_weight
     */
    shares = (sd_shares * rq_weight) / sd_rq_weight;    /* the formula comment
            above says it clearly */
    shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);    /* clamp
            into range */

    if (abs(shares - tg->se[cpu]->load.weight) >
            sysctl_sched_shares_thresh) {    /* to avoid churning, only apply the
            update when the change exceeds a tunable threshold */
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        spin_lock_irqsave(&rq->lock, flags);
        tg->cfs_rq[cpu]->shares = shares;

        __set_se_shares(tg->se[cpu], shares);    /* the shares value ultimately
                lands in the "se" (sched entity) */
        spin_unlock_irqrestore(&rq->lock, flags);
    }
}

Note what the formula means: the cpus inside the group split the group's shares among themselves, in proportion to load — the cpu carrying the larger fraction of the group's total load gets the larger slice of shares, hence higher weight and more runtime when tasks are picked (this ties back to the pick_next_task code covered earlier). A small worked example follows, and then __set_se_shares.
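A quick worked example of the split, with invented numbers:

#include <stdio.h>

int main(void)
{
    /* hypothetical two-cpu domain: the group has tg->shares = 1024,
     * runqueue weight 3072 on cpu0 and 1024 on cpu1 */
    unsigned long sd_shares = 1024;
    unsigned long rq_weight[2] = { 3072, 1024 };
    unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];   /* 4096 */

    for (int cpu = 0; cpu < 2; cpu++)
        printf("cpu%d gets %lu shares\n", cpu,
               sd_shares * rq_weight[cpu] / sd_rq_weight);
    /* cpu0 gets 768, cpu1 gets 256: the busier cpu's entity ends up
     * with three times the weight, matching its share of the load */
    return 0;
}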

static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
    struct cfs_rq *cfs_rq = se->cfs_rq;
    int on_rq;

    on_rq = se->on_rq;
    if (on_rq)
        dequeue_entity(cfs_rq, se, 0);

    se->load.weight = shares;
    se->load.inv_weight = 0;

    if (on_rq)
        enqueue_entity(cfs_rq, se, 0);
}

Easy to follow: take the entity off the runqueue, update its weight, put it back. That completes the analysis of update_shares. Note when it runs: a balance pass has already been deemed necessary, but nothing has yet been decided about how to balance. Refreshing the load picture of every group in the domain first helps the upcoming selection of a sched group and of tasks. Now return to load_balance and continue downward.

redo:
    update_shares(sd);    /* refresh the shares of every task group taking part
            in this domain */
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance);    /* find the busiest sched group in this domain */

    if (*balance == 0)
        goto out_balanced;

    if (!group) {    /* nobody is busy enough: no balancing needed */
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);    /* find the
            busiest runqueue in that group, i.e. the busiest cpu */
    if (!busiest) {    /* no cpu meets the bar: nothing to do either */
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);    /* update statistics */

    ld_moved = 0;    /* flag: did we move any tasks? */
    if (busiest->nr_running > 1) {    /* only consider moving tasks when more than
            one is runnable; as the comment notes, if there is no more than one,
            moving it away would leave zero — still unbalanced — so don't bother */
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,    /* guided by the
                imbalance argument, pick tasks off the busiest queue and move
                them to this_rq */
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }
Look first at find_busiest_group; it is fairly long.

/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
           unsigned long *imbalance, enum cpu_idle_type idle,
           int *sd_idle, const struct cpumask *cpus, int *balance)
{
    struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
    unsigned long max_load, avg_load, total_load, this_load, total_pwr;
    unsigned long max_pull;
    unsigned long busiest_load_per_task, busiest_nr_running;
    unsigned long this_load_per_task, this_nr_running;
    int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    int power_savings_balance = 1;
    unsigned long leader_nr_running = 0, min_load_per_task = 0;
    unsigned long min_nr_running = ULONG_MAX;
    struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

    max_load = this_load = total_load = total_pwr = 0;
    busiest_load_per_task = busiest_nr_running = 0;
    this_load_per_task = this_nr_running = 0;

    /* first derive load_idx from the idle argument passed in; it is a key
     * index below when hunting for the busiest sched group (not task group) */
    if (idle == CPU_NOT_IDLE)
        load_idx = sd->busy_idx;        /* busy_idx defaults to 3 */
    else if (idle == CPU_NEWLY_IDLE)
        load_idx = sd->newidle_idx;     /* newidle_idx is 2 */
    else
        load_idx = sd->idle_idx;        /* idle_idx is 1 */

    /* everything from here to while (group != sd->groups) is one big loop that
     * visits every sched group in the domain to find the busiest one; the group
     * containing this_cpu does not compete against the others */
    do {
        unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
        int local_group;
        int i;
        int __group_imb = 0;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long sum_nr_running, sum_weighted_load;
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;

        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group));    /* set local_group if
                this_cpu turns out to belong to the current group */

        if (local_group)
            balance_cpu = cpumask_first(sched_group_cpus(group));    /* when
                handling the "local group", tentatively make its first cpu
                the balance_cpu */

        /* Tally up the load of all CPUs in the group */
        sum_weighted_load = sum_nr_running = avg_load = 0;
        sum_avg_load_per_task = avg_load_per_task = 0;

        max_cpu_load = 0;
        min_cpu_load = ~0UL;

        for_each_cpu_and(i, sched_group_cpus(group), cpus) {    /* for every
                cpu in the group */
            struct rq *rq = cpu_rq(i);

            if (*sd_idle && rq->nr_running)
                *sd_idle = 0;

            /* Bias balancing toward cpus of our domain */
            if (local_group) {    /* local group, cpu i is idle, and balance_cpu
                    has not been corrected yet this loop */
                if (idle_cpu(i) && !first_idle_cpu) {
                    first_idle_cpu = 1;
                    balance_cpu = i;    /* so the logic is: if the local group
                        has an idle cpu, the first idle one becomes balance_cpu;
                        otherwise the group's first cpu keeps the job */
                }

                load = target_load(i, load_idx);    /* accumulate the group's
                        load, weighted according to the load_idx chosen above */
            } else {    /* not the local group */
                load = source_load(i, load_idx);    /* same as above */
                if (load > max_cpu_load)    /* track the largest and smallest
                        per-cpu load seen so far */
                    max_cpu_load = load;
                if (min_cpu_load > load)
                    min_cpu_load = load;
            }

            avg_load += load;    /* sum of the loads computed via load_idx */
            sum_nr_running += rq->nr_running;    /* total runnable tasks across
                    the group's cpus */
            sum_weighted_load += weighted_cpuload(i);    /* the group's *current*
                    total load; unlike avg_load, no history is mixed in via
                    load_idx */
            sum_avg_load_per_task += cpu_avg_load_per_task(i);    /* average task
                    load on cpu i */
        }

        /*
         * First idle cpu or the first cpu(busiest) in this sched group
         * is eligible for doing load balancing at this and above
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
        if (idle != CPU_NEWLY_IDLE && local_group &&
            balance_cpu != this_cpu && balance) {
            *balance = 0;
            goto ret;
        }

        total_load += avg_load;    /* the domain's total load */
        total_pwr += group->__cpu_power;    /* (I have not yet pinned down exactly
                what __cpu_power is) */

        /* Adjust by relative CPU power of the group */
        avg_load = sg_div_cpu_power(group,
                avg_load * SCHED_LOAD_SCALE);    /* derive the group's final
                avg_load from the raw sum and its power */

        /*
         * Consider the group unbalanced when the imbalance is larger
         * than the average weight of two tasks.
         *
         * APZ: with cgroup the avg task weight can vary wildly and
         *      might not be a suitable number - should we keep a
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
        avg_load_per_task = sg_div_cpu_power(group,
                sum_avg_load_per_task * SCHED_LOAD_SCALE);    /* likewise correct
                the group's avg_load_per_task */

        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)    /* if the gap
                between the group's most and least loaded cpus exceeds twice the
                per-task average, set __group_imb (imbalance); its use appears
                below */
            __group_imb = 1;

        group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

        if (local_group) {    /* for the local group only the this_* variables
                are updated; busiest is never pointed at it */
            this_load = avg_load;
            this = group;
            this_nr_running = sum_nr_running;
            this_load_per_task = sum_weighted_load;
        } else if (avg_load > max_load &&
               (sum_nr_running > group_capacity || __group_imb)) {    /* the group
                is internally imbalanced, or it holds more tasks than its
                capacity, and its average load tops the largest seen so far */
            max_load = avg_load;    /* new maximum */
            busiest = group;    /* repoint busiest */
            busiest_nr_running = sum_nr_running;    /* task count of the busiest group */
            busiest_load_per_task = sum_weighted_load;
            group_imb = __group_imb;
        }

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        /* the SMT/MC power-savings path is not analyzed in this walkthrough, so
         * skim past this long stretch to the end of the do-while loop */
        /*
         * Busy processors will not participate in power savings
         * balance.
         */
        if (idle == CPU_NOT_IDLE ||
                !(sd->flags & SD_POWERSAVINGS_BALANCE))
            goto group_next;

        /*
         * If the local group is idle or completely loaded
         * no need to do power savings balance at this domain
         */
        if (local_group && (this_nr_running >= group_capacity ||
                    !this_nr_running))
            power_savings_balance = 0;

        /*
         * If a group is already running at full capacity or idle,
         * don't include that group in power savings calculations
         */
        if (!power_savings_balance || sum_nr_running >= group_capacity
            || !sum_nr_running)
            goto group_next;

        /*
         * Calculate the group which has the least non-idle load.
         * This is the group from where we need to pick up the load
         * for saving power
         */
        if ((sum_nr_running < min_nr_running) ||
            (sum_nr_running == min_nr_running &&
             cpumask_first(sched_group_cpus(group)) <
             cpumask_first(sched_group_cpus(group_min)))) {
            group_min = group;
            min_nr_running = sum_nr_running;
            min_load_per_task = sum_weighted_load /
                        sum_nr_running;
        }

        /*
         * Calculate the group which is almost near its
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
        if (sum_nr_running <= group_capacity - 1) {
            if (sum_nr_running > leader_nr_running ||
                (sum_nr_running == leader_nr_running &&
                 cpumask_first(sched_group_cpus(group)) >
                 cpumask_first(sched_group_cpus(group_leader)))) {
                group_leader = group;
                leader_nr_running = sum_nr_running;
            }
        }
group_next:
#endif
        group = group->next;
    } while (group != sd->groups);    /* at this point every group in the domain
            has been visited; if any qualified as busiest, busiest points at it */

    if (!busiest || this_load >= max_load || busiest_nr_running == 0)    /* nothing
            qualified, or the local group is even busier than the winner, or the
            busiest group has no tasks left: no balancing needed */
        goto out_balanced;

    avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

    if (this_load >= avg_load ||
            100*max_load <= sd->imbalance_pct*this_load)    /* two more no-balance
            cases: the local group's load is above the domain average, or the
            ratio of max_load to our load is under the imbalance_pct threshold */
        goto out_balanced;

    busiest_load_per_task /= busiest_nr_running;    /* average load per task in
            the busiest group */
    if (group_imb)    /* if that group's max/min per-cpu gap exceeded twice its
            per-task average, rein the figure in */
        busiest_load_per_task = min(busiest_load_per_task, avg_load);

    /*
     * We're trying to get all the cpus to the average_load, so we don't
     * want to push ourselves above the average load, nor do we wish to
     * reduce the max loaded cpu below the average load, as either of these
     * actions would just result in more rebalancing later, and ping-pong
     * tasks around. Thus we look for the minimum possible imbalance.
     * Negative imbalances (*we* are more loaded than anyone else) will
     * be counted as no imbalance for these purposes -- we can't fix that
     * by pulling tasks to us. Be careful of negative numbers as they'll
     * appear as very large values with unsigned longs.
     */
    if (max_load <= busiest_load_per_task)
        goto out_balanced;

    /*
     * In the presence of smp nice balancing, certain scenarios can have
     * max load less than avg load(as we skip the groups at or below
     * its cpu_power, while calculating max_load..)
     */
    if (max_load < avg_load) {
        *imbalance = 0;
        goto small_imbalance;
    }

    /* Don't want to pull so many tasks that a group would go idle */
    max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

    /* How much load to actually move to equalise the imbalance */
    *imbalance = min(max_pull * busiest->__cpu_power,
                (avg_load - this_load) * this->__cpu_power)
            / SCHED_LOAD_SCALE;    /* compute how much load needs moving; what
            follows is fine-grained tuning whose rationale is hard to state
            cleanly, so it is not analyzed further here */

    /*
     * if *imbalance is less than the average load per runnable task
     * there is no gaurantee that any tasks will be moved so we'll have
     * a think about bumping its value to force at least one task to be
     * moved
     */
    if (*imbalance < busiest_load_per_task) {
        unsigned long tmp, pwr_now, pwr_move;
        unsigned int imbn;

small_imbalance:
        pwr_move = pwr_now = 0;
        imbn = 2;
        if (this_nr_running) {
            this_load_per_task /= this_nr_running;
            if (busiest_load_per_task > this_load_per_task)
                imbn = 1;
        } else
            this_load_per_task = cpu_avg_load_per_task(this_cpu);

        if (max_load - this_load + busiest_load_per_task >=
                    busiest_load_per_task * imbn) {
            *imbalance = busiest_load_per_task;
            return busiest;
        }

        /*
         * OK, we don't have enough imbalance to justify moving tasks,
         * however we may be able to increase total CPU power used by
         * moving them.
         */

        pwr_now += busiest->__cpu_power *
                min(busiest_load_per_task, max_load);
        pwr_now += this->__cpu_power *
                min(this_load_per_task, this_load);
        pwr_now /= SCHED_LOAD_SCALE;

        /* Amount of load we'd subtract */
        tmp = sg_div_cpu_power(busiest,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        if (max_load > tmp)
            pwr_move += busiest->__cpu_power *
                min(busiest_load_per_task, max_load - tmp);

        /* Amount of load we'd add */
        if (max_load * busiest->__cpu_power <
                busiest_load_per_task * SCHED_LOAD_SCALE)
            tmp = sg_div_cpu_power(this,
                    max_load * busiest->__cpu_power);
        else
            tmp = sg_div_cpu_power(this,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        pwr_move += this->__cpu_power *
                min(this_load_per_task, this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;

        /* Move if we gain throughput */
        if (pwr_move > pwr_now)
            *imbalance = busiest_load_per_task;
    }

    return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
        goto ret;

    if (this == group_leader && group_leader != group_min) {
        *imbalance = min_load_per_task;
        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
            cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
                cpumask_first(sched_group_cpus(group_leader));
        }
        return group_min;
    }
#endif
ret:
    *imbalance = 0;
    return NULL;
}
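Before moving on, here is the main imbalance computation above run with invented numbers — two single-cpu groups of equal __cpu_power, so the domain average is just the mean:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    /* hypothetical domain: busiest group carries 3072 of weighted load,
     * ours carries 1024; both have __cpu_power = 1024 */
    unsigned long this_load = 1024, max_load = 3072;
    unsigned long avg_load = (this_load + max_load) / 2;   /* 2048 */
    unsigned long busiest_load_per_task = 1024;            /* three nice-0 tasks */
    unsigned long power = 1024;

    unsigned long max_pull = min_ul(max_load - avg_load,
                                    max_load - busiest_load_per_task);
    unsigned long imbalance = min_ul(max_pull * power,
                                     (avg_load - this_load) * power)
                                    / SCHED_LOAD_SCALE;

    printf("move %lu of weighted load\n", imbalance);   /* 1024 */
    return 0;
}

Moving 1024 of weighted load — one nice-0 task — leaves both runqueues at 2048, exactly the average the kernel comment says the code aims for.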
Following load_balance's call path, the next stop is find_busiest_queue, which is much easier to digest.

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
           unsigned long imbalance, const struct cpumask *cpus)
{
    struct rq *busiest = NULL, *rq;
    unsigned long max_load = 0;
    int i;

    for_each_cpu(i, sched_group_cpus(group)) {
        unsigned long wl;

        if (!cpumask_test_cpu(i, cpus))    /* this cpu is not eligible in the
                current pass */
            continue;

        rq = cpu_rq(i);
        wl = weighted_cpuload(i);

        if (rq->nr_running == 1 && wl > imbalance)    /* a single task whose load
                already exceeds what we want to move */
            continue;

        if (wl > max_load) {    /* track the maximum and the busiest-queue pointer */
            max_load = wl;
            busiest = rq;
        }
    }

    return busiest;
}
Returning once more to load_balance's call path — now the actual move can finally happen.

    busiest = find_busiest_queue(group, idle, imbalance, cpus);    /* find the
            busiest runqueue in the group, i.e. the busiest cpu */
    if (!busiest) {    /* no cpu meets the bar: no balancing needed */
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);    /* update statistics */

    ld_moved = 0;    /* flag: did we move any tasks? */
    if (busiest->nr_running > 1) {    /* only consider moving tasks when more than
            one is runnable; as the comment notes, moving away the only task
            leaves zero — still unbalanced — so don't bother */
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);    /* lock both queues at once; to avoid
                deadlock they are always taken in order of ascending pointer
                address */
        ld_moved = move_tasks(this_rq, this_cpu, busiest,    /* guided by the
                imbalance argument, pick tasks off the busiest queue and move
                them to this_rq */
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }
With both queues locked, execution enters move_tasks.

/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
              unsigned long max_load_move,
              struct sched_domain *sd, enum cpu_idle_type idle,
              int *all_pinned)
{
    const struct sched_class *class = sched_class_highest;
    unsigned long total_load_moved = 0;
    int this_best_prio = this_rq->curr->prio;

    do {
        total_load_moved +=
            class->load_balance(this_rq, this_cpu, busiest,
                        max_load_move - total_load_moved,
                        sd, idle, all_pinned, &this_best_prio);
        class = class->next;

        if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
            break;
    } while (class && max_load_move > total_load_moved);

    return total_load_moved > 0;
}
The function is essentially one do-while loop. class starts out as sched_class_highest, and sched.c contains:

#define sched_class_highest (&rt_sched_class)
In other words, the first pass of the loop calls the load_balance hook of rt_sched_class. Searching sched_rt.c turns up:

.load_balance = load_balance_rt,

static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move,
        struct sched_domain *sd, enum cpu_idle_type idle,
        int *all_pinned, int *this_best_prio)
{
    /* don't touch RT tasks */
    return 0;
}
So this is an empty stub: load balancing never migrates rt-class tasks. The loop therefore advances to the next scheduling class — the CFS class — and runs its load_balance hook, shown after the sketch below.
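The dispatch here is nothing exotic: the scheduling classes form a singly linked list of method tables, and move_tasks walks it until enough load has moved. A toy model (all types and numbers invented, not the kernel's):

#include <stdio.h>

struct sched_class_toy {
    const char *name;
    const struct sched_class_toy *next;
    unsigned long (*load_balance)(unsigned long max_load_move);
};

static unsigned long lb_rt(unsigned long max)   { (void)max; return 0; }  /* never moves rt */
static unsigned long lb_fair(unsigned long max) { return max; }           /* pretends to move it all */
static unsigned long lb_idle(unsigned long max) { (void)max; return 0; }

static const struct sched_class_toy idle_c = { "idle", NULL,    lb_idle };
static const struct sched_class_toy fair_c = { "fair", &idle_c, lb_fair };
static const struct sched_class_toy rt_c   = { "rt",   &fair_c, lb_rt   };

int main(void)
{
    unsigned long moved = 0, want = 100;

    for (const struct sched_class_toy *c = &rt_c;
         c && moved < want; c = c->next)
        moved += c->load_balance(want - moved);

    printf("moved %lu\n", moved);   /* rt contributes 0, fair the rest */
    return 0;
}

Exactly as load_balance_rt above returns 0, the rt class contributes nothing here, so the fair class ends up doing the work.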


#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move,
          struct sched_domain *sd, enum cpu_idle_type idle,
          int *all_pinned, int *this_best_prio)
{
    long rem_load_move = max_load_move;    /* rem_load_move = remaining load to move */
    int busiest_cpu = cpu_of(busiest);    /* the cpu owning the busiest queue */
    struct task_group *tg;

    rcu_read_lock();
    update_h_load(busiest_cpu);    /* refresh first */

    list_for_each_entry_rcu(tg, &task_groups, list) {
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;
        u64 rem_load, moved_load;

        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1);

        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]);

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load;
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;
}

update_h_load closely resembles the shares-updating function we saw earlier:

static void update_h_load(long cpu)
{
    walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
As noted earlier, tg_nop is a no-op; look at tg_load_down:

/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
static int tg_load_down(struct task_group *tg, void *data)    /* the comment above
        says it reasonably well */
{
    unsigned long load;
    long cpu = (long)data;

    if (!tg->parent) {
        load = cpu_rq(cpu)->load.weight;
    } else {
        load = tg->parent->cfs_rq[cpu]->h_load;    /* the load the parent level
                must move */
        load *= tg->cfs_rq[cpu]->shares;    /* this value is effectively this
                level's load weight */
        load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
    }

    tg->cfs_rq[cpu]->h_load = load;

    return 0;
}

Worked out, this says: the load this group must move = (the parent group's movable load × this group's shares) ÷ the parent group's load weight. Put plainly, the amount is apportioned by load ratio. Once each group's movable load on this queue is known, actual tasks can be picked from each group. Back to load_balance_fair — a small numeric example first.
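With invented numbers, the apportioning looks like this:

#include <stdio.h>

int main(void)
{
    /* hypothetical hierarchy on one cpu: the root rq carries a load
     * weight of 2048; a child group has shares 512 on this cpu and its
     * parent cfs_rq's load weight is 2048 */
    unsigned long root_h_load   = 2048;   /* the !tg->parent case */
    unsigned long child_shares  = 512;
    unsigned long parent_weight = 2048;

    unsigned long h_load = root_h_load * child_shares / (parent_weight + 1);
    printf("child h_load = %lu\n", h_load);   /* ~511: a quarter of the load,
                                                 matching its quarter of the weight */
    return 0;
}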

    update_h_load(busiest_cpu);    /* refresh first */

    list_for_each_entry_rcu(tg, &task_groups, list) {    /* for each group's
            runqueue on this cpu */
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;    /* the movable
                load update_h_load just computed for this group */
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;    /* the
                group's load */
        u64 rem_load, moved_load;

        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1);    /* rem_load =
                rem_load_move * busiest_weight / (busiest_h_load + 1) */

        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]);    /* the real action happens here */

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load;    /* one group done: feed the result back
                and see whether tasks must still be moved from the next group */
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;
__load_balance_fair looks like this:

static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move, struct sched_domain *sd,
        enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
        struct cfs_rq *cfs_rq)
{
    struct rq_iterator cfs_rq_iterator;

    cfs_rq_iterator.start = load_balance_start_fair;
    cfs_rq_iterator.next = load_balance_next_fair;
    cfs_rq_iterator.arg = cfs_rq;

    return balance_tasks(this_rq, this_cpu, busiest,
            max_load_move, sd, idle, all_pinned,
            this_best_prio, &cfs_rq_iterator);
}
From there we still need to go into balance_tasks:

static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move, struct sched_domain *sd,
          enum cpu_idle_type idle, int *all_pinned,
          int *this_best_prio, struct rq_iterator *iterator)
{
    int loops = 0, pulled = 0, pinned = 0;
    struct task_struct *p;
    long rem_load_move = max_load_move;

    if (max_load_move == 0)
        goto out;

    pinned = 1;

    /*
     * Start the load-balancing iterator:
     */
    p = iterator->start(iterator->arg);
next:
    if (!p || loops++ > sysctl_sched_nr_migrate)
        goto out;

    if ((p->se.load.weight >> 1) > rem_load_move ||
        !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
        /* skip this task if its weight exceeds twice the load still to be
         * moved, and likewise if it cannot be migrated at all */
        p = iterator->next(iterator->arg);
        goto next;
    }

    pull_task(busiest, p, this_rq, this_cpu);    /* ok to move: pull p onto
            this_cpu's this_rq */
    pulled++;    /* one more task moved */
    rem_load_move -= p->se.load.weight;    /* that much less load left to move */

    /*
     * We only want to steal up to the prescribed amount of weighted load.
     */
    if (rem_load_move > 0) {
        if (p->prio < *this_best_prio)
            *this_best_prio = p->prio;
        p = iterator->next(iterator->arg);
        goto next;
    }
out:
    /*
     * Right now, this is one of only two places pull_task() is called,
     * so we can safely collect pull_task() stats here rather than
     * inside pull_task().
     */
    schedstat_add(sd, lb_gained[idle], pulled);    /* statistics */

    if (all_pinned)
        *all_pinned = pinned;

    return max_load_move - rem_load_move;
}

A quick worked number on the skip rule follows; after it, can_migrate_task.
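The (p->se.load.weight >> 1) > rem_load_move test skips any task whose weight is more than double what still needs moving. With invented numbers:

#include <stdio.h>

int main(void)
{
    unsigned long rem_load_move = 400;          /* still want to move this much */
    unsigned long weights[] = { 1024, 335 };    /* a nice-0 and a nice-5 task */

    for (int i = 0; i < 2; i++)
        printf("weight %4lu: %s\n", weights[i],
               (weights[i] >> 1) > rem_load_move
               ? "skipped (would overshoot too far)"
               : "eligible to pull");
    return 0;
}

The nice-0 task (half-weight 512 > 400) is passed over, while the lighter nice-5 task (half-weight 167) is a candidate — overshooting by less than half a task's weight is tolerated, more is not.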

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
             struct sched_domain *sd, enum cpu_idle_type idle,
             int *all_pinned)
{
    /*
     * We do not migrate tasks that are:
     * 1) running (obviously), or
     * 2) cannot be migrated to this CPU due to cpus_allowed, or
     * 3) are cache-hot on their current CPU.
     */
    if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
        schedstat_inc(p, se.nr_failed_migrations_affine);
        return 0;
    }
    *all_pinned = 0;

    if (task_running(rq, p)) {
        schedstat_inc(p, se.nr_failed_migrations_running);
        return 0;
    }

    /*
     * Aggressive migration if:
     * 1) task is cache cold, or
     * 2) too many balance attempts have failed.
     */
    if (!task_hot(p, rq->clock, sd) ||
            sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
        if (task_hot(p, rq->clock, sd)) {
            schedstat_inc(sd, lb_hot_gained[idle]);
            schedstat_inc(p, se.nr_forced_migrations);
        }
#endif
        return 1;
    }

    if (task_hot(p, rq->clock, sd)) {
        schedstat_inc(p, se.nr_failed_migrations_hot);
        return 0;
    }
    return 1;
}
The kernel's comments here are unusually thorough, so no further explanation is needed. Moving on, let's look at pull_task.

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
              struct rq *this_rq, int this_cpu)    /* the comment above says it all */
{
    deactivate_task(src_rq, p, 0);    /* take p off the source queue */
    set_task_cpu(p, this_cpu);    /* repoint p's fields at this_cpu; it is not on
            the new runqueue yet */
    activate_task(this_rq, p, 0);    /* the final step: put p on this_rq */
    /*
     * Note that idle threads have a prio of MAX_PRIO, for this test
     * to be always true for them.
     */
    check_preempt_curr(this_rq, p, 0);
}
The set_task_cpu function:


void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
    int old_cpu = task_cpu(p);
    struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
    struct cfs_rq *old_cfsrq = task_cfs_rq(p),
              *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
    u64 clock_offset;

    clock_offset = old_rq->clock - new_rq->clock;

    trace_sched_migrate_task(p, task_cpu(p), new_cpu);

#ifdef CONFIG_SCHEDSTATS
    if (p->se.wait_start)
        p->se.wait_start -= clock_offset;
    if (p->se.sleep_start)
        p->se.sleep_start -= clock_offset;
    if (p->se.block_start)
        p->se.block_start -= clock_offset;
    if (old_cpu != new_cpu) {
        schedstat_inc(p, se.nr_migrations);
        if (task_hot(p, old_rq->clock, NULL))
            schedstat_inc(p, se.nr_forced2_migrations);
    }
#endif
    p->se.vruntime -= old_cfsrq->min_vruntime -
                     new_cfsrq->min_vruntime;

    __set_task_cpu(p, new_cpu);
}
__set_task_cpu:

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
    set_task_rq(p, cpu);
#ifdef CONFIG_SMP
    /*
     * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
     * successfuly executed on another CPU. We must ensure that updates of
     * per-task data have been completed by this moment.
     */
    smp_wmb();
    task_thread_info(p)->cpu = cpu;
#endif
}
set_task_rq:

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
    p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
    p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
    p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
    p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

So a task stays in whatever group it belonged to; moving it merely places it on that group's runqueue on another cpu. That completes everything move_tasks sets in motion: using the per-group movable-load amounts computed beforehand, it picks tasks out of each group in turn and migrates them away. Back in load_balance, the picture so far is: we found the busiest sched group in the domain, then the busiest cpu in that group, and move_tasks migrated a suitable slice of each task group's tasks from that queue over to this_cpu. All that remains is a final inspection of how well that work went.

    if (!ld_moved) {    /* no tasks were moved */
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
            /* more failures than cache_nice_tries+2 (judging by the name,
             * that tunable exists to preserve cache hotness) */
            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {    /* check why:
                    is the task simply not allowed onto this_cpu? */
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)    /* as a last resort, wake migration_thread to
                    move tasks synchronously */
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;    /* retune the balancing period */
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }
What is migration_thread? It turns out every cpu has a migration_thread kernel thread bound to it for exactly this situation; the binding is done simply by setting the cpu mask in the thread's task_struct, which also explains the "can't be moved to this_cpu" case in the code above. What does the thread do? sched.c contains the following function, which the thread executes once forked (a sketch of the creation-and-binding pattern comes first):
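A minimal sketch of that pattern, using the generic kthread API; the function name start_migration_thread and the surrounding details are assumptions for illustration, simplified from the kernel's actual CPU-hotplug callback:

#include <linux/kthread.h>
#include <linux/err.h>

/* assumed helper name; sketch of the usual per-cpu kthread pattern */
static int start_migration_thread(int cpu)
{
    struct task_struct *p;

    p = kthread_create(migration_thread, (void *)(long)cpu,
               "migration/%d", cpu);
    if (IS_ERR(p))
        return PTR_ERR(p);

    kthread_bind(p, cpu);    /* fix its cpus_allowed to this one cpu */
    cpu_rq(cpu)->migration_thread = p;
    wake_up_process(p);
    return 0;
}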

/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 */
static int migration_thread(void *data)
{
    int cpu = (long)data;
    struct rq *rq;

    rq = cpu_rq(cpu);
    BUG_ON(rq->migration_thread != current);

    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        struct migration_req *req;
        struct list_head *head;

        spin_lock_irq(&rq->lock);

        if (cpu_is_offline(cpu)) {
            spin_unlock_irq(&rq->lock);
            goto wait_to_die;
        }

        if (rq->active_balance) {
            active_load_balance(rq, cpu);
            rq->active_balance = 0;
        }

        head = &rq->migration_queue;

        if (list_empty(head)) {
            spin_unlock_irq(&rq->lock);
            schedule();
            set_current_state(TASK_INTERRUPTIBLE);
            continue;
        }
        req = list_entry(head->next, struct migration_req, list);
        list_del_init(head->next);

        spin_unlock(&rq->lock);
        __migrate_task(req->task, cpu, req->dest_cpu);
        local_irq_enable();

        complete(&req->done);
    }
    __set_current_state(TASK_RUNNING);
    return 0;

wait_to_die:
    /* Wait for kthread_stop */
    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        schedule();
        set_current_state(TASK_INTERRUPTIBLE);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
}
In our scenario, execution reaches active_load_balance:

/*
 * active_load_balance is run by migration threads. It pushes running tasks
 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
 * running on each physical CPU where possible, and avoids physical /
 * logical imbalances.
 *
 * Called with busiest_rq locked.
 */
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
    int target_cpu = busiest_rq->push_cpu;
    struct sched_domain *sd;
    struct rq *target_rq;

    /* Is there any task to move? */
    if (busiest_rq->nr_running <= 1)
        return;

    target_rq = cpu_rq(target_cpu);

    /*
     * This condition is "impossible", if it occurs
     * we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-cpu setup.
     */
    BUG_ON(busiest_rq == target_rq);

    /* move a task from busiest_rq to target_rq */
    double_lock_balance(busiest_rq, target_rq);
    update_rq_clock(busiest_rq);
    update_rq_clock(target_rq);

    /* Search for an sd spanning us and the target CPU. */
    for_each_domain(target_cpu, sd) {
        if ((sd->flags & SD_LOAD_BALANCE) &&
            cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                break;
    }

    if (likely(sd)) {
        schedstat_inc(sd, alb_count);

        if (move_one_task(target_rq, target_cpu, busiest_rq,
                  sd, CPU_IDLE))    /* note: move_one_task — only a single task is
                moved, a deliberately lighter touch, since we only got here
                because the normal path was stuck */
            schedstat_inc(sd, alb_pushed);
        else
            schedstat_inc(sd, alb_failed);
    }
    double_unlock_balance(busiest_rq, target_rq);
}

Before the thread was woken, push_cpu had already been set to this_cpu back in load_balance. In other words, a move that could not be made then is retried later, but the destination cpu stays the same. Beyond that, migration_thread also checks whether migration requests have been queued on its runqueue and, if so, carries them out. How do tasks end up on that queue? Chasing it with cscope leads to exec — the requests are queued during the sys_execve system call.

Reposted from: https://www.cnblogs.com/yangce/archive/2012/04/29/2910096.html

