[kernel/f14/master] Scheduler fixes for Bugzilla #635813 and #633037

Chuck Ebbert cebbert at fedoraproject.org
Tue Sep 21 13:28:24 UTC 2010


commit b218718b2b345b73aa274b30f83f544b1e5d9817
Author: Chuck Ebbert <cebbert at redhat.com>
Date:   Tue Sep 21 09:27:25 2010 -0400

    Scheduler fixes for Bugzilla #635813 and #633037

 kernel.spec                                        |   20 +-
 ...rectly-accounted-as-system-time-on-32-bit.patch |   55 ++
 ...ffect-of-tickless-idle-on-update_cpu_load.patch |  276 +++++++++
 ...z-idle-load-balancing-logic-to-push-model.patch |  651 ++++++++++++++++++++
 ...15-update-rq-clock-for-nohz-balanced-cpus.patch |   28 +
 ...lock-synchronization-when-migrating-tasks.patch |   38 ++
 ...-move-sched_avg_update-to-update_cpu_load.patch |   58 ++
 7 files changed, 1125 insertions(+), 1 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index 316d1c0..62004ac 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -742,6 +742,13 @@ Patch12540: irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch
 Patch12550: keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch
 Patch12551: keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch
 
+Patch12560: sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
+Patch12565: sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
+Patch12570: sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
+Patch12575: sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
+Patch12580: sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
+Patch12585: sched-25-move-sched_avg_update-to-update_cpu_load.patch
+
 %endif
 
 BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1371,6 +1378,14 @@ ApplyPatch irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch
 ApplyPatch keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch
 ApplyPatch keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch
 
+# Scheduler fixes (#635813 and #633037)
+ApplyPatch sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
+ApplyPatch sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
+ApplyPatch sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
+ApplyPatch sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
+ApplyPatch sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
+ApplyPatch sched-25-move-sched_avg_update-to-update_cpu_load.patch
+
 # END OF PATCH APPLICATIONS
 
 %endif
@@ -1957,7 +1972,10 @@ fi
 # and build.
 
 %changelog
-* Mon Sep 20 2010 Chuck Ebbert <cebbert at redhat.com> 2.6.35.5-29
+* Tue Sep 21 2010 Chuck Ebbert <cebbert at redhat.com> 2.6.35.5-29
+- Scheduler fixes for Bugzilla #635813 and #633037
+
+* Mon Sep 20 2010 Chuck Ebbert <cebbert at redhat.com>
 - Linux 2.6.35.5
 - Drop merged patches:
   01-compat-make-compat_alloc_user_space-incorporate-the-access_ok-check.patch
diff --git a/sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch b/sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
new file mode 100644
index 0000000..7c15122
--- /dev/null
+++ b/sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
@@ -0,0 +1,55 @@
+From: Stanislaw Gruszka <sgruszka at redhat.com>
+Date: Tue, 14 Sep 2010 14:35:14 +0000 (+0200)
+Subject: sched: Fix user time incorrectly accounted as system time on 32-bit
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fx86%2Flinux-2.6-tip.git;a=commitdiff_plain;h=e75e863dd5c7d96b91ebbd241da5328fc38a78cc
+
+sched: Fix user time incorrectly accounted as system time on 32-bit
+
+We have a possible 32-bit variable overflow in the multiplication in
+the task_times() and thread_group_times() functions. When the
+overflow happens, the scaled utime value becomes erroneously
+small and the scaled stime erroneously big.
+
+Reported here:
+
+ https://bugzilla.redhat.com/show_bug.cgi?id=633037
+ https://bugzilla.kernel.org/show_bug.cgi?id=16559
+
+Reported-by: Michael Chapman <redhat-bugzilla at very.puzzling.org>
+Reported-by: Ciriaco Garcia de Celis <sysman at etherpilot.com>
+Signed-off-by: Stanislaw Gruszka <sgruszka at redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra at chello.nl>
+Cc: Hidetoshi Seto <seto.hidetoshi at jp.fujitsu.com>
+Cc: <stable at kernel.org>  # 2.6.32.19+ (partially) and 2.6.33+
+LKML-Reference: <20100914143513.GB8415 at redhat.com>
+Signed-off-by: Ingo Molnar <mingo at elte.hu>
+---
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index ed09d4f..dc85ceb 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -3513,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ 	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+ 
+ 	if (total) {
+-		u64 temp;
++		u64 temp = rtime;
+ 
+-		temp = (u64)(rtime * utime);
++		temp *= utime;
+ 		do_div(temp, total);
+ 		utime = (cputime_t)temp;
+ 	} else
+@@ -3546,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+ 	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+ 
+ 	if (total) {
+-		u64 temp;
++		u64 temp = rtime;
+ 
+-		temp = (u64)(rtime * cputime.utime);
++		temp *= cputime.utime;
+ 		do_div(temp, total);
+ 		utime = (cputime_t)temp;
+ 	} else
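
The overflow fixed by the sched-00 patch above is easy to reproduce outside the
kernel. The following is a minimal user-space sketch (plain C, illustrative
values, and a plain division standing in for do_div()): on a 32-bit build
cputime_t is 32 bits wide, so the old (u64)(rtime * utime) wraps before the
cast takes effect, while the fixed code widens rtime first.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t cputime_t;	/* 32-bit build: cputime_t is 32 bits wide */

int main(void)
{
	cputime_t rtime = 80000, utime = 70000, total = 100000;

	/* Old code: the multiply is done in 32 bits and wraps at 2^32;
	 * the cast to 64 bits comes too late. */
	uint64_t broken = (uint64_t)(rtime * utime);

	/* Fixed code: widen first, then multiply and divide. */
	uint64_t fixed = rtime;
	fixed *= utime;

	printf("scaled utime, old code: %llu\n",
	       (unsigned long long)(broken / total));	/* 13050 -- far too small */
	printf("scaled utime, fixed:    %llu\n",
	       (unsigned long long)(fixed / total));	/* 56000 -- as expected */
	return 0;
}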
diff --git a/sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch b/sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
new file mode 100644
index 0000000..ea7e48e
--- /dev/null
+++ b/sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
@@ -0,0 +1,276 @@
+From: Venkatesh Pallipadi <venki at google.com>
+Date: Tue, 18 May 2010 01:14:43 +0000 (-0700)
+Subject: sched: Avoid side-effect of tickless idle on update_cpu_load
+X-Git-Tag: v2.6.36-rc1~531^2~22
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=fdf3e95d3916f18bf8703fb065499fdbc4dfe34c
+
+sched: Avoid side-effect of tickless idle on update_cpu_load
+
+tickless idle has a negative side effect on update_cpu_load(), which
+in turn can affect load balancing behavior.
+
+update_cpu_load() is supposed to be called every tick, to keep track
+of various load indices. With tickless idle, there are no scheduler
+ticks called on the idle CPUs. Idle CPUs may still do load balancing
+(with the idle_load_balance CPU) using the stale cpu_load. It will also
+cause problems when all CPUs go idle for a while and become active
+again; in that case loads would not degrade as expected.
+
+This is how the change in rq->nr_load_updates looks under different
+conditions:
+
+<cpu_num> <nr_load_updates change>
+All CPUS idle for 10 seconds (HZ=1000)
+0 1621
+10 496
+11 139
+12 875
+13 1672
+14 12
+15 21
+1 1472
+2 2426
+3 1161
+4 2108
+5 1525
+6 701
+7 249
+8 766
+9 1967
+
+One CPU busy rest idle for 10 seconds
+0 10003
+10 601
+11 95
+12 966
+13 1597
+14 114
+15 98
+1 3457
+2 93
+3 6679
+4 1425
+5 1479
+6 595
+7 193
+8 633
+9 1687
+
+All CPUs busy for 10 seconds
+0 10026
+10 10026
+11 10026
+12 10026
+13 10025
+14 10025
+15 10025
+1 10026
+2 10026
+3 10026
+4 10026
+5 10026
+6 10026
+7 10026
+8 10026
+9 10026
+
+That is, update_cpu_load() works properly only when all CPUs are busy.
+If all are idle, all the CPUs get far fewer updates. And when a few
+CPUs are busy and the rest are idle, only the busy and ilb CPUs do proper
+updates, while the rest of the idle CPUs get fewer updates.
+
+The patch keeps track of when the last update was done and fixes up
+the load average based on the current time.
+
+On one of my test systems, SPECjbb with warehouses 1..numcpus, the patch
+improves throughput numbers by ~1% (average of 6 runs). On another
+test system (with a different domain hierarchy) there is no noticeable
+change in performance.
+
+Signed-off-by: Venkatesh Pallipadi <venki at google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra at chello.nl>
+Cc: Thomas Gleixner <tglx at linutronix.de>
+LKML-Reference: <AANLkTilLtDWQsAUrIxJ6s04WTgmw9GuOODc5AOrYsaR5 at mail.gmail.com>
+Signed-off-by: Ingo Molnar <mingo at elte.hu>
+---
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index f37a961..a757f6b 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -457,6 +457,7 @@ struct rq {
+ 	unsigned long nr_running;
+ 	#define CPU_LOAD_IDX_MAX 5
+ 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
++	unsigned long last_load_update_tick;
+ #ifdef CONFIG_NO_HZ
+ 	u64 nohz_stamp;
+ 	unsigned char in_nohz_recently;
+@@ -1803,6 +1804,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+ static void calc_load_account_idle(struct rq *this_rq);
+ static void update_sysctl(void);
+ static int get_update_sysctl_factor(void);
++static void update_cpu_load(struct rq *this_rq);
+ 
+ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+ {
+@@ -3050,23 +3052,102 @@ static void calc_load_account_active(struct rq *this_rq)
+ }
+ 
+ /*
++ * The exact cpuload at various idx values, calculated at every tick would be
++ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
++ *
++ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
++ * on nth tick when cpu may be busy, then we have:
++ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
++ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
++ *
++ * decay_load_missed() below does efficient calculation of
++ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
++ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
++ *
++ * The calculation is approximated on a 128 point scale.
++ * degrade_zero_ticks is the number of ticks after which load at any
++ * particular idx is approximated to be zero.
++ * degrade_factor is a precomputed table, a row for each load idx.
++ * Each column corresponds to degradation factor for a power of two ticks,
++ * based on 128 point scale.
++ * Example:
++ * row 2, col 3 (=12) says that the degradation at load idx 2 after
++ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
++ *
++ * With this power of 2 load factors, we can degrade the load n times
++ * by looking at 1 bits in n and doing as many mult/shift instead of
++ * n mult/shifts needed by the exact degradation.
++ */
++#define DEGRADE_SHIFT		7
++static const unsigned char
++		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
++static const unsigned char
++		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
++					{0, 0, 0, 0, 0, 0, 0, 0},
++					{64, 32, 8, 0, 0, 0, 0, 0},
++					{96, 72, 40, 12, 1, 0, 0},
++					{112, 98, 75, 43, 15, 1, 0},
++					{120, 112, 98, 76, 45, 16, 2} };
++
++/*
++ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
++ * would be when CPU is idle and so we just decay the old load without
++ * adding any new load.
++ */
++static unsigned long
++decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
++{
++	int j = 0;
++
++	if (!missed_updates)
++		return load;
++
++	if (missed_updates >= degrade_zero_ticks[idx])
++		return 0;
++
++	if (idx == 1)
++		return load >> missed_updates;
++
++	while (missed_updates) {
++		if (missed_updates % 2)
++			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
++
++		missed_updates >>= 1;
++		j++;
++	}
++	return load;
++}
++
++/*
+  * Update rq->cpu_load[] statistics. This function is usually called every
+- * scheduler tick (TICK_NSEC).
++ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
++ * every tick. We fix it up based on jiffies.
+  */
+ static void update_cpu_load(struct rq *this_rq)
+ {
+ 	unsigned long this_load = this_rq->load.weight;
++	unsigned long curr_jiffies = jiffies;
++	unsigned long pending_updates;
+ 	int i, scale;
+ 
+ 	this_rq->nr_load_updates++;
+ 
++	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
++	if (curr_jiffies == this_rq->last_load_update_tick)
++		return;
++
++	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
++	this_rq->last_load_update_tick = curr_jiffies;
++
+ 	/* Update our load: */
+-	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
++	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
++	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ 		unsigned long old_load, new_load;
+ 
+ 		/* scale is effectively 1 << i now, and >> i divides by scale */
+ 
+ 		old_load = this_rq->cpu_load[i];
++		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ 		new_load = this_load;
+ 		/*
+ 		 * Round up the averaging division if load is increasing. This
+@@ -3074,9 +3155,15 @@ static void update_cpu_load(struct rq *this_rq)
+ 		 * example.
+ 		 */
+ 		if (new_load > old_load)
+-			new_load += scale-1;
+-		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
++			new_load += scale - 1;
++
++		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ 	}
++}
++
++static void update_cpu_load_active(struct rq *this_rq)
++{
++	update_cpu_load(this_rq);
+ 
+ 	calc_load_account_active(this_rq);
+ }
+@@ -3464,7 +3551,7 @@ void scheduler_tick(void)
+ 
+ 	raw_spin_lock(&rq->lock);
+ 	update_rq_clock(rq);
+-	update_cpu_load(rq);
++	update_cpu_load_active(rq);
+ 	curr->sched_class->task_tick(rq, curr, 0);
+ 	raw_spin_unlock(&rq->lock);
+ 
+@@ -7688,6 +7775,9 @@ void __init sched_init(void)
+ 
+ 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ 			rq->cpu_load[j] = 0;
++
++		rq->last_load_update_tick = jiffies;
++
+ #ifdef CONFIG_SMP
+ 		rq->sd = NULL;
+ 		rq->rd = NULL;
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index eed35ed..22b8b4f 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -3420,9 +3420,12 @@ static void run_rebalance_domains(struct softirq_action *h)
+ 			if (need_resched())
+ 				break;
+ 
++			rq = cpu_rq(balance_cpu);
++			raw_spin_lock_irq(&rq->lock);
++			update_cpu_load(rq);
++			raw_spin_unlock_irq(&rq->lock);
+ 			rebalance_domains(balance_cpu, CPU_IDLE);
+ 
+-			rq = cpu_rq(balance_cpu);
+ 			if (time_after(this_rq->next_balance, rq->next_balance))
+ 				this_rq->next_balance = rq->next_balance;
+ 		}
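
For reference, the table-driven decay added above can be exercised in user
space. The sketch below is illustrative only: it copies decay_load_missed()
and its tables from the patch and compares the result against the exact
per-tick loop, load = ((2^idx - 1) / 2^idx) * load repeated n times, which
the table approximates on the 128-point scale.

#include <stdio.h>

#define DEGRADE_SHIFT		7
#define CPU_LOAD_IDX_MAX	5

static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
				{0, 8, 32, 64, 128};
static const unsigned char
	degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
				{0, 0, 0, 0, 0, 0, 0, 0},
				{64, 32, 8, 0, 0, 0, 0, 0},
				{96, 72, 40, 12, 1, 0, 0},
				{112, 98, 75, 43, 15, 1, 0},
				{120, 112, 98, 76, 45, 16, 2} };

/* Same algorithm as the kernel function: one mult/shift per set bit in
 * missed_updates instead of one per missed tick. */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
	int j = 0;

	if (!missed_updates)
		return load;
	if (missed_updates >= degrade_zero_ticks[idx])
		return 0;
	if (idx == 1)
		return load >> missed_updates;

	while (missed_updates) {
		if (missed_updates % 2)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed_updates >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long load = 1024, exact = 1024;
	unsigned long missed = 9;
	int idx = 2, i;

	/* Exact decay: multiply by (2^idx - 1) / 2^idx once per missed tick. */
	for (i = 0; i < (int)missed; i++)
		exact = exact * ((1UL << idx) - 1) >> idx;

	/* Prints 72 vs 76: the table is a close approximation, not exact. */
	printf("table-driven: %lu, exact loop: %lu\n",
	       decay_load_missed(load, missed, idx), exact);
	return 0;
}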
diff --git a/sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch b/sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
new file mode 100644
index 0000000..622e9f1
--- /dev/null
+++ b/sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
@@ -0,0 +1,651 @@
+From: Venkatesh Pallipadi <venki at google.com>
+Date: Sat, 22 May 2010 00:09:41 +0000 (-0700)
+Subject: sched: Change nohz idle load balancing logic to push model
+X-Git-Tag: v2.6.36-rc1~531^2~21
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=83cd4fe27ad8446619b2e030b171b858501de87d
+
+sched: Change nohz idle load balancing logic to push model
+
+In the new push model, all idle CPUs indeed go into nohz mode. There is
+still the concept of an idle load balancer (performing the load balancing
+on behalf of all the idle CPUs in the system). A busy CPU kicks the nohz
+balancer when any of the nohz CPUs need idle load balancing.
+The kickee CPU does the idle load balancing on behalf of all idle CPUs
+instead of the normal idle balance.
+
+This addresses the two problems below with the current nohz ilb logic:
+* The idle load balancer continued to have periodic ticks during idle and
+  woke up frequently, even though it did not have any rebalancing to do on
+  behalf of any of the idle CPUs.
+* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
+  periodic wakeup can result in a periodic additional interrupt on a CPU
+  doing the timer broadcast.
+
+Also, currently we are migrating the unpinned timers from an idle cpu to the cpu
+doing idle load balancing (when all the cpus in the system are idle,
+there is no idle load balancing cpu and timers get added to the same idle cpu
+where the request was made, so the existing optimization works only on a semi-idle
+system).
+
+And in a semi-idle system, we no longer have periodic ticks on the idle load
+balancer CPU. Using that cpu would add more delays to the timers than intended
+(as that cpu's timer base may not be up to date wrt jiffies etc). This was
+causing mysterious slowdowns during boot etc.
+
+For now, in the semi-idle case, use the nearest busy cpu for migrating timers
+from an idle cpu.  This is good for power savings anyway.
+
+Signed-off-by: Venkatesh Pallipadi <venki at google.com>
+Signed-off-by: Suresh Siddha <suresh.b.siddha at intel.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra at chello.nl>
+Cc: Thomas Gleixner <tglx at linutronix.de>
+LKML-Reference: <1274486981.2840.46.camel at sbs-t61.sc.intel.com>
+Signed-off-by: Ingo Molnar <mingo at elte.hu>
+---
+
+[ backported for 2.6.35 ]
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index c2d4316..a3e5b1c 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -271,13 +271,10 @@ extern int runqueue_is_locked(int cpu);
+ 
+ extern cpumask_var_t nohz_cpu_mask;
+ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
+-extern int select_nohz_load_balancer(int cpu);
+-extern int get_nohz_load_balancer(void);
++extern void select_nohz_load_balancer(int stop_tick);
++extern int get_nohz_timer_target(void);
+ #else
+-static inline int select_nohz_load_balancer(int cpu)
+-{
+-	return 0;
+-}
++static inline void select_nohz_load_balancer(int stop_tick) { }
+ #endif
+ 
+ /*
+diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
+index 5c69e99..e934339 100644
+--- a/kernel/hrtimer.c
++++ b/kernel/hrtimer.c
+@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ static int hrtimer_get_target(int this_cpu, int pinned)
+ {
+ #ifdef CONFIG_NO_HZ
+-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
+-		int preferred_cpu = get_nohz_load_balancer();
+-
+-		if (preferred_cpu >= 0)
+-			return preferred_cpu;
+-	}
++	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
++		return get_nohz_timer_target();
+ #endif
+ 	return this_cpu;
+ }
+diff --git a/kernel/sched.c b/kernel/sched.c
+index a757f6b..132950b 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -460,7 +460,7 @@ struct rq {
+ 	unsigned long last_load_update_tick;
+ #ifdef CONFIG_NO_HZ
+ 	u64 nohz_stamp;
+-	unsigned char in_nohz_recently;
++	unsigned char nohz_balance_kick;
+ #endif
+ 	unsigned int skip_clock_update;
+ 
+@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)
+ 
+ #ifdef CONFIG_NO_HZ
+ /*
++ * In the semi idle case, use the nearest busy cpu for migrating timers
++ * from an idle cpu.  This is good for power-savings.
++ *
++ * We don't do similar optimization for completely idle system, as
++ * selecting an idle cpu will add more delays to the timers than intended
++ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
++ */
++int get_nohz_timer_target(void)
++{
++	int cpu = smp_processor_id();
++	int i;
++	struct sched_domain *sd;
++
++	for_each_domain(cpu, sd) {
++		for_each_cpu(i, sched_domain_span(sd))
++			if (!idle_cpu(i))
++				return i;
++	}
++	return cpu;
++}
++/*
+  * When add_timer_on() enqueues a timer into the timer wheel of an
+  * idle CPU then this timer might expire before the next timer event
+  * which is scheduled to wake up that CPU. In case of a completely
+@@ -7791,6 +7812,10 @@ void __init sched_init(void)
+ 		rq->idle_stamp = 0;
+ 		rq->avg_idle = 2*sysctl_sched_migration_cost;
+ 		rq_attach_root(rq, &def_root_domain);
++#ifdef CONFIG_NO_HZ
++		rq->nohz_balance_kick = 0;
++		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
++#endif
+ #endif
+ 		init_rq_hrtick(rq);
+ 		atomic_set(&rq->nr_iowait, 0);
+@@ -7835,8 +7860,11 @@ void __init sched_init(void)
+ 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
++	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
++	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
++	atomic_set(&nohz.load_balancer, nr_cpu_ids);
++	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
++	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
+ #endif
+ 	/* May be allocated at isolcpus cmdline parse time */
+ 	if (cpu_isolated_map == NULL)
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 22b8b4f..6ee2e0a 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -3091,13 +3091,40 @@ out_unlock:
+ }
+ 
+ #ifdef CONFIG_NO_HZ
++
++static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
++
++static void trigger_sched_softirq(void *data)
++{
++	raise_softirq_irqoff(SCHED_SOFTIRQ);
++}
++
++static inline void init_sched_softirq_csd(struct call_single_data *csd)
++{
++	csd->func = trigger_sched_softirq;
++	csd->info = NULL;
++	csd->flags = 0;
++	csd->priv = 0;
++}
++
++/*
++ * idle load balancing details
++ * - One of the idle CPUs nominates itself as idle load_balancer, while
++ *   entering idle.
++ * - This idle load balancer CPU will also go into tickless mode when
++ *   it is idle, just like all other idle CPUs
++ * - When one of the busy CPUs notice that there may be an idle rebalancing
++ *   needed, they will kick the idle load balancer, which then does idle
++ *   load balancing for all the idle CPUs.
++ */
+ static struct {
+ 	atomic_t load_balancer;
+-	cpumask_var_t cpu_mask;
+-	cpumask_var_t ilb_grp_nohz_mask;
+-} nohz ____cacheline_aligned = {
+-	.load_balancer = ATOMIC_INIT(-1),
+-};
++	atomic_t first_pick_cpu;
++	atomic_t second_pick_cpu;
++	cpumask_var_t idle_cpus_mask;
++	cpumask_var_t grp_idle_mask;
++	unsigned long next_balance;     /* in jiffy units */
++} nohz ____cacheline_aligned;
+ 
+ int get_nohz_load_balancer(void)
+ {
+@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+  */
+ static inline int is_semi_idle_group(struct sched_group *ilb_group)
+ {
+-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
++	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
+ 					sched_group_cpus(ilb_group));
+ 
+ 	/*
+ 	 * A sched_group is semi-idle when it has atleast one busy cpu
+ 	 * and atleast one idle cpu.
+ 	 */
+-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
++	if (cpumask_empty(nohz.grp_idle_mask))
+ 		return 0;
+ 
+-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
++	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
+ 		return 0;
+ 
+ 	return 1;
+@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
+ 	 * Optimize for the case when we have no idle CPUs or only one
+ 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+ 	 */
+-	if (cpumask_weight(nohz.cpu_mask) < 2)
++	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
+ 		goto out_done;
+ 
+ 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)
+ 
+ 		do {
+ 			if (is_semi_idle_group(ilb_group))
+-				return cpumask_first(nohz.ilb_grp_nohz_mask);
++				return cpumask_first(nohz.grp_idle_mask);
+ 
+ 			ilb_group = ilb_group->next;
+ 
+@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
+ 	}
+ 
+ out_done:
+-	return cpumask_first(nohz.cpu_mask);
++	return nr_cpu_ids;
+ }
+ #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+ static inline int find_new_ilb(int call_cpu)
+ {
+-	return cpumask_first(nohz.cpu_mask);
++	return nr_cpu_ids;
+ }
+ #endif
+ 
+ /*
++ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
++ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
++ * CPU (if there is one).
++ */
++static void nohz_balancer_kick(int cpu)
++{
++	int ilb_cpu;
++
++	nohz.next_balance++;
++
++	ilb_cpu = get_nohz_load_balancer();
++
++	if (ilb_cpu >= nr_cpu_ids) {
++		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
++		if (ilb_cpu >= nr_cpu_ids)
++			return;
++	}
++
++	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
++		struct call_single_data *cp;
++
++		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
++		cp = &per_cpu(remote_sched_softirq_cb, cpu);
++		__smp_call_function_single(ilb_cpu, cp, 0);
++	}
++	return;
++}
++
++/*
+  * This routine will try to nominate the ilb (idle load balancing)
+  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+- * load balancing on behalf of all those cpus. If all the cpus in the system
+- * go into this tickless mode, then there will be no ilb owner (as there is
+- * no need for one) and all the cpus will sleep till the next wakeup event
+- * arrives...
+- *
+- * For the ilb owner, tick is not stopped. And this tick will be used
+- * for idle load balancing. ilb owner will still be part of
+- * nohz.cpu_mask..
++ * load balancing on behalf of all those cpus.
+  *
+- * While stopping the tick, this cpu will become the ilb owner if there
+- * is no other owner. And will be the owner till that cpu becomes busy
+- * or if all cpus in the system stop their ticks at which point
+- * there is no need for ilb owner.
++ * When the ilb owner becomes busy, we will not have new ilb owner until some
++ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
++ * idle load balancing by kicking one of the idle CPUs.
+  *
+- * When the ilb owner becomes busy, it nominates another owner, during the
+- * next busy scheduler_tick()
++ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
++ * ilb owner CPU in future (when there is a need for idle load balancing on
++ * behalf of all idle CPUs).
+  */
+-int select_nohz_load_balancer(int stop_tick)
++void select_nohz_load_balancer(int stop_tick)
+ {
+ 	int cpu = smp_processor_id();
+ 
+ 	if (stop_tick) {
+-		cpu_rq(cpu)->in_nohz_recently = 1;
+-
+ 		if (!cpu_active(cpu)) {
+ 			if (atomic_read(&nohz.load_balancer) != cpu)
+-				return 0;
++				return;
+ 
+ 			/*
+ 			 * If we are going offline and still the leader,
+ 			 * give up!
+ 			 */
+-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
++			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
++					   nr_cpu_ids) != cpu)
+ 				BUG();
+ 
+-			return 0;
++			return;
+ 		}
+ 
+-		cpumask_set_cpu(cpu, nohz.cpu_mask);
++		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ 
+-		/* time for ilb owner also to sleep */
+-		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
+-			if (atomic_read(&nohz.load_balancer) == cpu)
+-				atomic_set(&nohz.load_balancer, -1);
+-			return 0;
+-		}
++		if (atomic_read(&nohz.first_pick_cpu) == cpu)
++			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
++		if (atomic_read(&nohz.second_pick_cpu) == cpu)
++			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ 
+-		if (atomic_read(&nohz.load_balancer) == -1) {
+-			/* make me the ilb owner */
+-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+-				return 1;
+-		} else if (atomic_read(&nohz.load_balancer) == cpu) {
++		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
+ 			int new_ilb;
+ 
+-			if (!(sched_smt_power_savings ||
+-						sched_mc_power_savings))
+-				return 1;
++			/* make me the ilb owner */
++			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
++					   cpu) != nr_cpu_ids)
++				return;
++
+ 			/*
+ 			 * Check to see if there is a more power-efficient
+ 			 * ilb.
+ 			 */
+ 			new_ilb = find_new_ilb(cpu);
+ 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+-				atomic_set(&nohz.load_balancer, -1);
++				atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ 				resched_cpu(new_ilb);
+-				return 0;
++				return;
+ 			}
+-			return 1;
++			return;
+ 		}
+ 	} else {
+-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
+-			return 0;
++		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
++			return;
+ 
+-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
++		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ 
+ 		if (atomic_read(&nohz.load_balancer) == cpu)
+-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
++			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
++					   nr_cpu_ids) != cpu)
+ 				BUG();
+ 	}
+-	return 0;
++	return;
+ }
+ #endif
+ 
+@@ -3383,11 +3428,101 @@ out:
+ 		rq->next_balance = next_balance;
+ }
+ 
++#ifdef CONFIG_NO_HZ
+ /*
+- * run_rebalance_domains is triggered when needed from the scheduler tick.
+- * In CONFIG_NO_HZ case, the idle load balance owner will do the
++ * In CONFIG_NO_HZ case, the idle balance kickee will do the
+  * rebalancing for all the cpus for whom scheduler ticks are stopped.
+  */
++static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
++{
++	struct rq *this_rq = cpu_rq(this_cpu);
++	struct rq *rq;
++	int balance_cpu;
++
++	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
++		return;
++
++	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
++		if (balance_cpu == this_cpu)
++			continue;
++
++		/*
++		 * If this cpu gets work to do, stop the load balancing
++		 * work being done for other cpus. Next load
++		 * balancing owner will pick it up.
++		 */
++		if (need_resched()) {
++			this_rq->nohz_balance_kick = 0;
++			break;
++		}
++
++		raw_spin_lock_irq(&this_rq->lock);
++		update_cpu_load(this_rq);
++		raw_spin_unlock_irq(&this_rq->lock);
++
++		rebalance_domains(balance_cpu, CPU_IDLE);
++
++		rq = cpu_rq(balance_cpu);
++		if (time_after(this_rq->next_balance, rq->next_balance))
++			this_rq->next_balance = rq->next_balance;
++	}
++	nohz.next_balance = this_rq->next_balance;
++	this_rq->nohz_balance_kick = 0;
++}
++
++/*
++ * Current heuristic for kicking the idle load balancer
++ * - first_pick_cpu is the one of the busy CPUs. It will kick
++ *   idle load balancer when it has more than one process active. This
++ *   eliminates the need for idle load balancing altogether when we have
++ *   only one running process in the system (common case).
++ * - If there are more than one busy CPU, idle load balancer may have
++ *   to run for active_load_balance to happen (i.e., two busy CPUs are
++ *   SMT or core siblings and can run better if they move to different
++ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
++ *   which will kick idle load balancer as soon as it has any load.
++ */
++static inline int nohz_kick_needed(struct rq *rq, int cpu)
++{
++	unsigned long now = jiffies;
++	int ret;
++	int first_pick_cpu, second_pick_cpu;
++
++	if (time_before(now, nohz.next_balance))
++		return 0;
++
++	if (!rq->nr_running)
++		return 0;
++
++	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
++	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
++
++	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
++	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
++		return 0;
++
++	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
++	if (ret == nr_cpu_ids || ret == cpu) {
++		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
++		if (rq->nr_running > 1)
++			return 1;
++	} else {
++		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
++		if (ret == nr_cpu_ids || ret == cpu) {
++			if (rq->nr_running)
++				return 1;
++		}
++	}
++	return 0;
++}
++#else
++static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
++#endif
++
++/*
++ * run_rebalance_domains is triggered when needed from the scheduler tick.
++ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
++ */
+ static void run_rebalance_domains(struct softirq_action *h)
+ {
+ 	int this_cpu = smp_processor_id();
+@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)
+ 
+ 	rebalance_domains(this_cpu, idle);
+ 
+-#ifdef CONFIG_NO_HZ
+ 	/*
+-	 * If this cpu is the owner for idle load balancing, then do the
++	 * If this cpu has a pending nohz_balance_kick, then do the
+ 	 * balancing on behalf of the other idle cpus whose ticks are
+ 	 * stopped.
+ 	 */
+-	if (this_rq->idle_at_tick &&
+-	    atomic_read(&nohz.load_balancer) == this_cpu) {
+-		struct rq *rq;
+-		int balance_cpu;
+-
+-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
+-			if (balance_cpu == this_cpu)
+-				continue;
+-
+-			/*
+-			 * If this cpu gets work to do, stop the load balancing
+-			 * work being done for other cpus. Next load
+-			 * balancing owner will pick it up.
+-			 */
+-			if (need_resched())
+-				break;
+-
+-			rq = cpu_rq(balance_cpu);
+-			raw_spin_lock_irq(&rq->lock);
+-			update_cpu_load(rq);
+-			raw_spin_unlock_irq(&rq->lock);
+-			rebalance_domains(balance_cpu, CPU_IDLE);
+-
+-			if (time_after(this_rq->next_balance, rq->next_balance))
+-				this_rq->next_balance = rq->next_balance;
+-		}
+-	}
+-#endif
++	nohz_idle_balance(this_cpu, idle);
+ }
+ 
+ static inline int on_null_domain(int cpu)
+@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)
+ 
+ /*
+  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+- *
+- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+- * idle load balancing owner or decide to stop the periodic load balancing,
+- * if the whole system is idle.
+  */
+ static inline void trigger_load_balance(struct rq *rq, int cpu)
+ {
+-#ifdef CONFIG_NO_HZ
+-	/*
+-	 * If we were in the nohz mode recently and busy at the current
+-	 * scheduler tick, then check if we need to nominate new idle
+-	 * load balancer.
+-	 */
+-	if (rq->in_nohz_recently && !rq->idle_at_tick) {
+-		rq->in_nohz_recently = 0;
+-
+-		if (atomic_read(&nohz.load_balancer) == cpu) {
+-			cpumask_clear_cpu(cpu, nohz.cpu_mask);
+-			atomic_set(&nohz.load_balancer, -1);
+-		}
+-
+-		if (atomic_read(&nohz.load_balancer) == -1) {
+-			int ilb = find_new_ilb(cpu);
+-
+-			if (ilb < nr_cpu_ids)
+-				resched_cpu(ilb);
+-		}
+-	}
+-
+-	/*
+-	 * If this cpu is idle and doing idle load balancing for all the
+-	 * cpus with ticks stopped, is it time for that to stop?
+-	 */
+-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+-	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+-		resched_cpu(cpu);
+-		return;
+-	}
+-
+-	/*
+-	 * If this cpu is idle and the idle load balancing is done by
+-	 * someone else, then no need raise the SCHED_SOFTIRQ
+-	 */
+-	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+-	    cpumask_test_cpu(cpu, nohz.cpu_mask))
+-		return;
+-#endif
+ 	/* Don't need to rebalance while attached to NULL domain */
+ 	if (time_after_eq(jiffies, rq->next_balance) &&
+ 	    likely(!on_null_domain(cpu)))
+ 		raise_softirq(SCHED_SOFTIRQ);
++#ifdef CONFIG_NO_HZ
++	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
++		nohz_balancer_kick(cpu);
++#endif
+ }
+ 
+ static void rq_online_fair(struct rq *rq)
+diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
+index 1d7b9bc..5f171f0 100644
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
+ 		 * the scheduler tick in nohz_restart_sched_tick.
+ 		 */
+ 		if (!ts->tick_stopped) {
+-			if (select_nohz_load_balancer(1)) {
+-				/*
+-				 * sched tick not stopped!
+-				 */
+-				cpumask_clear_cpu(cpu, nohz_cpu_mask);
+-				goto out;
+-			}
++			select_nohz_load_balancer(1);
+ 
+ 			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
+ 			ts->tick_stopped = 1;
+diff --git a/kernel/timer.c b/kernel/timer.c
+index ee305c8..48d6aec 100644
+--- a/kernel/timer.c
++++ b/kernel/timer.c
+@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
+ 	cpu = smp_processor_id();
+ 
+ #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
+-		int preferred_cpu = get_nohz_load_balancer();
+-
+-		if (preferred_cpu >= 0)
+-			cpu = preferred_cpu;
+-	}
++	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
++		cpu = get_nohz_timer_target();
+ #endif
+ 	new_base = per_cpu(tvec_bases, cpu);
+ 
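
The first_pick/second_pick kick heuristic that this patch adds to
nohz_kick_needed() can be modelled outside the kernel. The sketch below is a
user-space approximation only (C11 atomics as stand-ins for the kernel's
atomic_t, nr_cpu_ids replaced by a constant, and the nohz.next_balance time
check omitted); it shows why a single busy CPU running one task never kicks
the idle load balancer, while more load or a second busy CPU does.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPU_IDS 4	/* stand-in for nr_cpu_ids */

static atomic_int first_pick_cpu = NR_CPU_IDS;
static atomic_int second_pick_cpu = NR_CPU_IDS;

/* Stand-in for the kernel's atomic_cmpxchg(): returns the old value. */
static int atomic_cmpxchg(atomic_int *v, int old, int new)
{
	atomic_compare_exchange_strong(v, &old, new);
	return old;
}

/* Mirrors the nohz_kick_needed() heuristic minus the time check. */
static int nohz_kick_needed(int cpu, int nr_running)
{
	int ret;

	if (!nr_running)
		return 0;

	ret = atomic_cmpxchg(&first_pick_cpu, NR_CPU_IDS, cpu);
	if (ret == NR_CPU_IDS || ret == cpu) {
		/* We are the first busy CPU: only more than one runnable
		 * task makes a kick worthwhile. */
		atomic_cmpxchg(&second_pick_cpu, cpu, NR_CPU_IDS);
		if (nr_running > 1)
			return 1;
	} else {
		/* We are a second busy CPU: any load warrants a kick. */
		ret = atomic_cmpxchg(&second_pick_cpu, NR_CPU_IDS, cpu);
		if (ret == NR_CPU_IDS || ret == cpu)
			return 1;
	}
	return 0;
}

int main(void)
{
	printf("cpu0, 1 task:  kick=%d\n", nohz_kick_needed(0, 1)); /* 0 */
	printf("cpu0, 2 tasks: kick=%d\n", nohz_kick_needed(0, 2)); /* 1 */
	printf("cpu1, 1 task:  kick=%d\n", nohz_kick_needed(1, 1)); /* 1 */
	return 0;
}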
diff --git a/sched-15-update-rq-clock-for-nohz-balanced-cpus.patch b/sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
new file mode 100644
index 0000000..7c5432e
--- /dev/null
+++ b/sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
@@ -0,0 +1,28 @@
+From: Suresh Siddha <suresh.b.siddha at intel.com>
+Date: Fri, 9 Jul 2010 13:19:54 +0000 (+0200)
+Subject: sched: Update rq->clock for nohz balanced cpus
+X-Git-Tag: v2.6.36-rc1~531^2~5
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=5343bdb8fd076f16edc9d113a9e35e2a1d1f4966
+
+sched: Update rq->clock for nohz balanced cpus
+
+Suresh spotted that we don't update the rq->clock in the nohz
+load-balancer path.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra at chello.nl>
+LKML-Reference: <1278626014.2834.74.camel at sbs-t61.sc.intel.com>
+Signed-off-by: Ingo Molnar <mingo at elte.hu>
+---
+
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index b4da534..e44a591 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -3596,6 +3596,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+ 		}
+ 
+ 		raw_spin_lock_irq(&this_rq->lock);
++		update_rq_clock(this_rq);
+ 		update_cpu_load(this_rq);
+ 		raw_spin_unlock_irq(&this_rq->lock);
+ 
diff --git a/sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch b/sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
new file mode 100644
index 0000000..466dd2d
--- /dev/null
+++ b/sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
@@ -0,0 +1,38 @@
+From: Peter Zijlstra <peterz at infradead.org>
+Date: Thu, 19 Aug 2010 11:31:43 +0000 (+0200)
+Subject: sched: Fix rq->clock synchronization when migrating tasks
+X-Git-Tag: v2.6.36-rc3~25^2~1
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=861d034ee814917a83bd5de4b26e3b8336ddeeb8
+
+sched: Fix rq->clock synchronization when migrating tasks
+
+sched_fork() -- we do task placement in ->task_fork_fair(), so ensure we
+  update_rq_clock() there to work with the current time. We leave the vruntime
+  in relative state, so the time delay until wake_up_new_task() doesn't
+  matter.
+
+wake_up_new_task() -- Since task_fork_fair() left p->vruntime in
+  relative state, we can safely migrate; the activate_task() on the
+  remote rq will call update_rq_clock() and cause the clock to be
+  synced (enough).
+
+Tested-by: Jack Daniel <wanders.thirst at gmail.com>
+Tested-by: Philby John <pjohn at mvista.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra at chello.nl>
+LKML-Reference: <1281002322.1923.1708.camel at laptop>
+Signed-off-by: Ingo Molnar <mingo at elte.hu>
+---
+
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 806d1b2..ab661eb 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p)
+ 
+ 	raw_spin_lock_irqsave(&rq->lock, flags);
+ 
++	update_rq_clock(rq);
++
+ 	if (unlikely(task_cpu(p) != this_cpu))
+ 		__set_task_cpu(p, this_cpu);
+ 
diff --git a/sched-25-move-sched_avg_update-to-update_cpu_load.patch b/sched-25-move-sched_avg_update-to-update_cpu_load.patch
new file mode 100644
index 0000000..556c8ce
--- /dev/null
+++ b/sched-25-move-sched_avg_update-to-update_cpu_load.patch
@@ -0,0 +1,58 @@
+From: Suresh Siddha <suresh.b.siddha at intel.com>
+Date: Mon, 23 Aug 2010 20:42:51 +0000 (-0700)
+Subject: sched: Move sched_avg_update() to update_cpu_load()
+X-Git-Tag: v2.6.36-rc4~8^2~1
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=da2b71edd8a7db44fe1746261410a981f3e03632
+
+sched: Move sched_avg_update() to update_cpu_load()
+
+Currently sched_avg_update() (which updates the rt_avg stats in the rq)
+is called from scale_rt_power() (in the load-balance context),
+which doesn't take rq->lock.
+
+Fix it by moving sched_avg_update() to the more appropriate
+update_cpu_load(), where the CFS load gets updated as well.
+
+Signed-off-by: Suresh Siddha <suresh.b.siddha at intel.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra at chello.nl>
+LKML-Reference: <1282596171.2694.3.camel at sbsiddha-MOBL3>
+Signed-off-by: Ingo Molnar <mingo at elte.hu>
+---
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 09b574e..ed09d4f 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -1294,6 +1294,10 @@ static void resched_task(struct task_struct *p)
+ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+ {
+ }
++
++static void sched_avg_update(struct rq *rq)
++{
++}
+ #endif /* CONFIG_SMP */
+ 
+ #if BITS_PER_LONG == 32
+@@ -3182,6 +3186,8 @@ static void update_cpu_load(struct rq *this_rq)
+ 
+ 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ 	}
++
++	sched_avg_update(this_rq);
+ }
+ 
+ static void update_cpu_load_active(struct rq *this_rq)
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index ab661eb..f53ec75 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -2268,8 +2268,6 @@ unsigned long scale_rt_power(int cpu)
+ 	struct rq *rq = cpu_rq(cpu);
+ 	u64 total, available;
+ 
+-	sched_avg_update(rq);
+-
+ 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ 	available = total - rq->rt_avg;
+ 

