From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Tue, 14 Jul 2015 17:39:50 +0200
Subject: [PATCH] sched/fair: Beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU.  While looking at wake_wide(),
I noticed that it doesn't pay attention to the wakeup of a many-partner
waker, returning 1 only when waking one of its many partners.

Correct that, letting explicit domain flags override the heuristic.

While at it, adjust the task_struct bits; we don't need a 64-bit counter.
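
For reference, the strengthened heuristic compares the flip frequencies of
both waker and wakee against the LLC size, rather than only looking at the
wakee; the sketch below paraphrases the new wake_wide() from the fair.c hunk
further down (that hunk is authoritative):

	static int wake_wide(struct task_struct *p)
	{
		unsigned int master = current->wakee_flips;
		unsigned int slave = p->wakee_flips;
		int factor = this_cpu_read(sd_llc_size);

		if (master < slave)
			swap(master, slave);
		/*
		 * Need at least llc_size flips in one partner and a factor
		 * of llc_size more in the other before spreading.
		 */
		if (slave < factor || master < slave * factor)
			return 0;	/* below threshold, affine wakeup is fine */
		return 1;	/* apparent partner count exceeds socket size */
	}

The wake_wide() check also moves out of wake_affine() and into the
want_affine computation in select_task_rq_fair(), which is what lets the
domain balance flags override the heuristic.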

Tested-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
[ Tidy things up. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team <Kernel-team@fb.com>
Cc: morten.rasmussen@arm.com
Cc: riel@redhat.com
Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  4 +--
 kernel/sched/fair.c   | 67 ++++++++++++++++++++++++++-------------------------
 2 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7412070..65a8a86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1487,9 +1487,9 @@ struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	unsigned int cpu;	/* current CPU */
 #endif
-	struct task_struct *last_wakee;
-	unsigned long wakee_flips;
+	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
+	struct task_struct *last_wakee;
 
 	int wake_cpu;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8b384b8d..ea23f9f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5103,26 +5103,34 @@ static void record_wakee(struct task_struct *p)
 	}
 }
 
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
+ */
 static int wake_wide(struct task_struct *p)
 {
+	unsigned int master = current->wakee_flips;
+	unsigned int slave = p->wakee_flips;
 	int factor = this_cpu_read(sd_llc_size);
 
-	/*
-	 * Yeah, it's the switching-frequency, could means many wakee or
-	 * rapidly switch, use factor here will just help to automatically
-	 * adjust the loose-degree, so bigger node will lead to more pull.
-	 */
-	if (p->wakee_flips > factor) {
-		/*
-		 * wakee is somewhat hot, it needs certain amount of cpu
-		 * resource, so if waker is far more hot, prefer to leave
-		 * it alone.
-		 */
-		if (current->wakee_flips > (factor * p->wakee_flips))
-			return 1;
-	}
-
-	return 0;
+	if (master < slave)
+		swap(master, slave);
+	if (slave < factor || master < slave * factor)
+		return 0;
+	return 1;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
@@ -5135,13 +5143,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	unsigned long weight;
 	int balanced;
 
-	/*
-	 * If we wake multiple tasks be careful to not bounce
-	 * ourselves around too much.
-	 */
-	if (wake_wide(p))
-		return 0;
-
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	load	  = source_load(prev_cpu, idx);
@@ -5626,20 +5627,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
-	int new_cpu = cpu;
+	int new_cpu = prev_cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
-		want_affine = !wake_cap(p, cpu, prev_cpu)
+		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
 			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 	}
 
 	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
-			continue;
+			break;
 
 		/*
 		 * If both cpu and prev_cpu are part of this domain,
@@ -5653,17 +5654,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 		if (tmp->flags & sd_flag)
 			sd = tmp;
+		else if (!want_affine)
+			break;
 	}
 
-	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
-		prev_cpu = cpu;
+	if (affine_sd) {
+		sd = NULL; /* Prefer wake_affine over balance flags */
+		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
+			new_cpu = cpu;
+	}
 
-	if (sd_flag & SD_BALANCE_WAKE) {
-		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-		goto unlock;
-	}
+	if (!sd) {
+		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-	while (sd) {
+	} else while (sd) {
 		struct sched_group *group;
 		int weight;
 
@@ -5697,7 +5702,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		}
 		/* while loop will break here if sd == NULL */
 	}
-unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
-- 
1.9.1