From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001 From: Mike Galbraith <umgwanakikbuti@gmail.com> Date: Tue, 14 Jul 2015 17:39:50 +0200 Subject: [PATCH] sched/fair: Beef up wake_wide() Josef Bacik reported that Facebook sees better performance with their 1:N load (1 dispatch/node, N workers/node) when carrying an old patch to try very hard to wake to an idle CPU. While looking at wake_wide(), I noticed that it doesn't pay attention to the wakeup of a many partner waker, returning 1 only when waking one of its many partners. Correct that, letting explicit domain flags override the heuristic. While at it, adjust task_struct bits, we don't need a 64-bit counter. Tested-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> [ Tidy things up. ] Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: kernel-team<Kernel-team@fb.com> Cc: morten.rasmussen@arm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- include/linux/sched.h | 4 +-- kernel/sched/fair.c | 67 ++++++++++++++++++++++++++------------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git b/include/linux/sched.h a/include/linux/sched.h index 65a8a86..7412070 100644 --- b/include/linux/sched.h +++ a/include/linux/sched.h @@ -1487,9 +1487,9 @@ struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK unsigned int cpu; /* current CPU */ #endif - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; + unsigned long wakee_flips; + unsigned long wakee_flip_decay_ts; int wake_cpu; #endif diff --git b/kernel/sched/fair.c a/kernel/sched/fair.c index ea23f9f..8b384b8d 100644 --- b/kernel/sched/fair.c +++ a/kernel/sched/fair.c @@ -5103,34 
+5103,26 @@ static void record_wakee(struct task_struct *p) } } -/* - * Detect M:N waker/wakee relationships via a switching-frequency heuristic. - * - * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. - * - * In order to determine whether we should let the load spread vs consolidating - * to shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. - * - * With both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. - * - * Waker/wakee being client/server, worker/dispatcher, interrupt source or - * whatever is irrelevant, spread criteria is apparent partner count exceeds - * socket size. - */ static int wake_wide(struct task_struct *p) { - unsigned int master = current->wakee_flips; - unsigned int slave = p->wakee_flips; int factor = this_cpu_read(sd_llc_size); - if (master < slave) - swap(master, slave); - if (slave < factor || master < slave * factor) - return 0; - return 1; + /* + * A high switching frequency can mean either many wakees or + * rapid switching; scaling by factor automatically adjusts how + * loose the check is, so a bigger node leads to more pull. + */ + if (p->wakee_flips > factor) { + /* + * The wakee is somewhat hot and needs a certain amount of CPU + * resource, so if the waker is far hotter, prefer to leave + * it alone. + */ + if (current->wakee_flips > (factor * p->wakee_flips)) + return 1; + } + + return 0; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, @@ -5143,6 +5135,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, unsigned long weight; int balanced; + /* + * If we wake multiple tasks be careful to not bounce + * ourselves around too much.
+ */ + if (wake_wide(p)) + return 0; + idx = sd->wake_idx; this_cpu = smp_processor_id(); load = source_load(prev_cpu, idx); @@ -5627,20 +5626,20 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = prev_cpu; + int new_cpu = cpu; int want_affine = 0; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + want_affine = !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); } rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) - break; + continue; /* * If both cpu and prev_cpu are part of this domain, @@ -5654,21 +5653,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (tmp->flags & sd_flag) sd = tmp; - else if (!want_affine) - break; } - if (affine_sd) { - sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) - new_cpu = cpu; - } + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) + prev_cpu = cpu; - if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + if (sd_flag & SD_BALANCE_WAKE) { + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + goto unlock; + } - } else while (sd) { + while (sd) { struct sched_group *group; int weight; @@ -5702,6 +5697,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } /* while loop will break here if sd == NULL */ } +unlock: rcu_read_unlock(); return new_cpu; -- 1.9.1