Change-Id: Ib1e852a5aefb8ee02f885a5c503ef2a0821aa178
Reviewed-on: http://photon-jenkins.eng.vmware.com/1025
Reviewed-by: Sharath George
Tested-by: Sharath George
(cherry picked from commit 56a89e73bc36f9130daa02c88425d5ff21bd8f73)
Reviewed-on: http://photon-jenkins.eng.vmware.com/1035
new file mode 100644
@@ -0,0 +1,174 @@
+From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001
+From: Mike Galbraith <umgwanakikbuti@gmail.com>
+Date: Tue, 14 Jul 2015 17:39:50 +0200
+Subject: [PATCH] sched/fair: Beef up wake_wide()
+
+Josef Bacik reported that Facebook sees better performance with their
+1:N load (1 dispatch/node, N workers/node) when carrying an old patch
+to try very hard to wake to an idle CPU.  While looking at wake_wide(),
+I noticed that it doesn't pay attention to the wakeup of a many partner
+waker, returning 1 only when waking one of its many partners.
+
+Correct that, letting explicit domain flags override the heuristic.
+
+While at it, adjust task_struct bits, we don't need a 64-bit counter.
+
+Tested-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
+[ Tidy things up. ]
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: kernel-team<Kernel-team@fb.com>
+Cc: morten.rasmussen@arm.com
+Cc: riel@redhat.com
+Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ include/linux/sched.h |  4 +--
+ kernel/sched/fair.c   | 67 ++++++++++++++++++++++++++-------------------------
+ 2 files changed, 36 insertions(+), 35 deletions(-)
+
+diff --git b/include/linux/sched.h a/include/linux/sched.h
+index 65a8a86..7412070 100644
+--- b/include/linux/sched.h
++++ a/include/linux/sched.h
+@@ -1359,9 +1359,9 @@ struct task_struct {
+ #ifdef CONFIG_SMP
+ 	struct llist_node wake_entry;
+ 	int on_cpu;
+-	unsigned int wakee_flips;
+-	unsigned long wakee_flip_decay_ts;
+ 	struct task_struct *last_wakee;
++	unsigned long wakee_flips;
++	unsigned long wakee_flip_decay_ts;
+ 
+ 	int wake_cpu;
+ #endif
+diff --git b/kernel/sched/fair.c a/kernel/sched/fair.c
+index ea23f9f..8b384b8d 100644
+--- b/kernel/sched/fair.c
++++ a/kernel/sched/fair.c
+@@ -4726,29 +4726,26 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+ 
+ #endif
+ 
+-/*
+- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+- * A waker of many should wake a different task than the one last awakened
+- * at a frequency roughly N times higher than one of its wakees.  In order
+- * to determine whether we should let the load spread vs consolodating to
+- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+- * partner, and a factor of lls_size higher frequency in the other.  With
+- * both conditions met, we can be relatively sure that the relationship is
+- * non-monogamous, with partner count exceeding socket size.  Waker/wakee
+- * being client/server, worker/dispatcher, interrupt source or whatever is
+- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+- */
+ static int wake_wide(struct task_struct *p)
+ {
+-	unsigned int master = current->wakee_flips;
+-	unsigned int slave = p->wakee_flips;
+ 	int factor = this_cpu_read(sd_llc_size);
+ 
+-	if (master < slave)
+-		swap(master, slave);
+-	if (slave < factor || master < slave * factor)
+-		return 0;
+-	return 1;
++	/*
++	 * Yeah, it's the switching-frequency, could means many wakee or
++	 * rapidly switch, use factor here will just help to automatically
++	 * adjust the loose-degree, so bigger node will lead to more pull.
++	 */
++	if (p->wakee_flips > factor) {
++		/*
++		 * wakee is somewhat hot, it needs certain amount of cpu
++		 * resource, so if waker is far more hot, prefer to leave
++		 * it alone.
++		 */
++		if (current->wakee_flips > (factor * p->wakee_flips))
++			return 1;
++	}
++
++	return 0;
+ }
+ 
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+@@ -4760,6 +4757,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+ 	unsigned long weight;
+ 	int balanced;
+ 
++	/*
++	 * If we wake multiple tasks be careful to not bounce
++	 * ourselves around too much.
++	 */
++	if (wake_wide(p))
++		return 0;
++
+ 	idx = sd->wake_idx;
+ 	this_cpu = smp_processor_id();
+ 	prev_cpu = task_cpu(p);
+@@ -5013,17 +5017,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
+ {
+ 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ 	int cpu = smp_processor_id();
+-	int new_cpu = prev_cpu;
++	int new_cpu = cpu;
+ 	int want_affine = 0;
+ 	int sync = wake_flags & WF_SYNC;
+ 
+ 	if (sd_flag & SD_BALANCE_WAKE)
+-		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
++		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ 
+ 	rcu_read_lock();
+ 	for_each_domain(cpu, tmp) {
+ 		if (!(tmp->flags & SD_LOAD_BALANCE))
+-			break;
++			continue;
+ 
+ 		/*
+ 		 * If both cpu and prev_cpu are part of this domain,
+@@ -5037,21 +5041,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
+ 
+ 		if (tmp->flags & sd_flag)
+ 			sd = tmp;
+-		else if (!want_affine)
+-			break;
+ 	}
+ 
+-	if (affine_sd) {
+-		sd = NULL; /* Prefer wake_affine over balance flags */
+-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+-			new_cpu = cpu;
+-	}
++	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
++		prev_cpu = cpu;
+ 
+-	if (!sd) {
+-		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+-			new_cpu = select_idle_sibling(p, new_cpu);
++	if (sd_flag & SD_BALANCE_WAKE) {
++		new_cpu = select_idle_sibling(p, prev_cpu);
++		goto unlock;
++	}
+ 
+-	} else while (sd) {
++	while (sd) {
+ 		struct sched_group *group;
+ 		int weight;
+ 
+@@ -5085,6 +5085,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
+ 	}
+ 	/* while loop will break here if sd == NULL */
+ 	}
++unlock:
+ 	rcu_read_unlock();
+ 
+ 	return new_cpu;
+-- 
+1.9.1
+
@@ -2,7 +2,7 @@
 Summary: Kernel
 Name: linux
 Version: 4.4.8
-Release: 5%{?dist}
+Release: 6%{?dist}
 License: GPLv2
 URL: http://www.kernel.org/
 Group: System Environment/Kernel
@@ -22,6 +22,7 @@ Patch6: net-Driver-Vmxnet3-set-CHECKSUM_UNNECESSARY-for-IPv6-packets.pat
 Patch7: netfilter-x_tables-deal-with-bogus-nextoffset-values.patch
 #fixes CVE-2016-3135
 Patch8: netfilter-x_tables-check-for-size-overflow.patch
+Patch9: REVERT-sched-fair-Beef-up-wake_wide.patch
 BuildRequires: bc
 BuildRequires: kbd
 BuildRequires: kmod
@@ -86,6 +87,7 @@ Kernel driver for oprofile, a statistical profiler for Linux systems
 %patch6 -p1
 %patch7 -p1
 %patch8 -p1
+%patch9 -p1
 
 %build
 make mrproper
@@ -182,10 +184,12 @@ ln -s /usr/lib/debug/lib/modules/%{version}/vmlinux-%{version}.debug /boot/vmlin
 /lib/modules/%{version}/kernel/arch/x86/oprofile/
 
 %changelog
-* Tue May 24 2016 Priyesh Padmavilasom <ppadmavilasom@vmware.com> 4.4.8-5
-- GA - Bump release of all rpms
-* Mon May 23 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-4
-- Fixed generation of debug symbols for kernel modules & vmlinux.
+* Thu May 26 2016 Alexey Makhalov <amakhalov@vmware.com> 4.4.8-6
+- patch: REVERT-sched-fair-Beef-up-wake_wide.patch
+* Tue May 24 2016 Priyesh Padmavilasom <ppadmavilasom@vmware.com> 4.4.8-5
+- GA - Bump release of all rpms
+* Mon May 23 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-4
+- Fixed generation of debug symbols for kernel modules & vmlinux.
 * Mon May 23 2016 Divya Thaluru <dthaluru@vmware.com> 4.4.8-3
 - Added patches to fix CVE-2016-3134, CVE-2016-3135
 * Wed May 18 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-2