Change-Id: Ib1e852a5aefb8ee02f885a5c503ef2a0821aa178
Reviewed-on: http://photon-jenkins.eng.vmware.com/1025
Reviewed-by: Sharath George
Tested-by: Sharath George
(cherry picked from commit 56a89e73bc36f9130daa02c88425d5ff21bd8f73)
Reviewed-on: http://photon-jenkins.eng.vmware.com/1035
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,174 @@ |
0 |
+From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001 |
|
1 |
+From: Mike Galbraith <umgwanakikbuti@gmail.com> |
|
2 |
+Date: Tue, 14 Jul 2015 17:39:50 +0200 |
|
3 |
+Subject: [PATCH] sched/fair: Beef up wake_wide() |
|
4 |
+ |
|
5 |
+Josef Bacik reported that Facebook sees better performance with their |
|
6 |
+1:N load (1 dispatch/node, N workers/node) when carrying an old patch |
|
7 |
+to try very hard to wake to an idle CPU. While looking at wake_wide(), |
|
8 |
+I noticed that it doesn't pay attention to the wakeup of a many partner |
|
9 |
+waker, returning 1 only when waking one of its many partners. |
|
10 |
+ |
|
11 |
+Correct that, letting explicit domain flags override the heuristic. |
|
12 |
+ |
|
13 |
+While at it, adjust task_struct bits, we don't need a 64-bit counter. |
|
14 |
+ |
|
15 |
+Tested-by: Josef Bacik <jbacik@fb.com> |
|
16 |
+Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> |
|
17 |
+[ Tidy things up. ] |
|
18 |
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
|
19 |
+Cc: Linus Torvalds <torvalds@linux-foundation.org> |
|
20 |
+Cc: Mike Galbraith <efault@gmx.de> |
|
21 |
+Cc: Peter Zijlstra <peterz@infradead.org> |
|
22 |
+Cc: Thomas Gleixner <tglx@linutronix.de> |
|
23 |
+Cc: kernel-team<Kernel-team@fb.com> |
|
24 |
+Cc: morten.rasmussen@arm.com |
|
25 |
+Cc: riel@redhat.com |
|
26 |
+Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com |
|
27 |
+Signed-off-by: Ingo Molnar <mingo@kernel.org> |
|
28 |
+--- |
|
29 |
+ include/linux/sched.h | 4 +-- |
|
30 |
+ kernel/sched/fair.c | 67 ++++++++++++++++++++++++++------------------------- |
|
31 |
+ 2 files changed, 36 insertions(+), 35 deletions(-) |
|
32 |
+ |
|
33 |
+diff --git b/include/linux/sched.h a/include/linux/sched.h |
|
34 |
+index 65a8a86..7412070 100644 |
|
35 |
+--- b/include/linux/sched.h |
|
36 |
+@@ -1359,9 +1359,9 @@ struct task_struct { |
|
37 |
+ #ifdef CONFIG_SMP |
|
38 |
+ struct llist_node wake_entry; |
|
39 |
+ int on_cpu; |
|
40 |
+- unsigned int wakee_flips; |
|
41 |
+- unsigned long wakee_flip_decay_ts; |
|
42 |
+ struct task_struct *last_wakee; |
|
43 |
++ unsigned long wakee_flips; |
|
44 |
++ unsigned long wakee_flip_decay_ts; |
|
45 |
+ |
|
46 |
+ int wake_cpu; |
|
47 |
+ #endif |
|
48 |
+diff --git b/kernel/sched/fair.c a/kernel/sched/fair.c |
|
49 |
+index ea23f9f..8b384b8d 100644 |
|
50 |
+--- b/kernel/sched/fair.c |
|
51 |
+@@ -4726,29 +4726,26 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
|
52 |
+ |
|
53 |
+ #endif |
|
54 |
+ |
|
55 |
+-/* |
|
56 |
+- * Detect M:N waker/wakee relationships via a switching-frequency heuristic. |
|
57 |
+- * A waker of many should wake a different task than the one last awakened |
|
58 |
+- * at a frequency roughly N times higher than one of its wakees. In order |
|
59 |
+- * to determine whether we should let the load spread vs consolodating to |
|
60 |
+- * shared cache, we look for a minimum 'flip' frequency of llc_size in one |
|
61 |
+- * partner, and a factor of lls_size higher frequency in the other. With |
|
62 |
+- * both conditions met, we can be relatively sure that the relationship is |
|
63 |
+- * non-monogamous, with partner count exceeding socket size. Waker/wakee |
|
64 |
+- * being client/server, worker/dispatcher, interrupt source or whatever is |
|
65 |
+- * irrelevant, spread criteria is apparent partner count exceeds socket size. |
|
66 |
+- */ |
|
67 |
+ static int wake_wide(struct task_struct *p) |
|
68 |
+ { |
|
69 |
+- unsigned int master = current->wakee_flips; |
|
70 |
+- unsigned int slave = p->wakee_flips; |
|
71 |
+ int factor = this_cpu_read(sd_llc_size); |
|
72 |
+ |
|
73 |
+- if (master < slave) |
|
74 |
+- swap(master, slave); |
|
75 |
+- if (slave < factor || master < slave * factor) |
|
76 |
+- return 0; |
|
77 |
+- return 1; |
|
78 |
++ /* |
|
79 |
++ * Yeah, it's the switching-frequency, could means many wakee or |
|
80 |
++ * rapidly switch, use factor here will just help to automatically |
|
81 |
++ * adjust the loose-degree, so bigger node will lead to more pull. |
|
82 |
++ */ |
|
83 |
++ if (p->wakee_flips > factor) { |
|
84 |
++ /* |
|
85 |
++ * wakee is somewhat hot, it needs certain amount of cpu |
|
86 |
++ * resource, so if waker is far more hot, prefer to leave |
|
87 |
++ * it alone. |
|
88 |
++ */ |
|
89 |
++ if (current->wakee_flips > (factor * p->wakee_flips)) |
|
90 |
++ return 1; |
|
91 |
++ } |
|
92 |
++ |
|
93 |
++ return 0; |
|
94 |
+ } |
|
95 |
+ |
|
96 |
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
|
97 |
+@@ -4760,6 +4757,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
|
98 |
+ unsigned long weight; |
|
99 |
+ int balanced; |
|
100 |
+ |
|
101 |
++ /* |
|
102 |
++ * If we wake multiple tasks be careful to not bounce |
|
103 |
++ * ourselves around too much. |
|
104 |
++ */ |
|
105 |
++ if (wake_wide(p)) |
|
106 |
++ return 0; |
|
107 |
++ |
|
108 |
+ idx = sd->wake_idx; |
|
109 |
+ this_cpu = smp_processor_id(); |
|
110 |
+ prev_cpu = task_cpu(p); |
|
111 |
+@@ -5013,17 +5017,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f |
|
112 |
+ { |
|
113 |
+ struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
|
114 |
+ int cpu = smp_processor_id(); |
|
115 |
+- int new_cpu = prev_cpu; |
|
116 |
++ int new_cpu = cpu; |
|
117 |
+ int want_affine = 0; |
|
118 |
+ int sync = wake_flags & WF_SYNC; |
|
119 |
+ |
|
120 |
+ if (sd_flag & SD_BALANCE_WAKE) |
|
121 |
+- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
|
122 |
++ want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); |
|
123 |
+ |
|
124 |
+ rcu_read_lock(); |
|
125 |
+ for_each_domain(cpu, tmp) { |
|
126 |
+ if (!(tmp->flags & SD_LOAD_BALANCE)) |
|
127 |
+- break; |
|
128 |
++ continue; |
|
129 |
+ |
|
130 |
+ /* |
|
131 |
+ * If both cpu and prev_cpu are part of this domain, |
|
132 |
+@@ -5037,21 +5041,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f |
|
133 |
+ |
|
134 |
+ if (tmp->flags & sd_flag) |
|
135 |
+ sd = tmp; |
|
136 |
+- else if (!want_affine) |
|
137 |
+- break; |
|
138 |
+ } |
|
139 |
+ |
|
140 |
+- if (affine_sd) { |
|
141 |
+- sd = NULL; /* Prefer wake_affine over balance flags */ |
|
142 |
+- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
|
143 |
+- new_cpu = cpu; |
|
144 |
+- } |
|
145 |
++ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
|
146 |
++ prev_cpu = cpu; |
|
147 |
+ |
|
148 |
+- if (!sd) { |
|
149 |
+- if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
|
150 |
+- new_cpu = select_idle_sibling(p, new_cpu); |
|
151 |
++ if (sd_flag & SD_BALANCE_WAKE) { |
|
152 |
++ new_cpu = select_idle_sibling(p, prev_cpu); |
|
153 |
++ goto unlock; |
|
154 |
++ } |
|
155 |
+ |
|
156 |
+- } else while (sd) { |
|
157 |
++ while (sd) { |
|
158 |
+ struct sched_group *group; |
|
159 |
+ int weight; |
|
160 |
+ |
|
161 |
+@@ -5085,6 +5085,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f |
|
162 |
+ } |
|
163 |
+ /* while loop will break here if sd == NULL */ |
|
164 |
+ } |
|
165 |
++unlock: |
|
166 |
+ rcu_read_unlock(); |
|
167 |
+ |
|
168 |
+ return new_cpu; |
|
169 |
+-- |
|
170 |
+1.9.1 |
|
171 |
+ |
... | ... |
@@ -2,7 +2,7 @@ |
2 | 2 |
Summary: Kernel |
3 | 3 |
Name: linux |
4 | 4 |
Version: 4.4.8 |
5 |
-Release: 5%{?dist} |
|
5 |
+Release: 6%{?dist} |
|
6 | 6 |
License: GPLv2 |
7 | 7 |
URL: http://www.kernel.org/ |
8 | 8 |
Group: System Environment/Kernel |
... | ... |
@@ -22,6 +22,7 @@ Patch6: net-Driver-Vmxnet3-set-CHECKSUM_UNNECESSARY-for-IPv6-packets.pat |
22 | 22 |
Patch7: netfilter-x_tables-deal-with-bogus-nextoffset-values.patch |
23 | 23 |
#fixes CVE-2016-3135 |
24 | 24 |
Patch8: netfilter-x_tables-check-for-size-overflow.patch |
25 |
+Patch9: REVERT-sched-fair-Beef-up-wake_wide.patch |
|
25 | 26 |
BuildRequires: bc |
26 | 27 |
BuildRequires: kbd |
27 | 28 |
BuildRequires: kmod |
... | ... |
@@ -86,6 +87,7 @@ Kernel driver for oprofile, a statistical profiler for Linux systems |
86 | 86 |
%patch6 -p1 |
87 | 87 |
%patch7 -p1 |
88 | 88 |
%patch8 -p1 |
89 |
+%patch9 -p1 |
|
89 | 90 |
|
90 | 91 |
%build |
91 | 92 |
make mrproper |
... | ... |
@@ -182,10 +184,12 @@ ln -s /usr/lib/debug/lib/modules/%{version}/vmlinux-%{version}.debug /boot/vmlin |
182 | 182 |
/lib/modules/%{version}/kernel/arch/x86/oprofile/ |
183 | 183 |
|
184 | 184 |
%changelog |
185 |
-* Tue May 24 2016 Priyesh Padmavilasom <ppadmavilasom@vmware.com> 4.4.8-5 |
|
186 |
-- GA - Bump release of all rpms |
|
187 |
-* Mon May 23 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-4 |
|
188 |
-- Fixed generation of debug symbols for kernel modules & vmlinux. |
|
185 |
+* Thu May 26 2016 Alexey Makhalov <amakhalov@vmware.com> 4.4.8-6 |
|
186 |
+- Add REVERT-sched-fair-Beef-up-wake_wide.patch to revert upstream commit "sched/fair: Beef up wake_wide()" |
|
187 |
+* Tue May 24 2016 Priyesh Padmavilasom <ppadmavilasom@vmware.com> 4.4.8-5 |
|
188 |
+- GA - Bump release of all rpms |
|
189 |
+* Mon May 23 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-4 |
|
190 |
+- Fixed generation of debug symbols for kernel modules & vmlinux. |
|
189 | 191 |
* Mon May 23 2016 Divya Thaluru <dthaluru@vmware.com> 4.4.8-3 |
190 | 192 |
- Added patches to fix CVE-2016-3134, CVE-2016-3135 |
191 | 193 |
* Wed May 18 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-2 |