
linux: revert 'Beef up wake_wide' change

Change-Id: Ib1e852a5aefb8ee02f885a5c503ef2a0821aa178
Reviewed-on: http://photon-jenkins.eng.vmware.com/1025
Reviewed-by: Sharath George
Tested-by: Sharath George
(cherry picked from commit 56a89e73bc36f9130daa02c88425d5ff21bd8f73)
Reviewed-on: http://photon-jenkins.eng.vmware.com/1035

Alexey Makhalov authored on 2016/05/27 04:41:51
Showing 2 changed files

REVERT-sched-fair-Beef-up-wake_wide.patch
new file mode 100644
@@ -0,0 +1,174 @@
+From 63b0e9edceec10fa41ec33393a1515a5ff444277 Mon Sep 17 00:00:00 2001
+From: Mike Galbraith <umgwanakikbuti@gmail.com>
+Date: Tue, 14 Jul 2015 17:39:50 +0200
+Subject: [PATCH] sched/fair: Beef up wake_wide()
+
+Josef Bacik reported that Facebook sees better performance with their
+1:N load (1 dispatch/node, N workers/node) when carrying an old patch
+to try very hard to wake to an idle CPU.  While looking at wake_wide(),
+I noticed that it doesn't pay attention to the wakeup of a many partner
+waker, returning 1 only when waking one of its many partners.
+
+Correct that, letting explicit domain flags override the heuristic.
+
+While at it, adjust task_struct bits, we don't need a 64-bit counter.
+
+Tested-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
+[ Tidy things up. ]
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: kernel-team<Kernel-team@fb.com>
+Cc: morten.rasmussen@arm.com
+Cc: riel@redhat.com
+Link: http://lkml.kernel.org/r/1436888390.7983.49.camel@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ include/linux/sched.h |  4 +--
+ kernel/sched/fair.c   | 67 ++++++++++++++++++++++++++-------------------------
+ 2 files changed, 36 insertions(+), 35 deletions(-)
+
+diff --git b/include/linux/sched.h a/include/linux/sched.h
+index 65a8a86..7412070 100644
+--- b/include/linux/sched.h
++++ a/include/linux/sched.h
+@@ -1359,9 +1359,9 @@ struct task_struct {
+ #ifdef CONFIG_SMP
+ 	struct llist_node wake_entry;
+ 	int on_cpu;
+-	unsigned int wakee_flips;
+-	unsigned long wakee_flip_decay_ts;
+ 	struct task_struct *last_wakee;
++	unsigned long wakee_flips;
++	unsigned long wakee_flip_decay_ts;
+ 
+ 	int wake_cpu;
+ #endif
+diff --git b/kernel/sched/fair.c a/kernel/sched/fair.c
+index ea23f9f..8b384b8d 100644
+--- b/kernel/sched/fair.c
++++ a/kernel/sched/fair.c
+@@ -4726,29 +4726,26 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+ 
+ #endif
+ 
+-/*
+- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+- * A waker of many should wake a different task than the one last awakened
+- * at a frequency roughly N times higher than one of its wakees.  In order
+- * to determine whether we should let the load spread vs consolodating to
+- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+- * partner, and a factor of lls_size higher frequency in the other.  With
+- * both conditions met, we can be relatively sure that the relationship is
+- * non-monogamous, with partner count exceeding socket size.  Waker/wakee
+- * being client/server, worker/dispatcher, interrupt source or whatever is
+- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+- */
+ static int wake_wide(struct task_struct *p)
+ {
+-	unsigned int master = current->wakee_flips;
+-	unsigned int slave = p->wakee_flips;
+ 	int factor = this_cpu_read(sd_llc_size);
+ 
+-	if (master < slave)
+-		swap(master, slave);
+-	if (slave < factor || master < slave * factor)
+-		return 0;
+-	return 1;
++	/*
++	 * Yeah, it's the switching-frequency, could means many wakee or
++	 * rapidly switch, use factor here will just help to automatically
++	 * adjust the loose-degree, so bigger node will lead to more pull.
++	 */
++	if (p->wakee_flips > factor) {
++		/*
++		 * wakee is somewhat hot, it needs certain amount of cpu
++		 * resource, so if waker is far more hot, prefer to leave
++		 * it alone.
++		 */
++		if (current->wakee_flips > (factor * p->wakee_flips))
++			return 1;
++	}
++
++	return 0;
+ }
+ 
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+@@ -4760,6 +4757,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+ 	unsigned long weight;
+ 	int balanced;
+ 
++	/*
++	 * If we wake multiple tasks be careful to not bounce
++	 * ourselves around too much.
++	 */
++	if (wake_wide(p))
++		return 0;
++
+ 	idx	  = sd->wake_idx;
+ 	this_cpu  = smp_processor_id();
+ 	prev_cpu  = task_cpu(p);
+@@ -5013,17 +5017,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
+ {
+ 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ 	int cpu = smp_processor_id();
+-	int new_cpu = prev_cpu;
++	int new_cpu = cpu;
+ 	int want_affine = 0;
+ 	int sync = wake_flags & WF_SYNC;
+ 
+ 	if (sd_flag & SD_BALANCE_WAKE)
+-		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
++		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ 
+ 	rcu_read_lock();
+ 	for_each_domain(cpu, tmp) {
+ 		if (!(tmp->flags & SD_LOAD_BALANCE))
+-			break;
++			continue;
+ 
+ 		/*
+ 		 * If both cpu and prev_cpu are part of this domain,
+@@ -5037,21 +5041,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
+ 
+ 		if (tmp->flags & sd_flag)
+ 			sd = tmp;
+-		else if (!want_affine)
+-			break;
+ 	}
+ 
+-	if (affine_sd) {
+-		sd = NULL; /* Prefer wake_affine over balance flags */
+-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+-			new_cpu = cpu;
+-	}
++	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
++		prev_cpu = cpu;
+ 
+-	if (!sd) {
+-		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+-			new_cpu = select_idle_sibling(p, new_cpu);
++	if (sd_flag & SD_BALANCE_WAKE) {
++		new_cpu = select_idle_sibling(p, prev_cpu);
++		goto unlock;
++	}
+ 
+-	} else while (sd) {
++	while (sd) {
+ 		struct sched_group *group;
+ 		int weight;
+ 
+@@ -5085,6 +5085,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
+ 		}
+ 		/* while loop will break here if sd == NULL */
+ 	}
++unlock:
+ 	rcu_read_unlock();
+ 
+ 	return new_cpu;
+-- 
+1.9.1
+
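
For context on what this revert trades away: the mainline hunk above decides
"wide or not" by comparing the flip counts of both wakeup partners, while the
code being restored only looks from the waker's side. The following is a
minimal stand-alone C sketch -- hypothetical user-space code, not kernel
source -- in which llc_size stands in for this_cpu_read(sd_llc_size) and the
two functions mirror the two heuristics:

/*
 * Illustrative sketch only. wake_wide_new() mirrors the mainline
 * heuristic this patch reverts; wake_wide_old() mirrors the one it
 * restores. llc_size plays the role of this_cpu_read(sd_llc_size),
 * the number of CPUs sharing a last-level cache.
 */
#include <stdio.h>

static int wake_wide_new(unsigned int waker_flips, unsigned int wakee_flips,
			 unsigned int llc_size)
{
	unsigned int master = waker_flips, slave = wakee_flips;

	if (master < slave) {		/* consider the pair from either side */
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	/* go wide only when both partners flip often enough */
	if (slave < llc_size || master < slave * llc_size)
		return 0;
	return 1;
}

static int wake_wide_old(unsigned int waker_flips, unsigned int wakee_flips,
			 unsigned int llc_size)
{
	/* only a waker far "hotter" than its wakee goes wide */
	if (wakee_flips > llc_size && waker_flips > llc_size * wakee_flips)
		return 1;
	return 0;
}

int main(void)
{
	/* sample {waker_flips, wakee_flips} pairs */
	unsigned int samples[][2] = { {100, 4}, {800, 10}, {10, 800} };
	unsigned int llc_size = 8;	/* e.g. an 8-CPU last-level cache */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("waker_flips=%3u wakee_flips=%3u -> new:%d old:%d\n",
		       samples[i][0], samples[i][1],
		       wake_wide_new(samples[i][0], samples[i][1], llc_size),
		       wake_wide_old(samples[i][0], samples[i][1], llc_size));
	return 0;
}

With llc_size = 8 this prints new:0 old:0, new:1 old:1, and new:1 old:0 for
the three samples. The last row ({10, 800}, i.e. waking a many-partner waker)
is exactly the case the "Beef up wake_wide()" commit added, and the behavior
this Photon change deliberately gives back up.
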
linux.spec
@@ -2,7 +2,7 @@
 Summary:        Kernel
 Name:           linux
 Version:    	4.4.8
-Release:    	5%{?dist}
+Release:    	6%{?dist}
 License:    	GPLv2
 URL:        	http://www.kernel.org/
 Group:        	System Environment/Kernel
@@ -22,6 +22,7 @@ Patch6:         net-Driver-Vmxnet3-set-CHECKSUM_UNNECESSARY-for-IPv6-packets.pat
 Patch7:		netfilter-x_tables-deal-with-bogus-nextoffset-values.patch
 #fixes CVE-2016-3135
 Patch8:		netfilter-x_tables-check-for-size-overflow.patch
+Patch9:		REVERT-sched-fair-Beef-up-wake_wide.patch
 BuildRequires:  bc
 BuildRequires:  kbd
 BuildRequires:  kmod
@@ -86,6 +87,7 @@ Kernel driver for oprofile, a statistical profiler for Linux systems
 %patch6 -p1
 %patch7 -p1
 %patch8 -p1
+%patch9 -p1
 
 %build
 make mrproper
@@ -182,10 +184,12 @@ ln -s /usr/lib/debug/lib/modules/%{version}/vmlinux-%{version}.debug /boot/vmlin
 /lib/modules/%{version}/kernel/arch/x86/oprofile/
 
 %changelog
-*	Tue May 24 2016 Priyesh Padmavilasom <ppadmavilasom@vmware.com> 4.4.8-5
--	GA - Bump release of all rpms
-*	Mon May 23 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-4
--	Fixed generation of debug symbols for kernel modules & vmlinux.
+*   Thu May 26 2016 Alexey Makhalov <amakhalov@vmware.com> 4.4.8-6
+-   patch: REVERT-sched-fair-Beef-up-wake_wide.patch
+*   Tue May 24 2016 Priyesh Padmavilasom <ppadmavilasom@vmware.com> 4.4.8-5
+-   GA - Bump release of all rpms
+*   Mon May 23 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-4
+-   Fixed generation of debug symbols for kernel modules & vmlinux.
 *   Mon May 23 2016 Divya Thaluru <dthaluru@vmware.com> 4.4.8-3
 -   Added patches to fix CVE-2016-3134, CVE-2016-3135
 *   Wed May 18 2016 Harish Udaiya Kumar <hudaiyakumar@vmware.com> 4.4.8-2