linux-rt: Fix issues in Guest timer Advancement feature

The code to disable Guest timer Advancement broke on some systems
due to compiler optimizations. Fixed that by adding memory barriers.
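
As a minimal sketch, the barrier fix follows this pattern (mirroring
the timer_padding_is_enabled()/set_timer_padding() accessors that the
new patch adds to kernel/time/clockevents.c):

    /*
     * Without the barriers, the compiler is free to cache or hoist the
     * read of timer_padding_enabled, so a later set_timer_padding(false)
     * may never be observed on the timer paths.
     */
    static bool timer_padding_enabled __read_mostly = true;

    inline bool timer_padding_is_enabled(void)
    {
            smp_rmb();      /* pairs with the smp_wmb() in set_timer_padding() */
            return timer_padding_enabled;
    }

    inline void set_timer_padding(bool enabled)
    {
            timer_padding_enabled = enabled;
            smp_wmb();      /* publish the flag update before later reads */
    }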

With the current design, Guest timer advancement does not work on
CPUs that lack the TSC_DEADLINE_TIMER feature, so we disable the
feature in that case.
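
For reference, the disabling happens where the LAPIC clockevent is
registered without TSC-deadline mode; a sketch of the relevant
setup_APIC_timer() branch from the new patch (the surrounding
this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) condition is the standard
upstream check, quoted here as context, not part of this change):

    if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
            /* deadline-mode clockevent setup elided */
            clockevents_config_and_register(levt,
                                            tsc_khz * (1000 / TSC_DIVISOR),
                                            0xF, ~0UL);
    } else {
            clockevents_register_device(levt);
            /* No TSC deadline timer: last_programmed_time_tsc is never
             * written, so the overhead measurement cannot work. */
            set_timer_padding(false);
    }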

We are also reducing the severity and frequency of a log message in
hrtimer.c. We were printing a message every time we spun for more
than 1us. Spinning for 1us does not cause any serious issues, but the
message floods the dmesg output, so we are raising the threshold for
it to 5us. Spinning for 5us is unlikely, but it can cause latency
issues in RT systems if a CPU is unavailable for that long.
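
Concretely, the first logging site in hrtimer_interrupt() changes from
an unconditional pr_warn() above 1000 ns of spinning to a rate-limited
info message above 5000 ns (taken from the hunk in the new patch; the
second site similarly drops from pr_warn() to pr_warn_once()):

    if (timer_padding_is_enabled() && timer_padding_spin > 5000)
            pr_info_ratelimited("hrtimer: timer padding spent %llu ns spinning\n",
                                ktime_to_ns(timer_padding_spin));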

Change-Id: Ie23dbab5ec74bc3d922215711beda7259575b098
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/20535
Reviewed-by: Srivatsa S. Bhat <srivatsab@vmware.com>
Tested-by: gerrit-photon <photon-checkins@vmware.com>

Him Kalyan Bordoloi authored on 2023/04/14 17:40:38
Showing 3 changed files
@@ -16,7 +16,7 @@
 Summary:        Kernel
 Name:           linux-rt
 Version:        6.1.10
-Release:        8%{?kat_build:.kat}%{?dist}
+Release:        9%{?kat_build:.kat}%{?dist}
 License:        GPLv2
 URL:            http://www.kernel.org
 Group:          System Environment/Kernel
@@ -177,7 +177,7 @@ Patch701: 6.0-sched-rt-RT_RUNTIME_GREED-sched-feature.patch
 Patch714: 0001-Allow-tick-sched-timer-to-be-turned-off-in-idle-poll.patch
 
 #Patch to add timer padding on guest
-Patch716: 6.0-timer-padding-on-guest.patch
+Patch716: Guest-timer-Advancement-Feature.patch
 
 # Fix for a latency issue related to ktimer thread wakeup:
 Patch717: softirq-wake-up-ktimer-thread-in-softirq-context.patch
@@ -503,6 +503,8 @@ ln -sf linux-%{uname_r}.cfg /boot/photon.cfg
 %{_usrsrc}/linux-headers-%{uname_r}
 
 %changelog
+* Fri Apr 14 2023 Him Kalyan Bordoloi <bordoloih@vmware.com> 6.1.10-9
+- Update Guest timer advancement feature
 * Fri Mar 31 2023 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 6.1.10-8
 - Expose Photon kernel macros to simplify building out-of-tree drivers.
 * Thu Mar 30 2023 Brennan Lamoreaux <blamoreaux@vmware.com> 6.1.10-7
deleted file mode 100644
@@ -1,345 +0,0 @@
-From 22eb1efba48df1568adc5f765557b6e7269d247e Mon Sep 17 00:00:00 2001
-From: Him Kalyan Bordoloi <bordoloih@vmware.com>
-Date: Mon, 6 Jun 2022 20:32:24 +0000
-Subject: [PATCH] timer padding on guest
-
-This is an optimization to hide the virtualization cost of timer
-interrupts.
-
-The idea is that if we have a predictable overhead, we can prepone the
-timer by the said overhead. As a result we would receive the interrupt
-in the guest at the same time as baremetal, giving the impression that
-there is no virtualization overhead.
-
-Next we have to determine what value to prepone the timer by.  One of
-the fundamental guarantees of timer interrupts is that a timer's
-callback will never be invoked before its configured time.  If the
-timer is preponed, it needs to be ensured that this guarantee is
-satisfied.  In order to do that, the interrupt handler needs to spin
-in the guest, in case the timer is received earlier than intended.
-This is not a desirable scenario, especially in a real-time system, as
-this will be hogging the CPU from the workload, with interrupts
-disabled.
-
-When we traced the observed overhead from the guest for timer
-interrupts, we found that in about 99.9% of the instances the overhead
-was within a range of about 400ns, from approx. 1200ns to 1600ns.
-
-So we decided to use the minimum observed overhead since boot time to
-prepone the timer.  The miniumum observed overhead is continuously
-monitored during runtime.  With this option, we are minimizing the
-need to spin while also not giving up too much of the possible latency
-gains.
-
-Signed-off-by: Him Kalyan Bordoloi <bordoloih@vmware.com>
-Signed-off-by: Keerthana K <keerthanak@vmware.com>
-Signed-off-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
----
- arch/x86/kernel/apic/apic.c | 22 +++++++++++++-
- include/linux/clockchips.h  | 22 ++++++++++++++
- include/linux/hrtimer.h     |  3 ++
- kernel/time/clockevents.c   | 59 +++++++++++++++++++++++++++++++++++++
- kernel/time/hrtimer.c       | 41 ++++++++++++++++++++++++++
- 5 files changed, 146 insertions(+), 1 deletion(-)
-
-diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
-index c6876d3ea4b1..d19f219c304c 100644
---- a/arch/x86/kernel/apic/apic.c
-+++ b/arch/x86/kernel/apic/apic.c
-@@ -35,6 +35,7 @@
- #include <linux/dmi.h>
- #include <linux/smp.h>
- #include <linux/mm.h>
-+#include <linux/hrtimer.h>
- 
- #include <asm/trace/irq_vectors.h>
- #include <asm/irq_remapping.h>
-@@ -470,6 +471,22 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
- }
- EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
- 
-+/*
-+ * Function to convert time delta from tsc to ns. It will call clockevent_delta2ns,
-+ * which takes unsigned long value as input. Since tsc is a u64 value, in a 32 bit system
-+ * this can lead to data loss. So this function is restricted to x86_64 systems only.
-+ */
-+unsigned long long tsc_delta2ns(unsigned long delta,
-+				struct clock_event_device *evt)
-+{
-+#ifdef CONFIG_X86_64
-+	return clockevent_delta2ns(delta / TSC_DIVISOR, evt);
-+#else
-+	return 0;
-+#endif
-+}
-+EXPORT_SYMBOL_GPL(tsc_delta2ns);
-+
- /*
-  * Program the next event, relative to now
-  */
-@@ -484,12 +501,15 @@ static int lapic_next_deadline(unsigned long delta,
- 			       struct clock_event_device *evt)
- {
- 	u64 tsc;
-+	u64 deadline;
- 
- 	/* This MSR is special and need a special fence: */
- 	weak_wrmsr_fence();
- 
- 	tsc = rdtsc();
--	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
-+	deadline = tsc + (((u64) delta) * TSC_DIVISOR);
-+	wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
-+	this_cpu_write(last_programmed_time_tsc, deadline);
- 	return 0;
- }
- 
-diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
-index 8ae9a95ebf5b..d722e0f1d09c 100644
---- a/include/linux/clockchips.h
-+++ b/include/linux/clockchips.h
-@@ -15,10 +15,31 @@
- # include <linux/cpumask.h>
- # include <linux/ktime.h>
- # include <linux/notifier.h>
-+#include <asm/hypervisor.h>
- 
- struct clock_event_device;
- struct module;
- 
-+/*
-+ * Timer padding enabled ?
-+ */
-+static bool timer_padding_enabled __read_mostly  = true;
-+
-+/*
-+ * timer_padding_is_enabled - query, if the timer padding optimization is enabled
-+ */
-+static inline int timer_padding_is_enabled(void)
-+{
-+#ifdef CONFIG_X86_64
-+	if (timer_padding_enabled != false && x86_hyper_type != X86_HYPER_VMWARE) {
-+		timer_padding_enabled = false;
-+	}
-+	return timer_padding_enabled;
-+#else
-+	return 0;
-+#endif
-+}
-+
- /*
-  * Possible states of a clock event device.
-  *
-@@ -180,6 +201,7 @@ div_sc(unsigned long ticks, unsigned long nsec, int shift)
- 
- /* Clock event layer functions */
- extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
-+extern unsigned long long tsc_delta2ns(unsigned long delta, struct clock_event_device *evt);
- extern void clockevents_register_device(struct clock_event_device *dev);
- extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
- 
-diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
-index 0ee140176f10..c56d19fc075f 100644
---- a/include/linux/hrtimer.h
-+++ b/include/linux/hrtimer.h
-@@ -316,6 +316,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
- #ifdef CONFIG_HIGH_RES_TIMERS
- struct clock_event_device;
- 
-+DECLARE_PER_CPU(unsigned long long, min_overhead_tsc);
-+DECLARE_PER_CPU(unsigned long long, last_programmed_time_tsc);
-+
- extern void hrtimer_interrupt(struct clock_event_device *dev);
- 
- extern unsigned int hrtimer_resolution;
-diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
-index 5d85014d59b5..8690dd7a5179 100644
---- a/kernel/time/clockevents.c
-+++ b/kernel/time/clockevents.c
-@@ -29,6 +29,25 @@ struct ce_unbind {
- 	int res;
- };
- 
-+/*
-+ * Enable / Disable timer padding optimization
-+ */
-+static int __init setup_timer_padding(char *str)
-+{
-+#ifdef CONFIG_X86_64
-+	if (x86_hyper_type != X86_HYPER_VMWARE) {
-+		timer_padding_enabled = false;
-+		return 0;
-+	}
-+	return (kstrtobool(str, &timer_padding_enabled) == 0);
-+#else
-+	timer_padding_enabled = false;
-+	return 0;
-+#endif
-+}
-+
-+__setup("timerpadding=", setup_timer_padding);
-+
- static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
- 			bool ismax)
- {
-@@ -306,6 +325,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
- 	unsigned long long clc;
- 	int64_t delta;
- 	int rc;
-+	unsigned long long min_overhead_ns = 0;
- 
- 	if (WARN_ON_ONCE(expires < 0))
- 		return -ETIME;
-@@ -324,9 +344,20 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
- 		return dev->set_next_ktime(expires, dev);
- 
- 	delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
-+
- 	if (delta <= 0)
- 		return force ? clockevents_program_min_delta(dev) : -ETIME;
- 
-+	if (timer_padding_is_enabled()) {
-+		min_overhead_ns = tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev);
-+		/*
-+		 * min_overhead_ns <= 1000 is not reliable
-+		 * tsc_delta2ns only returns values greater than 1us reliably
-+		 */
-+		if (min_overhead_ns > 1000 && delta > min_overhead_ns)
-+			delta = delta - min_overhead_ns;
-+	}
-+
- 	delta = min(delta, (int64_t) dev->max_delta_ns);
- 	delta = max(delta, (int64_t) dev->min_delta_ns);
- 
-@@ -667,6 +698,32 @@ static struct bus_type clockevents_subsys = {
- static DEFINE_PER_CPU(struct device, tick_percpu_dev);
- static struct tick_device *tick_get_tick_dev(struct device *dev);
- 
-+static ssize_t sysfs_show_timer_padding_ns(struct device *dev,
-+					   struct device_attribute *attr,
-+					   char *buf)
-+{
-+	struct tick_device *td;
-+	ssize_t count = 0;
-+	int cpu = 0;
-+	unsigned long long min_overhead_ns = 0;
-+
-+	if (!timer_padding_is_enabled())
-+		return 0;
-+	raw_spin_lock_irq(&clockevents_lock);
-+	td = tick_get_tick_dev(dev);
-+	if (td && td->evtdev) {
-+		if (cpumask_weight(td->evtdev->cpumask) == 1) {
-+			cpu = cpumask_first(td->evtdev->cpumask);
-+			if (per_cpu(min_overhead_tsc, cpu) != ULONG_MAX)
-+				min_overhead_ns = tsc_delta2ns(per_cpu(min_overhead_tsc, cpu), td->evtdev);
-+			count = snprintf(buf, PAGE_SIZE, "%lld\n", min_overhead_ns);
-+		}
-+	}
-+	raw_spin_unlock_irq(&clockevents_lock);
-+	return count;
-+}
-+static DEVICE_ATTR(timer_padding_ns, 0444, sysfs_show_timer_padding_ns, NULL);
-+
- static ssize_t current_device_show(struct device *dev,
- 				   struct device_attribute *attr,
- 				   char *buf)
-@@ -760,6 +817,8 @@ static int __init tick_init_sysfs(void)
- 			err = device_create_file(dev, &dev_attr_current_device);
- 		if (!err)
- 			err = device_create_file(dev, &dev_attr_unbind_device);
-+		if (!err && timer_padding_is_enabled())
-+			err = device_create_file(dev, &dev_attr_timer_padding_ns);
- 		if (err)
- 			return err;
- 	}
-diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
-index 24b353cf31d3..d4c10b94f374 100644
---- a/kernel/time/hrtimer.c
-+++ b/kernel/time/hrtimer.c
-@@ -41,6 +41,7 @@
- #include <linux/timer.h>
- #include <linux/freezer.h>
- #include <linux/compat.h>
-+#include <linux/delay.h>
- 
- #include <linux/uaccess.h>
- 
-@@ -1775,6 +1776,9 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
- 
- #ifdef CONFIG_HIGH_RES_TIMERS
- 
-+DEFINE_PER_CPU(unsigned long long, min_overhead_tsc) = ULONG_MAX;
-+DEFINE_PER_CPU(unsigned long long, last_programmed_time_tsc) = 0;
-+
- /*
-  * High resolution timer interrupt
-  * Called with interrupts disabled
-@@ -1785,6 +1789,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
- 	ktime_t expires_next, now, entry_time, delta;
- 	unsigned long flags;
- 	int retries = 0;
-+	long long current_overhead = 0;
-+	unsigned long long tsc_now = 0;
-+	ktime_t early = 0;
-+	ktime_t timer_padding_spin = 0;
- 
- 	BUG_ON(!cpu_base->hres_active);
- 	cpu_base->nr_events++;
-@@ -1794,6 +1802,35 @@ void hrtimer_interrupt(struct clock_event_device *dev)
- 	entry_time = now = hrtimer_update_base(cpu_base);
- retry:
- 	cpu_base->in_hrtirq = 1;
-+
-+	tsc_now = rdtsc();
-+	current_overhead = tsc_now - this_cpu_read(last_programmed_time_tsc);
-+
-+	if (timer_padding_is_enabled() && current_overhead > 0 && cpu_base->next_timer
-+	    && ktime_before(now, cpu_base->expires_next)) {
-+		early = ktime_sub(cpu_base->expires_next, now);
-+		while (early > 0) {
-+			/*
-+			 * We pad/prepone the timer by the value of min_overhead_tsc.
-+			 * That means we cannot arrive here earlier than the expected timer fire by
-+			 * more than min_overhead_tsc, even with no overhead
-+			 */
-+			if (ktime_to_ns(early) > tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev))
-+				break;
-+			timer_padding_spin = ktime_add(timer_padding_spin, early);
-+			ndelay(early);
-+			now = hrtimer_update_base(cpu_base);
-+			if (!ktime_before(now, cpu_base->expires_next)) {
-+				early = 0;
-+				break;
-+			} else
-+				early = ktime_sub(cpu_base->expires_next, now);
-+		}
-+	}
-+	if (current_overhead > 0 && current_overhead < this_cpu_read(min_overhead_tsc)) {
-+		this_cpu_write(min_overhead_tsc, current_overhead);
-+	}
-+
- 	/*
- 	 * We set expires_next to KTIME_MAX here with cpu_base->lock
- 	 * held to prevent that a timer is enqueued in our queue via
-@@ -1825,6 +1862,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
- 	if (expires_next == KTIME_MAX ||
- 	    !tick_program_event(expires_next, 0)) {
- 		cpu_base->hang_detected = 0;
-+		if (timer_padding_spin > 1000 && timer_padding_is_enabled())
-+			pr_warn("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
- 		return;
- 	}
- 
-@@ -1868,6 +1907,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
- 	else
- 		expires_next = ktime_add(now, delta);
- 	tick_program_event(expires_next, 1);
-+	if (timer_padding_is_enabled())
-+		pr_warn("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
- 	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
- }
- 
--- 
-2.25.1
-
new file mode 100644
@@ -0,0 +1,374 @@
+From 4e6182c55e7195a1efc4a0106e7ed2fd53dbc7ba Mon Sep 17 00:00:00 2001
+From: Him Kalyan Bordoloi <bordoloih@vmware.com>
+Date: Fri, 14 Apr 2023 05:48:32 +0000
+Subject: [PATCH] Guest timer Advancement Feature
+
+V1:
+
+This is an optimization to hide the virtualization cost of timer interrupts.
+
+The idea is that if we have a predictable overhead, we can prepone the timer by
+the said overhead. As a result we would receive the interrupt in the guest at the
+same time as baremetal, giving the impression that there is no virtualization
+overhead.
+
+Next we have to determine what value to prepone the timer by.
+One of the fundamental guarantees of timer interrupts is that a timer's callback
+will never be invoked before its configured time.
+If the timer is preponed, it needs to be ensured that this guarantee is satisfied.
+In order to do that, the interrupt handler needs to spin in the guest,
+in case the timer is received earlier than intended. This is not a desirable scenario,
+especially in a real-time system, as this will be hogging the CPU from the workload,
+with interrupts disabled.
+
+When we traced the observed overhead from the guest for timer interrupts,
+we found that in about 99.9% of the instances the overhead was within a range
+of about 400ns, from approx. 1200ns to 1600ns.
+
+So we decided to use the minimum observed overhead since boot time to prepone the timer.
+The miniumum observed overhead is continuously monitored during runtime.
+With this option, we are minimizing the need to spin while also not giving up too much of the
+possible latency gains.
+
+V2:
+
+Code to disable Guest timer Advancement broke in some systems
+due to compiler optimizations. Fixed that with memory barriers.
+
+Guest timer advancement will not work in CPUs that do not
+support TSC_DEADLINE_TIMER feature with current design. So
+we disable the feature if this feature is not supported.
+
+We are also reducing the severity and frequency of a log in
+hrtimer.c
+We were printing a log every time we spin for > 1us. 1us spent
+spinning does not cause any serious issues, but this message is
+flooding dmesg output.
+We are increasing the threshold for this message to 5us.
+Spinning for 5us is unlikely, but can cause latency issues in RT
+systems if a CPU is unavailable for that much time.
+---
+ arch/x86/kernel/apic/apic.c | 27 +++++++++++-
+ include/linux/clockchips.h  |  3 ++
+ include/linux/hrtimer.h     |  3 ++
+ kernel/time/clockevents.c   | 83 +++++++++++++++++++++++++++++++++++++
+ kernel/time/hrtimer.c       | 42 +++++++++++++++++++
+ 5 files changed, 156 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
+index 20d9a604da7c..b4e11cf87992 100644
+--- a/arch/x86/kernel/apic/apic.c
++++ b/arch/x86/kernel/apic/apic.c
+@@ -35,6 +35,7 @@
+ #include <linux/dmi.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/hrtimer.h>
+ 
+ #include <asm/trace/irq_vectors.h>
+ #include <asm/irq_remapping.h>
+@@ -470,6 +471,20 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
+ }
+ EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
+ 
++/*
++ * Function to convert time delta from tsc to ns. It will call clockevent_delta2ns,
++ * which takes unsigned long value as input. Since tsc is a u64 value, in a 32 bit system
++ * this can lead to data loss. So this function is restricted to x86_64 systems only.
++ */
++u64 tsc_delta2ns(u64 delta, struct clock_event_device *evt)
++{
++#ifdef CONFIG_X86_64
++	return clockevent_delta2ns((unsigned long)delta / TSC_DIVISOR, evt);
++#else
++	return 0;
++#endif
++}
++
+ /*
+  * Program the next event, relative to now
+  */
+@@ -484,12 +499,18 @@ static int lapic_next_deadline(unsigned long delta,
+ 			       struct clock_event_device *evt)
+ {
+ 	u64 tsc;
++	u64 deadline;
+ 
+ 	/* This MSR is special and need a special fence: */
+ 	weak_wrmsr_fence();
+ 
+ 	tsc = rdtsc();
+-	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
++	if (timer_padding_is_enabled()) {
++		deadline = tsc + (((u64) delta) * TSC_DIVISOR);
++		wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
++		this_cpu_write(last_programmed_time_tsc, deadline);
++	} else
++		wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ 	return 0;
+ }
+ 
+@@ -641,8 +662,10 @@ static void setup_APIC_timer(void)
+ 		clockevents_config_and_register(levt,
+ 						tsc_khz * (1000 / TSC_DIVISOR),
+ 						0xF, ~0UL);
+-	} else
++	} else {
+ 		clockevents_register_device(levt);
++		set_timer_padding(false);
++	}
+ }
+ 
+ /*
+diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
+index 8ae9a95ebf5b..2bffa549ed55 100644
+--- a/include/linux/clockchips.h
++++ b/include/linux/clockchips.h
+@@ -179,7 +179,10 @@ div_sc(unsigned long ticks, unsigned long nsec, int shift)
+ }
+ 
+ /* Clock event layer functions */
++extern inline bool timer_padding_is_enabled(void);
++extern inline void set_timer_padding(bool enabled);
+ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
++extern u64 tsc_delta2ns(u64 delta, struct clock_event_device *evt);
+ extern void clockevents_register_device(struct clock_event_device *dev);
+ extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
+ 
+diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
+index 0ee140176f10..7d5eff8fd605 100644
+--- a/include/linux/hrtimer.h
++++ b/include/linux/hrtimer.h
+@@ -316,6 +316,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ struct clock_event_device;
+ 
++DECLARE_PER_CPU(u64, min_overhead_tsc);
++DECLARE_PER_CPU(u64, last_programmed_time_tsc);
++
+ extern void hrtimer_interrupt(struct clock_event_device *dev);
+ 
+ extern unsigned int hrtimer_resolution;
+diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
+index 5d85014d59b5..c58e3f064517 100644
+--- a/kernel/time/clockevents.c
++++ b/kernel/time/clockevents.c
+@@ -13,6 +13,7 @@
+ #include <linux/module.h>
+ #include <linux/smp.h>
+ #include <linux/device.h>
++#include <asm/hypervisor.h>
+ 
+ #include "tick-internal.h"
+ 
+@@ -29,6 +30,44 @@ struct ce_unbind {
+ 	int res;
+ };
+ 
++/*
++ * Timer padding enabled ?
++ */
++static bool timer_padding_enabled __read_mostly = true;
++/*
++ * timer_padding_is_enabled - query, if the timer padding optimization is enabled
++ */
++inline bool timer_padding_is_enabled(void)
++{
++        smp_rmb();
++        return timer_padding_enabled;
++}
++
++inline void set_timer_padding(bool enabled)
++{
++        timer_padding_enabled = enabled;
++        smp_wmb();
++}
++
++/*
++ * Enable / Disable timer padding optimization
++ */
++static int __init setup_timer_padding(char *str)
++{
++#ifdef CONFIG_X86_64
++	if (x86_hyper_type != X86_HYPER_VMWARE) {
++		set_timer_padding(false);
++		return 0;
++	}
++	return (kstrtobool(str, &timer_padding_enabled) == 0);
++#else
++	set_timer_padding(false);
++	return 0;
++#endif
++}
++
++__setup("timerpadding=", setup_timer_padding);
++
+ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
+ 			bool ismax)
+ {
+@@ -306,6 +345,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+ 	unsigned long long clc;
+ 	int64_t delta;
+ 	int rc;
++	u64 min_overhead_ns = 0;
+ 
+ 	if (WARN_ON_ONCE(expires < 0))
+ 		return -ETIME;
+@@ -324,9 +364,20 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+ 		return dev->set_next_ktime(expires, dev);
+ 
+ 	delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
++
+ 	if (delta <= 0)
+ 		return force ? clockevents_program_min_delta(dev) : -ETIME;
+ 
++	if (timer_padding_is_enabled()) {
++		min_overhead_ns = tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev);
++		/*
++		 * min_overhead_ns <= 1000 is not reliable
++		 * tsc_delta2ns only returns values greater than 1us reliably
++		 */
++		if (min_overhead_ns > 1000 && delta > min_overhead_ns)
++			delta = delta - min_overhead_ns;
++	}
++
+ 	delta = min(delta, (int64_t) dev->max_delta_ns);
+ 	delta = max(delta, (int64_t) dev->min_delta_ns);
+ 
+@@ -667,6 +718,32 @@ static struct bus_type clockevents_subsys = {
+ static DEFINE_PER_CPU(struct device, tick_percpu_dev);
+ static struct tick_device *tick_get_tick_dev(struct device *dev);
+ 
++static ssize_t timer_padding_ns_show(struct device *dev,
++				     struct device_attribute *attr,
++				     char *buf)
++{
++	struct tick_device *td;
++	ssize_t count = 0;
++	int cpu = 0;
++	u64 min_overhead_ns = 0;
++
++	if (!timer_padding_is_enabled())
++		return 0;
++	raw_spin_lock_irq(&clockevents_lock);
++	td = tick_get_tick_dev(dev);
++	if (td && td->evtdev) {
++		if (cpumask_weight(td->evtdev->cpumask) == 1) {
++			cpu = cpumask_first(td->evtdev->cpumask);
++			if (per_cpu(min_overhead_tsc, cpu) != ULONG_MAX)
++				 min_overhead_ns = tsc_delta2ns(per_cpu(min_overhead_tsc, cpu), td->evtdev);
++			count = snprintf(buf, PAGE_SIZE, "%lld\n", min_overhead_ns);
++		}
++	}
++	raw_spin_unlock_irq(&clockevents_lock);
++	return count;
++}
++static DEVICE_ATTR(timer_padding_ns, 0444, timer_padding_ns_show, NULL);
++
+ static ssize_t current_device_show(struct device *dev,
+ 				   struct device_attribute *attr,
+ 				   char *buf)
+@@ -760,6 +837,12 @@ static int __init tick_init_sysfs(void)
+ 			err = device_create_file(dev, &dev_attr_current_device);
+ 		if (!err)
+ 			err = device_create_file(dev, &dev_attr_unbind_device);
++		if (!err && timer_padding_is_enabled()) {
++			if (x86_hyper_type != X86_HYPER_VMWARE) {
++				set_timer_padding(false);
++			} else
++				err = device_create_file(dev, &dev_attr_timer_padding_ns);
++		}
+ 		if (err)
+ 			return err;
+ 	}
+diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
+index 29860eefd452..49bec11d5b36 100644
+--- a/kernel/time/hrtimer.c
++++ b/kernel/time/hrtimer.c
+@@ -41,6 +41,7 @@
+ #include <linux/timer.h>
+ #include <linux/freezer.h>
+ #include <linux/compat.h>
++#include <linux/delay.h>
+ 
+ #include <linux/uaccess.h>
+ 
+@@ -1775,6 +1776,9 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+ 
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ 
++DEFINE_PER_CPU(u64, min_overhead_tsc) = ULONG_MAX;
++DEFINE_PER_CPU(u64, last_programmed_time_tsc) = 0;
++
+ /*
+  * High resolution timer interrupt
+  * Called with interrupts disabled
+@@ -1785,6 +1789,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+ 	ktime_t expires_next, now, entry_time, delta;
+ 	unsigned long flags;
+ 	int retries = 0;
++	s64 current_overhead = 0;
++	u64 tsc_now = 0;
++	ktime_t early = 0;
++	ktime_t timer_padding_spin = 0;
+ 
+ 	BUG_ON(!cpu_base->hres_active);
+ 	cpu_base->nr_events++;
+@@ -1794,6 +1802,36 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+ 	entry_time = now = hrtimer_update_base(cpu_base);
+ retry:
+ 	cpu_base->in_hrtirq = 1;
++
++	if (timer_padding_is_enabled()) {
++		tsc_now = rdtsc();
++		current_overhead = tsc_now - this_cpu_read(last_programmed_time_tsc);
++
++		if (current_overhead > 0 && cpu_base->next_timer
++		    && ktime_before(now, cpu_base->expires_next)) {
++			early = ktime_sub(cpu_base->expires_next, now);
++			while (early > 0) {
++				/*
++				 * We pad/prepone the timer by the value of min_overhead_tsc.
++				 * That means we cannot arrive here earlier than the expected timer fire by
++				 * more than min_overhead_tsc, even with no overhead
++				 */
++				if (ktime_to_ns(early) > tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev))
++					break;
++				timer_padding_spin = ktime_add(timer_padding_spin, early);
++				ndelay(early);
++				now = hrtimer_update_base(cpu_base);
++				if (!ktime_before(now, cpu_base->expires_next)) {
++					early = 0;
++					break;
++				} else
++					early = ktime_sub(cpu_base->expires_next, now);
++			}
++		}
++		if (current_overhead > 0 && current_overhead < this_cpu_read(min_overhead_tsc)) {
++			this_cpu_write(min_overhead_tsc, current_overhead);
++		}
++	}
+ 	/*
+ 	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+ 	 * held to prevent that a timer is enqueued in our queue via
+@@ -1825,6 +1863,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+ 	if (expires_next == KTIME_MAX ||
+ 	    !tick_program_event(expires_next, 0)) {
+ 		cpu_base->hang_detected = 0;
++		if (timer_padding_is_enabled() && timer_padding_spin > 5000)
++			pr_info_ratelimited("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
+ 		return;
+ 	}
+ 
+@@ -1868,6 +1908,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+ 	else
+ 		expires_next = ktime_add(now, delta);
+ 	tick_program_event(expires_next, 1);
++	if (timer_padding_is_enabled())
++		pr_warn_once("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
+ 	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
+ }
+ 
+-- 
+2.23.3
+