Code to disable Guest timer advancement broke on some systems
due to compiler optimizations. Fixed that with memory barriers.

Guest timer advancement will not work on CPUs that do not
support the TSC_DEADLINE_TIMER feature with the current design,
so we disable the feature in that case.
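
For reference, the fix replaces direct reads and writes of the enable
flag with barrier-paired accessors; a minimal sketch of the helpers
this change adds in kernel/time/clockevents.c (see the patch below):

    static bool timer_padding_enabled __read_mostly = true;

    /* Read side: pair an smp_rmb() with the writer's smp_wmb() so the
     * compiler/CPU cannot keep serving a stale cached copy of the flag. */
    inline bool timer_padding_is_enabled(void)
    {
            smp_rmb();
            return timer_padding_enabled;
    }

    /* Write side: publish the new value before readers observe it. */
    inline void set_timer_padding(bool enabled)
    {
            timer_padding_enabled = enabled;
            smp_wmb();
    }

The optimization also remains tunable at boot through the
timerpadding= parameter registered via __setup() in the patch,
e.g. booting with timerpadding=0 to turn it off.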
We are also reducing the severity and frequency of a log message
in hrtimer.c. We were printing it every time we spun for > 1us.
1us spent spinning does not cause any serious issues, but the
message was flooding the dmesg output.
We are increasing the threshold for this message to 5us.
Spinning for 5us is unlikely, but can cause latency issues in RT
systems if a CPU is unavailable for that much time.
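
Concretely, in hrtimer_interrupt() the log is now both rate-limited
and gated on the new 5us (5000 ns) threshold; a sketch matching the
hunk below:

    /* timer_padding_spin accumulates the time spent spinning, in ns */
    if (timer_padding_is_enabled() && timer_padding_spin > 5000)
            pr_info_ratelimited("hrtimer: timer padding spent %llu ns spinning\n",
                                ktime_to_ns(timer_padding_spin));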
Change-Id: Ie23dbab5ec74bc3d922215711beda7259575b098
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/20535
Reviewed-by: Srivatsa S. Bhat <srivatsab@vmware.com>
Tested-by: gerrit-photon <photon-checkins@vmware.com>
@@ -16,7 +16,7 @@
 Summary: Kernel
 Name: linux-rt
 Version: 6.1.10
-Release: 8%{?kat_build:.kat}%{?dist}
+Release: 9%{?kat_build:.kat}%{?dist}
 License: GPLv2
 URL: http://www.kernel.org
 Group: System Environment/Kernel
@@ -177,7 +177,7 @@ Patch701: 6.0-sched-rt-RT_RUNTIME_GREED-sched-feature.patch
 Patch714: 0001-Allow-tick-sched-timer-to-be-turned-off-in-idle-poll.patch
 
 #Patch to add timer padding on guest
-Patch716: 6.0-timer-padding-on-guest.patch
+Patch716: Guest-timer-Advancement-Feature.patch
 
 # Fix for a latency issue related to ktimer thread wakeup:
 Patch717: softirq-wake-up-ktimer-thread-in-softirq-context.patch
@@ -503,6 +503,8 @@ ln -sf linux-%{uname_r}.cfg /boot/photon.cfg
 %{_usrsrc}/linux-headers-%{uname_r}
 
 %changelog
+* Fri Apr 14 2023 Him Kalyan Bordoloi <bordoloih@vmware.com> 6.1.10-9
+- Update Guest timer advancement feature
 * Fri Mar 31 2023 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 6.1.10-8
 - Expose Photon kernel macros to simplify building out-of-tree drivers.
 * Thu Mar 30 2023 Brennan Lamoreaux <blamoreaux@vmware.com> 6.1.10-7
deleted file mode 100644
@@ -1,345 +0,0 @@
-From 22eb1efba48df1568adc5f765557b6e7269d247e Mon Sep 17 00:00:00 2001
-From: Him Kalyan Bordoloi <bordoloih@vmware.com>
-Date: Mon, 6 Jun 2022 20:32:24 +0000
-Subject: [PATCH] timer padding on guest
-
-This is an optimization to hide the virtualization cost of timer
-interrupts.
-
-The idea is that if we have a predictable overhead, we can prepone the
-timer by the said overhead. As a result we would receive the interrupt
-in the guest at the same time as baremetal, giving the impression that
-there is no virtualization overhead.
-
-Next we have to determine what value to prepone the timer by. One of
-the fundamental guarantees of timer interrupts is that a timer's
-callback will never be invoked before its configured time. If the
-timer is preponed, it needs to be ensured that this guarantee is
-satisfied. In order to do that, the interrupt handler needs to spin
-in the guest, in case the timer is received earlier than intended.
-This is not a desirable scenario, especially in a real-time system, as
-this will be hogging the CPU from the workload, with interrupts
-disabled.
-
-When we traced the observed overhead from the guest for timer
-interrupts, we found that in about 99.9% of the instances the overhead
-was within a range of about 400ns, from approx. 1200ns to 1600ns.
-
-So we decided to use the minimum observed overhead since boot time to
-prepone the timer. The miniumum observed overhead is continuously
-monitored during runtime. With this option, we are minimizing the
-need to spin while also not giving up too much of the possible latency
-gains.
-
-Signed-off-by: Him Kalyan Bordoloi <bordoloih@vmware.com>
-Signed-off-by: Keerthana K <keerthanak@vmware.com>
-Signed-off-by: Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu>
----
- arch/x86/kernel/apic/apic.c | 22 +++++++++++++-
- include/linux/clockchips.h  | 22 ++++++++++++++
- include/linux/hrtimer.h     |  3 ++
- kernel/time/clockevents.c   | 59 +++++++++++++++++++++++++++++++++++++
- kernel/time/hrtimer.c       | 41 ++++++++++++++++++++++++++
- 5 files changed, 146 insertions(+), 1 deletion(-)
-
-diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
-index c6876d3ea4b1..d19f219c304c 100644
---- a/arch/x86/kernel/apic/apic.c
-+++ b/arch/x86/kernel/apic/apic.c
-@@ -35,6 +35,7 @@
- #include <linux/dmi.h>
- #include <linux/smp.h>
- #include <linux/mm.h>
-+#include <linux/hrtimer.h>
- 
- #include <asm/trace/irq_vectors.h>
- #include <asm/irq_remapping.h>
-@@ -470,6 +471,22 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
- }
- EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
- 
-+/*
-+ * Function to convert time delta from tsc to ns. It will call clockevent_delta2ns,
-+ * which takes unsigned long value as input. Since tsc is a u64 value, in a 32 bit system
-+ * this can lead to data loss. So this function is restricted to x86_64 systems only.
-+ */
-+unsigned long long tsc_delta2ns(unsigned long delta,
-+                                struct clock_event_device *evt)
-+{
-+#ifdef CONFIG_X86_64
-+        return clockevent_delta2ns(delta / TSC_DIVISOR, evt);
-+#else
-+        return 0;
-+#endif
-+}
-+EXPORT_SYMBOL_GPL(tsc_delta2ns);
-+
- /*
-  * Program the next event, relative to now
-  */
-@@ -484,12 +501,15 @@ static int lapic_next_deadline(unsigned long delta,
-                                struct clock_event_device *evt)
- {
-         u64 tsc;
-+        u64 deadline;
- 
-         /* This MSR is special and need a special fence: */
-         weak_wrmsr_fence();
- 
-         tsc = rdtsc();
--        wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
-+        deadline = tsc + (((u64) delta) * TSC_DIVISOR);
-+        wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
-+        this_cpu_write(last_programmed_time_tsc, deadline);
-         return 0;
- }
- 
-diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
-index 8ae9a95ebf5b..d722e0f1d09c 100644
---- a/include/linux/clockchips.h
-+++ b/include/linux/clockchips.h
-@@ -15,10 +15,31 @@
- # include <linux/cpumask.h>
- # include <linux/ktime.h>
- # include <linux/notifier.h>
-+#include <asm/hypervisor.h>
- 
- struct clock_event_device;
- struct module;
- 
-+/*
-+ * Timer padding enabled ?
-+ */
-+static bool timer_padding_enabled __read_mostly = true;
-+
-+/*
-+ * timer_padding_is_enabled - query, if the timer padding optimization is enabled
-+ */
-+static inline int timer_padding_is_enabled(void)
-+{
-+#ifdef CONFIG_X86_64
-+        if (timer_padding_enabled != false && x86_hyper_type != X86_HYPER_VMWARE) {
-+                timer_padding_enabled = false;
-+        }
-+        return timer_padding_enabled;
-+#else
-+        return 0;
-+#endif
-+}
-+
- /*
-  * Possible states of a clock event device.
-  *
-@@ -180,6 +201,7 @@ div_sc(unsigned long ticks, unsigned long nsec, int shift)
- 
- /* Clock event layer functions */
- extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
-+extern unsigned long long tsc_delta2ns(unsigned long delta, struct clock_event_device *evt);
- extern void clockevents_register_device(struct clock_event_device *dev);
- extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
- 
-diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
-index 0ee140176f10..c56d19fc075f 100644
---- a/include/linux/hrtimer.h
-+++ b/include/linux/hrtimer.h
-@@ -316,6 +316,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
- #ifdef CONFIG_HIGH_RES_TIMERS
- struct clock_event_device;
- 
-+DECLARE_PER_CPU(unsigned long long, min_overhead_tsc);
-+DECLARE_PER_CPU(unsigned long long, last_programmed_time_tsc);
-+
- extern void hrtimer_interrupt(struct clock_event_device *dev);
- 
- extern unsigned int hrtimer_resolution;
-diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
-index 5d85014d59b5..8690dd7a5179 100644
---- a/kernel/time/clockevents.c
-+++ b/kernel/time/clockevents.c
-@@ -29,6 +29,25 @@ struct ce_unbind {
-         int res;
- };
- 
-+/*
-+ * Enable / Disable timer padding optimization
-+ */
-+static int __init setup_timer_padding(char *str)
-+{
-+#ifdef CONFIG_X86_64
-+        if (x86_hyper_type != X86_HYPER_VMWARE) {
-+                timer_padding_enabled = false;
-+                return 0;
-+        }
-+        return (kstrtobool(str, &timer_padding_enabled) == 0);
-+#else
-+        timer_padding_enabled = false;
-+        return 0;
-+#endif
-+}
-+
-+__setup("timerpadding=", setup_timer_padding);
-+
- static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
-                         bool ismax)
- {
-@@ -306,6 +325,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
-         unsigned long long clc;
-         int64_t delta;
-         int rc;
-+        unsigned long long min_overhead_ns = 0;
- 
-         if (WARN_ON_ONCE(expires < 0))
-                 return -ETIME;
-@@ -324,9 +344,20 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
-                 return dev->set_next_ktime(expires, dev);
- 
-         delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
-+
-         if (delta <= 0)
-                 return force ? clockevents_program_min_delta(dev) : -ETIME;
- 
-+        if (timer_padding_is_enabled()) {
-+                min_overhead_ns = tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev);
-+                /*
-+                 * min_overhead_ns <= 1000 is not reliable
-+                 * tsc_delta2ns only returns values greater than 1us reliably
-+                 */
-+                if (min_overhead_ns > 1000 && delta > min_overhead_ns)
-+                        delta = delta - min_overhead_ns;
-+        }
-+
-         delta = min(delta, (int64_t) dev->max_delta_ns);
-         delta = max(delta, (int64_t) dev->min_delta_ns);
- 
-@@ -667,6 +698,32 @@ static struct bus_type clockevents_subsys = {
- static DEFINE_PER_CPU(struct device, tick_percpu_dev);
- static struct tick_device *tick_get_tick_dev(struct device *dev);
- 
-+static ssize_t sysfs_show_timer_padding_ns(struct device *dev,
-+                                           struct device_attribute *attr,
-+                                           char *buf)
-+{
-+        struct tick_device *td;
-+        ssize_t count = 0;
-+        int cpu = 0;
-+        unsigned long long min_overhead_ns = 0;
-+
-+        if (!timer_padding_is_enabled())
-+                return 0;
-+        raw_spin_lock_irq(&clockevents_lock);
-+        td = tick_get_tick_dev(dev);
-+        if (td && td->evtdev) {
-+                if (cpumask_weight(td->evtdev->cpumask) == 1) {
-+                        cpu = cpumask_first(td->evtdev->cpumask);
-+                        if (per_cpu(min_overhead_tsc, cpu) != ULONG_MAX)
-+                                min_overhead_ns = tsc_delta2ns(per_cpu(min_overhead_tsc, cpu), td->evtdev);
-+                        count = snprintf(buf, PAGE_SIZE, "%lld\n", min_overhead_ns);
-+                }
-+        }
-+        raw_spin_unlock_irq(&clockevents_lock);
-+        return count;
-+}
-+static DEVICE_ATTR(timer_padding_ns, 0444, sysfs_show_timer_padding_ns, NULL);
-+
- static ssize_t current_device_show(struct device *dev,
-                                    struct device_attribute *attr,
-                                    char *buf)
-@@ -760,6 +817,8 @@ static int __init tick_init_sysfs(void)
-                 err = device_create_file(dev, &dev_attr_current_device);
-                 if (!err)
-                         err = device_create_file(dev, &dev_attr_unbind_device);
-+                if (!err && timer_padding_is_enabled())
-+                        err = device_create_file(dev, &dev_attr_timer_padding_ns);
-                 if (err)
-                         return err;
-         }
-diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
-index 24b353cf31d3..d4c10b94f374 100644
---- a/kernel/time/hrtimer.c
-+++ b/kernel/time/hrtimer.c
-@@ -41,6 +41,7 @@
- #include <linux/timer.h>
- #include <linux/freezer.h>
- #include <linux/compat.h>
-+#include <linux/delay.h>
- 
- #include <linux/uaccess.h>
- 
-@@ -1775,6 +1776,9 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
- 
- #ifdef CONFIG_HIGH_RES_TIMERS
- 
-+DEFINE_PER_CPU(unsigned long long, min_overhead_tsc) = ULONG_MAX;
-+DEFINE_PER_CPU(unsigned long long, last_programmed_time_tsc) = 0;
-+
- /*
-  * High resolution timer interrupt
-  * Called with interrupts disabled
-@@ -1785,6 +1789,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
-         ktime_t expires_next, now, entry_time, delta;
-         unsigned long flags;
-         int retries = 0;
-+        long long current_overhead = 0;
-+        unsigned long long tsc_now = 0;
-+        ktime_t early = 0;
-+        ktime_t timer_padding_spin = 0;
- 
-         BUG_ON(!cpu_base->hres_active);
-         cpu_base->nr_events++;
-@@ -1794,6 +1802,35 @@ void hrtimer_interrupt(struct clock_event_device *dev)
-         entry_time = now = hrtimer_update_base(cpu_base);
- retry:
-         cpu_base->in_hrtirq = 1;
-+
-+        tsc_now = rdtsc();
-+        current_overhead = tsc_now - this_cpu_read(last_programmed_time_tsc);
-+
-+        if (timer_padding_is_enabled() && current_overhead > 0 && cpu_base->next_timer
-+                        && ktime_before(now, cpu_base->expires_next)) {
-+                early = ktime_sub(cpu_base->expires_next, now);
-+                while (early > 0) {
-+                        /*
-+                         * We pad/prepone the timer by the value of min_overhead_tsc.
-+                         * That means we cannot arrive here earlier than the expected timer fire by
-+                         * more than min_overhead_tsc, even with no overhead
-+                         */
-+                        if (ktime_to_ns(early) > tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev))
-+                                break;
-+                        timer_padding_spin = ktime_add(timer_padding_spin, early);
-+                        ndelay(early);
-+                        now = hrtimer_update_base(cpu_base);
-+                        if (!ktime_before(now, cpu_base->expires_next)) {
-+                                early = 0;
-+                                break;
-+                        } else
-+                                early = ktime_sub(cpu_base->expires_next, now);
-+                }
-+        }
-+        if (current_overhead > 0 && current_overhead < this_cpu_read(min_overhead_tsc)) {
-+                this_cpu_write(min_overhead_tsc, current_overhead);
-+        }
-+
-         /*
-          * We set expires_next to KTIME_MAX here with cpu_base->lock
-          * held to prevent that a timer is enqueued in our queue via
-@@ -1825,6 +1862,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
-         if (expires_next == KTIME_MAX ||
-             !tick_program_event(expires_next, 0)) {
-                 cpu_base->hang_detected = 0;
-+                if (timer_padding_spin > 1000 && timer_padding_is_enabled())
-+                        pr_warn("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
-                 return;
-         }
- 
-@@ -1868,6 +1907,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
-         else
-                 expires_next = ktime_add(now, delta);
-         tick_program_event(expires_next, 1);
-+        if (timer_padding_is_enabled())
-+                pr_warn("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
-         pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
- }
- 
----
-2.25.1
-
new file mode 100644
@@ -0,0 +1,374 @@
+From 4e6182c55e7195a1efc4a0106e7ed2fd53dbc7ba Mon Sep 17 00:00:00 2001
+From: Him Kalyan Bordoloi <bordoloih@vmware.com>
+Date: Fri, 14 Apr 2023 05:48:32 +0000
+Subject: [PATCH] Guest timer Advancement Feature
+
+V1:
+
+This is an optimization to hide the virtualization cost of timer interrupts.
+
+The idea is that if we have a predictable overhead, we can prepone the timer by
+the said overhead. As a result we would receive the interrupt in the guest at the
+same time as baremetal, giving the impression that there is no virtualization
+overhead.
+
+Next we have to determine what value to prepone the timer by.
+One of the fundamental guarantees of timer interrupts is that a timer's callback
+will never be invoked before its configured time.
+If the timer is preponed, it needs to be ensured that this guarantee is satisfied.
+In order to do that, the interrupt handler needs to spin in the guest,
+in case the timer is received earlier than intended. This is not a desirable scenario,
+especially in a real-time system, as this will be hogging the CPU from the workload,
+with interrupts disabled.
+
+When we traced the observed overhead from the guest for timer interrupts,
+we found that in about 99.9% of the instances the overhead was within a range
+of about 400ns, from approx. 1200ns to 1600ns.
+
+So we decided to use the minimum observed overhead since boot time to prepone the timer.
+The minimum observed overhead is continuously monitored during runtime.
+With this option, we are minimizing the need to spin while also not giving up too much of the
+possible latency gains.
+
+V2:
+
+Code to disable Guest timer advancement broke on some systems
+due to compiler optimizations. Fixed that with memory barriers.
+
+Guest timer advancement will not work on CPUs that do not
+support the TSC_DEADLINE_TIMER feature with the current design,
+so we disable the feature in that case.
+
+We are also reducing the severity and frequency of a log message
+in hrtimer.c. We were printing it every time we spun for > 1us.
+1us spent spinning does not cause any serious issues, but the
+message was flooding the dmesg output.
+We are increasing the threshold for this message to 5us.
+Spinning for 5us is unlikely, but can cause latency issues in RT
+systems if a CPU is unavailable for that much time.
+---
+ arch/x86/kernel/apic/apic.c | 27 +++++++++++-
+ include/linux/clockchips.h  |  3 ++
+ include/linux/hrtimer.h     |  3 ++
+ kernel/time/clockevents.c   | 83 +++++++++++++++++++++++++++++++++++++
+ kernel/time/hrtimer.c       | 42 +++++++++++++++++++
+ 5 files changed, 156 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
+index 20d9a604da7c..b4e11cf87992 100644
+--- a/arch/x86/kernel/apic/apic.c
++++ b/arch/x86/kernel/apic/apic.c
+@@ -35,6 +35,7 @@
+ #include <linux/dmi.h>
+ #include <linux/smp.h>
+ #include <linux/mm.h>
++#include <linux/hrtimer.h>
+ 
+ #include <asm/trace/irq_vectors.h>
+ #include <asm/irq_remapping.h>
+@@ -470,6 +471,20 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
+ }
+ EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
+ 
++/*
++ * Function to convert time delta from tsc to ns. It will call clockevent_delta2ns,
++ * which takes unsigned long value as input. Since tsc is a u64 value, in a 32 bit system
++ * this can lead to data loss. So this function is restricted to x86_64 systems only.
++ */
++u64 tsc_delta2ns(u64 delta, struct clock_event_device *evt)
++{
++#ifdef CONFIG_X86_64
++        return clockevent_delta2ns((unsigned long)delta / TSC_DIVISOR, evt);
++#else
++        return 0;
++#endif
++}
++
+ /*
+  * Program the next event, relative to now
+  */
+@@ -484,12 +499,18 @@ static int lapic_next_deadline(unsigned long delta,
+                                struct clock_event_device *evt)
+ {
+         u64 tsc;
++        u64 deadline;
+ 
+         /* This MSR is special and need a special fence: */
+         weak_wrmsr_fence();
+ 
+         tsc = rdtsc();
+-        wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
++        if (timer_padding_is_enabled()) {
++                deadline = tsc + (((u64) delta) * TSC_DIVISOR);
++                wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
++                this_cpu_write(last_programmed_time_tsc, deadline);
++        } else
++                wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+         return 0;
+ }
+ 
+@@ -641,8 +662,10 @@ static void setup_APIC_timer(void)
+                 clockevents_config_and_register(levt,
+                                                 tsc_khz * (1000 / TSC_DIVISOR),
+                                                 0xF, ~0UL);
+-        } else
++        } else {
+                 clockevents_register_device(levt);
++                set_timer_padding(false);
++        }
+ }
+ 
+ /*
+diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
+index 8ae9a95ebf5b..2bffa549ed55 100644
+--- a/include/linux/clockchips.h
++++ b/include/linux/clockchips.h
+@@ -179,7 +179,10 @@ div_sc(unsigned long ticks, unsigned long nsec, int shift)
+ }
+ 
+ /* Clock event layer functions */
++extern inline bool timer_padding_is_enabled(void);
++extern inline void set_timer_padding(bool enabled);
+ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
++extern u64 tsc_delta2ns(u64 delta, struct clock_event_device *evt);
+ extern void clockevents_register_device(struct clock_event_device *dev);
+ extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
+ 
+diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
+index 0ee140176f10..7d5eff8fd605 100644
+--- a/include/linux/hrtimer.h
++++ b/include/linux/hrtimer.h
+@@ -316,6 +316,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ struct clock_event_device;
+ 
++DECLARE_PER_CPU(u64, min_overhead_tsc);
++DECLARE_PER_CPU(u64, last_programmed_time_tsc);
++
+ extern void hrtimer_interrupt(struct clock_event_device *dev);
+ 
+ extern unsigned int hrtimer_resolution;
+diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
+index 5d85014d59b5..c58e3f064517 100644
+--- a/kernel/time/clockevents.c
++++ b/kernel/time/clockevents.c
+@@ -13,6 +13,7 @@
+ #include <linux/module.h>
+ #include <linux/smp.h>
+ #include <linux/device.h>
++#include <asm/hypervisor.h>
+ 
+ #include "tick-internal.h"
+ 
+@@ -29,6 +30,44 @@ struct ce_unbind {
+         int res;
+ };
+ 
++/*
++ * Timer padding enabled ?
++ */
++static bool timer_padding_enabled __read_mostly = true;
++/*
++ * timer_padding_is_enabled - query, if the timer padding optimization is enabled
++ */
++inline bool timer_padding_is_enabled(void)
++{
++        smp_rmb();
++        return timer_padding_enabled;
++}
++
++inline void set_timer_padding(bool enabled)
++{
++        timer_padding_enabled = enabled;
++        smp_wmb();
++}
++
++/*
++ * Enable / Disable timer padding optimization
++ */
++static int __init setup_timer_padding(char *str)
++{
++#ifdef CONFIG_X86_64
++        if (x86_hyper_type != X86_HYPER_VMWARE) {
++                set_timer_padding(false);
++                return 0;
++        }
++        return (kstrtobool(str, &timer_padding_enabled) == 0);
++#else
++        set_timer_padding(false);
++        return 0;
++#endif
++}
++
++__setup("timerpadding=", setup_timer_padding);
++
+ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
+                         bool ismax)
+ {
+@@ -306,6 +345,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+         unsigned long long clc;
+         int64_t delta;
+         int rc;
++        u64 min_overhead_ns = 0;
+ 
+         if (WARN_ON_ONCE(expires < 0))
+                 return -ETIME;
+@@ -324,9 +364,20 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+                 return dev->set_next_ktime(expires, dev);
+ 
+         delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
++
+         if (delta <= 0)
+                 return force ? clockevents_program_min_delta(dev) : -ETIME;
+ 
++        if (timer_padding_is_enabled()) {
++                min_overhead_ns = tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev);
++                /*
++                 * min_overhead_ns <= 1000 is not reliable
++                 * tsc_delta2ns only returns values greater than 1us reliably
++                 */
++                if (min_overhead_ns > 1000 && delta > min_overhead_ns)
++                        delta = delta - min_overhead_ns;
++        }
++
+         delta = min(delta, (int64_t) dev->max_delta_ns);
+         delta = max(delta, (int64_t) dev->min_delta_ns);
+ 
+@@ -667,6 +718,32 @@ static struct bus_type clockevents_subsys = {
+ static DEFINE_PER_CPU(struct device, tick_percpu_dev);
+ static struct tick_device *tick_get_tick_dev(struct device *dev);
+ 
++static ssize_t timer_padding_ns_show(struct device *dev,
++                                     struct device_attribute *attr,
++                                     char *buf)
++{
++        struct tick_device *td;
++        ssize_t count = 0;
++        int cpu = 0;
++        u64 min_overhead_ns = 0;
++
++        if (!timer_padding_is_enabled())
++                return 0;
++        raw_spin_lock_irq(&clockevents_lock);
++        td = tick_get_tick_dev(dev);
++        if (td && td->evtdev) {
++                if (cpumask_weight(td->evtdev->cpumask) == 1) {
++                        cpu = cpumask_first(td->evtdev->cpumask);
++                        if (per_cpu(min_overhead_tsc, cpu) != ULONG_MAX)
++                                min_overhead_ns = tsc_delta2ns(per_cpu(min_overhead_tsc, cpu), td->evtdev);
++                        count = snprintf(buf, PAGE_SIZE, "%lld\n", min_overhead_ns);
++                }
++        }
++        raw_spin_unlock_irq(&clockevents_lock);
++        return count;
++}
++static DEVICE_ATTR(timer_padding_ns, 0444, timer_padding_ns_show, NULL);
++
+ static ssize_t current_device_show(struct device *dev,
+                                    struct device_attribute *attr,
+                                    char *buf)
+@@ -760,6 +837,12 @@ static int __init tick_init_sysfs(void)
+                 err = device_create_file(dev, &dev_attr_current_device);
+                 if (!err)
+                         err = device_create_file(dev, &dev_attr_unbind_device);
++                if (!err && timer_padding_is_enabled()) {
++                        if (x86_hyper_type != X86_HYPER_VMWARE) {
++                                set_timer_padding(false);
++                        } else
++                                err = device_create_file(dev, &dev_attr_timer_padding_ns);
++                }
+                 if (err)
+                         return err;
+         }
+diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
+index 29860eefd452..49bec11d5b36 100644
+--- a/kernel/time/hrtimer.c
++++ b/kernel/time/hrtimer.c
+@@ -41,6 +41,7 @@
+ #include <linux/timer.h>
+ #include <linux/freezer.h>
+ #include <linux/compat.h>
++#include <linux/delay.h>
+ 
+ #include <linux/uaccess.h>
+ 
+@@ -1775,6 +1776,9 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+ 
+ #ifdef CONFIG_HIGH_RES_TIMERS
+ 
++DEFINE_PER_CPU(u64, min_overhead_tsc) = ULONG_MAX;
++DEFINE_PER_CPU(u64, last_programmed_time_tsc) = 0;
++
+ /*
+  * High resolution timer interrupt
+  * Called with interrupts disabled
+@@ -1785,6 +1789,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+         ktime_t expires_next, now, entry_time, delta;
+         unsigned long flags;
+         int retries = 0;
++        s64 current_overhead = 0;
++        u64 tsc_now = 0;
++        ktime_t early = 0;
++        ktime_t timer_padding_spin = 0;
+ 
+         BUG_ON(!cpu_base->hres_active);
+         cpu_base->nr_events++;
+@@ -1794,6 +1802,36 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+         entry_time = now = hrtimer_update_base(cpu_base);
+ retry:
+         cpu_base->in_hrtirq = 1;
++
++        if (timer_padding_is_enabled()) {
++                tsc_now = rdtsc();
++                current_overhead = tsc_now - this_cpu_read(last_programmed_time_tsc);
++
++                if (current_overhead > 0 && cpu_base->next_timer
++                                && ktime_before(now, cpu_base->expires_next)) {
++                        early = ktime_sub(cpu_base->expires_next, now);
++                        while (early > 0) {
++                                /*
++                                 * We pad/prepone the timer by the value of min_overhead_tsc.
++                                 * That means we cannot arrive here earlier than the expected timer fire by
++                                 * more than min_overhead_tsc, even with no overhead
++                                 */
++                                if (ktime_to_ns(early) > tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev))
++                                        break;
++                                timer_padding_spin = ktime_add(timer_padding_spin, early);
++                                ndelay(early);
++                                now = hrtimer_update_base(cpu_base);
++                                if (!ktime_before(now, cpu_base->expires_next)) {
++                                        early = 0;
++                                        break;
++                                } else
++                                        early = ktime_sub(cpu_base->expires_next, now);
++                        }
++                }
++                if (current_overhead > 0 && current_overhead < this_cpu_read(min_overhead_tsc)) {
++                        this_cpu_write(min_overhead_tsc, current_overhead);
++                }
++        }
+         /*
+          * We set expires_next to KTIME_MAX here with cpu_base->lock
+          * held to prevent that a timer is enqueued in our queue via
+@@ -1825,6 +1863,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+         if (expires_next == KTIME_MAX ||
+             !tick_program_event(expires_next, 0)) {
+                 cpu_base->hang_detected = 0;
++                if (timer_padding_is_enabled() && timer_padding_spin > 5000)
++                        pr_info_ratelimited("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
+                 return;
+         }
+ 
+@@ -1868,6 +1908,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
+         else
+                 expires_next = ktime_add(now, delta);
+         tick_program_event(expires_next, 1);
++        if (timer_padding_is_enabled())
++                pr_warn_once("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
+         pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
+ }
+ 
+-- 
+2.23.3
+