From 4e6182c55e7195a1efc4a0106e7ed2fd53dbc7ba Mon Sep 17 00:00:00 2001
From: Him Kalyan Bordoloi
Date: Fri, 14 Apr 2023 05:48:32 +0000
Subject: [PATCH] Guest timer advancement feature

V1:
This is an optimization to hide the virtualization cost of timer
interrupts. The idea is that if the overhead is predictable, we can
program the timer earlier by that amount. As a result, the guest
receives the interrupt at the same time it would on bare metal, giving
the impression that there is no virtualization overhead.

Next, we have to determine how far to advance the timer. One of the
fundamental guarantees of timer interrupts is that a timer's callback
is never invoked before its configured expiry. If the timer is
advanced, this guarantee must still hold, so the interrupt handler has
to spin in the guest whenever the interrupt arrives earlier than
intended. This is undesirable, especially in a real-time system,
because it takes the CPU away from the workload with interrupts
disabled.

When we traced the timer interrupt overhead observed from the guest,
about 99.9% of the samples fell within a range of roughly 400ns, from
approximately 1200ns to 1600ns. We therefore advance the timer by the
minimum overhead observed since boot, and keep monitoring that minimum
at runtime. This minimizes the need to spin while giving up little of
the possible latency gain.

V2:
The code to disable guest timer advancement broke on some systems due
to compiler optimizations. This is fixed with memory barriers.

With the current design, guest timer advancement does not work on CPUs
that lack the TSC_DEADLINE_TIMER feature, so it is now disabled on such
CPUs.

We also reduce the severity and frequency of a log message in
hrtimer.c. Previously a message was printed every time we spun for more
than 1us. Spinning for 1us does not cause serious problems, but the
message floods dmesg, so the threshold is raised to 5us. Spinning for
5us is unlikely, but it can cause latency issues on RT systems if a CPU
is unavailable for that long.
---
 arch/x86/kernel/apic/apic.c | 27 ++++++++++++++++++++++++++-
 include/linux/clockchips.h |  3 ++
 include/linux/hrtimer.h    |  3 ++
 kernel/time/clockevents.c  | 83 +++++++++++++++++++++++++++++++++++++
 kernel/time/hrtimer.c      | 42 +++++++++++++++++++
 5 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 20d9a604da7c..b4e11cf87992 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -470,6 +471,20 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
 }
 EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
 
+/*
+ * Convert a time delta from TSC ticks to ns. This calls clockevent_delta2ns(),
+ * which takes an unsigned long as input. Since the TSC delta is a u64, this can
+ * lead to data loss on a 32-bit system, so the function is restricted to x86_64.
+ */
+u64 tsc_delta2ns(u64 delta, struct clock_event_device *evt)
+{
+#ifdef CONFIG_X86_64
+        return clockevent_delta2ns((unsigned long)delta / TSC_DIVISOR, evt);
+#else
+        return 0;
+#endif
+}
+
 /*
  * Program the next event, relative to now
  */
@@ -484,12 +499,18 @@ static int lapic_next_deadline(unsigned long delta,
                               struct clock_event_device *evt)
 {
         u64 tsc;
+        u64 deadline;
 
         /* This MSR is special and need a special fence: */
         weak_wrmsr_fence();
         tsc = rdtsc();
-        wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+        if (timer_padding_is_enabled()) {
+                deadline = tsc + (((u64) delta) * TSC_DIVISOR);
+                wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
+                this_cpu_write(last_programmed_time_tsc, deadline);
+        } else
+                wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
         return 0;
 }
 
@@ -641,8 +662,10 @@ static void setup_APIC_timer(void)
                 clockevents_config_and_register(levt,
                                         tsc_khz * (1000 / TSC_DIVISOR),
                                         0xF, ~0UL);
-        } else
+        } else {
                 clockevents_register_device(levt);
+                set_timer_padding(false);
+        }
 }
 
 /*
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 8ae9a95ebf5b..2bffa549ed55 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -179,7 +179,10 @@ div_sc(unsigned long ticks, unsigned long nsec, int shift)
 }
 
 /* Clock event layer functions */
+extern inline bool timer_padding_is_enabled(void);
+extern inline void set_timer_padding(bool enabled);
 extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt);
+extern u64 tsc_delta2ns(u64 delta, struct clock_event_device *evt);
 extern void clockevents_register_device(struct clock_event_device *dev);
 extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..7d5eff8fd605 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -316,6 +316,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 #ifdef CONFIG_HIGH_RES_TIMERS
 struct clock_event_device;
 
+DECLARE_PER_CPU(u64, min_overhead_tsc);
+DECLARE_PER_CPU(u64, last_programmed_time_tsc);
+
 extern void hrtimer_interrupt(struct clock_event_device *dev);
 
 extern unsigned int hrtimer_resolution;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5d85014d59b5..c58e3f064517 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 
 #include "tick-internal.h"
 
@@ -29,6 +30,44 @@ struct ce_unbind {
         int res;
 };
 
+/*
+ * Timer padding enabled?
+ */
+static bool timer_padding_enabled __read_mostly = true;
+/*
+ * timer_padding_is_enabled - query, if the timer padding optimization is enabled
+ */
+inline bool timer_padding_is_enabled(void)
+{
+        smp_rmb();
+        return timer_padding_enabled;
+}
+
+inline void set_timer_padding(bool enabled)
+{
+        timer_padding_enabled = enabled;
+        smp_wmb();
+}
+
+/*
+ * Enable / Disable timer padding optimization
+ */
+static int __init setup_timer_padding(char *str)
+{
+#ifdef CONFIG_X86_64
+        if (x86_hyper_type != X86_HYPER_VMWARE) {
+                set_timer_padding(false);
+                return 0;
+        }
+        return (kstrtobool(str, &timer_padding_enabled) == 0);
+#else
+        set_timer_padding(false);
+        return 0;
+#endif
+}
+
+__setup("timerpadding=", setup_timer_padding);
+
 static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
                         bool ismax)
 {
@@ -306,6 +345,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
         unsigned long long clc;
         int64_t delta;
         int rc;
+        u64 min_overhead_ns = 0;
 
         if (WARN_ON_ONCE(expires < 0))
                 return -ETIME;
@@ -324,9 +364,20 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
                 return dev->set_next_ktime(expires, dev);
 
         delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+
         if (delta <= 0)
                 return force ? clockevents_program_min_delta(dev) : -ETIME;
 
+        if (timer_padding_is_enabled()) {
+                min_overhead_ns = tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev);
+                /*
+                 * min_overhead_ns <= 1000 is not reliable:
+                 * tsc_delta2ns only returns values greater than 1us reliably
+                 */
+                if (min_overhead_ns > 1000 && delta > min_overhead_ns)
+                        delta = delta - min_overhead_ns;
+        }
+
         delta = min(delta, (int64_t) dev->max_delta_ns);
         delta = max(delta, (int64_t) dev->min_delta_ns);
 
@@ -667,6 +718,32 @@ static struct bus_type clockevents_subsys = {
 static DEFINE_PER_CPU(struct device, tick_percpu_dev);
 static struct tick_device *tick_get_tick_dev(struct device *dev);
 
+static ssize_t timer_padding_ns_show(struct device *dev,
+                                     struct device_attribute *attr,
+                                     char *buf)
+{
+        struct tick_device *td;
+        ssize_t count = 0;
+        int cpu = 0;
+        u64 min_overhead_ns = 0;
+
+        if (!timer_padding_is_enabled())
+                return 0;
+        raw_spin_lock_irq(&clockevents_lock);
+        td = tick_get_tick_dev(dev);
+        if (td && td->evtdev) {
+                if (cpumask_weight(td->evtdev->cpumask) == 1) {
+                        cpu = cpumask_first(td->evtdev->cpumask);
+                        if (per_cpu(min_overhead_tsc, cpu) != ULONG_MAX)
+                                min_overhead_ns = tsc_delta2ns(per_cpu(min_overhead_tsc, cpu), td->evtdev);
+                        count = snprintf(buf, PAGE_SIZE, "%lld\n", min_overhead_ns);
+                }
+        }
+        raw_spin_unlock_irq(&clockevents_lock);
+        return count;
+}
+static DEVICE_ATTR(timer_padding_ns, 0444, timer_padding_ns_show, NULL);
+
 static ssize_t current_device_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *buf)
@@ -760,6 +837,12 @@ static int __init tick_init_sysfs(void)
                         err = device_create_file(dev, &dev_attr_current_device);
                 if (!err)
                         err = device_create_file(dev, &dev_attr_unbind_device);
+                if (!err && timer_padding_is_enabled()) {
+                        if (x86_hyper_type != X86_HYPER_VMWARE) {
+                                set_timer_padding(false);
+                        } else
+                                err = device_create_file(dev, &dev_attr_timer_padding_ns);
+                }
                 if (err)
                         return err;
         }
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 29860eefd452..49bec11d5b36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 
@@ -1775,6 +1776,9 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
 
 #ifdef CONFIG_HIGH_RES_TIMERS
 
+DEFINE_PER_CPU(u64, min_overhead_tsc) = ULONG_MAX;
+DEFINE_PER_CPU(u64, last_programmed_time_tsc) = 0;
+
 /*
  * High resolution timer interrupt
  * Called with interrupts disabled
  */
@@ -1785,6 +1789,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
         ktime_t expires_next, now, entry_time, delta;
         unsigned long flags;
         int retries = 0;
+        s64 current_overhead = 0;
+        u64 tsc_now = 0;
+        ktime_t early = 0;
+        ktime_t timer_padding_spin = 0;
 
         BUG_ON(!cpu_base->hres_active);
         cpu_base->nr_events++;
@@ -1794,6 +1802,36 @@ void hrtimer_interrupt(struct clock_event_device *dev)
         entry_time = now = hrtimer_update_base(cpu_base);
 retry:
         cpu_base->in_hrtirq = 1;
+
+        if (timer_padding_is_enabled()) {
+                tsc_now = rdtsc();
+                current_overhead = tsc_now - this_cpu_read(last_programmed_time_tsc);
+
+                if (current_overhead > 0 && cpu_base->next_timer
+                    && ktime_before(now, cpu_base->expires_next)) {
+                        early = ktime_sub(cpu_base->expires_next, now);
+                        while (early > 0) {
+                                /*
+                                 * We pad/advance the timer by the value of min_overhead_tsc.
+                                 * That means we cannot arrive here earlier than the expected
+                                 * expiry by more than min_overhead_tsc, even with no overhead.
+                                 */
+                                if (ktime_to_ns(early) > tsc_delta2ns(this_cpu_read(min_overhead_tsc), dev))
+                                        break;
+                                timer_padding_spin = ktime_add(timer_padding_spin, early);
+                                ndelay(early);
+                                now = hrtimer_update_base(cpu_base);
+                                if (!ktime_before(now, cpu_base->expires_next)) {
+                                        early = 0;
+                                        break;
+                                } else
+                                        early = ktime_sub(cpu_base->expires_next, now);
+                        }
+                }
+                if (current_overhead > 0 && current_overhead < this_cpu_read(min_overhead_tsc)) {
+                        this_cpu_write(min_overhead_tsc, current_overhead);
+                }
+        }
         /*
          * We set expires_next to KTIME_MAX here with cpu_base->lock
          * held to prevent that a timer is enqueued in our queue via
@@ -1825,6 +1863,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
         if (expires_next == KTIME_MAX ||
             !tick_program_event(expires_next, 0)) {
                 cpu_base->hang_detected = 0;
+                if (timer_padding_is_enabled() && timer_padding_spin > 5000)
+                        pr_info_ratelimited("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
                 return;
         }
@@ -1868,6 +1908,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
         else
                 expires_next = ktime_add(now, delta);
         tick_program_event(expires_next, 1);
+        if (timer_padding_is_enabled())
+                pr_warn_once("hrtimer: timer padding spent %llu ns spinning\n", ktime_to_ns(timer_padding_spin));
         pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
 }
-- 
2.23.3
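
Usage note: for testing, the advancement value learned on each CPU can be read back through the new timer_padding_ns attribute. Below is a minimal userspace sketch, assuming the attribute appears alongside the existing clockevents attributes under /sys/devices/system/clockevents/clockeventN/; the exact path, the 64-CPU probe limit, and stopping at the first missing file are illustrative assumptions, not something the patch guarantees.

    /* Hypothetical reader for the per-CPU timer_padding_ns sysfs attribute. */
    #include <stdio.h>

    int main(void)
    {
            char path[128];
            unsigned long long ns;
            FILE *f;
            int cpu;

            for (cpu = 0; cpu < 64; cpu++) {
                    /* Assumed path, mirroring the existing current_device attribute. */
                    snprintf(path, sizeof(path),
                             "/sys/devices/system/clockevents/clockevent%d/timer_padding_ns",
                             cpu);
                    f = fopen(path, "r");
                    if (!f)
                            break;  /* no such CPU or attribute; stop probing */
                    if (fscanf(f, "%llu", &ns) == 1)
                            printf("cpu%d: timer padding = %llu ns\n", cpu, ns);
                    fclose(f);
            }
            return 0;
    }

The optimization itself can be disabled at boot with timerpadding=0, which is handled by setup_timer_padding() above.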