From c6c12d88ab31ac78bcb779ad09204bde05b68f3e Mon Sep 17 00:00:00 2001
From: Alexey Makhalov
Date: Tue, 8 Aug 2017 19:32:45 +0000
Subject: [PATCH] PV-ops support for VMware guest

---
 arch/x86/Kconfig                      |  18 ++
 arch/x86/include/asm/paravirt.h       |   5 +
 arch/x86/include/asm/paravirt_types.h |   5 +
 arch/x86/kernel/cpu/common.c          |   2 +
 arch/x86/kernel/cpu/rdrand.c          |   2 +
 arch/x86/kernel/cpu/vmware.c          | 414 +++++++++++++++++++++++++++++++++-
 arch/x86/kernel/head_64.S             |  10 +
 arch/x86/kernel/paravirt.c            |   7 +
 arch/x86/kernel/setup.c               |   9 +
 arch/x86/kernel/smpboot.c             |   2 +-
 kernel/sched/clock.c                  |   7 +
 11 files changed, 475 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 436639a..4ad7f49 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -710,6 +710,24 @@ config KVM_DEBUG_FS
 	  Statistics are displayed in debugfs filesystem. Enabling this option
 	  may incur significant overhead.
 
+config VMWARE
+	bool "VMware guest support"
+	depends on PARAVIRT
+	default y
+	---help---
+	  This option enables various optimizations for running under the
+	  VMware hypervisor. It includes the vmware-clock clocksource and
+	  some pv-ops implementations.
+
+config VMWARE_ONLY
+	bool "Build for VMware only"
+	depends on VMWARE
+	default n
+	---help---
+	  This option enables VMware-guest-specific optimizations. If you say
+	  yes here, the kernel will probably work only under the VMware hypervisor.
+
+
 source "arch/x86/lguest/Kconfig"
 
 config PARAVIRT_TIME_ACCOUNTING
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c759b3c..5ee3378 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -198,6 +198,11 @@ static inline u64 paravirt_steal_clock(int cpu)
 	return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
 }
 
+static inline void paravirt_read_boot_clock64(struct timespec64 *ts)
+{
+	PVOP_VCALL1(pv_time_ops.read_boot_clock64, ts);
+}
+
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
 	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 3d44191..2e76e4a 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,10 @@ struct mm_struct;
 struct desc_struct;
 struct task_struct;
 struct cpumask;
+#if __BITS_PER_LONG == 64
+# define timespec64 timespec
+#endif
+struct timespec64;
 
 /*
  * Wrapper type for pointers to code which uses the non-standard
@@ -102,6 +106,7 @@ struct pv_lazy_ops {
 struct pv_time_ops {
 	unsigned long long (*sched_clock)(void);
 	unsigned long long (*steal_clock)(int cpu);
+	void (*read_boot_clock64)(struct timespec64 *ts);
 };
 
 struct pv_cpu_ops {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 637ca41..2635ce9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -943,7 +943,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #endif
 
 	init_hypervisor(c);
+#ifndef CONFIG_VMWARE_ONLY
 	x86_init_rdrand(c);
+#endif
 	x86_init_cache_qos(c);
 
 	/*
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 136ac74..0685891 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -32,6 +32,7 @@ static int __init x86_rdrand_setup(char *s)
 }
 __setup("nordrand", x86_rdrand_setup);
 
+#ifndef CONFIG_VMWARE_ONLY
 /*
  * Force a reseed cycle; we are architecturally guaranteed a reseed
  * after no more than 512 128-bit chunks of random data.  This also
@@ -58,3 +59,4 @@ void x86_init_rdrand(struct cpuinfo_x86 *c)
 		clear_cpu_cap(c, X86_FEATURE_RDRAND);
 #endif
 }
+#endif
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 628a059..43aae4e 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -26,25 +26,73 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
 
-#define CPUID_VMWARE_INFO_LEAF	0x40000000
-#define VMWARE_HYPERVISOR_MAGIC	0x564D5868
-#define VMWARE_HYPERVISOR_PORT	0x5658
+#define CPUID_VMWARE_INFO_LEAF		0x40000000
+#define VMWARE_HYPERVISOR_MAGIC		0x564D5868
+#define VMWARE_HYPERVISOR_PORT		0x5658
+#define VMWARE_HYPERVISOR_HB_PORT	0x5659
 
 #define VMWARE_PORT_CMD_GETVERSION	10
 #define VMWARE_PORT_CMD_GETHZ		45
 #define VMWARE_PORT_CMD_GETVCPU_INFO	68
 #define VMWARE_PORT_CMD_LEGACY_X2APIC	3
 #define VMWARE_PORT_CMD_VCPU_RESERVED	31
+#define VMWARE_PORT_CMD_STEALCLOCK	91
+# define STEALCLOCK_IS_NOT_AVAILABLE	-1
+# define STEALCLOCK_IS_DISABLED		0
+# define STEALCLOCK_IS_ENABLED		1
+#define VMWARE_PORT_CMD_MESSAGE		30
+#define VMWARE_HB_PORT_CMD_MESSAGE	0
 
 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx)				\
+	VMWARE_PORT2(cmd, eax, ebx, ecx, edx, UINT_MAX)
+
+#define VMWARE_PORT2(cmd, eax, ebx, ecx, edx, arg)			\
 	__asm__("inl (%%dx)" :						\
 			"=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) :	\
 			"0"(VMWARE_HYPERVISOR_MAGIC),			\
 			"1"(VMWARE_PORT_CMD_##cmd),			\
-			"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) :	\
+			"2"(VMWARE_HYPERVISOR_PORT), "3"(arg) :		\
 			"memory");
 
+struct vmware_steal_time {
+	uint64_t clock;		/* stolen time counter, in units of vTSC */
+	uint64_t reserved[7];
+};
+
+static DEFINE_PER_CPU(struct vmware_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
+
+static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+{
+	uint32_t result, info;
+
+	__asm__ __volatile__ ("inl (%%dx)"
+		: "=a" (result),
+		  "=c" (info)
+		: "a" (VMWARE_HYPERVISOR_MAGIC),
+		  "c" (VMWARE_PORT_CMD_STEALCLOCK),
+		  "d" (VMWARE_HYPERVISOR_PORT),
+		  "b" (0),
+		  "S" (arg1),
+		  "D" (arg2));
+	return result;
+}
+
+#define STEALCLOCK_ENABLE(pa)						\
+	(vmware_cmd_stealclock((pa) >> 32, (pa) & 0xffffffff)		\
+	 == STEALCLOCK_IS_ENABLED)
+
+#define STEALCLOCK_DISABLE()						\
+	vmware_cmd_stealclock(0, 1)
+
+static int vmware_is_stealclock_available(void)
+{
+	return STEALCLOCK_DISABLE() != STEALCLOCK_IS_NOT_AVAILABLE;
+}
+
 static inline int __vmware_platform(void)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -75,17 +123,246 @@ static unsigned long vmware_get_tsc_khz(void)
 	return tsc_hz;
 }
 
+#define CYC2NS_SCALE_FACTOR 8
+
+static struct cyc2ns_data vmware_cyc2ns;
+uint64_t __initdata tsc_at_head;
+uint64_t __initdata vtsc_khz;
+
+static cycle_t vmware_clock_get_cycles(struct clocksource *cs)
+{
+	return (cycle_t)rdtsc_ordered();
+}
+
+static struct clocksource clocksource_vmware = {
+	.name = "vmware-clock",
+	.read = vmware_clock_get_cycles,
+	.rating = 400,
+	.mask = CLOCKSOURCE_MASK(64),
+	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+	.archdata = { .vclock_mode = VCLOCK_TSC },
+};
+
+#ifdef CONFIG_VMWARE_ONLY
+/* Use clocksource_vmware from the very beginning to avoid drift in the
+   monotonic clock. */
+struct clocksource * __init clocksource_default_clock(void)
+{
+	return &clocksource_vmware;
+}
+#endif
+
+static u64 vmware_sched_clock(void)
+{
+	u64 ret;
+
+	ret = rdtsc() - vmware_cyc2ns.cyc2ns_offset;
+	ret = mul_u64_u32_shr(ret, vmware_cyc2ns.cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+	return ret;
+}
+
+/* Read the exact time at which the system was started. It is used as the
+   zero point for the monotonic clock. */
+static void vmware_read_boot_clock64(struct timespec64 *ts)
+{
+	struct timespec64 now;
+	u64 delta, delta_nsec;
+	u32 rem;
+
+	read_persistent_clock64(&now);
+	delta = rdtsc() - vmware_cyc2ns.cyc2ns_offset;
+	delta_nsec = mul_u64_u32_shr(delta, vmware_cyc2ns.cyc2ns_mul,
+				     CYC2NS_SCALE_FACTOR);
+	ts->tv_sec = now.tv_sec - div_s64_rem(delta_nsec, NSEC_PER_SEC, &rem);
+	ts->tv_nsec = now.tv_nsec - rem;
+	while (unlikely(ts->tv_nsec < 0)) {
+		ts->tv_sec--;
+		ts->tv_nsec += NSEC_PER_SEC;
+	}
+}
+
+static uint64_t vmware_steal_clock(int cpu)
+{
+	struct vmware_steal_time *steal;
+
+	steal = &per_cpu(steal_time, cpu);
+	return mul_u64_u32_shr(steal->clock, vmware_cyc2ns.cyc2ns_mul,
+			       CYC2NS_SCALE_FACTOR);
+}
+
+static void vmware_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	struct vmware_steal_time *st = &per_cpu(steal_time, cpu);
+
+	if (!has_steal_clock)
+		return;
+
+	memset(st, 0, sizeof(*st));
+
+	if (!STEALCLOCK_ENABLE(slow_virt_to_phys(st))) {
+		has_steal_clock = 0;
+		return;
+	}
+
+	pr_info("vmware-stealtime: cpu %d, pa %llx\n",
+		cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+void vmware_disable_steal_time(void)
+{
+	if (!has_steal_clock)
+		return;
+
+	STEALCLOCK_DISABLE();
+}
+
+static void vmware_guest_cpu_init(void)
+{
+	if (has_steal_clock)
+		vmware_register_steal_time();
+}
+
+#ifdef CONFIG_SMP
+static void __init vmware_smp_prepare_boot_cpu(void)
+{
+	vmware_guest_cpu_init();
+	native_smp_prepare_boot_cpu();
+}
+
+static void vmware_guest_cpu_online(void *dummy)
+{
+	vmware_guest_cpu_init();
+}
+
+static void vmware_guest_cpu_offline(void *dummy)
+{
+	vmware_disable_steal_time();
+}
+
+static int vmware_cpu_notify(struct notifier_block *self, unsigned long action,
+			     void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE_FROZEN:
+		smp_call_function_single(cpu, vmware_guest_cpu_online,
+					 NULL, 0);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		smp_call_function_single(cpu, vmware_guest_cpu_offline,
+					 NULL, 1);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block vmware_cpu_notifier = {
+	.notifier_call = vmware_cpu_notify,
+};
+#endif
+
+static int sta_enabled = 1;	/* steal time accounting */
+
+static int parse_vmw_no_sta(char *arg)
+{
+	sta_enabled = 0;
+	return 0;
+}
+
+early_param("vmw-no-sta", parse_vmw_no_sta);
+
+static __init int activate_jump_labels(void)
+{
+	if (has_steal_clock) {
+		static_key_slow_inc(&paravirt_steal_enabled);
+		if (sta_enabled)
+			static_key_slow_inc(&paravirt_steal_rq_enabled);
+	}
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
+
+static void __init paravirt_ops_setup(void)
+{
+	pv_info.name = "VMware";
+	pv_cpu_ops.io_delay = paravirt_nop;
+	pv_time_ops.sched_clock = vmware_sched_clock;
+	pv_time_ops.read_boot_clock64 = vmware_read_boot_clock64;
+
+	if (vmware_is_stealclock_available()) {
+		has_steal_clock = 1;
+		pv_time_ops.steal_clock = vmware_steal_clock;
+	}
+
+#ifdef CONFIG_X86_IO_APIC
+	no_timer_check = 1;
+#endif
+}
+
+static void kmsg_dumper_vmware_log(struct kmsg_dumper *dumper,
+				   enum kmsg_dump_reason reason);
+
+static struct kmsg_dumper kmsg_dumper = {
+	.dump = kmsg_dumper_vmware_log
+};
+
 static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
 
 	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
 
-	if (ebx != UINT_MAX)
+	if (ebx != UINT_MAX) {
 		x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+#ifdef CONFIG_X86_LOCAL_APIC
+		/* Skip lapic calibration since we know the bus frequency. */
+		lapic_timer_frequency = ecx / HZ;
+		pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+			ecx);
+#endif
-	else
+	} else
 		printk(KERN_WARNING
 		       "Failed to get TSC freq from the hypervisor\n");
+
+	vtsc_khz = eax | (((uint64_t)ebx) << 32);
+	do_div(vtsc_khz, 1000);
+
+	printk(KERN_INFO "Pre-kernel boot time: %ums\n",
+	       (unsigned int) (tsc_at_head / vtsc_khz));
+
+	vmware_cyc2ns.cyc2ns_mul =
+		DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
+				  vtsc_khz);
+	vmware_cyc2ns.cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+	vmware_cyc2ns.cyc2ns_offset = tsc_at_head;
+
+	clocksource_register_khz(&clocksource_vmware, vtsc_khz);
+
+	paravirt_ops_setup();
+
+	/* vmware_cpu_notifier is used only by STA */
+	if (has_steal_clock) {
+#ifdef CONFIG_SMP
+		smp_ops.smp_prepare_boot_cpu = vmware_smp_prepare_boot_cpu;
+		register_cpu_notifier(&vmware_cpu_notifier);
+#else
+		vmware_guest_cpu_init();
+#endif
+	}
+
+#ifdef CONFIG_PCI
+	/* PCI BIOS service won't work from a PV guest. */
+	pci_probe &= ~PCI_PROBE_BIOS;
+#endif
+	kmsg_dump_register(&kmsg_dumper);
 }
 
 /*
@@ -95,6 +372,9 @@ static void __init vmware_platform_setup(void)
  */
 static uint32_t __init vmware_platform(void)
 {
+#ifndef CONFIG_VMWARE_ONLY
+	tsc_at_head = rdtsc();
+#endif
 	if (cpu_has_hypervisor) {
 		unsigned int eax;
 		unsigned int hyper_vendor_id[3];
@@ -145,3 +425,127 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = {
 	.x2apic_available	= vmware_legacy_x2apic_available,
 };
 EXPORT_SYMBOL(x86_hyper_vmware);
+
+#define MESSAGE_STATUS_SUCCESS	(0x01 << 16)
+#define MESSAGE_STATUS_CPT	(0x10 << 16)
+#define MESSAGE_STATUS_HB	(0x80 << 16)
+
+#define RPCI_PROTOCOL_NUM	0x49435052 /* 'RPCI' */
+#define GUESTMSG_FLAG_COOKIE	0x80000000
+
+#define MESSAGE_TYPE_OPEN	(0 << 16)
+#define MESSAGE_TYPE_SENDSIZE	(1 << 16)
+#define MESSAGE_TYPE_CLOSE	(6 << 16)
+
+typedef struct {
+	uint32_t id;
+	uint32_t cookieHigh;
+	uint32_t cookieLow;
+} vmw_msg;
+
+static int
+vmware_log_open(vmw_msg *msg)
+{
+	uint32_t result, info, dx, si, di;
+
+	__asm__ __volatile__ ("inl (%%dx)"
+		: "=a" (result),
+		  "=c" (info),
+		  "=d" (dx),
+		  "=S" (si),
+		  "=D" (di)
+		: "a" (VMWARE_HYPERVISOR_MAGIC),
+		  "c" (VMWARE_PORT_CMD_MESSAGE | MESSAGE_TYPE_OPEN),
+		  "d" (VMWARE_HYPERVISOR_PORT),
+		  "b" (RPCI_PROTOCOL_NUM | GUESTMSG_FLAG_COOKIE));
+
+	if ((info & MESSAGE_STATUS_SUCCESS) == 0)
+		return 1;
+
+	msg->id = dx & 0xffff0000;
+	msg->cookieHigh = si;
+	msg->cookieLow = di;
+	return 0;
+}
+
+static int
+vmware_log_close(vmw_msg *msg)
+{
+	uint32_t result, info;
+
+	__asm__ __volatile__ ("inl (%%dx)"
+		: "=a" (result),
+		  "=c" (info)
+		: "a" (VMWARE_HYPERVISOR_MAGIC),
+		  "c" (VMWARE_PORT_CMD_MESSAGE | MESSAGE_TYPE_CLOSE),
+		  "d" (VMWARE_HYPERVISOR_PORT | msg->id),
+		  "b" (0),
+		  "S" (msg->cookieHigh),
+		  "D" (msg->cookieLow));
+
+	if ((info & MESSAGE_STATUS_SUCCESS) == 0)
+		return 1;
+	return 0;
+}
+
+static int
+vmware_log_send(vmw_msg *msg, const char *string)
+{
+	uint32_t result, info;
+	uint32_t len = strlen(string);
+
+retry:
+	__asm__ __volatile__ ("inl (%%dx)"
+		: "=a" (result),
+		  "=c" (info)
+		: "a" (VMWARE_HYPERVISOR_MAGIC),
+		  "c" (VMWARE_PORT_CMD_MESSAGE | MESSAGE_TYPE_SENDSIZE),
+		  "d" (VMWARE_HYPERVISOR_PORT | msg->id),
+		  "b" (len),
+		  "S" (msg->cookieHigh),
+		  "D" (msg->cookieLow));
+
+	if ((info & MESSAGE_STATUS_SUCCESS) == 0 ||
+	    (info & MESSAGE_STATUS_HB) == 0)
+		/* Expected success + high-bandwidth. Give up. */
+		return 1;
+
+	__asm__ __volatile__ ("pushq %%rbp\n\t"
+			      "movl %[rbp], %%ebp\n\t"
+			      "cld\n\t"
+			      "rep; outsb\n\t"
+			      "popq %%rbp\n\t"
+		: "=a" (result),
+		  "=b" (info)
+		: "a" (VMWARE_HYPERVISOR_MAGIC),
+		  "c" (len),
+		  "d" (VMWARE_HYPERVISOR_HB_PORT | msg->id),
+		  "b" (VMWARE_HB_PORT_CMD_MESSAGE | MESSAGE_STATUS_SUCCESS),
+		  "S" (string),
+		  [rbp] "r" (msg->cookieHigh),
+		  "D" (msg->cookieLow));
+
+	if ((info & MESSAGE_STATUS_SUCCESS) == 0) {
+		if (info & MESSAGE_STATUS_CPT)
+			/* A checkpoint occurred. Retry. */
+			goto retry;
+		return 1;
+	}
+	return 0;
+}
+
+static void kmsg_dumper_vmware_log(struct kmsg_dumper *dumper,
+				   enum kmsg_dump_reason reason)
+{
+	vmw_msg msg;
+	static char line[1024];
+	size_t len = 0;
+
+	line[0] = 'l';
+	line[1] = 'o';
+	line[2] = 'g';
+	line[3] = ' ';
+
+	while (kmsg_dump_get_line(dumper, true, line + 4,
+				  sizeof(line) - 4, &len)) {
+		line[len + 4] = '\0';
+		if (vmware_log_open(&msg) ||
+		    vmware_log_send(&msg, line) ||
+		    vmware_log_close(&msg))
+			break;
+	}
+}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e8..1bc0140 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -65,6 +65,16 @@ startup_64:
 	 * tables and then reload them.
 	 */
 
+#ifdef CONFIG_VMWARE_ONLY
+	/*
+	 * Read the TSC value as early as possible.
+	 */
+	rdtsc
+	shl	$0x20, %rdx
+	or	%rax, %rdx
+	mov	%rdx, tsc_at_head(%rip)
+#endif
+
 	/* Sanitize CPU configuration */
 	call verify_cpu
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f534a0e..93b24e4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -218,6 +218,12 @@ static u64 native_steal_clock(int cpu)
 	return 0;
 }
 
+static void native_read_boot_clock64(struct timespec64 *ts)
+{
+	ts->tv_sec = 0;
+	ts->tv_nsec = 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -328,6 +334,7 @@ struct pv_init_ops pv_init_ops = {
 struct pv_time_ops pv_time_ops = {
 	.sched_clock = native_sched_clock,
 	.steal_clock = native_steal_clock,
+	.read_boot_clock64 = native_read_boot_clock64,
 };
 
 __visible struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e67b834..8b78a86 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1287,3 +1287,12 @@ static int __init register_kernel_offset_dumper(void)
 	return 0;
 }
 __initcall(register_kernel_offset_dumper);
+
+/* We need to define a real function for read_boot_clock64 to override the
+   weak default version. */
+#ifdef CONFIG_PARAVIRT
+void read_boot_clock64(struct timespec64 *ts)
+{
+	paravirt_read_boot_clock64(ts);
+}
+#endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fbabe4f..5a18dd6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -557,7 +557,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
 	/*
 	 * Give the other CPU some time to accept the IPI.
 	 */
-	udelay(200);
+//	udelay(200);
 	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
 		maxlvt = lapic_get_maxlvt();
 		if (maxlvt > 3)			/* Due to the Pentium erratum 3AP. */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index caf4041..377ab5a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -385,8 +385,15 @@ u64 cpu_clock(int cpu)
  */
 u64 local_clock(void)
 {
+	/*
+	 * sched_clock is stable and runs from early boot in a VMware guest.
+	 * Skip this check so that printk timestamps are available from the
+	 * very beginning of boot.
+	 */
+#if !defined(CONFIG_VMWARE_ONLY) || !defined(CONFIG_PRINTK_TIME)
 	if (!sched_clock_stable())
 		return sched_clock_cpu(raw_smp_processor_id());
+#endif
 
 	return sched_clock();
 }
-- 
2.8.1
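
Note (illustration only, not part of the patch): the clock code above converts TSC
cycles to nanoseconds with fixed-point arithmetic. vmware_platform_setup() sets
cyc2ns_mul = DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, vtsc_khz) and
cyc2ns_offset = tsc_at_head, and vmware_sched_clock(), vmware_read_boot_clock64()
and vmware_steal_clock() then compute ns = ((tsc - cyc2ns_offset) * cyc2ns_mul)
>> CYC2NS_SCALE_FACTOR via mul_u64_u32_shr(). The standalone userspace sketch below
walks through that arithmetic with made-up sample values (a 2.4 GHz virtual TSC and
a one-millisecond delta); vtsc_khz and tsc_at_head here are local stand-ins for the
values the patch obtains from the GETHZ backdoor call and the early rdtsc.

/* cyc2ns_demo.c - sketch of the TSC-to-nanoseconds conversion used above.
 * Sample values are assumptions for illustration, not taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR	8
#define NSEC_PER_MSEC		1000000ULL

int main(void)
{
	uint64_t vtsc_khz = 2400000;	/* pretend hypervisor reports 2.4 GHz */
	uint64_t tsc_at_head = 1000000;	/* pretend TSC value captured at boot */
	uint64_t tsc_now = tsc_at_head + 2400000; /* one millisecond later */

	/* Same rounding as DIV_ROUND_CLOSEST(a, b) for positive values: (a + b/2) / b */
	uint64_t mul = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + vtsc_khz / 2)
			/ vtsc_khz;

	/*
	 * ns = ((tsc - offset) * mul) >> 8. The kernel helper mul_u64_u32_shr()
	 * uses a 128-bit intermediate product; the plain 64-bit multiply here
	 * is only safe for small deltas.
	 */
	uint64_t ns = ((tsc_now - tsc_at_head) * mul) >> CYC2NS_SCALE_FACTOR;

	printf("mul=%llu ns=%llu\n",
	       (unsigned long long)mul, (unsigned long long)ns);
	return 0;
}

With these inputs the sketch prints mul=107 ns=1003125, i.e. roughly one millisecond;
the ~0.3% error comes from rounding the multiplier to an integer with an 8-bit shift,
which is the precision/overflow trade-off the CYC2NS_SCALE_FACTOR of 8 encodes.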