From d5e7229bec41406a4040a1ac9131e24cb1f8768d Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Wed, 30 Sep 2015 23:00:00 +0000 Subject: [PATCH 01/12] Measure correct boot time. --- arch/x86/Kconfig | 8 ++++++++ arch/x86/kernel/head_64.S | 16 ++++++++++++++++ init/main.c | 11 +++++++++++ 3 files changed, 35 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3a1a5d..24141ac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -708,6 +708,14 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. +config VMWARE + bool "VMware Guest support" + depends on PARAVIRT + default y + ---help--- + This option enables various optimizations for running under the + VMware hypervisor. It includes a correct boot time measurement. + source "arch/x86/lguest/Kconfig" config PARAVIRT_TIME_ACCOUNTING diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8..eccf2d7 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,16 @@ startup_64: * tables and then reload them. */ +#ifdef CONFIG_VMWARE + /* + * Read a TSC value first + */ + rdtsc + shl $0x20, %rdx + or %rax, %rdx + mov %rdx, tsc_at_head(%rip) +#endif + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. 
@@ -512,6 +522,12 @@ early_gdt_descr: early_gdt_descr_base: .quad INIT_PER_CPU_VAR(gdt_page) +#ifdef CONFIG_VMWARE + .globl tsc_at_head +tsc_at_head: + .quad 0 +#endif + ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 diff --git a/init/main.c b/init/main.c index 5650655..c386186 100644 --- a/init/main.c +++ b/init/main.c @@ -929,6 +929,9 @@ static int try_to_run_init_process(const char *init_filename) } static noinline void __init kernel_init_freeable(void); +#ifdef CONFIG_VMWARE +extern unsigned long long tsc_at_head; +#endif static int __ref kernel_init(void *unused) { @@ -944,6 +947,14 @@ static int __ref kernel_init(void *unused) flush_delayed_fput(); +#ifdef CONFIG_VMWARE + printk(KERN_INFO "Pre-Kernel time: %5dms\n", + (unsigned int) (tsc_at_head / tsc_khz)); + printk(KERN_INFO "Kernel boot time:%5dms\n", + (unsigned int) ((__native_read_tsc() - tsc_at_head) / + tsc_khz)); +#endif + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) -- 1.9.1 From 500436e32d4dffae5d78f12be72c2e6784ab8cfb Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Fri, 2 Oct 2015 20:00:06 +0000 Subject: [PATCH 02/12] PV io_delay for VMware guest. 
--- arch/x86/kernel/cpu/vmware.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 628a059..8fdd031 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -26,6 +26,7 @@ #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> +#include <asm/timer.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -75,6 +76,16 @@ static unsigned long vmware_get_tsc_khz(void) return tsc_hz; } +static void __init paravirt_ops_setup(void) +{ + pv_info.name = "VMware"; + pv_cpu_ops.io_delay = paravirt_nop, + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif +} + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -86,6 +97,8 @@ static void __init vmware_platform_setup(void) else printk(KERN_WARNING "Failed to get TSC freq from the hypervisor\n"); + + paravirt_ops_setup(); } /* -- 1.9.1 From adff5db39b45d8adef2b4579ec46ab1bb721a81f Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Wed, 7 Oct 2015 22:53:18 +0000 Subject: [PATCH 03/12] Improved tsc based sched_clock & clocksource. 
--- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/vmware.c | 66 ++++++++++++++++++++++++++++++++++++++++++++ init/main.c | 11 -------- kernel/sched/clock.c | 2 ++ 4 files changed, 69 insertions(+), 11 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 24141ac..ca0be27 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -711,6 +711,7 @@ config KVM_DEBUG_FS config VMWARE bool "VMware Guest support" depends on PARAVIRT + select PARAVIRT_CLOCK default y ---help--- This option enables various optimizations for running under the diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 8fdd031..004825e 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -27,6 +27,7 @@ #include <asm/x86_init.h> #include <asm/hypervisor.h> #include <asm/timer.h> +#include <linux/sched.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -76,10 +77,43 @@ static unsigned long vmware_get_tsc_khz(void) return tsc_hz; } +static struct cyc2ns_data vmware_cyc2ns; +extern unsigned long long tsc_at_head; +static cycle_t vmware_clock_get_cycles(struct clocksource *cs) +{ + return __native_read_tsc(); +} + +static struct clocksource clocksource_vmware = { + .name = "vmware-clock", + .read = vmware_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +struct clocksource * __init clocksource_default_clock(void) +{ + return &clocksource_vmware; +} + +#define CYC2NS_SCALE_FACTOR 8 + +static u64 vmware_sched_clock(void) +{ + u64 ret; + + ret = __native_read_tsc() - vmware_cyc2ns.cyc2ns_offset; + ret = mul_u64_u32_shr(ret, vmware_cyc2ns.cyc2ns_mul, CYC2NS_SCALE_FACTOR); + return ret; +} + +extern __read_mostly int sched_clock_running; static void __init paravirt_ops_setup(void) { pv_info.name = "VMware"; pv_cpu_ops.io_delay = paravirt_nop, + pv_time_ops.sched_clock = vmware_sched_clock; #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; @@ -88,6 
+122,7 @@ static void __init paravirt_ops_setup(void) static void __init vmware_platform_setup(void) { + uint64_t cpu_khz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -98,6 +133,19 @@ static void __init vmware_platform_setup(void) printk(KERN_WARNING "Failed to get TSC freq from the hypervisor\n"); + cpu_khz = eax | (((uint64_t)ebx) << 32); + do_div(cpu_khz, 1000); + printk(KERN_INFO "Pre Kernel boot time: %dms\n", + (unsigned int) (tsc_at_head / cpu_khz)); + + vmware_cyc2ns.cyc2ns_mul = + DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, + cpu_khz); + vmware_cyc2ns.cyc2ns_shift = CYC2NS_SCALE_FACTOR; + vmware_cyc2ns.cyc2ns_offset = tsc_at_head; + + clocksource_register_khz(&clocksource_vmware, cpu_khz); + paravirt_ops_setup(); } @@ -158,3 +206,21 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = { .x2apic_available = vmware_legacy_x2apic_available, }; EXPORT_SYMBOL(x86_hyper_vmware); + +void read_boot_clock64(struct timespec64 *ts) +{ + struct timespec64 now; + u64 delta, delta_nsec; + u32 rem; + + read_persistent_clock64(&now); + delta = __native_read_tsc() - vmware_cyc2ns.cyc2ns_offset; + delta_nsec = mul_u64_u32_shr(delta, vmware_cyc2ns.cyc2ns_mul, + CYC2NS_SCALE_FACTOR); + ts->tv_sec = now.tv_sec - div_s64_rem(delta_nsec, NSEC_PER_SEC, &rem); + ts->tv_nsec = now.tv_nsec - rem; + while (unlikely(ts->tv_nsec < 0)) { + ts->tv_sec--; + ts->tv_nsec += NSEC_PER_SEC; + } +} diff --git a/init/main.c b/init/main.c index c386186..5650655 100644 --- a/init/main.c +++ b/init/main.c @@ -929,9 +929,6 @@ static int try_to_run_init_process(const char *init_filename) } static noinline void __init kernel_init_freeable(void); -#ifdef CONFIG_VMWARE -extern unsigned long long tsc_at_head; -#endif static int __ref kernel_init(void *unused) { @@ -947,14 +944,6 @@ static int __ref kernel_init(void *unused) flush_delayed_fput(); -#ifdef CONFIG_VMWARE - printk(KERN_INFO "Pre-Kernel time: %5dms\n", - (unsigned int) (tsc_at_head / tsc_khz)); - 
printk(KERN_INFO "Kernel boot time:%5dms\n", - (unsigned int) ((__native_read_tsc() - tsc_at_head) / - tsc_khz)); -#endif - if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a2051..284a7ba 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -385,8 +385,10 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { +#ifndef CONFIG_VMWARE if (!sched_clock_stable()) return sched_clock_cpu(raw_smp_processor_id()); +#endif return sched_clock(); } -- 1.9.1 From 3bd5760c3b1f6cb39568361561d7d1e5440f1109 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Mon, 12 Oct 2015 22:43:38 +0000 Subject: [PATCH 04/12] Move read_boot_clock64 into pv_time_ops. --- arch/x86/Kconfig | 14 ++++++-- arch/x86/include/asm/paravirt.h | 5 +++ arch/x86/include/asm/paravirt_types.h | 5 +++ arch/x86/kernel/cpu/vmware.c | 66 ++++++++++++++++++++--------------- arch/x86/kernel/head_64.S | 8 +---- arch/x86/kernel/paravirt.c | 7 ++++ arch/x86/kernel/setup.c | 9 +++++ kernel/sched/clock.c | 7 +++- 8 files changed, 83 insertions(+), 38 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca0be27..d3ef8ef 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -709,13 +709,23 @@ config KVM_DEBUG_FS may incur significant overhead. config VMWARE - bool "VMware Guest support" + bool "VMware guest support" depends on PARAVIRT select PARAVIRT_CLOCK default y ---help--- This option enables various optimizations for running under the - VMware hypervisor. It includes a correct boot time measurement. + VMware hypervisor. It includes vmware-clock clocksource and some + pv-ops implementations. + +config VMWARE_ONLY + bool "Build for VMware only" + depends on VMWARE + default n + ---help--- + This option enables VMware guest specific optimizations. If you say + yes here, the kernel will probably work only under VMware hypervisor. 
+ source "arch/x86/lguest/Kconfig" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d143bfa..ffcbd18 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -201,6 +201,11 @@ static inline u64 paravirt_steal_clock(int cpu) return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu); } +static inline void paravirt_read_boot_clock64(struct timespec64 *ts) +{ + PVOP_VCALL1(pv_time_ops.read_boot_clock64, ts); +} + static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a6b8f9f..7adcd55 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -51,6 +51,10 @@ struct mm_struct; struct desc_struct; struct task_struct; struct cpumask; +#if __BITS_PER_LONG == 64 +# define timespec64 timespec +#endif +struct timespec64; /* * Wrapper type for pointers to code which uses the non-standard @@ -98,6 +102,7 @@ struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); unsigned long (*get_tsc_khz)(void); + void (*read_boot_clock64)(struct timespec64 *ts); }; struct pv_cpu_ops { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 004825e..1bf1fe3 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -77,8 +77,10 @@ static unsigned long vmware_get_tsc_khz(void) return tsc_hz; } +#define CYC2NS_SCALE_FACTOR 8 static struct cyc2ns_data vmware_cyc2ns; -extern unsigned long long tsc_at_head; +u64 __initdata tsc_at_head; + static cycle_t vmware_clock_get_cycles(struct clocksource *cs) { return __native_read_tsc(); @@ -92,12 +94,14 @@ static struct clocksource clocksource_vmware = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +#ifdef CONFIG_VMWARE_ONLY +/* We want to use clocksource_vmware from the beginning to avoid drifting in + monotonic 
clock */ struct clocksource * __init clocksource_default_clock(void) { return &clocksource_vmware; } - -#define CYC2NS_SCALE_FACTOR 8 +#endif static u64 vmware_sched_clock(void) { @@ -108,12 +112,33 @@ static u64 vmware_sched_clock(void) return ret; } -extern __read_mostly int sched_clock_running; + +/* Function to read the exact time the system has been started. It will be + used as zero time for monotonic clock */ +static void vmware_read_boot_clock64(struct timespec64 *ts) +{ + struct timespec64 now; + u64 delta, delta_nsec; + u32 rem; + + read_persistent_clock64(&now); + delta = __native_read_tsc() - vmware_cyc2ns.cyc2ns_offset; + delta_nsec = mul_u64_u32_shr(delta, vmware_cyc2ns.cyc2ns_mul, + CYC2NS_SCALE_FACTOR); + ts->tv_sec = now.tv_sec - div_s64_rem(delta_nsec, NSEC_PER_SEC, &rem); + ts->tv_nsec = now.tv_nsec - rem; + while (unlikely(ts->tv_nsec < 0)) { + ts->tv_sec--; + ts->tv_nsec += NSEC_PER_SEC; + } +} + static void __init paravirt_ops_setup(void) { pv_info.name = "VMware"; pv_cpu_ops.io_delay = paravirt_nop, pv_time_ops.sched_clock = vmware_sched_clock; + pv_time_ops.read_boot_clock64 = vmware_read_boot_clock64; #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; @@ -122,7 +147,7 @@ static void __init paravirt_ops_setup(void) static void __init vmware_platform_setup(void) { - uint64_t cpu_khz; + uint64_t vtsc_khz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -133,18 +158,18 @@ static void __init vmware_platform_setup(void) printk(KERN_WARNING "Failed to get TSC freq from the hypervisor\n"); - cpu_khz = eax | (((uint64_t)ebx) << 32); - do_div(cpu_khz, 1000); + vtsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(vtsc_khz, 1000); printk(KERN_INFO "Pre Kernel boot time: %dms\n", - (unsigned int) (tsc_at_head / cpu_khz)); + (unsigned int) (tsc_at_head / vtsc_khz)); vmware_cyc2ns.cyc2ns_mul = DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, - cpu_khz); + vtsc_khz); vmware_cyc2ns.cyc2ns_shift = CYC2NS_SCALE_FACTOR; 
vmware_cyc2ns.cyc2ns_offset = tsc_at_head; - clocksource_register_khz(&clocksource_vmware, cpu_khz); + clocksource_register_khz(&clocksource_vmware, vtsc_khz); paravirt_ops_setup(); } @@ -156,6 +181,9 @@ static void __init vmware_platform_setup(void) */ static uint32_t __init vmware_platform(void) { +#ifndef CONFIG_VMWARE_ONLY + tsc_at_head = __native_read_tsc(); +#endif if (cpu_has_hypervisor) { unsigned int eax; unsigned int hyper_vendor_id[3]; @@ -206,21 +234,3 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = { .x2apic_available = vmware_legacy_x2apic_available, }; EXPORT_SYMBOL(x86_hyper_vmware); - -void read_boot_clock64(struct timespec64 *ts) -{ - struct timespec64 now; - u64 delta, delta_nsec; - u32 rem; - - read_persistent_clock64(&now); - delta = __native_read_tsc() - vmware_cyc2ns.cyc2ns_offset; - delta_nsec = mul_u64_u32_shr(delta, vmware_cyc2ns.cyc2ns_mul, - CYC2NS_SCALE_FACTOR); - ts->tv_sec = now.tv_sec - div_s64_rem(delta_nsec, NSEC_PER_SEC, &rem); - ts->tv_nsec = now.tv_nsec - rem; - while (unlikely(ts->tv_nsec < 0)) { - ts->tv_sec--; - ts->tv_nsec += NSEC_PER_SEC; - } -} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index eccf2d7..1dfd805 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,7 +65,7 @@ startup_64: * tables and then reload them. 
*/ -#ifdef CONFIG_VMWARE +#ifdef CONFIG_VMWARE_ONLY /* * Read a TSC value first */ @@ -522,12 +522,6 @@ early_gdt_descr: early_gdt_descr_base: .quad INIT_PER_CPU_VAR(gdt_page) -#ifdef CONFIG_VMWARE - .globl tsc_at_head -tsc_at_head: - .quad 0 -#endif - ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 58bcfb6..abf40ec 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -210,6 +210,12 @@ static u64 native_steal_clock(int cpu) return 0; } +static void native_read_boot_clock64(struct timespec64 *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -320,6 +326,7 @@ struct pv_init_ops pv_init_ops = { struct pv_time_ops pv_time_ops = { .sched_clock = native_sched_clock, .steal_clock = native_steal_clock, + .read_boot_clock64 = native_read_boot_clock64, }; __visible struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80f874b..0d7022e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1289,3 +1289,12 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); + +/* We need to define a real function for read_boot_clock64, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +void read_boot_clock64(struct timespec64 *ts) +{ + paravirt_read_boot_clock64(ts); +} +#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 284a7ba..615aeb4 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -385,7 +385,12 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { -#ifndef CONFIG_VMWARE + /* + * sched_clock is stable and running for VMware guest. + * Let's disable this checking. 
It will allow us to have + * printk timestamps from the beginning + */ +#if !defined(CONFIG_VMWARE_ONLY) || !defined(CONFIG_PRINTK_TIME) if (!sched_clock_stable()) return sched_clock_cpu(raw_smp_processor_id()); #endif -- 1.9.1 From aa93eaec3f709633007ab6ce3ddbb8aaa455b557 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Thu, 5 Nov 2015 21:02:52 +0000 Subject: [PATCH 05/12] Fix clocksource_vmware issue in VM version <= 10 --- arch/x86/kernel/cpu/vmware.c | 48 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 1bf1fe3..0b89bb9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -79,7 +79,8 @@ static unsigned long vmware_get_tsc_khz(void) #define CYC2NS_SCALE_FACTOR 8 static struct cyc2ns_data vmware_cyc2ns; -u64 __initdata tsc_at_head; +uint64_t __initdata tsc_at_head; +uint64_t __initdata vtsc_khz; static cycle_t vmware_clock_get_cycles(struct clocksource *cs) { @@ -95,11 +96,45 @@ static struct clocksource clocksource_vmware = { }; #ifdef CONFIG_VMWARE_ONLY +/* + * clocksource_vmware_periodic - is a temporary clocksource only for + * early boot initialization. + * Hack to avoid infinite looping in calibrate_APIC_clock() when + * tsc_deadline_timer is not supported by hypervisor (VM version <= 10) + * calibrate_APIC_clock() relies on _periodic_ timer! + * In that case we do not need to use clocksource that is valid for + * hres/oneshot timer. + */ +static struct clocksource __initdata clocksource_vmware_periodic = { + .name = "vmware-clock-periodic", + .read = vmware_clock_get_cycles, + .rating = 100, + .mask = CLOCKSOURCE_MASK(64), +}; + +static struct clocksource __initdata * initial_clocksource; + +/* + * clocksource_vmware_register + * + * Time to register real clocksource. It will be activated in + * clocksource_done_booting(). 
+ */ +static int __init clocksource_vmware_register(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + clocksource_register_khz(&clocksource_vmware, vtsc_khz); + clocksource_unregister(&clocksource_vmware_periodic); + } + return 0; +} +subsys_initcall(clocksource_vmware_register); + /* We want to use clocksource_vmware from the beginning to avoid drifting in monotonic clock */ struct clocksource * __init clocksource_default_clock(void) { - return &clocksource_vmware; + return initial_clocksource; } #endif @@ -147,7 +182,6 @@ static void __init paravirt_ops_setup(void) static void __init vmware_platform_setup(void) { - uint64_t vtsc_khz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -169,7 +203,15 @@ static void __init vmware_platform_setup(void) vmware_cyc2ns.cyc2ns_shift = CYC2NS_SCALE_FACTOR; vmware_cyc2ns.cyc2ns_offset = tsc_at_head; +#ifdef CONFIG_VMWARE_ONLY + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + initial_clocksource = &clocksource_vmware_periodic; + else + initial_clocksource = &clocksource_vmware; + clocksource_register_khz(initial_clocksource, vtsc_khz); +#else clocksource_register_khz(&clocksource_vmware, vtsc_khz); +#endif paravirt_ops_setup(); } -- 1.9.1 From 245c6ff168fabde177b5b6023356b6005b0efbef Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Tue, 10 Nov 2015 11:46:57 +0000 Subject: [PATCH 06/12] Get lapic timer frequency from HV, skip calibration --- arch/x86/kernel/cpu/vmware.c | 48 +++++--------------------------------------- 1 file changed, 5 insertions(+), 43 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 0b89bb9..b16618b 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -96,45 +96,11 @@ static struct clocksource clocksource_vmware = { }; #ifdef CONFIG_VMWARE_ONLY -/* - * clocksource_vmware_periodic - is a temporary clocksource only for - * early boot initialization. 
- * Hack to avoid infinite looping in calibrate_APIC_clock() when - * tsc_deadline_timer is not supported by hypervisor (VM version <= 10) - * calibrate_APIC_clock() relies on _periodic_ timer! - * In that case we do not need to use clocksource that is valid for - * hres/oneshot timer. - */ -static struct clocksource __initdata clocksource_vmware_periodic = { - .name = "vmware-clock-periodic", - .read = vmware_clock_get_cycles, - .rating = 100, - .mask = CLOCKSOURCE_MASK(64), -}; - -static struct clocksource __initdata * initial_clocksource; - -/* - * clocksource_vmware_register - * - * Time to register real clocksource. It will be activated in - * clocksource_done_booting(). - */ -static int __init clocksource_vmware_register(void) -{ - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { - clocksource_register_khz(&clocksource_vmware, vtsc_khz); - clocksource_unregister(&clocksource_vmware_periodic); - } - return 0; -} -subsys_initcall(clocksource_vmware_register); - /* We want to use clocksource_vmware from the beginning to avoid drifting in monotonic clock */ struct clocksource * __init clocksource_default_clock(void) { - return initial_clocksource; + return &clocksource_vmware; } #endif @@ -197,21 +163,17 @@ static void __init vmware_platform_setup(void) printk(KERN_INFO "Pre Kernel boot time: %dms\n", (unsigned int) (tsc_at_head / vtsc_khz)); +#ifdef CONFIG_X86_LOCAL_APIC + /* Skip lapic calibration since we know bus frequency. 
*/ + lapic_timer_frequency = ecx; +#endif vmware_cyc2ns.cyc2ns_mul = DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, vtsc_khz); vmware_cyc2ns.cyc2ns_shift = CYC2NS_SCALE_FACTOR; vmware_cyc2ns.cyc2ns_offset = tsc_at_head; -#ifdef CONFIG_VMWARE_ONLY - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) - initial_clocksource = &clocksource_vmware_periodic; - else - initial_clocksource = &clocksource_vmware; - clocksource_register_khz(initial_clocksource, vtsc_khz); -#else clocksource_register_khz(&clocksource_vmware, vtsc_khz); -#endif paravirt_ops_setup(); } -- 1.9.1 From 23055114ca27a04044ebbe38853834e0aa869da0 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Tue, 15 Dec 2015 21:31:18 +0000 Subject: [PATCH 07/12] Skip rdrand reseed --- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/cpu/rdrand.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb9e5df..5327c74 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -943,7 +943,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); +#ifndef CONFIG_VMWARE_ONLY x86_init_rdrand(c); +#endif x86_init_cache_qos(c); /* diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 136ac74..0685891 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -32,6 +32,7 @@ static int __init x86_rdrand_setup(char *s) } __setup("nordrand", x86_rdrand_setup); +#ifndef CONFIG_VMWARE_ONLY /* * Force a reseed cycle; we are architecturally guaranteed a reseed * after no more than 512 128-bit chunks of random data. This also @@ -58,3 +59,4 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_RDRAND); #endif } +#endif -- 1.9.1 From bd806a16d202bf9dc41fbe3f8e39545e704adf9e Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Thu, 3 Dec 2015 00:46:46 +0000 Subject: [PATCH 08/12] STA implementation. 
first version. --- arch/x86/kernel/cpu/vmware.c | 163 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b16618b..cf1fb64 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -28,6 +28,8 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <linux/sched.h> +#include <linux/cpu.h> +#include <asm/pci_x86.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -38,6 +40,10 @@ #define VMWARE_PORT_CMD_GETVCPU_INFO 68 #define VMWARE_PORT_CMD_LEGACY_X2APIC 3 #define VMWARE_PORT_CMD_VCPU_RESERVED 31 +#define VMWARE_PORT_CMD_STEALCLOCK 91 +# define CMD_STEALCLOCK_ENABLE 0 +# define CMD_STEALCLOCK_DISABLE 1 + #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx)" : \ @@ -47,6 +53,34 @@ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); +struct vmware_steal_time { + uint64_t clock; /* stolen time counter in units of vtsc */ + uint64_t reserved; +}; +static DEFINE_PER_CPU(struct vmware_steal_time, steal_time) __aligned(64); +static int has_steal_clock = 0; + +static int vmware_cmd_stealclock(int subcmd, uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + __asm__ __volatile__ ("inl (%%dx)" + : "=a" (result), + "=c" (info) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "c" (VMWARE_PORT_CMD_STEALCLOCK), + "d" (VMWARE_HYPERVISOR_PORT), + "b" (subcmd), + "S" (arg1), + "D" (arg2)); + return result; +} +#define STEALCLOCK_ENABLE(pa) \ + vmware_cmd_stealclock(CMD_STEALCLOCK_ENABLE, \ + (pa) >> 32, (pa) & 0xffffffff) + +#define STEALCLOCK_DISABLE() \ + vmware_cmd_stealclock(CMD_STEALCLOCK_DISABLE, 0, 0) + static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; @@ -134,6 +168,114 @@ static void vmware_read_boot_clock64(struct timespec64 *ts) } } +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal; + + steal = &per_cpu(steal_time, cpu); + return 
mul_u64_u32_shr(steal->clock, vmware_cyc2ns.cyc2ns_mul, + CYC2NS_SCALE_FACTOR); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + memset(st, 0, sizeof(*st)); + + if (STEALCLOCK_ENABLE(slow_virt_to_phys(st)) != 0) { + has_steal_clock = 0; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); +} + +void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + STEALCLOCK_DISABLE(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static void vmware_guest_cpu_online(void *dummy) +{ + vmware_guest_cpu_init(); +} + +static void vmware_guest_cpu_offline(void *dummy) +{ + vmware_disable_steal_time(); +} + +static int vmware_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, vmware_guest_cpu_online, + NULL, 0); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + smp_call_function_single(cpu, vmware_guest_cpu_offline, + NULL, 1); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block vmware_cpu_notifier = { + .notifier_call = vmware_cpu_notify, +}; +#endif + +static int sta_enabled = 1; /* steal time accounting */ +static int parse_vmw_no_sta(char *arg) +{ + sta_enabled = 0; + return 0; +} + +early_param("vmw-no-sta", parse_vmw_no_sta); + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(&paravirt_steal_enabled); + if (sta_enabled) + static_key_slow_inc(&paravirt_steal_rq_enabled); + } + + return 0; +} 
+arch_initcall(activate_jump_labels); + + static void __init paravirt_ops_setup(void) { pv_info.name = "VMware"; @@ -141,9 +283,18 @@ static void __init paravirt_ops_setup(void) pv_time_ops.sched_clock = vmware_sched_clock; pv_time_ops.read_boot_clock64 = vmware_read_boot_clock64; + /* + * TODO: check for STEAL_TIME support + */ + if (1) { + has_steal_clock = 1; + pv_time_ops.steal_clock = vmware_steal_clock; + } + #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif + } static void __init vmware_platform_setup(void) @@ -176,6 +327,18 @@ static void __init vmware_platform_setup(void) clocksource_register_khz(&clocksource_vmware, vtsc_khz); paravirt_ops_setup(); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = vmware_smp_prepare_boot_cpu; + register_cpu_notifier(&vmware_cpu_notifier); +#else + vmware_guest_cpu_init(); +#endif + +#ifdef CONFIG_PCI + /* PCI BIOS service won't work from a PV guest. */ + pci_probe &= ~PCI_PROBE_BIOS; +#endif } /* -- 1.9.1 From 8496145f4f5fcd430e5d8f493066a8e54aaaf96b Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Wed, 13 Jan 2016 22:54:04 +0000 Subject: [PATCH 09/12] STA. 
updated version --- arch/x86/kernel/cpu/vmware.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index cf1fb64..196703c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -41,16 +41,23 @@ #define VMWARE_PORT_CMD_LEGACY_X2APIC 3 #define VMWARE_PORT_CMD_VCPU_RESERVED 31 #define VMWARE_PORT_CMD_STEALCLOCK 91 -# define CMD_STEALCLOCK_ENABLE 0 -# define CMD_STEALCLOCK_DISABLE 1 +# define CMD_STEALCLOCK_STATUS 0 +# define STEALCLOCK_IS_NOT_AVALIABLE 0 +# define STEALCLOCK_IS_ENABLED 1 +# define STEALCLOCK_IS_DISABLED 2 +# define CMD_STEALCLOCK_ENABLE 1 +# define CMD_STEALCLOCK_DISABLE 2 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + VMWARE_PORT2(cmd, eax, ebx, ecx, edx, UINT_MAX) + +#define VMWARE_PORT2(cmd, eax, ebx, ecx, edx, arg) \ __asm__("inl (%%dx)" : \ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ "0"(VMWARE_HYPERVISOR_MAGIC), \ "1"(VMWARE_PORT_CMD_##cmd), \ - "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(arg) : \ "memory"); struct vmware_steal_time { @@ -60,6 +67,13 @@ struct vmware_steal_time { static DEFINE_PER_CPU(struct vmware_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; +static int vmware_is_stealclock_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT2(STEALCLOCK, eax, ebx, ecx, edx, CMD_STEALCLOCK_STATUS); + printk("%s:%d %d %d\n", __FUNCTION__, __LINE__, eax, ebx); + return eax == 0 && ebx != STEALCLOCK_IS_NOT_AVALIABLE; +} static int vmware_cmd_stealclock(int subcmd, uint32_t arg1, uint32_t arg2) { uint32_t result, info; @@ -283,10 +297,7 @@ static void __init paravirt_ops_setup(void) pv_time_ops.sched_clock = vmware_sched_clock; pv_time_ops.read_boot_clock64 = vmware_read_boot_clock64; - /* - * TODO: check for STEAL_TIME support - */ - if (1) { + if (vmware_is_stealclock_available()) { has_steal_clock = 1; pv_time_ops.steal_clock =
vmware_steal_clock; } @@ -328,12 +339,15 @@ static void __init vmware_platform_setup(void) paravirt_ops_setup(); + /* vmware_cpu_notifier is used only by STA */ + if (has_steal_clock) { #ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = vmware_smp_prepare_boot_cpu; - register_cpu_notifier(&vmware_cpu_notifier); + smp_ops.smp_prepare_boot_cpu = vmware_smp_prepare_boot_cpu; + register_cpu_notifier(&vmware_cpu_notifier); #else - vmware_guest_cpu_init(); + vmware_guest_cpu_init(); #endif + } #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ -- 1.9.1 From a8f165433de453994aef84a9d6bf1704a18b3a95 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Tue, 15 Mar 2016 22:29:23 +0000 Subject: [PATCH 10/12] STA: version with a single backdoor command. --- arch/x86/kernel/cpu/vmware.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 196703c..743b8ad 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -41,12 +41,9 @@ #define VMWARE_PORT_CMD_LEGACY_X2APIC 3 #define VMWARE_PORT_CMD_VCPU_RESERVED 31 #define VMWARE_PORT_CMD_STEALCLOCK 91 -# define CMD_STEALCLOCK_STATUS 0 -# define STEALCLOCK_IS_NOT_AVALIABLE 0 -# define STEALCLOCK_IS_ENABLED 1 -# define STEALCLOCK_IS_DISABLED 2 -# define CMD_STEALCLOCK_ENABLE 1 -# define CMD_STEALCLOCK_DISABLE 2 +# define STEALCLOCK_IS_NOT_AVALIABLE -1 +# define STEALCLOCK_IS_DISABLED 0 +# define STEALCLOCK_IS_ENABLED 1 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ @@ -62,19 +59,12 @@ struct vmware_steal_time { uint64_t clock; /* stolen time counter in units of vtsc */ - uint64_t reserved; + uint64_t reserved[7]; }; static DEFINE_PER_CPU(struct vmware_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; -static int vmware_is_stealclock_available(void) -{ - uint32_t eax, ebx, ecx, edx; - VMWARE_PORT2(STEALCLOCK, eax, ebx, ecx, edx,
CMD_STEALCLOCK_STATUS); - printk("%s:%d %d %d\n", __FUNCTION__, __LINE__, eax, ebx); - return eax == 0 && ebx != STEALCLOCK_IS_NOT_AVALIABLE; -} -static int vmware_cmd_stealclock(int subcmd, uint32_t arg1, uint32_t arg2) +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) { uint32_t result, info; __asm__ __volatile__ ("inl (%%dx)" @@ -83,17 +73,22 @@ static int vmware_cmd_stealclock(int subcmd, uint32_t arg1, uint32_t arg2) : "a" (VMWARE_HYPERVISOR_MAGIC), "c" (VMWARE_PORT_CMD_STEALCLOCK), "d" (VMWARE_HYPERVISOR_PORT), - "b" (subcmd), + "b" (0), "S" (arg1), "D" (arg2)); return result; } #define STEALCLOCK_ENABLE(pa) \ - vmware_cmd_stealclock(CMD_STEALCLOCK_ENABLE, \ - (pa) >> 32, (pa) & 0xffffffff) + (vmware_cmd_stealclock((pa) >> 32, (pa) & 0xffffffff) \ + == STEALCLOCK_IS_ENABLED) #define STEALCLOCK_DISABLE() \ - vmware_cmd_stealclock(CMD_STEALCLOCK_DISABLE, 0, 0) + vmware_cmd_stealclock(0, 1) + +static int vmware_is_stealclock_available(void) +{ + return STEALCLOCK_DISABLE() != STEALCLOCK_IS_NOT_AVALIABLE; +} static inline int __vmware_platform(void) { @@ -201,7 +196,7 @@ static void vmware_register_steal_time(void) memset(st, 0, sizeof(*st)); - if (STEALCLOCK_ENABLE(slow_virt_to_phys(st)) != 0) { + if (!STEALCLOCK_ENABLE(slow_virt_to_phys(st))) { has_steal_clock = 0; return; } -- 1.9.1 From 27b9b08cf68f55fbfa297eb047f7d1309e0a60cf Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Fri, 25 Mar 2016 01:14:17 +0000 Subject: [PATCH 11/12] Remove delays for smpboot --- arch/x86/kernel/smpboot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b1f3ed9..8f0be52 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -560,7 +560,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI.
*/ - udelay(200); +// udelay(200); if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ @@ -665,7 +665,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. */ - udelay(300); +// udelay(300); pr_debug("Startup point 1\n"); @@ -675,7 +675,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. */ - udelay(200); +// udelay(200); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); -- 1.9.1 From 3fe2ad9c5031e059849ba0970ccee95ce07f8239 Mon Sep 17 00:00:00 2001 From: Alexey Makhalov <amakhalov@vmware.com> Date: Tue, 29 Mar 2016 21:14:46 +0000 Subject: [PATCH 12/12] kmsg_dumper to vmware.log --- arch/x86/kernel/cpu/vmware.c | 159 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 743b8ad..e9f7d52 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,10 +30,12 @@ #include <linux/sched.h> #include <linux/cpu.h> #include <asm/pci_x86.h> +#include <linux/kmsg_dump.h> -#define CPUID_VMWARE_INFO_LEAF 0x40000000 -#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 -#define VMWARE_HYPERVISOR_PORT 0x5658 +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 +#define VMWARE_HYPERVISOR_HB_PORT 0x5659 #define VMWARE_PORT_CMD_GETVERSION 10 #define VMWARE_PORT_CMD_GETHZ 45 @@ -44,7 +46,8 @@ # define STEALCLOCK_IS_NOT_AVALIABLE -1 # define STEALCLOCK_IS_DISABLED 0 # define STEALCLOCK_IS_ENABLED 1 - +#define VMWARE_PORT_CMD_MESSAGE 30 +#define VMWARE_HB_PORT_CMD_MESSAGE 0 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ VMWARE_PORT2(cmd, eax, ebx, ecx, edx, UINT_MAX) @@ -303,6 +306,13 @@ static void __init
paravirt_ops_setup(void) } +static void kmsg_dumper_vmware_log(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason); + +static struct kmsg_dumper kmsg_dumper = { + .dump = kmsg_dumper_vmware_log +}; + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -348,6 +358,7 @@ static void __init vmware_platform_setup(void) /* PCI BIOS service won't work from a PV guest. */ pci_probe &= ~PCI_PROBE_BIOS; #endif + kmsg_dump_register(&kmsg_dumper); } /* @@ -410,3 +421,143 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = { .x2apic_available = vmware_legacy_x2apic_available, }; EXPORT_SYMBOL(x86_hyper_vmware); + +#define MESSAGE_STATUS_SUCCESS (0x01 << 16) +#define MESSAGE_STATUS_CPT (0x10 << 16) +#define MESSAGE_STATUS_HB (0x80 << 16) + +#define RPCI_PROTOCOL_NUM 0x49435052 /* 'RPCI' */ +#define GUESTMSG_FLAG_COOKIE 0x80000000 + +#define MESSAGE_TYPE_OPEN (0 << 16) +#define MESSAGE_TYPE_SENDSIZE (1 << 16) +#define MESSAGE_TYPE_CLOSE (6 << 16) + +typedef struct { + uint32_t id; + uint32_t cookieHigh; + uint32_t cookieLow; +} vmw_msg; + +static int +vmware_log_open(vmw_msg *msg) { + uint32_t result, info, dx, si, di; + __asm__ __volatile__ ("inl (%%dx)" + : "=a" (result), + "=c" (info), + "=d" (dx), + "=S" (si), + "=D" (di) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "c" (VMWARE_PORT_CMD_MESSAGE | MESSAGE_TYPE_OPEN), + "d" (VMWARE_HYPERVISOR_PORT), + "b" (RPCI_PROTOCOL_NUM | GUESTMSG_FLAG_COOKIE)); + + if ((info & MESSAGE_STATUS_SUCCESS) == 0) + return 1; + + msg->id = dx & 0xffff0000; + msg->cookieHigh = si; + msg->cookieLow = di; + return 0; +} + +static int +vmware_log_close(vmw_msg *msg) { + uint32_t result, info; + __asm__ __volatile__ ("inl (%%dx)" + : "=a" (result), + "=c" (info) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "c" (VMWARE_PORT_CMD_MESSAGE | MESSAGE_TYPE_CLOSE), + "d" (VMWARE_HYPERVISOR_PORT | msg->id), + "b" (0), + "S" (msg->cookieHigh), + "D" (msg->cookieLow)); + + if ((info & MESSAGE_STATUS_SUCCESS) == 0) + return 1; +
return 0; +} + +static int +vmware_log_send(vmw_msg *msg, const char *string) { + uint32_t result, info, cx; + unsigned long si; + uint32_t len = strlen(string); + +retry: + __asm__ __volatile__ ("inl (%%dx)" + : "=a" (result), + "=c" (info) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "c" (VMWARE_PORT_CMD_MESSAGE | MESSAGE_TYPE_SENDSIZE), + "d" (VMWARE_HYPERVISOR_PORT | msg->id), + "b" (len), + "S" (msg->cookieHigh), + "D" (msg->cookieLow)); + + if ((info & MESSAGE_STATUS_SUCCESS) == 0 || + (info & MESSAGE_STATUS_HB) == 0) + /* Expected success + high-bandwidth. Give up. */ + return 1; + + /* + * "rep outsb" reads the string from memory, counts %rcx down and + * advances %rsi. Declare both registers as (discarded) outputs and + * clobber memory so the compiler does not assume they still hold + * len and string when control jumps back to retry. + */ + __asm__ __volatile__ ("pushq %%rbp\n\t" + "movl %[rbp], %%ebp\n\t" + "cld\n\t" + "rep; outsb\n\t" + "popq %%rbp\n\t" + : "=a" (result), + "=b" (info), + "=c" (cx), + "=S" (si) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "c" (len), + "d" (VMWARE_HYPERVISOR_HB_PORT | msg->id), + "b" (VMWARE_HB_PORT_CMD_MESSAGE | MESSAGE_STATUS_SUCCESS), + "S" (string), + [rbp] "r" (msg->cookieHigh), + "D" (msg->cookieLow) + : "memory"); + + if ((info & MESSAGE_STATUS_SUCCESS) == 0) { + if (info & MESSAGE_STATUS_CPT) + /* A checkpoint occurred. Retry. */ + goto retry; + return 1; + } + return 0; +} + +/* + * kmsg dumper callback: forward each line returned by kmsg_dump_get_line() + * to the hypervisor as an RPCI "log <text>" message, using one + * open/send/close cycle per line and stopping at the first failure. + */ +static void kmsg_dumper_vmware_log(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + vmw_msg msg; + static char line[1024]; + size_t len = 0; + + line[0] = 'l'; + line[1] = 'o'; + line[2] = 'g'; + line[3] = ' '; + + /* Keep 4 bytes for the "log " prefix and 1 for the terminating NUL. */ + while (kmsg_dump_get_line(dumper, true, line + 4, sizeof(line) - 5, &len)) { + line[len + 4] = '\0'; + if (vmware_log_open(&msg) || + vmware_log_send(&msg, line) || + vmware_log_close(&msg)) + break; + } +} -- 1.9.1