When running the linux-esx kernel, performing a CPU hot-add from ESXi
causes the Photon VM to crash and reboot immediately. This is caused
by issues in the boot-clock patch.
At the time of CPU startup, we want to read a TSC value and store it
in the variable named 'tsc_at_head'. This operation is supposed to be
performed only when the first CPU starts (the boot CPU), i.e., inside
startup_64(). However, the current code puts this logic inside
secondary_startup_64(), which is the initialization logic for all the
other CPUs (non-boot or secondary CPUs). This makes the logic
incorrect, but that doesn't yet explain why the kernel crashes on CPU
hot-add.
The global variable tsc_at_head is marked as __initdata, which means
that its page will be set to read-only permission after kernel
initialization is complete. So, the first round of writes to
tsc_at_head will succeed from all the CPUs (during booting), but any
subsquent write (such as during CPU hot-add) will end up writing to
read-only kernel memory, resulting in a crash.
Fix this by moving the read-tsc logic to the beginning of the boot CPU
init sequence (startup_64()).
Change-Id: Ie4aaef2439187470ac90a3515d0af2952125a8c7
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/6465
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Alexey Makhalov <amakhalov@vmware.com>
| ... | ... |
@@ -5,15 +5,15 @@ Subject: [PATCH] x86/vmware: pv-ops boot_clock |
| 5 | 5 |
|
| 6 | 6 |
--- |
| 7 | 7 |
arch/x86/include/asm/paravirt.h | 5 +++++ |
| 8 |
- arch/x86/include/asm/paravirt_types.h | 5 +++++ |
|
| 8 |
+ arch/x86/include/asm/paravirt_types.h | 2 ++ |
|
| 9 | 9 |
arch/x86/kernel/cpu/vmware.c | 24 +++++++++++++++++++++++- |
| 10 | 10 |
arch/x86/kernel/head_64.S | 8 ++++++++ |
| 11 | 11 |
arch/x86/kernel/paravirt.c | 7 +++++++ |
| 12 |
- arch/x86/kernel/setup.c | 9 +++++++++ |
|
| 13 |
- 6 files changed, 57 insertions(+), 1 deletion(-) |
|
| 12 |
+ arch/x86/kernel/setup.c | 10 ++++++++++ |
|
| 13 |
+ 6 files changed, 55 insertions(+), 1 deletion(-) |
|
| 14 | 14 |
|
| 15 | 15 |
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h |
| 16 |
-index d49bbf4..e0e5b8e 100644 |
|
| 16 |
+index e375d4266..e5c64e1 100644 |
|
| 17 | 17 |
--- a/arch/x86/include/asm/paravirt.h |
| 18 | 18 |
+++ b/arch/x86/include/asm/paravirt.h |
| 19 | 19 |
@@ -184,6 +184,11 @@ static inline u64 paravirt_steal_clock(int cpu) |
| ... | ... |
@@ -29,10 +29,10 @@ index d49bbf4..e0e5b8e 100644 |
| 29 | 29 |
{
|
| 30 | 30 |
return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); |
| 31 | 31 |
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h |
| 32 |
-index 180bc0b..b6280a9 100644 |
|
| 32 |
+index 4b75acc..156cfa3 100644 |
|
| 33 | 33 |
--- a/arch/x86/include/asm/paravirt_types.h |
| 34 | 34 |
+++ b/arch/x86/include/asm/paravirt_types.h |
| 35 |
-@@ -53,7 +53,8 @@ struct mm_struct; |
|
| 35 |
+@@ -53,6 +53,7 @@ struct mm_struct; |
|
| 36 | 36 |
struct desc_struct; |
| 37 | 37 |
struct task_struct; |
| 38 | 38 |
struct cpumask; |
| ... | ... |
@@ -40,8 +40,7 @@ index 180bc0b..b6280a9 100644 |
| 40 | 40 |
struct flush_tlb_info; |
| 41 | 41 |
struct mmu_gather; |
| 42 | 42 |
|
| 43 |
- /* |
|
| 44 |
-@@ -99,6 +100,7 @@ struct pv_lazy_ops {
|
|
| 43 |
+@@ -100,6 +101,7 @@ struct pv_lazy_ops {
|
|
| 45 | 44 |
struct pv_time_ops {
|
| 46 | 45 |
unsigned long long (*sched_clock)(void); |
| 47 | 46 |
unsigned long long (*steal_clock)(int cpu); |
| ... | ... |
@@ -50,7 +49,7 @@ index 180bc0b..b6280a9 100644 |
| 50 | 50 |
|
| 51 | 51 |
struct pv_cpu_ops {
|
| 52 | 52 |
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c |
| 53 |
-index 20509ee..e299357 100644 |
|
| 53 |
+index d4a8c9a..45c78ec 100644 |
|
| 54 | 54 |
--- a/arch/x86/kernel/cpu/vmware.c |
| 55 | 55 |
+++ b/arch/x86/kernel/cpu/vmware.c |
| 56 | 56 |
@@ -101,6 +101,7 @@ static struct cyc2ns_data vmware_cyc2ns __ro_after_init; |
| ... | ... |
@@ -88,7 +87,7 @@ index 20509ee..e299357 100644 |
| 88 | 88 |
static __init int setup_vmw_sched_clock(char *s) |
| 89 | 89 |
{
|
| 90 | 90 |
vmw_sched_clock = 0; |
| 91 |
-@@ -143,7 +164,7 @@ static unsigned long long vmware_sched_clock(void) |
|
| 91 |
+@@ -143,7 +164,7 @@ static unsigned long long notrace vmware_sched_clock(void) |
|
| 92 | 92 |
static void __init vmware_cyc2ns_setup(void) |
| 93 | 93 |
{
|
| 94 | 94 |
struct cyc2ns_data *d = &vmware_cyc2ns; |
| ... | ... |
@@ -106,11 +105,11 @@ index 20509ee..e299357 100644 |
| 106 | 106 |
#else |
| 107 | 107 |
#define vmware_paravirt_ops_setup() do {} while (0)
|
| 108 | 108 |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S |
| 109 |
-index 8344dd2..60ad9ac 100644 |
|
| 109 |
+index a3618cf..93b5e83 100644 |
|
| 110 | 110 |
--- a/arch/x86/kernel/head_64.S |
| 111 | 111 |
+++ b/arch/x86/kernel/head_64.S |
| 112 |
-@@ -107,6 +107,14 @@ ENTRY(secondary_startup_64) |
|
| 113 |
- * after the boot processor executes this code. |
|
| 112 |
+@@ -72,6 +72,14 @@ startup_64: |
|
| 113 |
+ * tables and then reload them. |
|
| 114 | 114 |
*/ |
| 115 | 115 |
|
| 116 | 116 |
+ /* |
| ... | ... |
@@ -121,14 +120,14 @@ index 8344dd2..60ad9ac 100644 |
| 121 | 121 |
+ or %rax, %rdx |
| 122 | 122 |
+ mov %rdx, tsc_at_head(%rip) |
| 123 | 123 |
+ |
| 124 |
- /* Sanitize CPU configuration */ |
|
| 125 |
- call verify_cpu |
|
| 124 |
+ /* Set up the stack for verify_cpu(), similar to initial_stack below */ |
|
| 125 |
+ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp |
|
| 126 | 126 |
|
| 127 | 127 |
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c |
| 128 |
-index 930c883..f6b3799 100644 |
|
| 128 |
+index 8dc69d8..3cf11e7 100644 |
|
| 129 | 129 |
--- a/arch/x86/kernel/paravirt.c |
| 130 | 130 |
+++ b/arch/x86/kernel/paravirt.c |
| 131 |
-@@ -219,6 +219,12 @@ static u64 native_steal_clock(int cpu) |
|
| 131 |
+@@ -220,6 +220,12 @@ static u64 native_steal_clock(int cpu) |
|
| 132 | 132 |
return 0; |
| 133 | 133 |
} |
| 134 | 134 |
|
| ... | ... |
@@ -141,7 +140,7 @@ index 930c883..f6b3799 100644 |
| 141 | 141 |
/* These are in entry.S */ |
| 142 | 142 |
extern void native_iret(void); |
| 143 | 143 |
extern void native_usergs_sysret64(void); |
| 144 |
-@@ -326,6 +332,7 @@ struct pv_init_ops pv_init_ops = {
|
|
| 144 |
+@@ -327,6 +333,7 @@ struct pv_init_ops pv_init_ops = {
|
|
| 145 | 145 |
struct pv_time_ops pv_time_ops = {
|
| 146 | 146 |
.sched_clock = native_sched_clock, |
| 147 | 147 |
.steal_clock = native_steal_clock, |
| ... | ... |
@@ -150,10 +149,10 @@ index 930c883..f6b3799 100644 |
| 150 | 150 |
|
| 151 | 151 |
__visible struct pv_irq_ops pv_irq_ops = {
|
| 152 | 152 |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c |
| 153 |
-index 74b4472..3a6ab94 100644 |
|
| 153 |
+index b4866ba..ecec1e1 100644 |
|
| 154 | 154 |
--- a/arch/x86/kernel/setup.c |
| 155 | 155 |
+++ b/arch/x86/kernel/setup.c |
| 156 |
-@@ -1318,3 +1318,13 @@ static int __init register_kernel_offset_dumper(void) |
|
| 156 |
+@@ -1309,3 +1309,13 @@ static int __init register_kernel_offset_dumper(void) |
|
| 157 | 157 |
return 0; |
| 158 | 158 |
} |
| 159 | 159 |
__initcall(register_kernel_offset_dumper); |
| ... | ... |
@@ -2,7 +2,7 @@ |
| 2 | 2 |
Summary: Kernel |
| 3 | 3 |
Name: linux-esx |
| 4 | 4 |
Version: 4.19.6 |
| 5 |
-Release: 2%{?dist}
|
|
| 5 |
+Release: 3%{?dist}
|
|
| 6 | 6 |
License: GPLv2 |
| 7 | 7 |
URL: http://www.kernel.org/ |
| 8 | 8 |
Group: System Environment/Kernel |
| ... | ... |
@@ -193,6 +193,8 @@ ln -sf linux-%{uname_r}.cfg /boot/photon.cfg
|
| 193 | 193 |
/usr/src/linux-headers-%{uname_r}
|
| 194 | 194 |
|
| 195 | 195 |
%changelog |
| 196 |
+* Tue Jan 08 2019 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.19.6-3 |
|
| 197 |
+- Fix crash on cpu hot-add. |
|
| 196 | 198 |
* Fri Jan 04 2019 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.19.6-2 |
| 197 | 199 |
- Add out-of-tree patches from AppArmor and enable it by default. |
| 198 | 200 |
* Mon Dec 10 2018 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.19.6-1 |