Browse code

linux-esx: Fix crash on cpu hot-add

When running the linux-esx kernel, performing a CPU hot-add from ESXi
causes the Photon VM to crash and reboot immediately. This is caused
by issues in the boot-clock patch.

At the time of CPU startup, we want to read a TSC value and store it
in the variable named 'tsc_at_head'. This operation is supposed to be
performed only when the first CPU starts (the boot CPU), i.e., inside
startup_64(). However, the current code puts this logic inside
secondary_startup_64(), which is the initialization logic for all the
other CPUs (non-boot or secondary CPUs). This makes the logic
incorrect, but that doesn't yet explain why the kernel crashes on CPU
hot-add.

The global variable tsc_at_head is marked as __initdata, which means
that its page will be set to read-only permission after kernel
initialization is complete. So, the first round of writes to
tsc_at_head will succeed from all the CPUs (during booting), but any
subsquent write (such as during CPU hot-add) will end up writing to
read-only kernel memory, resulting in a crash.

Fix this by moving the read-tsc logic to the beginning of the boot CPU
init sequence (startup_64()).

Change-Id: Ie4aaef2439187470ac90a3515d0af2952125a8c7
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/6465
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Alexey Makhalov <amakhalov@vmware.com>

Srivatsa S. Bhat (VMware) authored on 2019/01/09 05:52:18
Showing 2 changed files
... ...
@@ -5,15 +5,15 @@ Subject: [PATCH] x86/vmware: pv-ops boot_clock
5 5
 
6 6
 ---
7 7
  arch/x86/include/asm/paravirt.h       |  5 +++++
8
- arch/x86/include/asm/paravirt_types.h |  5 +++++
8
+ arch/x86/include/asm/paravirt_types.h |  2 ++
9 9
  arch/x86/kernel/cpu/vmware.c          | 24 +++++++++++++++++++++++-
10 10
  arch/x86/kernel/head_64.S             |  8 ++++++++
11 11
  arch/x86/kernel/paravirt.c            |  7 +++++++
12
- arch/x86/kernel/setup.c               |  9 +++++++++
13
- 6 files changed, 57 insertions(+), 1 deletion(-)
12
+ arch/x86/kernel/setup.c               | 10 ++++++++++
13
+ 6 files changed, 55 insertions(+), 1 deletion(-)
14 14
 
15 15
 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
16
-index d49bbf4..e0e5b8e 100644
16
+index e375d4266..e5c64e1 100644
17 17
 --- a/arch/x86/include/asm/paravirt.h
18 18
 +++ b/arch/x86/include/asm/paravirt.h
19 19
 @@ -184,6 +184,11 @@ static inline u64 paravirt_steal_clock(int cpu)
... ...
@@ -29,10 +29,10 @@ index d49bbf4..e0e5b8e 100644
29 29
  {
30 30
  	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
31 31
 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
32
-index 180bc0b..b6280a9 100644
32
+index 4b75acc..156cfa3 100644
33 33
 --- a/arch/x86/include/asm/paravirt_types.h
34 34
 +++ b/arch/x86/include/asm/paravirt_types.h
35
-@@ -53,7 +53,8 @@ struct mm_struct;
35
+@@ -53,6 +53,7 @@ struct mm_struct;
36 36
  struct desc_struct;
37 37
  struct task_struct;
38 38
  struct cpumask;
... ...
@@ -40,8 +40,7 @@ index 180bc0b..b6280a9 100644
40 40
  struct flush_tlb_info;
41 41
  struct mmu_gather;
42 42
  
43
- /*
44
-@@ -99,6 +100,7 @@ struct pv_lazy_ops {
43
+@@ -100,6 +101,7 @@ struct pv_lazy_ops {
45 44
  struct pv_time_ops {
46 45
  	unsigned long long (*sched_clock)(void);
47 46
  	unsigned long long (*steal_clock)(int cpu);
... ...
@@ -50,7 +49,7 @@ index 180bc0b..b6280a9 100644
50 50
  
51 51
  struct pv_cpu_ops {
52 52
 diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
53
-index 20509ee..e299357 100644
53
+index d4a8c9a..45c78ec 100644
54 54
 --- a/arch/x86/kernel/cpu/vmware.c
55 55
 +++ b/arch/x86/kernel/cpu/vmware.c
56 56
 @@ -101,6 +101,7 @@ static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
... ...
@@ -88,7 +87,7 @@ index 20509ee..e299357 100644
88 88
  static __init int setup_vmw_sched_clock(char *s)
89 89
  {
90 90
  	vmw_sched_clock = 0;
91
-@@ -143,7 +164,7 @@ static unsigned long long vmware_sched_clock(void)
91
+@@ -143,7 +164,7 @@ static unsigned long long notrace vmware_sched_clock(void)
92 92
  static void __init vmware_cyc2ns_setup(void)
93 93
  {
94 94
  	struct cyc2ns_data *d = &vmware_cyc2ns;
... ...
@@ -106,11 +105,11 @@ index 20509ee..e299357 100644
106 106
  #else
107 107
  #define vmware_paravirt_ops_setup() do {} while (0)
108 108
 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
109
-index 8344dd2..60ad9ac 100644
109
+index a3618cf..93b5e83 100644
110 110
 --- a/arch/x86/kernel/head_64.S
111 111
 +++ b/arch/x86/kernel/head_64.S
112
-@@ -107,6 +107,14 @@ ENTRY(secondary_startup_64)
113
- 	 * after the boot processor executes this code.
112
+@@ -72,6 +72,14 @@ startup_64:
113
+ 	 * tables and then reload them.
114 114
  	 */
115 115
  
116 116
 +	/*
... ...
@@ -121,14 +120,14 @@ index 8344dd2..60ad9ac 100644
121 121
 +	or	%rax, %rdx
122 122
 +	mov	%rdx, tsc_at_head(%rip)
123 123
 +
124
- 	/* Sanitize CPU configuration */
125
- 	call verify_cpu
124
+ 	/* Set up the stack for verify_cpu(), similar to initial_stack below */
125
+ 	leaq	(__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
126 126
  
127 127
 diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
128
-index 930c883..f6b3799 100644
128
+index 8dc69d8..3cf11e7 100644
129 129
 --- a/arch/x86/kernel/paravirt.c
130 130
 +++ b/arch/x86/kernel/paravirt.c
131
-@@ -219,6 +219,12 @@ static u64 native_steal_clock(int cpu)
131
+@@ -220,6 +220,12 @@ static u64 native_steal_clock(int cpu)
132 132
  	return 0;
133 133
  }
134 134
  
... ...
@@ -141,7 +140,7 @@ index 930c883..f6b3799 100644
141 141
  /* These are in entry.S */
142 142
  extern void native_iret(void);
143 143
  extern void native_usergs_sysret64(void);
144
-@@ -326,6 +332,7 @@ struct pv_init_ops pv_init_ops = {
144
+@@ -327,6 +333,7 @@ struct pv_init_ops pv_init_ops = {
145 145
  struct pv_time_ops pv_time_ops = {
146 146
  	.sched_clock = native_sched_clock,
147 147
  	.steal_clock = native_steal_clock,
... ...
@@ -150,10 +149,10 @@ index 930c883..f6b3799 100644
150 150
  
151 151
  __visible struct pv_irq_ops pv_irq_ops = {
152 152
 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
153
-index 74b4472..3a6ab94 100644
153
+index b4866ba..ecec1e1 100644
154 154
 --- a/arch/x86/kernel/setup.c
155 155
 +++ b/arch/x86/kernel/setup.c
156
-@@ -1318,3 +1318,13 @@ static int __init register_kernel_offset_dumper(void)
156
+@@ -1309,3 +1309,13 @@ static int __init register_kernel_offset_dumper(void)
157 157
  	return 0;
158 158
  }
159 159
  __initcall(register_kernel_offset_dumper);
... ...
@@ -2,7 +2,7 @@
2 2
 Summary:        Kernel
3 3
 Name:           linux-esx
4 4
 Version:        4.19.6
5
-Release:        2%{?dist}
5
+Release:        3%{?dist}
6 6
 License:        GPLv2
7 7
 URL:            http://www.kernel.org/
8 8
 Group:          System Environment/Kernel
... ...
@@ -193,6 +193,8 @@ ln -sf linux-%{uname_r}.cfg /boot/photon.cfg
193 193
 /usr/src/linux-headers-%{uname_r}
194 194
 
195 195
 %changelog
196
+*   Tue Jan 08 2019 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.19.6-3
197
+-   Fix crash on cpu hot-add.
196 198
 *   Fri Jan 04 2019 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.19.6-2
197 199
 -   Add out-of-tree patches from AppArmor and enable it by default.
198 200
 *   Mon Dec 10 2018 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.19.6-1