Commit 3e1a878b7ccdb31da6d9d2b855c72ad87afeba3f

Authored by Igor Mammedov
Committed by Ingo Molnar
1 parent feef1e8ecb

x86/smpboot: Initialize secondary CPU only if master CPU will wait for it

Hang is observed on virtual machines during CPU hotplug,
especially in big guests with many CPUs. (It reproducible
more often if host is over-committed).

It happens because master CPU gives up waiting on
secondary CPU and allows it to run wild. As result
AP causes locking or crashing system. For example
as described here:

   https://lkml.org/lkml/2014/3/6/257

If master CPU have sent STARTUP IPI successfully,
and AP signalled to master CPU that it's ready
to start initialization, make master CPU wait
indefinitely till AP is onlined.
To ensure that AP won't ever run wild, make it
wait at early startup till master CPU confirms its
intention to wait for AP. If AP doesn't respond in 10
seconds, the master CPU will timeout and cancel
AP onlining.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1401975765-22328-4-git-send-email-imammedo@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Showing 2 changed files with 47 additions and 79 deletions Side-by-side Diff

arch/x86/kernel/cpu/common.c
... ... @@ -1221,6 +1221,17 @@
1221 1221 #define dbg_restore_debug_regs()
1222 1222 #endif /* ! CONFIG_KGDB */
1223 1223  
  1224 +static void wait_for_master_cpu(int cpu)
  1225 +{
  1226 + /*
  1227 + * wait for ACK from master CPU before continuing
  1228 + * with AP initialization
  1229 + */
  1230 + WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
  1231 + while (!cpumask_test_cpu(cpu, cpu_callout_mask))
  1232 + cpu_relax();
  1233 +}
  1234 +
1224 1235 /*
1225 1236 * cpu_init() initializes state that is per-CPU. Some data is already
1226 1237 * initialized (naturally) in the bootstrap process, such as the GDT
1227 1238  
1228 1239  
... ... @@ -1236,16 +1247,17 @@
1236 1247 struct task_struct *me;
1237 1248 struct tss_struct *t;
1238 1249 unsigned long v;
1239   - int cpu;
  1250 + int cpu = stack_smp_processor_id();
1240 1251 int i;
1241 1252  
  1253 + wait_for_master_cpu(cpu);
  1254 +
1242 1255 /*
1243 1256 * Load microcode on this cpu if a valid microcode is available.
1244 1257 * This is early microcode loading procedure.
1245 1258 */
1246 1259 load_ucode_ap();
1247 1260  
1248   - cpu = stack_smp_processor_id();
1249 1261 t = &per_cpu(init_tss, cpu);
1250 1262 oist = &per_cpu(orig_ist, cpu);
1251 1263  
... ... @@ -1257,9 +1269,6 @@
1257 1269  
1258 1270 me = current;
1259 1271  
1260   - if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1261   - panic("CPU#%d already initialized!\n", cpu);
1262   -
1263 1272 pr_debug("Initializing CPU#%d\n", cpu);
1264 1273  
1265 1274 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1266 1275  
... ... @@ -1336,13 +1345,9 @@
1336 1345 struct tss_struct *t = &per_cpu(init_tss, cpu);
1337 1346 struct thread_struct *thread = &curr->thread;
1338 1347  
1339   - show_ucode_info_early();
  1348 + wait_for_master_cpu(cpu);
1340 1349  
1341   - if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
1342   - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1343   - for (;;)
1344   - local_irq_enable();
1345   - }
  1350 + show_ucode_info_early();
1346 1351  
1347 1352 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
1348 1353  
arch/x86/kernel/smpboot.c
... ... @@ -111,7 +111,6 @@
111 111 static void smp_callin(void)
112 112 {
113 113 int cpuid, phys_id;
114   - unsigned long timeout;
115 114  
116 115 /*
117 116 * If waken up by an INIT in an 82489DX configuration
118 117  
... ... @@ -130,39 +129,8 @@
130 129 * (This works even if the APIC is not enabled.)
131 130 */
132 131 phys_id = read_apic_id();
133   - if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
134   - panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
135   - phys_id, cpuid);
136   - }
137   - pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
138 132  
139 133 /*
140   - * STARTUP IPIs are fragile beasts as they might sometimes
141   - * trigger some glue motherboard logic. Complete APIC bus
142   - * silence for 1 second, this overestimates the time the
143   - * boot CPU is spending to send the up to 2 STARTUP IPIs
144   - * by a factor of two. This should be enough.
145   - */
146   -
147   - /*
148   - * Waiting 2s total for startup (udelay is not yet working)
149   - */
150   - timeout = jiffies + 2*HZ;
151   - while (time_before(jiffies, timeout)) {
152   - /*
153   - * Has the boot CPU finished it's STARTUP sequence?
154   - */
155   - if (cpumask_test_cpu(cpuid, cpu_callout_mask))
156   - break;
157   - cpu_relax();
158   - }
159   -
160   - if (!time_before(jiffies, timeout)) {
161   - panic("%s: CPU%d started up but did not get a callout!\n",
162   - __func__, cpuid);
163   - }
164   -
165   - /*
166 134 * the boot CPU has finished the init stage and is spinning
167 135 * on callin_map until we finish. We are free to set up this
168 136 * CPU, first the APIC. (this is probably redundant on most
169 137  
... ... @@ -750,8 +718,8 @@
750 718 unsigned long start_ip = real_mode_header->trampoline_start;
751 719  
752 720 unsigned long boot_error = 0;
753   - int timeout;
754 721 int cpu0_nmi_registered = 0;
  722 + unsigned long timeout;
755 723  
756 724 /* Just in case we booted with a single CPU. */
757 725 alternatives_enable_smp();
... ... @@ -799,6 +767,15 @@
799 767 }
800 768  
801 769 /*
  770 + * AP might wait on cpu_callout_mask in cpu_init() with
  771 + * cpu_initialized_mask set if previous attempt to online
  772 + * it timed-out. Clear cpu_initialized_mask so that after
  773 + * INIT/SIPI it could start with a clean state.
  774 + */
  775 + cpumask_clear_cpu(cpu, cpu_initialized_mask);
  776 + smp_mb();
  777 +
  778 + /*
802 779 * Wake up a CPU in difference cases:
803 780 * - Use the method in the APIC driver if it's defined
804 781 * Otherwise,
805 782  
806 783  
807 784  
808 785  
809 786  
810 787  
811 788  
... ... @@ -810,55 +787,41 @@
810 787 boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
811 788 &cpu0_nmi_registered);
812 789  
  790 +
813 791 if (!boot_error) {
814 792 /*
815   - * allow APs to start initializing.
  793 + * Wait 10s total for a response from AP
816 794 */
817   - pr_debug("Before Callout %d\n", cpu);
818   - cpumask_set_cpu(cpu, cpu_callout_mask);
819   - pr_debug("After Callout %d\n", cpu);
  795 + boot_error = -1;
  796 + timeout = jiffies + 10*HZ;
  797 + while (time_before(jiffies, timeout)) {
  798 + if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
  799 + /*
  800 + * Tell AP to proceed with initialization
  801 + */
  802 + cpumask_set_cpu(cpu, cpu_callout_mask);
  803 + boot_error = 0;
  804 + break;
  805 + }
  806 + udelay(100);
  807 + schedule();
  808 + }
  809 + }
820 810  
  811 + if (!boot_error) {
821 812 /*
822   - * Wait 5s total for a response
  813 + * Wait till AP completes initial initialization
823 814 */
824   - for (timeout = 0; timeout < 50000; timeout++) {
825   - if (cpumask_test_cpu(cpu, cpu_callin_mask))
826   - break; /* It has booted */
827   - udelay(100);
  815 + while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
828 816 /*
829 817 * Allow other tasks to run while we wait for the
830 818 * AP to come online. This also gives a chance
831 819 * for the MTRR work(triggered by the AP coming online)
832 820 * to be completed in the stop machine context.
833 821 */
  822 + udelay(100);
834 823 schedule();
835 824 }
836   -
837   - if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
838   - print_cpu_msr(&cpu_data(cpu));
839   - pr_debug("CPU%d: has booted.\n", cpu);
840   - } else {
841   - boot_error = 1;
842   - if (*trampoline_status == 0xA5A5A5A5)
843   - /* trampoline started but...? */
844   - pr_err("CPU%d: Stuck ??\n", cpu);
845   - else
846   - /* trampoline code not run */
847   - pr_err("CPU%d: Not responding\n", cpu);
848   - if (apic->inquire_remote_apic)
849   - apic->inquire_remote_apic(apicid);
850   - }
851   - }
852   -
853   - if (boot_error) {
854   - /* Try to put things back the way they were before ... */
855   - numa_remove_cpu(cpu); /* was set by numa_add_cpu */
856   -
857   - /* was set by do_boot_cpu() */
858   - cpumask_clear_cpu(cpu, cpu_callout_mask);
859   -
860   - /* was set by cpu_init() */
861   - cpumask_clear_cpu(cpu, cpu_initialized_mask);
862 825 }
863 826  
864 827 /* mark "stuck" area as not stuck */