Commit b645af2d5905c4e32399005b867987919cbfc3ae

Authored by Andy Lutomirski
Committed by Linus Torvalds
1 parent 6f442be2fb

x86_64, traps: Rework bad_iret

It's possible for iretq to userspace to fail.  This can happen because
of a bad CS, SS, or RIP.

Historically, we've handled it by fixing up an exception from iretq to
land at bad_iret, which pretends that the failed iret frame was really
the hardware part of #GP(0) from userspace.  To make this work, there's
an extra fixup to fudge the gs base into a usable state.

This is suboptimal because it loses the original exception.  It's also
buggy because there's no guarantee that we were on the kernel stack to
begin with.  For example, if the failing iret happened on return from an
NMI, then we'll end up executing general_protection on the NMI stack.
This is bad for several reasons, the most immediate of which is that
general_protection, as a non-paranoid idtentry, will try to deliver
signals and/or schedule from the wrong stack.

This patch throws out bad_iret entirely.  As a replacement, it augments
the existing swapgs fudge into a full-blown iret fixup, mostly written
in C.  It should be clearer and more correct.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 48 additions and 26 deletions Side-by-side Diff

arch/x86/kernel/entry_64.S
... ... @@ -830,8 +830,13 @@
830 830  
831 831 .global native_irq_return_iret
832 832 native_irq_return_iret:
  833 + /*
  834 + * This may fault. Non-paranoid faults on return to userspace are
  835 + * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
  836 + * Double-faults due to espfix64 are handled in do_double_fault.
  837 + * Other faults here are fatal.
  838 + */
833 839 iretq
834   - _ASM_EXTABLE(native_irq_return_iret, bad_iret)
835 840  
836 841 #ifdef CONFIG_X86_ESPFIX64
837 842 native_irq_return_ldt:
... ... @@ -859,25 +864,6 @@
859 864 jmp native_irq_return_iret
860 865 #endif
861 866  
862   - .section .fixup,"ax"
863   -bad_iret:
864   - /*
865   - * The iret traps when the %cs or %ss being restored is bogus.
866   - * We've lost the original trap vector and error code.
867   - * #GPF is the most likely one to get for an invalid selector.
868   - * So pretend we completed the iret and took the #GPF in user mode.
869   - *
870   - * We are now running with the kernel GS after exception recovery.
871   - * But error_entry expects us to have user GS to match the user %cs,
872   - * so swap back.
873   - */
874   - pushq $0
875   -
876   - SWAPGS
877   - jmp general_protection
878   -
879   - .previous
880   -
881 867 /* edi: workmask, edx: work */
882 868 retint_careful:
883 869 CFI_RESTORE_STATE
884 870  
... ... @@ -1369,17 +1355,16 @@
1369 1355  
1370 1356 /*
1371 1357 * There are two places in the kernel that can potentially fault with
1372   - * usergs. Handle them here. The exception handlers after iret run with
1373   - * kernel gs again, so don't set the user space flag. B stepping K8s
1374   - * sometimes report an truncated RIP for IRET exceptions returning to
1375   - * compat mode. Check for these here too.
  1358 + * usergs. Handle them here. B stepping K8s sometimes report a
  1359 + * truncated RIP for IRET exceptions returning to compat mode. Check
  1360 + * for these here too.
1376 1361 */
1377 1362 error_kernelspace:
1378 1363 CFI_REL_OFFSET rcx, RCX+8
1379 1364 incl %ebx
1380 1365 leaq native_irq_return_iret(%rip),%rcx
1381 1366 cmpq %rcx,RIP+8(%rsp)
1382   - je error_swapgs
  1367 + je error_bad_iret
1383 1368 movl %ecx,%eax /* zero extend */
1384 1369 cmpq %rax,RIP+8(%rsp)
1385 1370 je bstep_iret
... ... @@ -1390,7 +1375,15 @@
1390 1375 bstep_iret:
1391 1376 /* Fix truncated RIP */
1392 1377 movq %rcx,RIP+8(%rsp)
1393   - jmp error_swapgs
  1378 + /* fall through */
  1379 +
  1380 +error_bad_iret:
  1381 + SWAPGS
  1382 + mov %rsp,%rdi
  1383 + call fixup_bad_iret
  1384 + mov %rax,%rsp
  1385 + decl %ebx /* Return to usergs */
  1386 + jmp error_sti
1394 1387 CFI_ENDPROC
1395 1388 END(error_entry)
1396 1389  
arch/x86/kernel/traps.c
... ... @@ -407,6 +407,35 @@
407 407 return regs;
408 408 }
409 409 NOKPROBE_SYMBOL(sync_regs);
  410 +
  411 +struct bad_iret_stack {
  412 + void *error_entry_ret;
  413 + struct pt_regs regs;
  414 +};
  415 +
  416 +asmlinkage __visible
  417 +struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
  418 +{
  419 + /*
  420 + * This is called from entry_64.S early in handling a fault
  421 + * caused by a bad iret to user mode. To handle the fault
  422 + * correctly, we want to move our stack frame to task_pt_regs
  423 + * and we want to pretend that the exception came from the
  424 + * iret target.
  425 + */
  426 + struct bad_iret_stack *new_stack =
  427 + container_of(task_pt_regs(current),
  428 + struct bad_iret_stack, regs);
  429 +
  430 + /* Copy the IRET target to the new stack. */
  431 + memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
  432 +
  433 + /* Copy the remainder of the stack from the current stack. */
  434 + memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
  435 +
  436 + BUG_ON(!user_mode_vm(&new_stack->regs));
  437 + return new_stack;
  438 +}
410 439 #endif
411 440  
412 441 /*