x86/paravirt: add sysret/sysexit pvops for returning to 32-bit compatibility userspace
Jeremy Fitzhardinge [Wed, 25 Jun 2008 04:19:28 +0000 (00:19 -0400)]
In a 64-bit system, we need separate sysret/sysexit operations to
return to a 32-bit userspace.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citirx.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

arch/x86/ia32/ia32entry.S
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/paravirt.c
arch/x86/kernel/paravirt_patch_64.c
include/asm-x86/irqflags.h
include/asm-x86/paravirt.h

index 3aefbce..2a4c424 100644 (file)
        CFI_UNDEFINED   r15
        .endm
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+       swapgs
+       sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+       swapgs
+       sti
+       sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
 /*
  * 32bit SYSENTER instruction entry.
  *
@@ -151,10 +164,7 @@ sysenter_do_call:
        CFI_ADJUST_CFA_OFFSET -8
        CFI_REGISTER rsp,rcx
        TRACE_IRQS_ON
-       swapgs
-       sti             /* sti only takes effect after the next instruction */
-       /* sysexit */
-       .byte   0xf, 0x35
+       ENABLE_INTERRUPTS_SYSEXIT32
 
 sysenter_tracesys:
        CFI_RESTORE_STATE
@@ -254,8 +264,7 @@ cstar_do_call:
        TRACE_IRQS_ON
        movl RSP-ARGOFFSET(%rsp),%esp
        CFI_RESTORE rsp
-       swapgs
-       sysretl
+       USERGS_SYSRET32
        
 cstar_tracesys:        
        CFI_RESTORE_STATE
index a19aba8..06c451a 100644 (file)
@@ -62,7 +62,9 @@ int main(void)
        OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
        OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
        OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
-       OFFSET(PV_CPU_usergs_sysret, pv_cpu_ops, usergs_sysret);
+       OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
+       OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+       OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
        OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
        OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif
index 18447a3..880ffe5 100644 (file)
@@ -59,7 +59,7 @@
 #endif 
 
 #ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret)
+ENTRY(native_usergs_sysret64)
        swapgs
        sysretq
 #endif /* CONFIG_PARAVIRT */
@@ -275,7 +275,7 @@ sysret_check:
        RESTORE_ARGS 0,-ARG_SKIP,1
        /*CFI_REGISTER  rflags,r11*/
        movq    %gs:pda_oldrsp, %rsp
-       USERGS_SYSRET
+       USERGS_SYSRET64
 
        CFI_RESTORE_STATE
        /* Handle reschedules */
index b0b17f0..bf1067e 100644 (file)
@@ -141,7 +141,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
                ret = paravirt_patch_nop();
        else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
                 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
-                type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret))
+                type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
+                type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
                /* If operation requires a jmp, then jmp */
                ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
        else
@@ -193,7 +194,8 @@ static void native_flush_tlb_single(unsigned long addr)
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret(void);
+extern void native_usergs_sysret32(void);
+extern void native_usergs_sysret64(void);
 
 static int __init print_banner(void)
 {
@@ -329,10 +331,10 @@ struct pv_cpu_ops pv_cpu_ops = {
        .write_idt_entry = native_write_idt_entry,
        .load_sp0 = native_load_sp0,
 
-#ifdef CONFIG_X86_32
        .irq_enable_sysexit = native_irq_enable_sysexit,
-#else
-       .usergs_sysret = native_usergs_sysret,
+#ifdef CONFIG_X86_64
+       .usergs_sysret32 = native_usergs_sysret32,
+       .usergs_sysret64 = native_usergs_sysret64,
 #endif
        .iret = native_iret,
        .swapgs = native_swapgs,
index d4c0712..061d01d 100644 (file)
@@ -14,8 +14,9 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
-/* the three commands give us more control to how to return from a syscall */
-DEF_NATIVE(pv_cpu_ops, usergs_sysret, "swapgs; sysretq;");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
+DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
@@ -35,7 +36,9 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_irq_ops, irq_enable);
                PATCH_SITE(pv_irq_ops, irq_disable);
                PATCH_SITE(pv_cpu_ops, iret);
-               PATCH_SITE(pv_cpu_ops, usergs_sysret);
+               PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
+               PATCH_SITE(pv_cpu_ops, usergs_sysret32);
+               PATCH_SITE(pv_cpu_ops, usergs_sysret64);
                PATCH_SITE(pv_cpu_ops, swapgs);
                PATCH_SITE(pv_mmu_ops, read_cr2);
                PATCH_SITE(pv_mmu_ops, read_cr3);
index 544836c..ea9bd26 100644 (file)
@@ -112,9 +112,17 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #ifdef CONFIG_X86_64
 #define INTERRUPT_RETURN       iretq
-#define USERGS_SYSRET                                  \
-                       swapgs;                         \
-                       sysretq;
+#define USERGS_SYSRET64                                \
+       swapgs;                                 \
+       sysretq;
+#define USERGS_SYSRET32                                \
+       swapgs;                                 \
+       sysretl
+#define ENABLE_INTERRUPTS_SYSEXIT32            \
+       swapgs;                                 \
+       sti;                                    \
+       sysexit
+
 #else
 #define INTERRUPT_RETURN               iret
 #define ENABLE_INTERRUPTS_SYSEXIT      sti; sysexit
index dad5b41..33f72f8 100644 (file)
@@ -141,9 +141,32 @@ struct pv_cpu_ops {
        u64 (*read_pmc)(int counter);
        unsigned long long (*read_tscp)(unsigned int *aux);
 
-       /* These three are jmp to, not actually called. */
+       /*
+        * Atomically enable interrupts and return to userspace.  This
+        * is only ever used to return to 32-bit processes; in a
+        * 64-bit kernel, it's used for 32-on-64 compat processes, but
+        * never native 64-bit processes.  (Jump, not call.)
+        */
        void (*irq_enable_sysexit)(void);
-       void (*usergs_sysret)(void);
+
+       /*
+        * Switch to usermode gs and return to 64-bit usermode using
+        * sysret.  Only used in 64-bit kernels to return to 64-bit
+        * processes.  Usermode register state, including %rsp, must
+        * already be restored.
+        */
+       void (*usergs_sysret64)(void);
+
+       /*
+        * Switch to usermode gs and return to 32-bit usermode using
+        * sysret.  Used to return to 32-on-64 compat processes.
+        * Other usermode register state, including %esp, must already
+        * be restored.
+        */
+       void (*usergs_sysret32)(void);
+
+       /* Normal iret.  Jump to this with the standard iret stack
+          frame set up. */
        void (*iret)(void);
 
        void (*swapgs)(void);
@@ -1481,18 +1504,24 @@ static inline unsigned long __raw_local_irq_save(void)
                  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);     \
                  PV_RESTORE_REGS;)
 
-#define ENABLE_INTERRUPTS_SYSEXIT                                      \
-       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),    \
+#define USERGS_SYSRET32                                                        \
+       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),       \
                  CLBR_NONE,                                            \
-                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
-
+                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
 
 #ifdef CONFIG_X86_32
 #define GET_CR0_INTO_EAX                               \
        push %ecx; push %edx;                           \
        call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
        pop %edx; pop %ecx
-#else
+
+#define ENABLE_INTERRUPTS_SYSEXIT                                      \
+       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),    \
+                 CLBR_NONE,                                            \
+                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+
+
+#else  /* !CONFIG_X86_32 */
 #define SWAPGS                                                         \
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,     \
                  PV_SAVE_REGS;                                         \
@@ -1505,11 +1534,16 @@ static inline unsigned long __raw_local_irq_save(void)
        movq %rax, %rcx;                                \
        xorq %rax, %rax;
 
-#define USERGS_SYSRET                                                  \
-       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret),         \
+#define USERGS_SYSRET64                                                        \
+       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),       \
                  CLBR_NONE,                                            \
-                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret))
-#endif
+                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#define ENABLE_INTERRUPTS_SYSEXIT32                                    \
+       PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),    \
+                 CLBR_NONE,                                            \
+                 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
+#endif /* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_PARAVIRT */