FIXME: re-benchmark
lguest: (mostly) close iret race

If an lguest does an iret which re-enables interrupts, we don't tell the
hypervisor so any pending interrupts don't get delivered.

This is clever (no hypercall in iret path), but means interrupt latency can
be high.  Now that we expose a pending flag to the Guest, it can detect pending
interrupts and do a hypercall in that case.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/include/asm/lguest_hcall.h   |    1 
 arch/x86/lguest/i386_head.S           |    6 +++++
 drivers/lguest/interrupts_and_traps.c |   39 ++++++++++++++++++++++++++++++++++
 drivers/lguest/lg.h                   |    1 
 drivers/lguest/x86/core.c             |    3 ++
 5 files changed, 50 insertions(+)

diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -18,6 +18,7 @@
 #define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
 #define LHCALL_SEND_INTERRUPTS	19
+#define LHCALL_IRET		20
 
 #define LGUEST_TRAP_ENTRY 0x1F
 
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -152,6 +152,8 @@ ENTRY(lg_restore_fl)
 ENTRY(lguest_iret)
 	pushl	%eax
 	movl	12(%esp), %eax
+	testl	%ss:lguest_data+LGUEST_DATA_irq_pending, %eax
+	jnz	iret_hcall
 lguest_noirq_start:
 	/* Note the %ss: segment prefix here.  Normal data accesses use the
 	 * "ds" segment, but that will have already been restored for whatever
@@ -161,3 +163,7 @@ lguest_noirq_start:
 	popl	%eax
 	iret
 lguest_noirq_end:
+
+iret_hcall:
+	movl $LHCALL_IRET, %eax	/* select the iret hypercall */
+	int $LGUEST_TRAP_ENTRY	/* trap out so the Host performs the iret */
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -48,6 +48,45 @@ static void push_guest_stack(struct lg_c
 	lgwrite(cpu, *gstack, u32, val);
 }
 
+/* "Pop" a u32 off the Guest's stack (iret does a lot of this): read the value
+ * at *gstack via lgread(), then advance *gstack past it by 4 bytes. */
+static u32 pop_guest_stack(struct lg_cpu *cpu, unsigned long *gstack)
+{
+	u32 val;
+	val = lgread(cpu, *gstack, u32);
+	*gstack += 4;
+	return val;
+}
+
+/* Handle LHCALL_IRET: emulate the Guest's iret when it saw an irq pending. */
+void return_from_interrupt(struct lg_cpu *cpu)
+{
+	/* Note: walks by physical address, so assumes physically linear stacks. */
+	unsigned long gstack = guest_pa(cpu, cpu->regs->esp);
+
+	/* Frame is: eax (pushed by lguest_iret), then eip, cs, eflags. */
+	cpu->regs->eax = pop_guest_stack(cpu, &gstack);
+	cpu->regs->eip = pop_guest_stack(cpu, &gstack);
+	cpu->regs->cs = pop_guest_stack(cpu, &gstack);
+	cpu->regs->eflags = pop_guest_stack(cpu, &gstack);
+
+	if ((cpu->regs->cs & 3) < GUEST_PL)
+		kill_guest(cpu, "iret to invalid PL: %lu", cpu->regs->cs);
+	else if ((cpu->regs->cs & 3) != GUEST_PL) {
+		/* Returning to a less privileged level: frame also holds esp, ss. */
+		cpu->regs->esp = pop_guest_stack(cpu, &gstack);
+		cpu->regs->ss = pop_guest_stack(cpu, &gstack);
+	} else {
+		/* Same PL: no esp/ss in the frame, just skip the 4 slots popped. */
+		cpu->regs->esp += 16;
+	}
+
+	/* Publish IF as irq_enabled.  NOTE(review): put_user result ignored. */
+	put_user(cpu->regs->eflags & X86_EFLAGS_IF,
+		 &cpu->lg->lguest_data->irq_enabled);
+	/* Now any pending interrupts will be delivered in next loop */
+}
+
 /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or
  * trap.  The mechanics of delivering traps and interrupts to the Guest are the
  * same, except some traps have an "error code" which gets pushed onto the
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -151,6 +151,7 @@ unsigned int interrupt_pending(struct lg
 void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
 void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
+void return_from_interrupt(struct lg_cpu *cpu);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
 void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -577,6 +577,9 @@ int lguest_arch_do_hcall(struct lg_cpu *
 	case LHCALL_LOAD_TLS:
 		guest_load_tls(cpu, args->arg1);
 		break;
+	case LHCALL_IRET:
+		return_from_interrupt(cpu);
+		break;
 	default:
 		/* Bad Guest.  Bad! */
 		return -EIO;
