Subject: Linux kernel infrastructure for Xen Share access

On the Linux kernel side, we provide some wrappers for accessing
shared pages.  They are currently reference-counted, because a future
patch allows userspace to access shared pages, and the Xen interface
will refuse the second request for access by the same domain.

The entire hypercall interface is arch-wrapped, which is probably
overkill, but I wasn't entirely sure of the needs of non-x86
architectures.  Some of this should almost certainly be in common code.

Index: xen-sane/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/share.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xen-sane/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/share.h	2006-07-03 12:43:51.000000000 +1000
@@ -0,0 +1,62 @@
+#ifndef __ASM_XEN_I386_SHARE_H
+#define __ASM_XEN_I386_SHARE_H
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <xen/interface/share.h>
+
+struct xen_share
+{
+	struct list_head list;		/* entry on the global shares list */
+	atomic_t use;			/* reference count (get/put) */
+	share_ref_t share_ref;		/* id given to HYPERVISOR_share() */
+	unsigned num_pages;		/* size of the mapping, in pages */
+	void *addr;			/* kernel virtual address of mapping */
+	int event_channel;		/* port from EVTCHNOP_alloc_unbound */
+	int peerid;			/* result of XEN_SHARE_get hypercall */
+	int irq;			/* irq bound to event_channel */
+	struct list_head handlers;	/* xen_share_handler entries */
+};
+
+struct xen_share_handler
+{
+	struct list_head list;	/* entry on xen_share.handlers */
+	void (*handler)(struct xen_share_handler *h); /* called by share irq */
+};
+
+/* Map a shared area.  Returns ERR_PTR(-errno) on fail. */
+struct xen_share *xen_share_get(share_ref_t share_ref, unsigned pages);
+
+/* Set up handler for events. */
+void xen_share_add_handler(struct xen_share *s, struct xen_share_handler *h);
+
+/* Remove handler. */
+void xen_share_remove_handler(struct xen_share *s,
+			      struct xen_share_handler *h);
+
+/* Unmap a shared area (irq unbound if not done already). */
+void xen_share_put(struct xen_share *share);
+
+/* Register this sg list (physical kernel addresses).  Returns 0 on success. */
+int xen_sg_register(struct xen_share *share, u32 queue, u32 *lenp,
+		    unsigned int num_sgs, const struct xen_sg sg[]);
+
+/* Unregister this sg list: give first phys address of sg. */
+void xen_sg_unregister(struct xen_share *share, unsigned long sgaddr);
+
+/* Copy from this sg list (physical kernel addresses).  Returns len xferred. */
+int xen_sg_xfer(struct xen_share *share, u32 queue,
+		unsigned int num_sgs, const struct xen_sg sg[]);
+
+/* Place watch on this trigger.  Returns 0 on success. */
+int xen_share_watch(struct xen_share *share, int triggernum, u32 *resultp);
+
+/* Remove watch on this trigger. */
+void xen_share_unwatch(struct xen_share *share, int triggernum);
+
+/* Trigger a watch.  Returns num watching on success. */
+int xen_share_trigger(struct xen_share *share, int triggernum);
+
+/* Map a share into a vma (for userspace mmap). */
+int xen_share_map(struct xen_share *share, struct vm_area_struct *vma);
+#endif	/* __ASM_XEN_I386_SHARE_H */
Index: xen-sane/linux-2.6-xen-sparse/arch/i386/kernel/share-xen.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xen-sane/linux-2.6-xen-sparse/arch/i386/kernel/share-xen.c	2006-07-03 12:49:30.000000000 +1000
@@ -0,0 +1,280 @@
+/* x86 layer for share hypercalls.
+ * Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/sched.h>
+#include <linux/page-flags.h>
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+#include <asm/share.h>
+#include <asm/io.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+
+/* We only request each area from the hypervisor once, so track them. */
+static DECLARE_MUTEX(share_lock);
+static spinlock_t handler_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(shares);
+
+static int get_evtchn_port(void)
+{
+	struct evtchn_alloc_unbound alloc = { .dom = DOMID_SELF,
+					      .remote_dom = DOMID_SELF };
+	int rc;
+
+	/* Ask Xen for a new unbound port; a non-zero result is returned as-is. */
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc);
+	if (rc != 0)
+		return rc;
+	return alloc.port;
+}
+
+static void close_evtchn_port(int port)
+{
+	struct evtchn_close op = { .port = port };
+	/* Failing to close a port is treated as a fatal bug. */
+	BUG_ON(HYPERVISOR_event_channel_op(EVTCHNOP_close, &op) != 0);
+}
+
+static struct xen_share *get_share(share_ref_t share_ref)
+{
+	struct xen_share *s;
+	/* Walks the global list; xen_share_get() calls us under share_lock. */
+	list_for_each_entry(s, &shares, list) {
+		if (s->share_ref != share_ref)
+			continue;
+		atomic_inc(&s->use);	/* reference for the caller */
+		return s;
+	}
+	return NULL;
+}
+
+static irqreturn_t share_irq(int irq, void *share_, struct pt_regs *regs)
+{
+	struct xen_share_handler *h;
+	spin_lock(&handler_lock);	/* list is changed under this lock */
+	list_for_each_entry(h, &((struct xen_share *)share_)->handlers, list)
+		h->handler(h);	/* must not add/remove handlers itself */
+	spin_unlock(&handler_lock);
+	return IRQ_HANDLED;
+}
+
+/* Set up a new share: map it and bind its irq.  ERR_PTR(err) on fail. */
+struct xen_share *create_share(share_ref_t share_ref, unsigned pages)
+{
+	pgprot_t prot;
+	int err;
+	struct vm_struct *vma;
+	struct xen_share *share;
+
+	share = kmalloc(sizeof(struct xen_share), GFP_KERNEL);
+	if (!share) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	/* The creator holds the first reference. */
+	share->share_ref = share_ref;
+	share->num_pages = pages;
+	atomic_set(&share->use, 1);
+	INIT_LIST_HEAD(&share->handlers);
+	vma = get_vm_area(pages * PAGE_SIZE, VM_IOREMAP);
+	if (!vma) {
+		err = -ENOMEM;
+		goto free_share;
+	}
+	share->event_channel = get_evtchn_port();
+	if (share->event_channel < 0) {
+		err = share->event_channel;
+		goto free_vma;
+	}
+	/* Events on the channel arrive as an irq handled by share_irq(). */
+	err = bind_evtchn_to_irqhandler(share->event_channel, share_irq,
+					SA_SHIRQ, "xenshare", share);
+	if (err < 0)
+		goto close_evtchn;
+	share->irq = err;
+	/* Negative result is an error; otherwise it is kept as the peer id. */
+	share->peerid = HYPERVISOR_share(XEN_SHARE_get, share_ref,
+					 share->event_channel, 0, 0);
+	if (share->peerid < 0) {
+		err = share->peerid;
+		goto unbind_evtchn;
+	}
+	/* NOTE(review): share_ref is passed as the frame to map - confirm. */
+	prot = __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_DIRTY|_PAGE_ACCESSED);
+	err = direct_kernel_remap_pfn_range((unsigned long)vma->addr,
+					    share_ref, pages * PAGE_SIZE,
+					    prot, DOMID_SELF);
+	if (err)
+		goto put_share;
+	share->addr = vma->addr;
+	list_add(&share->list, &shares);
+
+	return share;
+
+put_share:
+	BUG_ON(HYPERVISOR_share(XEN_SHARE_drop,share->share_ref,0,0,0) != 0);
+unbind_evtchn:
+	unbind_from_irqhandler(share->irq, share);
+	goto free_vma;		/* presumably unbind closed the port */
+close_evtchn:
+	close_evtchn_port(share->event_channel);
+free_vma:
+	vunmap(vma->addr);	/* unlinks area from vmlist, kfrees vma */
+free_share:
+	kfree(share);
+fail:
+	return ERR_PTR(err);
+}
+
+/* Map a shared area, once per share_ref.  Returns ERR_PTR(err) on fail. */
+struct xen_share *xen_share_get(share_ref_t share_ref, unsigned pages)
+{
+	struct xen_share *found;
+
+	down(&share_lock);
+	found = get_share(share_ref);
+	if (!found)
+		found = create_share(share_ref, pages);
+	else
+		BUG_ON(found->num_pages != pages);
+	up(&share_lock);
+
+	return found;
+}
+
+void xen_share_add_handler(struct xen_share *s, struct xen_share_handler *h)
+{
+	spin_lock_irq(&handler_lock);	/* irqs off while the list changes */
+	list_add(&h->list, &s->handlers);	/* h->handler run by share_irq() */
+	spin_unlock_irq(&handler_lock);
+}
+
+/* Remove handler h from s's list; BUGs if s has no handlers at all. */
+void xen_share_remove_handler(struct xen_share *s, struct xen_share_handler *h)
+{
+	BUG_ON(list_empty(&s->handlers));	/* checked outside the lock */
+	spin_lock_irq(&handler_lock);
+	list_del(&h->list);
+	spin_unlock_irq(&handler_lock);
+}
+
+/* Drop a reference; the final put unbinds the irq and unmaps the share. */
+void xen_share_put(struct xen_share *share)
+{
+	down(&share_lock);
+	if (!atomic_dec_and_test(&share->use))
+		goto unlock;
+	BUG_ON(!list_empty(&share->handlers));
+	unbind_from_irqhandler(share->irq, share);
+	/* This also kfrees vma. */
+	vunmap(share->addr);
+	BUG_ON(HYPERVISOR_share(XEN_SHARE_drop, share->share_ref,
+				0, 0, 0) != 0);
+	list_del(&share->list);
+	kfree(share);
+unlock:
+	up(&share_lock);
+}
+
+/* Register this sg list (physical kernel addresses).  Returns 0 on success. */
+int xen_sg_register(struct xen_share *s, u32 queue, u32 *lenp,
+		    unsigned int num_sgs, const struct xen_sg sg[])
+{
+	struct xen_sg new_sg[XEN_SG_MAX];
+	unsigned int i;
+
+	if (num_sgs > XEN_SG_MAX)
+		return -EINVAL;		/* more entries than new_sg[] holds */
+	/* We feed machine addresses to hypervisor. */
+	for (i = 0; i < num_sgs; i++) {
+		new_sg[i].addr = phys_to_machine(sg[i].addr);
+		new_sg[i].len = sg[i].len;
+	}
+	return HYPERVISOR_share(XEN_SHARE_sg_register, s->share_ref,
+				xen_share_sg_arg(queue, num_sgs),
+				(long)new_sg, virt_to_machine(lenp));
+}
+
+/* Unregister sg list whose first phys address is addr; BUGs on failure. */
+void xen_sg_unregister(struct xen_share *s, unsigned long addr)
+{
+	BUG_ON(HYPERVISOR_share(XEN_SHARE_sg_unregister, s->share_ref,
+				phys_to_machine(addr), 0, 0) != 0);
+}
+
+/* Send to this sg list (physical kernel addresses).  Returns len xferred. */
+int xen_sg_xfer(struct xen_share *s, u32 queue,
+		unsigned int num_sgs, const struct xen_sg sg[])
+{
+	struct xen_sg new_sg[XEN_SG_MAX];
+	unsigned int i;
+
+	if (num_sgs > XEN_SG_MAX)
+		return -EINVAL;		/* more entries than new_sg[] holds */
+	/* Hypervisor wants virtual addresses here. */
+	for (i = 0; i < num_sgs; i++) {
+		new_sg[i].addr = (long)phys_to_virt(sg[i].addr);
+		new_sg[i].len = sg[i].len;
+	}
+	return HYPERVISOR_share(XEN_SHARE_sg_xfer, s->share_ref,
+			xen_share_sg_arg(queue, num_sgs), (long)new_sg, 0);
+}
+
+/* Watch triggernum; Xen is given resultp's machine address.  0 on success. */
+int xen_share_watch(struct xen_share *s, int triggernum, u32 *resultp)
+{
+	return HYPERVISOR_share(XEN_SHARE_watch, s->share_ref, triggernum,
+				virt_to_machine(resultp), 0);
+}
+
+/* Remove watch on this trigger; BUGs if the hypercall fails. */
+void xen_share_unwatch(struct xen_share *s, int triggernum)
+{
+	BUG_ON(HYPERVISOR_share(XEN_SHARE_unwatch, s->share_ref, triggernum,
+				0, 0) != 0);
+}
+
+/* Trigger a watch.  Returns the hypercall result: num watching on success. */
+int xen_share_trigger(struct xen_share *s, int trigger)
+{
+	return HYPERVISOR_share(XEN_SHARE_trigger, s->share_ref, trigger,0,0);
+}
+
+int xen_share_map(struct xen_share *s, struct vm_area_struct *vma)
+{
+	if (vma->vm_end - vma->vm_start < s->num_pages * PAGE_SIZE)
+		return -EINVAL;	/* whole share would run past the vma */
+	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
+	return direct_remap_pfn_range(vma, vma->vm_start, s->share_ref,
+		s->num_pages * PAGE_SIZE, vma->vm_page_prot, DOMID_SELF);
+}
+
+EXPORT_SYMBOL_GPL(xen_share_get);
+EXPORT_SYMBOL_GPL(xen_share_put);
+EXPORT_SYMBOL_GPL(xen_share_map);
+EXPORT_SYMBOL_GPL(xen_share_trigger);
+EXPORT_SYMBOL_GPL(xen_share_watch);
+EXPORT_SYMBOL_GPL(xen_share_unwatch);
+EXPORT_SYMBOL_GPL(xen_sg_xfer);
+EXPORT_SYMBOL_GPL(xen_sg_register);
+EXPORT_SYMBOL_GPL(xen_sg_unregister);
+EXPORT_SYMBOL_GPL(xen_share_add_handler);
+EXPORT_SYMBOL_GPL(xen_share_remove_handler);
Index: xen-sane/linux-2.6-xen-sparse/arch/i386/kernel/Makefile
===================================================================
--- xen-sane.orig/linux-2.6-xen-sparse/arch/i386/kernel/Makefile	2006-07-03 11:59:20.000000000 +1000
+++ xen-sane/linux-2.6-xen-sparse/arch/i386/kernel/Makefile	2006-07-03 11:59:44.000000000 +1000
@@ -88,6 +88,7 @@
 include $(srctree)/scripts/Makefile.xen
 
 obj-y += fixup.o
+obj-y += share-xen.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
 n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
 
Index: xen-sane/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c
===================================================================
--- xen-sane.orig/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c	2006-07-03 11:59:20.000000000 +1000
+++ xen-sane/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c	2006-07-03 11:59:44.000000000 +1000
@@ -123,8 +123,11 @@
 	/* Same as remap_pfn_range(). */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
 
+	/* FIXME: xenshare needs to pass DOMID_SELF. Check it's safe to remove
+	 * the check.
 	if (domid == DOMID_SELF)
 		return -EINVAL;
+	*/
 
 	return __direct_remap_pfn_range(
 		vma->vm_mm, address, mfn, size, prot, domid);
Index: xen-sane/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h
===================================================================
--- xen-sane.orig/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h	2006-07-03 11:59:20.000000000 +1000
+++ xen-sane/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h	2006-07-03 11:59:44.000000000 +1000
@@ -368,5 +368,11 @@
 	return _hypercall2(int, xenoprof_op, op, arg);
 }
 
+static inline long	/* share_op hypercall: result returned unmodified */
+HYPERVISOR_share(
+	int op, long arg1, long arg2, long arg3, long arg4)
+{
+	return _hypercall5(long, share_op, op, arg1, arg2, arg3, arg4);
+}
 
 #endif /* __HYPERCALL_H__ */
Index: xen-sane/patches/linux-2.6.12/get_vm_area.patch
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xen-sane/patches/linux-2.6.12/get_vm_area.patch	2006-07-03 11:59:44.000000000 +1000
@@ -0,0 +1,9 @@
+diff -Naur linux-2.6.12/mm/vmalloc.c linux-2.6.12.post/mm/vmalloc.c
+--- linux-2.6.12/mm/vmalloc.c    2005-06-18 05:48:29.000000000 +1000
++++ linux-2.6.12.post/mm/vmalloc.c        2006-01-10 16:56:36.000000000 +1100
+@@ -247,6 +247,7 @@
+ {
+        return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
+ }
++EXPORT_SYMBOL(get_vm_area);
+
