Prototype patch for netring.

I started out hacking on tap, but this is sufficiently different that
building on that code doesn't make sense.
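
Example userspace usage (an untested sketch: error handling is omitted, it
assumes a "/dev/netring" node has been created for the dynamically-allocated
major, and that vring_init()/vring_size() from linux/virtio_ring.h are
usable from userspace):

#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/virtio_ring.h>
#include <linux/if_netring.h>

/* Allocate and zero a ring laid out the way the kernel expects. */
static void *alloc_ring(struct vring *vr, unsigned int num)
{
	void *mem;

	if (posix_memalign(&mem, getpagesize(), vring_size(num, getpagesize())))
		return NULL;
	memset(mem, 0, vring_size(num, getpagesize()));
	vring_init(vr, num, mem, getpagesize());
	return mem;
}

int main(void)
{
	struct vring recv, xmit;
	struct netring nr = { .recv_num = 256, .xmit_num = 256, .flags = 0 };
	char name[IFNAMSIZ];
	int fd = open("/dev/netring", O_RDWR);

	nr.recv_addr = (unsigned long)alloc_ring(&recv, nr.recv_num);
	nr.xmit_addr = (unsigned long)alloc_ring(&xmit, nr.xmit_num);
	nr.offset = 0;		/* buffer addresses are used as-is... */
	nr.limit = (__u64)-1;	/* ...and unrestricted. */

	ioctl(fd, NETRINGBIND, &nr);
	ioctl(fd, NETRINGGETIF, name);	/* eg. "nring0" */

	/* Publish empty buffers in recv.desc[]/recv.avail (each marked
	 * VRING_DESC_F_WRITE, with room for a struct virtio_net_hdr plus
	 * a packet), then: */
	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		poll(&pfd, 1, -1);
		/* Reap recv.used (incoming packets) and xmit.used (finished
		 * transmissions), refill recv.avail, put outgoing packets in
		 * xmit.avail, then kick the kernel: */
		write(fd, "", 1);
	}
}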

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/ioctl-number.txt |    1 
 drivers/net/Kconfig            |    9 
 drivers/net/Makefile           |    1 
 drivers/net/netring.c          |  701 +++++++++++++++++++++++++++++++++++++++++
 include/linux/if_netring.h     |   45 ++
 5 files changed, 757 insertions(+)

diff -r 98387866de41 Documentation/ioctl-number.txt
--- a/Documentation/ioctl-number.txt	Mon Mar 31 23:01:58 2008 +1000
+++ b/Documentation/ioctl-number.txt	Tue Apr 01 13:30:24 2008 +1000
@@ -183,6 +183,7 @@ 0xAC	00-1F	linux/raw.h
 0xAC	00-1F	linux/raw.h
 0xAD	00	Netfilter device	in development:
 					<mailto:rusty@rustcorp.com.au>	
+0xAE	00-01	linux/if_netring.h
 0xB0	all	RATIO devices		in development:
 					<mailto:vgo@ratio.de>
 0xB1	00-1F	PPPoX			<mailto:mostrows@styx.uwaterloo.ca>
diff -r 98387866de41 drivers/net/Kconfig
--- a/drivers/net/Kconfig	Mon Mar 31 23:01:58 2008 +1000
+++ b/drivers/net/Kconfig	Tue Apr 01 13:30:24 2008 +1000
@@ -3138,4 +3138,13 @@ config VIRTIO_NET
 	  This is the virtual network driver for virtio.  It can be used with
           lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
+config NETRING
+	tristate "Network userspace ring support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  This is a userspace network interface (similar to tun) which
+	  uses a virtio-ring-compatible ringbuffer interface for
+	  efficient communication between the kernel and userspace.
+	  Say Y or M.
+
 endif # NETDEVICES
diff -r 98387866de41 drivers/net/Makefile
--- a/drivers/net/Makefile	Mon Mar 31 23:01:58 2008 +1000
+++ b/drivers/net/Makefile	Tue Apr 01 13:30:24 2008 +1000
@@ -252,3 +252,4 @@ obj-$(CONFIG_NETXEN_NIC) += netxen/
 obj-$(CONFIG_NETXEN_NIC) += netxen/
 obj-$(CONFIG_NIU) += niu.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_NETRING) += netring.o
diff -r 98387866de41 drivers/net/netring.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/net/netring.c	Tue Apr 01 13:30:24 2008 +1000
@@ -0,0 +1,701 @@
+/*
+ *  netring - ringbuffer based userspace net driver
+ *  Copyright 2008 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *  Much cribbed from Maxim Krasnyansky's tun.c
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ */
+#define DEBUG
+#include <linux/module.h>
+#include <linux/if_netring.h>
+#include <linux/fs.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+#include <linux/etherdevice.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+
+static int major;
+
+/* This is both filp->private_data, and netdev_priv(). */
+struct netring_struct {
+	/* We could split this into recv and xmit, but this works. */
+	spinlock_t lock;
+
+	/* The userspace virtio_rings from our perspective. */
+	/* FIXME: Store mask, to avoid modulus. */
+	struct vring recv, xmit;
+
+	/* skb receive queue: copying is done when user polls. */
+	struct sk_buff_head recvq;
+	atomic_t recvq_estimate;
+
+	/* Last seen on receive and send buffers. */
+	u16 last_recv_avail, last_xmit_avail, last_xmit_used;
+
+	/* limits and offsets. */
+	unsigned long limit, offset;
+
+	/* Waitqueue for poll() */
+	wait_queue_head_t poll_wait;
+
+	/* The struct net_device. */
+	struct net_device *dev;
+
+	/* The mapped used xmit ring. */
+	struct page *xmit_used_page;
+	struct vring_used *xmit_used;
+};
+
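+/* Stashed after the skb_shared_info of each mapped xmit skb (allocated via
+ * alloc_skb_data_destructor() in map_gso_skb), so that skb_finished() can
+ * find the ring entry to mark used. */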
+struct skb_shinfo_netring {
+	struct netring_struct *nr;
+
+	unsigned int descriptor;
+	unsigned int len;
+};
+
+/* We are done with this skb: put it in the used pile. */
+static void skb_finished(struct skb_shared_info *sinfo)
+{
+	struct skb_shinfo_netring *shn = (void *)(sinfo + 1);
+	struct vring_used_elem *used;
+
+	spin_lock_bh(&shn->nr->lock);
+	used = &shn->nr->xmit_used->ring[shn->nr->xmit_used->idx
+					 % shn->nr->xmit.num];
+	used->id = shn->descriptor;
+	used->len = shn->len;
+	/* Make sure buffer is written before we update index. */
+	wmb();
+	shn->nr->xmit_used->idx++;
+	spin_unlock_bh(&shn->nr->lock);
+
+	/* FIXME: Check if they care. */
+	wake_up(&shn->nr->poll_wait);
+
+	/* Release device. */
+	dev_put(shn->nr->dev);
+}
+
+/* Assume one descriptor for the gso hdr, one for linear data, one per frag. */
+static unsigned estimate_descriptors(struct sk_buff *skb)
+{
+	return 1 + 1 + skb_shinfo(skb)->nr_frags;
+}
+
+/* FIXME: we could use pagefault_disable() and do some tricks here. */
+static int netring_receive_input(struct sk_buff *skb, struct net_device *dev)
+{
+	struct netring_struct *nr = netdev_priv(dev);
+	unsigned int estimate;
+
+	estimate = estimate_descriptors(skb);
+
+	if (atomic_read(&nr->recvq_estimate) + estimate > nr->recv.num) {
+		/* No room: stop the queue and ask the core to requeue. */
+		dev_dbg(&dev->dev, "No room in queue, stopping\n");
+		netif_stop_queue(dev);
+		wake_up(&nr->poll_wait);
+		return NETDEV_TX_BUSY;
+	}
+	skb_queue_tail(&nr->recvq, skb);
+	atomic_add(estimate, &nr->recvq_estimate);
+
+	/* Tell user to hurry up and suck packets! */
+	wake_up(&nr->poll_wait);
+	return 0;
+}
+
+static void netring_setup(struct net_device *dev)
+{
+	struct netring_struct *nr = netdev_priv(dev);
+
+	/* nr->recv and nr->xmit are already zeroed. */
+	nr->dev = dev;
+	init_waitqueue_head(&nr->poll_wait);
+	spin_lock_init(&nr->lock);
+	skb_queue_head_init(&nr->recvq);
+	atomic_set(&nr->recvq_estimate, 0);
+
+	ether_setup(dev);
+	dev->hard_start_xmit = netring_receive_input;
+}
+
+/* If they give bad addresses, things will go wrong.  As always. */
+static u32 get_index(u16 *addr)
+{
+	u16 x;
+	if (get_user(x, addr) != 0)
+		return 0xFFFFFFFF;
+	return x;
+}
+
+/* ioctl tells us where the rings are: we're protected from parallel ioctls. */
+static int netring_ioctl(struct inode *inode, struct file *file,
+			 unsigned cmd, unsigned long arg)
+{
+	struct netring unr;
+	struct net_device *dev;
+	struct netring_struct *nr;
+	int err;
+
+	if (cmd == NETRINGGETIF) {
+		nr = file->private_data;
+		if (!nr)
+			return -EINVAL;
+		if (copy_to_user((char __user *)arg, nr->dev->name, IFNAMSIZ))
+			return -EFAULT;
+		return 0;
+	}
+
+	if (cmd != NETRINGBIND)
+		return -ENOTTY;
+
+	if (file->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(&unr, (struct netring __user *)arg, sizeof(unr)))
+		return -EFAULT;
+
+	/* No zero-length buffers please! */
+	if (unr.recv_num == 0 || unr.xmit_num == 0)
+		return -EINVAL;
+
+	if (unr.recv_num > 65536 || unr.xmit_num > 65536)
+		return -EINVAL;
+
+	/* FIXME: This is because I only want to map one page. */
+	if (sizeof(__u16)*2 + sizeof(struct vring_used_elem)*unr.xmit_num
+	    > PAGE_SIZE)
+		return -EINVAL;
+
+	dev = alloc_netdev(sizeof(struct netring_struct), "nring%d",
+			   netring_setup);
+	if (!dev)
+		return -ENOMEM;
+
+	nr = netdev_priv(dev);
+	if (unr.flags & IF_NETRING_F_RECV_CSUM) {
+		dev->features |= (NETIF_F_HW_CSUM | NETIF_F_SG
+				  | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA);
+		if (unr.flags & IF_NETRING_F_RECV_TSO4)
+			dev->features |= NETIF_F_TSO;
+		if (unr.flags & IF_NETRING_F_RECV_TSO6)
+			dev->features |= NETIF_F_TSO6;
+		if (unr.flags & IF_NETRING_F_RECV_UFO)
+			dev->features |= NETIF_F_UFO;
+		if (unr.flags & IF_NETRING_F_RECV_TSO_ECN)
+			dev->features |= NETIF_F_TSO_ECN;
+	}
+
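+	/* Userspace must lay each ring out exactly as vring_init() does:
+	 * descriptor array, then the avail ring, then the used ring starting
+	 * on the next page boundary (see linux/virtio_ring.h). */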
+	/* Initialize virtio ring infrastructure. */
+	vring_init(&nr->recv, unr.recv_num, (void *)(long)unr.recv_addr,
+		   PAGE_SIZE);
+	vring_init(&nr->xmit, unr.xmit_num, (void *)(long)unr.xmit_addr,
+		   PAGE_SIZE);
+	nr->limit = unr.limit;
+	nr->offset = unr.offset;
+	nr->last_recv_avail = get_index(&nr->recv.avail->idx);
+	nr->last_xmit_avail = get_index(&nr->xmit.avail->idx);
+	nr->last_xmit_used = get_index(&nr->xmit.used->idx);
+
+	/* Map the used page: get_user_pages() needs mmap_sem held. */
+	down_read(&current->mm->mmap_sem);
+	err = get_user_pages(current, current->mm, (unsigned long)nr->xmit.used,
+			     1, 1, 1, &nr->xmit_used_page, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (err != 1)
+		goto free_netdev;
+
+	nr->xmit_used = kmap(nr->xmit_used_page);
+	if (!nr->xmit_used) {
+		err = -ENOMEM;
+		goto free_page;
+	}
+
+	/* Now we can receive packets. */
+	err = register_netdev(dev);
+	if (err)
+		goto unmap_page;
+
+	/* After this we can read/write. */
+	file->private_data = nr;
+	return 0;
+
+unmap_page:
+	kunmap(nr->xmit_used_page);
+free_page:
+	put_page(nr->xmit_used_page);
+free_netdev:
+	free_netdev(dev);
+	return err;
+}
+
+/* Assumes src is locked, and dest is or doesn't need to be. */
+static void __skb_queue_steal(struct sk_buff_head *dest,
+			      struct sk_buff_head *src)
+{
+	dest->prev = skb_queue_empty(src) ? (struct sk_buff *)dest : src->prev;
+	dest->next = skb_queue_empty(src) ? (struct sk_buff *)dest : src->next;
+	/* The boundary skbs still point at src; aim them at the new head. */
+	dest->next->prev = dest->prev->next = (struct sk_buff *)dest;
+	dest->qlen = src->qlen;
+	src->prev = src->next = (struct sk_buff *)src;
+	src->qlen = 0;
+}
+
+/* Chases down this descriptor, checking validity, and puts it in iov[].
+ * Returns -errno or number of iovs used (max is 1 + MAX_SKB_FRAGS).
+ * Also fills in the total length (needed before using memcpy_toiovec etc). */
+static int get_iovec(struct netring_struct *nr,
+		     struct vring *v, unsigned int head,
+		     struct iovec iov[], unsigned int *len)
+{
+	u32 i, iov_num = 0;
+	struct vring_desc d;
+
+	*len = 0;
+	i = head;
+
+	while (iov_num < 1+MAX_SKB_FRAGS) {
+		if (i >= v->num) {
+			dev_dbg(&nr->dev->dev, "bad index: %u + %u = %u\n",
+				head, iov_num, i);
+			return -EINVAL;
+		}
+
+		if (copy_from_user(&d, &v->desc[i], sizeof(d)) != 0)
+			return -EINVAL;
+
+		/* Recv descriptors must be writable; xmit we only read. */
+		if (v == &nr->recv && !(d.flags & VRING_DESC_F_WRITE)) {
+			dev_dbg(&nr->dev->dev, "non-writable desc %u\n", i);
+			return -EINVAL;
+		}
+
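+		/* Buffer addresses are in the caller's window: bounds-check
+		 * against limit, then shift by offset (see if_netring.h). */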
+		if ((unsigned long)d.addr >= nr->limit
+		    || (unsigned long)d.addr + d.len > nr->limit) {
+			dev_dbg(&nr->dev->dev, "out-of-bounds desc %lu+%u\n",
+				(unsigned long)d.addr, d.len);
+			return -EINVAL;
+		}
+
+		iov[iov_num].iov_len = d.len;
+		*len += d.len;
+		iov[iov_num].iov_base = (void __user*)(long)d.addr + nr->offset;
+		iov_num++;
+
+		if (!(d.flags & VRING_DESC_F_NEXT))
+			break;
+
+		i = d.next; /* follow the chain via our copied descriptor */
+	}
+	return iov_num;
+}
+
+static int copy_recv_skbs(struct netring_struct *nr)
+{
+	struct sk_buff_head q;
+	int err, num_copied = 0;
+	struct sk_buff *skb;
+	unsigned int used_idx, estimate = 0;
+
+	/* Atomically transfer off whole queue. */
+	spin_lock_bh(&nr->recvq.lock);
+	__skb_queue_steal(&q, &nr->recvq);
+	spin_unlock_bh(&nr->recvq.lock);
+
+	used_idx = get_index(&nr->recv.used->idx);
+
+	while (nr->last_recv_avail != get_index(&nr->recv.avail->idx)
+	       && !skb_queue_empty(&q)) {
+		struct iovec iov[1+MAX_SKB_FRAGS];
+		struct virtio_net_hdr gso = { 0 }; /* no info leak */
+		struct skb_shared_info *sinfo;
+		unsigned int len, num, head;
+		struct vring_used_elem __user *used;
+
+		num = nr->last_recv_avail % nr->recv.num;
+		head = get_index(&nr->recv.avail->ring[num]);
+		err = get_iovec(nr, &nr->recv, head, iov, &len);
+		if (err < 0)
+			goto requeue;
+		num = err;
+
+		/* Progress to next descriptor. */
+		nr->last_recv_avail++;
+
+		skb = __skb_dequeue(&q);
+		sinfo = skb_shinfo(skb);
+
+		/* FIXME: we could stash this descriptor and go looking for a
+		 * better-sized one.  That would allow them to mix different
+		 * buffer sizes for efficiency. */
+		if (unlikely(len < sizeof(gso) + skb->len)) {
+			nr->dev->stats.tx_aborted_errors++;
+			err = -ENOBUFS; /* PS. You suck! */
+			goto free_skb;
+		}
+
+		if (skb_is_gso(skb)) {
+			/* This is a hint as to how much should be linear. */
+			gso.hdr_len = skb_headlen(skb);
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		err = memcpy_toiovec(iov, (void *)&gso, sizeof(gso));
+		if (err) {
+			nr->dev->stats.tx_fifo_errors++;
+			goto free_skb;
+		}
+
+		err = skb_copy_datagram_iovec(skb, 0, iov, skb->len);
+		if (err) {
+			nr->dev->stats.tx_fifo_errors++;
+			goto free_skb;
+		}
+
+		/* Put this in the used entries. */
+		used = &nr->recv.used->ring[(used_idx + num_copied)
+					    % nr->recv.num];
+		if (put_user(head, &used->id) != 0
+		    || put_user(sizeof(gso) + skb->len, &used->len) != 0) {
+			nr->dev->stats.tx_fifo_errors++;
+			goto free_skb;
+		}
+		estimate += estimate_descriptors(skb);
+		kfree_skb(skb);
+		num_copied++;
+	}
+
+	/* Make sure userspace sees used entries before index update.
+	 * Currently this is overkill since presumably it's doing the
+	 * write() which invoked us here, but it's good
+	 * futureproofing. */
+	wmb();
+
+	put_user(used_idx + num_copied, &nr->recv.used->idx);
+
+	err = num_copied;
+	goto requeue;
+
+free_skb:
+	kfree_skb(skb);
+	nr->dev->stats.tx_errors++;
+requeue:
+	/* We now have to put back any we didn't send.  Maintain order! */
+	spin_lock_bh(&nr->recvq.lock);
+	while ((skb = __skb_dequeue_tail(&q)) != NULL)
+		__skb_queue_head(&nr->recvq, skb);
+	spin_unlock_bh(&nr->recvq.lock);
+	atomic_sub(estimate, &nr->recvq_estimate);
+	netif_wake_queue(nr->dev);
+	return err;
+}
+
+/* This will fail if they give us a crazy iovec, but that's their own fault. */
+static int get_user_skb_frags(const struct iovec *iv, size_t count,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	for (i = 0; i < count; i++) {
+		int n, npages;
+		unsigned long base, len;
+		base = (unsigned long)iv[i].iov_base;
+		len = (unsigned long)iv[i].iov_len;
+
+		if (len == 0)
+			continue;
+
+		/* How many pages will this take? */
+		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* Transfer pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			f[num_pg].page_offset = offset_in_page(base);
+			f[num_pg].size = min(len,
+					     PAGE_SIZE - f[num_pg].page_offset);
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+
+		if (unlikely(n != npages)) {
+			err = -EFAULT;
+			goto fail;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+/* For GSO packets, it's worth pinning the userspace pages. */
+static struct sk_buff *map_gso_skb(struct netring_struct *nr,
+				   struct iovec *iov, size_t count, size_t len,
+				   unsigned int head)
+{
+	struct sk_buff *skb;
+	struct skb_shared_info *sinfo;
+	struct skb_shinfo_netring *shn;
+	struct virtio_net_hdr gso;
+	int err;
+
+	/* Don't get silly with the lengths. */
+	if (len < sizeof(gso) || len > sizeof(gso) + 65536)
+		return ERR_PTR(-EINVAL);
+
+	err = memcpy_fromiovec((void *)&gso, iov, sizeof(gso));
+	if (err < 0)
+		return ERR_PTR(err);
+	len -= sizeof(gso);
+
+	/* If you recommend an amount to copy, it must be < len. */
+	if (gso.hdr_len > len)
+		return ERR_PTR(-EINVAL);
+	len -= gso.hdr_len;
+
+	/* Can't have scatter gather packet already csummed. */
+	if (len && !(gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
+		return ERR_PTR(-EINVAL);
+
+	skb = alloc_skb_data_destructor(NET_IP_ALIGN + gso.hdr_len,
+					GFP_KERNEL, skb_finished,
+					sizeof(*shn));
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	skb_reserve(skb, NET_IP_ALIGN);
+	/* Don't let the device go away until destructor called. */
+	dev_hold(nr->dev);
+	shn = (struct skb_shinfo_netring *)(skb_shinfo(skb) + 1);
+	shn->nr = nr;
+	shn->descriptor = head;
+	shn->len = sizeof(gso) + gso.hdr_len + len;
+
+	sinfo = skb_shinfo(skb);
+	sinfo->gso_size = gso.gso_size;
+	sinfo->gso_type = SKB_GSO_DODGY;
+	switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+	case VIRTIO_NET_HDR_GSO_TCPV4:
+		sinfo->gso_type |= SKB_GSO_TCPV4;
+		break;
+	case VIRTIO_NET_HDR_GSO_TCPV6:
+		sinfo->gso_type |= SKB_GSO_TCPV6;
+		break;
+	case VIRTIO_NET_HDR_GSO_UDP:
+		sinfo->gso_type |= SKB_GSO_UDP;
+		break;
+	default:
+		err = -EINVAL;
+		goto free_skb;
+	}
+
+	if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+	/* Copy in the header. */
+	if (memcpy_fromiovec(skb_put(skb, gso.hdr_len), iov, gso.hdr_len)) {
+		err = -EFAULT;
+		goto free_skb;
+	}
+
+	err = get_user_skb_frags(iov, count, sinfo->frags);
+	if (err < 0)
+		goto free_skb;
+
+	sinfo->nr_frags = err;
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += len;
+	pr_debug("skb->data_len = %u, skb->len = %u, skb->truesize = %u\n",
+		 skb->data_len, skb->len, skb->truesize);
+
+	return skb;
+
+free_skb:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
+/* FIXME: Frob VRING_USED_F_NO_NOTIFY and VRING_AVAIL_F_NO_INTERRUPT. */
+static int xmit_buffers(struct netring_struct *nr)
+{
+	struct iovec iov[1+MAX_SKB_FRAGS];
+	unsigned int len, num, head;
+	int err;
+
+	while (nr->last_xmit_avail != get_index(&nr->xmit.avail->idx)) {
+		struct sk_buff *skb;
+
+		num = nr->last_xmit_avail % nr->xmit.num;
+		head = get_index(&nr->xmit.avail->ring[num]);
+		err = get_iovec(nr, &nr->xmit, head, iov, &len);
+		if (err < 0)
+			return err;
+		num = err;
+
+		skb = map_gso_skb(nr, iov, num, len, head);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		skb->protocol = eth_type_trans(skb, nr->dev);
+		netif_rx_ni(skb);
+
+		nr->dev->last_rx = jiffies;
+		nr->dev->stats.rx_packets++;
+		nr->dev->stats.rx_bytes += len;
+		nr->last_xmit_avail++;
+	}
+	return 0;
+}
+
+/*
+ * Our poll() is a hack, in that it does work and clears state.  This avoids
+ * another syscall (such as read).
+ *
+ * Our two rings really are independent: each one can be ready for reading (ie.
+ * some buffers have been used) or writing (ie. more buffers can be registered).
+ * Ideally, we'd have a 'ringfd' for each one: we don't.
+ *
+ * So, we simply mark it readable when either some receive queue buffers
+ * have been used (ie. you have input), or some xmit buffers have been used
+ * (ie. you can xmit more).
+ */
+static unsigned int netring_poll(struct file *file,
+				 struct poll_table_struct *poll)
+{
+	struct netring_struct *nr = file->private_data;
+	int ret, mask = 0;
+
+	if (!nr)
+		return POLLERR;
+
+	/* Now we're in the correct context, try copying recv skbs. */
+	ret = copy_recv_skbs(nr);
+	if (ret < 0)
+		return POLLERR;
+
+	poll_wait(file, &nr->poll_wait, poll);
+
+	/* If we copied buffers, it's readable. */
+	if (ret)
+		mask |= POLLIN | POLLRDNORM;
+	else {
+		/* More xmit used? */
+		ret = get_index(&nr->xmit.used->idx);
+		if (nr->last_xmit_used != ret) {
+			mask |= POLLIN | POLLRDNORM;
+			nr->last_xmit_used = ret;
+		}
+	}
+
+	return mask;
+}
+
+static ssize_t netring_xmit(struct file *file, const char __user *p,
+			    size_t len, loff_t *off)
+{
+	struct netring_struct *nr = file->private_data;
+	int err;
+
+	if (unlikely(!nr))
+		return -EINVAL;
+
+	err = xmit_buffers(nr);
+	return err ? err : (ssize_t)len;
+}
+
+static int netring_close(struct inode *inode, struct file *file)
+{
+	struct netring_struct *nr = file->private_data;
+
+	if (nr) {
+		unregister_netdev(nr->dev);
+		skb_queue_purge(&nr->recvq);
+		kunmap(nr->xmit_used_page);
+		put_page(nr->xmit_used_page);
+		free_netdev(nr->dev);
+	}
+	return 0;
+}
+
+static const struct file_operations netring_fops = {
+	.owner = THIS_MODULE,
+	.poll = netring_poll,
+	.write = netring_xmit,
+	.ioctl = netring_ioctl,
+	.release = netring_close,
+};
+
+static int __init init(void)
+{
+	major = register_chrdev(0, KBUILD_MODNAME, &netring_fops);
+	return major < 0 ? major : 0;
+}
+
+static void __exit fini(void)
+{
+	unregister_chrdev(major, KBUILD_MODNAME);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff -r 98387866de41 include/linux/if_netring.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/if_netring.h	Tue Apr 01 13:30:24 2008 +1000
@@ -0,0 +1,45 @@
+#ifndef __IF_NETRING_H
+#define __IF_NETRING_H
+
+#include <linux/types.h>
+#include <linux/if.h>		/* for IFNAMSIZ */
+#include <linux/ioctl.h>
+
+/* Can we handle partial csums on received packets? */
+#define IF_NETRING_F_RECV_CSUM		0x01
+/* Can we handle TSOv4 frames? (must also specify RECV_CSUM) */
+#define IF_NETRING_F_RECV_TSO4		0x02
+/* Can we handle TSOv6 frames? (must also specify RECV_CSUM) */
+#define IF_NETRING_F_RECV_TSO6		0x04
+/* Can we handle UFO frames? (must also specify RECV_CSUM) */
+#define IF_NETRING_F_RECV_UFO		0x08
+/* Can we handle those TSO frames with ECN? */
+#define IF_NETRING_F_RECV_TSO_ECN	0x10
+
+/* This tells the kernel about our virtio_rings. */
+struct netring {
+	/* The addresses of the receive and send virtio_rings.*/
+	__u64 recv_addr, xmit_addr;
+
+	/* The offset to apply to the addresses in the rings, and the
+	 * highest value to allow (before offset).  This allows the
+	 * caller to restrict what buffers can be put in the rings,
+	 * and also offset them (useful for guests whose "physical"
+	 * address space is mapped into this address space. */
+	__u64 offset, limit;
+
+	/* The actual number of elements in the virtio_rings. */
+	__u32 recv_num, xmit_num;
+
+	/* What can this userspace handle? */
+	__u64 flags;
+};
+
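+/* Usage: NETRINGBIND binds the rings; write() on the fd tells the kernel
+ * about new xmit buffers; poll() says when buffers have been used. */
+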
+/* Ioctl defines: "ioctls are an AEgly interface". */
+#define NETRINGBIND   _IOW(0xAE, 0, struct netring)
+#define NETRINGGETIF  _IOR(0xAE, 1, char[IFNAMSIZ])
+
+#endif /* __IF_NETRING_H */
