tun: vringfd receive support.

This patch modifies tun to allow a vringfd to supply the receive
buffers.  Because we can't copy to userspace in bh context, we queue
packets as normal and then use the vring "pull" hook to actually do
the copy.

We prepend a struct virtio_net_hdr to each packet in the ring so that
userspace can receive GSO packets in the future (at the moment, the tun
driver doesn't tell the stack it can handle them, so these cases are
never taken).  Enabling that will require userspace to tell us that it
can handle such packets.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/Kconfig    |    2 
 drivers/net/tun.c      |  163 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/if_tun.h |    1 
 3 files changed, 166 insertions(+)

diff -r 4e24df465710 drivers/net/Kconfig
--- a/drivers/net/Kconfig	Mon May 05 14:58:01 2008 +1000
+++ b/drivers/net/Kconfig	Mon May 05 14:59:07 2008 +1000
@@ -120,6 +120,8 @@ config TUN
 config TUN
 	tristate "Universal TUN/TAP device driver support"
 	select CRC32
+# VRING is optional, but if VRING is built as a module, TUN must be one too.
+	depends on !VRING || VRING
 	---help---
 	  TUN/TAP provides packet reception and transmission for user space
 	  programs.  It can be viewed as a simple Point-to-Point or Ethernet
diff -r 4e24df465710 drivers/net/tun.c
--- a/drivers/net/tun.c	Mon May 05 14:58:01 2008 +1000
+++ b/drivers/net/tun.c	Mon May 05 14:59:07 2008 +1000
@@ -63,6 +63,9 @@
 #include <linux/if_tun.h>
 #include <linux/crc32.h>
 #include <linux/nsproxy.h>
+#include <linux/vring.h>
+#include <linux/virtio_net.h>
+#include <linux/file.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -100,6 +103,9 @@ struct tun_struct {
 	u8 dev_addr[ETH_ALEN];
 	u32 chr_filter[2];
 	u32 net_filter[2];
+
+	struct vring_info	*inring;
+	struct file		*infile;
 
 #ifdef TUN_DEBUG
 	int debug;
@@ -164,6 +170,10 @@ static int tun_net_xmit(struct sk_buff *
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
+
+	if (tun->inring)
+		vring_wake(tun->inring);
+
 	wake_up_interruptible(&tun->read_wait);
 	return 0;
 
@@ -255,6 +265,190 @@ static void tun_net_init(struct net_devi
 		break;
 	}
 }
+
+#if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
+/*
+ * vring "can_pull" hook: true only when there is both a queued skb to
+ * deliver and a buffer available in the receive ring to put it in.
+ */
+static bool pending_recv_skbs(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+
+	return !skb_queue_empty(&tun->readq) && vring_has_buffer(tun->inring);
+}
+
+/*
+ * vring "pull" hook: copy queued skbs into buffers taken from the receive
+ * ring, each packet prefixed by a struct virtio_net_hdr.
+ *
+ * Stops at the first skb that cannot be delivered and puts it back at the
+ * head of the read queue.  Returns 0, or negative errno.
+ *
+ * NOTE(review): when delivery fails after vring_get_buffer() succeeded, the
+ * buffer is neither marked used nor handed back here -- confirm that the
+ * vring core copes with that.
+ */
+static int pull_recv_skbs(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	int err = 0, num_copied = 0;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&tun->readq)) != NULL) {
+		struct iovec iov[2+MAX_SKB_FRAGS];
+		struct virtio_net_hdr gso = { 0 }; /* no info leak */
+		unsigned long len;
+		int id;
+
+		id = vring_get_buffer(tun->inring, iov, ARRAY_SIZE(iov), &len,
+				      NULL, 0, NULL);
+		/* Nothing usable from the ring: id is what we return. */
+		if (id <= 0) {
+			err = id;
+			break;
+		}
+
+		/* FIXME: we could stash this descriptor and go looking for a
+		 * better-sized one.  That would allow them to mix different
+		 * buffer sizes for efficiency. */
+		if (unlikely(len < sizeof(gso) + skb->len)) {
+			tun->dev->stats.tx_aborted_errors++;
+			err = -ENOBUFS; /* PS. You suck! */
+			break;
+		}
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			gso.hdr_len = skb_headlen(skb);
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		/* Header first, then the packet itself. */
+		err = memcpy_toiovec(iov, (void *)&gso, sizeof(gso));
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		err = skb_copy_datagram_iovec(skb, 0, iov, skb->len);
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		err = vring_used_buffer(tun->inring, id, sizeof(gso)+skb->len);
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		/* Delivered, and nobody else holds this skb: free it. */
+		kfree_skb(skb);
+		num_copied++;
+	}
+
+	/* We took an skb, but ring isn't ready for it.  Put it back */
+	if (skb)
+		skb_queue_head(&tun->readq, skb);
+
+	if (num_copied)
+		netif_wake_queue(tun->dev);
+
+	return err;
+}
+
+/* vring callbacks for the receive ring (installed by set_recv_vring()). */
+static struct vring_ops recvops = {
+	.can_pull = pending_recv_skbs,
+	.pull = pull_recv_skbs,
+};
+
+/*
+ * Attach the vring behind file descriptor @fd as the receive ring.
+ *
+ * On success a reference to the file is kept in tun->infile until
+ * unset_vrings().  Returns 0, -EBUSY if a receive ring is already set,
+ * -EBADF if @fd is not open or vring_get() rejects it, or the error
+ * returned by vring_set_ops().
+ */
+static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	int err;
+
+	if (tun->inring)
+		return -EBUSY;
+
+	tun->infile = fget(fd);
+	if (!tun->infile)
+		return -EBADF;
+
+	tun->inring = vring_get(tun->infile);
+	if (!tun->inring) {
+		err = -EBADF;
+		goto put;
+	}
+
+	err = vring_set_ops(tun->inring, &recvops, tun);
+	if (err) {
+		tun->inring = NULL;
+		goto put;
+	}
+	return 0;
+
+put:
+	fput(tun->infile);
+	tun->infile = NULL;
+	return err;
+}
+
+/*
+ * Detach the receive ring, if any, and drop the file reference taken by
+ * set_recv_vring().
+ */
+static void unset_vrings(struct tun_struct *tun)
+{
+	if (tun->inring) {
+		vring_unset_ops(tun->inring);
+		/*
+		 * Forget the ring and file once released: tun_net_xmit() and
+		 * set_recv_vring() test tun->inring, and calling this again
+		 * must not fput() a second time.
+		 */
+		tun->inring = NULL;
+		fput(tun->infile);
+		tun->infile = NULL;
+	}
+}
+#else /* ... !CONFIG_VRING */
+/* No vring support built in: the ioctl is not available. */
+static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+
+static void unset_vrings(struct tun_struct *tun)
+{
+}
+#endif
 
 /* Character device part */
 
@@ -471,6 +628,7 @@ static void tun_setup(struct net_device 
 
 	tun->owner = -1;
 	tun->group = -1;
+	tun->inring = NULL;
 
 	dev->open = tun_net_open;
 	dev->hard_start_xmit = tun_net_xmit;
@@ -692,6 +850,9 @@ static int tun_chr_ioctl(struct inode *i
 		break;
 #endif
 
+	case TUNSETRECVVRING:
+		return set_recv_vring(tun, arg);
+
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
 		if (copy_to_user( argp, &ifr, sizeof ifr))
@@ -812,6 +973,8 @@ static int tun_chr_close(struct inode *i
 	DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);
 
 	tun_chr_fasync(-1, file, 0);
+
+	unset_vrings(tun);
 
 	rtnl_lock();
 
diff -r 4e24df465710 include/linux/if_tun.h
--- a/include/linux/if_tun.h	Mon May 05 14:58:01 2008 +1000
+++ b/include/linux/if_tun.h	Mon May 05 14:59:07 2008 +1000
@@ -42,6 +42,7 @@
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
+#define TUNSETRECVVRING _IOW('T', 207, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
