tun: vringfd xmit support.

This patch modifies tun to allow a vringfd to supply the send
buffers.  The user does a write to push out packets from the ring.
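
The intended userspace flow looks roughly like this (a sketch only:
vringfd(), ring_addr, RING_NUM and last_used are hypothetical stand-ins
for the interface in the companion vringfd patch):

	int tap, xring;
	struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };

	tap = open("/dev/net/tun", O_RDWR);
	strcpy(ifr.ifr_name, "tap0");
	ioctl(tap, TUNSETIFF, &ifr);

	/* Create a ring (hypothetical vringfd call) and attach it
	 * as tun's xmit ring. */
	xring = vringfd(ring_addr, RING_NUM, &last_used);
	ioctl(tap, TUNSETXMITVRING, xring);

	/* Add buffers to the ring; a write pushes them out. */
	write(xring, NULL, 0);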

As on the receive side, we use 'struct virtio_net_hdr' to allow
userspace to send GSO packets.  Its hdr_len field hints how much to
copy into the skb's linear data; the remaining pages are pinned and
attached as skb fragments.
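
For example, userspace might fill the header for a TCPv4 GSO packet
like this (a sketch: ip_hlen, tcp_hlen and mss stand in for values
describing the actual packet):

	struct virtio_net_hdr h = {
		.flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
		.gso_type    = VIRTIO_NET_HDR_GSO_TCPV4,
		.hdr_len     = ETH_HLEN + ip_hlen + tcp_hlen, /* copy hint */
		.gso_size    = mss,
		.csum_start  = ETH_HLEN + ip_hlen,
		.csum_offset = offsetof(struct tcphdr, check),
	};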

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/tun.c      |  417 +++++++++++++++++++++++++++++++++++++++++--------
 include/linux/if_tun.h |    1 
 2 files changed, 358 insertions(+), 60 deletions(-)

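Each xmit buffer is consumed as a 'struct virtio_net_hdr' followed by
the packet data:

	[ struct virtio_net_hdr ][ packet bytes ... ]

A fully-copied packet's buffer is marked used immediately; a buffer
whose tail was mapped as fragments is only marked used once the skb's
data is freed (via the shinfo destructor), so userspace must not reuse
those pages before then.
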
diff -r f87969ce25ac drivers/net/tun.c
--- a/drivers/net/tun.c	Thu Apr 24 12:02:08 2008 +1000
+++ b/drivers/net/tun.c	Thu Apr 24 12:02:44 2008 +1000
@@ -66,6 +66,8 @@
 #include <linux/vring.h>
 #include <linux/virtio_net.h>
 #include <linux/file.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -104,8 +106,11 @@ struct tun_struct {
 	u32 chr_filter[2];
 	u32 net_filter[2];
 
-	struct vring_info	*inring;
-	struct file		*infile;
+	/* List of user-mapped xmit buffers whose skbs have been freed */
+	spinlock_t		outring_lock;
+	struct list_head	outring_finished;
+	struct vring_info	*inring, *outring;
+	struct file		*infile, *outfile;
 
 #ifdef TUN_DEBUG
 	int debug;
@@ -266,6 +271,189 @@ static void tun_net_init(struct net_devi
 	}
 }
 
+/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
+ * Users will learn not to do that. */
+static int get_user_skb_frags(struct iovec *iv, size_t totlen,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	/* We loop through the whole iovec, appending pages to skb->frags. */
+	while (totlen) {
+		int n, npages;
+		unsigned long base = (unsigned long)iv->iov_base;
+
+		if (iv->iov_len == 0) {
+			iv++;
+			continue;
+		}
+
+		/* How many pages will this iovec element take? */
+		npages = 1 + (base+iv->iov_len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* If we didn't get all the pages we expect, free them all */
+		if (unlikely(n != npages)) {
+			release_pages(pages, n, 1);
+			err = -EFAULT;
+			goto fail;
+		}
+
+		/* Append these pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			if (j == 0) {
+				f[num_pg].page_offset = offset_in_page(base);
+				f[num_pg].size = min_t(unsigned long,
+						       iv->iov_len, PAGE_SIZE -
+						       f[num_pg].page_offset);
+			} else {
+				f[num_pg].page_offset = 0;
+				f[num_pg].size = min_t(unsigned long,
+						       iv->iov_len, PAGE_SIZE);
+			}
+			iv->iov_len -= f[num_pg].size;
+			totlen -= f[num_pg].size;
+			base += f[num_pg].size;
+			iv->iov_base += f[num_pg].size;
+			num_pg++;
+		}
+		iv->iov_base = (void __user *)base;
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+/* We actually store this at the head of the skb. */
+struct skb_tun_hdr {
+	struct list_head list;
+	struct tun_struct *tun;
+	unsigned int id;
+};
+
+/**
+ * get_user_skb - get packet from user space buffer.
+ * @tun: the tun device
+ * @iv: the userspace iovec
+ * @copylen: the recommended amount to copy
+ * @len: the total length of the iovec
+ *
+ * If len > copylen, the rest of the userspace vectors will be pinned;
+ * skb_shinfo(skb)->nr_frags will be non-zero, and there will be room for
+ * a 'struct skb_tun_hdr' at the front of the skb data for the destructor.
+ *
+ * Returns an skb, or an ERR_PTR() on failure.
+ */
+static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
+				    size_t copylen, size_t len)
+{
+	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct sk_buff *skb;
+	size_t align = 0, extra = 0;
+	int err;
+
+	if (!(tun->flags & TUN_NO_PI)) {
+		if (len < sizeof(pi)) {
+			err = -EINVAL;
+			goto fail;
+		}
+		len -= sizeof(pi);
+
+		if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
+			err = -EFAULT;
+			goto fail;
+		}
+		if (copylen > len)
+			copylen = len;
+	}
+
+	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
+		align = NET_IP_ALIGN;
+		if (unlikely(copylen < ETH_HLEN)) {
+			if (len < ETH_HLEN) {
+				err = -EINVAL;
+				goto fail;
+			}
+			copylen = ETH_HLEN;
+		}
+	}
+
+	/* Allocate extra room for the skb_tun_hdr if we will need it. */
+	if (copylen != len)
+		extra = sizeof(struct skb_tun_hdr);
+
+	skb = alloc_skb(extra + copylen + align, GFP_KERNEL);
+	if (!skb) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (extra + align)
+		skb_reserve(skb, extra + align);
+
+	if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
+		err = -EFAULT;
+		goto free_skb;
+	}
+
+	switch (tun->flags & TUN_TYPE_MASK) {
+	case TUN_TUN_DEV:
+		skb_reset_mac_header(skb);
+		skb->protocol = pi.proto;
+		skb->dev = tun->dev;
+		break;
+	case TUN_TAP_DEV:
+		skb->protocol = eth_type_trans(skb, tun->dev);
+		break;
+	}
+
+	if (tun->flags & TUN_NOCHECKSUM)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* Anything left gets put into frags. */
+	if (copylen != len) {
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
+		if (err < 0)
+			goto free_skb;
+		sinfo->nr_frags = err;
+		skb->data_len += len - copylen;
+		skb->len += len - copylen;
+		skb->truesize += len - copylen;
+	}
+	tun->dev->last_rx = jiffies;
+
+	tun->dev->stats.rx_packets++;
+	tun->dev->stats.rx_bytes += len;
+
+	return skb;
+
+free_skb:
+	kfree_skb(skb);
+fail:
+	tun->dev->stats.rx_dropped++;
+	return ERR_PTR(err);
+}
+
 #if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
 /* Returns whether there are queued buffers */
 static bool pending_recv_skbs(void *_tun)
@@ -360,6 +548,120 @@ static struct vring_ops recvops = {
 	.pull = pull_recv_skbs,
 };
 
+/* Returns whether there are finished xmit buffers to hand back */
+static bool finished_xmit_buffers(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+
+	return !list_empty(&tun->outring_finished);
+}
+
+/* Returns 0, or negative errno. */
+static int pull_finished_buffers(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	struct skb_tun_hdr *i;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&tun->outring_lock);
+	list_splice_init(&tun->outring_finished, &list);
+	spin_unlock_irq(&tun->outring_lock);
+
+	while (!list_empty(&list)) {
+		i = list_first_entry(&list, struct skb_tun_hdr, list);
+
+		list_del(&i->list);
+
+		/* This was an xmit packet, so we wrote 0 bytes to it. */
+		vring_used_buffer(tun->outring, i->id, 0);
+
+		/* Release device.  Keeping this reference blocks file close. */
+		dev_put(tun->dev);
+
+		/* i == skb->head. */
+		kfree(i);
+	}
+	return 0;
+}
+
+/* We are done with this skb data: put it in the used pile. */
+static void shinfo_finished(struct skb_shared_info *sinfo)
+{
+	struct skb_tun_hdr *tunh = (void *)skb_shinfo_to_head(sinfo);
+	unsigned long flags;
+
+	spin_lock_irqsave(&tunh->tun->outring_lock, flags);
+	list_add(&tunh->list, &tunh->tun->outring_finished);
+	spin_unlock_irqrestore(&tunh->tun->outring_lock, flags);
+
+	vring_wake(tunh->tun->outring);
+}
+
+static int xmit_packets(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	struct iovec iov[2+MAX_SKB_FRAGS];
+	int id, err, wake = 0;
+	unsigned long len;
+
+	while ((id = vring_get_buffer(tun->outring, NULL, 0, NULL,
+				      iov, ARRAY_SIZE(iov), &len)) > 0) {
+		struct virtio_net_hdr h;
+		struct sk_buff *skb;
+
+		if (unlikely(len < sizeof(h)))
+			return -EINVAL;
+
+		err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
+		if (unlikely(err))
+			return -EFAULT;
+
+		len -= sizeof(h);
+		if (h.hdr_len > len)
+			return -EINVAL;
+
+		/* Without GSO, we copy the entire packet. */
+		if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
+			h.hdr_len = len;
+
+		skb = get_user_skb(tun, iov, h.hdr_len, len);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		    !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		/* If it has fragments, set up destructor for later. */
+		if (skb_shinfo(skb)->nr_frags) {
+			struct skb_tun_hdr *tunh = (void *)skb->head;
+			tunh->id = id;
+			tunh->tun = tun;
+			dev_hold(tun->dev);
+			skb_shinfo(skb)->destructor = shinfo_finished;
+		} else {
+			/* Fully copied: mark the buffer used right away. */
+			vring_used_buffer(tun->outring, id, 0);
+			wake = 1;
+		}
+		netif_rx_ni(skb);
+	}
+
+	if (wake)
+		vring_wake(tun->outring);
+
+	/* 0 or error. */
+	return id;
+}
+
+static struct vring_ops xmitops = {
+	.push = xmit_packets,
+	.needs_pull = finished_xmit_buffers,
+	.pull = pull_finished_buffers,
+};
+
 static int set_recv_vring(struct tun_struct *tun, int fd)
 {
 	int err;
@@ -396,9 +698,47 @@ static void unset_vrings(struct tun_stru
 		vring_unset_ops(tun->inring);
 		fput(tun->infile);
 	}
+	if (tun->outring) {
+		vring_unset_ops(tun->outring);
+		fput(tun->outfile);
+	}
+}
+
+static int set_xmit_vring(struct tun_struct *tun, int fd)
+{
+	int err;
+
+	if (tun->outring)
+		return -EBUSY;
+
+	tun->outfile = fget(fd);
+	if (!tun->outfile)
+		return -EBADF;
+
+	tun->outring = vring_get(tun->outfile);
+	if (!tun->outring) {
+		err = -EBADF;
+		goto put;
+	}
+
+	err = vring_set_ops(tun->outring, &xmitops, tun);
+	if (err) {
+		tun->outring = NULL;
+		goto put;
+	}
+	return 0;
+
+put:
+	fput(tun->outfile);
+	tun->outfile = NULL;
+	return err;
 }
 #else /* ... !CONFIG_VRING */
 static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+static int set_xmit_vring(struct tun_struct *tun, int fd)
 {
 	return -ENOTTY;
 }
@@ -429,74 +769,26 @@ static unsigned int tun_chr_poll(struct 
 	return mask;
 }
 
-/* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
-{
-	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
-	struct sk_buff *skb;
-	size_t len = count, align = 0;
-
-	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
-			return -EINVAL;
-
-		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
-			return -EFAULT;
-	}
-
-	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
-		align = NET_IP_ALIGN;
-		if (unlikely(len < ETH_HLEN))
-			return -EINVAL;
-	}
-
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
-		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
-	}
-
-	switch (tun->flags & TUN_TYPE_MASK) {
-	case TUN_TUN_DEV:
-		skb_reset_mac_header(skb);
-		skb->protocol = pi.proto;
-		skb->dev = tun->dev;
-		break;
-	case TUN_TAP_DEV:
-		skb->protocol = eth_type_trans(skb, tun->dev);
-		break;
-	};
-
-	if (tun->flags & TUN_NOCHECKSUM)
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	netif_rx_ni(skb);
-	tun->dev->last_rx = jiffies;
-
-	tun->dev->stats.rx_packets++;
-	tun->dev->stats.rx_bytes += len;
-
-	return count;
-}
-
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 			      unsigned long count, loff_t pos)
 {
 	struct tun_struct *tun = iocb->ki_filp->private_data;
+	size_t len;
+	struct sk_buff *skb;
 
 	if (!tun)
 		return -EBADFD;
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
+	len = iov_length(iv, count);
+
+	skb = get_user_skb(tun, (struct iovec *)iv, len, len);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	netif_rx_ni(skb);
+	return len;
 }
 
 /* Put packet to the user space buffer */
@@ -624,6 +916,8 @@ static void tun_setup(struct net_device 
 	tun->owner = -1;
 	tun->group = -1;
 	tun->inring = NULL;
+	spin_lock_init(&tun->outring_lock);
+	INIT_LIST_HEAD(&tun->outring_finished);
 
 	dev->open = tun_net_open;
 	dev->hard_start_xmit = tun_net_xmit;
@@ -840,6 +1134,9 @@ static int tun_chr_ioctl(struct inode *i
 
 	case TUNSETRECVVRING:
 		return set_recv_vring(tun, arg);
+
+	case TUNSETXMITVRING:
+		return set_xmit_vring(tun, arg);
 
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
diff -r f87969ce25ac include/linux/if_tun.h
--- a/include/linux/if_tun.h	Thu Apr 24 12:02:08 2008 +1000
+++ b/include/linux/if_tun.h	Thu Apr 24 12:02:44 2008 +1000
@@ -43,6 +43,7 @@
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
 #define TUNSETRECVVRING _IOW('T', 207, int)
+#define TUNSETXMITVRING _IOW('T', 208, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
