This implements partial checksum and GSO support for tun/tap.

We use the virtio_net_hdr: it is an ABI already and designed to
encapsulate such metadata as GSO and partial checksums.

IFF_SEND_GSO means you will write a 'struct tun_gso_hdr' (the same ABI
as virtio_net_hdr) at the start of each packet.  You can always write
packets with partial checksum and gso to the tap device using this header.

IFF_RECV_CSUM means you can handle reading packets with partial
checksums.  If IFF_RECV_GSO is also set, it means you can handle
reading (all types of) GSO packets.

Note that there is no easy way to detect if these flags are supported:
see next patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/tun.c      |  231 +++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/if_tun.h |   26 +++++
 2 files changed, 240 insertions(+), 17 deletions(-)

diff -r c41f05a27969 drivers/net/tun.c
--- a/drivers/net/tun.c	Fri Mar 28 13:42:24 2008 +1100
+++ b/drivers/net/tun.c	Sat Mar 29 21:30:00 2008 +1100
@@ -238,35 +238,167 @@ static unsigned int tun_chr_poll(struct 
 	return mask;
 }
 
+static struct sk_buff *copy_user_skb(size_t align, struct iovec *iv, size_t len)
+{
+	struct sk_buff *skb;
+
+	if (!(skb = alloc_skb(len + align, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (align)
+		skb_reserve(skb, align);
+
+	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
+		kfree_skb(skb);
+		return ERR_PTR(-EFAULT);
+	}
+	return skb;
+}
+
+/* This will fail if they give us a crazy iovec, but that's their own fault. */
+static int get_user_skb_frags(const struct iovec *iv, size_t count,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, num_pg = 0;
+	int err;
+
+	for (i = 0; i < count; i++) {
+		unsigned long len = (unsigned long)iv[i].iov_len;
+		void __user *base = iv[i].iov_base;
+
+		while (len) {
+			if (num_pg == MAX_SKB_FRAGS) {
+				err = -ENOSPC;
+				goto fail;
+			}
+			f[num_pg].page = alloc_page(GFP_KERNEL);
+			if (!f[num_pg].page) {
+				err = -ENOMEM;
+				goto fail;
+			}
+			f[num_pg].page_offset = 0;
+			f[num_pg].size = min(len, PAGE_SIZE);
+			if (copy_from_user(page_address(f[num_pg].page),
+					   base, f[num_pg].size)) {
+				__free_page(f[num_pg].page);
+				err = -EFAULT;
+				goto fail;
+			}
+
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+	}
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	return err;
+}
+
+/* For GSO packets, we don't try to alloc_skb, but individual pages. */
+static struct sk_buff *copy_gso_skb(const struct tun_gso_hdr *gso,
+				   size_t align, struct iovec *iv,
+				   size_t count, size_t len)
+{
+	struct sk_buff *skb;
+	struct skb_shared_info *sinfo;
+	int err;
+
+	if (len < gso->hdr_len)
+		return ERR_PTR(-EINVAL);
+	len -= gso->hdr_len;
+
+	/* GSO implies csum needed. */
+	if (!(gso->flags & TUN_GSO_HDR_F_NEEDS_CSUM))
+		return ERR_PTR(-EINVAL);
+
+	if (!(skb = alloc_skb(align + gso->hdr_len, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (align)
+		skb_reserve(skb, align);
+
+	sinfo = skb_shinfo(skb);
+	sinfo->gso_size = gso->gso_size;
+	sinfo->gso_type = SKB_GSO_DODGY;
+	switch (gso->gso_type & ~TUN_GSO_HDR_GSO_ECN) {
+	case TUN_GSO_HDR_GSO_TCPV4:
+		sinfo->gso_type |= SKB_GSO_TCPV4;
+		break;
+	case TUN_GSO_HDR_GSO_TCPV6:
+		sinfo->gso_type |= SKB_GSO_TCPV6;
+		break;
+	case TUN_GSO_HDR_GSO_UDP:
+		sinfo->gso_type |= SKB_GSO_UDP;
+		break;
+	default:
+		err = -EINVAL;
+		goto fail;
+	}
+
+	if (gso->gso_type & TUN_GSO_HDR_GSO_ECN)
+		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+	/* Copy in the header. */
+	if (memcpy_fromiovec(skb_put(skb, gso->hdr_len), iv, gso->hdr_len)) {
+		err = -EFAULT;
+		goto fail;
+	}
+
+	err = get_user_skb_frags(iv, count, sinfo->frags);
+	if (err < 0)
+		goto fail;
+
+	sinfo->nr_frags = err;
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += len;
+
+	return skb;
+
+fail:
+	kfree_skb(skb);
+	return ERR_PTR(err);
+}
+
 /* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
+static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t num)
 {
 	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct tun_gso_hdr gso = { 0, TUN_GSO_HDR_GSO_NONE };
 	struct sk_buff *skb;
-	size_t len = count, align = 0;
+	size_t tot_len = iov_length(iv, num);
+	size_t len = tot_len, align = 0;
 
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
+		if ((len -= sizeof(pi)) > tot_len)
 			return -EINVAL;
 
 		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
+			return -EFAULT;
+	}
+	if (tun->flags & TUN_SEND_GSO) {
+		if ((len -= sizeof(gso)) > tot_len)
+			return -EINVAL;
+
+		if (memcpy_fromiovec((void *)&gso, iv, sizeof(gso)))
 			return -EFAULT;
 	}
 
 	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV)
 		align = NET_IP_ALIGN;
 
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
+	if (gso.gso_type != TUN_GSO_HDR_GSO_NONE)
+		skb = copy_gso_skb(&gso, align, iv, num, len);
+	else
+		skb = copy_user_skb(align, iv, len);
+
+	if (IS_ERR(skb)) {
 		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
+		return PTR_ERR(skb);
 	}
 
 	switch (tun->flags & TUN_TYPE_MASK) {
@@ -280,7 +412,13 @@ static __inline__ ssize_t tun_get_user(s
 		break;
 	};
 
-	if (tun->flags & TUN_NOCHECKSUM)
+	if (gso.flags & TUN_GSO_HDR_F_NEEDS_CSUM) {
+		if (!skb_partial_csum_set(skb, gso.csum_start, gso.csum_offset)) {
+			tun->dev->stats.rx_dropped++;
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+	} else if (tun->flags & TUN_NOCHECKSUM)
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	netif_rx_ni(skb);
@@ -289,7 +427,7 @@ static __inline__ ssize_t tun_get_user(s
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
 
-	return count;
+	return tot_len;
 }
 
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -302,7 +440,7 @@ static ssize_t tun_chr_aio_write(struct 
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
+	return tun_get_user(tun, (struct iovec *) iv, count);
 }
 
 /* Put packet to the user space buffer */
@@ -325,6 +463,40 @@ static __inline__ ssize_t tun_put_user(s
 		if (memcpy_toiovec(iv, (void *) &pi, sizeof(pi)))
 			return -EFAULT;
 		total += sizeof(pi);
+	}
+	/* Either of these means they expect a prepended struct tun_gso_hdr */
+	if (tun->flags & (TUN_RECV_CSUM|TUN_RECV_GSO)) {
+		struct tun_gso_hdr gso = { 0 }; /* no info leak */
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		if (skb_is_gso(skb)) {
+			gso.hdr_len = skb_transport_header(skb) - skb->data;
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = TUN_GSO_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = TUN_GSO_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = TUN_GSO_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= TUN_GSO_HDR_GSO_ECN;
+		} else
+			gso.gso_type = TUN_GSO_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = TUN_GSO_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} /* else flags everything is zero */
+
+		if ((len -= sizeof(gso)) < 0)
+			return -EINVAL;
+
+		if (memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))
+			return -EFAULT;
+		total += sizeof(gso);
 	}
 
 	len = min_t(int, skb->len, len);
@@ -512,6 +684,18 @@ static int tun_set_iff(struct file *file
 
 		tun_net_init(dev);
 
+		dev->features = 0;
+		/* If user can handle a csum, we can have features. */
+		if (ifr->ifr_flags & IFF_RECV_CSUM) {
+			dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM |
+					NETIF_F_HIGHDMA | NETIF_F_FRAGLIST;
+
+			/* CSUM handling is required for GSO, */
+			if (ifr->ifr_flags & IFF_RECV_GSO)
+				dev->features |= NETIF_F_TSO | NETIF_F_UFO |
+					NETIF_F_TSO_ECN | NETIF_F_TSO6;
+		}
+
 		if (strchr(dev->name, '%')) {
 			err = dev_alloc_name(dev, dev->name);
 			if (err < 0)
@@ -536,6 +720,21 @@ static int tun_set_iff(struct file *file
 		tun->flags |= TUN_ONE_QUEUE;
 	else
 		tun->flags &= ~TUN_ONE_QUEUE;
+
+	if (ifr->ifr_flags & IFF_SEND_GSO)
+		tun->flags |= TUN_SEND_GSO;
+	else
+		tun->flags &= ~TUN_SEND_GSO;
+
+	if (ifr->ifr_flags & IFF_RECV_CSUM)
+		tun->flags |= TUN_RECV_CSUM;
+	else
+		tun->flags &= ~TUN_RECV_CSUM;
+
+	if (ifr->ifr_flags & IFF_RECV_GSO)
+		tun->flags |= TUN_RECV_GSO;
+	else
+		tun->flags &= ~TUN_RECV_GSO;
 
 	file->private_data = tun;
 	tun->attached = 1;
diff -r c41f05a27969 include/linux/if_tun.h
--- a/include/linux/if_tun.h	Fri Mar 28 13:42:24 2008 +1100
+++ b/include/linux/if_tun.h	Sat Mar 29 21:30:00 2008 +1100
@@ -72,6 +72,9 @@ struct tun_struct {
 #define TUN_NO_PI	0x0040
 #define TUN_ONE_QUEUE	0x0080
 #define TUN_PERSIST 	0x0100	
+#define TUN_SEND_GSO	0x0200
+#define TUN_RECV_CSUM	0x0400
+#define TUN_RECV_GSO	0x0800
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
@@ -88,7 +91,11 @@ struct tun_struct {
 #define IFF_TAP		0x0002
 #define IFF_NO_PI	0x1000
 #define IFF_ONE_QUEUE	0x2000
-#define IFF_ALL_FLAGS (IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE)
+#define IFF_SEND_GSO	0x4000
+#define IFF_RECV_CSUM	0x8000
+#define IFF_RECV_GSO	0x0800
+#define IFF_ALL_FLAGS (IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | \
+		       IFF_SEND_GSO | IFF_RECV_CSUM | IFF_RECV_GSO)
 
 struct tun_pi {
 	unsigned short flags;
@@ -96,4 +103,21 @@ struct tun_pi {
 };
 #define TUN_PKT_STRIP	0x0001
 
+/* The TUN GSO header is the same ABI as virtio_net_hdr (linux/virtio_net.h) */
+struct tun_gso_hdr
+{
+#define TUN_GSO_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
+	__u8 flags;
+#define TUN_GSO_HDR_GSO_NONE		0	// Not a GSO frame
+#define TUN_GSO_HDR_GSO_TCPV4		1	// GSO frame, IPv4 TCP (TSO)
+#define TUN_GSO_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
+#define TUN_GSO_HDR_GSO_TCPV6		4	// GSO frame, IPv6 TCP
+#define TUN_GSO_HDR_GSO_ECN		0x80	// TCP has ECN set
+	__u8 gso_type;
+	__u16 hdr_len;		/* Ethernet + IP + tcp/udp hdrs */
+	__u16 gso_size;		/* Bytes to append to gso_hdr_len per frame */
+	__u16 csum_start;	/* Position to start checksumming from */
+	__u16 csum_offset;	/* Offset after that to place checksum */
+};
+
 #endif /* __IF_TUN_H */
