lguest: use tun vringfds

This is how lguest uses the vringfd tun support.  It needs more cleanup,
but it works.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c |  185 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 164 insertions(+), 21 deletions(-)

diff -r b8bc9a65f59a Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Mon May 05 14:59:41 2008 +1000
+++ b/Documentation/lguest/lguest.c	Mon May 05 15:00:12 2008 +1000
@@ -43,6 +43,7 @@
 #include "linux/virtio_console.h"
 #include "linux/virtio_rng.h"
 #include "linux/virtio_ring.h"
+#include "linux/vring.h"
 #include "asm-x86/bootparam.h"
 /*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
@@ -101,6 +102,9 @@ struct device_list
 
 	/* The descriptor page for the devices. */
 	u8 *descpage;
+
+	/* Pointer to the next unused byte in descpage */
+	u8 *nextdesc;
 
 	/* A single linked list of devices. */
 	struct device *dev;
@@ -199,6 +203,15 @@ static u8 *get_feature_bits(struct devic
 {
 	return (u8 *)(dev->desc + 1)
 		+ dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+}
+
+/* Did the Guest set this feature bit?  Bits past feature_len are never set. */
+static bool has_feature(struct device *dev, unsigned bit)
+{
+	/* Guest feature bits follow our feature bits. */
+	u8 *features = get_feature_bits(dev) + dev->desc->feature_len;
+	return bit / CHAR_BIT < dev->desc->feature_len
+		&& (features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)));
 }
 
 /*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -856,6 +869,11 @@ static void handle_console_output(int fd
  * and write them (ignoring the first element) to this device's file descriptor
  * (/dev/net/tun).
  */
+struct virtio_net_info {
+	struct virtqueue *xmit_vq, *recv_vq;	/* dev->vq->next, dev->vq */
+	int xmitfd, recvfd;	/* vring fds; assigned only in rings mode */
+};
+
 static void handle_net_output(int fd, struct virtqueue *vq)
 {
 	unsigned int head, out, in;
@@ -873,6 +891,15 @@ static void handle_net_output(int fd, st
 		len = writev(vq->dev->fd, iov+1, out-1);
 		add_used_and_trigger(fd, vq, head, len);
 	}
+}
+
+/* Output is pending: kick the kernel with a zero-length write to xmitfd. */
+static void handle_netring_output(int fd, struct virtqueue *vq)
+{
+	struct virtio_net_info *info = vq->dev->priv;
+
+	if (write(info->xmitfd, "", 0) != 0)
+		err(1, "Writing to xmitfd");
 }
 
 /* This is where we handle a packet coming in from the tun device to our
@@ -1073,18 +1100,13 @@ static struct lguest_device_desc *new_de
 static struct lguest_device_desc *new_dev_desc(u16 type)
 {
 	struct lguest_device_desc d = { .type = type };
-	void *p;
-
-	/* Figure out where the next device config is, based on the last one. */
-	if (devices.lastdev)
-		p = device_config(devices.lastdev)
-			+ devices.lastdev->desc->config_len;
-	else
-		p = devices.descpage;
+	void *p = devices.nextdesc;
 
 	/* We only have one page for all the descriptors. */
 	if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
 		errx(1, "Too many devices");
+
+	devices.nextdesc += sizeof(d);
 
 	/* p might not be aligned, so we memcpy in. */
 	return memcpy(p, &d, sizeof(d));
@@ -1122,6 +1144,7 @@ static void add_virtqueue(struct device 
 	 * yet, otherwise we'd be overwriting them. */
 	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
 	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+	devices.nextdesc += sizeof(vq->config);
 	dev->desc->num_vq++;
 
 	verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1151,6 +1174,7 @@ static void add_feature(struct device *d
 	if (dev->desc->feature_len <= bit / CHAR_BIT) {
 		assert(dev->desc->config_len == 0);
 		dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+		devices.nextdesc = features + dev->desc->feature_len * 2;
 	}
 
 	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1169,8 +1193,10 @@ static void set_config(struct device *de
 	if (device_config(dev) + len > devices.descpage + getpagesize())
 		errx(1, "Too many devices");
 
+	assert(device_config(dev) == devices.nextdesc);
 	/* Copy in the config information, and store the length. */
 	memcpy(device_config(dev), conf, len);
+	devices.nextdesc += len;
 	dev->desc->config_len = len;
 }
 
@@ -1189,7 +1215,8 @@ static struct device *new_device(const c
 	 * to the device_list's fdset and maxfd. */
 	if (handle_input)
 		add_device_fd(dev->fd);
-	dev->desc = new_dev_desc(type);
+	if (type)
+		dev->desc = new_dev_desc(type);
 	dev->handle_input = handle_input;
 	dev->name = name;
 	dev->vq = NULL;
@@ -1320,11 +1347,78 @@ static void configure_device(int fd, con
 	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
 }
 
+/* Input handler for the xmit vringfd: clear it, then irq the xmit queue. */
+static bool xmitfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *info = dev->priv;
+
+	/* A zero-length read clears the fd; anything but 0 back is fatal. */
+	if (read(info->xmitfd, NULL, 0))
+		err(1, "%s: reading xmitfd", dev->name);
+	trigger_irq(fd, info->xmit_vq);
+	return true;
+}
+
+/* Input handler for the recv vringfd: clear it, then irq the recv queue. */
+static bool recvfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *info = dev->priv;
+
+	/* A zero-length read clears the fd; anything but 0 back is fatal. */
+	if (read(info->recvfd, NULL, 0))
+		err(1, "%s: reading recvfd", dev->name);
+	trigger_irq(fd, info->recv_vq);
+	return true;
+}
+
+/* Open a /dev/vring fd, map it over this ring in Guest memory, return the fd. */
+static int map_vring(struct vring *vr)
+{
+	int fd = open_or_die("/dev/vring", O_RDWR);
+
+	/* Map the rings over where they belong in Guest. */
+	if (mmap(vr->desc, page_align(vring_size(vr->num, getpagesize())),
+		 PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, fd,
+		 vr->num * getpagesize()) != vr->desc)
+		err(1, "mmaping /dev/vring");
+
+	/* This is subtle and nasty.  If we lazily map this, Waker may
+	 * see different pages when we touch it, and hence it will get
+	 * a different result for poll(). */
+	memset(vr->desc, 0, vring_size(vr->num, getpagesize()));
+
+	/* Set offset & limit.  NOTE(review): guest_limit assumed as the bound. */
+	if (ioctl(fd, VRINGSETBASE, guest_base) != 0
+	    || ioctl(fd, VRINGSETLIMIT, guest_limit) != 0)
+		err(1, "Setting vring offset and limit");
+	return fd;
+}
+
+/* Turn on the tun features matching the GUEST_* feature bits the Guest set. */
+static void tun_ring_set_features(struct device *dev)
+{
+	unsigned int features = 0;
+
+	/* They tell us what they can handle, we tell tun to give it to us. */
+	if (has_feature(dev, VIRTIO_NET_F_GUEST_CSUM))
+		features |= TUN_F_CSUM;
+	if (has_feature(dev, VIRTIO_NET_F_GUEST_TSO4))
+		features |= TUN_F_TSO4;
+	if (has_feature(dev, VIRTIO_NET_F_GUEST_TSO6))
+		features |= TUN_F_TSO6;
+	if (has_feature(dev, VIRTIO_NET_F_GUEST_ECN))
+		features |= TUN_F_TSO_ECN;
+	if (has_feature(dev, VIRTIO_NET_F_GUEST_UFO))	/* HOST_UFO is not offered */
+		features |= TUN_F_UFO;
+	if (ioctl(dev->fd, TUNSETFEATURES, features) != 0)
+		err(1, "Could not set features %#x for tun device", features);
+}
+
 /*L:195 Our network is a Host<->Guest network.  This can either use bridging or
  * routing, but the principle is the same: it uses the "tun" device to inject
  * packets into the Host as if they came in from a normal network card.  We
  * just shunt packets between the Guest and the tun device. */
-static void setup_tun_net(const char *arg)
+static void setup_tun_net(const char *arg, bool rings)
 {
 	struct device *dev;
 	struct ifreq ifr;
@@ -1332,6 +1426,7 @@ static void setup_tun_net(const char *ar
 	u32 ip;
 	const char *br_name = NULL;
 	struct virtio_net_config conf;
+	struct virtio_net_info *ni;
 
 	/* We open the /dev/net/tun device and tell it we want a tap device.  A
 	 * tap device is like a tun device, only somehow different.  To tell
@@ -1343,17 +1438,44 @@ static void setup_tun_net(const char *ar
 	strcpy(ifr.ifr_name, "tap%d");
 	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
 		err(1, "configuring /dev/net/tun");
-	/* We don't need checksums calculated for packets coming in this
-	 * device: trust us! */
-	ioctl(netfd, TUNSETNOCSUM, 1);
 
-	/* First we create a new network device. */
-	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+	if (rings) {
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, NULL);
+		add_virtqueue(dev, VIRTQUEUE_NUM, NULL);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_netring_output);
+	} else {
+		/* We don't need checksums calculated for packets coming in this
+		 * device: trust us! */
+		ioctl(netfd, TUNSETNOCSUM, 1);
 
-	/* Network devices need a receive and a send queue, just like
-	 * console. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+		/* When they add more receive buffers, try re-enabling input */
+		add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	}
+
+	dev->priv = ni = malloc(sizeof(*ni));
+
+	ni->recv_vq = dev->vq;
+	ni->xmit_vq = dev->vq->next;
+
+	if (rings) {
+		/* Now we create the receive and xmit ringfds. */
+		ni->recvfd = map_vring(&dev->vq->vring);
+		ni->xmitfd = map_vring(&dev->vq->next->vring);
+
+		/* Tell the tunnet to use them. */
+		if (ioctl(netfd, TUNSETRECVVRING, ni->recvfd) != 0)
+			err(1, "Setting receive ring");
+		if (ioctl(netfd, TUNSETXMITVRING, ni->xmitfd) != 0)
+			err(1, "Setting xmit ring");
+
+		/* Now we need to respond when they become readable. */
+		new_device("net", 0, ni->recvfd, recvfd_used)->priv = ni;
+		new_device("net", 0, ni->xmitfd, xmitfd_used)->priv = ni;
+	}
 
 	/* We need a socket to perform the magic network ioctls to bring up the
 	 * tap interface, connect to the bridge etc.  Any socket will do! */
@@ -1374,6 +1496,23 @@ static void setup_tun_net(const char *ar
 
 	/* Tell Guest what MAC address to use. */
 	add_feature(dev, VIRTIO_NET_F_MAC);
+
+	/* If we're using rings, we can do GSO magic. */
+	if (rings) {
+		add_feature(dev, VIRTIO_NET_F_CSUM);
+		add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
+		add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
+		add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
+		add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
+		add_feature(dev, VIRTIO_NET_F_GUEST_UFO);
+		add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
+		add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
+		add_feature(dev, VIRTIO_NET_F_HOST_ECN);
+		/* Kernel can't actually handle UFO in software currently. */
+
+		/* We tell kernel what we can handle once guest tells us. */
+		dev->ready = tun_ring_set_features;
+	}
 	set_config(dev, sizeof(conf), &conf);
 
 	/* We don't need the socket any more; setup is done. */
@@ -1741,6 +1880,7 @@ static struct option opts[] = {
 static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
 	{ "tunnet", 1, NULL, 't' },
+	{ "tunring", 1, NULL, 'R' },
 	{ "block", 1, NULL, 'b' },
 	{ "rng", 0, NULL, 'r' },
 	{ "initrd", 1, NULL, 'i' },
@@ -1800,7 +1940,7 @@ int main(int argc, char *argv[])
 						      + DEVICE_PAGES);
 			guest_limit = mem;
 			guest_max = mem + DEVICE_PAGES*getpagesize();
-			devices.descpage = get_pages(1);
+			devices.descpage = devices.nextdesc = get_pages(1);
 			break;
 		}
 	}
@@ -1812,7 +1952,10 @@ int main(int argc, char *argv[])
 			verbose = true;
 			break;
 		case 't':
-			setup_tun_net(optarg);
+			setup_tun_net(optarg, false);
+			break;
+		case 'R':
+			setup_tun_net(optarg, true);
 			break;
 		case 'b':
 			setup_block_file(optarg);
