lguest: use tun vringfds

This is how lguest uses the vringfd tun support.  It needs more cleanup,
but it works.

(Includes Mark McLoughlin <markmc@redhat.com>'s vringfd limit fix)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c |  160 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 141 insertions(+), 19 deletions(-)

diff -r c95947a6c2bc Documentation/lguest/lguest
Binary file Documentation/lguest/lguest has changed

diff -r c95947a6c2bc Documentation/lguest/lguest
Binary file Documentation/lguest/lguest has changed

diff -r c95947a6c2bc Documentation/lguest/lguest
Binary file Documentation/lguest/lguest has changed

diff -r 910285429043 Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Mon Aug 04 13:54:15 2008 +1000
+++ b/Documentation/lguest/lguest.c	Mon Aug 04 13:55:14 2008 +1000
@@ -44,6 +44,7 @@
 #include "linux/virtio_console.h"
 #include "linux/virtio_rng.h"
 #include "linux/virtio_ring.h"
+#include "linux/vring.h"
 #include "asm-x86/bootparam.h"
 /*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
@@ -110,6 +111,9 @@ struct device_list
 	/* The descriptor page for the devices. */
 	u8 *descpage;
 
+	/* Pointer to the next unused byte in descpage (just past the last used) */
+	u8 *nextdesc;
+
 	/* A single linked list of devices. */
 	struct device *dev;
 	/* And a pointer to the last device for easy append and also for
@@ -171,7 +175,8 @@ struct virtqueue
 	bool blocked;
 };
 
-static unsigned int net_xmit_notify, net_recv_notify, net_recv_notify, net_timeout;
+static unsigned int net_xmit_notify, net_recv_notify, net_timeout,
+	net_xmit_irq, net_recv_irq;	/* network event counters */
 
 /* Remember the arguments to the program so we can "reboot" */
 static char **main_args;
@@ -868,6 +873,8 @@ static bool handle_console_input(int fd,
 				printf("network xmit %u recv %u timeout %u usec %u\n",
 				       net_xmit_notify, net_recv_notify,
 				       net_timeout, timeout_usec);
+				printf("network xmit irq %u recv irq %u\n",
+				       net_xmit_irq, net_recv_irq);
 				exit(2);
 			}
 			abort->count = 0;
@@ -921,6 +928,11 @@ static void block_vq(struct virtqueue *v
  * and write them (ignoring the first element) to this device's file descriptor
  * (/dev/net/tun).
  */
+struct virtio_net_info {	/* Per-net-device state, stored in dev->priv */
+	struct virtqueue *xmit_vq, *recv_vq;	/* the device's two virtqueues */
+	int xmitfd, recvfd;	/* vring fds; only set up in ring mode */
+};
+
 static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in, num = 0;
@@ -942,6 +954,33 @@ static void handle_net_output(int fd, st
 	/* Block further kicks and set up a timer if we saw anything. */
 	if (!timeout && num)
 		block_vq(vq);
+
+	if (timeout) {
+		if (num < last_timeout_num)
+			timeout_usec += 10;
+		else if (timeout_usec > 1)
+			timeout_usec--;
+		last_timeout_num = num;
+	}
+}
+
+static void handle_netring_output(int fd, struct virtqueue *vq, bool timeout)
+{
+	struct virtio_net_info *ni = vq->dev->priv;
+	u16 num = vq->vring.avail->idx - vring_last_avail(&vq->vring);
+	static int last_timeout_num;
+
+	if (!timeout)
+		net_xmit_notify++;
+
+	if (num) {
+		/* We have output, kick the kernel. */
+		if (write(ni->xmitfd, "", 0) != 0)
+			err(1, "Writing to xmitfd");
+
+		if (!timeout)
+			block_vq(vq);
+	}
 
 	if (timeout) {
 		if (num < last_timeout_num)
@@ -1181,18 +1220,13 @@ static struct lguest_device_desc *new_de
 static struct lguest_device_desc *new_dev_desc(u16 type)
 {
 	struct lguest_device_desc d = { .type = type };
-	void *p;
-
-	/* Figure out where the next device config is, based on the last one. */
-	if (devices.lastdev)
-		p = device_config(devices.lastdev)
-			+ devices.lastdev->desc->config_len;
-	else
-		p = devices.descpage;
+	void *p = devices.nextdesc;
 
 	/* We only have one page for all the descriptors. */
 	if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
 		errx(1, "Too many devices");
+
+	devices.nextdesc += sizeof(d);
 
 	/* p might not be aligned, so we memcpy in. */
 	return memcpy(p, &d, sizeof(d));
@@ -1232,6 +1266,7 @@ static void add_virtqueue(struct device 
 	 * yet, otherwise we'd be overwriting them. */
 	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
 	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+	devices.nextdesc += sizeof(vq->config);
 	dev->desc->num_vq++;
 
 	verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1261,6 +1296,7 @@ static void add_feature(struct device *d
 	if (dev->desc->feature_len <= bit / CHAR_BIT) {
 		assert(dev->desc->config_len == 0);
 		dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+		devices.nextdesc = features + dev->desc->feature_len * 2;
 	}
 
 	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1279,8 +1315,10 @@ static void set_config(struct device *de
 	if (device_config(dev) + len > devices.descpage + getpagesize())
 		errx(1, "Too many devices");
 
+	assert(device_config(dev) == devices.nextdesc);
 	/* Copy in the config information, and store the length. */
 	memcpy(device_config(dev), conf, len);
+	devices.nextdesc += len;
 	dev->desc->config_len = len;
 }
 
@@ -1299,7 +1337,8 @@ static struct device *new_device(const c
 	 * to the device_list's fdset and maxfd. */
 	if (handle_input)
 		add_device_fd(dev->fd);
-	dev->desc = new_dev_desc(type);
+	if (type)
+		dev->desc = new_dev_desc(type);
 	dev->handle_input = handle_input;
 	dev->name = name;
 	dev->vq = NULL;
@@ -1502,11 +1541,61 @@ static int get_tun_device(char tapif[IFN
 	return netfd;
 }
 
+/* Input handler for the xmit vring fd: count the event, clear the fd and
+ * call trigger_irq() for the transmit virtqueue.  Always returns true. */
+static bool xmitfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *info = dev->priv;
+
+	net_xmit_irq++;
+	/* Zero-length read clears it; anything but 0 back is fatal. */
+	if (read(info->xmitfd, NULL, 0))
+		err(1, "%s: reading xmitfd", dev->name);
+	trigger_irq(fd, info->xmit_vq);
+	return true;
+}
+
+/* Input handler for the recv vring fd: count the event, clear the fd with a
+ * zero-length read, then trigger_irq() the receive queue.  Returns true. */
+static bool recvfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *info = dev->priv;
+
+	net_recv_irq++;
+	if (read(info->recvfd, NULL, 0))
+		err(1, "%s: reading recvfd", dev->name);
+	trigger_irq(fd, info->recv_vq);
+	return true;
+}
+
+/* Open /dev/vring, map it over the Guest ring at vr->desc, zero it, and set
+ * the fd's base and limit.  Returns the fd; mmap/ioctl failure is fatal. */
+static int map_vring(struct vring *vr)
+{
+	int ringfd = open_or_die("/dev/vring", O_RDWR);
+	unsigned long bytes = vring_size(vr->num, getpagesize());
+	void *where;
+
+	where = mmap(vr->desc, page_align(bytes), PROT_READ|PROT_WRITE,
+		     MAP_FIXED|MAP_SHARED, ringfd, vr->num * getpagesize());
+	if (where != vr->desc)
+		err(1, "mmaping /dev/vring");
+
+	/* Subtle and nasty: mapped lazily, Waker may see different pages
+	 * when we touch it, and so get a different result from poll(). */
+	memset(vr->desc, 0, bytes);
+
+	if (ioctl(ringfd, VRINGSETBASE, guest_base) ||
+	    ioctl(ringfd, VRINGSETLIMIT, guest_limit))
+		err(1, "Setting vring offset and limit");
+	return ringfd;
+}
+
 /*L:195 Our network is a Host<->Guest network.  This can either use bridging or
  * routing, but the principle is the same: it uses the "tun" device to inject
  * packets into the Host as if they came in from a normal network card.  We
  * just shunt packets between the Guest and the tun device. */
-static void setup_tun_net(char *arg)
+static void setup_tun_net(char *arg, bool rings)
 {
 	struct device *dev;
 	int netfd, ipfd;
@@ -1514,16 +1603,45 @@ static void setup_tun_net(char *arg)
 	bool bridging = false;
 	char tapif[IFNAMSIZ], *p;
 	struct virtio_net_config conf;
+	struct virtio_net_info *ni;
 
 	netfd = get_tun_device(tapif);
 
-	/* First we create a new network device. */
-	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+	if (rings) {
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, NULL);
+		/* We don't need enable_fd here.  But we do want to pop
+		 * the Waker out. */
+		add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_netring_output);
+	} else {
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+		/* When they add more receive buffers, try re-enabling input */
+		add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	}
 
-	/* Network devices need a receive and a send queue, just like
-	 * console. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	dev->priv = ni = malloc(sizeof(*ni));
+
+	ni->recv_vq = dev->vq;
+	ni->xmit_vq = dev->vq->next;
+
+	if (rings) {
+		/* Now we create the receive and xmit ringfds. */
+		ni->recvfd = map_vring(&dev->vq->vring);
+		ni->xmitfd = map_vring(&dev->vq->next->vring);
+
+		/* Tell the tunnet to use them. */
+		if (ioctl(netfd, TUNSETRECVVRING, ni->recvfd) != 0)
+			err(1, "Setting receive ring");
+		if (ioctl(netfd, TUNSETXMITVRING, ni->xmitfd) != 0)
+			err(1, "Setting xmit ring");
+
+		/* Now we need to respond when they become readable. */
+		new_device("net", 0, ni->recvfd, recvfd_used)->priv = ni;
+		new_device("net", 0, ni->xmitfd, xmitfd_used)->priv = ni;
+	}
 
 	/* We need a socket to perform the magic network ioctls to bring up the
 	 * tap interface, connect to the bridge etc.  Any socket will do! */
@@ -1946,6 +2064,7 @@ static struct option opts[] = {
 static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
 	{ "tunnet", 1, NULL, 't' },
+	{ "tunring", 1, NULL, 'R' },
 	{ "block", 1, NULL, 'b' },
 	{ "rng", 0, NULL, 'r' },
 	{ "initrd", 1, NULL, 'i' },
@@ -2005,7 +2124,7 @@ int main(int argc, char *argv[])
 						      + DEVICE_PAGES);
 			guest_limit = mem;
 			guest_max = mem + DEVICE_PAGES*getpagesize();
-			devices.descpage = get_pages(1);
+			devices.descpage = devices.nextdesc = get_pages(1);
 			break;
 		}
 	}
@@ -2017,7 +2136,10 @@ int main(int argc, char *argv[])
 			verbose = true;
 			break;
 		case 't':
-			setup_tun_net(optarg);
+			setup_tun_net(optarg, false);
+			break;
+		case 'R':
+			setup_tun_net(optarg, true);
 			break;
 		case 'b':
 			setup_block_file(optarg);
