lguest: use tun vringfds

This is how lguest uses the vringfd tun support.  It needs more cleanup,
but it works.

(Includes Mark McLoughlin <markmc@redhat.com>'s vringfd limit fix)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c |  160 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 141 insertions(+), 19 deletions(-)
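
For reviewers who haven't followed the vringfd series: the flow this patch
relies on boils down to the sketch below.  It is only an illustration based
on the calls used in the diff; /dev/vring, VRINGSETBASE, VRINGSETLIMIT,
TUNSETRECVVRING and TUNSETXMITVRING come from the vringfd and tun patches
earlier in the series, and tunfd/vq stand in for the real Launcher variables
(guest_base and guest_limit are the existing Launcher globals).

	/* One /dev/vring fd per virtqueue: map the ring into place, then
	 * tell the kernel what range of Guest addresses it may touch. */
	int vrfd = open("/dev/vring", O_RDWR);
	mmap(vq->vring.desc, vring_size(vq->vring.num, getpagesize()),
	     PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, vrfd,
	     vq->vring.num * getpagesize());
	ioctl(vrfd, VRINGSETBASE, guest_base);
	ioctl(vrfd, VRINGSETLIMIT, guest_limit);

	/* Hand it to the tun device (one fd each for recv and xmit); the
	 * kernel now moves the packets itself. */
	ioctl(tunfd, TUNSETXMITVRING, vrfd);

	/* A zero-length write kicks the kernel to look at the ring; when
	 * the fd becomes readable the kernel has used buffers, so we do a
	 * zero-length read to clear it and then interrupt the Guest. */
	write(vrfd, "", 0);
	read(vrfd, NULL, 0);

The new --tunring option sets this path up; it takes the same argument as
the existing --tunnet option, which keeps the old copy-through-the-Launcher
behaviour.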

diff -r 4e00f8c6d7c8 Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Tue Aug 26 09:39:42 2008 +1000
+++ b/Documentation/lguest/lguest.c	Tue Aug 26 09:46:33 2008 +1000
@@ -44,6 +44,7 @@
 #include "linux/virtio_console.h"
 #include "linux/virtio_rng.h"
 #include "linux/virtio_ring.h"
+#include "linux/vring.h"
 #include "asm-x86/bootparam.h"
 /*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
@@ -110,6 +111,9 @@ struct device_list
 	/* The descriptor page for the devices. */
 	u8 *descpage;
 
+	/* Next free spot in descpage. */
+	u8 *nextdesc;
+
 	/* A single linked list of devices. */
 	struct device *dev;
 	/* And a pointer to the last device for easy append and also for
@@ -171,7 +175,8 @@ struct virtqueue
 	bool blocked;
 };
 
-static unsigned int net_xmit_notify, net_recv_notify, net_timeout;
+static unsigned int net_xmit_notify, net_recv_notify, net_timeout,
+	net_xmit_irq, net_recv_irq;
 
 /* Remember the arguments to the program so we can "reboot" */
 static char **main_args;
@@ -881,6 +886,8 @@ static bool handle_console_input(int fd,
 					}
 					printf("(idx = %u)\n", timeout_num_idx);
 				}
+				printf("network xmit irq %u recv irq %u\n",
+				       net_xmit_irq, net_recv_irq);
 				exit(2);
 			}
 			abort->count = 0;
@@ -937,6 +944,11 @@ static void block_vq(struct virtqueue *v
  * and write them (ignoring the first element) to this device's file descriptor
  * (/dev/net/tun).
  */
+struct virtio_net_info {
+	struct virtqueue *xmit_vq, *recv_vq;
+	int xmitfd, recvfd;
+};
+
 static void handle_net_output(int fd, struct virtqueue *vq, bool timeout)
 {
 	unsigned int head, out, in, num = 0;
@@ -957,6 +969,33 @@ static void handle_net_output(int fd, st
 	/* Block further kicks and set up a timer if we saw anything. */
 	if (!timeout && num)
 		block_vq(vq);
+
+	if (timeout) {
+		if (num < last_timeout_num)
+			timeout_usec += 10;
+		else if (timeout_usec > 1)
+			timeout_usec--;
+		last_timeout_num = num;
+	}
+}
+
+static void handle_netring_output(int fd, struct virtqueue *vq, bool timeout)
+{
+	struct virtio_net_info *ni = vq->dev->priv;
+	u16 num = vq->vring.avail->idx - vring_last_avail(&vq->vring);
+	static int last_timeout_num;
+
+	if (!timeout)
+		net_xmit_notify++;
+
+	if (num) {
+		/* We have output, kick the kernel. */
+		if (write(ni->xmitfd, "", 0) != 0)
+			err(1, "Writing to xmitfd");
+
+		if (!timeout)
+			block_vq(vq);
+	}
 
 	/* We never quite know how long should we wait before we check the
 	 * queue again for more packets.  We start at 500 microseconds, and if
@@ -1207,18 +1246,13 @@ static struct lguest_device_desc *new_de
 static struct lguest_device_desc *new_dev_desc(u16 type)
 {
 	struct lguest_device_desc d = { .type = type };
-	void *p;
-
-	/* Figure out where the next device config is, based on the last one. */
-	if (devices.lastdev)
-		p = device_config(devices.lastdev)
-			+ devices.lastdev->desc->config_len;
-	else
-		p = devices.descpage;
+	void *p = devices.nextdesc;
 
 	/* We only have one page for all the descriptors. */
 	if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
 		errx(1, "Too many devices");
+
+	devices.nextdesc += sizeof(d);
 
 	/* p might not be aligned, so we memcpy in. */
 	return memcpy(p, &d, sizeof(d));
@@ -1258,6 +1292,7 @@ static void add_virtqueue(struct device 
 	 * yet, otherwise we'd be overwriting them. */
 	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
 	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+	devices.nextdesc += sizeof(vq->config);
 	dev->desc->num_vq++;
 
 	verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1287,6 +1322,7 @@ static void add_feature(struct device *d
 	if (dev->desc->feature_len <= bit / CHAR_BIT) {
 		assert(dev->desc->config_len == 0);
 		dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+		devices.nextdesc = features + dev->desc->feature_len * 2;
 	}
 
 	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1305,8 +1341,10 @@ static void set_config(struct device *de
 	if (device_config(dev) + len > devices.descpage + getpagesize())
 		errx(1, "Too many devices");
 
+	assert(device_config(dev) == devices.nextdesc);
 	/* Copy in the config information, and store the length. */
 	memcpy(device_config(dev), conf, len);
+	devices.nextdesc += len;
 	dev->desc->config_len = len;
 }
 
@@ -1325,7 +1363,8 @@ static struct device *new_device(const c
 	 * to the device_list's fdset and maxfd. */
 	if (handle_input)
 		add_device_fd(dev->fd);
-	dev->desc = new_dev_desc(type);
+	if (type)
+		dev->desc = new_dev_desc(type);
 	dev->handle_input = handle_input;
 	dev->name = name;
 	dev->vq = NULL;
@@ -1513,11 +1552,61 @@ static int get_tun_device(char tapif[IFN
 	return netfd;
 }
 
+static bool xmitfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *ni = dev->priv;
+
+	net_xmit_irq++;
+
+	/* Read to clear it. */
+	if (read(ni->xmitfd, NULL, 0) != 0)
+		err(1, "%s: reading xmitfd", dev->name);
+	trigger_irq(fd, ni->xmit_vq);
+
+	return true;
+}
+
+static bool recvfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *ni = dev->priv;
+
+	net_recv_irq++;
+	/* Read to clear it. */
+	if (read(ni->recvfd, NULL, 0) != 0)
+		err(1, "%s: reading recvfd", dev->name);
+	trigger_irq(fd, ni->recv_vq);
+
+	return true;
+}
+
+static int map_vring(struct vring *vr)
+{
+	int fd = open_or_die("/dev/vring", O_RDWR);
+
+	/* Map the ring over where it belongs in the Guest. */
+	if (mmap(vr->desc, page_align(vring_size(vr->num, getpagesize())),
+		 PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, fd,
+		 vr->num * getpagesize()) != vr->desc)
+		err(1, "mmaping /dev/vring");
+
+	/* This is subtle and nasty.  If we lazily map this, Waker may
+	 * see different pages when we touch it, and hence it will get
+	 * a different result for poll(). */
+	memset(vr->desc, 0, vring_size(vr->num, getpagesize()));
+
+	/* Set offset & limit. */
+	if (ioctl(fd, VRINGSETBASE, guest_base) != 0
+	    || ioctl(fd, VRINGSETLIMIT, guest_limit) != 0)
+		err(1, "Setting vring offset and limit");
+
+	return fd;
+}
+
 /*L:195 Our network is a Host<->Guest network.  This can either use bridging or
  * routing, but the principle is the same: it uses the "tun" device to inject
  * packets into the Host as if they came in from a normal network card.  We
  * just shunt packets between the Guest and the tun device. */
-static void setup_tun_net(char *arg)
+static void setup_tun_net(char *arg, bool rings)
 {
 	struct device *dev;
 	int netfd, ipfd;
@@ -1525,16 +1614,45 @@ static void setup_tun_net(char *arg)
 	bool bridging = false;
 	char tapif[IFNAMSIZ], *p;
 	struct virtio_net_config conf;
+	struct virtio_net_info *ni;
 
 	netfd = get_tun_device(tapif);
 
-	/* First we create a new network device. */
-	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+	if (rings) {
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, NULL);
+		/* We don't need enable_fd's re-enabling here, but it
+		 * conveniently snaps the Waker out of its select loop. */
+		add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_netring_output);
+	} else {
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+		/* When they add more receive buffers, try re-enabling input */
+		add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	}
 
-	/* Network devices need a receive and a send queue, just like
-	 * console. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd);
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	dev->priv = ni = malloc(sizeof(*ni));
+
+	ni->recv_vq = dev->vq;
+	ni->xmit_vq = dev->vq->next;
+
+	if (rings) {
+		/* Now we create the receive and xmit ringfds. */
+		ni->recvfd = map_vring(&dev->vq->vring);
+		ni->xmitfd = map_vring(&dev->vq->next->vring);
+
+		/* Tell the tun device to use them. */
+		if (ioctl(netfd, TUNSETRECVVRING, ni->recvfd) != 0)
+			err(1, "Setting receive ring");
+		if (ioctl(netfd, TUNSETXMITVRING, ni->xmitfd) != 0)
+			err(1, "Setting xmit ring");
+
+		/* Now we need to respond when they become readable. */
+		new_device("net", 0, ni->recvfd, recvfd_used)->priv = ni;
+		new_device("net", 0, ni->xmitfd, xmitfd_used)->priv = ni;
+	}
 
 	/* We need a socket to perform the magic network ioctls to bring up the
 	 * tap interface, connect to the bridge etc.  Any socket will do! */
@@ -1951,6 +2069,7 @@ static struct option opts[] = {
 static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
 	{ "tunnet", 1, NULL, 't' },
+	{ "tunring", 1, NULL, 'R' },
 	{ "block", 1, NULL, 'b' },
 	{ "rng", 0, NULL, 'r' },
 	{ "initrd", 1, NULL, 'i' },
@@ -2010,7 +2129,7 @@ int main(int argc, char *argv[])
 						      + DEVICE_PAGES);
 			guest_limit = mem;
 			guest_max = mem + DEVICE_PAGES*getpagesize();
-			devices.descpage = get_pages(1);
+			devices.descpage = devices.nextdesc = get_pages(1);
 			break;
 		}
 	}
@@ -2022,7 +2141,10 @@ int main(int argc, char *argv[])
 			verbose = true;
 			break;
 		case 't':
-			setup_tun_net(optarg);
+			setup_tun_net(optarg, false);
+			break;
+		case 'R':
+			setup_tun_net(optarg, true);
 			break;
 		case 'b':
 			setup_block_file(optarg);
