lguest: Handle non-GSO Guests

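This is a test of feature bits: the Launcher no longer assumes that the
Guest accepts the CSUM and GSO features.  Tap device setup is deferred
until the Guest's feature bits can be read, and when the Guest has not
accepted both features the Launcher skips the virtio_net header on output
and supplies a no-op header on input.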

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c |  315 ++++++++++++++++++++++++++----------------
 1 file changed, 199 insertions(+), 116 deletions(-)

diff -r 34ccd603999b Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Thu Jan 24 12:46:30 2008 +1100
+++ b/Documentation/lguest/lguest.c	Thu Jan 24 12:47:03 2008 +1100
@@ -36,6 +36,7 @@
 #include <sched.h>
 #include <limits.h>
 #include <stddef.h>
+#include <net/ethernet.h>
 #include "linux/lguest_launcher.h"
 #include "linux/virtio_config.h"
 #include "linux/virtio_net.h"
@@ -44,7 +45,7 @@
 #include "linux/virtio_rng.h"
 #include "linux/virtio_ring.h"
 #include "asm-x86/bootparam.h"
-/*L:110 We can ignore the 38 include files we need for this program, but I do
+/*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
  *
  * As Linus said, "C is a Spartan language, and so should your naming be."  I
@@ -199,6 +200,14 @@ static u8 *get_feature_bits(struct devic
 {
 	return (u8 *)(dev->desc + 1)
 		+ dev->desc->num_vq * sizeof(struct lguest_vqconfig);
+}
+
+static bool has_feature(struct device *dev, unsigned bit)
+{
+	/* The Guest's bits are laid out feature_len bytes after our own. */
+	u8 *guest_bits = get_feature_bits(dev) + dev->desc->feature_len;
+
+	return guest_bits[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT));
 }
 
 /*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -842,26 +851,151 @@ static void handle_console_output(int fd
 		add_used_and_trigger(fd, vq, head, len);
 	}
 }
+/*:*/
+static u32 str2ip(const char *ipaddr)
+{
+	unsigned int byte[4];
 
-/*
+	sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
+	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
+}
+
+/* This code is "adapted" from libbridge: it attaches the Host end of the
+ * network device to the bridge device specified by the command line.
+ *
+ * This is yet another James Morris contribution (I'm an IP-level guy, so I
+ * dislike bridging), and I just try not to break it. */
+static void add_to_bridge(int fd, const char *if_name, const char *br_name)
+{
+	int ifidx;
+	struct ifreq ifr;
+
+	if (!*br_name)
+		errx(1, "must specify bridge name");
+
+	ifidx = if_nametoindex(if_name);
+	if (!ifidx)
+		errx(1, "interface %s does not exist!", if_name);
+
+	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
+	ifr.ifr_ifindex = ifidx;
+	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
+		err(1, "can't add %s to bridge %s", if_name, br_name);
+}
+
+/* This sets up the Host end of the network device with an IP address and brings
+ * it up so packets will flow. */
+static void configure_device(int fd, const char *devname, u32 ipaddr)
+{
+	struct ifreq ifr;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+	/* Don't read these incantations.  Just cut & paste them like I did! */
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, devname);
+	sin->sin_family = AF_INET;
+	sin->sin_addr.s_addr = htonl(ipaddr);
+	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+		err(1, "Setting %s interface address", devname);
+	ifr.ifr_flags = IFF_UP;
+	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+		err(1, "Bringing interface %s up", devname);
+}
+
+/* Normally, we assume that the Guest uses all the features, so it will be
+ * using gso.  But to demonstrate feature negotiation, we allow it to choose
+ * not to.  Unfortunately, we can't configure the tun/tap device until we can
+ * tell it whether we handle GSO or not, so we delay that until later. */
+struct net_priv {
+	/* Set once complete_net_setup() has configured the tap device. */
+	bool done_setup;
+	bool use_gso;			/* Guest took CSUM and GSO */
+	struct virtio_net_config conf;	/* holds the MAC given to the Guest */
+	u32 ip;				/* Host address, host byte order */
+	const char *bridge_name;	/* from --tunnet=bridge:<name> */
+};
+
+/* Deferred, one-shot tap setup: needs the Guest's feature bits first. */
+static void complete_net_setup(struct device *dev)
+{
+	struct net_priv *priv = dev->priv;
+	int ipfd;
+	struct ifreq ifr;
+
+	if (priv->done_setup)
+		return;
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+
+	/* If Guest can handle everything, we can enable IFF_VIRTIO_HDR */
+	priv->use_gso = has_feature(dev, VIRTIO_NET_F_CSUM)
+		&& has_feature(dev, VIRTIO_NET_F_GSO);
+	if (priv->use_gso)
+		ifr.ifr_flags |= IFF_VIRTIO_HDR|IFF_RECV_CSUM|IFF_RECV_GSO;
+
+	strcpy(ifr.ifr_name, "tap%d");
+	if (ioctl(dev->fd, TUNSETIFF, &ifr) != 0)
+		err(1, "configuring /dev/net/tun");
+
+	/* SIOC stands for Socket I/O Control.  S means Set.  IF means
+	 * Interface, and HWADDR is hardware address.  Simple! */
+	memcpy(ifr.ifr_hwaddr.sa_data, priv->conf.mac, ETH_ALEN);
+	ifr.ifr_hwaddr.sa_family = AF_LOCAL;
+	if (ioctl(dev->fd, SIOCSIFHWADDR, &ifr) != 0)
+		err(1, "setting hw address for %s", ifr.ifr_name);
+
+	/* We don't need checksums calculated for packets from this device. */
+	ioctl(dev->fd, TUNSETNOCSUM, 1);
+
+	/* We need a socket to perform the magic network ioctls to bring up the
+	 * tap interface, connect to the bridge etc.  Any socket will do! */
+	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (ipfd < 0)
+		err(1, "opening IP socket");
+
+	/* If the command line was --tunnet=bridge:<name> do bridging. */
+	if (priv->bridge_name)
+		add_to_bridge(ipfd, ifr.ifr_name, priv->bridge_name);
+
+	/* Give the Host end of the tap its IP address and bring it up. */
+	configure_device(ipfd, ifr.ifr_name, priv->ip);
+
+	/* We don't need the socket any more; setup is done. */
+	close(ipfd);
+	priv->done_setup = true;
+}
+
+/*L:201
  * The Network
  *
  * Handling output for network is also simple: we get all the output buffers
- * and write them (ignoring the first element) to this device's file descriptor
- * (stdout). */
+ * and write them to this device's file descriptor (the tap device). */
 static void handle_net_output(int fd, struct virtqueue *vq)
 {
 	unsigned int head, out, in;
 	int len;
+	struct net_priv *priv = vq->dev->priv;
 	struct iovec iov[vq->vring.num];
+
+	/* We might not know whether this Guest speaks GSO until now. */
+	complete_net_setup(vq->dev);
 
 	/* Keep getting output buffers from the Guest until we run out. */
 	while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
 		if (in)
 			errx(1, "Input buffers in output queue?");
+
 		/* With IFF_GSO_HDR, tap takes the same format header as
-		 * virtio_net, so we just pass it through. */
-		len = writev(vq->dev->fd, iov, out);
+		 * virtio_net, so we just pass it through.  Otherwise,
+		 * skip the header. */
+		if (priv->use_gso)
+			len = writev(vq->dev->fd, iov, out);
+		else {
+			len = writev(vq->dev->fd, iov+1, out-1);
+			if (len >= 0)
+				len += iov[0].iov_len;
+		}
 		add_used_and_trigger(fd, vq, head, len);
 	}
 }
@@ -872,27 +1006,40 @@ static bool handle_tun_input(int fd, str
 {
 	unsigned int head, in_num, out_num;
 	int len;
+	struct net_priv *priv = dev->priv;
 	struct iovec iov[dev->vq->vring.num];
+
+	/* We might not know whether this Guest speaks GSO until now. */
+	complete_net_setup(dev);
 
 	/* First we need a network buffer from the Guests's recv virtqueue. */
 	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
 	if (head == dev->vq->vring.num) {
-		/* Now, it's expected that if we try to send a packet too
-		 * early, the Guest won't be ready yet.  Wait until the device
-		 * status says it's ready. */
-		/* FIXME: Actually want DRIVER_ACTIVE here. */
-		if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
-			warn("network: no dma buffer!");
+		/* FIXME: Only do this if DRIVER_ACTIVE. */
+		warn("network: no dma buffer!");
 		/* We'll turn this back on if input buffers are registered. */
 		return false;
 	} else if (out_num)
 		errx(1, "Output buffers in network recv queue?");
 
-	/* Read the GSO header and packet from the tun device directly into the
-	 * Guest's buffer. */
-	len = readv(dev->fd, iov, in_num);
-	if (len <= 0)
-		err(1, "reading network");
+	if (!priv->use_gso) {
+		/* Give them a noop header. */
+		struct virtio_net_hdr hdr = { .flags = 0 };
+		if (iov[0].iov_len != sizeof(hdr))
+			errx(1, "Bad net header len %u", iov[0].iov_len);
+		memcpy(iov[0].iov_base, &hdr, sizeof(hdr));
+
+		len = readv(dev->fd, iov+1, in_num-1);
+		if (len <= 0)
+			err(1, "reading network");
+		len += sizeof(hdr);
+	} else {
+		/* Read the GSO header and packet from the tun device directly
+		 * into the Guest's buffer. */
+		len = readv(dev->fd, iov, in_num);
+		if (len <= 0)
+			err(1, "reading network");
+	}
 
 	/* Tell the Guest about the new packet. */
 	add_used_and_trigger(fd, dev->vq, head, len);
@@ -997,17 +1144,20 @@ static void handle_input(int fd)
 		/* Otherwise, call the device(s) which have readable
 		 * file descriptors and a method of handling them.  */
 		for (i = devices.dev; i; i = i->next) {
+			bool ok = i->desc->status&VIRTIO_CONFIG_S_ACKNOWLEDGE;
 			if (i->handle_input && FD_ISSET(i->fd, &fds)) {
 				int dev_fd;
-				if (i->handle_input(fd, i))
+
+				/* Don't bother the Guest before it's ready. */
+				if (ok && i->handle_input(fd, i))
 					continue;
 
-				/* If handle_input() returns false, it means we
-				 * should no longer service it.  Networking and
-				 * console do this when there's no input
-				 * buffers to deliver into.  Console also uses
-				 * it when it discovers that stdin is
-				 * closed. */
+				/* If handle_input() returns false or driver
+				 * isn't ready, it means we should no longer
+				 * service it.  Networking and console do this
+				 * when there's no input buffers to deliver
+				 * into.  Console also uses it when it
+				 * discovers that stdin is closed. */
 				FD_CLR(i->fd, &devices.infds);
 				/* Tell waker to ignore it too, by sending a
 				 * negative fd number (-1, since 0 is a valid
@@ -1223,63 +1373,14 @@ static void setup_console(void)
  *
  * Finally, we could implement a virtio network switch in the kernel. :*/
 
-static u32 str2ip(const char *ipaddr)
+static void random_ether_addr(u8 *mac)
 {
-	unsigned int byte[4];
-
-	sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]);
-	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
-}
-
-/* This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it. */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-	int ifidx;
-	struct ifreq ifr;
-
-	if (!*br_name)
-		errx(1, "must specify bridge name");
-
-	ifidx = if_nametoindex(if_name);
-	if (!ifidx)
-		errx(1, "interface %s does not exist!", if_name);
-
-	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-	ifr.ifr_ifindex = ifidx;
-	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
-		err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/* This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer. */
-static void configure_device(int fd, const char *devname, u32 ipaddr,
-			     unsigned char hwaddr[6])
-{
-	struct ifreq ifr;
-	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
-
-	/* Don't read these incantations.  Just cut & paste them like I did! */
-	memset(&ifr, 0, sizeof(ifr));
-	strcpy(ifr.ifr_name, devname);
-	sin->sin_family = AF_INET;
-	sin->sin_addr.s_addr = htonl(ipaddr);
-	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-		err(1, "Setting %s interface address", devname);
-	ifr.ifr_flags = IFF_UP;
-	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-		err(1, "Bringing interface %s up", devname);
-
-	/* SIOC stands for Socket I/O Control.  G means Get (vs S for Set
-	 * above).  IF means Interface, and HWADDR is hardware address.
-	 * Simple! */
-	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
-		err(1, "getting hw address for %s", devname);
-	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
+	int randfd = open_or_die("/dev/urandom", O_RDONLY);
+	if (read(randfd, mac, ETH_ALEN) != ETH_ALEN)
+		err(1, "Reading /dev/urandom");
+	close(randfd);
+	mac[0] &= 0xfe;	/* clear multicast bit */
+	mac[0] |= 0x02;	/* set local assignment bit (IEEE802) */
 }
 
 /*L:195 Our network is a Host<->Guest network.  This can either use bridging or
@@ -1289,26 +1390,14 @@ static void setup_tun_net(const char *ar
 static void setup_tun_net(const char *arg)
 {
 	struct device *dev;
-	struct ifreq ifr;
-	int netfd, ipfd;
-	u32 ip;
-	const char *br_name = NULL;
-	struct virtio_net_config conf;
+	int netfd;
+	struct net_priv *priv = calloc(1, sizeof(*priv));
 
 	/* We open the /dev/net/tun device and tell it we want a tap device.  A
 	 * tap device is like a tun device, only somehow different.  To tell
 	 * the truth, I completely blundered my way through this code, but it
 	 * works now! */
 	netfd = open_or_die("/dev/net/tun", O_RDWR);
-	memset(&ifr, 0, sizeof(ifr));
-	ifr.ifr_flags = (IFF_TAP | IFF_NO_PI | IFF_VIRTIO_HDR
-			 | IFF_RECV_CSUM | IFF_RECV_GSO);
-	strcpy(ifr.ifr_name, "tap%d");
-	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-		err(1, "configuring /dev/net/tun");
-	/* We don't need checksums calculated for packets coming in this
-	 * device: trust us! */
-	ioctl(netfd, TUNSETNOCSUM, 1);
 
 	/* First we create a new network device. */
 	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
@@ -1318,38 +1407,32 @@ static void setup_tun_net(const char *ar
 	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
 	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
 
-	/* We need a socket to perform the magic network ioctls to bring up the
-	 * tap interface, connect to the bridge etc.  Any socket will do! */
-	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (ipfd < 0)
-		err(1, "opening IP socket");
+	dev->priv = priv;
+
+	/* We have to do setup later. */
+	priv->done_setup = false;
 
 	/* If the command line was --tunnet=bridge:<name> do bridging. */
 	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-		ip = INADDR_ANY;
-		br_name = arg + strlen(BRIDGE_PFX);
-		add_to_bridge(ipfd, ifr.ifr_name, br_name);
+		priv->ip = INADDR_ANY;
+		priv->bridge_name = arg + strlen(BRIDGE_PFX);
 	} else /* It is an IP address to set up the device with */
-		ip = str2ip(arg);
+		priv->ip = str2ip(arg);
 
-	/* Set up the tun device, and get the mac address for the interface. */
-	configure_device(ipfd, ifr.ifr_name, ip, conf.mac);
+	/* Get a random mac address, and tell the Guest to use it */
+	random_ether_addr(priv->conf.mac);
+	add_feature(dev, VIRTIO_NET_F_MAC);
+	set_config(dev, sizeof(priv->conf), &priv->conf);
 
-	/* Tell Guest what MAC address to use. */
-	add_feature(dev, VIRTIO_NET_F_MAC);
-	set_config(dev, sizeof(conf), &conf);
-	/* Tap device can handle csums, and all types of GSO. */
+	/* Tap device can handle no csum, and all types of GSO. */
 	add_feature(dev, VIRTIO_NET_F_CSUM);
 	add_feature(dev, VIRTIO_NET_F_GSO);
 
-	/* We don't need the socket any more; setup is done. */
-	close(ipfd);
-
 	verbose("device %u: tun net %u.%u.%u.%u\n",
-		devices.device_num++,
-		(u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
-	if (br_name)
-		verbose("attached to bridge: %s\n", br_name);
+		devices.device_num++, (u8)(priv->ip>>24), (u8)(priv->ip>>16),
+		(u8)(priv->ip>>8), (u8)priv->ip);
+	if (priv->bridge_name)
+		verbose("attached to bridge: %s\n", priv->bridge_name);
 }
 
 /* Our block (disk) device should be really simple: the Guest asks for a block
