Before:
	1Gping-throughput.sh: Seconds: 16
	1Gping-throughput.sh: Packets: 1000002,999891
	1Gping-throughput.sh: Interrupts: 967592,1
	1Gping-throughput.sh: Notifications: 15384(0),7(999999)
	Net IRQs triggered: 968286(31715),1(7)
	1Mping.sh: Seconds: 43
	1Mping.sh: Packets: 1000003,1000006
	1Mping.sh: Interrupts: 1000003,1
	1Mping.sh: Notifications: 15385(0),997460(2547)
	Net IRQs triggered: 1000007(0),1(997460)

After:
	1Gping-throughput.sh: Seconds: 20
	1Gping-throughput.sh: Packets: 1000001,1000004
	1Gping-throughput.sh: Interrupts: 966239,1
	1Gping-throughput.sh: Notifications: 1089(14275),963192(36813)
	Net IRQs triggered: 966443(12192),1(962475)
	1Mping.sh: Seconds: 35
	1Mping.sh: Packets: 1000001,1000004
	1Mping.sh: Interrupts: 1000001,1
	1Mping.sh: Notifications: 15385(0),1000004(1)
	Net IRQs triggered: 1000001(0),1(1000004)
---
 Documentation/lguest/lguest.c |  360 ++++++++++++++++++++++++------------------
 1 file changed, 214 insertions(+), 146 deletions(-)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -26,7 +26,11 @@
 #include <sys/time.h>
 #include <time.h>
 #include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
 #include <net/if.h>
+#include <net/if_arp.h>
+#include <net/ethernet.h>
 #include <linux/sockios.h>
 #include <linux/if_tun.h>
 #include <sys/uio.h>
@@ -798,19 +802,125 @@ static void console_output(struct virtqu
 /*
  * The Network
  *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
+ * This is a fake network; we only handle ARP and pings.  Enough to show
+ * how much of our speed is lost writing to / reading from tun/tap and the
+ * kernel network stack.
  */
 struct net_info {
-	int tunfd;
+	/* Loopback pipe: net_output writes replies to p[1], net_input reads p[0]. */
+	int p[2];
+	/* Our (fake) hardware address, used as the source of every reply. */
+	unsigned char ethaddr[ETH_ALEN];
+	unsigned char ipaddr[4];	/* Our IP address, in network byte order. */
 };
 
+static inline unsigned short from32to16(unsigned long x)
+{
+	/* first fold: high bits plus low 16 bits gives 16 bits and a carry */
+	unsigned long folded = (x >> 16) + (x & 0xffff);
+	/* second fold absorbs that carry; the return truncates to 16 bits */
+	folded = (folded >> 16) + (folded & 0xffff);
+	return folded;
+}
+
+static unsigned int csum_fold(unsigned int sum)
+{
+	return ~from32to16(sum);	/* only the low 16 bits are meaningful */
+}
+
+/* 16-bit ones-complement sum; odd-byte handling assumes a little-endian host. */
+static unsigned long do_csum(const unsigned char * buff, int len)
+{
+	int odd, count;
+	unsigned int result = 0;	/* exactly 32 bits, so the carry test works */
+
+	if (len <= 0)
+		return 0;
+
+	odd = 1 & (unsigned long) buff;
+	if (odd) {
+		result = *buff << 8;	/* odd start: high half of its word */
+		len--;
+		buff++;
+	}
+	count = len >> 1;		/* nr of 16-bit words.. */
+	if (count) {
+		if (2 & (unsigned long) buff) {
+			result += *(unsigned short *) buff;
+			count--;
+			len -= 2;
+			buff += 2;
+		}
+		count >>= 1;		/* nr of 32-bit words.. */
+		if (count) {
+			unsigned int carry = 0;
+			do {
+				unsigned int w = *(unsigned int *) buff;
+				count--;
+				buff += 4;
+				result += carry;
+				result += w;
+				carry = (w > result);
+			} while (count);
+			result += carry;
+			result = (result & 0xffff) + (result >> 16);
+		}
+		if (len & 2) {
+			result += *(unsigned short *) buff;
+			buff += 2;
+		}
+	}
+	if (len & 1)
+		result += *buff;	/* trailing byte: low half, zero-padded */
+	result = from32to16(result);
+	if (odd)	/* sum was built one byte out of phase: swap it back */
+		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+	return result;
+}
+
+static unsigned int csum_partial(const void * buff, int len, unsigned int sum)
+{
+	/* checksum of the buffer, truncated to 32 bits, plus the old sum */
+	unsigned int total = (unsigned int)do_csum(buff, len) + sum;
+
+	/* ending up below the old sum means the addition wrapped: add 1 back */
+	if (total < sum)
+		total++;
+	return total;
+}
+
+static void csum_replace(__u16 *sum, u32 from, u32 to)
+{
+	u32 diff[] = { ~from, to };	/* adding ~from removes the old word */
+	*sum = csum_fold(csum_partial(diff, sizeof(diff), *sum ^ 0xFFFF));
+}
+
+/* Flatten iovnum iovecs into buffer (at most buflen bytes); returns the total. */
+static unsigned int copy_iov(char *buffer,
+			     const struct iovec *iov, unsigned iovnum,
+			     unsigned buflen)
+{
+	unsigned int len = 0;
+
+	for (; iovnum; iovnum--, iov++) {
+		/* len <= buflen here, so this cannot wrap as len + iov_len could. */
+		if (iov->iov_len > buflen - len)
+			errx(1, "iovec too long");
+		memcpy(buffer + len, iov->iov_base, iov->iov_len);
+		len += iov->iov_len;
+	}
+
+	return len;
+}
+
 static void net_output(struct virtqueue *vq)
 {
 	struct net_info *net_info = vq->dev->priv;
 	unsigned int head, out, in;
-	int len;
 	struct iovec iov[vq->vring.num];
+	struct ether_header *ehdr;
+	char pkt[2 + sizeof(struct virtio_net_hdr) + 1514];	/* len prefix + hdr + frame */
+	unsigned int len;
 
 	vq->irq = &xmit_irq;
 	vq->irq_suppressed = &xmit_irq_suppressed;
@@ -818,10 +928,66 @@ static void net_output(struct virtqueue 
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (in)
 		errx(1, "Input buffers in net output queue?");
-	len = writev(net_info->tunfd, iov, out);
-	if (len < 0)
-		errx(1, "Write to tun failed?");
-	add_used(vq, head, len);
+
+	/* Flatten the packet after the 2-byte length slot at the start of pkt. */
+	len = copy_iov(pkt+2, iov, out, sizeof(pkt) - 2);
+	if (len < sizeof(struct virtio_net_hdr) + 14)
+		errx(1, "Short packet %u", len);
+
+	ehdr = (struct ether_header *)(pkt + 2 + sizeof(struct virtio_net_hdr));
+	len -= sizeof(struct virtio_net_hdr);
+	if (ehdr->ether_type == htons(ETHERTYPE_ARP)) {
+		struct arphdr *arp = (struct arphdr *)(ehdr + 1);
+		if (len != 14 + 28)
+			errx(1, "Bad arp length %u", len);
+		if (arp->ar_op != htons(ARPOP_REQUEST))
+			errx(1, "Bad arp op %u", ntohs(arp->ar_op));
+
+		/* Turn it into a reply. */
+		arp->ar_op = htons(ARPOP_REPLY);
+		/* Move sender address to target address. */
+		memcpy((void *)(arp + 1) + 10, arp + 1, 10);
+		/* Copy in our ethaddr & IP */
+		memcpy(arp + 1, net_info->ethaddr, ETH_ALEN);
+		memcpy((void *)(arp + 1) + ETH_ALEN, net_info->ipaddr, 4);
+	} else if (ehdr->ether_type == htons(ETHERTYPE_IP)) {
+		struct iphdr *ip = (struct iphdr *)(ehdr + 1);
+		struct icmphdr *icmp = (struct icmphdr *)(ip + 1);
+		u32 old;
+
+		/* We do see some spurious broadcasts. */
+		if (memcmp(&ip->daddr, net_info->ipaddr, 4) != 0)
+			goto consume;
+		/* Anything other than an ICMP echo request is unsupported. */
+		if (ip->protocol != IPPROTO_ICMP)
+			errx(1, "Bad IP protocol %u", ip->protocol);
+		if (icmp->type != ICMP_ECHO)
+			errx(1, "Bad ICMP typ %u", icmp->type);
+
+		/* Change it to a reply, fix csum. */
+		old = *(u32 *)icmp;
+		icmp->type = ICMP_ECHOREPLY;
+		csum_replace(&icmp->checksum, old, *(u32 *)icmp);
+
+		/* Fix IP header (doesn't affect csum!) */
+		old = ip->daddr;
+		ip->daddr = ip->saddr;
+		ip->saddr = old;
+	} else
+		errx(1, "Unknown ether type %u", ntohs(ehdr->ether_type));
+
+	/* Flip ethernet header: copy source ethaddr to dst... */
+	memcpy(ehdr->ether_dhost, ehdr->ether_shost, ETH_ALEN);
+	/* ...and our ethaddr to source ethaddr. */
+	memcpy(ehdr->ether_shost, net_info->ethaddr, ETH_ALEN);
+
+	/* We put the length first, to packetize. */
+	*(u16 *)pkt = len + sizeof(struct virtio_net_hdr);
+	if (write(net_info->p[1], pkt, 2 + sizeof(struct virtio_net_hdr) + len)
+	    != 2 + sizeof(struct virtio_net_hdr) + len)
+		err(1, "Writing to pipe");
+consume:
+	add_used(vq, head, len+sizeof(struct virtio_net_hdr));
 }
 
 /* Will reading from this file descriptor block? */
@@ -834,6 +1000,17 @@ static bool will_block(int fd)
 	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
 }
 
+static void trim_iov(struct iovec *iov, unsigned num_iov, unsigned len)
+{
+	struct iovec *end = iov + num_iov;
+	/* Clamp each element to what is still wanted; the remainder become 0. */
+	for (; iov != end; iov++) {
+		if (len < iov->iov_len)
+			iov->iov_len = len;
+		len -= iov->iov_len;
+	}
+}
+
 /* This is where we handle packets coming in from the tun device to our
  * Guest. */
 static void net_input(struct virtqueue *vq)
@@ -841,6 +1018,7 @@ static void net_input(struct virtqueue *
 	int len;
 	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
+	u16 pkt_len;
 	struct net_info *net_info = vq->dev->priv;
 
 	vq->irq = &recv_irq;
@@ -851,12 +1029,18 @@ static void net_input(struct virtqueue *
 		errx(1, "Output buffers in net input queue?");
 
 	/* Deliver interrupt now, since we're about to sleep. */
-	if (vq->pending_used && will_block(net_info->tunfd))
+	if (vq->pending_used && will_block(net_info->p[0]))
 		trigger_irq(vq);
 
-	len = readv(net_info->tunfd, iov, in);
-	if (len <= 0)
-		err(1, "Failed to read from tun.");
+	if (read(net_info->p[0], &pkt_len, 2) != 2)
+		err(1, "Failed to read length from pipe.");
+
+	/* Don't read in more than the one packet! */
+	trim_iov(iov, in, pkt_len);
+
+	len = readv(net_info->p[0], iov, in);
+	if (len != pkt_len)
+		err(1, "Failed to read from pipe: %u vs %u", len, pkt_len);
 	add_used(vq, head, len);
 }
 
@@ -1303,110 +1487,17 @@ static u32 str2ip(const char *ipaddr)
 	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
 }
 
-static void str2mac(const char *macaddr, unsigned char mac[6])
-{
-	unsigned int m[6];
-	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
-		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
-		errx(1, "Failed to parse mac address '%s'", macaddr);
-	mac[0] = m[0];
-	mac[1] = m[1];
-	mac[2] = m[2];
-	mac[3] = m[3];
-	mac[4] = m[4];
-	mac[5] = m[5];
-}
-
-/* This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it. */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-	int ifidx;
-	struct ifreq ifr;
-
-	if (!*br_name)
-		errx(1, "must specify bridge name");
-
-	ifidx = if_nametoindex(if_name);
-	if (!ifidx)
-		errx(1, "interface %s does not exist!", if_name);
-
-	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-	ifr.ifr_name[IFNAMSIZ-1] = '\0';
-	ifr.ifr_ifindex = ifidx;
-	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
-		err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/* This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer. */
-static void configure_device(int fd, const char *tapif, u32 ipaddr)
-{
-	struct ifreq ifr;
-	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
-
-	memset(&ifr, 0, sizeof(ifr));
-	strcpy(ifr.ifr_name, tapif);
-
-	/* Don't read these incantations.  Just cut & paste them like I did! */
-	sin->sin_family = AF_INET;
-	sin->sin_addr.s_addr = htonl(ipaddr);
-	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-		err(1, "Setting %s interface address", tapif);
-	ifr.ifr_flags = IFF_UP;
-	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-		err(1, "Bringing interface %s up", tapif);
-}
-
-static int get_tun_device(char tapif[IFNAMSIZ])
-{
-	struct ifreq ifr;
-	int netfd;
-
-	/* Start with this zeroed.  Messy but sure. */
-	memset(&ifr, 0, sizeof(ifr));
-
-	/* We open the /dev/net/tun device and tell it we want a tap device.  A
-	 * tap device is like a tun device, only somehow different.  To tell
-	 * the truth, I completely blundered my way through this code, but it
-	 * works now! */
-	netfd = open_or_die("/dev/net/tun", O_RDWR);
-	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
-	strcpy(ifr.ifr_name, "tap%d");
-	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-		err(1, "configuring /dev/net/tun");
-
-	if (ioctl(netfd, TUNSETOFFLOAD,
-		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
-		err(1, "Could not set features for tun device");
-
-	/* We don't need checksums calculated for packets coming in this
-	 * device: trust us! */
-	ioctl(netfd, TUNSETNOCSUM, 1);
-
-	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
-	return netfd;
-}
-
-/*L:195 Our network is a Host<->Guest network.  This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card.  We
- * just shunt packets between the Guest and the tun device. */
-static void setup_tun_net(char *arg)
+/*L:195 Our network is a Host<->Guest network.
+ * This one is a dummy; it just replies to arps and pings. */
+static void setup_dummy_net(char *arg)
 {
 	struct device *dev;
 	struct net_info *net_info = malloc(sizeof(*net_info));
-	int ipfd;
-	u32 ip = INADDR_ANY;
-	bool bridging = false;
-	char tapif[IFNAMSIZ], *p;
 	struct virtio_net_config conf;
+	u32 ip;
 
-	net_info->tunfd = get_tun_device(tapif);
+	if (pipe(net_info->p) != 0)
+		err(1, "Creating net pipe");
 
 	/* First we create a new network device. */
 	dev = new_device("net", VIRTIO_ID_NET);
@@ -1417,34 +1508,17 @@ static void setup_tun_net(char *arg)
 	add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
 	add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
 
-	/* We need a socket to perform the magic network ioctls to bring up the
-	 * tap interface, connect to the bridge etc.  Any socket will do! */
-	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (ipfd < 0)
-		err(1, "opening IP socket");
+	/* Set IP address. */
+	ip = htonl(str2ip(arg));
+	memcpy(net_info->ipaddr, &ip, 4);
 
-	/* If the command line was --tunnet=bridge:<name> do bridging. */
-	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-		arg += strlen(BRIDGE_PFX);
-		bridging = true;
-	}
-
-	/* A mac address may follow the bridge name or IP address */
-	p = strchr(arg, ':');
-	if (p) {
-		str2mac(p+1, conf.mac);
-		add_feature(dev, VIRTIO_NET_F_MAC);
-		*p = '\0';
-	}
-
-	/* arg is now either an IP address or a bridge name */
-	if (bridging)
-		add_to_bridge(ipfd, tapif, arg);
-	else
-		ip = str2ip(arg);
-
-	/* Set up the tun device. */
-	configure_device(ipfd, tapif, ip);
+	/* Set ethernet address: no multicast bit, but local assignment set */
+	net_info->ethaddr[0] = 0xfe;
+	net_info->ethaddr[1] = 104;
+	net_info->ethaddr[2] = 97;
+	net_info->ethaddr[3] = 99;
+	net_info->ethaddr[4] = 107;
+	net_info->ethaddr[5] = 0;
 
 	/* Expect Guest to handle everything except UFO */
 	add_feature(dev, VIRTIO_NET_F_CSUM);
@@ -1459,13 +1533,7 @@ static void setup_tun_net(char *arg)
 	add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
 	set_config(dev, sizeof(conf), &conf);
 
-	/* We don't need the socket any more; setup is done. */
-	close(ipfd);
-
-	if (bridging)
-		finalize_device(dev, " attached to bridge %s", arg);
-	else
-		finalize_device(dev, " address %s", arg);
+	finalize_device(dev, " address %s", arg);
 }
 
 /* Our block (disk) device should be really simple: the Guest asks for a block
@@ -1805,7 +1873,7 @@ int main(int argc, char *argv[])
 			verbose = true;
 			break;
 		case 't':
-			setup_tun_net(optarg);
+			setup_dummy_net(optarg);
 			break;
 		case 'b':
 			setup_block_file(optarg);
