---
 Documentation/lguest/lguest.c |  262 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 237 insertions(+), 25 deletions(-)

diff -r d803a2208052 Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Mon Mar 17 22:33:22 2008 +1100
+++ b/Documentation/lguest/lguest.c	Mon Mar 17 22:35:59 2008 +1100
@@ -276,6 +276,12 @@ static void unlink_memfile(void)
 	unlink(memfile_path);
 }
 
+/* A Launcher's memfile is $HOME/.lguest/<pid>; buffer must hold PATH_MAX. */
+static void guest_memfile(char *buffer, pid_t pid)
+{
+	snprintf(buffer, PATH_MAX, "%s/.lguest/%u", getenv("HOME") ?: "", pid);
+}
+
 /* map_zeroed_pages() takes a number of pages, and creates a mapping file where
  * this Guest's memory lives. */
 static void *map_zeroed_pages(unsigned int num)
@@ -289,9 +295,7 @@ static void *map_zeroed_pages(unsigned i
 	if (mkdir(memfile_path, S_IRWXU) != 0 && errno != EEXIST)
 		err(1, "Creating directory %s", memfile_path);
 
-	/* Name the memfiles by the process ID of this launcher. */
-	snprintf(memfile_path, PATH_MAX, "%s/.lguest/%u",
-		 getenv("HOME") ?: "", getpid());
+	guest_memfile(memfile_path, getpid());
 	fd = open(memfile_path, O_RDWR | O_CREAT | O_TRUNC, S_IRWXU);
 	if (fd < 0)
 		err(1, "Creating memory backing file %s", memfile_path);
@@ -1426,22 +1430,6 @@ static void setup_console(void)
 }
 /*:*/
 
-/*M:010 Inter-guest networking is an interesting area.  Simplest is to have a
- * --sharenet=<name> option which opens or creates a named pipe.  This can be
- * used to send packets to another guest in a 1:1 manner.
- *
- * More sopisticated is to use one of the tools developed for project like UML
- * to do networking.
- *
- * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
- * completely generic ("here's my vring, attach to your vring") and would work
- * for any traffic.  Of course, namespace and permissions issues need to be
- * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
- * multiple inter-guest channels behind one interface, although it would
- * require some manner of hotplugging new virtio channels.
- *
- * Finally, we could implement a virtio network switch in the kernel. :*/
-
 static void random_ether_addr(u8 *mac)
 {
 	int randfd = open_or_die("/dev/urandom", O_RDONLY);
@@ -1503,10 +1491,230 @@ static void setup_tun_net(const char *ar
 	if (priv->bridge_name)
 		verbose("attached to bridge: %s\n", priv->bridge_name);
 }
+/*:*/
 
-/* Our block (disk) device should be really simple: the Guest asks for a block
- * number and we read or write that position in the file.  Unfortunately, that
- * was amazingly slow: the Guest waits until the read is finished before
+struct sharenet_priv
+{
+	/* The fifo we write to in order to poke the other Launcher. */
+	int writefd;
+
+	/* The other Guest's send virtqueue, laid over its mapped memory. */
+	struct virtqueue_info vq_out;
+
+	/* What the other Launcher told us about itself over the fifo. */
+	struct sharenet_other {
+		unsigned int pid;		/* Launcher pid: names its memfile */
+		u16 vq_out_num;			/* Entries in its send vring */
+		unsigned long vq_out_addr;	/* Guest-physical addr of that vring */
+		struct guest_memory mem;	/* Its memory; we remap mem.base */
+	} other;
+};
+
+/* Copies as much of src[] as fits into dst[], altering the contents of both
+ * arrays.  Sets *totlen to bytes copied; returns true if all of src copied. */
+static bool iovec_copy(struct iovec *dst, unsigned int dst_num,
+		       struct iovec *src, unsigned int src_num,
+		       unsigned int *totlen)
+{
+	*totlen = 0;
+	/* Testing dst_num here (not just after using up a dst entry) means we
+	 * never touch dst[] when we're handed no dst entries at all. */
+	while (src_num && dst_num) {
+		unsigned int len = src->iov_len < dst->iov_len
+			? src->iov_len : dst->iov_len;
+		memcpy(dst->iov_base, src->iov_base, len);
+		*totlen += len;
+
+		src->iov_base += len;
+		src->iov_len -= len;
+		if (!src->iov_len) {
+			src++;
+			src_num--;
+		}
+
+		dst->iov_base += len;
+		dst->iov_len -= len;
+		if (!dst->iov_len) {
+			dst++;
+			dst_num--;
+		}
+	}
+	/* If we're out of dst room, it's only ok if we're out of src too. */
+	return src_num == 0;
+}
+
+static bool inter_iov_copy(struct iovec fiov[],
+			   unsigned int fout_num, unsigned int fin_num,
+			   struct iovec iov[],
+			   unsigned int out_num, unsigned int in_num,
+			   unsigned int *len)
+{
+	unsigned int sent, received;
+
+	/* Our output -> their input (unused by net code), then the reverse. */
+	if (!iovec_copy(&fiov[fout_num], fin_num, iov, out_num, &sent))
+		return false;
+	*len = sent;
+	if (!iovec_copy(&iov[out_num], in_num, fiov, fout_num, &received))
+		return false;
+	*len += received;
+	return true;
+}
+
+/* Fifo poke from the other Launcher: pull its Guest's packets into ours. */
+static bool handle_sharenet_input(int fd, struct device *dev)
+{
+	struct sharenet_priv *p = dev->priv;
+	struct virtqueue *vq = dev->vq;
+	struct iovec fiov[p->vq_out.vring.num], iov[vq->vqi.vring.num];
+	unsigned int fin_num, fout_num, in_num, out_num, len;
+	int fhead, head;
+	char c;
+	bool progress = false, filled = false;
+
+	if (read(dev->fd, &c, 1) != 1) {
+		warn("sharenet: failed to read from other Guest");
+		return false;
+	}
+
+	/* Look in other Guests' (ie. foreign) virtqueue. */
+	/* FIXME: Don't allow arbitrary bidir copies? */
+	while ((fhead = get_vq_desc(&p->vq_out, fiov, &fout_num, &fin_num))>=0){
+		/* Copy it into our receive queue, if there's a buffer free:
+		 * only then do we look at out_num and in_num. */
+		head = get_vq_desc(&vq->vqi, iov, &out_num, &in_num);
+		if (head < 0) {
+			/* We don't have room to take it, put it back. */
+			p->vq_out.last_avail_idx--;
+			filled = true;
+			break;
+		}
+		if (out_num)
+			errx(1, "Output buffers in network recv queue?");
+
+		if (!inter_iov_copy(fiov, fout_num, fin_num,
+				    iov, out_num, in_num, &len)) {
+			warnx("Inter-guest network copy failed: too long?");
+			p->vq_out.broken = true;
+			return false;
+		}
+		/* We used one buffer of ours, and one of theirs. */
+		add_used(&vq->vqi, head, len);
+		add_used(&p->vq_out, fhead, len);
+		progress = true;
+	}
+
+	if (progress) {
+		trigger_irq(fd, vq);
+		/* Tell them via the fifo.  FIXME: Only if they want notify. */
+		write(p->writefd, &c, 1);
+	}
+
+	/* If we filled up, return false: enable_fd will re-enable us. */
+	return !filled;
+}
+
+static void handle_sharenet_output(int fd, struct virtqueue *vq)
+{
+	struct sharenet_priv *priv = vq->dev->priv;
+	const char wakeup = 0;
+
+	/* Poke the other Launcher: our send queue has something for it. */
+	write(priv->writefd, &wakeup, 1);
+}
+
+/* --sharenet=<fifo>: pair up with another Launcher and map its Guest. */
+static void setup_sharenet(const char *arg)
+{
+	struct device *dev;
+	struct sharenet_priv *p = malloc(sizeof(*p));
+	int fd, readfd;
+	char other_memfile[PATH_MAX];
+	struct sharenet_other us;
+	/* Other fifo is the same, with _ appended: on the stack, so no leak. */
+	char other[strlen(arg) + 2];
+
+	if (!p)
+		err(1, "Allocating sharenet private data");
+	sprintf(other, "%s_", arg);
+
+	/* OK, if we're the first, we get to create it. */
+	if (mkfifo(arg, S_IRUSR|S_IWUSR) == 0) {
+		/* Open our own FIFO (unlinked once connected), then theirs. */
+		readfd = open_or_die(arg, O_RDONLY);
+		unlink(arg);
+		p->writefd = open_or_die(other, O_WRONLY);
+		unlink(other);
+	} else {
+		/* The other side got there first. */
+		if (errno != EEXIST)
+			err(1, "Creating sharenet fifo %s", arg);
+
+		/* OK, make the fifo for the other side to open. */
+		if (mkfifo(other, S_IRUSR|S_IWUSR) != 0)
+			err(1, "Creating second sharenet fifo %s", other);
+
+		/* Now, open their FIFO, then open ours.  We unlink even though
+		 * we didn't create it: redundancy is useful. */
+		p->writefd = open_or_die(arg, O_WRONLY);
+		unlink(arg);
+		readfd = open_or_die(other, O_RDONLY);
+		unlink(other);
+	}
+
+	/* Now set up the device. */
+	dev = new_device("sharenet", VIRTIO_ID_NET, readfd,
+			 handle_sharenet_input);
+	dev->priv = p;
+
+	/* Network devices need a receive and a send queue. */
+	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+	add_virtqueue(dev, VIRTQUEUE_NUM, handle_sharenet_output);
+
+	/* Tell the other end about ourselves. */
+	us.pid = getpid();
+	us.vq_out_addr = to_guest_phys(&gmem, dev->vq->next->vqi.vring.desc);
+	us.vq_out_num = dev->vq->next->vqi.vring.num;
+	us.mem = gmem;
+	if (write(p->writefd, &us, sizeof(us)) != sizeof(us))
+		err(1, "Writing to second sharenet fifo");
+
+	/* And, your hobbies are? */
+	if (read(readfd, &p->other, sizeof(p->other)) != sizeof(p->other))
+		err(1, "Reading info from sharenet fifo");
+
+	/* Map their memory file. */
+	guest_memfile(other_memfile, p->other.pid);
+	fd = open_or_die(other_memfile, O_RDWR);
+	p->other.mem.base = mmap(NULL, p->other.mem.limit,
+				 PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+	if (p->other.mem.base == MAP_FAILED)
+		err(1, "Failed to mmap other Guest's memory for sharenet");
+	close(fd);
+
+	/* Check for silly virtqueue stats (errx: errno means nothing here). */
+	if (p->other.vq_out_addr >= p->other.mem.limit
+	    ||p->other.vq_out_addr+vring_size(p->other.vq_out_num,getpagesize())
+	    >= p->other.mem.limit)
+		errx(1, "sharenet: other Guest gave %lu/%u for vq",
+		     p->other.vq_out_addr, p->other.vq_out_num);
+
+	p->vq_out.mem = &p->other.mem;
+	p->vq_out.last_avail_idx = 0;
+	p->vq_out.broken = false;
+	vring_init(&p->vq_out.vring, p->other.vq_out_num,
+		   from_guest_phys(&p->other.mem, p->other.vq_out_addr),
+		   getpagesize());
+
+	/* FIXME: make fifo non-blocking and ignore SIGPIPE, so the other guest
+	 * can neither freeze nor kill us on write. */
+	verbose("device %u: sharenet (%u at %p)\n", devices.device_num++,
+		p->other.pid, p->other.mem.base);
+}
+
+/*L:196 Our block (disk) device should be really simple: the Guest asks for a
+ * block number and we read or write that position in the file.  Unfortunately,
+ * that was amazingly slow: the Guest waits until the read is finished before
  * running anything else, even if it could have been doing useful work.
  *
  * We could use async I/O, except it's reputed to suck so hard that characters
@@ -1530,7 +1738,6 @@ struct vblk_info
 	 * Launcher triggers interrupt to Guest. */
 	int done_fd;
 };
-/*:*/
 
 /*L:210
  * The Disk
@@ -1732,7 +1939,7 @@ static void setup_block_file(const char 
 	close(vblk->workpipe[0]);
 
 	verbose("device %u: virtblock %llu sectors\n",
-		devices.device_num, le64_to_cpu(conf.capacity));
+		devices.device_num++, le64_to_cpu(conf.capacity));
 }
 /* That's the end of device setup. :*/
 
@@ -1794,7 +2001,7 @@ static void setup_rng(void)
 	/* The device has one virtqueue, where the Guest places inbufs. */
 	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
 
-	verbose("device %u: rng\n", devices.device_num);
+	verbose("device %u: rng\n", devices.device_num++);
 }
 /* That's the end of device setup. */
 
@@ -1851,6 +2058,7 @@ static struct option opts[] = {
 static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
 	{ "tunnet", 1, NULL, 't' },
+	{ "sharenet", 1, NULL, 's' },
 	{ "block", 1, NULL, 'b' },
 	{ "rng", 0, NULL, 'r' },
 	{ "initrd", 1, NULL, 'i' },
@@ -1860,6 +2068,7 @@ static void usage(void)
 {
 	errx(1, "Usage: lguest [--verbose] "
 	     "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
+	     "[--sharenet=<controlfile>]\n"
 	     "|--block=<filename>|--initrd=<filename>]...\n"
 	     "<mem-in-mb> vmlinux [args...]");
 }
@@ -1932,6 +2141,9 @@ int main(int argc, char *argv[])
 			break;
 		case 'i':
 			initrd_name = optarg;
+			break;
+		case 's':
+			setup_sharenet(optarg);
 			break;
 		default:
 			warnx("Unknown argument %s", argv[optind]);
