Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (45 commits) Use "struct boot_params" in example launcher Loading bzImage directly. Revert lguest magic and use hook in head.S Update lguest documentation to reflect the new virtual block device name. generalize lgread_u32/lgwrite_u32. Example launcher handle guests not being ready for input Update example launcher for virtio Lguest support for Virtio Remove old lguest I/O infrastructure. Remove old lguest bus and drivers. Virtio helper routines for a descriptor ringbuffer implementation Module autoprobing support for virtio drivers. Virtio console driver Block driver using virtio. Net driver using virtio Virtio interface Boot with virtual == physical to get closer to native Linux. Allow guest to specify syscall vector to use. Rename "cr3" to "gpgdir" to avoid x86-specific naming. Pagetables to use normal kernel types ...
2026-01-06 10:13:00 -08:00 · 2007-10-23 09:03:07 -07:00
parent a98ce5c6fe 43d33b21a0
commit 0d6810091c
70 changed files with 4817 additions and 4396 deletions
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
 # This creates the demonstration utility "lguest" which runs a Linux guest.
-
-# For those people that have a separate object dir, look there for .config
-KBUILD_OUTPUT := ../..
-ifdef O
-  ifeq ("$(origin O)", "command line")
-    KBUILD_OUTPUT := $(O)
-  endif
-endif
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-include $(KBUILD_OUTPUT)/.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
 LDLIBS:=-lz
-# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
-# not others (eg. FC7).
-LDFLAGS+=-static
-all: lguest.lds lguest

-# The linker script on x86 is so complex the only way of creating one
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
-	$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+all: lguest

 clean:
-	rm -f lguest.lds lguest
+	rm -f lguest
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
 Linux developers and users to experiment with virtualization with the
 minimum of complexity.  Nonetheless, it should have sufficient
 features to make it useful for specific tasks, and, of course, you are
-encouraged to fork and enhance it.
+encouraged to fork and enhance it (see drivers/lguest/README).

 Features:

@@ -23,19 +23,30 @@ Developer features:

 Running Lguest:

- Lguest runs the same kernel as guest and host.  You can configure
-  them differently, but usually it's easiest not to.
+- The easiest way to run lguest is to use the same kernel as guest and host.
+  You can configure them differently, but usually it's easiest not to.

  You will need to configure your kernel with the following options:

-  CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
-  CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
-  CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
-  CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
-  CONFIG_LGUEST=y/m ("Linux hypervisor example code")
+  "General setup":
+     "Prompt for development and/or incomplete code/drivers" = Y
+        (CONFIG_EXPERIMENTAL=y)

-  and I recommend:
-  CONFIG_HZ=100 ("Timer frequency")[2]
+  "Processor type and features":
+     "Paravirtualized guest support" = Y
+        "Lguest guest support" = Y
+     "High Memory Support" = off/4GB
+     "Alignment value to which kernel should be aligned" = 0x100000
+        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+         CONFIG_PHYSICAL_ALIGN=0x100000)
+
+  "Device Drivers":
+     "Network device support"
+        "Universal TUN/TAP device driver support" = M/Y
+           (CONFIG_TUN=m)
+     "Virtualization"
+        "Linux hypervisor example code" = M/Y
+           (CONFIG_LGUEST=m)

 - A tool called "lguest" is available in this directory: type "make"
  to build it.  If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
 	  dd if=/dev/zero of=rootfile bs=1M count=2048
 	  qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d

+  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+  console!
+
 - "modprobe lg" if you built it as a module.

 - Run an lguest as root:

-      Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
+      Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda

   Explanation:
-    64m: the amount of memory to use.
+    64: the amount of memory to use, in MB.

    vmlinux: the kernel image found in the top of your build directory.  You
       can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
    --tunnet=192.168.19.1: configures a "tap" device for networking with this
       IP address.

-    --block=rootfile: a file or block device which becomes /dev/lgba
+    --block=rootfile: a file or block device which becomes /dev/vda
       inside the guest.

-    root=/dev/lgba: this (and anything else on the command line) are
+    root=/dev/vda: this (and anything else on the command line) are
       kernel boot parameters.

 - Configuring networking.  I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
  "--sharenet=<filename>": any two guests using the same file are on
  the same network.  This file is created if it does not exist.

-Lguest I/O model:
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest

-Lguest uses a simplified DMA model plus shared memory for I/O.  Guests
-can communicate with each other if they share underlying memory
-(usually by the lguest program mmaping the same file), but they can
-use any non-shared memory to communicate with the lguest process.
-
-Guests can register DMA buffers at any key (must be a valid physical
-address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
-hypercall.  "dmabufs" is the physical address of an array of "num"
-"struct lguest_dma": each contains a used_len, and an array of
-physical addresses and lengths.  When a transfer occurs, the
-"used_len" field of one of the buffers which has used_len 0 will be
-set to the length transferred and the irq will fire.
-
-Using an irq value of 0 unbinds the dma buffers.
-
-To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
-and the bytes used is written to the used_len field.  This can be 0 if
-noone else has bound a DMA buffer to that key or some other error.
-DMA buffers bound by the same guest are ignored.
-
-Cheers!
+Good luck!
 Rusty Russell rusty@rustcorp.com.au.
-
-[1] These are on various places on the TODO list, waiting for you to
-    get annoyed enough at the limitation to fix it.
-[2] Lguest is not yet tickless when idle.  See [1].
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -227,28 +227,40 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
 	  If in doubt, say "Y".

 config PARAVIRT
-	bool "Paravirtualization support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool
 	depends on !(X86_VISWS || X86_VOYAGER)
 	help
-	  Paravirtualization is a way of running multiple instances of
-	  Linux on the same machine, under a hypervisor.  This option
-	  changes the kernel so it can modify itself when it is run
-	  under a hypervisor, improving performance significantly.
-	  However, when run without a hypervisor the kernel is
-	  theoretically slower.  If in doubt, say N.
+	  This changes the kernel so it can modify itself when it is run
+	  under a hypervisor, potentially improving performance significantly
+	  over full virtualization.  However, when run without a hypervisor
+	  the kernel is theoretically slower and slightly larger.
+
+menuconfig PARAVIRT_GUEST
+	bool "Paravirtualized guest support"
+	help
+	  Say Y here to get to see options related to running Linux under
+	  various hypervisors.  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and disabled.
+
+if PARAVIRT_GUEST

 source "arch/x86/xen/Kconfig"

 config VMI
-	bool "VMI Paravirt-ops support"
-	depends on PARAVIRT
+	bool "VMI Guest support"
+	select PARAVIRT
+	depends on !(X86_VISWS || X86_VOYAGER)
 	help
 	  VMI provides a paravirtualized interface to the VMware ESX server
 	  (it could be used by other hypervisors in theory too, but is not
 	  at the moment), by linking the kernel to a GPL-ed ROM module
 	  provided by the hypervisor.

+source "arch/x86/lguest/Kconfig"
+
+endif
+
 config ACPI_SRAT
 	bool
 	default y
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000)	:= arch/x86/mach-es7000/
 # Xen paravirtualization support
 core-$(CONFIG_XEN)		+= arch/x86/xen/

+# lguest paravirtualization support
+core-$(CONFIG_LGUEST_GUEST)	+= arch/x86/lguest/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-x86/mach-default

--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -136,6 +136,7 @@ void foo(void)
 #ifdef CONFIG_LGUEST_GUEST
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
 	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
 	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -0,0 +1,14 @@
+config LGUEST_GUEST
+	bool "Lguest guest support"
+	select PARAVIRT
+	depends on !X86_PAE
+	select VIRTIO
+	select VIRTIO_RING
+	select VIRTIO_CONSOLE
+	help
+	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
+	  allow your kernel to boot under lguest.  This option will increase
+	  your kernel size by about 6k.  If in doubt, say N.
+
+	  If you say Y here, make sure you say Y (or M) to the virtio block
+	  and net drivers which lguest needs.
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -0,0 +1 @@
+obj-y		:= i386_head.o boot.o
--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -55,7 +55,7 @@
 #include <linux/clockchips.h>
 #include <linux/lguest.h>
 #include <linux/lguest_launcher.h>
-#include <linux/lguest_bus.h>
+#include <linux/virtio_console.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -65,6 +65,7 @@
 #include <asm/e820.h>
 #include <asm/mce.h>
 #include <asm/io.h>
+#include <asm/i387.h>

 /*G:010 Welcome to the Guest!
 *
@@ -85,9 +86,10 @@ struct lguest_data lguest_data = {
 	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
 	.noirq_start = (u32)lguest_noirq_start,
 	.noirq_end = (u32)lguest_noirq_end,
+	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
+	.syscall_vec = SYSCALL_VECTOR,
 };
-struct lguest_device_desc *lguest_devices;
 static cycle_t clock_base;

 /*G:035 Notice the lazy_hcall() above, rather than hcall().  This is our first
@@ -146,10 +148,10 @@ void async_hcall(unsigned long call,
 		/* Table full, so do normal hcall which will flush table. */
 		hcall(call, arg1, arg2, arg3);
 	} else {
-		lguest_data.hcalls[next_call].eax = call;
-		lguest_data.hcalls[next_call].edx = arg1;
-		lguest_data.hcalls[next_call].ebx = arg2;
-		lguest_data.hcalls[next_call].ecx = arg3;
+		lguest_data.hcalls[next_call].arg0 = call;
+		lguest_data.hcalls[next_call].arg1 = arg1;
+		lguest_data.hcalls[next_call].arg2 = arg2;
+		lguest_data.hcalls[next_call].arg3 = arg3;
 		/* Arguments must all be written before we mark it to go */
 		wmb();
 		lguest_data.hcall_status[next_call] = 0;
@@ -160,46 +162,6 @@ void async_hcall(unsigned long call,
 }
 /*:*/

-/* Wrappers for the SEND_DMA and BIND_DMA hypercalls.  This is mainly because
- * Jeff Garzik complained that __pa() should never appear in drivers, and this
- * helps remove most of them.   But also, it wraps some ugliness. */
-void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
-{
-	/* The hcall might not write this if something goes wrong */
-	dma->used_len = 0;
-	hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
-}
-
-int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
-		    unsigned int num, u8 irq)
-{
-	/* This is the only hypercall which actually wants 5 arguments, and we
-	 * only support 4.  Fortunately the interrupt number is always less
-	 * than 256, so we can pack it with the number of dmas in the final
-	 * argument.  */
-	if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
-		return -ENOMEM;
-	return 0;
-}
-
-/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
-void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
-{
-	hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
-}
-
-/* For guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse. */
-void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
-	return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
-}
-
-void lguest_unmap(void *addr)
-{
-	iounmap((__force void __iomem *)addr);
-}
-
 /*G:033
 * Here are our first native-instruction replacements: four functions for
 * interrupt control.
@@ -680,6 +642,7 @@ static struct clocksource lguest_clock = {
 	.mask		= CLOCKSOURCE_MASK(64),
 	.mult		= 1 << 22,
 	.shift		= 22,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 };

 /* The "scheduler clock" is just our real clock, adjusted to start at zero */
@@ -761,11 +724,9 @@ static void lguest_time_init(void)
 	 * the TSC, otherwise it's a dumb nanosecond-resolution clock.  Either
 	 * way, the "rating" is initialized so high that it's always chosen
 	 * over any other clocksource. */
-	if (lguest_data.tsc_khz) {
+	if (lguest_data.tsc_khz)
 		lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
 							 lguest_clock.shift);
-		lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
-	}
 	clock_base = lguest_clock_read();
 	clocksource_register(&lguest_clock);

@@ -889,6 +850,23 @@ static __init char *lguest_memory_setup(void)
 	return "LGUEST";
 }

+/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to
+ * produce console output. */
+static __init int early_put_chars(u32 vtermno, const char *buf, int count)
+{
+	char scratch[17];
+	unsigned int len = count;
+
+	if (len > sizeof(scratch) - 1)
+		len = sizeof(scratch) - 1;
+	scratch[len] = '\0';
+	memcpy(scratch, buf, len);
+	hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
+
+	/* This routine returns the number of bytes actually written. */
+	return len;
+}
+
 /*G:050
 * Patching (Powerfully Placating Performance Pedants)
 *
@@ -950,18 +928,8 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
 /*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
 * structures in the kernel provide points for (almost) every routine we have
 * to override to avoid privileged instructions. */
-__init void lguest_init(void *boot)
+__init void lguest_init(void)
 {
-	/* Copy boot parameters first: the Launcher put the physical location
-	 * in %esi, and head.S converted that to a virtual address and handed
-	 * it to us.  We use "__memcpy" because "memcpy" sometimes tries to do
-	 * tricky things to go faster, and we're not ready for that. */
-	__memcpy(&boot_params, boot, PARAM_SIZE);
-	/* The boot parameters also tell us where the command-line is: save
-	 * that, too. */
-	__memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
-	       COMMAND_LINE_SIZE);
-
 	/* We're under lguest, paravirt is enabled, and we're running at
 	 * privilege level 1, not 0 as normal. */
 	pv_info.name = "lguest";
@@ -1033,11 +1001,7 @@ __init void lguest_init(void *boot)

 	/*G:070 Now we've seen all the paravirt_ops, we return to
 	 * lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 *
-	 * The Host expects our first hypercall to tell it where our "struct
-	 * lguest_data" is, so we do that first. */
-	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	 * occurs. */

 	/* The native boot code sets up initial page tables immediately after
 	 * the kernel itself, and sets init_pg_tables_end so they're not
@@ -1050,11 +1014,6 @@ __init void lguest_init(void *boot)
 	 * the normal data segment to get through booting. */
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");

-	/* Clear the part of the kernel data which is expected to be zero.
-	 * Normally it will be anyway, but if we're loading from a bzImage with
-	 * CONFIG_RELOCATALE=y, the relocations will be sitting here. */
-	memset(__bss_start, 0, __bss_stop - __bss_start);
-
 	/* The Host uses the top of the Guest's virtual address space for the
 	 * Host<->Guest Switcher, and it tells us how much it needs in
 	 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
@@ -1092,6 +1051,9 @@ __init void lguest_init(void *boot)
 	 * adapted for lguest's use. */
 	add_preferred_console("hvc", 0, NULL);

+	/* Register our very early console. */
+	virtio_cons_early_init(early_put_chars);
+
 	/* Last of all, we set the power management poweroff hook to point to
 	 * the Guest routine to power off. */
 	pm_power_off = lguest_power_off;
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -1,25 +1,47 @@
 #include <linux/linkage.h>
 #include <linux/lguest.h>
+#include <asm/lguest_hcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/processor-flags.h>

-/*G:020 This is where we begin: we have a magic signature which the launcher
- * looks for.  The plan is that the Linux boot protocol will be extended with a
- * "platform type" field which will guide us here from the normal entry point,
- * but for the moment this suffices.  The normal boot code uses %esi for the
- * boot header, so we do too.  We convert it to a virtual address by adding
- * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
+/*G:020 This is where we begin: head.S notes that the boot header's platform
+ * type field is "1" (lguest), so calls us here.  The boot header is in %esi.
+ *
+ * WARNING: be very careful here!  We're running at addresses equal to physical
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
+ * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
+ * data.
 *
 * The .section line puts this code in .init.text so it will be discarded after
 * boot. */
 .section .init.text, "ax", @progbits
-.ascii "GenuineLguest"
-	/* Set up initial stack. */
- 	movl $(init_thread_union+THREAD_SIZE),%esp
-	movl %esi, %eax
-	addl $__PAGE_OFFSET, %eax
-	jmp lguest_init
+ENTRY(lguest_entry)
+	/* Make initial hypercall now, so we can set up the pagetables. */
+	movl $LHCALL_LGUEST_INIT, %eax
+	movl $lguest_data - __PAGE_OFFSET, %edx
+	int $LGUEST_TRAP_ENTRY
+
+	/* The Host put the toplevel pagetable in lguest_data.pgdir.  The movsl
+	 * instruction uses %esi implicitly. */
+	movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
+
+	/* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
+	 * This means the first 128M of kernel memory will be mapped at
+	 * PAGE_OFFSET where the kernel expects to run.  This will get it far
+	 * enough through boot to switch to its own pagetables. */
+	movl $32, %ecx
+	movl %esi, %edi
+	addl $((__PAGE_OFFSET >> 22) * 4), %edi
+	rep
+	movsl
+
+	/* Set up the initial stack so we can run C code. */
+	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
+	 * moment. */
+	jmp lguest_init+__PAGE_OFFSET

 /*G:055 We create a macro which puts the assembler code between lgstart_ and
 * lgend_ markers.  These templates are put in the .text section: they can't be
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -3,8 +3,9 @@
 #

 config XEN
-	bool "Enable support for Xen hypervisor"
-	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
+	bool "Xen guest support"
+	select PARAVIRT
+	depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -94,5 +94,5 @@ source "drivers/kvm/Kconfig"

 source "drivers/uio/Kconfig"

-source "drivers/lguest/Kconfig"
+source "drivers/virtio/Kconfig"
 endmenu
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -91,3 +91,4 @@ obj-$(CONFIG_HID)		+= hid/
 obj-$(CONFIG_PPC_PS3)		+= ps3/
 obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
+obj-$(CONFIG_VIRTIO)		+= virtio/
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -425,4 +425,10 @@ config XEN_BLKDEV_FRONTEND
 	  block device driver.  It communicates with a back-end driver
 	  in another domain which drives the actual block device.

+config VIRTIO_BLK
+	tristate "Virtio block driver (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && VIRTIO
+	---help---
+	  This is the virtual block driver for lguest.  Say Y or M.
+
 endif # BLK_DEV
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -25,10 +25,10 @@ obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
+obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o

 obj-$(CONFIG_VIODASD)		+= viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
 obj-$(CONFIG_BLK_DEV_UB)	+= ub.o

 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
-obj-$(CONFIG_LGUEST_BLOCK)	+= lguest_blk.o
--- a/drivers/block/lguest_blk.c
+++ b/drivers/block/lguest_blk.c
@@ -1,421 +0,0 @@
-/*D:400
- * The Guest block driver
- *
- * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
- * The mechanism is simple: we place the information about the request in the
- * device page, then use SEND_DMA (containing the data for a write, or an empty
- * "ping" DMA for a read).
- :*/
-/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-//#define DEBUG
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/lguest_bus.h>
-
-static char next_block_index = 'a';
-
-/*D:420 Here is the structure which holds all the information we need about
- * each Guest block device.
- *
- * I'm sure at this stage, you're wondering "hey, where was the adventure I was
- * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
- * my blog".  I think Real adventures have boring bits, too, and you're in the
- * middle of one.  But it gets better.  Just not quite yet. */
-struct blockdev
-{
-	/* The block queue infrastructure wants a spinlock: it is held while it
-	 * calls our block request function.  We grab it in our interrupt
-	 * handler so the responses don't mess with new requests. */
-	spinlock_t lock;
-
-	/* The disk structure registered with kernel. */
-	struct gendisk *disk;
-
-	/* The major device number for this disk, and the interrupt.  We only
-	 * really keep them here for completeness; we'd need them if we
-	 * supported device unplugging. */
-	int major;
-	int irq;
-
-	/* The physical address of this device's memory page */
-	unsigned long phys_addr;
-	/* The mapped memory page for convenient acces. */
-	struct lguest_block_page *lb_page;
-
-	/* We only have a single request outstanding at a time: this is it. */
-	struct lguest_dma dma;
-	struct request *req;
-};
-
-/*D:495 We originally used end_request() throughout the driver, but it turns
- * out that end_request() is deprecated, and doesn't actually end the request
- * (which seems like a good reason to deprecate it!).  It simply ends the first
- * bio.  So if we had 3 bios in a "struct request" we would do all 3,
- * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
- * work as we needed to do.
- *
- * This reinforced to me that I do not understand the block layer.
- *
- * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
- * request.  This improved disk speed by 130%. */
-static void end_entire_request(struct request *req, int uptodate)
-{
-	if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
-		BUG();
-	add_disk_randomness(req->rq_disk);
-	blkdev_dequeue_request(req);
-	end_that_request_last(req, uptodate);
-}
-
-/* I'm told there are only two stories in the world worth telling: love and
- * hate.  So there used to be a love scene here like this:
- *
- *  Launcher:	We could make beautiful I/O together, you and I.
- *  Guest:	My, that's a big disk!
- *
- * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
-
-/*D:490 This is the interrupt handler, called when a block read or write has
- * been completed for us. */
-static irqreturn_t lgb_irq(int irq, void *_bd)
-{
-	/* We handed our "struct blockdev" as the argument to request_irq(), so
-	 * it is passed through to us here.  This tells us which device we're
-	 * dealing with in case we have more than one. */
-	struct blockdev *bd = _bd;
-	unsigned long flags;
-
-	/* We weren't doing anything?  Strange, but could happen if we shared
-	 * interrupts (we don't!). */
-	if (!bd->req) {
-		pr_debug("No work!\n");
-		return IRQ_NONE;
-	}
-
-	/* Not done yet?  That's equally strange. */
-	if (!bd->lb_page->result) {
-		pr_debug("No result!\n");
-		return IRQ_NONE;
-	}
-
-	/* We have to grab the lock before ending the request. */
-	spin_lock_irqsave(&bd->lock, flags);
-	/* "result" is 1 for success, 2 for failure: end_entire_request() wants
-	 * to know whether this succeeded or not. */
-	end_entire_request(bd->req, bd->lb_page->result == 1);
-	/* Clear out request, it's done. */
-	bd->req = NULL;
-	/* Reset incoming DMA for next time. */
-	bd->dma.used_len = 0;
-	/* Ready for more reads or writes */
-	blk_start_queue(bd->disk->queue);
-	spin_unlock_irqrestore(&bd->lock, flags);
-
-	/* The interrupt was for us, we dealt with it. */
-	return IRQ_HANDLED;
-}
-
-/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
- * each of which contains "struct bio_vec"s, each of which contains a page, an
- * offset and a length.
- *
- * Fortunately there are iterators to help us walk through the "struct
- * request".  Even more fortunately, there were plenty of places to steal the
- * code from.  We pack the "struct request" into our "struct lguest_dma" and
- * return the total length. */
-static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
-{
-	unsigned int i = 0, len = 0;
-	struct req_iterator iter;
-	struct bio_vec *bvec;
-
-	rq_for_each_segment(bvec, req, iter) {
-		/* We told the block layer not to give us too many. */
-		BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
-		/* If we had a zero-length segment, it would look like
-		 * the end of the data referred to by the "struct
-		 * lguest_dma", so make sure that doesn't happen. */
-		BUG_ON(!bvec->bv_len);
-		/* Convert page & offset to a physical address */
-		dma->addr[i] = page_to_phys(bvec->bv_page)
-			+ bvec->bv_offset;
-		dma->len[i] = bvec->bv_len;
-		len += bvec->bv_len;
-		i++;
-	}
-	/* If the array isn't full, we mark the end with a 0 length */
-	if (i < LGUEST_MAX_DMA_SECTIONS)
-		dma->len[i] = 0;
-	return len;
-}
-
-/* This creates an empty DMA, useful for prodding the Host without sending data
- * (ie. when we want to do a read) */
-static void empty_dma(struct lguest_dma *dma)
-{
-	dma->len[0] = 0;
-}
-
-/*D:470 Setting up a request is fairly easy: */
-static void setup_req(struct blockdev *bd,
-		      int type, struct request *req, struct lguest_dma *dma)
-{
-	/* The type is 1 (write) or 0 (read). */
-	bd->lb_page->type = type;
-	/* The sector on disk where the read or write starts. */
-	bd->lb_page->sector = req->sector;
-	/* The result is initialized to 0 (unfinished). */
-	bd->lb_page->result = 0;
-	/* The current request (so we can end it in the interrupt handler). */
-	bd->req = req;
-	/* The number of bytes: returned as a side-effect of req_to_dma(),
-	 * which packs the block layer's "struct request" into our "struct
-	 * lguest_dma" */
-	bd->lb_page->bytes = req_to_dma(req, dma);
-}
-
-/*D:450 Write is pretty straightforward: we pack the request into a "struct
- * lguest_dma", then use SEND_DMA to send the request. */
-static void do_write(struct blockdev *bd, struct request *req)
-{
-	struct lguest_dma send;
-
-	pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
-	setup_req(bd, 1, req, &send);
-
-	lguest_send_dma(bd->phys_addr, &send);
-}
-
-/* Read is similar to write, except we pack the request into our receive
- * "struct lguest_dma" and send through an empty DMA just to tell the Host that
- * there's a request pending. */
-static void do_read(struct blockdev *bd, struct request *req)
-{
-	struct lguest_dma ping;
-
-	pr_debug("lgb: READ sector %li\n", (long)req->sector);
-	setup_req(bd, 0, req, &bd->dma);
-
-	empty_dma(&ping);
-	lguest_send_dma(bd->phys_addr, &ping);
-}
-
-/*D:440 This where requests come in: we get handed the request queue and are
- * expected to pull a "struct request" off it until we've finished them or
- * we're waiting for a reply: */
-static void do_lgb_request(struct request_queue *q)
-{
-	struct blockdev *bd;
-	struct request *req;
-
-again:
-	/* This sometimes returns NULL even on the very first time around.  I
-	 * wonder if it's something to do with letting elves handle the request
-	 * queue... */
-	req = elv_next_request(q);
-	if (!req)
-		return;
-
-	/* We attached the struct blockdev to the disk: get it back */
-	bd = req->rq_disk->private_data;
-	/* Sometimes we get repeated requests after blk_stop_queue(), but we
-	 * can only handle one at a time. */
-	if (bd->req)
-		return;
-
-	/* We only do reads and writes: no tricky business! */
-	if (!blk_fs_request(req)) {
-		pr_debug("Got non-command 0x%08x\n", req->cmd_type);
-		req->errors++;
-		end_entire_request(req, 0);
-		goto again;
-	}
-
-	if (rq_data_dir(req) == WRITE)
-		do_write(bd, req);
-	else
-		do_read(bd, req);
-
-	/* We've put out the request, so stop any more coming in until we get
-	 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
-	blk_stop_queue(q);
-}
-
-/*D:430 This is the "struct block_device_operations" we attach to the disk at
- * the end of lguestblk_probe().  It doesn't seem to want much. */
-static struct block_device_operations lguestblk_fops = {
-	.owner = THIS_MODULE,
-};
-
-/*D:425 Setting up a disk device seems to involve a lot of code.  I'm not sure
- * quite why.  I do know that the IDE code sent two or three of the maintainers
- * insane, perhaps this is the fringe of the same disease?
- *
- * As in the console code, the probe function gets handed the generic
- * lguest_device from lguest_bus.c: */
-static int lguestblk_probe(struct lguest_device *lgdev)
-{
-	struct blockdev *bd;
-	int err;
-	int irqflags = IRQF_SHARED;
-
-	/* First we allocate our own "struct blockdev" and initialize the easy
-	 * fields. */
-	bd = kmalloc(sizeof(*bd), GFP_KERNEL);
-	if (!bd)
-		return -ENOMEM;
-
-	spin_lock_init(&bd->lock);
-	bd->irq = lgdev_irq(lgdev);
-	bd->req = NULL;
-	bd->dma.used_len = 0;
-	bd->dma.len[0] = 0;
-	/* The descriptor in the lguest_devices array provided by the Host
-	 * gives the Guest the physical page number of the device's page. */
-	bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
-
-	/* We use lguest_map() to get a pointer to the device page */
-	bd->lb_page = lguest_map(bd->phys_addr, 1);
-	if (!bd->lb_page) {
-		err = -ENOMEM;
-		goto out_free_bd;
-	}
-
-	/* We need a major device number: 0 means "assign one dynamically". */
-	bd->major = register_blkdev(0, "lguestblk");
-	if (bd->major < 0) {
-		err = bd->major;
-		goto out_unmap;
-	}
-
-	/* This allocates a "struct gendisk" where we pack all the information
-	 * about the disk which the rest of Linux sees.  The argument is the
-	 * number of minor devices desired: we need one minor for the main
-	 * disk, and one for each partition.  Of course, we can't possibly know
-	 * how many partitions are on the disk (add_disk does that).
-	 */
-	bd->disk = alloc_disk(16);
-	if (!bd->disk) {
-		err = -ENOMEM;
-		goto out_unregister_blkdev;
-	}
-
-	/* Every disk needs a queue for requests to come in: we set up the
-	 * queue with a callback function (the core of our driver) and the lock
-	 * to use. */
-	bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
-	if (!bd->disk->queue) {
-		err = -ENOMEM;
-		goto out_put_disk;
-	}
-
-	/* We can only handle a certain number of pointers in our SEND_DMA
-	 * call, so we set that with blk_queue_max_hw_segments().  This is not
-	 * to be confused with blk_queue_max_phys_segments() of course!  I
-	 * know, who could possibly confuse the two?
-	 *
-	 * Well, it's simple to tell them apart: this one seems to work and the
-	 * other one didn't. */
-	blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
-
-	/* Due to technical limitations of our Host (and simple coding) we
-	 * can't have a single buffer which crosses a page boundary.  Tell it
-	 * here.  This means that our maximum request size is 16
-	 * (LGUEST_MAX_DMA_SECTIONS) pages. */
-	blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
-
-	/* We name our disk: this becomes the device name when udev does its
-	 * magic thing and creates the device node, such as /dev/lgba.
-	 * next_block_index is a global which starts at 'a'.  Unfortunately
-	 * this simple increment logic means that the 27th disk will be called
-	 * "/dev/lgb{".  In that case, I recommend having at least 29 disks, so
-	 * your /dev directory will be balanced. */
-	sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
-
-	/* We look to the device descriptor again to see if this device's
-	 * interrupts are expected to be random.  If they are, we tell the irq
-	 * subsystem.  At the moment this bit is always set. */
-	if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
-		irqflags |= IRQF_SAMPLE_RANDOM;
-
-	/* Now we have the name and irqflags, we can request the interrupt; we
-	 * give it the "struct blockdev" we have set up to pass to lgb_irq()
-	 * when there is an interrupt. */
-	err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
-	if (err)
-		goto out_cleanup_queue;
-
-	/* We bind our one-entry DMA pool to the key for this block device so
-	 * the Host can reply to our requests.  The key is equal to the
-	 * physical address of the device's page, which is conveniently
-	 * unique. */
-	err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
-	if (err)
-		goto out_free_irq;
-
-	/* We finish our disk initialization and add the disk to the system. */
-	bd->disk->major = bd->major;
-	bd->disk->first_minor = 0;
-	bd->disk->private_data = bd;
-	bd->disk->fops = &lguestblk_fops;
-	/* This is initialized to the disk size by the Launcher. */
-	set_capacity(bd->disk, bd->lb_page->num_sectors);
-	add_disk(bd->disk);
-
-	printk(KERN_INFO "%s: device %i at major %d\n",
-	       bd->disk->disk_name, lgdev->index, bd->major);
-
-	/* We don't need to keep the "struct blockdev" around, but if we ever
-	 * implemented device removal, we'd need this. */
-	lgdev->private = bd;
-	return 0;
-
-out_free_irq:
-	free_irq(bd->irq, bd);
-out_cleanup_queue:
-	blk_cleanup_queue(bd->disk->queue);
-out_put_disk:
-	put_disk(bd->disk);
-out_unregister_blkdev:
-	unregister_blkdev(bd->major, "lguestblk");
-out_unmap:
-	lguest_unmap(bd->lb_page);
-out_free_bd:
-	kfree(bd);
-	return err;
-}
-
-/*D:410 The boilerplate code for registering the lguest block driver is just
- * like the console: */
-static struct lguest_driver lguestblk_drv = {
-	.name = "lguestblk",
-	.owner = THIS_MODULE,
-	.device_type = LGUEST_DEVICE_T_BLOCK,
-	.probe = lguestblk_probe,
-};
-
-static __init int lguestblk_init(void)
-{
-	return register_lguest_driver(&lguestblk_drv);
-}
-module_init(lguestblk_init);
-
-MODULE_DESCRIPTION("Lguest block driver");
-MODULE_LICENSE("GPL");
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -0,0 +1,308 @@
+//#define DEBUG
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/virtio.h>
+#include <linux/virtio_blk.h>
+#include <linux/virtio_blk.h>
+
+/* Letter appended to "vd" to name the next disk; virtblk_probe() uses it and
+ * then increments it.  Nothing here wraps it once it passes 'z'. */
+static unsigned char virtblk_index = 'a';
+/* Per-device state: virtblk_probe() allocates one and stores it in vdev->priv. */
+struct virtio_blk {
+	/* Handed to blk_init_queue() as the queue lock; blk_done() also takes
+	 * it while reaping completions. */
+	spinlock_t lock;
+
+	/* The virtio device we are bound to, and its single virtqueue. */
+	struct virtio_device *vdev;
+	struct virtqueue *vq;
+
+	/* What the block layer sees. */
+	struct gendisk *disk;
+
+	/* Requests handed to the virtqueue and not yet completed. */
+	struct list_head reqs;
+
+	/* Allocator for struct virtblk_req. */
+	mempool_t *pool;
+
+	/* One scatterlist reused for every request (too large for the stack):
+	 * out header, then the data segments, then the in header. */
+	struct scatterlist sg[3+MAX_PHYS_SEGMENTS];
+};
+
+/* Bookkeeping for one request while it sits on the virtqueue. */
+struct virtblk_req {
+	/* Link in virtio_blk.reqs. */
+	struct list_head list;
+	/* The block-layer request being serviced. */
+	struct request *req;
+	/* Header filled in by do_req(): type, sector and ioprio. */
+	struct virtio_blk_outhdr out_hdr;
+	/* Header whose status field blk_done() checks on completion. */
+	struct virtio_blk_inhdr in_hdr;
+};
+
+/*
+ * Virtqueue callback (registered through find_vq() in virtblk_probe()).
+ *
+ * Drains every buffer get_buf() hands back, completes the matching block
+ * request according to the status in its in header, and releases the
+ * tracking structure.  Afterwards the request queue is restarted, since
+ * do_virtblk_request() stops it when a request cannot be queued.
+ *
+ * Always returns true.
+ */
+static bool blk_done(struct virtqueue *vq)
+{
+	struct virtio_blk *vblk = vq->vdev->priv;
+	struct virtblk_req *done;
+	unsigned long irqflags;
+	unsigned int used_len;
+
+	spin_lock_irqsave(&vblk->lock, irqflags);
+	for (;;) {
+		int uptodate;
+
+		done = vblk->vq->vq_ops->get_buf(vblk->vq, &used_len);
+		if (done == NULL)
+			break;
+
+		/* Translate the status byte into the block layer's
+		 * "uptodate" convention. */
+		switch (done->in_hdr.status) {
+		case VIRTIO_BLK_S_OK:
+			uptodate = 1;
+			break;
+		case VIRTIO_BLK_S_UNSUPP:
+			uptodate = -ENOTTY;
+			break;
+		default:
+			uptodate = 0;
+			break;
+		}
+
+		end_dequeued_request(done->req, uptodate);
+		list_del(&done->list);
+		mempool_free(done, vblk->pool);
+	}
+
+	/* The queue may have been stopped for lack of buffers: restart it. */
+	blk_start_queue(vblk->disk->queue);
+	spin_unlock_irqrestore(&vblk->lock, irqflags);
+
+	return true;
+}
+
+/*
+ * Try to place one block request on the virtqueue.
+ *
+ * Builds the out header, maps the request's data into vblk->sg between the
+ * out and in headers, and hands the lot to add_buf().  On success the
+ * tracking structure is appended to vblk->reqs.
+ *
+ * Returns true if the request was queued, false if no tracking structure
+ * could be allocated or add_buf() refused the buffer (in which case nothing
+ * is left allocated).
+ */
+static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
+		   struct request *req)
+{
+	struct virtblk_req *vbr;
+	unsigned long num, out, in;
+	bool is_write;
+
+	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
+	if (!vbr) {
+		/* We get another go when some other request completes. */
+		return false;
+	}
+
+	vbr->req = req;
+	if (blk_fs_request(req)) {
+		vbr->out_hdr.type = 0;
+		vbr->out_hdr.sector = req->sector;
+	} else if (blk_pc_request(req)) {
+		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
+		vbr->out_hdr.sector = 0;
+	} else {
+		/* Nothing else is ever put in the queue. */
+		BUG();
+	}
+	vbr->out_hdr.ioprio = req->ioprio;
+
+	if (blk_barrier_rq(req))
+		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
+
+	/* blk_rq_map_sg gets upset unless the scatterlist starts zeroed. */
+	memset(vblk->sg, 0, sizeof(vblk->sg));
+	sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
+	num = blk_rq_map_sg(q, req, vblk->sg+1);
+	sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr));
+
+	/* For a write the data entries are counted with the out header; for
+	 * anything else they are counted with the in header. */
+	is_write = (rq_data_dir(req) == WRITE);
+	vbr->out_hdr.type |= is_write ? VIRTIO_BLK_T_OUT : VIRTIO_BLK_T_IN;
+	out = is_write ? 1 + num : 1;
+	in = is_write ? 1 : 1 + num;
+
+	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr) != 0) {
+		mempool_free(vbr, vblk->pool);
+		return false;
+	}
+
+	list_add_tail(&vbr->list, &vblk->reqs);
+	return true;
+}
+
+/*
+ * Request function handed to blk_init_queue() in virtblk_probe().
+ *
+ * Pulls requests off the queue and passes each to do_req().  The first one
+ * that cannot be queued stops the request queue; blk_done() restarts it when
+ * a buffer completes.  If at least one request was queued, the virtqueue is
+ * kicked once at the end.
+ */
+static void do_virtblk_request(struct request_queue *q)
+{
+	struct virtio_blk *vblk = NULL;
+	struct request *req;
+	unsigned int issued = 0;
+
+	while ((req = elv_next_request(q)) != NULL) {
+		vblk = req->rq_disk->private_data;
+		/* do_req() uses two scatterlist entries on top of the data
+		 * segments (out header and in header), so they must be
+		 * counted when checking the request fits in vblk->sg. */
+		BUG_ON(req->nr_phys_segments + 2 > ARRAY_SIZE(vblk->sg));
+
+		/* If this request fails, stop queue and wait for something to
+		   finish to restart it. */
+		if (!do_req(q, vblk, req)) {
+			blk_stop_queue(q);
+			break;
+		}
+		blkdev_dequeue_request(req);
+		issued++;
+	}
+
+	/* vblk is only non-NULL here if the loop body ran at least once. */
+	if (issued)
+		vblk->vq->vq_ops->kick(vblk->vq);
+}
+
+/* ioctl entry point: forwards everything to scsi_cmd_ioctl() for the disk
+ * behind this inode and returns its result. */
+static int virtblk_ioctl(struct inode *inode, struct file *filp,
+			 unsigned cmd, unsigned long data)
+{
+	struct gendisk *disk = inode->i_bdev->bd_disk;
+	void __user *arg = (void __user *)data;
+
+	return scsi_cmd_ioctl(filp, disk->queue, disk, cmd, arg);
+}
+
+/* Block device operations attached to every disk in virtblk_probe(); the
+ * only operation provided is ioctl. */
+static struct block_device_operations virtblk_fops = {
+	.owner	= THIS_MODULE,
+	.ioctl	= virtblk_ioctl,
+};
+
+/*
+ * Bind to a virtio block device.
+ *
+ * Allocates the per-device state, finds the virtqueue, creates the request
+ * pool, registers a dynamically assigned major, allocates the gendisk and
+ * its request queue, applies the limits found in the device config and
+ * finally adds the disk.
+ *
+ * Returns 0 on success.  On failure returns a negative errno after undoing
+ * every step that had already succeeded, in reverse order.
+ */
+static int virtblk_probe(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk;
+	int err, major;
+	void *token;
+	unsigned int len;
+	u64 cap;
+	u32 v;
+
+	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
+	if (!vblk) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&vblk->reqs);
+	spin_lock_init(&vblk->lock);
+	vblk->vdev = vdev;
+
+	/* We expect one virtqueue, for output. */
+	vblk->vq = vdev->config->find_vq(vdev, blk_done);
+	if (IS_ERR(vblk->vq)) {
+		err = PTR_ERR(vblk->vq);
+		goto out_free_vblk;
+	}
+
+	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
+	if (!vblk->pool) {
+		err = -ENOMEM;
+		goto out_free_vq;
+	}
+
+	/* 0 asks for a dynamically assigned major. */
+	major = register_blkdev(0, "virtblk");
+	if (major < 0) {
+		err = major;
+		goto out_mempool;
+	}
+
+	/* FIXME: How many partitions?  How long is a piece of string? */
+	vblk->disk = alloc_disk(1 << 4);
+	if (!vblk->disk) {
+		err = -ENOMEM;
+		goto out_unregister_blkdev;
+	}
+
+	vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
+	if (!vblk->disk->queue) {
+		err = -ENOMEM;
+		goto out_put_disk;
+	}
+
+	/* From here on the request queue exists, so every failure must go
+	 * through out_cleanup_queue to release it. */
+
+	sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++);
+	vblk->disk->major = major;
+	vblk->disk->first_minor = 0;
+	vblk->disk->private_data = vblk;
+	vblk->disk->fops = &virtblk_fops;
+
+	/* If barriers are supported, tell block layer that queue is ordered */
+	token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len);
+	if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER))
+		blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
+
+	/* The capacity is mandatory. */
+	err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap);
+	if (err) {
+		dev_err(&vdev->dev, "Bad/missing capacity in config\n");
+		goto out_cleanup_queue;
+	}
+
+	/* If capacity is too big, truncate with warning. */
+	if ((sector_t)cap != cap) {
+		dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+			 (unsigned long long)cap);
+		cap = (sector_t)-1;
+	}
+	set_capacity(vblk->disk, cap);
+
+	/* The two limits below are optional: -ENOENT just means the host did
+	 * not supply one, any other error is fatal. */
+	err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v);
+	if (!err)
+		blk_queue_max_segment_size(vblk->disk->queue, v);
+	else if (err != -ENOENT) {
+		dev_err(&vdev->dev, "Bad SIZE_MAX in config\n");
+		goto out_cleanup_queue;
+	}
+
+	err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v);
+	if (!err)
+		blk_queue_max_hw_segments(vblk->disk->queue, v);
+	else if (err != -ENOENT) {
+		dev_err(&vdev->dev, "Bad SEG_MAX in config\n");
+		goto out_cleanup_queue;
+	}
+
+	add_disk(vblk->disk);
+	return 0;
+
+out_cleanup_queue:
+	blk_cleanup_queue(vblk->disk->queue);
+out_put_disk:
+	put_disk(vblk->disk);
+out_unregister_blkdev:
+	unregister_blkdev(major, "virtblk");
+out_mempool:
+	mempool_destroy(vblk->pool);
+out_free_vq:
+	vdev->config->del_vq(vblk->vq);
+out_free_vblk:
+	kfree(vblk);
+out:
+	return err;
+}
+
+/*
+ * Unbind from a virtio block device, undoing virtblk_probe().
+ *
+ * No request may still be outstanding on the virtqueue (enforced with
+ * BUG_ON).  The disk added by add_disk() is removed first, then the
+ * remaining resources are released in the same order as probe's error
+ * unwinding, including the virtqueue obtained from find_vq().
+ */
+static void virtblk_remove(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+	int major = vblk->disk->major;
+
+	/* Nothing should be pending. */
+	BUG_ON(!list_empty(&vblk->reqs));
+
+	/* Counterpart of add_disk() in virtblk_probe(). */
+	del_gendisk(vblk->disk);
+	blk_cleanup_queue(vblk->disk->queue);
+	put_disk(vblk->disk);
+	unregister_blkdev(major, "virtblk");
+	mempool_destroy(vblk->pool);
+	/* Counterpart of find_vq() in virtblk_probe(). */
+	vdev->config->del_vq(vblk->vq);
+	kfree(vblk);
+}
+
+/* Devices this driver binds to: VIRTIO_ID_BLOCK paired with
+ * VIRTIO_DEV_ANY_ID.  The zeroed entry presumably terminates the table
+ * (TODO confirm against the virtio bus matching code). */
+static struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+/* Driver description passed to register_virtio_driver() by init(). */
+static struct virtio_driver virtio_blk = {
+	.driver = {
+		.name	= KBUILD_MODNAME,
+		.owner	= THIS_MODULE,
+	},
+	.id_table	= id_table,
+	.probe		= virtblk_probe,
+	.remove		= __devexit_p(virtblk_remove),
+};
+
+/* Module entry point: registers the virtio_blk driver and returns whatever
+ * register_virtio_driver() returns. */
+static int __init init(void)
+{
+	return register_virtio_driver(&virtio_blk);
+}
+
+/* Module exit point: unregisters the virtio_blk driver. */
+static void __exit fini(void)
+{
+	unregister_virtio_driver(&virtio_blk);
+}
+/* Hook init()/fini() up as the module's load and unload handlers. */
+module_init(init);
+module_exit(fini);
+
+/* Export id_table as the module's virtio device table. */
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio block driver");
+MODULE_LICENSE("GPL");
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -613,6 +613,10 @@ config HVC_XEN
 	help
 	  Xen virtual console device driver

+config VIRTIO_CONSOLE
+	bool
+	select HVC_DRIVER
+
 config HVCS
 	tristate "IBM Hypervisor Virtual Console Server support"
 	depends on PPC_PSERIES
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_SYNCLINK_GT)	+= synclink_gt.o
 obj-$(CONFIG_N_HDLC)		+= n_hdlc.o
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
 obj-$(CONFIG_SX)		+= sx.o generic_serial.o
-obj-$(CONFIG_LGUEST_GUEST)	+= hvc_lguest.o
 obj-$(CONFIG_RIO)		+= rio/ generic_serial.o
 obj-$(CONFIG_HVC_CONSOLE)	+= hvc_vio.o hvsi.o
 obj-$(CONFIG_HVC_ISERIES)	+= hvc_iseries.o
@@ -50,6 +49,7 @@ obj-$(CONFIG_HVC_RTAS)		+= hvc_rtas.o
 obj-$(CONFIG_HVC_BEAT)		+= hvc_beat.o
 obj-$(CONFIG_HVC_DRIVER)	+= hvc_console.o
 obj-$(CONFIG_HVC_XEN)		+= hvc_xen.o
+obj-$(CONFIG_VIRTIO_CONSOLE)	+= virtio_console.o
 obj-$(CONFIG_RAW_DRIVER)	+= raw.o
 obj-$(CONFIG_SGI_SNSC)		+= snsc.o snsc_event.o
 obj-$(CONFIG_MSPEC)		+= mspec.o
--- a/drivers/char/hvc_lguest.c
+++ b/drivers/char/hvc_lguest.c
@@ -1,177 +0,0 @@
-/*D:300
- * The Guest console driver
- *
- * This is a trivial console driver: we use lguest's DMA mechanism to send
- * bytes out, and register a DMA buffer to receive bytes in.  It is assumed to
- * be present and available from the very beginning of boot.
- *
- * Writing console drivers is one of the few remaining Dark Arts in Linux.
- * Fortunately for us, the path of virtual consoles has been well-trodden by
- * the PowerPC folks, who wrote "hvc_console.c" to generically support any
- * virtual console.  We use that infrastructure which only requires us to write
- * the basic put_chars and get_chars functions and call the right register
- * functions.
- :*/
-
-/*M:002 The console can be flooded: while the Guest is processing input the
- * Host can send more.  Buffering in the Host could alleviate this, but it is a
- * difficult problem in general. :*/
-/* Copyright (C) 2006 Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/lguest_bus.h>
-#include <asm/paravirt.h>
-#include "hvc_console.h"
-
-/*D:340 This is our single console input buffer, with associated "struct
- * lguest_dma" referring to it.  Note the 0-terminated length array, and the
- * use of physical address for the buffer itself. */
-static char inbuf[256];
-static struct lguest_dma cons_input = { .used_len = 0,
-					.addr[0] = __pa(inbuf),
-					.len[0] = sizeof(inbuf),
-					.len[1] = 0 };
-
-/*D:310 The put_chars() callback is pretty straightforward.
- *
- * First we put the pointer and length in a "struct lguest_dma": we only have
- * one pointer, so we set the second length to 0.  Then we use SEND_DMA to send
- * the data to (Host) buffers attached to the console key.  Usually a device's
- * key is a physical address within the device's memory, but because the
- * console device doesn't have any associated physical memory, we use the
- * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */
-static int put_chars(u32 vtermno, const char *buf, int count)
-{
-	struct lguest_dma dma;
-
-	/* FIXME: DMA buffers in a "struct lguest_dma" are not allowed
-	 * to go over page boundaries.  This never seems to happen,
-	 * but if it did we'd need to fix this code. */
-	dma.len[0] = count;
-	dma.len[1] = 0;
-	dma.addr[0] = __pa(buf);
-
-	lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma);
-	/* We're expected to return the amount of data we wrote: all of it. */
-	return count;
-}
-
-/*D:350 get_chars() is the callback from the hvc_console infrastructure when
- * an interrupt is received.
- *
- * Firstly we see if our buffer has been filled: if not, we return.  The rest
- * of the code deals with the fact that the hvc_console() infrastructure only
- * asks us for 16 bytes at a time.  We keep a "cons_offset" variable for
- * partially-read buffers. */
-static int get_chars(u32 vtermno, char *buf, int count)
-{
-	static int cons_offset;
-
-	/* Nothing left to see here... */
-	if (!cons_input.used_len)
-		return 0;
-
-	/* You want more than we have to give?  Well, try wanting less! */
-	if (cons_input.used_len - cons_offset < count)
-		count = cons_input.used_len - cons_offset;
-
-	/* Copy across to their buffer and increment offset. */
-	memcpy(buf, inbuf + cons_offset, count);
-	cons_offset += count;
-
-	/* Finished?  Zero offset, and reset cons_input so Host will use it
-	 * again. */
-	if (cons_offset == cons_input.used_len) {
-		cons_offset = 0;
-		cons_input.used_len = 0;
-	}
-	return count;
-}
-/*:*/
-
-static struct hv_ops lguest_cons = {
-	.get_chars = get_chars,
-	.put_chars = put_chars,
-};
-
-/*D:320 Console drivers are initialized very early so boot messages can go
- * out.  At this stage, the console is output-only.  Our driver checks we're a
- * Guest, and if so hands hvc_instantiate() the console number (0), priority
- * (0), and the struct hv_ops containing the put_chars() function. */
-static int __init cons_init(void)
-{
-	if (strcmp(pv_info.name, "lguest") != 0)
-		return 0;
-
-	return hvc_instantiate(0, 0, &lguest_cons);
-}
-console_initcall(cons_init);
-
-/*D:370 To set up and manage our virtual console, we call hvc_alloc() and
- * stash the result in the private pointer of the "struct lguest_device".
- * Since we never remove the console device we never need this pointer again,
- * but using ->private is considered good form, and you never know who's going
- * to copy your driver.
- *
- * Once the console is set up, we bind our input buffer ready for input. */
-static int lguestcons_probe(struct lguest_device *lgdev)
-{
-	int err;
-
-	/* The first argument of hvc_alloc() is the virtual console number, so
-	 * we use zero.  The second argument is the interrupt number.
-	 *
-	 * The third argument is a "struct hv_ops" containing the put_chars()
-	 * and get_chars() pointers.  The final argument is the output buffer
-	 * size: we use 256 and expect the Host to have room for us to send
-	 * that much. */
-	lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256);
-	if (IS_ERR(lgdev->private))
-		return PTR_ERR(lgdev->private);
-
-	/* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY.
-	 * "cons_input" is that statically-initialized global DMA buffer we saw
-	 * above, and we also give the interrupt we want. */
-	err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1,
-			      lgdev_irq(lgdev));
-	if (err)
-		printk("lguest console: failed to bind buffer.\n");
-	return err;
-}
-/* Note the use of lgdev_irq() for the interrupt number.  We tell hvc_alloc()
- * to expect input when this interrupt is triggered, and then tell
- * lguest_bind_dma() that is the interrupt to send us when input comes in. */
-
-/*D:360 From now on the console driver follows standard Guest driver form:
- * register_lguest_driver() registers the device type and probe function, and
- * the probe function sets up the device.
- *
- * The standard "struct lguest_driver": */
-static struct lguest_driver lguestcons_drv = {
-	.name = "lguestcons",
-	.owner = THIS_MODULE,
-	.device_type = LGUEST_DEVICE_T_CONSOLE,
-	.probe = lguestcons_probe,
-};
-
-/* The standard init function */
-static int __init hvc_lguest_init(void)
-{
-	return register_lguest_driver(&lguestcons_drv);
-}
-module_init(hvc_lguest_init);
--- a/Show More
+++ b/Show More