mirror of
https://github.com/armbian/linux.git
synced 2026-01-06 10:13:00 -08:00
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (45 commits) Use "struct boot_params" in example launcher Loading bzImage directly. Revert lguest magic and use hook in head.S Update lguest documentation to reflect the new virtual block device name. generalize lgread_u32/lgwrite_u32. Example launcher handle guests not being ready for input Update example launcher for virtio Lguest support for Virtio Remove old lguest I/O infrrasructure. Remove old lguest bus and drivers. Virtio helper routines for a descriptor ringbuffer implementation Module autoprobing support for virtio drivers. Virtio console driver Block driver using virtio. Net driver using virtio Virtio interface Boot with virtual == physical to get closer to native Linux. Allow guest to specify syscall vector to use. Rename "cr3" to "gpgdir" to avoid x86-specific naming. Pagetables to use normal kernel types ...
This commit is contained in:
@@ -1,28 +1,8 @@
|
||||
# This creates the demonstration utility "lguest" which runs a Linux guest.
|
||||
|
||||
# For those people that have a separate object dir, look there for .config
|
||||
KBUILD_OUTPUT := ../..
|
||||
ifdef O
|
||||
ifeq ("$(origin O)", "command line")
|
||||
KBUILD_OUTPUT := $(O)
|
||||
endif
|
||||
endif
|
||||
# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
|
||||
include $(KBUILD_OUTPUT)/.config
|
||||
LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
|
||||
|
||||
CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
|
||||
CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
|
||||
LDLIBS:=-lz
|
||||
# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
|
||||
# not others (eg. FC7).
|
||||
LDFLAGS+=-static
|
||||
all: lguest.lds lguest
|
||||
|
||||
# The linker script on x86 is so complex the only way of creating one
|
||||
# which will link our binary in the right place is to mangle the
|
||||
# default one.
|
||||
lguest.lds:
|
||||
$(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
|
||||
all: lguest
|
||||
|
||||
clean:
|
||||
rm -f lguest.lds lguest
|
||||
rm -f lguest
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
|
||||
Linux developers and users to experiment with virtualization with the
|
||||
minimum of complexity. Nonetheless, it should have sufficient
|
||||
features to make it useful for specific tasks, and, of course, you are
|
||||
encouraged to fork and enhance it.
|
||||
encouraged to fork and enhance it (see drivers/lguest/README).
|
||||
|
||||
Features:
|
||||
|
||||
@@ -23,19 +23,30 @@ Developer features:
|
||||
|
||||
Running Lguest:
|
||||
|
||||
- Lguest runs the same kernel as guest and host. You can configure
|
||||
them differently, but usually it's easiest not to.
|
||||
- The easiest way to run lguest is to use the same kernel as guest and host.
|
||||
You can configure them differently, but usually it's easiest not to.
|
||||
|
||||
You will need to configure your kernel with the following options:
|
||||
|
||||
CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
|
||||
CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
|
||||
CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
|
||||
CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
|
||||
CONFIG_LGUEST=y/m ("Linux hypervisor example code")
|
||||
"General setup":
|
||||
"Prompt for development and/or incomplete code/drivers" = Y
|
||||
(CONFIG_EXPERIMENTAL=y)
|
||||
|
||||
and I recommend:
|
||||
CONFIG_HZ=100 ("Timer frequency")[2]
|
||||
"Processor type and features":
|
||||
"Paravirtualized guest support" = Y
|
||||
"Lguest guest support" = Y
|
||||
"High Memory Support" = off/4GB
|
||||
"Alignment value to which kernel should be aligned" = 0x100000
|
||||
(CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
|
||||
CONFIG_PHYSICAL_ALIGN=0x100000)
|
||||
|
||||
"Device Drivers":
|
||||
"Network device support"
|
||||
"Universal TUN/TAP device driver support" = M/Y
|
||||
(CONFIG_TUN=m)
|
||||
"Virtualization"
|
||||
"Linux hypervisor example code" = M/Y
|
||||
(CONFIG_LGUEST=m)
|
||||
|
||||
- A tool called "lguest" is available in this directory: type "make"
|
||||
to build it. If you didn't build your kernel in-tree, use "make
|
||||
@@ -51,14 +62,17 @@ Running Lguest:
|
||||
dd if=/dev/zero of=rootfile bs=1M count=2048
|
||||
qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
|
||||
|
||||
Make sure that you install a getty on /dev/hvc0 if you want to log in on the
|
||||
console!
|
||||
|
||||
- "modprobe lg" if you built it as a module.
|
||||
|
||||
- Run an lguest as root:
|
||||
|
||||
Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
|
||||
Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
|
||||
|
||||
Explanation:
|
||||
64m: the amount of memory to use.
|
||||
64: the amount of memory to use, in MB.
|
||||
|
||||
vmlinux: the kernel image found in the top of your build directory. You
|
||||
can also use a standard bzImage.
|
||||
@@ -66,10 +80,10 @@ Running Lguest:
|
||||
--tunnet=192.168.19.1: configures a "tap" device for networking with this
|
||||
IP address.
|
||||
|
||||
--block=rootfile: a file or block device which becomes /dev/lgba
|
||||
--block=rootfile: a file or block device which becomes /dev/vda
|
||||
inside the guest.
|
||||
|
||||
root=/dev/lgba: this (and anything else on the command line) are
|
||||
root=/dev/vda: this (and anything else on the command line) are
|
||||
kernel boot parameters.
|
||||
|
||||
- Configuring networking. I usually have the host masquerade, using
|
||||
@@ -99,31 +113,7 @@ Running Lguest:
|
||||
"--sharenet=<filename>": any two guests using the same file are on
|
||||
the same network. This file is created if it does not exist.
|
||||
|
||||
Lguest I/O model:
|
||||
There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
|
||||
|
||||
Lguest uses a simplified DMA model plus shared memory for I/O. Guests
|
||||
can communicate with each other if they share underlying memory
|
||||
(usually by the lguest program mmaping the same file), but they can
|
||||
use any non-shared memory to communicate with the lguest process.
|
||||
|
||||
Guests can register DMA buffers at any key (must be a valid physical
|
||||
address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
|
||||
hypercall. "dmabufs" is the physical address of an array of "num"
|
||||
"struct lguest_dma": each contains a used_len, and an array of
|
||||
physical addresses and lengths. When a transfer occurs, the
|
||||
"used_len" field of one of the buffers which has used_len 0 will be
|
||||
set to the length transferred and the irq will fire.
|
||||
|
||||
Using an irq value of 0 unbinds the dma buffers.
|
||||
|
||||
To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
|
||||
and the bytes used is written to the used_len field. This can be 0 if
|
||||
no one else has bound a DMA buffer to that key, or if some other error occurs.
|
||||
DMA buffers bound by the same guest are ignored.
|
||||
|
||||
Cheers!
|
||||
Good luck!
|
||||
Rusty Russell rusty@rustcorp.com.au.
|
||||
|
||||
[1] These are on various places on the TODO list, waiting for you to
|
||||
get annoyed enough at the limitation to fix it.
|
||||
[2] Lguest is not yet tickless when idle. See [1].
|
||||
|
||||
@@ -227,28 +227,40 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER
|
||||
If in doubt, say "Y".
|
||||
|
||||
config PARAVIRT
|
||||
bool "Paravirtualization support (EXPERIMENTAL)"
|
||||
depends on EXPERIMENTAL
|
||||
bool
|
||||
depends on !(X86_VISWS || X86_VOYAGER)
|
||||
help
|
||||
Paravirtualization is a way of running multiple instances of
|
||||
Linux on the same machine, under a hypervisor. This option
|
||||
changes the kernel so it can modify itself when it is run
|
||||
under a hypervisor, improving performance significantly.
|
||||
However, when run without a hypervisor the kernel is
|
||||
theoretically slower. If in doubt, say N.
|
||||
This changes the kernel so it can modify itself when it is run
|
||||
under a hypervisor, potentially improving performance significantly
|
||||
over full virtualization. However, when run without a hypervisor
|
||||
the kernel is theoretically slower and slightly larger.
|
||||
|
||||
menuconfig PARAVIRT_GUEST
|
||||
bool "Paravirtualized guest support"
|
||||
help
|
||||
Say Y here to get to see options related to running Linux under
|
||||
various hypervisors. This option alone does not add any kernel code.
|
||||
|
||||
If you say N, all options in this submenu will be skipped and disabled.
|
||||
|
||||
if PARAVIRT_GUEST
|
||||
|
||||
source "arch/x86/xen/Kconfig"
|
||||
|
||||
config VMI
|
||||
bool "VMI Paravirt-ops support"
|
||||
depends on PARAVIRT
|
||||
bool "VMI Guest support"
|
||||
select PARAVIRT
|
||||
depends on !(X86_VISWS || X86_VOYAGER)
|
||||
help
|
||||
VMI provides a paravirtualized interface to the VMware ESX server
|
||||
(it could be used by other hypervisors in theory too, but is not
|
||||
at the moment), by linking the kernel to a GPL-ed ROM module
|
||||
provided by the hypervisor.
|
||||
|
||||
source "arch/x86/lguest/Kconfig"
|
||||
|
||||
endif
|
||||
|
||||
config ACPI_SRAT
|
||||
bool
|
||||
default y
|
||||
|
||||
@@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/
|
||||
# Xen paravirtualization support
|
||||
core-$(CONFIG_XEN) += arch/x86/xen/
|
||||
|
||||
# lguest paravirtualization support
|
||||
core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
|
||||
|
||||
# default subarch .h files
|
||||
mflags-y += -Iinclude/asm-x86/mach-default
|
||||
|
||||
|
||||
@@ -136,6 +136,7 @@ void foo(void)
|
||||
#ifdef CONFIG_LGUEST_GUEST
|
||||
BLANK();
|
||||
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
|
||||
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
|
||||
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
|
||||
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
|
||||
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
|
||||
|
||||
14
arch/x86/lguest/Kconfig
Normal file
14
arch/x86/lguest/Kconfig
Normal file
@@ -0,0 +1,14 @@
|
||||
config LGUEST_GUEST
|
||||
bool "Lguest guest support"
|
||||
select PARAVIRT
|
||||
depends on !X86_PAE
|
||||
select VIRTIO
|
||||
select VIRTIO_RING
|
||||
select VIRTIO_CONSOLE
|
||||
help
|
||||
Lguest is a tiny in-kernel hypervisor. Selecting this will
|
||||
allow your kernel to boot under lguest. This option will increase
|
||||
your kernel size by about 6k. If in doubt, say N.
|
||||
|
||||
If you say Y here, make sure you say Y (or M) to the virtio block
|
||||
and net drivers which lguest needs.
|
||||
1
arch/x86/lguest/Makefile
Normal file
1
arch/x86/lguest/Makefile
Normal file
@@ -0,0 +1 @@
|
||||
obj-y := i386_head.o boot.o
|
||||
@@ -55,7 +55,7 @@
|
||||
#include <linux/clockchips.h>
|
||||
#include <linux/lguest.h>
|
||||
#include <linux/lguest_launcher.h>
|
||||
#include <linux/lguest_bus.h>
|
||||
#include <linux/virtio_console.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/param.h>
|
||||
#include <asm/page.h>
|
||||
@@ -65,6 +65,7 @@
|
||||
#include <asm/e820.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/io.h>
|
||||
#include <asm/i387.h>
|
||||
|
||||
/*G:010 Welcome to the Guest!
|
||||
*
|
||||
@@ -85,9 +86,10 @@ struct lguest_data lguest_data = {
|
||||
.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
|
||||
.noirq_start = (u32)lguest_noirq_start,
|
||||
.noirq_end = (u32)lguest_noirq_end,
|
||||
.kernel_address = PAGE_OFFSET,
|
||||
.blocked_interrupts = { 1 }, /* Block timer interrupts */
|
||||
.syscall_vec = SYSCALL_VECTOR,
|
||||
};
|
||||
struct lguest_device_desc *lguest_devices;
|
||||
static cycle_t clock_base;
|
||||
|
||||
/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first
|
||||
@@ -146,10 +148,10 @@ void async_hcall(unsigned long call,
|
||||
/* Table full, so do normal hcall which will flush table. */
|
||||
hcall(call, arg1, arg2, arg3);
|
||||
} else {
|
||||
lguest_data.hcalls[next_call].eax = call;
|
||||
lguest_data.hcalls[next_call].edx = arg1;
|
||||
lguest_data.hcalls[next_call].ebx = arg2;
|
||||
lguest_data.hcalls[next_call].ecx = arg3;
|
||||
lguest_data.hcalls[next_call].arg0 = call;
|
||||
lguest_data.hcalls[next_call].arg1 = arg1;
|
||||
lguest_data.hcalls[next_call].arg2 = arg2;
|
||||
lguest_data.hcalls[next_call].arg3 = arg3;
|
||||
/* Arguments must all be written before we mark it to go */
|
||||
wmb();
|
||||
lguest_data.hcall_status[next_call] = 0;
|
||||
@@ -160,46 +162,6 @@ void async_hcall(unsigned long call,
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because
|
||||
* Jeff Garzik complained that __pa() should never appear in drivers, and this
|
||||
* helps remove most of them. But also, it wraps some ugliness. */
|
||||
/* Issue the LHCALL_SEND_DMA hypercall for "key", handing over the physical
 * address of "dma" (obtained with __pa()).  used_len is zeroed before the
 * call, so afterwards it reads as 0 unless the hypercall stored a length
 * there.  Nothing is returned; callers inspect dma->used_len. */
void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
|
||||
{
|
||||
/* The hcall might not write this if something goes wrong, so give it a
 * known value (0) up front. */
|
||||
dma->used_len = 0;
|
||||
hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
|
||||
}
|
||||
|
||||
/* Issue the LHCALL_BIND_DMA hypercall for "key" with the physical address of
 * the "dmas" array, the buffer count "num" and the interrupt "irq".
 *
 * Returns 0 when the hypercall returns non-zero, and -ENOMEM when it
 * returns 0. */
int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
|
||||
unsigned int num, u8 irq)
|
||||
{
|
||||
/* This is the only hypercall which actually wants 5 arguments, and we
|
||||
* only support 4. Fortunately the interrupt number is always less
|
||||
* than 256, so we can pack it with the number of dmas in the final
|
||||
* argument: irq occupies the low 8 bits, num sits above it. */
|
||||
if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Unbinding is the same hypercall as binding, but with 0 num & irq. */
|
||||
/* Issue LHCALL_BIND_DMA for "key" and the physical address of "dmas" with a
 * final argument of 0 (i.e. num == 0 and irq == 0 in the packed encoding
 * (num << 8) | irq).  The hypercall's return value is ignored. */
void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
|
||||
{
|
||||
hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
|
||||
}
|
||||
|
||||
/* For guests, device memory can be used as normal memory, so we cast away the
|
||||
* __iomem to quieten sparse. */
|
||||
/* Map "pages" pages (PAGE_SIZE*pages bytes) starting at physical address
 * "phys_addr" using ioremap(), and return the result as a plain pointer.
 * The ioremap() result is passed back unchecked, so the caller has to deal
 * with whatever ioremap() returns on failure. */
void *lguest_map(unsigned long phys_addr, unsigned long pages)
|
||||
{
|
||||
/* __force drops the __iomem annotation without a sparse warning. */
return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
|
||||
}
|
||||
|
||||
/* Hand "addr" to iounmap(), re-adding the __iomem annotation with a __force
 * cast.  Presumably "addr" is a pointer previously returned by lguest_map()
 * -- confirm against callers. */
void lguest_unmap(void *addr)
|
||||
{
|
||||
iounmap((__force void __iomem *)addr);
|
||||
}
|
||||
|
||||
/*G:033
|
||||
* Here are our first native-instruction replacements: four functions for
|
||||
* interrupt control.
|
||||
@@ -680,6 +642,7 @@ static struct clocksource lguest_clock = {
|
||||
.mask = CLOCKSOURCE_MASK(64),
|
||||
.mult = 1 << 22,
|
||||
.shift = 22,
|
||||
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
|
||||
};
|
||||
|
||||
/* The "scheduler clock" is just our real clock, adjusted to start at zero */
|
||||
@@ -761,11 +724,9 @@ static void lguest_time_init(void)
|
||||
* the TSC, otherwise it's a dumb nanosecond-resolution clock. Either
|
||||
* way, the "rating" is initialized so high that it's always chosen
|
||||
* over any other clocksource. */
|
||||
if (lguest_data.tsc_khz) {
|
||||
if (lguest_data.tsc_khz)
|
||||
lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
|
||||
lguest_clock.shift);
|
||||
lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
|
||||
}
|
||||
clock_base = lguest_clock_read();
|
||||
clocksource_register(&lguest_clock);
|
||||
|
||||
@@ -889,6 +850,23 @@ static __init char *lguest_memory_setup(void)
|
||||
return "LGUEST";
|
||||
}
|
||||
|
||||
/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to
|
||||
* produce console output. */
|
||||
/* Early console output routine: copy up to 16 bytes of "buf" into a
 * NUL-terminated stack buffer and pass that buffer's physical address to the
 * LHCALL_NOTIFY hypercall.
 *
 * vtermno: not used by this function.
 * buf:     source characters.
 * count:   number of characters available in buf.
 *
 * Returns the number of bytes copied, which is min(count, 16); a caller
 * wanting to emit more has to call again with the remainder. */
static __init int early_put_chars(u32 vtermno, const char *buf, int count)
|
||||
{
|
||||
char scratch[17];	/* 16 payload bytes plus the terminating NUL */
|
||||
unsigned int len = count;	/* a negative count becomes a huge value and is clipped below */
|
||||
|
||||
if (len > sizeof(scratch) - 1)
|
||||
len = sizeof(scratch) - 1;
|
||||
scratch[len] = '\0';
|
||||
memcpy(scratch, buf, len);
|
||||
hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0);
|
||||
|
||||
/* This routine returns the number of bytes actually written. */
|
||||
return len;
|
||||
}
|
||||
|
||||
/*G:050
|
||||
* Patching (Powerfully Placating Performance Pedants)
|
||||
*
|
||||
@@ -950,18 +928,8 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
|
||||
/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
|
||||
* structures in the kernel provide points for (almost) every routine we have
|
||||
* to override to avoid privileged instructions. */
|
||||
__init void lguest_init(void *boot)
|
||||
__init void lguest_init(void)
|
||||
{
|
||||
/* Copy boot parameters first: the Launcher put the physical location
|
||||
* in %esi, and head.S converted that to a virtual address and handed
|
||||
* it to us. We use "__memcpy" because "memcpy" sometimes tries to do
|
||||
* tricky things to go faster, and we're not ready for that. */
|
||||
__memcpy(&boot_params, boot, PARAM_SIZE);
|
||||
/* The boot parameters also tell us where the command-line is: save
|
||||
* that, too. */
|
||||
__memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
|
||||
COMMAND_LINE_SIZE);
|
||||
|
||||
/* We're under lguest, paravirt is enabled, and we're running at
|
||||
* privilege level 1, not 0 as normal. */
|
||||
pv_info.name = "lguest";
|
||||
@@ -1033,11 +1001,7 @@ __init void lguest_init(void *boot)
|
||||
|
||||
/*G:070 Now we've seen all the paravirt_ops, we return to
|
||||
* lguest_init() where the rest of the fairly chaotic boot setup
|
||||
* occurs.
|
||||
*
|
||||
* The Host expects our first hypercall to tell it where our "struct
|
||||
* lguest_data" is, so we do that first. */
|
||||
hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
|
||||
* occurs. */
|
||||
|
||||
/* The native boot code sets up initial page tables immediately after
|
||||
* the kernel itself, and sets init_pg_tables_end so they're not
|
||||
@@ -1050,11 +1014,6 @@ __init void lguest_init(void *boot)
|
||||
* the normal data segment to get through booting. */
|
||||
asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
|
||||
|
||||
/* Clear the part of the kernel data which is expected to be zero.
|
||||
* Normally it will be anyway, but if we're loading from a bzImage with
|
||||
* CONFIG_RELOCATABLE=y, the relocations will be sitting here. */
|
||||
memset(__bss_start, 0, __bss_stop - __bss_start);
|
||||
|
||||
/* The Host uses the top of the Guest's virtual address space for the
|
||||
* Host<->Guest Switcher, and it tells us how much it needs in
|
||||
* lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
|
||||
@@ -1092,6 +1051,9 @@ __init void lguest_init(void *boot)
|
||||
* adapted for lguest's use. */
|
||||
add_preferred_console("hvc", 0, NULL);
|
||||
|
||||
/* Register our very early console. */
|
||||
virtio_cons_early_init(early_put_chars);
|
||||
|
||||
/* Last of all, we set the power management poweroff hook to point to
|
||||
* the Guest routine to power off. */
|
||||
pm_power_off = lguest_power_off;
|
||||
@@ -1,25 +1,47 @@
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/lguest.h>
|
||||
#include <asm/lguest_hcall.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/thread_info.h>
|
||||
#include <asm/processor-flags.h>
|
||||
|
||||
/*G:020 This is where we begin: we have a magic signature which the launcher
|
||||
* looks for. The plan is that the Linux boot protocol will be extended with a
|
||||
* "platform type" field which will guide us here from the normal entry point,
|
||||
* but for the moment this suffices. The normal boot code uses %esi for the
|
||||
* boot header, so we do too. We convert it to a virtual address by adding
|
||||
* PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
|
||||
/*G:020 This is where we begin: head.S notes that the boot header's platform
|
||||
* type field is "1" (lguest), so calls us here. The boot header is in %esi.
|
||||
*
|
||||
* WARNING: be very careful here! We're running at addresses equal to physical
|
||||
* addresses (around 0), not above PAGE_OFFSET as most code expects
|
||||
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
|
||||
* data.
|
||||
*
|
||||
* The .section line puts this code in .init.text so it will be discarded after
|
||||
* boot. */
|
||||
.section .init.text, "ax", @progbits
|
||||
.ascii "GenuineLguest"
|
||||
/* Set up initial stack. */
|
||||
movl $(init_thread_union+THREAD_SIZE),%esp
|
||||
movl %esi, %eax
|
||||
addl $__PAGE_OFFSET, %eax
|
||||
jmp lguest_init
|
||||
ENTRY(lguest_entry)
|
||||
/* Make initial hypercall now, so we can set up the pagetables. */
|
||||
movl $LHCALL_LGUEST_INIT, %eax
|
||||
movl $lguest_data - __PAGE_OFFSET, %edx
|
||||
int $LGUEST_TRAP_ENTRY
|
||||
|
||||
/* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
|
||||
* instruction uses %esi implicitly. */
|
||||
movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
|
||||
|
||||
/* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
|
||||
* This means the first 128M of kernel memory will be mapped at
|
||||
* PAGE_OFFSET where the kernel expects to run. This will get it far
|
||||
* enough through boot to switch to its own pagetables. */
|
||||
movl $32, %ecx
|
||||
movl %esi, %edi
|
||||
addl $((__PAGE_OFFSET >> 22) * 4), %edi
|
||||
rep
|
||||
movsl
|
||||
|
||||
/* Set up the initial stack so we can run C code. */
|
||||
movl $(init_thread_union+THREAD_SIZE),%esp
|
||||
|
||||
/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
|
||||
* moment. */
|
||||
jmp lguest_init+__PAGE_OFFSET
|
||||
|
||||
/*G:055 We create a macro which puts the assembler code between lgstart_ and
|
||||
* lgend_ markers. These templates are put in the .text section: they can't be
|
||||
@@ -3,8 +3,9 @@
|
||||
#
|
||||
|
||||
config XEN
|
||||
bool "Enable support for Xen hypervisor"
|
||||
depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
|
||||
bool "Xen guest support"
|
||||
select PARAVIRT
|
||||
depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
|
||||
help
|
||||
This is the Linux Xen port. Enabling this will allow the
|
||||
kernel to boot in a paravirtualized environment under the
|
||||
|
||||
@@ -94,5 +94,5 @@ source "drivers/kvm/Kconfig"
|
||||
|
||||
source "drivers/uio/Kconfig"
|
||||
|
||||
source "drivers/lguest/Kconfig"
|
||||
source "drivers/virtio/Kconfig"
|
||||
endmenu
|
||||
|
||||
@@ -91,3 +91,4 @@ obj-$(CONFIG_HID) += hid/
|
||||
obj-$(CONFIG_PPC_PS3) += ps3/
|
||||
obj-$(CONFIG_OF) += of/
|
||||
obj-$(CONFIG_SSB) += ssb/
|
||||
obj-$(CONFIG_VIRTIO) += virtio/
|
||||
|
||||
@@ -425,4 +425,10 @@ config XEN_BLKDEV_FRONTEND
|
||||
block device driver. It communicates with a back-end driver
|
||||
in another domain which drives the actual block device.
|
||||
|
||||
config VIRTIO_BLK
|
||||
tristate "Virtio block driver (EXPERIMENTAL)"
|
||||
depends on EXPERIMENTAL && VIRTIO
|
||||
---help---
|
||||
This is the virtual block driver for lguest. Say Y or M.
|
||||
|
||||
endif # BLK_DEV
|
||||
|
||||
@@ -25,10 +25,10 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o
|
||||
obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
|
||||
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
|
||||
obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
|
||||
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
|
||||
|
||||
obj-$(CONFIG_VIODASD) += viodasd.o
|
||||
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
|
||||
obj-$(CONFIG_BLK_DEV_UB) += ub.o
|
||||
|
||||
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
|
||||
obj-$(CONFIG_LGUEST_BLOCK) += lguest_blk.o
|
||||
|
||||
@@ -1,421 +0,0 @@
|
||||
/*D:400
|
||||
* The Guest block driver
|
||||
*
|
||||
* This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
|
||||
* The mechanism is simple: we place the information about the request in the
|
||||
* device page, then use SEND_DMA (containing the data for a write, or an empty
|
||||
* "ping" DMA for a read).
|
||||
:*/
|
||||
/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
//#define DEBUG
|
||||
#include <linux/init.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/lguest_bus.h>
|
||||
|
||||
static char next_block_index = 'a';
|
||||
|
||||
/*D:420 Here is the structure which holds all the information we need about
|
||||
* each Guest block device.
|
||||
*
|
||||
* I'm sure at this stage, you're wondering "hey, where was the adventure I was
|
||||
* promised?" and thinking "Rusty sucks, I shall say nasty things about him on
|
||||
* my blog". I think Real adventures have boring bits, too, and you're in the
|
||||
* middle of one. But it gets better. Just not quite yet. */
|
||||
/* Per-device state for one Guest block device.  A pointer to this structure
 * is what the interrupt handler receives and what the request function reads
 * back from the disk's private_data. */
struct blockdev
|
||||
{
|
||||
/* The block queue infrastructure wants a spinlock: it is held while it
|
||||
* calls our block request function. We grab it in our interrupt
|
||||
* handler so the responses don't mess with new requests. */
|
||||
spinlock_t lock;
|
||||
|
||||
/* The disk structure registered with the kernel. */
|
||||
struct gendisk *disk;
|
||||
|
||||
/* The major device number for this disk, and the interrupt. We only
|
||||
* really keep them here for completeness; we'd need them if we
|
||||
* supported device unplugging. */
|
||||
int major;
|
||||
int irq;
|
||||
|
||||
/* The physical address of this device's memory page; it is also the
 * key passed to lguest_send_dma() for this device. */
|
||||
unsigned long phys_addr;
|
||||
/* The mapped memory page for convenient access. */
|
||||
struct lguest_block_page *lb_page;
|
||||
|
||||
/* We only have a single request outstanding at a time: this is it.
 * "dma" is the descriptor a read request is packed into, and "req" is
 * NULL whenever no request is in flight. */
|
||||
struct lguest_dma dma;
|
||||
struct request *req;
|
||||
};
|
||||
|
||||
/*D:495 We originally used end_request() throughout the driver, but it turns
|
||||
* out that end_request() is deprecated, and doesn't actually end the request
|
||||
* (which seems like a good reason to deprecate it!). It simply ends the first
|
||||
* bio. So if we had 3 bios in a "struct request" we would do all 3,
|
||||
* end_request(), do 2, end_request(), do 1 and end_request(): twice as much
|
||||
* work as we needed to do.
|
||||
*
|
||||
* This reinforced to me that I do not understand the block layer.
|
||||
*
|
||||
* Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
|
||||
* request. This improved disk speed by 130%. */
|
||||
/* Finish the whole of "req" in one go.
 *
 * req:      the request to complete.
 * uptodate: success indicator, passed unchanged to both
 *           end_that_request_first() and end_that_request_last().
 *
 * The request is also dequeued here, so callers must not dequeue it
 * themselves. */
static void end_entire_request(struct request *req, int uptodate)
|
||||
{
|
||||
/* We ask for all hard_nr_sectors to be ended at once; a non-zero
 * return is treated as a fatal driver bug (presumably it means part of
 * the request is still outstanding -- confirm against the block
 * layer). */
if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
|
||||
BUG();
|
||||
add_disk_randomness(req->rq_disk);
|
||||
blkdev_dequeue_request(req);
|
||||
end_that_request_last(req, uptodate);
|
||||
}
|
||||
|
||||
/* I'm told there are only two stories in the world worth telling: love and
|
||||
* hate. So there used to be a love scene here like this:
|
||||
*
|
||||
* Launcher: We could make beautiful I/O together, you and I.
|
||||
* Guest: My, that's a big disk!
|
||||
*
|
||||
* Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
|
||||
|
||||
/*D:490 This is the interrupt handler, called when a block read or write has
|
||||
* been completed for us. */
|
||||
/* Interrupt handler for one block device.
 *
 * irq: the interrupt number (not used in the body).
 * _bd: the "struct blockdev" for the device.
 *
 * Returns IRQ_NONE if no request is outstanding or the device page's result
 * field is still 0; otherwise completes the outstanding request, restarts the
 * queue and returns IRQ_HANDLED.
 *
 * NOTE(review): bd->req and bd->lb_page->result are both tested before
 * bd->lock is taken -- confirm this is safe against the request function. */
static irqreturn_t lgb_irq(int irq, void *_bd)
|
||||
{
|
||||
/* We handed our "struct blockdev" as the argument to request_irq(), so
|
||||
* it is passed through to us here. This tells us which device we're
|
||||
* dealing with in case we have more than one. */
|
||||
struct blockdev *bd = _bd;
|
||||
unsigned long flags;
|
||||
|
||||
/* We weren't doing anything? Strange, but could happen if we shared
|
||||
* interrupts (we don't!). */
|
||||
if (!bd->req) {
|
||||
pr_debug("No work!\n");
|
||||
return IRQ_NONE;
|
||||
}
|
||||
|
||||
/* Not done yet (result still 0)? That's equally strange. */
|
||||
if (!bd->lb_page->result) {
|
||||
pr_debug("No result!\n");
|
||||
return IRQ_NONE;
|
||||
}
|
||||
|
||||
/* We have to grab the lock before ending the request. */
|
||||
spin_lock_irqsave(&bd->lock, flags);
|
||||
/* "result" is 1 for success, 2 for failure: end_entire_request() wants
|
||||
* to know whether this succeeded or not, so anything but 1 is reported
 * as a failure. */
|
||||
end_entire_request(bd->req, bd->lb_page->result == 1);
|
||||
/* Clear out request, it's done. */
|
||||
bd->req = NULL;
|
||||
/* Reset incoming DMA for next time. */
|
||||
bd->dma.used_len = 0;
|
||||
/* Ready for more reads or writes */
|
||||
blk_start_queue(bd->disk->queue);
|
||||
spin_unlock_irqrestore(&bd->lock, flags);
|
||||
|
||||
/* The interrupt was for us, we dealt with it. */
|
||||
return IRQ_HANDLED;
|
||||
}
|
||||
|
||||
/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
|
||||
* each of which contains "struct bio_vec"s, each of which contains a page, an
|
||||
* offset and a length.
|
||||
*
|
||||
* Fortunately there are iterators to help us walk through the "struct
|
||||
* request". Even more fortunately, there were plenty of places to steal the
|
||||
* code from. We pack the "struct request" into our "struct lguest_dma" and
|
||||
* return the total length. */
|
||||
/* Pack every segment of "req" into the addr[]/len[] arrays of "dma".
 *
 * req: the block request to walk.
 * dma: descriptor to fill; only addr[] and len[] are written here.
 *
 * Returns the sum of all segment lengths in bytes.  BUG()s if the request
 * has more than LGUEST_MAX_DMA_SECTIONS segments or a zero-length segment. */
static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
|
||||
{
|
||||
unsigned int i = 0, len = 0;	/* i: next free slot, len: running byte total */
|
||||
struct req_iterator iter;
|
||||
struct bio_vec *bvec;
|
||||
|
||||
rq_for_each_segment(bvec, req, iter) {
|
||||
/* We told the block layer not to give us too many. */
|
||||
BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
|
||||
/* If we had a zero-length segment, it would look like
|
||||
* the end of the data referred to by the "struct
|
||||
* lguest_dma", so make sure that doesn't happen. */
|
||||
BUG_ON(!bvec->bv_len);
|
||||
/* Convert page & offset to a physical address */
|
||||
dma->addr[i] = page_to_phys(bvec->bv_page)
|
||||
+ bvec->bv_offset;
|
||||
dma->len[i] = bvec->bv_len;
|
||||
len += bvec->bv_len;
|
||||
i++;
|
||||
}
|
||||
/* If the array isn't full, we mark the end with a 0 length; a
 * completely full array gets no terminator. */
|
||||
if (i < LGUEST_MAX_DMA_SECTIONS)
|
||||
dma->len[i] = 0;
|
||||
return len;
|
||||
}
|
||||
|
||||
/* This creates an empty DMA, useful for prodding the Host without sending data
|
||||
* (ie. when we want to do a read) */
|
||||
/* Turn "dma" into an empty descriptor by setting its first length to 0 (a
 * zero length marks the end of the data in a "struct lguest_dma").  No other
 * field of "dma" is written. */
static void empty_dma(struct lguest_dma *dma)
|
||||
{
|
||||
dma->len[0] = 0;
|
||||
}
|
||||
|
||||
/*D:470 Setting up a request is fairly easy: */
|
||||
/* Describe "req" in the device page and record it as the outstanding request.
 *
 * bd:   the device; its lb_page and req fields are written.
 * type: stored in lb_page->type (1 for write, 0 for read).
 * req:  the block request being started.
 * dma:  descriptor that req_to_dma() fills with the request's segments. */
static void setup_req(struct blockdev *bd,
|
||||
int type, struct request *req, struct lguest_dma *dma)
|
||||
{
|
||||
/* The type is 1 (write) or 0 (read). */
|
||||
bd->lb_page->type = type;
|
||||
/* The sector on disk where the read or write starts. */
|
||||
bd->lb_page->sector = req->sector;
|
||||
/* The result is initialized to 0 (unfinished). */
|
||||
bd->lb_page->result = 0;
|
||||
/* The current request (so we can end it in the interrupt handler). */
|
||||
bd->req = req;
|
||||
/* The number of bytes: returned as a side-effect of req_to_dma(),
|
||||
* which packs the block layer's "struct request" into our "struct
|
||||
* lguest_dma" */
|
||||
bd->lb_page->bytes = req_to_dma(req, dma);
|
||||
}
|
||||
|
||||
/*D:450 Write is pretty straightforward: we pack the request into a "struct
|
||||
* lguest_dma", then use SEND_DMA to send the request. */
|
||||
static void do_write(struct blockdev *bd, struct request *req)
|
||||
{
|
||||
struct lguest_dma send;
|
||||
|
||||
pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
|
||||
setup_req(bd, 1, req, &send);
|
||||
|
||||
lguest_send_dma(bd->phys_addr, &send);
|
||||
}
|
||||
|
||||
/* Read is similar to write, except we pack the request into our receive
|
||||
* "struct lguest_dma" and send through an empty DMA just to tell the Host that
|
||||
* there's a request pending. */
|
||||
static void do_read(struct blockdev *bd, struct request *req)
|
||||
{
|
||||
struct lguest_dma ping;
|
||||
|
||||
pr_debug("lgb: READ sector %li\n", (long)req->sector);
|
||||
setup_req(bd, 0, req, &bd->dma);
|
||||
|
||||
empty_dma(&ping);
|
||||
lguest_send_dma(bd->phys_addr, &ping);
|
||||
}
|
||||
|
||||
/*D:440 This is where requests come in: we get handed the request queue and are
|
||||
* expected to pull a "struct request" off it until we've finished them or
|
||||
* we're waiting for a reply: */
|
||||
static void do_lgb_request(struct request_queue *q)
|
||||
{
|
||||
struct blockdev *bd;
|
||||
struct request *req;
|
||||
|
||||
again:
|
||||
/* This sometimes returns NULL even on the very first time around. I
|
||||
* wonder if it's something to do with letting elves handle the request
|
||||
* queue... */
|
||||
req = elv_next_request(q);
|
||||
if (!req)
|
||||
return;
|
||||
|
||||
/* We attached the struct blockdev to the disk: get it back */
|
||||
bd = req->rq_disk->private_data;
|
||||
/* Sometimes we get repeated requests after blk_stop_queue(), but we
|
||||
* can only handle one at a time. */
|
||||
if (bd->req)
|
||||
return;
|
||||
|
||||
/* We only do reads and writes: no tricky business! */
|
||||
if (!blk_fs_request(req)) {
|
||||
pr_debug("Got non-command 0x%08x\n", req->cmd_type);
|
||||
req->errors++;
|
||||
end_entire_request(req, 0);
|
||||
goto again;
|
||||
}
|
||||
|
||||
if (rq_data_dir(req) == WRITE)
|
||||
do_write(bd, req);
|
||||
else
|
||||
do_read(bd, req);
|
||||
|
||||
/* We've put out the request, so stop any more coming in until we get
|
||||
* an interrupt, which takes us to lgb_irq() to re-enable the queue. */
|
||||
blk_stop_queue(q);
|
||||
}
|
||||
|
||||
/*D:430 This is the "struct block_device_operations" we attach to the disk at
|
||||
* the end of lguestblk_probe(). It doesn't seem to want much. */
|
||||
static struct block_device_operations lguestblk_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
|
||||
* quite why. I do know that the IDE code sent two or three of the maintainers
|
||||
* insane, perhaps this is the fringe of the same disease?
|
||||
*
|
||||
* As in the console code, the probe function gets handed the generic
|
||||
* lguest_device from lguest_bus.c: */
|
||||
static int lguestblk_probe(struct lguest_device *lgdev)
|
||||
{
|
||||
struct blockdev *bd;
|
||||
int err;
|
||||
int irqflags = IRQF_SHARED;
|
||||
|
||||
/* First we allocate our own "struct blockdev" and initialize the easy
|
||||
* fields. */
|
||||
bd = kmalloc(sizeof(*bd), GFP_KERNEL);
|
||||
if (!bd)
|
||||
return -ENOMEM;
|
||||
|
||||
spin_lock_init(&bd->lock);
|
||||
bd->irq = lgdev_irq(lgdev);
|
||||
bd->req = NULL;
|
||||
bd->dma.used_len = 0;
|
||||
bd->dma.len[0] = 0;
|
||||
/* The descriptor in the lguest_devices array provided by the Host
|
||||
* gives the Guest the physical page number of the device's page. */
|
||||
bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
|
||||
|
||||
/* We use lguest_map() to get a pointer to the device page */
|
||||
bd->lb_page = lguest_map(bd->phys_addr, 1);
|
||||
if (!bd->lb_page) {
|
||||
err = -ENOMEM;
|
||||
goto out_free_bd;
|
||||
}
|
||||
|
||||
/* We need a major device number: 0 means "assign one dynamically". */
|
||||
bd->major = register_blkdev(0, "lguestblk");
|
||||
if (bd->major < 0) {
|
||||
err = bd->major;
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
/* This allocates a "struct gendisk" where we pack all the information
|
||||
* about the disk which the rest of Linux sees. The argument is the
|
||||
* number of minor devices desired: we need one minor for the main
|
||||
* disk, and one for each partition. Of course, we can't possibly know
|
||||
* how many partitions are on the disk (add_disk does that).
|
||||
*/
|
||||
bd->disk = alloc_disk(16);
|
||||
if (!bd->disk) {
|
||||
err = -ENOMEM;
|
||||
goto out_unregister_blkdev;
|
||||
}
|
||||
|
||||
/* Every disk needs a queue for requests to come in: we set up the
|
||||
* queue with a callback function (the core of our driver) and the lock
|
||||
* to use. */
|
||||
bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
|
||||
if (!bd->disk->queue) {
|
||||
err = -ENOMEM;
|
||||
goto out_put_disk;
|
||||
}
|
||||
|
||||
/* We can only handle a certain number of pointers in our SEND_DMA
|
||||
* call, so we set that with blk_queue_max_hw_segments(). This is not
|
||||
* to be confused with blk_queue_max_phys_segments() of course! I
|
||||
* know, who could possibly confuse the two?
|
||||
*
|
||||
* Well, it's simple to tell them apart: this one seems to work and the
|
||||
* other one didn't. */
|
||||
blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
|
||||
|
||||
/* Due to technical limitations of our Host (and simple coding) we
|
||||
* can't have a single buffer which crosses a page boundary. Tell it
|
||||
* here. This means that our maximum request size is 16
|
||||
* (LGUEST_MAX_DMA_SECTIONS) pages. */
|
||||
blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
|
||||
|
||||
/* We name our disk: this becomes the device name when udev does its
|
||||
* magic thing and creates the device node, such as /dev/lgba.
|
||||
* next_block_index is a global which starts at 'a'. Unfortunately
|
||||
* this simple increment logic means that the 27th disk will be called
|
||||
* "/dev/lgb{". In that case, I recommend having at least 29 disks, so
|
||||
* your /dev directory will be balanced. */
|
||||
sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
|
||||
|
||||
/* We look to the device descriptor again to see if this device's
|
||||
* interrupts are expected to be random. If they are, we tell the irq
|
||||
* subsystem. At the moment this bit is always set. */
|
||||
if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
|
||||
irqflags |= IRQF_SAMPLE_RANDOM;
|
||||
|
||||
/* Now we have the name and irqflags, we can request the interrupt; we
|
||||
* give it the "struct blockdev" we have set up to pass to lgb_irq()
|
||||
* when there is an interrupt. */
|
||||
err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
|
||||
if (err)
|
||||
goto out_cleanup_queue;
|
||||
|
||||
/* We bind our one-entry DMA pool to the key for this block device so
|
||||
* the Host can reply to our requests. The key is equal to the
|
||||
* physical address of the device's page, which is conveniently
|
||||
* unique. */
|
||||
err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
|
||||
if (err)
|
||||
goto out_free_irq;
|
||||
|
||||
/* We finish our disk initialization and add the disk to the system. */
|
||||
bd->disk->major = bd->major;
|
||||
bd->disk->first_minor = 0;
|
||||
bd->disk->private_data = bd;
|
||||
bd->disk->fops = &lguestblk_fops;
|
||||
/* This is initialized to the disk size by the Launcher. */
|
||||
set_capacity(bd->disk, bd->lb_page->num_sectors);
|
||||
add_disk(bd->disk);
|
||||
|
||||
printk(KERN_INFO "%s: device %i at major %d\n",
|
||||
bd->disk->disk_name, lgdev->index, bd->major);
|
||||
|
||||
/* We don't need to keep the "struct blockdev" around, but if we ever
|
||||
* implemented device removal, we'd need this. */
|
||||
lgdev->private = bd;
|
||||
return 0;
|
||||
|
||||
out_free_irq:
|
||||
free_irq(bd->irq, bd);
|
||||
out_cleanup_queue:
|
||||
blk_cleanup_queue(bd->disk->queue);
|
||||
out_put_disk:
|
||||
put_disk(bd->disk);
|
||||
out_unregister_blkdev:
|
||||
unregister_blkdev(bd->major, "lguestblk");
|
||||
out_unmap:
|
||||
lguest_unmap(bd->lb_page);
|
||||
out_free_bd:
|
||||
kfree(bd);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*D:410 The boilerplate code for registering the lguest block driver is just
|
||||
* like the console: */
|
||||
static struct lguest_driver lguestblk_drv = {
|
||||
.name = "lguestblk",
|
||||
.owner = THIS_MODULE,
|
||||
.device_type = LGUEST_DEVICE_T_BLOCK,
|
||||
.probe = lguestblk_probe,
|
||||
};
|
||||
|
||||
static __init int lguestblk_init(void)
|
||||
{
|
||||
return register_lguest_driver(&lguestblk_drv);
|
||||
}
|
||||
module_init(lguestblk_init);
|
||||
|
||||
MODULE_DESCRIPTION("Lguest block driver");
|
||||
MODULE_LICENSE("GPL");
|
||||
308
drivers/block/virtio_blk.c
Normal file
308
drivers/block/virtio_blk.c
Normal file
@@ -0,0 +1,308 @@
|
||||
//#define DEBUG
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/hdreg.h>
|
||||
#include <linux/virtio.h>
|
||||
#include <linux/virtio_blk.h>
|
||||
#include <linux/virtio_blk.h>
|
||||
|
||||
static unsigned char virtblk_index = 'a';
|
||||
struct virtio_blk
|
||||
{
|
||||
spinlock_t lock;
|
||||
|
||||
struct virtio_device *vdev;
|
||||
struct virtqueue *vq;
|
||||
|
||||
/* The disk structure for the kernel. */
|
||||
struct gendisk *disk;
|
||||
|
||||
/* Request tracking. */
|
||||
struct list_head reqs;
|
||||
|
||||
mempool_t *pool;
|
||||
|
||||
/* Scatterlist: can be too big for stack. */
|
||||
struct scatterlist sg[3+MAX_PHYS_SEGMENTS];
|
||||
};
|
||||
|
||||
struct virtblk_req
|
||||
{
|
||||
struct list_head list;
|
||||
struct request *req;
|
||||
struct virtio_blk_outhdr out_hdr;
|
||||
struct virtio_blk_inhdr in_hdr;
|
||||
};
|
||||
|
||||
static bool blk_done(struct virtqueue *vq)
|
||||
{
|
||||
struct virtio_blk *vblk = vq->vdev->priv;
|
||||
struct virtblk_req *vbr;
|
||||
unsigned int len;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&vblk->lock, flags);
|
||||
while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
|
||||
int uptodate;
|
||||
switch (vbr->in_hdr.status) {
|
||||
case VIRTIO_BLK_S_OK:
|
||||
uptodate = 1;
|
||||
break;
|
||||
case VIRTIO_BLK_S_UNSUPP:
|
||||
uptodate = -ENOTTY;
|
||||
break;
|
||||
default:
|
||||
uptodate = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
end_dequeued_request(vbr->req, uptodate);
|
||||
list_del(&vbr->list);
|
||||
mempool_free(vbr, vblk->pool);
|
||||
}
|
||||
/* In case queue is stopped waiting for more buffers. */
|
||||
blk_start_queue(vblk->disk->queue);
|
||||
spin_unlock_irqrestore(&vblk->lock, flags);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
|
||||
struct request *req)
|
||||
{
|
||||
unsigned long num, out, in;
|
||||
struct virtblk_req *vbr;
|
||||
|
||||
vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
|
||||
if (!vbr)
|
||||
/* When another request finishes we'll try again. */
|
||||
return false;
|
||||
|
||||
vbr->req = req;
|
||||
if (blk_fs_request(vbr->req)) {
|
||||
vbr->out_hdr.type = 0;
|
||||
vbr->out_hdr.sector = vbr->req->sector;
|
||||
vbr->out_hdr.ioprio = vbr->req->ioprio;
|
||||
} else if (blk_pc_request(vbr->req)) {
|
||||
vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
|
||||
vbr->out_hdr.sector = 0;
|
||||
vbr->out_hdr.ioprio = vbr->req->ioprio;
|
||||
} else {
|
||||
/* We don't put anything else in the queue. */
|
||||
BUG();
|
||||
}
|
||||
|
||||
if (blk_barrier_rq(vbr->req))
|
||||
vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
|
||||
|
||||
/* We have to zero this, otherwise blk_rq_map_sg gets upset. */
|
||||
memset(vblk->sg, 0, sizeof(vblk->sg));
|
||||
sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
|
||||
num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
|
||||
sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr));
|
||||
|
||||
if (rq_data_dir(vbr->req) == WRITE) {
|
||||
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
|
||||
out = 1 + num;
|
||||
in = 1;
|
||||
} else {
|
||||
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
|
||||
out = 1;
|
||||
in = 1 + num;
|
||||
}
|
||||
|
||||
if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
|
||||
mempool_free(vbr, vblk->pool);
|
||||
return false;
|
||||
}
|
||||
|
||||
list_add_tail(&vbr->list, &vblk->reqs);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void do_virtblk_request(struct request_queue *q)
|
||||
{
|
||||
struct virtio_blk *vblk = NULL;
|
||||
struct request *req;
|
||||
unsigned int issued = 0;
|
||||
|
||||
while ((req = elv_next_request(q)) != NULL) {
|
||||
vblk = req->rq_disk->private_data;
|
||||
BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg));
|
||||
|
||||
/* If this request fails, stop queue and wait for something to
|
||||
finish to restart it. */
|
||||
if (!do_req(q, vblk, req)) {
|
||||
blk_stop_queue(q);
|
||||
break;
|
||||
}
|
||||
blkdev_dequeue_request(req);
|
||||
issued++;
|
||||
}
|
||||
|
||||
if (issued)
|
||||
vblk->vq->vq_ops->kick(vblk->vq);
|
||||
}
|
||||
|
||||
static int virtblk_ioctl(struct inode *inode, struct file *filp,
|
||||
unsigned cmd, unsigned long data)
|
||||
{
|
||||
return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue,
|
||||
inode->i_bdev->bd_disk, cmd,
|
||||
(void __user *)data);
|
||||
}
|
||||
|
||||
static struct block_device_operations virtblk_fops = {
|
||||
.ioctl = virtblk_ioctl,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int virtblk_probe(struct virtio_device *vdev)
|
||||
{
|
||||
struct virtio_blk *vblk;
|
||||
int err, major;
|
||||
void *token;
|
||||
unsigned int len;
|
||||
u64 cap;
|
||||
u32 v;
|
||||
|
||||
vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
|
||||
if (!vblk) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&vblk->reqs);
|
||||
spin_lock_init(&vblk->lock);
|
||||
vblk->vdev = vdev;
|
||||
|
||||
/* We expect one virtqueue, for output. */
|
||||
vblk->vq = vdev->config->find_vq(vdev, blk_done);
|
||||
if (IS_ERR(vblk->vq)) {
|
||||
err = PTR_ERR(vblk->vq);
|
||||
goto out_free_vblk;
|
||||
}
|
||||
|
||||
vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
|
||||
if (!vblk->pool) {
|
||||
err = -ENOMEM;
|
||||
goto out_free_vq;
|
||||
}
|
||||
|
||||
major = register_blkdev(0, "virtblk");
|
||||
if (major < 0) {
|
||||
err = major;
|
||||
goto out_mempool;
|
||||
}
|
||||
|
||||
/* FIXME: How many partitions? How long is a piece of string? */
|
||||
vblk->disk = alloc_disk(1 << 4);
|
||||
if (!vblk->disk) {
|
||||
err = -ENOMEM;
|
||||
goto out_unregister_blkdev;
|
||||
}
|
||||
|
||||
vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
|
||||
if (!vblk->disk->queue) {
|
||||
err = -ENOMEM;
|
||||
goto out_put_disk;
|
||||
}
|
||||
|
||||
sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++);
|
||||
vblk->disk->major = major;
|
||||
vblk->disk->first_minor = 0;
|
||||
vblk->disk->private_data = vblk;
|
||||
vblk->disk->fops = &virtblk_fops;
|
||||
|
||||
/* If barriers are supported, tell block layer that queue is ordered */
|
||||
token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len);
|
||||
if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER))
|
||||
blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL);
|
||||
|
||||
err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap);
|
||||
if (err) {
|
||||
dev_err(&vdev->dev, "Bad/missing capacity in config\n");
|
||||
goto out_put_disk;
|
||||
}
|
||||
|
||||
/* If capacity is too big, truncate with warning. */
|
||||
if ((sector_t)cap != cap) {
|
||||
dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
|
||||
(unsigned long long)cap);
|
||||
cap = (sector_t)-1;
|
||||
}
|
||||
set_capacity(vblk->disk, cap);
|
||||
|
||||
err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v);
|
||||
if (!err)
|
||||
blk_queue_max_segment_size(vblk->disk->queue, v);
|
||||
else if (err != -ENOENT) {
|
||||
dev_err(&vdev->dev, "Bad SIZE_MAX in config\n");
|
||||
goto out_put_disk;
|
||||
}
|
||||
|
||||
err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v);
|
||||
if (!err)
|
||||
blk_queue_max_hw_segments(vblk->disk->queue, v);
|
||||
else if (err != -ENOENT) {
|
||||
dev_err(&vdev->dev, "Bad SEG_MAX in config\n");
|
||||
goto out_put_disk;
|
||||
}
|
||||
|
||||
add_disk(vblk->disk);
|
||||
return 0;
|
||||
|
||||
out_put_disk:
|
||||
put_disk(vblk->disk);
|
||||
out_unregister_blkdev:
|
||||
unregister_blkdev(major, "virtblk");
|
||||
out_mempool:
|
||||
mempool_destroy(vblk->pool);
|
||||
out_free_vq:
|
||||
vdev->config->del_vq(vblk->vq);
|
||||
out_free_vblk:
|
||||
kfree(vblk);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void virtblk_remove(struct virtio_device *vdev)
|
||||
{
|
||||
struct virtio_blk *vblk = vdev->priv;
|
||||
int major = vblk->disk->major;
|
||||
|
||||
BUG_ON(!list_empty(&vblk->reqs));
|
||||
blk_cleanup_queue(vblk->disk->queue);
|
||||
put_disk(vblk->disk);
|
||||
unregister_blkdev(major, "virtblk");
|
||||
mempool_destroy(vblk->pool);
|
||||
kfree(vblk);
|
||||
}
|
||||
|
||||
static struct virtio_device_id id_table[] = {
|
||||
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
|
||||
{ 0 },
|
||||
};
|
||||
|
||||
static struct virtio_driver virtio_blk = {
|
||||
.driver.name = KBUILD_MODNAME,
|
||||
.driver.owner = THIS_MODULE,
|
||||
.id_table = id_table,
|
||||
.probe = virtblk_probe,
|
||||
.remove = __devexit_p(virtblk_remove),
|
||||
};
|
||||
|
||||
/* Module entry point: register the driver and report the result. */
static int __init init(void)
{
	int rc = register_virtio_driver(&virtio_blk);

	return rc;
}
|
||||
|
||||
/* Module exit point: undo the registration done in init(). */
static void __exit fini(void)
{
	unregister_virtio_driver(&virtio_blk);
}
|
||||
module_init(init);
|
||||
module_exit(fini);
|
||||
|
||||
MODULE_DEVICE_TABLE(virtio, id_table);
|
||||
MODULE_DESCRIPTION("Virtio block driver");
|
||||
MODULE_LICENSE("GPL");
|
||||
@@ -613,6 +613,10 @@ config HVC_XEN
|
||||
help
|
||||
Xen virtual console device driver
|
||||
|
||||
config VIRTIO_CONSOLE
|
||||
bool
|
||||
select HVC_DRIVER
|
||||
|
||||
config HVCS
|
||||
tristate "IBM Hypervisor Virtual Console Server support"
|
||||
depends on PPC_PSERIES
|
||||
|
||||
@@ -42,7 +42,6 @@ obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o
|
||||
obj-$(CONFIG_N_HDLC) += n_hdlc.o
|
||||
obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
|
||||
obj-$(CONFIG_SX) += sx.o generic_serial.o
|
||||
obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o
|
||||
obj-$(CONFIG_RIO) += rio/ generic_serial.o
|
||||
obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o
|
||||
obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o
|
||||
@@ -50,6 +49,7 @@ obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o
|
||||
obj-$(CONFIG_HVC_BEAT) += hvc_beat.o
|
||||
obj-$(CONFIG_HVC_DRIVER) += hvc_console.o
|
||||
obj-$(CONFIG_HVC_XEN) += hvc_xen.o
|
||||
obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o
|
||||
obj-$(CONFIG_RAW_DRIVER) += raw.o
|
||||
obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o
|
||||
obj-$(CONFIG_MSPEC) += mspec.o
|
||||
|
||||
@@ -1,177 +0,0 @@
|
||||
/*D:300
|
||||
* The Guest console driver
|
||||
*
|
||||
* This is a trivial console driver: we use lguest's DMA mechanism to send
|
||||
* bytes out, and register a DMA buffer to receive bytes in. It is assumed to
|
||||
* be present and available from the very beginning of boot.
|
||||
*
|
||||
* Writing console drivers is one of the few remaining Dark Arts in Linux.
|
||||
* Fortunately for us, the path of virtual consoles has been well-trodden by
|
||||
* the PowerPC folks, who wrote "hvc_console.c" to generically support any
|
||||
* virtual console. We use that infrastructure which only requires us to write
|
||||
* the basic put_chars and get_chars functions and call the right register
|
||||
* functions.
|
||||
:*/
|
||||
|
||||
/*M:002 The console can be flooded: while the Guest is processing input the
|
||||
* Host can send more. Buffering in the Host could alleviate this, but it is a
|
||||
* difficult problem in general. :*/
|
||||
/* Copyright (C) 2006 Rusty Russell, IBM Corporation
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
#include <linux/err.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/lguest_bus.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include "hvc_console.h"
|
||||
|
||||
/*D:340 This is our single console input buffer, with associated "struct
|
||||
* lguest_dma" referring to it. Note the 0-terminated length array, and the
|
||||
* use of physical address for the buffer itself. */
|
||||
static char inbuf[256];
|
||||
static struct lguest_dma cons_input = { .used_len = 0,
|
||||
.addr[0] = __pa(inbuf),
|
||||
.len[0] = sizeof(inbuf),
|
||||
.len[1] = 0 };
|
||||
|
||||
/*D:310 The put_chars() callback is pretty straightforward.
|
||||
*
|
||||
* First we put the pointer and length in a "struct lguest_dma": we only have
|
||||
* one pointer, so we set the second length to 0. Then we use SEND_DMA to send
|
||||
* the data to (Host) buffers attached to the console key. Usually a device's
|
||||
* key is a physical address within the device's memory, but because the
|
||||
* console device doesn't have any associated physical memory, we use the
|
||||
* LGUEST_CONSOLE_DMA_KEY constant (aka 0). */
|
||||
static int put_chars(u32 vtermno, const char *buf, int count)
{
	struct lguest_dma out;

	/* FIXME: DMA buffers in a "struct lguest_dma" are not allowed
	 * to go over page boundaries.  This never seems to happen,
	 * but if it did we'd need to fix this code. */

	/* Single section pointing at the caller's bytes, then a zero length
	 * to end the list. */
	out.addr[0] = __pa(buf);
	out.len[0] = count;
	out.len[1] = 0;

	lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &out);

	/* Report everything as written. */
	return count;
}
|
||||
|
||||
/*D:350 get_chars() is the callback from the hvc_console infrastructure when
|
||||
* an interrupt is received.
|
||||
*
|
||||
* Firstly we see if our buffer has been filled: if not, we return. The rest
|
||||
* of the code deals with the fact that the hvc_console() infrastructure only
|
||||
* asks us for 16 bytes at a time. We keep a "cons_offset" variable for
|
||||
* partially-read buffers. */
|
||||
static int get_chars(u32 vtermno, char *buf, int count)
{
	/* How much of the current input buffer has already been handed out;
	 * persists across calls. */
	static int consumed;

	/* used_len of zero means there is no input waiting. */
	if (cons_input.used_len == 0)
		return 0;

	/* Clamp the request to what is left in the buffer. */
	if (cons_input.used_len - consumed < count)
		count = cons_input.used_len - consumed;

	memcpy(buf, inbuf + consumed, count);
	consumed += count;

	/* Still data left for a later call. */
	if (consumed != cons_input.used_len)
		return count;

	/* Buffer drained: rewind, and clear used_len so the buffer can be
	 * filled again. */
	consumed = 0;
	cons_input.used_len = 0;
	return count;
}
|
||||
/*:*/
|
||||
|
||||
static struct hv_ops lguest_cons = {
|
||||
.get_chars = get_chars,
|
||||
.put_chars = put_chars,
|
||||
};
|
||||
|
||||
/*D:320 Console drivers are initialized very early so boot messages can go
|
||||
* out. At this stage, the console is output-only. Our driver checks we're a
|
||||
* Guest, and if so hands hvc_instantiate() the console number (0), priority
|
||||
* (0), and the struct hv_ops containing the put_chars() function. */
|
||||
static int __init cons_init(void)
{
	/* Only instantiate the console when the paravirt name says we are
	 * running as an lguest Guest; otherwise succeed and do nothing. */
	if (strcmp(pv_info.name, "lguest") == 0)
		return hvc_instantiate(0, 0, &lguest_cons);

	return 0;
}
|
||||
console_initcall(cons_init);
|
||||
|
||||
/*D:370 To set up and manage our virtual console, we call hvc_alloc() and
|
||||
* stash the result in the private pointer of the "struct lguest_device".
|
||||
* Since we never remove the console device we never need this pointer again,
|
||||
* but using ->private is considered good form, and you never know who's going
|
||||
* to copy your driver.
|
||||
*
|
||||
* Once the console is set up, we bind our input buffer ready for input. */
|
||||
static int lguestcons_probe(struct lguest_device *lgdev)
|
||||
{
|
||||
int err;
|
||||
|
||||
/* The first argument of hvc_alloc() is the virtual console number, so
|
||||
* we use zero. The second argument is the interrupt number.
|
||||
*
|
||||
* The third argument is a "struct hv_ops" containing the put_chars()
|
||||
* and get_chars() pointers. The final argument is the output buffer
|
||||
* size: we use 256 and expect the Host to have room for us to send
|
||||
* that much. */
|
||||
lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256);
|
||||
if (IS_ERR(lgdev->private))
|
||||
return PTR_ERR(lgdev->private);
|
||||
|
||||
/* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY.
|
||||
* "cons_input" is that statically-initialized global DMA buffer we saw
|
||||
* above, and we also give the interrupt we want. */
|
||||
err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1,
|
||||
lgdev_irq(lgdev));
|
||||
if (err)
|
||||
printk("lguest console: failed to bind buffer.\n");
|
||||
return err;
|
||||
}
|
||||
/* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc()
|
||||
* to expect input when this interrupt is triggered, and then tell
|
||||
* lguest_bind_dma() that is the interrupt to send us when input comes in. */
|
||||
|
||||
/*D:360 From now on the console driver follows standard Guest driver form:
|
||||
* register_lguest_driver() registers the device type and probe function, and
|
||||
* the probe function sets up the device.
|
||||
*
|
||||
* The standard "struct lguest_driver": */
|
||||
static struct lguest_driver lguestcons_drv = {
|
||||
.name = "lguestcons",
|
||||
.owner = THIS_MODULE,
|
||||
.device_type = LGUEST_DEVICE_T_CONSOLE,
|
||||
.probe = lguestcons_probe,
|
||||
};
|
||||
|
||||
/* The standard init function */
|
||||
static int __init hvc_lguest_init(void)
{
	/* Registration result is passed straight back to the caller. */
	int rc = register_lguest_driver(&lguestcons_drv);

	return rc;
}
|
||||
module_init(hvc_lguest_init);
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user