Merge branch 'linus' into core/softirq

Ingo Molnar
2008-06-25 12:23:59 +02:00
49 changed files with 647 additions and 500 deletions
+6 -14
@@ -84,10 +84,9 @@
runs an instance of gdb against the vmlinux file which contains
the symbols (not boot image such as bzImage, zImage, uImage...).
In gdb the developer specifies the connection parameters and
connects to kgdb. Depending on which kgdb I/O modules exist in
the kernel for a given architecture, it may be possible to debug
the test machine's kernel with the development machine using a
rs232 or ethernet connection.
connects to kgdb. The type of connection a developer makes with
gdb depends on the availability of kgdb I/O modules compiled as
builtins or kernel modules in the test machine's kernel.
</para>
</chapter>
<chapter id="CompilingAKernel">
@@ -223,7 +222,7 @@
</para>
<para>
IMPORTANT NOTE: Using this option with kgdb over the console
(kgdboc) or kgdb over ethernet (kgdboe) is not supported.
(kgdboc) is not supported.
</para>
</sect1>
</chapter>
@@ -249,18 +248,11 @@
(gdb) target remote /dev/ttyS0
</programlisting>
<para>
Example (kgdb to a terminal server):
Example (kgdb to a terminal server on tcp port 2012):
</para>
<programlisting>
% gdb ./vmlinux
(gdb) target remote udp:192.168.2.2:6443
</programlisting>
<para>
Example (kgdb over ethernet):
</para>
<programlisting>
% gdb ./vmlinux
(gdb) target remote udp:192.168.2.2:6443
(gdb) target remote 192.168.2.2:2012
</programlisting>
<para>
Once connected, you can debug a kernel the way you would debug an
+1 -1
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 26
EXTRAVERSION = -rc7
EXTRAVERSION = -rc8
NAME = Rotary Wombat
# *DOCUMENTATION*
-2
@@ -558,8 +558,6 @@ static struct iosapic_rte_info * __init_refok iosapic_alloc_rte (void)
if (!iosapic_kmalloc_ok && list_empty(&free_rte_list)) {
rte = alloc_bootmem(sizeof(struct iosapic_rte_info) *
NR_PREALLOCATE_RTE_ENTRIES);
if (!rte)
return NULL;
for (i = 0; i < NR_PREALLOCATE_RTE_ENTRIES; i++, rte++)
list_add(&rte->rte_list, &free_rte_list);
}
+1 -2
@@ -578,8 +578,6 @@ setup_arch (char **cmdline_p)
cpu_init(); /* initialize the bootstrap CPU */
mmu_context_init(); /* initialize context_id bitmap */
check_sal_cache_flush();
#ifdef CONFIG_ACPI
acpi_boot_init();
#endif
@@ -607,6 +605,7 @@ setup_arch (char **cmdline_p)
ia64_mca_init();
platform_setup(cmdline_p);
check_sal_cache_flush();
paging_init();
}
+1 -1
@@ -512,7 +512,7 @@ static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, si
int cpu;
char optstr[64];
if (count > sizeof(optstr))
if (count == 0 || count > sizeof(optstr))
return -EINVAL;
if (copy_from_user(optstr, user, count))
return -EFAULT;
+5
@@ -383,6 +383,7 @@ config VMI
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
select PARAVIRT_CLOCK
depends on !(X86_VISWS || X86_VOYAGER)
help
Turning on this option will allow you to run a paravirtualized clock
@@ -410,6 +411,10 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
config PARAVIRT_CLOCK
bool
default n
endif
config MEMTEST_BOOTPARAM
+1
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
+33 -56
@@ -18,6 +18,7 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
#include <asm/pvclock.h>
#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
static struct pvclock_wall_clock wall_clock;
static inline u64 kvm_get_delta(u64 last_tsc)
{
int cpu = smp_processor_id();
u64 delta = native_read_tsc() - last_tsc;
return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
}
static struct kvm_wall_clock wall_clock;
static cycle_t kvm_clock_read(void);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void);
*/
static unsigned long kvm_get_wallclock(void)
{
u32 wc_sec, wc_nsec;
u64 delta;
struct pvclock_vcpu_time_info *vcpu_time;
struct timespec ts;
int version, nsec;
int low, high;
low = (int)__pa(&wall_clock);
high = ((u64)__pa(&wall_clock) >> 32);
delta = kvm_clock_read();
native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
do {
version = wall_clock.wc_version;
rmb();
wc_sec = wall_clock.wc_sec;
wc_nsec = wall_clock.wc_nsec;
rmb();
} while ((wall_clock.wc_version != version) || (version & 1));
delta = kvm_clock_read() - delta;
delta += wc_nsec;
nsec = do_div(delta, NSEC_PER_SEC);
set_normalized_timespec(&ts, wc_sec + delta, nsec);
/*
* Of all mechanisms of time adjustment I've tested, this one
* was the champion!
*/
return ts.tv_sec + 1;
vcpu_time = &get_cpu_var(hv_clock);
pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
put_cpu_var(hv_clock);
return ts.tv_sec;
}
static int kvm_set_wallclock(unsigned long now)
{
return 0;
return -1;
}
/*
* This is our read_clock function. The host puts a tsc timestamp each time
* it updates a new time. Without the tsc adjustment, we can have a situation
* in which a vcpu starts to run earlier (smaller system_time), but probes
* time later (compared to another vcpu), leading to backwards time
*/
static cycle_t kvm_clock_read(void)
{
u64 last_tsc, now;
int cpu;
struct pvclock_vcpu_time_info *src;
cycle_t ret;
preempt_disable();
cpu = smp_processor_id();
last_tsc = get_clock(cpu, tsc_timestamp);
now = get_clock(cpu, system_time);
now += kvm_get_delta(last_tsc);
preempt_enable();
return now;
src = &get_cpu_var(hv_clock);
ret = pvclock_clocksource_read(src);
put_cpu_var(hv_clock);
return ret;
}
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static int kvm_register_clock(void)
static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high;
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
WARN_ON(kvm_register_clock());
WARN_ON(kvm_register_clock("secondary cpu clock"));
/* ok, done with our trickery, call native */
setup_secondary_APIC_clock();
}
#endif
#ifdef CONFIG_SMP
void __init kvm_smp_prepare_boot_cpu(void)
{
WARN_ON(kvm_register_clock("primary cpu clock"));
native_smp_prepare_boot_cpu();
}
#endif
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -174,13 +148,16 @@ void __init kvmclock_init(void)
return;
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
if (kvm_register_clock())
if (kvm_register_clock("boot clock"))
return;
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
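A side note on the registration protocol visible in kvm_register_clock() above: every CPU (the boot CPU now included, via the new smp_prepare_boot_cpu hook) hands the hypervisor the guest-physical address of its per-cpu time structure, with bit 0 set as an enable flag, split into the halves wrmsr expects. A minimal sketch of that encoding, with a made-up address standing in for the real `__pa(&per_cpu(hv_clock, cpu))`:

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch of the MSR_KVM_SYSTEM_TIME payload written by
 * kvm_register_clock() above. The address is hypothetical. */
int main(void)
{
	uint64_t pa = 0x03fe4000ULL;		/* made-up __pa() result */
	uint32_t low  = (uint32_t)pa | 1;	/* bit 0: enable updates */
	uint32_t high = (uint32_t)(pa >> 32);

	/* wrmsr takes the value split as high:low (edx:eax) */
	printf("wrmsr(MSR_KVM_SYSTEM_TIME, low=%#010x, high=%#010x)\n",
	       low, high);
	return 0;
}
```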
+141
@@ -0,0 +1,141 @@
/* paravirtual clock -- common code used by kvm/xen
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <asm/pvclock.h>
/*
* These are periodically updated
* xen: magic shared_info page
* kvm: gpa registered via msr
* and then copied here.
*/
struct pvclock_shadow_time {
u64 tsc_timestamp; /* TSC at last update of time vals. */
u64 system_timestamp; /* Time, in nanosecs, since boot. */
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
};
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
#endif
if (shift < 0)
delta >>= -shift;
else
delta <<= shift;
#ifdef __i386__
__asm__ (
"mul %5 ; "
"mov %4,%%eax ; "
"mov %%edx,%4 ; "
"mul %5 ; "
"xor %5,%5 ; "
"add %4,%%eax ; "
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif
return product;
}
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
u64 delta = native_read_tsc() - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
/*
* Reads a consistent set of time-base values from hypervisor,
* into a shadow data area.
*/
static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
struct pvclock_vcpu_time_info *src)
{
do {
dst->version = src->version;
rmb(); /* fetch version before data */
dst->tsc_timestamp = src->tsc_timestamp;
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
rmb(); /* test version after fetching data */
} while ((src->version & 1) || (dst->version != src->version));
return dst->version;
}
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
unsigned version;
cycle_t ret, offset;
do {
version = pvclock_get_time_values(&shadow, src);
barrier();
offset = pvclock_get_nsec_offset(&shadow);
ret = shadow.system_timestamp + offset;
barrier();
} while (version != src->version);
return ret;
}
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
struct timespec *ts)
{
u32 version;
u64 delta;
struct timespec now;
/* get wallclock at system boot */
do {
version = wall_clock->version;
rmb(); /* fetch version before time */
now.tv_sec = wall_clock->sec;
now.tv_nsec = wall_clock->nsec;
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));
delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
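The even/odd version handshake used by pvclock_get_time_values() and pvclock_read_wallclock() above is worth seeing in isolation: the writer makes the version odd while the fields are inconsistent, and the reader retries until it sees the same even version on both sides of its copy. Below is a hedged userspace model of that protocol (build with `-pthread`); a second thread and `__sync_synchronize()` stand in for the hypervisor writer and the kernel's rmb(), so this is an illustration, not the kernel code:

```c
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct shadow {
	volatile uint32_t version;
	volatile uint64_t sec, nsec;	/* invariant: nsec == sec * 7 % 1e9 */
};

static struct shadow clk;

static void *writer(void *arg)
{
	for (uint64_t i = 1; i < 1000000; i++) {
		clk.version++;			/* odd: update in progress */
		__sync_synchronize();
		clk.sec  = i;
		clk.nsec = i * 7 % 1000000000;
		__sync_synchronize();
		clk.version++;			/* even: update complete */
	}
	return NULL;
}

int main(void)
{
	pthread_t w;
	pthread_create(&w, NULL, writer, NULL);

	for (int i = 0; i < 1000000; i++) {
		uint32_t ver;
		uint64_t sec, nsec;
		do {
			ver = clk.version;
			__sync_synchronize();	/* fetch version before data */
			sec  = clk.sec;
			nsec = clk.nsec;
			__sync_synchronize();	/* fetch data before recheck */
		} while ((ver & 1) || ver != clk.version);

		if (nsec != sec * 7 % 1000000000)
			puts("torn read!");	/* never triggers */
	}
	pthread_join(w, NULL);
	return 0;
}
```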
+6 -3
@@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
atomic_inc(&pt->pending);
smp_mb__after_atomic_inc();
if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(&vcpu0->wq);
if (vcpu0) {
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
if (waitqueue_active(&vcpu0->wq)) {
vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(&vcpu0->wq);
}
}
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
+1
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
+8 -11
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
rmap_remove(kvm, spte);
--kvm->stat.lpages;
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
if (user_fault) {
mmu_unshadow(vcpu->kvm, gfn);
goto unshadowed;
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
unshadowed:
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
u64 *spte,
const void *new)
{
if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
&& !vcpu->arch.update_pte.largepage) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
if (!vcpu->arch.update_pte.largepage ||
sp->role.glevels == PT32_ROOT_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
}
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
+11 -8
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
load_transition_efer(vmx);
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
unsigned long flags;
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
reload_host_efer(vmx);
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
preempt_disable();
__vmx_load_host_state(vmx);
preempt_enable();
}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_load_host_state(to_vmx(vcpu));
__vmx_load_host_state(to_vmx(vcpu));
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
switch (msr_index) {
#ifdef CONFIG_X86_64
case MSR_EFER:
vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
if (vmx->host_state.loaded) {
reload_host_efer(vmx);
load_transition_efer(vmx);
}
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
guest_write_tsc(data);
break;
default:
vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
msr->data = data;
if (vmx->host_state.loaded)
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
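The __vmx_load_host_state()/vmx_load_host_state() split above follows the kernel's usual double-underscore convention: the inner function assumes the caller has already excluded preemption (as the vcpu_put path does), while the public wrapper makes the operation safe from preemptible context. A rough userspace analogue of the pattern, with a mutex standing in for preempt_disable(); the names are illustrative:

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int host_state_loaded;

/* caller must hold `lock` (stands in for preemption being off) */
static void __load_host_state(void)
{
	host_state_loaded = 1;
}

/* safe from any context: takes the exclusion itself */
static void load_host_state(void)
{
	pthread_mutex_lock(&lock);
	__load_host_state();
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	load_host_state();
	printf("loaded=%d\n", host_state_loaded);
	return 0;
}
```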
+66 -25
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
struct kvm_wall_clock wc;
struct timespec wc_ts;
struct pvclock_wall_clock wc;
struct timespec now, sys, boot;
if (!wall_clock)
return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
wc_ts = current_kernel_time();
wc.wc_sec = wc_ts.tv_sec;
wc.wc_nsec = wc_ts.tv_nsec;
wc.wc_version = version;
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_write_guest_time below) to the
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
now = current_kernel_time();
ktime_get_ts(&sys);
boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
wc.sec = boot.tv_sec;
wc.nsec = boot.tv_nsec;
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
uint32_t quotient, remainder;
/* Don't try to replace with do_div(), this one calculates
* "(dividend << 32) / divisor" */
__asm__ ( "divl %4"
: "=a" (quotient), "=d" (remainder)
: "0" (0), "1" (dividend), "r" (divisor) );
return quotient;
}
static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
{
uint64_t nsecs = 1000000000LL;
int32_t shift = 0;
uint64_t tps64;
uint32_t tps32;
tps64 = tsc_khz * 1000LL;
while (tps64 > nsecs*2) {
tps64 >>= 1;
shift--;
}
tps32 = (uint32_t)tps64;
while (tps32 <= (uint32_t)nsecs) {
tps32 <<= 1;
shift++;
}
hv_clock->tsc_shift = shift;
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
__FUNCTION__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
vcpu->hv_clock_tsc_khz = tsc_khz;
}
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
* state, we just write "2" at the end
* state, we just increase by 2 at the end.
*/
vcpu->hv_clock.version = 2;
vcpu->hv_clock.version += 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
vcpu->arch.hv_clock.tsc_to_system_mul =
clocksource_khz2mult(tsc_khz, 22);
vcpu->arch.hv_clock.tsc_shift = 22;
down_read(&current->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2759,6 +2808,8 @@ again:
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
&vcpu->requests)) {
kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
}
}
clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
kvm_inject_pending_timer_irqs(vcpu);
preempt_disable();
@@ -2781,21 +2833,13 @@ again:
local_irq_disable();
if (need_resched()) {
if (vcpu->requests || need_resched()) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
if (vcpu->requests)
if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -2825,9 +2869,6 @@ again:
kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
KVMTRACE_0D(VMENTRY, vcpu, entryexit);
kvm_x86_ops->run(vcpu, kvm_run);
+2 -1
@@ -5,8 +5,9 @@
config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
depends on X86_32
depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
+23 -33
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
static __init void xen_pagetable_setup_start(pgd_t *base)
{
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
int i;
/* special set_pte for pagetable initialization */
pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base;
/*
* copy top-level of Xen-supplied pagetable into place. For
* !PAE we can use this as-is, but for PAE it is a stand-in
* while we copy the pmd pages.
* copy top-level of Xen-supplied pagetable into place. This
* is a stand-in while we copy the pmd pages.
*/
memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
if (PTRS_PER_PMD > 1) {
int i;
/*
* For PAE, need to allocate new pmds, rather than
* share Xen's, since Xen doesn't like pmd's being
* shared between address spaces.
*/
for (i = 0; i < PTRS_PER_PGD; i++) {
if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
/*
* For PAE, need to allocate new pmds, rather than
* share Xen's, since Xen doesn't like pmd's being
* shared between address spaces.
*/
for (i = 0; i < PTRS_PER_PGD; i++) {
if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
PAGE_SIZE);
memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
PAGE_SIZE);
make_lowmem_page_readonly(pmd);
make_lowmem_page_readonly(pmd);
set_pgd(&base[i], __pgd(1 + __pa(pmd)));
} else
pgd_clear(&base[i]);
}
set_pgd(&base[i], __pgd(1 + __pa(pmd)));
} else
pgd_clear(&base[i]);
}
/* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
{
unsigned level;
#ifdef CONFIG_X86_PAE
level = MMUEXT_PIN_L3_TABLE;
#else
level = MMUEXT_PIN_L2_TABLE;
#endif
pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
}
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
}
/* This is called once we have the cpu_possible_map */
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
.set_pud = xen_set_pud,
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
#endif /* PAE */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
/* Prevent unwanted bits from being set in PTEs. */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (!is_initial_xendomain())
__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
/* set the limit of our address space */
xen_reserve_top();
+33 -44
@@ -179,50 +179,56 @@ out:
preempt_enable();
}
/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
if (val & _PAGE_PRESENT) {
unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
pteval_t flags = val & ~PTE_MASK;
val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
}
return val;
}
static pteval_t pte_pfn_to_mfn(pteval_t val)
{
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
pteval_t flags = val & ~PTE_MASK;
val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
}
return val;
}
pteval_t xen_pte_val(pte_t pte)
{
pteval_t ret = pte.pte;
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
return pte_mfn_to_pfn(pte.pte);
}
pgdval_t xen_pgd_val(pgd_t pgd)
{
pgdval_t ret = pgd.pgd;
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
return pte_mfn_to_pfn(pgd.pgd);
}
pte_t xen_make_pte(pteval_t pte)
{
if (pte & _PAGE_PRESENT) {
pte = phys_to_machine(XPADDR(pte)).maddr;
pte &= ~(_PAGE_PCD | _PAGE_PWT);
}
return (pte_t){ .pte = pte };
pte = pte_pfn_to_mfn(pte);
return native_make_pte(pte);
}
pgd_t xen_make_pgd(pgdval_t pgd)
{
if (pgd & _PAGE_PRESENT)
pgd = phys_to_machine(XPADDR(pgd)).maddr;
return (pgd_t){ pgd };
pgd = pte_pfn_to_mfn(pgd);
return native_make_pgd(pgd);
}
pmdval_t xen_pmd_val(pmd_t pmd)
{
pmdval_t ret = native_pmd_val(pmd);
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
return pte_mfn_to_pfn(pmd.pmd);
}
#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
struct multicall_space mcs;
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
pmd_t xen_make_pmd(pmdval_t pmd)
{
if (pmd & _PAGE_PRESENT)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
#else /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
}
#endif /* CONFIG_X86_PAE */
/*
(Yet another) pagetable walker. This one is intended for pinning a
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
unsigned level;
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch();
}
#ifdef CONFIG_X86_PAE
level = MMUEXT_PIN_L3_TABLE;
#else
level = MMUEXT_PIN_L2_TABLE;
#endif
xen_do_pin(level, PFN_DOWN(__pa(pgd)));
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0);
}
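The pte_mfn_to_pfn()/pte_pfn_to_mfn() helpers factored out above all follow one shape: translate the frame number only when the entry is present, and carry the flag bits through untouched. A toy userspace model of that shape, where the masks and the translation table are illustrative values, not Xen's:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PTE_MASK	0x000ffffffffff000ULL	/* frame-number bits */
#define _PAGE_PRESENT	0x001ULL

static uint64_t p2m[4] = { 7, 3, 9, 1 };	/* toy pfn -> mfn table */

/* Model of pte_pfn_to_mfn(): swap the frame number through the
 * p2m table, keep the flags, skip non-present entries. */
static uint64_t pte_pfn_to_mfn(uint64_t val)
{
	if (val & _PAGE_PRESENT) {
		uint64_t pfn = (val & PTE_MASK) >> PAGE_SHIFT;
		uint64_t flags = val & ~PTE_MASK;
		val = (p2m[pfn] << PAGE_SHIFT) | flags;
	}
	return val;
}

int main(void)
{
	uint64_t pte = (2ULL << PAGE_SHIFT) | _PAGE_PRESENT | 0x2;
	printf("guest pte %#llx -> machine pte %#llx\n",
	       (unsigned long long)pte,
	       (unsigned long long)pte_pfn_to_mfn(pte));
	return 0;
}
```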
+6 -18
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
void xen_pgd_pin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);
#ifdef CONFIG_X86_PAE
unsigned long long xen_pte_val(pte_t);
unsigned long long xen_pmd_val(pmd_t);
unsigned long long xen_pgd_val(pgd_t);
pteval_t xen_pte_val(pte_t);
pmdval_t xen_pmd_val(pmd_t);
pgdval_t xen_pgd_val(pgd_t);
pte_t xen_make_pte(unsigned long long);
pmd_t xen_make_pmd(unsigned long long);
pgd_t xen_make_pgd(unsigned long long);
pte_t xen_make_pte(pteval_t);
pmd_t xen_make_pmd(pmdval_t);
pgd_t xen_make_pgd(pgdval_t);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
#else
unsigned long xen_pte_val(pte_t);
unsigned long xen_pmd_val(pmd_t);
unsigned long xen_pgd_val(pgd_t);
pte_t xen_make_pte(unsigned long);
pmd_t xen_make_pmd(unsigned long);
pgd_t xen_make_pgd(unsigned long);
#endif
#endif /* _XEN_MMU_H */
+12 -120
@@ -14,6 +14,7 @@
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -31,17 +32,6 @@
static cycle_t xen_clocksource_read(void);
/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
u64 tsc_timestamp; /* TSC at last update of time vals. */
u64 system_timestamp; /* Time, in nanosecs, since boot. */
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void)
unsigned long xen_cpu_khz(void)
{
u64 xen_khz = 1000000ULL << 32;
const struct vcpu_time_info *info =
const struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(xen_khz, info->tsc_to_system_mul);
@@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void)
return xen_khz;
}
/*
* Reads a consistent set of time-base values from Xen, into a shadow data
* area.
*/
static unsigned get_time_values_from_xen(void)
{
struct vcpu_time_info *src;
struct shadow_time_info *dst;
/* src is shared memory with the hypervisor, so we need to
make sure we get a consistent snapshot, even in the face of
being preempted. */
src = &__get_cpu_var(xen_vcpu)->time;
dst = &__get_cpu_var(shadow_time);
do {
dst->version = src->version;
rmb(); /* fetch version before data */
dst->tsc_timestamp = src->tsc_timestamp;
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
rmb(); /* test version after fetching data */
} while ((src->version & 1) | (dst->version ^ src->version));
return dst->version;
}
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
u32 tmp1, tmp2;
#endif
if (shift < 0)
delta >>= -shift;
else
delta <<= shift;
#ifdef __i386__
__asm__ (
"mul %5 ; "
"mov %4,%%eax ; "
"mov %%edx,%4 ; "
"mul %5 ; "
"xor %5,%5 ; "
"add %4,%%eax ; "
"adc %5,%%edx ; "
: "=A" (product), "=r" (tmp1), "=r" (tmp2)
: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
__asm__ (
"mul %%rdx ; shrd $32,%%rdx,%%rax"
: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif
return product;
}
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
u64 now, delta;
now = native_read_tsc();
delta = now - shadow->tsc_timestamp;
return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}
static cycle_t xen_clocksource_read(void)
{
struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
struct pvclock_vcpu_time_info *src;
cycle_t ret;
unsigned version;
do {
version = get_time_values_from_xen();
barrier();
ret = shadow->system_timestamp + get_nsec_offset(shadow);
barrier();
} while (version != __get_cpu_var(xen_vcpu)->time.version);
put_cpu_var(shadow_time);
src = &get_cpu_var(xen_vcpu)->time;
ret = pvclock_clocksource_read(src);
put_cpu_var(xen_vcpu);
return ret;
}
static void xen_read_wallclock(struct timespec *ts)
{
const struct shared_info *s = HYPERVISOR_shared_info;
u32 version;
u64 delta;
struct timespec now;
struct shared_info *s = HYPERVISOR_shared_info;
struct pvclock_wall_clock *wall_clock = &(s->wc);
struct pvclock_vcpu_time_info *vcpu_time;
/* get wallclock at system boot */
do {
version = s->wc_version;
rmb(); /* fetch version before time */
now.tv_sec = s->wc_sec;
now.tv_nsec = s->wc_nsec;
rmb(); /* fetch time before checking version */
} while ((s->wc_version & 1) | (version ^ s->wc_version));
delta = xen_clocksource_read(); /* time since system boot */
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
vcpu_time = &get_cpu_var(xen_vcpu)->time;
pvclock_read_wallclock(wall_clock, vcpu_time, ts);
put_cpu_var(xen_vcpu);
}
unsigned long xen_get_wallclock(void)
@@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void)
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
@@ -569,8 +463,6 @@ __init void xen_time_init(void)
{
int cpu = smp_processor_id();
get_time_values_from_xen();
clocksource_register(&xen_clocksource);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
+1 -5
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
__FINIT
.pushsection .bss.page_aligned
.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
#else
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */

Some files were not shown because too many files have changed in this diff.