Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "First batch of KVM changes for 4.4.

  s390:
     A bunch of fixes and optimizations for interrupt and time handling.

  PPC:
     Mostly bug fixes.

  ARM:
     No big features, but many small fixes and prerequisites including:

      - a number of fixes for the arch-timer

      - introducing proper level-triggered semantics for the arch-timers

      - a series of patches to synchronously halt a guest (prerequisite
        for IRQ forwarding)

      - some tracepoint improvements

      - a tweak for the EL2 panic handlers

      - some more VGIC cleanups getting rid of redundant state

  x86:
     Quite a few changes:

      - support for VT-d posted interrupts (i.e. PCI devices can inject
        interrupts directly into vCPUs).  This introduces a new
        component (in virt/lib/) that connects VFIO and KVM together.
        The same infrastructure will be used for ARM interrupt
        forwarding as well.

      - more Hyper-V features, though the main one (the Hyper-V
        synthetic interrupt controller) will have to wait for 4.5.
        These will let KVM expose Hyper-V devices.

      - nested virtualization now supports VPID (same as PCID but for
        vCPUs) which makes it quite a bit faster

      - for future hardware that supports NVDIMM, there is support for
        clflushopt, clwb, pcommit

      - support for "split irqchip", i.e.  LAPIC in kernel +
        IOAPIC/PIC/PIT in userspace, which reduces the attack surface of
        the hypervisor

      - obligatory smattering of SMM fixes

      - on the guest side, stable scheduler clock support was rewritten
        to not require help from the hypervisor"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits)
  KVM: VMX: Fix commit which broke PML
  KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0()
  KVM: x86: allow RSM from 64-bit mode
  KVM: VMX: fix SMEP and SMAP without EPT
  KVM: x86: move kvm_set_irq_inatomic to legacy device assignment
  KVM: device assignment: remove pointless #ifdefs
  KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic
  KVM: x86: zero apic_arb_prio on reset
  drivers/hv: share Hyper-V SynIC constants with userspace
  KVM: x86: handle SMBASE as physical address in RSM
  KVM: x86: add read_phys to x86_emulate_ops
  KVM: x86: removing unused variable
  KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs
  KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr()
  KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings
  KVM: arm/arm64: Optimize away redundant LR tracking
  KVM: s390: use simple switch statement as multiplexer
  KVM: s390: drop useless newline in debugging data
  KVM: s390: SCA must not cross page boundaries
  KVM: arm: Do not indent the arguments of DECLARE_BITMAP
  ...
This commit is contained in:
Linus Torvalds
2015-11-05 16:26:26 -08:00
89 changed files with 2956 additions and 1029 deletions
+1
View File
@@ -0,0 +1 @@
obj-y += lib/
+4 -1
View File
@@ -46,4 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT
config KVM_COMPAT
def_bool y
depends on COMPAT && !S390
depends on KVM && COMPAT && !S390
config HAVE_KVM_IRQ_BYPASS
bool
+117 -56
View File
@@ -28,6 +28,8 @@
#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>
#include "trace.h"
static struct timecounter *timecounter;
static struct workqueue_struct *wqueue;
static unsigned int host_vtimer_irq;
@@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer)
}
}
static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
{
int ret;
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
kvm_vgic_set_phys_irq_active(timer->map, true);
ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
timer->map,
timer->irq->level);
WARN_ON(ret);
}
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
return HRTIMER_NORESTART;
}
static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
}
bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
cycle_t cval, now;
if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
!(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
kvm_vgic_get_phys_irq_active(timer->map))
if (!kvm_timer_irq_can_fire(vcpu))
return false;
cval = timer->cntv_cval;
@@ -127,12 +123,94 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
return cval <= now;
}
/*
 * Record @new_level as the current level of the virtual timer line, emit the
 * kvm_timer_update_irq tracepoint and hand the level to the vgic as a mapped
 * interrupt. The vgic must already be initialized; a failing injection only
 * triggers a warning.
 */
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
{
	struct arch_timer_cpu *vtimer = &vcpu->arch.timer_cpu;
	int err;

	BUG_ON(!vgic_initialized(vcpu->kvm));

	/* Update the cached level first: both the trace and the vgic read it. */
	vtimer->irq.level = new_level;
	trace_kvm_timer_update_irq(vcpu->vcpu_id, vtimer->map->virt_irq,
				   vtimer->irq.level);

	err = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
					 vtimer->map, vtimer->irq.level);
	WARN_ON(err);
}
/*
 * Re-evaluate the timer output and, if it no longer matches the level we
 * last reported, raise or lower the line towards the GIC accordingly.
 */
static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
{
	struct arch_timer_cpu *vtimer = &vcpu->arch.timer_cpu;

	/*
	 * Userspace may have written the timer registers through SET_ONE_REG
	 * before the vgic was initialized. Touching timer->irq.level at that
	 * point would hide the interrupt from the guest forever, so do
	 * nothing until kvm_timer_flush_hwstate calls us again.
	 */
	if (vgic_initialized(vcpu->kvm) &&
	    kvm_timer_should_fire(vcpu) != vtimer->irq.level)
		kvm_timer_update_irq(vcpu, !vtimer->irq.level);
}
/*
 * Arm the background (soft) timer ahead of kvm_vcpu_block, so that the
 * blocked thread is taken off its waitqueue and made runnable once the guest
 * timer interrupt is due.
 */
void kvm_timer_schedule(struct kvm_vcpu *vcpu)
{
	struct arch_timer_cpu *vtimer = &vcpu->arch.timer_cpu;
	cycle_t expiry, current_cycles;

	BUG_ON(timer_is_armed(vtimer));

	/*
	 * Nothing to arm in two cases:
	 *  - the guest timer has already expired, so kvm_vcpu_block will
	 *    return before putting the thread to sleep;
	 *  - the timer cannot raise interrupts (disabled or masked).
	 */
	if (kvm_timer_should_fire(vcpu) || !kvm_timer_irq_can_fire(vcpu))
		return;

	/* Not expired yet: convert the remaining cycles to ns and arm. */
	expiry = vtimer->cntv_cval;
	current_cycles = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;

	timer_arm(vtimer, cyclecounter_cyc2ns(timecounter->cc,
					      expiry - current_cycles,
					      timecounter->mask,
					      &timecounter->frac));
}
/* Cancel the background timer set up for @vcpu by kvm_timer_schedule(). */
void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
{
	timer_disarm(&vcpu->arch.timer_cpu);
}
/**
* kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
* @vcpu: The vcpu pointer
*
* Disarm any pending soft timers, since the world-switch code will write the
* virtual timer state back to the physical CPU.
* Check if the virtual timer has expired while we were running in the host,
* and inject an interrupt if that was the case.
*/
void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
{
@@ -140,28 +218,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
bool phys_active;
int ret;
/*
* We're about to run this vcpu again, so there is no need to
* keep the background timer running, as we're about to
* populate the CPU timer again.
*/
timer_disarm(timer);
kvm_timer_update_state(vcpu);
/*
* If the timer expired while we were not scheduled, now is the time
* to inject it.
* If we enter the guest with the virtual input level to the VGIC
* asserted, then we have already told the VGIC what we need to, and
* we don't need to exit from the guest until the guest deactivates
* the already injected interrupt, so therefore we should set the
* hardware active state to prevent unnecessary exits from the guest.
*
* Conversely, if the virtual input level is deasserted, then always
* clear the hardware active state to ensure that hardware interrupts
* from the timer triggers a guest exit.
*/
if (kvm_timer_should_fire(vcpu))
kvm_timer_inject_irq(vcpu);
/*
* We keep track of whether the edge-triggered interrupt has been
* signalled to the vgic/guest, and if so, we mask the interrupt and
* the physical distributor to prevent the timer from raising a
* physical interrupt whenever we run a guest, preventing forward
* VCPU progress.
*/
if (kvm_vgic_get_phys_irq_active(timer->map))
if (timer->irq.level)
phys_active = true;
else
phys_active = false;
@@ -176,32 +246,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
* kvm_timer_sync_hwstate - sync timer state from cpu
* @vcpu: The vcpu pointer
*
* Check if the virtual timer was armed and either schedule a corresponding
* soft timer or inject directly if already expired.
* Check if the virtual timer has expired while we were running in the guest,
* and inject an interrupt if that was the case.
*/
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
cycle_t cval, now;
u64 ns;
BUG_ON(timer_is_armed(timer));
if (kvm_timer_should_fire(vcpu)) {
/*
* Timer has already expired while we were not
* looking. Inject the interrupt and carry on.
*/
kvm_timer_inject_irq(vcpu);
return;
}
cval = timer->cntv_cval;
now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
&timecounter->frac);
timer_arm(timer, ns);
/*
* The guest could have modified the timer registers or the timer
* could have expired, update the timer state.
*/
kvm_timer_update_state(vcpu);
}
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
@@ -216,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
* kvm_vcpu_set_target(). To handle this, we determine
* vcpu timer irq number when the vcpu is reset.
*/
timer->irq = irq;
timer->irq.irq = irq->irq;
/*
* The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -225,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
* the ARMv7 architecture.
*/
timer->cntv_ctl = 0;
kvm_timer_update_state(vcpu);
/*
* Tell the VGIC that the virtual interrupt is tied to a
@@ -269,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
default:
return -1;
}
kvm_timer_update_state(vcpu);
return 0;
}
+63
View File
@@ -0,0 +1,63 @@
#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_H
#include <linux/tracepoint.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
/*
* Tracepoints for vgic
*/
/*
 * vgic_update_irq_pending: stores the vcpu id, interrupt number and level
 * passed by the caller and prints them as
 * "VCPU: <vcpu_id>, IRQ <irq>, level: <level>".
 */
TRACE_EVENT(vgic_update_irq_pending,
TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
TP_ARGS(vcpu_id, irq, level),
TP_STRUCT__entry(
__field( unsigned long, vcpu_id )
__field( __u32, irq )
__field( bool, level )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->irq = irq;
__entry->level = level;
),
TP_printk("VCPU: %ld, IRQ %d, level: %d",
__entry->vcpu_id, __entry->irq, __entry->level)
);
/*
* Tracepoints for arch_timer
*/
/*
 * kvm_timer_update_irq: stores the vcpu id, interrupt number and level
 * passed by the caller and prints them as
 * "VCPU: <vcpu_id>, IRQ <irq>, level <level>".
 * Unlike vgic_update_irq_pending, the level is carried as an int.
 */
TRACE_EVENT(kvm_timer_update_irq,
TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
TP_ARGS(vcpu_id, irq, level),
TP_STRUCT__entry(
__field( unsigned long, vcpu_id )
__field( __u32, irq )
__field( int, level )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->irq = irq;
__entry->level = level;
),
TP_printk("VCPU: %ld, IRQ %d, level %d",
__entry->vcpu_id, __entry->irq, __entry->level)
);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
/* This part must be outside protection */
#include <trace/define_trace.h>
+1 -5
View File
@@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
}
static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr lr_desc)
{
if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
else
@@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
/* Get the show on the road... */
vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
static const struct vgic_ops vgic_v2_ops = {
.get_lr = vgic_v2_get_lr,
.set_lr = vgic_v2_set_lr,
.sync_lr_elrsr = vgic_v2_sync_lr_elrsr,
.get_elrsr = vgic_v2_get_elrsr,
.get_eisr = vgic_v2_get_eisr,
.clear_eisr = vgic_v2_clear_eisr,
+1 -5
View File
@@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
}
vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
}
static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
struct vgic_lr lr_desc)
{
if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
else
@@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vgic_v3->vgic_vmcr = 0;
vgic_v3->vgic_elrsr = ~0;
/*
* If we are emulating a GICv3, we do it in an non-GICv2-compatible
@@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
static const struct vgic_ops vgic_v3_ops = {
.get_lr = vgic_v3_get_lr,
.set_lr = vgic_v3_set_lr,
.sync_lr_elrsr = vgic_v3_sync_lr_elrsr,
.get_elrsr = vgic_v3_get_elrsr,
.get_eisr = vgic_v3_get_eisr,
.clear_eisr = vgic_v3_clear_eisr,
+119 -189
View File
File diff suppressed because it is too large Load Diff
+4
View File
@@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work)
trace_kvm_async_pf_completed(addr, gva);
/*
* This memory barrier pairs with prepare_to_wait's set_current_state()
*/
smp_mb();
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
+97 -93
View File
@@ -23,6 +23,7 @@
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
@@ -34,73 +35,20 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#include <kvm/iodev.h>
#ifdef CONFIG_HAVE_KVM_IRQFD
/*
* --------------------------------------------------------------------
* irqfd: Allows an fd to be used to inject an interrupt to the guest
*
* Credit goes to Avi Kivity for the original idea.
* --------------------------------------------------------------------
*/
/*
* Resampling irqfds are a special variety of irqfds used to emulate
* level triggered interrupts. The interrupt is asserted on eventfd
* trigger. On acknowledgement through the irq ack notifier, the
* interrupt is de-asserted and userspace is notified through the
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
*/
struct _irqfd_resampler {
struct kvm *kvm;
/*
* List of resampling struct _irqfd objects sharing this gsi.
* RCU list modified under kvm->irqfds.resampler_lock
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
* Accessed and modified under kvm->irqfds.resampler_lock
*/
struct list_head link;
};
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
wait_queue_t wait;
/* Update side is protected by irqfds.lock */
struct kvm_kernel_irq_routing_entry irq_entry;
seqcount_t irq_entry_sc;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
/* The resampler used by this irqfd (resampler-only) */
struct _irqfd_resampler *resampler;
/* Eventfd notified on resample (resampler-only) */
struct eventfd_ctx *resamplefd;
/* Entry in list of irqfds for a resampler (resampler-only) */
struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
};
static struct workqueue_struct *irqfd_cleanup_wq;
static void
irqfd_inject(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, inject);
struct kvm *kvm = irqfd->kvm;
if (!irqfd->resampler) {
@@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work)
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
struct _irqfd_resampler *resampler;
struct kvm_kernel_irqfd_resampler *resampler;
struct kvm *kvm;
struct _irqfd *irqfd;
struct kvm_kernel_irqfd *irqfd;
int idx;
resampler = container_of(kian, struct _irqfd_resampler, notifier);
resampler = container_of(kian,
struct kvm_kernel_irqfd_resampler, notifier);
kvm = resampler->kvm;
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
}
static void
irqfd_resampler_shutdown(struct _irqfd *irqfd)
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
struct _irqfd_resampler *resampler = irqfd->resampler;
struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
struct kvm *kvm = resampler->kvm;
mutex_lock(&kvm->irqfds.resampler_lock);
@@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
static void
irqfd_shutdown(struct work_struct *work)
{
struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
struct kvm_kernel_irqfd *irqfd =
container_of(work, struct kvm_kernel_irqfd, shutdown);
u64 cnt;
/*
@@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
/*
* It is now safe to release the object's resources
*/
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
eventfd_ctx_put(irqfd->eventfd);
kfree(irqfd);
}
@@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work)
/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
return list_empty(&irqfd->list) ? false : true;
}
@@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd)
* assumes kvm->irqfds.lock is held
*/
static void
irqfd_deactivate(struct _irqfd *irqfd)
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
BUG_ON(!irqfd_is_active(irqfd));
@@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd)
queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}
/*
 * Weak default for architectures that provide no atomic-context injection:
 * every argument is ignored and -EWOULDBLOCK is returned. irqfd_wakeup()
 * reacts to that value by scheduling the irqfd's inject work instead.
 */
int __attribute__((weak))
kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq,
			  struct kvm *kvm, int irq_source_id, int level,
			  bool line_status)
{
	return -EWOULDBLOCK;
}
/*
* Called with wqh->lock held and interrupts disabled
*/
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
struct kvm_kernel_irqfd *irqfd =
container_of(wait, struct kvm_kernel_irqfd, wait);
unsigned long flags = (unsigned long)key;
struct kvm_kernel_irq_routing_entry irq;
struct kvm *kvm = irqfd->kvm;
@@ -238,10 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
irq = irqfd->irq_entry;
} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
/* An event has been signaled, inject an interrupt */
if (irq.type == KVM_IRQ_ROUTING_MSI)
kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false);
else
if (kvm_arch_set_irq_inatomic(&irq, kvm,
KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false) == -EWOULDBLOCK)
schedule_work(&irqfd->inject);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
@@ -274,37 +236,54 @@ static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
struct kvm_kernel_irqfd *irqfd =
container_of(pt, struct kvm_kernel_irqfd, pt);
add_wait_queue(wqh, &irqfd->wait);
}
/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
int i, n_entries;
int n_entries;
n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
write_seqcount_begin(&irqfd->irq_entry_sc);
irqfd->irq_entry.type = 0;
e = entries;
for (i = 0; i < n_entries; ++i, ++e) {
/* Only fast-path MSI. */
if (e->type == KVM_IRQ_ROUTING_MSI)
irqfd->irq_entry = *e;
}
if (n_entries == 1)
irqfd->irq_entry = *e;
else
irqfd->irq_entry.type = 0;
write_seqcount_end(&irqfd->irq_entry_sc);
}
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
/* Weak default "stop" hook for the irqfd bypass consumer: does nothing. */
void __attribute__((weak))
kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
{
}
/* Weak default "start" hook for the irqfd bypass consumer: does nothing. */
void __attribute__((weak))
kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
{
}
/*
 * Weak default used when the architecture has nothing to update after an
 * irqfd routing change: ignores its arguments and reports success (0).
 */
int __attribute__((weak))
kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
			      uint32_t guest_irq, bool set)
{
	return 0;
}
#endif
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct _irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd, *tmp;
struct fd f;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
@@ -340,7 +319,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
irqfd->eventfd = eventfd;
if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
struct _irqfd_resampler *resampler;
struct kvm_kernel_irqfd_resampler *resampler;
resamplefd = eventfd_ctx_fdget(args->resamplefd);
if (IS_ERR(resamplefd)) {
@@ -428,6 +407,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
* we might race against the POLLHUP
*/
fdput(f);
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
ret = irq_bypass_register_consumer(&irqfd->consumer);
if (ret)
pr_info("irq bypass consumer (token %p) registration fails: %d\n",
irqfd->consumer.token, ret);
#endif
return 0;
@@ -469,9 +459,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
struct kvm_irq_ack_notifier *kian;
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
}
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
int gsi, idx;
trace_kvm_ack_irq(irqchip, pin);
@@ -479,10 +478,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1)
hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
link)
if (kian->gsi == gsi)
kian->irq_acked(kian);
kvm_notify_acked_gsi(kvm, gsi);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
@@ -525,7 +521,7 @@ kvm_eventfd_init(struct kvm *kvm)
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct _irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd, *tmp;
struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd);
@@ -581,7 +577,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
void
kvm_irqfd_release(struct kvm *kvm)
{
struct _irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd, *tmp;
spin_lock_irq(&kvm->irqfds.lock);
@@ -604,13 +600,23 @@ kvm_irqfd_release(struct kvm *kvm)
*/
void kvm_irq_routing_update(struct kvm *kvm)
{
struct _irqfd *irqfd;
struct kvm_kernel_irqfd *irqfd;
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry(irqfd, &kvm->irqfds.items, list)
list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
irqfd_update(kvm, irqfd);
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
if (irqfd->producer) {
int ret = kvm_arch_update_irqfd_routing(
irqfd->kvm, irqfd->producer->irq,
irqfd->gsi, 1);
WARN_ON(ret);
}
#endif
}
spin_unlock_irq(&kvm->irqfds.lock);
}
@@ -914,9 +920,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
return -EINVAL;
/* ioeventfd with no length can't be combined with DATAMATCH */
if (!args->len &&
args->flags & (KVM_IOEVENTFD_FLAG_PIO |
KVM_IOEVENTFD_FLAG_DATAMATCH))
if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
return -EINVAL;
ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
+5 -13
View File
@@ -31,16 +31,6 @@
#include <trace/events/kvm.h>
#include "irq.h"
struct kvm_irq_routing_table {
int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
u32 nr_rt_entries;
/*
* Array indexed by gsi. Each entry contains list of irq chips
* the gsi is connected to.
*/
struct hlist_head map[0];
};
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi)
{
@@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and MSI.
* Allow only one to one mapping between GSI and non-irqchip routing.
*/
hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
if (ei->type == KVM_IRQ_ROUTING_MSI ||
ue->type == KVM_IRQ_ROUTING_MSI ||
if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r;
@@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
kvm_irq_routing_update(kvm);
mutex_unlock(&kvm->irq_lock);
kvm_arch_irq_routing_update(kvm);
synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;
+9 -2
View File
@@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
vcpu->pre_pcpu = -1;
INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
@@ -2018,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
} while (single_task_running() && ktime_before(cur, stop));
}
kvm_arch_vcpu_blocking(vcpu);
for (;;) {
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
@@ -2031,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
cur = ktime_get();
kvm_arch_vcpu_unblocking(vcpu);
out:
block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
@@ -2718,6 +2724,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_IRQFD:
case KVM_CAP_IRQFD_RESAMPLE:
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM:
return 1;
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
@@ -3341,7 +3348,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
return -ENOSPC;
new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
@@ -3373,7 +3380,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
if (r)
return r;
new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
sizeof(struct kvm_io_range)), GFP_KERNEL);
if (!new_bus)
return -ENOMEM;
+2
View File
@@ -0,0 +1,2 @@
config IRQ_BYPASS_MANAGER
tristate
+1
View File
@@ -0,0 +1 @@
obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o
+257
View File
@@ -0,0 +1,257 @@
/*
* IRQ offload/bypass manager
*
* Copyright (C) 2015 Red Hat, Inc.
* Copyright (c) 2015 Linaro Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Various virtualization hardware acceleration techniques allow bypassing or
* offloading interrupts received from devices around the host kernel. Posted
* Interrupts on Intel VT-d systems can allow interrupts to be received
* directly by a virtual machine. ARM IRQ Forwarding allows forwarded physical
* interrupts to be directly deactivated by the guest. This manager allows
* interrupt producers and consumers to find each other to enable this sort of
* bypass.
*/
#include <linux/irqbypass.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");
static LIST_HEAD(producers);
static LIST_HEAD(consumers);
static DEFINE_MUTEX(lock);
/*
 * Pair @prod with @cons. @lock must be held by the caller.
 *
 * Both sides are quiesced through their optional stop() hooks, linked
 * (producer first, then consumer) and finally restarted through their
 * optional start() hooks, whether or not the linking succeeded. If the
 * consumer refuses the producer, the producer side is unlinked again.
 *
 * Returns 0 on success or the error reported by the failing add_*() hook.
 */
static int __connect(struct irq_bypass_producer *prod,
		     struct irq_bypass_consumer *cons)
{
	int err;

	if (prod->stop)
		prod->stop(prod);
	if (cons->stop)
		cons->stop(cons);

	/* add_consumer is optional; a producer without it always accepts. */
	err = prod->add_consumer ? prod->add_consumer(prod, cons) : 0;
	if (err)
		goto restart;

	err = cons->add_producer(cons, prod);
	if (err && prod->del_consumer)
		prod->del_consumer(prod, cons);

restart:
	if (cons->start)
		cons->start(cons);
	if (prod->start)
		prod->start(prod);

	return err;
}
/*
 * Undo a previous __connect() of @prod and @cons.
 * @lock must be held when calling disconnect.
 *
 * Mirrors __connect(): both sides are stopped (optional hooks), unlinked in
 * the reverse order (consumer first, then producer) and started again.
 */
static void __disconnect(struct irq_bypass_producer *prod,
struct irq_bypass_consumer *cons)
{
/* Quiesce both ends before touching the link. */
if (prod->stop)
prod->stop(prod);
if (cons->stop)
cons->stop(cons);
/*
 * del_producer is called unconditionally: irq_bypass_register_consumer()
 * refuses consumers that do not provide it.
 */
cons->del_producer(cons, prod);
if (prod->del_consumer)
prod->del_consumer(prod, cons);
/* Restart in the opposite order of the stop sequence. */
if (cons->start)
cons->start(cons);
if (prod->start)
prod->start(prod);
}
/**
* irq_bypass_register_producer - register IRQ bypass producer
* @producer: pointer to producer structure
*
* Add the provided IRQ producer to the list of producers and connect
* with any matching token found on the IRQ consumers list.
*/
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
{
struct irq_bypass_producer *tmp;
struct irq_bypass_consumer *consumer;
might_sleep();
if (!try_module_get(THIS_MODULE))
return -ENODEV;
mutex_lock(&lock);
list_for_each_entry(tmp, &producers, node) {
if (tmp->token == producer->token) {
mutex_unlock(&lock);
module_put(THIS_MODULE);
return -EBUSY;
}
}
list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
int ret = __connect(producer, consumer);
if (ret) {
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
break;
}
}
list_add(&producer->node, &producers);
mutex_unlock(&lock);
return 0;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
/**
 * irq_bypass_unregister_producer - unregister IRQ bypass producer
 * @producer: pointer to producer structure
 *
 * Remove a previously registered IRQ producer from the list of producers
 * and disconnect it from any connected IRQ consumer.
 *
 * The list entry is looked up by object identity, not by token: calling
 * this for a producer that was never (successfully) registered is a no-op,
 * even when another registered producer uses the same token.
 */
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
	struct irq_bypass_producer *tmp;
	struct irq_bypass_consumer *consumer;

	might_sleep();

	/* Pin the module while we work on its lists. */
	if (!try_module_get(THIS_MODULE))
		return; /* nothing in the list anyway */

	mutex_lock(&lock);

	list_for_each_entry(tmp, &producers, node) {
		/*
		 * Only act on @producer itself. Matching on the token alone
		 * would let an unregistered object tear down somebody else's
		 * connection and list_del() a node that is not on the list.
		 */
		if (tmp != producer)
			continue;

		list_for_each_entry(consumer, &consumers, node) {
			if (consumer->token == producer->token) {
				__disconnect(producer, consumer);
				break;
			}
		}

		list_del(&producer->node);
		/* Drop the reference taken when the producer registered. */
		module_put(THIS_MODULE);
		break;
	}

	mutex_unlock(&lock);

	/* Drop the temporary reference taken above. */
	module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
/**
 * irq_bypass_register_consumer - register IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Add the provided IRQ consumer to the list of consumers and connect
 * with any matching token found on the IRQ producer list.
 *
 * A reference on this module is held for as long as the consumer stays
 * registered; it is dropped by irq_bypass_unregister_consumer().
 *
 * Return: 0 on success, -EINVAL if @consumer lacks a token or one of the
 * mandatory add_producer/del_producer callbacks, -ENODEV if the module
 * reference cannot be taken, -EBUSY if @consumer or another consumer with
 * the same token is already registered, or the error returned while
 * connecting to the matching producer.
 */
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *tmp;
	struct irq_bypass_producer *producer;
	int ret;

	/*
	 * The connect/disconnect paths call both callbacks unconditionally,
	 * and a NULL token cannot identify a producer/consumer pair.
	 */
	if (!consumer->token ||
	    !consumer->add_producer || !consumer->del_producer)
		return -EINVAL;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&lock);

	/* Tokens are unique per list, and an object is only linked once. */
	list_for_each_entry(tmp, &consumers, node) {
		if (tmp->token == consumer->token || tmp == consumer) {
			ret = -EBUSY;
			goto out_err;
		}
	}

	/* At most one producer can match, so stop at the first hit. */
	list_for_each_entry(producer, &producers, node) {
		if (producer->token == consumer->token) {
			ret = __connect(producer, consumer);
			if (ret)
				goto out_err;
			break;
		}
	}

	list_add(&consumer->node, &consumers);

	mutex_unlock(&lock);

	return 0;

out_err:
	/* Single failure exit: release the lock and the module reference. */
	mutex_unlock(&lock);
	module_put(THIS_MODULE);
	return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
/**
 * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Remove a previously registered IRQ consumer from the list of consumers
 * and disconnect it from any connected IRQ producer.
 *
 * The list entry is looked up by object identity, not by token: calling
 * this for a consumer whose registration failed (for instance with -EBUSY
 * because its token was already in use) is a no-op and leaves the
 * registered owner of that token untouched.
 */
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *tmp;
	struct irq_bypass_producer *producer;

	might_sleep();

	/* Pin the module while we work on its lists. */
	if (!try_module_get(THIS_MODULE))
		return; /* nothing in the list anyway */

	mutex_lock(&lock);

	list_for_each_entry(tmp, &consumers, node) {
		/*
		 * Only act on @consumer itself. Matching on the token alone
		 * would let an unregistered object tear down somebody else's
		 * connection and list_del() a node that is not on the list.
		 */
		if (tmp != consumer)
			continue;

		list_for_each_entry(producer, &producers, node) {
			if (producer->token == consumer->token) {
				__disconnect(producer, consumer);
				break;
			}
		}

		list_del(&consumer->node);
		/* Drop the reference taken when the consumer registered. */
		module_put(THIS_MODULE);
		break;
	}

	mutex_unlock(&lock);

	/* Drop the temporary reference taken above. */
	module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);