Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini: "First batch of KVM changes for 4.4. s390: A bunch of fixes and optimizations for interrupt and time handling. PPC: Mostly bug fixes. ARM: No big features, but many small fixes and prerequisites including: - a number of fixes for the arch-timer - introducing proper level-triggered semantics for the arch-timers - a series of patches to synchronously halt a guest (prerequisite for IRQ forwarding) - some tracepoint improvements - a tweak for the EL2 panic handlers - some more VGIC cleanups getting rid of redundant state x86: Quite a few changes: - support for VT-d posted interrupts (i.e. PCI devices can inject interrupts directly into vCPUs). This introduces a new component (in virt/lib/) that connects VFIO and KVM together. The same infrastructure will be used for ARM interrupt forwarding as well. - more Hyper-V features, though the main one Hyper-V synthetic interrupt controller will have to wait for 4.5. These will let KVM expose Hyper-V devices. - nested virtualization now supports VPID (same as PCID but for vCPUs) which makes it quite a bit faster - for future hardware that supports NVDIMM, there is support for clflushopt, clwb, pcommit - support for "split irqchip", i.e. LAPIC in kernel + IOAPIC/PIC/PIT in userspace, which reduces the attack surface of the hypervisor - obligatory smattering of SMM fixes - on the guest side, stable scheduler clock support was rewritten to not require help from the hypervisor" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits) KVM: VMX: Fix commit which broke PML KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0() KVM: x86: allow RSM from 64-bit mode KVM: VMX: fix SMEP and SMAP without EPT KVM: x86: move kvm_set_irq_inatomic to legacy device assignment KVM: device assignment: remove pointless #ifdefs KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic KVM: x86: zero apic_arb_prio on reset drivers/hv: share Hyper-V SynIC constants with userspace KVM: x86: handle SMBASE as physical address in RSM KVM: x86: add read_phys to x86_emulate_ops KVM: x86: removing unused variable KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr() KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings KVM: arm/arm64: Optimize away redundant LR tracking KVM: s390: use simple switch statement as multiplexer KVM: s390: drop useless newline in debugging data KVM: s390: SCA must not cross page boundaries KVM: arm: Do not indent the arguments of DECLARE_BITMAP ...
2026-05-01 15:00:59 -07:00 · 2015-11-05 16:26:26 -08:00
parent a3e7531535 a3eaa8649e
commit 933425fb00
89 changed files with 2956 additions and 1029 deletions
@@ -0,0 +1 @@
+obj-y	+= lib/
@@ -46,4 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT

 config KVM_COMPAT
       def_bool y
-       depends on COMPAT && !S390
+       depends on KVM && COMPAT && !S390
+
+config HAVE_KVM_IRQ_BYPASS
+       bool
@@ -28,6 +28,8 @@
 #include <kvm/arm_vgic.h>
 #include <kvm/arm_arch_timer.h>

+#include "trace.h"
+
 static struct timecounter *timecounter;
 static struct workqueue_struct *wqueue;
 static unsigned int host_vtimer_irq;
@@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer)
 	}
 }

-static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
-{
-	int ret;
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	kvm_vgic_set_phys_irq_active(timer->map, true);
-	ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
-					 timer->map,
-					 timer->irq->level);
-	WARN_ON(ret);
-}
-
 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 {
 	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
 	return HRTIMER_NORESTART;
 }

+static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+		(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
+}
+
 bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	cycle_t cval, now;

-	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-	    !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
-	    kvm_vgic_get_phys_irq_active(timer->map))
+	if (!kvm_timer_irq_can_fire(vcpu))
 		return false;

 	cval = timer->cntv_cval;
@@ -127,12 +123,94 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
 	return cval <= now;
 }

+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
+{
+	int ret;
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	BUG_ON(!vgic_initialized(vcpu->kvm));
+
+	timer->irq.level = new_level;
+	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
+				   timer->irq.level);
+	ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
+					 timer->map,
+					 timer->irq.level);
+	WARN_ON(ret);
+}
+
+/*
+ * Check if there was a change in the timer state (should we raise or lower
+ * the line level to the GIC).
+ */
+static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	/*
+	 * If userspace modified the timer registers via SET_ONE_REG before
+	 * the vgic was initialized, we mustn't set the timer->irq.level value
+	 * because the guest would never see the interrupt.  Instead wait
+	 * until we call this function from kvm_timer_flush_hwstate.
+	 */
+	if (!vgic_initialized(vcpu->kvm))
+	    return;
+
+	if (kvm_timer_should_fire(vcpu) != timer->irq.level)
+		kvm_timer_update_irq(vcpu, !timer->irq.level);
+}
+
+/*
+ * Schedule the background timer before calling kvm_vcpu_block, so that this
+ * thread is removed from its waitqueue and made runnable when there's a timer
+ * interrupt to handle.
+ */
+void kvm_timer_schedule(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	u64 ns;
+	cycle_t cval, now;
+
+	BUG_ON(timer_is_armed(timer));
+
+	/*
+	 * No need to schedule a background timer if the guest timer has
+	 * already expired, because kvm_vcpu_block will return before putting
+	 * the thread to sleep.
+	 */
+	if (kvm_timer_should_fire(vcpu))
+		return;
+
+	/*
+	 * If the timer is not capable of raising interrupts (disabled or
+	 * masked), then there's no more work for us to do.
+	 */
+	if (!kvm_timer_irq_can_fire(vcpu))
+		return;
+
+	/*  The timer has not yet expired, schedule a background timer */
+	cval = timer->cntv_cval;
+	now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
+
+	ns = cyclecounter_cyc2ns(timecounter->cc,
+				 cval - now,
+				 timecounter->mask,
+				 &timecounter->frac);
+	timer_arm(timer, ns);
+}
+
+void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	timer_disarm(timer);
+}
+
 /**
 * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
 * @vcpu: The vcpu pointer
 *
- * Disarm any pending soft timers, since the world-switch code will write the
- * virtual timer state back to the physical CPU.
+ * Check if the virtual timer has expired while we were running in the host,
+ * and inject an interrupt if that was the case.
 */
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -140,28 +218,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 	bool phys_active;
 	int ret;

-	/*
-	 * We're about to run this vcpu again, so there is no need to
-	 * keep the background timer running, as we're about to
-	 * populate the CPU timer again.
-	 */
-	timer_disarm(timer);
+	kvm_timer_update_state(vcpu);

 	/*
-	 * If the timer expired while we were not scheduled, now is the time
-	 * to inject it.
+	 * If we enter the guest with the virtual input level to the VGIC
+	 * asserted, then we have already told the VGIC what we need to, and
+	 * we don't need to exit from the guest until the guest deactivates
+	 * the already injected interrupt, so therefore we should set the
+	 * hardware active state to prevent unnecessary exits from the guest.
+	 *
+	 * Conversely, if the virtual input level is deasserted, then always
+	 * clear the hardware active state to ensure that hardware interrupts
+	 * from the timer triggers a guest exit.
 	 */
-	if (kvm_timer_should_fire(vcpu))
-		kvm_timer_inject_irq(vcpu);
-
-	/*
-	 * We keep track of whether the edge-triggered interrupt has been
-	 * signalled to the vgic/guest, and if so, we mask the interrupt and
-	 * the physical distributor to prevent the timer from raising a
-	 * physical interrupt whenever we run a guest, preventing forward
-	 * VCPU progress.
-	 */
-	if (kvm_vgic_get_phys_irq_active(timer->map))
+	if (timer->irq.level)
 		phys_active = true;
 	else
 		phys_active = false;
@@ -176,32 +246,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 * kvm_timer_sync_hwstate - sync timer state from cpu
 * @vcpu: The vcpu pointer
 *
- * Check if the virtual timer was armed and either schedule a corresponding
- * soft timer or inject directly if already expired.
+ * Check if the virtual timer has expired while we were running in the guest,
+ * and inject an interrupt if that was the case.
 */
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	cycle_t cval, now;
-	u64 ns;

 	BUG_ON(timer_is_armed(timer));

-	if (kvm_timer_should_fire(vcpu)) {
-		/*
-		 * Timer has already expired while we were not
-		 * looking. Inject the interrupt and carry on.
-		 */
-		kvm_timer_inject_irq(vcpu);
-		return;
-	}
-
-	cval = timer->cntv_cval;
-	now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
-
-	ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
-				 &timecounter->frac);
-	timer_arm(timer, ns);
+	/*
+	 * The guest could have modified the timer registers or the timer
+	 * could have expired, update the timer state.
+	 */
+	kvm_timer_update_state(vcpu);
 }

 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
@@ -216,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 	 * kvm_vcpu_set_target(). To handle this, we determine
 	 * vcpu timer irq number when the vcpu is reset.
 	 */
-	timer->irq = irq;
+	timer->irq.irq = irq->irq;

 	/*
 	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -225,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 	 * the ARMv7 architecture.
 	 */
 	timer->cntv_ctl = 0;
+	kvm_timer_update_state(vcpu);

 	/*
 	 * Tell the VGIC that the virtual interrupt is tied to a
@@ -269,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 	default:
 		return -1;
 	}
+
+	kvm_timer_update_state(vcpu);
 	return 0;
 }

@@ -0,0 +1,63 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoints for vgic
+ */
+TRACE_EVENT(vgic_update_irq_pending,
+	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
+	TP_ARGS(vcpu_id, irq, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_id	)
+		__field(	__u32,		irq	)
+		__field(	bool,		level	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->irq		= irq;
+		__entry->level		= level;
+	),
+
+	TP_printk("VCPU: %ld, IRQ %d, level: %d",
+		  __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+/*
+ * Tracepoints for arch_timer
+ */
+TRACE_EVENT(kvm_timer_update_irq,
+	TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
+	TP_ARGS(vcpu_id, irq, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_id	)
+		__field(	__u32,		irq	)
+		__field(	int,		level	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->irq		= irq;
+		__entry->level		= level;
+	),
+
+	TP_printk("VCPU: %ld, IRQ %d, level %d",
+		  __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
@@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 		lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);

 	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-}

-static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
-				  struct vgic_lr lr_desc)
-{
 	if (!(lr_desc.state & LR_STATE_MASK))
 		vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
 	else
@@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
 	 * anyway.
 	 */
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;

 	/* Get the show on the road... */
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
 static const struct vgic_ops vgic_v2_ops = {
 	.get_lr			= vgic_v2_get_lr,
 	.set_lr			= vgic_v2_set_lr,
-	.sync_lr_elrsr		= vgic_v2_sync_lr_elrsr,
 	.get_elrsr		= vgic_v2_get_elrsr,
 	.get_eisr		= vgic_v2_get_eisr,
 	.clear_eisr		= vgic_v2_clear_eisr,
@@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
 	}

 	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
-}

-static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
-				  struct vgic_lr lr_desc)
-{
 	if (!(lr_desc.state & LR_STATE_MASK))
 		vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
 	else
@@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
 	 * anyway.
 	 */
 	vgic_v3->vgic_vmcr = 0;
+	vgic_v3->vgic_elrsr = ~0;

 	/*
 	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
@@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
 static const struct vgic_ops vgic_v3_ops = {
 	.get_lr			= vgic_v3_get_lr,
 	.set_lr			= vgic_v3_set_lr,
-	.sync_lr_elrsr		= vgic_v3_sync_lr_elrsr,
 	.get_elrsr		= vgic_v3_get_elrsr,
 	.get_eisr		= vgic_v3_get_eisr,
 	.clear_eisr		= vgic_v3_clear_eisr,
@@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work)

 	trace_kvm_async_pf_completed(addr, gva);

+	/*
+	 * This memory barrier pairs with prepare_to_wait's set_current_state()
+	 */
+	smp_mb();
 	if (waitqueue_active(&vcpu->wq))
 		wake_up_interruptible(&vcpu->wq);

@@ -23,6 +23,7 @@

 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/kvm_irqfd.h>
 #include <linux/workqueue.h>
 #include <linux/syscalls.h>
 #include <linux/wait.h>
@@ -34,73 +35,20 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/seqlock.h>
+#include <linux/irqbypass.h>
 #include <trace/events/kvm.h>

 #include <kvm/iodev.h>

 #ifdef CONFIG_HAVE_KVM_IRQFD
-/*
- * --------------------------------------------------------------------
- * irqfd: Allows an fd to be used to inject an interrupt to the guest
- *
- * Credit goes to Avi Kivity for the original idea.
- * --------------------------------------------------------------------
- */
-
-/*
- * Resampling irqfds are a special variety of irqfds used to emulate
- * level triggered interrupts.  The interrupt is asserted on eventfd
- * trigger.  On acknowledgement through the irq ack notifier, the
- * interrupt is de-asserted and userspace is notified through the
- * resamplefd.  All resamplers on the same gsi are de-asserted
- * together, so we don't need to track the state of each individual
- * user.  We can also therefore share the same irq source ID.
- */
-struct _irqfd_resampler {
-	struct kvm *kvm;
-	/*
-	 * List of resampling struct _irqfd objects sharing this gsi.
-	 * RCU list modified under kvm->irqfds.resampler_lock
-	 */
-	struct list_head list;
-	struct kvm_irq_ack_notifier notifier;
-	/*
-	 * Entry in list of kvm->irqfd.resampler_list.  Use for sharing
-	 * resamplers among irqfds on the same gsi.
-	 * Accessed and modified under kvm->irqfds.resampler_lock
-	 */
-	struct list_head link;
-};
-
-struct _irqfd {
-	/* Used for MSI fast-path */
-	struct kvm *kvm;
-	wait_queue_t wait;
-	/* Update side is protected by irqfds.lock */
-	struct kvm_kernel_irq_routing_entry irq_entry;
-	seqcount_t irq_entry_sc;
-	/* Used for level IRQ fast-path */
-	int gsi;
-	struct work_struct inject;
-	/* The resampler used by this irqfd (resampler-only) */
-	struct _irqfd_resampler *resampler;
-	/* Eventfd notified on resample (resampler-only) */
-	struct eventfd_ctx *resamplefd;
-	/* Entry in list of irqfds for a resampler (resampler-only) */
-	struct list_head resampler_link;
-	/* Used for setup/shutdown */
-	struct eventfd_ctx *eventfd;
-	struct list_head list;
-	poll_table pt;
-	struct work_struct shutdown;
-};

 static struct workqueue_struct *irqfd_cleanup_wq;

 static void
 irqfd_inject(struct work_struct *work)
 {
-	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(work, struct kvm_kernel_irqfd, inject);
 	struct kvm *kvm = irqfd->kvm;

 	if (!irqfd->resampler) {
@@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work)
 static void
 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 {
-	struct _irqfd_resampler *resampler;
+	struct kvm_kernel_irqfd_resampler *resampler;
 	struct kvm *kvm;
-	struct _irqfd *irqfd;
+	struct kvm_kernel_irqfd *irqfd;
 	int idx;

-	resampler = container_of(kian, struct _irqfd_resampler, notifier);
+	resampler = container_of(kian,
+			struct kvm_kernel_irqfd_resampler, notifier);
 	kvm = resampler->kvm;

 	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 }

 static void
-irqfd_resampler_shutdown(struct _irqfd *irqfd)
+irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
 {
-	struct _irqfd_resampler *resampler = irqfd->resampler;
+	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
 	struct kvm *kvm = resampler->kvm;

 	mutex_lock(&kvm->irqfds.resampler_lock);
@@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
 static void
 irqfd_shutdown(struct work_struct *work)
 {
-	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(work, struct kvm_kernel_irqfd, shutdown);
 	u64 cnt;

 	/*
@@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
 	/*
 	 * It is now safe to release the object's resources
 	 */
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+	irq_bypass_unregister_consumer(&irqfd->consumer);
+#endif
 	eventfd_ctx_put(irqfd->eventfd);
 	kfree(irqfd);
 }
@@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work)

 /* assumes kvm->irqfds.lock is held */
 static bool
-irqfd_is_active(struct _irqfd *irqfd)
+irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
 {
 	return list_empty(&irqfd->list) ? false : true;
 }
@@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd)
 * assumes kvm->irqfds.lock is held
 */
 static void
-irqfd_deactivate(struct _irqfd *irqfd)
+irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
 {
 	BUG_ON(!irqfd_is_active(irqfd));

@@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd)
 	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
 }

+int __attribute__((weak)) kvm_arch_set_irq_inatomic(
+				struct kvm_kernel_irq_routing_entry *irq,
+				struct kvm *kvm, int irq_source_id,
+				int level,
+				bool line_status)
+{
+	return -EWOULDBLOCK;
+}
+
 /*
 * Called with wqh->lock held and interrupts disabled
 */
 static int
 irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
-	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(wait, struct kvm_kernel_irqfd, wait);
 	unsigned long flags = (unsigned long)key;
 	struct kvm_kernel_irq_routing_entry irq;
 	struct kvm *kvm = irqfd->kvm;
@@ -238,10 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 			irq = irqfd->irq_entry;
 		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
 		/* An event has been signaled, inject an interrupt */
-		if (irq.type == KVM_IRQ_ROUTING_MSI)
-			kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
-					false);
-		else
+		if (kvm_arch_set_irq_inatomic(&irq, kvm,
+					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
+					      false) == -EWOULDBLOCK)
 			schedule_work(&irqfd->inject);
 		srcu_read_unlock(&kvm->irq_srcu, idx);
 	}
@@ -274,37 +236,54 @@ static void
 irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 			poll_table *pt)
 {
-	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(pt, struct kvm_kernel_irqfd, pt);
 	add_wait_queue(wqh, &irqfd->wait);
 }

 /* Must be called under irqfds.lock */
-static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
+static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-	int i, n_entries;
+	int n_entries;

 	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

 	write_seqcount_begin(&irqfd->irq_entry_sc);

-	irqfd->irq_entry.type = 0;
-
 	e = entries;
-	for (i = 0; i < n_entries; ++i, ++e) {
-		/* Only fast-path MSI. */
-		if (e->type == KVM_IRQ_ROUTING_MSI)
-			irqfd->irq_entry = *e;
-	}
+	if (n_entries == 1)
+		irqfd->irq_entry = *e;
+	else
+		irqfd->irq_entry.type = 0;

 	write_seqcount_end(&irqfd->irq_entry_sc);
 }

+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+void __attribute__((weak)) kvm_arch_irq_bypass_stop(
+				struct irq_bypass_consumer *cons)
+{
+}
+
+void __attribute__((weak)) kvm_arch_irq_bypass_start(
+				struct irq_bypass_consumer *cons)
+{
+}
+
+int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
+				struct kvm *kvm, unsigned int host_irq,
+				uint32_t guest_irq, bool set)
+{
+	return 0;
+}
+#endif
+
 static int
 kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	struct _irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd, *tmp;
 	struct fd f;
 	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
 	int ret;
@@ -340,7 +319,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	irqfd->eventfd = eventfd;

 	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
-		struct _irqfd_resampler *resampler;
+		struct kvm_kernel_irqfd_resampler *resampler;

 		resamplefd = eventfd_ctx_fdget(args->resamplefd);
 		if (IS_ERR(resamplefd)) {
@@ -428,6 +407,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * we might race against the POLLHUP
 	 */
 	fdput(f);
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+	irqfd->consumer.token = (void *)irqfd->eventfd;
+	irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
+	irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
+	irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
+	irqfd->consumer.start = kvm_arch_irq_bypass_start;
+	ret = irq_bypass_register_consumer(&irqfd->consumer);
+	if (ret)
+		pr_info("irq bypass consumer (token %p) registration fails: %d\n",
+				irqfd->consumer.token, ret);
+#endif

 	return 0;

@@ -469,9 +459,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 }
 EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

-void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
 {
 	struct kvm_irq_ack_notifier *kian;
+
+	hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
+				 link)
+		if (kian->gsi == gsi)
+			kian->irq_acked(kian);
+}
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
 	int gsi, idx;

 	trace_kvm_ack_irq(irqchip, pin);
@@ -479,10 +478,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
 	if (gsi != -1)
-		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-					 link)
-			if (kian->gsi == gsi)
-				kian->irq_acked(kian);
+		kvm_notify_acked_gsi(kvm, gsi);
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }

@@ -525,7 +521,7 @@ kvm_eventfd_init(struct kvm *kvm)
 static int
 kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	struct _irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd, *tmp;
 	struct eventfd_ctx *eventfd;

 	eventfd = eventfd_ctx_fdget(args->fd);
@@ -581,7 +577,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 void
 kvm_irqfd_release(struct kvm *kvm)
 {
-	struct _irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd, *tmp;

 	spin_lock_irq(&kvm->irqfds.lock);

@@ -604,13 +600,23 @@ kvm_irqfd_release(struct kvm *kvm)
 */
 void kvm_irq_routing_update(struct kvm *kvm)
 {
-	struct _irqfd *irqfd;
+	struct kvm_kernel_irqfd *irqfd;

 	spin_lock_irq(&kvm->irqfds.lock);

-	list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
 		irqfd_update(kvm, irqfd);

+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+		if (irqfd->producer) {
+			int ret = kvm_arch_update_irqfd_routing(
+					irqfd->kvm, irqfd->producer->irq,
+					irqfd->gsi, 1);
+			WARN_ON(ret);
+		}
+#endif
+	}
+
 	spin_unlock_irq(&kvm->irqfds.lock);
 }

@@ -914,9 +920,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 		return -EINVAL;

 	/* ioeventfd with no length can't be combined with DATAMATCH */
-	if (!args->len &&
-	    args->flags & (KVM_IOEVENTFD_FLAG_PIO |
-			   KVM_IOEVENTFD_FLAG_DATAMATCH))
+	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
 		return -EINVAL;

 	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
@@ -31,16 +31,6 @@
 #include <trace/events/kvm.h>
 #include "irq.h"

-struct kvm_irq_routing_table {
-	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
-	u32 nr_rt_entries;
-	/*
-	 * Array indexed by gsi. Each entry contains list of irq chips
-	 * the gsi is connected to.
-	 */
-	struct hlist_head map[0];
-};
-
 int kvm_irq_map_gsi(struct kvm *kvm,
 		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
 {
@@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,

 	/*
 	 * Do not allow GSI to be mapped to the same irqchip more than once.
-	 * Allow only one to one mapping between GSI and MSI.
+	 * Allow only one to one mapping between GSI and non-irqchip routing.
 	 */
 	hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
-		if (ei->type == KVM_IRQ_ROUTING_MSI ||
-		    ue->type == KVM_IRQ_ROUTING_MSI ||
+		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
+		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
 		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
 			return r;

@@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	kvm_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);

+	kvm_arch_irq_routing_update(kvm);
+
 	synchronize_srcu_expedited(&kvm->irq_srcu);

 	new = old;
@@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);

+	vcpu->pre_pcpu = -1;
+	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
+
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
 		r = -ENOMEM;
@@ -2018,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		} while (single_task_running() && ktime_before(cur, stop));
 	}

+	kvm_arch_vcpu_blocking(vcpu);
+
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

@@ -2031,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 	cur = ktime_get();

+	kvm_arch_vcpu_unblocking(vcpu);
 out:
 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);

@@ -2718,6 +2724,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IRQFD:
 	case KVM_CAP_IRQFD_RESAMPLE:
 #endif
+	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
@@ -3341,7 +3348,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;

-	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
@@ -3373,7 +3380,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	if (r)
 		return r;

-	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
@@ -0,0 +1,2 @@
+config IRQ_BYPASS_MANAGER
+	tristate
@@ -0,0 +1 @@
+obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o
@@ -0,0 +1,257 @@
+/*
+ * IRQ offload/bypass manager
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Various virtualization hardware acceleration techniques allow bypassing or
+ * offloading interrupts received from devices around the host kernel.  Posted
+ * Interrupts on Intel VT-d systems can allow interrupts to be received
+ * directly by a virtual machine.  ARM IRQ Forwarding allows forwarded physical
+ * interrupts to be directly deactivated by the guest.  This manager allows
+ * interrupt producers and consumers to find each other to enable this sort of
+ * bypass.
+ */
+
+#include <linux/irqbypass.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("IRQ bypass manager utility module");
+
+static LIST_HEAD(producers);
+static LIST_HEAD(consumers);
+static DEFINE_MUTEX(lock);
+
+/* @lock must be held when calling connect */
+static int __connect(struct irq_bypass_producer *prod,
+		     struct irq_bypass_consumer *cons)
+{
+	int ret = 0;
+
+	if (prod->stop)
+		prod->stop(prod);
+	if (cons->stop)
+		cons->stop(cons);
+
+	if (prod->add_consumer)
+		ret = prod->add_consumer(prod, cons);
+
+	if (!ret) {
+		ret = cons->add_producer(cons, prod);
+		if (ret && prod->del_consumer)
+			prod->del_consumer(prod, cons);
+	}
+
+	if (cons->start)
+		cons->start(cons);
+	if (prod->start)
+		prod->start(prod);
+
+	return ret;
+}
+
+/* @lock must be held when calling disconnect */
+static void __disconnect(struct irq_bypass_producer *prod,
+			 struct irq_bypass_consumer *cons)
+{
+	if (prod->stop)
+		prod->stop(prod);
+	if (cons->stop)
+		cons->stop(cons);
+
+	cons->del_producer(cons, prod);
+
+	if (prod->del_consumer)
+		prod->del_consumer(prod, cons);
+
+	if (cons->start)
+		cons->start(cons);
+	if (prod->start)
+		prod->start(prod);
+}
+
+/**
+ * irq_bypass_register_producer - register IRQ bypass producer
+ * @producer: pointer to producer structure
+ *
+ * Add the provided IRQ producer to the list of producers and connect
+ * with any matching token found on the IRQ consumers list.
+ */
+int irq_bypass_register_producer(struct irq_bypass_producer *producer)
+{
+	struct irq_bypass_producer *tmp;
+	struct irq_bypass_consumer *consumer;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &producers, node) {
+		if (tmp->token == producer->token) {
+			mutex_unlock(&lock);
+			module_put(THIS_MODULE);
+			return -EBUSY;
+		}
+	}
+
+	list_for_each_entry(consumer, &consumers, node) {
+		if (consumer->token == producer->token) {
+			int ret = __connect(producer, consumer);
+			if (ret) {
+				mutex_unlock(&lock);
+				module_put(THIS_MODULE);
+				return ret;
+			}
+			break;
+		}
+	}
+
+	list_add(&producer->node, &producers);
+
+	mutex_unlock(&lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
+
+/**
+ * irq_bypass_unregister_producer - unregister IRQ bypass producer
+ * @producer: pointer to producer structure
+ *
+ * Remove a previously registered IRQ producer from the list of producers
+ * and disconnect it from any connected IRQ consumer.
+ */
+void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
+{
+	struct irq_bypass_producer *tmp;
+	struct irq_bypass_consumer *consumer;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return; /* nothing in the list anyway */
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &producers, node) {
+		if (tmp->token != producer->token)
+			continue;
+
+		list_for_each_entry(consumer, &consumers, node) {
+			if (consumer->token == producer->token) {
+				__disconnect(producer, consumer);
+				break;
+			}
+		}
+
+		list_del(&producer->node);
+		module_put(THIS_MODULE);
+		break;
+	}
+
+	mutex_unlock(&lock);
+
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
+
+/**
+ * irq_bypass_register_consumer - register IRQ bypass consumer
+ * @consumer: pointer to consumer structure
+ *
+ * Add the provided IRQ consumer to the list of consumers and connect
+ * with any matching token found on the IRQ producer list.
+ */
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
+{
+	struct irq_bypass_consumer *tmp;
+	struct irq_bypass_producer *producer;
+
+	if (!consumer->add_producer || !consumer->del_producer)
+		return -EINVAL;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &consumers, node) {
+		if (tmp->token == consumer->token) {
+			mutex_unlock(&lock);
+			module_put(THIS_MODULE);
+			return -EBUSY;
+		}
+	}
+
+	list_for_each_entry(producer, &producers, node) {
+		if (producer->token == consumer->token) {
+			int ret = __connect(producer, consumer);
+			if (ret) {
+				mutex_unlock(&lock);
+				module_put(THIS_MODULE);
+				return ret;
+			}
+			break;
+		}
+	}
+
+	list_add(&consumer->node, &consumers);
+
+	mutex_unlock(&lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
+
+/**
+ * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
+ * @consumer: pointer to consumer structure
+ *
+ * Remove a previously registered IRQ consumer from the list of consumers
+ * and disconnect it from any connected IRQ producer.
+ */
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
+{
+	struct irq_bypass_consumer *tmp;
+	struct irq_bypass_producer *producer;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return; /* nothing in the list anyway */
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &consumers, node) {
+		if (tmp->token != consumer->token)
+			continue;
+
+		list_for_each_entry(producer, &producers, node) {
+			if (producer->token == consumer->token) {
+				__disconnect(producer, consumer);
+				break;
+			}
+		}
+
+		list_del(&consumer->node);
+		module_put(THIS_MODULE);
+		break;
+	}
+
+	mutex_unlock(&lock);
+
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
				`@@ -0,0 +1 @@`
				`obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o`