Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini: "First batch of KVM changes for 4.4. s390: A bunch of fixes and optimizations for interrupt and time handling. PPC: Mostly bug fixes. ARM: No big features, but many small fixes and prerequisites including: - a number of fixes for the arch-timer - introducing proper level-triggered semantics for the arch-timers - a series of patches to synchronously halt a guest (prerequisite for IRQ forwarding) - some tracepoint improvements - a tweak for the EL2 panic handlers - some more VGIC cleanups getting rid of redundant state x86: Quite a few changes: - support for VT-d posted interrupts (i.e. PCI devices can inject interrupts directly into vCPUs). This introduces a new component (in virt/lib/) that connects VFIO and KVM together. The same infrastructure will be used for ARM interrupt forwarding as well. - more Hyper-V features, though the main one Hyper-V synthetic interrupt controller will have to wait for 4.5. These will let KVM expose Hyper-V devices. - nested virtualization now supports VPID (same as PCID but for vCPUs) which makes it quite a bit faster - for future hardware that supports NVDIMM, there is support for clflushopt, clwb, pcommit - support for "split irqchip", i.e. 
LAPIC in kernel + IOAPIC/PIC/PIT in userspace, which reduces the attack surface of the hypervisor - obligatory smattering of SMM fixes - on the guest side, stable scheduler clock support was rewritten to not require help from the hypervisor" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits) KVM: VMX: Fix commit which broke PML KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0() KVM: x86: allow RSM from 64-bit mode KVM: VMX: fix SMEP and SMAP without EPT KVM: x86: move kvm_set_irq_inatomic to legacy device assignment KVM: device assignment: remove pointless #ifdefs KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic KVM: x86: zero apic_arb_prio on reset drivers/hv: share Hyper-V SynIC constants with userspace KVM: x86: handle SMBASE as physical address in RSM KVM: x86: add read_phys to x86_emulate_ops KVM: x86: removing unused variable KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr() KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings KVM: arm/arm64: Optimize away redundant LR tracking KVM: s390: use simple switch statement as multiplexer KVM: s390: drop useless newline in debugging data KVM: s390: SCA must not cross page boundaries KVM: arm: Do not indent the arguments of DECLARE_BITMAP ...
2026-05-01 15:00:59 -07:00 · 2015-11-05 16:26:26 -08:00
parent a3e7531535 a3eaa8649e
commit 933425fb00
89 changed files with 2956 additions and 1029 deletions
@@ -1585,6 +1585,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			nosid	disable Source ID checking
 			no_x2apic_optout
 				BIOS x2APIC opt-out request will be ignored
 			nopost	disable Interrupt Posting
 	iomem=		Disable strict checking of access to MMIO memory
 		strict	regions from userspace.
@@ -401,10 +401,9 @@ Capability: basic
 Architectures: x86, ppc, mips
 Type: vcpu ioctl
 Parameters: struct kvm_interrupt (in)
-Returns: 0 on success, -1 on error
+Returns: 0 on success, negative on failure.
-Queues a hardware interrupt vector to be injected.  This is only
+Queues a hardware interrupt vector to be injected.
 useful if in-kernel local APIC or equivalent is not used.
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
@@ -414,7 +413,14 @@ struct kvm_interrupt {
 X86:
-Note 'irq' is an interrupt vector, not an interrupt pin or line.
+Returns: 0 on success,
 	 -EEXIST if an interrupt is already enqueued
 	 -EINVAL if the irq number is invalid
 	 -ENXIO if the PIC is in the kernel
 	 -EFAULT if the pointer is invalid
 Note 'irq' is an interrupt vector, not an interrupt pin or line. This
 ioctl is useful if the in-kernel PIC is not used.
 PPC:
@@ -1598,7 +1604,7 @@ provided event instead of triggering an exit.
 struct kvm_ioeventfd {
 	__u64 datamatch;
 	__u64 addr;        /* legal pio/mmio address */
-	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__u32 len;         /* 0, 1, 2, 4, or 8 bytes    */
 	__s32 fd;
 	__u32 flags;
 	__u8  pad[36];
@@ -1621,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd.
 For virtio-ccw devices, addr contains the subchannel id and datamatch the
 virtqueue index.
 With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and
 the kernel will ignore the length of guest write and may get a faster vmexit.
 The speedup may only apply to specific architectures, but the ioeventfd will
 work anyway.
 4.60 KVM_DIRTY_TLB
@@ -3309,6 +3319,18 @@ Valid values for 'type' are:
   to ignore the request, or to gather VM memory core dump and/or
   reset/shutdown of the VM.
 		/* KVM_EXIT_IOAPIC_EOI */
 		struct {
 			__u8 vector;
 		} eoi;
 Indicates that the VCPU's in-kernel local APIC received an EOI for a
 level-triggered IOAPIC interrupt.  This exit only triggers when the
 IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled);
 the userspace IOAPIC should process the EOI and retrigger the interrupt if
 it is still asserted.  Vector is the LAPIC interrupt vector for which the
 EOI was received.
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -3627,6 +3649,26 @@ struct {
 KVM handlers should exit to userspace with rc = -EREMOTE.
 7.5 KVM_CAP_SPLIT_IRQCHIP
 Architectures: x86
 Parameters: args[0] - number of routes reserved for userspace IOAPICs
 Returns: 0 on success, -1 on error
 Create a local apic for each processor in the kernel. This can be used
 instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the
 IOAPIC and PIC (and also the PIT, even though this has to be enabled
 separately).
 This capability also enables in kernel routing of interrupt requests;
 when KVM_CAP_SPLIT_IRQCHIP is enabled, only routes of KVM_IRQ_ROUTING_MSI type are
 used in the IRQ routing table.  The first args[0] MSI routes are reserved
 for the IOAPIC pins.  Whenever the LAPIC receives an EOI for these routes,
 a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace.
 Fails if VCPU has already been created, or if the irqchip is already in the
 kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
 8. Other capabilities.
 ----------------------
@@ -0,0 +1,187 @@
 KVM/ARM VGIC Forwarded Physical Interrupts
 ==========================================
 The KVM/ARM code implements software support for the ARM Generic
 Interrupt Controller's (GIC's) hardware support for virtualization by
 allowing software to inject virtual interrupts to a VM, which the guest
 OS sees as regular interrupts.  The code is famously known as the VGIC.
 Some of these virtual interrupts, however, correspond to physical
 interrupts from real physical devices.  One example could be the
 architected timer, which itself supports virtualization, and therefore
 lets a guest OS program the hardware device directly to raise an
 interrupt at some point in time.  When such an interrupt is raised, the
 host OS initially handles the interrupt and must somehow signal this
 event as a virtual interrupt to the guest.  Another example could be a
 passthrough device, where the physical interrupts are initially handled
 by the host, but the device driver for the device lives in the guest OS
 and KVM must therefore somehow inject a virtual interrupt on behalf of
 the physical one to the guest OS.
 These virtual interrupts corresponding to a physical interrupt on the
 host are called forwarded physical interrupts, but are also sometimes
 referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
 Forwarded physical interrupts are handled slightly differently compared
 to virtual interrupts generated purely by a software emulated device.
 The HW bit
 ----------
 Virtual interrupts are signalled to the guest by programming the List
 Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
 with the virtual IRQ number and the state of the interrupt (Pending,
 Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
 interrupt, the LR state moves from Pending to Active, and finally to
 inactive.
 The LRs include an extra bit, called the HW bit.  When this bit is set,
 KVM must also program an additional field in the LR, the physical IRQ
 number, to link the virtual with the physical IRQ.
 When the HW bit is set, KVM must EITHER set the Pending OR the Active
 bit, never both at the same time.
 Setting the HW bit causes the hardware to deactivate the physical
 interrupt on the physical distributor when the guest deactivates the
 corresponding virtual interrupt.
 Forwarded Physical Interrupts Life Cycle
 ----------------------------------------
 The state of forwarded physical interrupts is managed in the following way:
  - The physical interrupt is acked by the host, and becomes active on
    the physical distributor (*).
  - KVM sets the LR.Pending bit, because this is the only way the GICV
    interface is going to present it to the guest.
  - LR.Pending will stay set as long as the guest has not acked the interrupt.
  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
    expected.
  - On guest EOI, the *physical distributor* active bit gets cleared,
    but the LR.Active is left untouched (set).
  - KVM clears the LR on VM exits when the physical distributor
    active state has been cleared.
 (*): The host handling is slightly more complicated.  For some forwarded
 interrupts (shared), KVM directly sets the active state on the physical
 distributor before entering the guest, because the interrupt is never actually
 handled on the host (see details on the timer as an example below).  For other
 forwarded interrupts (non-shared) the host does not deactivate the interrupt
 when the host ISR completes, but leaves the interrupt active until the guest
 deactivates it.  Leaving the interrupt active is allowed, because Linux
 configures the physical GIC with EOIMode=1, which causes EOI operations to
 perform a priority drop allowing the GIC to receive other interrupts of the
 default priority.
 Forwarded Edge and Level Triggered PPIs and SPIs
 ------------------------------------------------
 Forwarded physical interrupts injected should always be active on the
 physical distributor when injected to a guest.
 Level-triggered interrupts will keep the interrupt line to the GIC
 asserted, typically until the guest programs the device to deassert the
 line.  This means that the interrupt will remain pending on the physical
 distributor until the guest has reprogrammed the device.  Since we
 always run the VM with interrupts enabled on the CPU, a pending
 interrupt will exit the guest as soon as we switch into the guest,
 preventing the guest from ever making progress as the process repeats
 over and over.  Therefore, the active state on the physical distributor
 must be set when entering the guest, preventing the GIC from forwarding
 the pending interrupt to the CPU.  As soon as the guest deactivates the
 interrupt, the physical line is sampled by the hardware again and the host
 takes a new interrupt if and only if the physical line is still asserted.
 Edge-triggered interrupts do not exhibit the same problem with
 preventing guest execution that level-triggered interrupts do.  One
 option is to not use HW bit at all, and inject edge-triggered interrupts
 from a physical device as pure virtual interrupts.  But that would
 potentially slow down handling of the interrupt in the guest, because a
 physical interrupt occurring in the middle of the guest ISR would
 preempt the guest for the host to handle the interrupt.  Additionally,
 if you configure the system to handle interrupts on a separate physical
 core from that running your VCPU, you still have to interrupt the VCPU
 to queue the pending state onto the LR, even though the guest won't use
 this information until the guest ISR completes.  Therefore, the HW
 bit should always be set for forwarded edge-triggered interrupts.  With
 the HW bit set, the virtual interrupt is injected and additional
 physical interrupts occurring before the guest deactivates the interrupt
 simply mark the state on the physical distributor as Pending+Active.  As
 soon as the guest deactivates the interrupt, the host takes another
 interrupt if and only if there was a physical interrupt between injecting
 the forwarded interrupt to the guest and the guest deactivating the
 interrupt.
 Consequently, whenever we schedule a VCPU with one or more LRs with the
 HW bit set, the interrupt must also be active on the physical
 distributor.
 Forwarded LPIs
 --------------
 LPIs, introduced in GICv3, are always edge-triggered and do not have an
 active state.  They become pending when a device signals them, and as
 soon as they are acked by the CPU, they are inactive again.
 It therefore doesn't make sense, and is not supported, to set the HW bit
 for physical LPIs that are forwarded to a VM as virtual interrupts,
 typically virtual SPIs.
 For LPIs, there is no other choice than to preempt the VCPU thread if
 necessary, and queue the pending state onto the LR.
 Putting It Together: The Architected Timer
 ------------------------------------------
 The architected timer is a device that signals interrupts with level
 triggered semantics.  The timer hardware is directly accessed by VCPUs
 which program the timer to fire at some point in time.  Each VCPU on a
 system programs the timer to fire at different times, and therefore the
 hardware is multiplexed between multiple VCPUs.  This is implemented by
 context-switching the timer state along with each VCPU thread.
 However, this means that a scenario like the following is entirely
 possible, and in fact, typical:
 1.  KVM runs the VCPU
 2.  The guest programs the timer to fire in T+100
 3.  The guest is idle and calls WFI (wait-for-interrupts)
 4.  The hardware traps to the host
 5.  KVM stores the timer state to memory and disables the hardware timer
 6.  KVM schedules a soft timer to fire in T+(100 - time since step 2)
 7.  KVM puts the VCPU thread to sleep (on a waitqueue)
 8.  The soft timer fires, waking up the VCPU thread
 9.  KVM reprograms the timer hardware with the VCPU's values
 10. KVM marks the timer interrupt as active on the physical distributor
 11. KVM injects a forwarded physical interrupt to the guest
 12. KVM runs the VCPU
 Notice that KVM injects a forwarded physical interrupt in step 11 without
 the corresponding interrupt having actually fired on the host.  That is
 exactly why we mark the timer interrupt as active in step 10, because
 the active state on the physical distributor is part of the state
 belonging to the timer hardware, which is context-switched along with
 the VCPU thread.
 If the guest does not idle because it is busy, the flow looks like this
 instead:
 1.  KVM runs the VCPU
 2.  The guest programs the timer to fire in T+100
 3.  At T+100 the timer fires and a physical IRQ causes the VM to exit
    (note that this initially only traps to EL2 and does not run the host ISR
    until KVM has returned to the host).
 4.  With interrupts still disabled on the CPU coming back from the guest, KVM
    stores the virtual timer state to memory and disables the virtual hw timer.
 5.  KVM looks at the timer state (in memory) and injects a forwarded physical
    interrupt because it concludes the timer has expired.
 6.  KVM marks the timer interrupt as active on the physical distributor
 7.  KVM enables the timer, enables interrupts, and runs the VCPU
 Notice that again the forwarded physical interrupt is injected to the
 guest without having actually been handled on the host.  In this case it
 is because the physical interrupt is never actually seen by the host because the
 timer is disabled upon guest return, and the virtual forwarded interrupt is
 injected on the KVM guest entry path.
@@ -44,28 +44,29 @@ Groups:
  Attributes:
    The attr field of kvm_device_attr encodes two values:
    bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |
    All distributor regs are (rw, 32-bit)
    The offset is relative to the "Distributor base address" as defined in the
    GICv2 specs.  Getting or setting such a register has the same effect as
-    reading or writing the register on the actual hardware from the cpu
+    reading or writing the register on the actual hardware from the cpu whose
-    specified with cpu id field.  Note that most distributor fields are not
+    index is specified with the vcpu_index field.  Note that most distributor
-    banked, but return the same value regardless of the cpu id used to access
+    fields are not banked, but return the same value regardless of the
-    the register.
+    vcpu_index used to access the register.
  Limitations:
    - Priorities are not implemented, and registers are RAZ/WI
    - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
  Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
    -EBUSY: One or more VCPUs are running
    -EINVAL: Invalid vcpu_index supplied
  KVM_DEV_ARM_VGIC_GRP_CPU_REGS
  Attributes:
    The attr field of kvm_device_attr encodes two values:
    bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |
    All CPU interface regs are (rw, 32-bit)
@@ -91,8 +92,9 @@ Groups:
    - Priorities are not implemented, and registers are RAZ/WI
    - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
  Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
    -EBUSY: One or more VCPUs are running
    -EINVAL: Invalid vcpu_index supplied
  KVM_DEV_ARM_VGIC_GRP_NR_IRQS
  Attributes:
@@ -166,3 +166,15 @@ Comment:	The srcu read lock must be held while accessing memslots (e.g.
 		MMIO/PIO address->device structure mapping (kvm->buses).
 		The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
 		if it is needed by multiple functions.
 Name:		blocked_vcpu_on_cpu_lock
 Type:		spinlock_t
 Arch:		x86
 Protects:	blocked_vcpu_on_cpu
 Comment:	This is a per-CPU lock used for VT-d posted interrupts.
 		When VT-d posted interrupts are supported and the VM has
 		assigned devices, a blocked vCPU is put on the list
 		blocked_vcpu_on_cpu, protected by blocked_vcpu_on_cpu_lock.
 		When the VT-d hardware issues a wakeup notification event
 		because an external interrupt from an assigned device arrived,
 		the vCPU is looked up on this list and woken up.
@@ -11348,6 +11348,13 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/via/via-velocity.*
 VIRT LIB
 M:	Alex Williamson <alex.williamson@redhat.com>
 M:	Paolo Bonzini <pbonzini@redhat.com>
 L:	kvm@vger.kernel.org
 S:	Supported
 F:	virt/lib/
 VIVID VIRTUAL VIDEO DRIVER
 M:	Hans Verkuil <hverkuil@xs4all.nl>
 L:	linux-media@vger.kernel.org
@@ -550,6 +550,7 @@ drivers-y	:= drivers/ sound/ firmware/
 net-y		:= net/
 libs-y		:= lib/
 core-y		:= usr/
 virt-y		:= virt/
 endif # KBUILD_EXTMOD
 ifeq ($(dot-config),1)
@@ -882,10 +883,10 @@ core-y		+= kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
-		     $(net-y) $(net-m) $(libs-y) $(libs-m)))
+		     $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))
 vmlinux-alldirs	:= $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
-		     $(init-) $(core-) $(drivers-) $(net-) $(libs-))))
+		     $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
 init-y		:= $(patsubst %/, %/built-in.o, $(init-y))
 core-y		:= $(patsubst %/, %/built-in.o, $(core-y))
@@ -894,14 +895,15 @@ net-y		:= $(patsubst %/, %/built-in.o, $(net-y))
 libs-y1		:= $(patsubst %/, %/lib.a, $(libs-y))
 libs-y2		:= $(patsubst %/, %/built-in.o, $(libs-y))
 libs-y		:= $(libs-y1) $(libs-y2)
 virt-y		:= $(patsubst %/, %/built-in.o, $(virt-y))
 # Externally visible symbols (used by link-vmlinux.sh)
 export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
-export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y)
+export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
 export KBUILD_LDS          := arch/$(SRCARCH)/kernel/vmlinux.lds
 export LDFLAGS_vmlinux
 # used by scripts/pacmage/Makefile
-export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt)
+export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
@@ -218,4 +218,24 @@
 #define HSR_DABT_CM		(1U << 8)
 #define HSR_DABT_EA		(1U << 9)
 #define kvm_arm_exception_type	\
 	{0, "RESET" }, 		\
 	{1, "UNDEFINED" },	\
 	{2, "SOFTWARE" },	\
 	{3, "PREF_ABORT" },	\
 	{4, "DATA_ABORT" },	\
 	{5, "IRQ" },		\
 	{6, "FIQ" },		\
 	{7, "HVC" }
 #define HSRECN(x) { HSR_EC_##x, #x }
 #define kvm_arm_exception_class \
 	HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \
 	HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \
 	HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \
 	HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \
 	HSRECN(DABT), HSRECN(DABT_HYP)
 #endif /* __ARM_KVM_ARM_H__ */
@@ -126,7 +126,10 @@ struct kvm_vcpu_arch {
 	 * here.
 	 */
-	/* Don't run the guest on this vcpu */
+	/* vcpu power-off state */
 	bool power_off;
 	 /* Don't run the guest (internal implementation need) */
 	bool pause;
 	/* IO related fields */
@@ -46,4 +46,6 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 source drivers/vhost/Kconfig
 endif # VIRTUALIZATION
@@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return kvm_timer_should_fire(vcpu);
 }
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
 	kvm_timer_schedule(vcpu);
 }
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
 {
 	kvm_timer_unschedule(vcpu);
 }
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	/* Force users to call KVM_ARM_VCPU_INIT */
@@ -308,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	if (vcpu->arch.pause)
+	if (vcpu->arch.power_off)
 		mp_state->mp_state = KVM_MP_STATE_STOPPED;
 	else
 		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
@@ -321,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
 	switch (mp_state->mp_state) {
 	case KVM_MP_STATE_RUNNABLE:
-		vcpu->arch.pause = false;
+		vcpu->arch.power_off = false;
 		break;
 	case KVM_MP_STATE_STOPPED:
-		vcpu->arch.pause = true;
+		vcpu->arch.power_off = true;
 		break;
 	default:
 		return -EINVAL;
@@ -342,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 */
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v);
+	return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
 		&& !v->arch.power_off && !v->arch.pause);
 }
 /* Just ensure a guest exit from a particular CPU */
@@ -468,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
 	return vgic_initialized(kvm);
 }
-static void vcpu_pause(struct kvm_vcpu *vcpu)
+static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
 static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
 static void kvm_arm_halt_guest(struct kvm *kvm)
 {
 	int i;
 	struct kvm_vcpu *vcpu;
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		vcpu->arch.pause = true;
 	force_vm_exit(cpu_all_mask);
 }
 static void kvm_arm_resume_guest(struct kvm *kvm)
 {
 	int i;
 	struct kvm_vcpu *vcpu;
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
 		vcpu->arch.pause = false;
 		wake_up_interruptible(wq);
 	}
 }
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
 {
 	wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
-	wait_event_interruptible(*wq, !vcpu->arch.pause);
+	wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
 				       (!vcpu->arch.pause)));
 }
 static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@@ -522,8 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		update_vttbr(vcpu->kvm);
-		if (vcpu->arch.pause)
+		if (vcpu->arch.power_off || vcpu->arch.pause)
-			vcpu_pause(vcpu);
+			vcpu_sleep(vcpu);
 		/*
 		 * Disarming the background timer must be done in a
@@ -549,11 +587,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 			run->exit_reason = KVM_EXIT_INTR;
 		}
-		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
+		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
 			vcpu->arch.power_off || vcpu->arch.pause) {
 			local_irq_enable();
 			kvm_timer_sync_hwstate(vcpu);
 			kvm_vgic_sync_hwstate(vcpu);
 			preempt_enable();
 			kvm_timer_sync_hwstate(vcpu);
 			continue;
 		}
@@ -596,14 +635,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * guest time.
 		 */
 		kvm_guest_exit();
-		trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 		/*
 		 * We must sync the timer state before the vgic state so that
 		 * the vgic can properly sample the updated state of the
 		 * interrupt line.
 		 */
 		kvm_timer_sync_hwstate(vcpu);
 		kvm_vgic_sync_hwstate(vcpu);
 		preempt_enable();
 		kvm_timer_sync_hwstate(vcpu);
 		ret = handle_exit(vcpu, run, ret);
 	}
@@ -765,12 +809,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	vcpu_reset_hcr(vcpu);
 	/*
-	 * Handle the "start in power-off" case by marking the VCPU as paused.
+	 * Handle the "start in power-off" case.
 	 */
 	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu->arch.pause = true;
+		vcpu->arch.power_off = true;
 	else
-		vcpu->arch.pause = false;
+		vcpu->arch.power_off = false;
 	return 0;
 }
@@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
 static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.pause = true;
+	vcpu->arch.power_off = true;
 }
 static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 	 */
 	if (!vcpu)
 		return PSCI_RET_INVALID_PARAMS;
-	if (!vcpu->arch.pause) {
+	if (!vcpu->arch.power_off) {
 		if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
 			return PSCI_RET_ALREADY_ON;
 		else
@@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 	 * the general puspose registers are undefined upon CPU_ON.
 	 */
 	*vcpu_reg(vcpu, 0) = context_id;
-	vcpu->arch.pause = false;
+	vcpu->arch.power_off = false;
 	smp_mb();		/* Make sure the above is visible */
 	wq = kvm_arch_vcpu_wq(vcpu);
@@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
 		mpidr = kvm_vcpu_get_mpidr_aff(tmp);
 		if ((mpidr & target_affinity_mask) == target_affinity) {
 			matching_cpus++;
-			if (!tmp->arch.pause)
+			if (!tmp->arch.power_off)
 				return PSCI_0_2_AFFINITY_LEVEL_ON;
 		}
 	}
@@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
 	 * re-initialized.
 	 */
 	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
-		tmp->arch.pause = true;
+		tmp->arch.power_off = true;
 		kvm_vcpu_kick(tmp);
 	}
@@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry,
 );
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc),
+	TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
-	TP_ARGS(exit_reason, vcpu_pc),
+	TP_ARGS(idx, exit_reason, vcpu_pc),
 	TP_STRUCT__entry(
 		__field(	int,		idx		)
 		__field(	unsigned int,	exit_reason	)
 		__field(	unsigned long,	vcpu_pc		)
 	),
 	TP_fast_assign(
 		__entry->idx			= idx;
 		__entry->exit_reason		= exit_reason;
 		__entry->vcpu_pc		= vcpu_pc;
 	),
-	TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx",
+	TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
 		  __print_symbolic(__entry->idx, kvm_arm_exception_type),
 		  __entry->exit_reason,
 		  __print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
 		  __entry->vcpu_pc)
 );
@@ -200,4 +200,20 @@
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK	(~UL(0xf))
 #define kvm_arm_exception_type	\
 	{0, "IRQ" }, 		\
 	{1, "TRAP" }
 #define ECN(x) { ESR_ELx_EC_##x, #x }
 #define kvm_arm_exception_class \
 	ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
 	ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \
 	ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \
 	ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
 	ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
 	ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
 	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
 	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
 #endif /* __ARM64_KVM_ARM_H__ */
@@ -149,7 +149,10 @@ struct kvm_vcpu_arch {
 		u32	mdscr_el1;
 	} guest_debug_preserved;
-	/* Don't run the guest */
+	/* vcpu power-off state */
 	bool power_off;
 	/* Don't run the guest (internal implementation need) */
 	bool pause;
 	/* IO related fields */
@@ -48,4 +48,6 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 source drivers/vhost/Kconfig
 endif # VIRTUALIZATION
@@ -880,6 +880,14 @@ __kvm_hyp_panic:
 	bl __restore_sysregs
 	/*
 	 * Make sure we have a valid host stack, and don't leave junk in the
 	 * frame pointer that will give us a misleading host stack unwinding.
 	 */
 	ldr	x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
 	msr	sp_el1, x22
 	mov	x29, xzr
 1:	adr	x0, __hyp_panic_str
 	adr	x1, 2f
 	ldp	x2, x3, [x1]
@@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 #endif /* __MIPS_KVM_HOST_H__ */
@@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst)
 	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
 }
 static inline unsigned int get_tmrn(u32 inst)
 {
 	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
 }
 static inline unsigned int get_rt(u32 inst)
 {
 	return (inst >> 21) & 0x1f;
@@ -716,5 +716,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 #endif /* __POWERPC_KVM_HOST_H__ */
--- a/Show More
+++ b/Show More