From 081976992f43be52c155889509524bcfbc9af132 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:33 -0700
Subject: [PATCH 01/28] KVM: x86/mmu: Flush remote TLBs iff MMU-writable flag is cleared from RO SPTE

Don't force a remote TLB flush if KVM happens to effectively "refresh" a
read-only SPTE that is still MMU-Writable, as KVM allows MMU-Writable SPTEs
to have Writable TLB entries, even if the SPTE is !Writable.  Remote TLBs
need to be flushed only when creating a read-only SPTE for write-tracking,
i.e. when installing a !MMU-Writable SPTE.

In practice, especially now that KVM doesn't overwrite existing SPTEs when
prefetching, KVM will rarely "refresh" a read-only, MMU-Writable SPTE, i.e.
this is unlikely to eliminate many, if any, TLB flushes.  But, more precise
flushing makes it easier to understand exactly when KVM does and doesn't
need to flush.

Note, x86 architecturally requires relevant TLB entries to be invalidated
on a page fault, i.e. there is no risk of putting a vCPU into an infinite
loop of read-only page faults.

Cc: Yan Zhao
Link: https://lore.kernel.org/r/20241011021051.1557902-2-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/mmu.c  | 12 +++++++-----
 arch/x86/kvm/mmu/spte.c |  6 ------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 084e6c4d1eaf..d4bb9d60dc59 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -514,9 +514,12 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 /* Rules for using mmu_spte_update:
  * Update the state bits, it means the mapped pfn is not changed.
  *
- * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
- * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
- * spte, even though the writable spte might be cached on a CPU's TLB.
+ * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
+ * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
+ * as KVM allows stale Writable TLB entries to exist.  When dirty logging, KVM
+ * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
+ * not whether or not SPTEs were modified, i.e. only the write-tracking case
+ * needs to flush at the time the SPTEs is modified, before dropping mmu_lock.
  *
  * Returns true if the TLB needs to be flushed
  */
@@ -533,8 +536,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
	 * we always atomically update it, see the comments in
	 * spte_has_volatile_bits().
	 */
-	if (is_mmu_writable_spte(old_spte) &&
-	    !is_writable_pte(new_spte))
+	if (is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte))
		flush = true;

	/*
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index f1a50a78badb..e5af69a8f101 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -133,12 +133,6 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
  */
 bool spte_has_volatile_bits(u64 spte)
 {
-	/*
-	 * Always atomically update spte if it can be updated
-	 * out of mmu-lock, it can ensure dirty bit is not lost,
-	 * also, it can help us to get a stable is_writable_pte()
-	 * to ensure tlb flush is not missed.
- */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; From cc7ed3358e418d1008d3bb178f60c3b44e0ecd2e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:34 -0700 Subject: [PATCH 02/28] KVM: x86/mmu: Always set SPTE's dirty bit if it's created as writable When creating a SPTE, always set the Dirty bit if the Writable bit is set, i.e. if KVM is creating a writable mapping. If two (or more) vCPUs are racing to install a writable SPTE on a !PRESENT fault, only the "winning" vCPU will create a SPTE with W=1 and D=1, all "losers" will generate a SPTE with W=1 && D=0. As a result, tdp_mmu_map_handle_target_level() will fail to detect that the losing faults are effectively spurious, and will overwrite the D=1 SPTE with a D=0 SPTE. For normal VMs, overwriting a present SPTE is a small performance blip; KVM blasts a remote TLB flush, but otherwise life goes on. For upcoming TDX VMs, overwriting a present SPTE is much more costly, and can even lead to the VM being terminated if KVM isn't careful, e.g. if KVM attempts TDH.MEM.PAGE.AUG because the TDX code doesn't detect that the new SPTE is actually the same as the old SPTE (which would be a bug in its own right). Suggested-by: Sagi Shahar Cc: Yan Zhao Link: https://lore.kernel.org/r/20241011021051.1557902-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/spte.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index e5af69a8f101..09ce93c4916a 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -219,30 +219,21 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (pte_access & ACC_WRITE_MASK) { spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - /* - * When overwriting an existing leaf SPTE, and the old SPTE was - * writable, skip trying to unsync shadow pages as any relevant - * shadow pages must already be unsync, i.e. the hash lookup is - * unnecessary (and expensive). - * - * The same reasoning applies to dirty page/folio accounting; - * KVM marked the folio dirty when the old SPTE was created, - * thus there's no need to mark the folio dirty again. - * - * Note, both cases rely on KVM not changing PFNs without first - * zapping the old SPTE, which is guaranteed by both the shadow - * MMU and the TDP MMU. - */ - if (is_last_spte(old_spte, level) && is_writable_pte(old_spte)) - goto out; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. + * + * When overwriting an existing leaf SPTE, and the old SPTE was + * writable, skip trying to unsync shadow pages as any relevant + * shadow pages must already be unsync, i.e. the hash lookup is + * unnecessary (and expensive). Note, this relies on KVM not + * changing PFNs without first zapping the old SPTE, which is + * guaranteed by both the shadow MMU and the TDP MMU. 
*/ - if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) { + if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) { wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); @@ -252,7 +243,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); -out: if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); From 0387d79e24d6cd816ea600f91607bd27c680a897 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:35 -0700 Subject: [PATCH 03/28] KVM: x86/mmu: Fold all of make_spte()'s writable handling into one if-else Now that make_spte() no longer uses a funky goto to bail out for a special case of its unsync handling, combine all of the unsync vs. writable logic into a single if-else statement. No functional change intended. Link: https://lore.kernel.org/r/20241011021051.1557902-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/spte.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 09ce93c4916a..030813781a63 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -217,8 +217,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, @@ -233,16 +231,13 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * guaranteed by both the shadow MMU and the TDP MMU. */ if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && - mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) { + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); - } + else + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | + spte_shadow_dirty_mask(spte); } - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); From b7ed46b201a41a2f63bc104a66f33c65e1b44fbf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:36 -0700 Subject: [PATCH 04/28] KVM: x86/mmu: Don't force flush if SPTE update clears Accessed bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't force a TLB flush if mmu_spte_update() clears the Accessed bit, as access tracking tolerates false negatives, as evidenced by the mmu_notifier hooks that explicitly test and age SPTEs without doing a TLB flush. In practice, this is very nearly a nop. spte_write_protect() and spte_clear_dirty() never clear the Accessed bit. make_spte() always sets the Accessed bit for !prefetch scenarios. FNAME(sync_spte) only sets SPTE if the protection bits are changing, i.e. if a flush will be needed regardless of the Accessed bits. And FNAME(pte_prefetch) sets SPTE if and only if the old SPTE is !PRESENT. That leaves kvm_arch_async_page_ready() as the one path that will generate a !ACCESSED SPTE *and* overwrite a PRESENT SPTE. 
And that's very arguably a bug, as clobbering a valid SPTE in that case is nonsensical. Tested-by: Alex Bennée Link: https://lore.kernel.org/r/20241011021051.1557902-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4bb9d60dc59..067b6bb7bfca 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -521,36 +521,24 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) * not whether or not SPTEs were modified, i.e. only the write-tracking case * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. * + * Remote TLBs also need to be flushed if the Dirty bit is cleared, as false + * negatives are not acceptable, e.g. if KVM is using D-bit based PML on VMX. + * + * Don't flush if the Accessed bit is cleared, as access tracking tolerates + * false negatives, and the one path that does care about TLB flushes, + * kvm_mmu_notifier_clear_flush_young(), uses mmu_spte_update_no_track(). + * * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - bool flush = false; u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); if (!is_shadow_present_pte(old_spte)) return false; - /* - * For the spte updated out of mmu-lock is safe, since - * we always atomically update it, see the comments in - * spte_has_volatile_bits(). - */ - if (is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte)) - flush = true; - - /* - * Flush TLB when accessed/dirty states are changed in the page tables, - * to guarantee consistency between TLB and page tables. - */ - - if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) - flush = true; - - if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) - flush = true; - - return flush; + return (is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte)) || + (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)); } /* From 856cf4a60cffbffb31a429761bf605108dbf5ff4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:37 -0700 Subject: [PATCH 05/28] KVM: x86/mmu: Don't flush TLBs when clearing Dirty bit in shadow MMU Don't force a TLB flush when an SPTE update in the shadow MMU happens to clear the Dirty bit, as KVM unconditionally flushes TLBs when enabling dirty logging, and when clearing dirty logs, KVM flushes based on its software structures, not the SPTEs. I.e. the flows that care about accurate Dirty bit information already ensure there are no stale TLB entries. Opportunistically drop is_dirty_spte() as mmu_spte_update() was the sole caller. Link: https://lore.kernel.org/r/20241011021051.1557902-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 14 ++++++++------ arch/x86/kvm/mmu/spte.h | 7 ------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 067b6bb7bfca..cb4f0a8c52e0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -521,12 +521,15 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) * not whether or not SPTEs were modified, i.e. only the write-tracking case * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. * - * Remote TLBs also need to be flushed if the Dirty bit is cleared, as false - * negatives are not acceptable, e.g. if KVM is using D-bit based PML on VMX. 
- * * Don't flush if the Accessed bit is cleared, as access tracking tolerates * false negatives, and the one path that does care about TLB flushes, - * kvm_mmu_notifier_clear_flush_young(), uses mmu_spte_update_no_track(). + * kvm_mmu_notifier_clear_flush_young(), flushes if a young SPTE is found, i.e. + * doesn't rely on lower helpers to detect the need to flush. + * + * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally + * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and + * when clearing dirty logs, KVM flushes based on whether or not dirty entries + * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. * * Returns true if the TLB needs to be flushed */ @@ -537,8 +540,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!is_shadow_present_pte(old_spte)) return false; - return (is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte)) || - (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)); + return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); } /* diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index c81cac9358e0..574ca9a1fcab 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -363,13 +363,6 @@ static inline bool is_accessed_spte(u64 spte) : !is_access_track_spte(spte); } -static inline bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { From 010344122dca7c9e772cc03d1534aa908e055f48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:38 -0700 Subject: [PATCH 06/28] KVM: x86/mmu: Drop ignored return value from kvm_tdp_mmu_clear_dirty_slot() Drop the return value from kvm_tdp_mmu_clear_dirty_slot() as its sole caller ignores the result (KVM flushes after clearing dirty logs based on the logs themselves, not based on SPTEs). Cc: David Matlack Link: https://lore.kernel.org/r/20241011021051.1557902-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 20 ++++++-------------- arch/x86/kvm/mmu/tdp_mmu.h | 2 +- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 91caa73a905b..9c66be7fb002 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1492,13 +1492,12 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled(); } -static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) +static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) { const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - bool spte_set = false; rcu_read_lock(); @@ -1519,31 +1518,24 @@ retry: if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; - - spte_set = true; } rcu_read_unlock(); - return spte_set; } /* * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the - * memslot. Returns true if an SPTE has been changed and the TLBs need to be - * flushed. + * memslot. 
*/ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - return spte_set; + clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); } static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..d842bfe103ab 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -34,7 +34,7 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, From 67c93802928b54fabc076f29f372e3f977590ca1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:39 -0700 Subject: [PATCH 07/28] KVM: x86/mmu: Fold mmu_spte_update_no_track() into mmu_spte_update() Fold the guts of mmu_spte_update_no_track() into mmu_spte_update() now that the latter doesn't flush when clearing A/D bits, i.e. now that there is no need to explicitly avoid TLB flushes when aging SPTEs. Opportunistically WARN if mmu_spte_update() requests a TLB flush when aging SPTEs, as aging should never modify a SPTE in such a way that KVM thinks a TLB flush is needed. Link: https://lore.kernel.org/r/20241011021051.1557902-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 50 ++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cb4f0a8c52e0..5f7d7db30b21 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -485,32 +485,6 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* - * Update the SPTE (excluding the PFN), but do not track changes in its - * accessed/dirty status. - */ -static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) -{ - u64 old_spte = *sptep; - - WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); - check_spte_writable_invariants(new_spte); - - if (!is_shadow_present_pte(old_spte)) { - mmu_spte_set(sptep, new_spte); - return old_spte; - } - - if (!spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, new_spte); - else - old_spte = __update_clear_spte_slow(sptep, new_spte); - - WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - - return old_spte; -} - /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. 
* @@ -535,10 +509,23 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + u64 old_spte = *sptep; - if (!is_shadow_present_pte(old_spte)) + WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); + check_spte_writable_invariants(new_spte); + + if (!is_shadow_present_pte(old_spte)) { + mmu_spte_set(sptep, new_spte); return false; + } + + if (!spte_has_volatile_bits(old_spte)) + __update_clear_spte_fast(sptep, new_spte); + else + old_spte = __update_clear_spte_slow(sptep, new_spte); + + WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); } @@ -1598,8 +1585,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm, clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { + /* + * WARN if mmu_spte_update() signals the need + * for a TLB flush, as Access tracking a SPTE + * should never trigger an _immediate_ flush. + */ spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); + WARN_ON_ONCE(mmu_spte_update(sptep, spte)); } young = true; } From 1a175082b190190d9d05a0a5927b72cbb657fd1d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:40 -0700 Subject: [PATCH 08/28] KVM: x86/mmu: WARN and flush if resolving a TDP MMU fault clears MMU-writable Do a remote TLB flush if installing a leaf SPTE overwrites an existing leaf SPTE (with the same target pfn, which is enforced by a BUG() in handle_changed_spte()) and clears the MMU-Writable bit. Since the TDP MMU passes ACC_ALL to make_spte(), i.e. always requests a Writable SPTE, the only scenario in which make_spte() should create a !MMU-Writable SPTE is if the gfn is write-tracked or if KVM is prefetching a SPTE. When write-protecting for write-tracking, KVM must hold mmu_lock for write, i.e. can't race with a vCPU faulting in the SPTE. And when prefetching a SPTE, the TDP MMU takes care to avoid clobbering a shadow-present SPTE, i.e. it should be impossible to replace a MMU-writable SPTE with a !MMU-writable SPTE when handling a TDP MMU fault. Cc: David Matlack Cc: Yan Zhao Link: https://lore.kernel.org/r/20241011021051.1557902-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9c66be7fb002..bc9e2f50dc80 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1033,7 +1033,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && - !is_last_spte(iter->old_spte, iter->level)) + (!is_last_spte(iter->old_spte, iter->level) || + WARN_ON_ONCE(is_mmu_writable_spte(iter->old_spte) && + !is_mmu_writable_spte(new_spte)))) kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* From a5da5dde4ba475512dffc3c118fcb2aba9233fa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:41 -0700 Subject: [PATCH 09/28] KVM: x86/mmu: Add a dedicated flag to track if A/D bits are globally enabled Add a dedicated flag to track if KVM has enabled A/D bits at the module level, instead of inferring the state based on whether or not the MMU's shadow_accessed_mask is non-zero. 
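For context, the check being replaced is the one-liner visible in the diff
below; a side-by-side sketch of the old and new shape (simplified, and the
"_old" name is purely illustrative):

  /* Old: A/D support inferred from the mask; this breaks once the mask
   * is set even when hardware A/D bits are unused. */
  static inline bool kvm_ad_enabled_old(void)
  {
          return !!shadow_accessed_mask;
  }

  /* New: a dedicated flag, decoupled from shadow_accessed_mask. */
  extern bool __read_mostly kvm_ad_enabled;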
This will allow defining and using shadow_accessed_mask even when A/D bits aren't used by hardware. Link: https://lore.kernel.org/r/20241011021051.1557902-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/mmu/spte.c | 6 ++++++ arch/x86/kvm/mmu/spte.h | 20 +++++++++----------- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5f7d7db30b21..80b5a45ac317 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3357,7 +3357,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) - return !kvm_ad_enabled(); + return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore @@ -3492,7 +3492,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ - if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) + if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) new_spte = restore_acc_track_spte(new_spte); /* @@ -5469,7 +5469,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; - role.ad_disabled = !kvm_ad_enabled(); + role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 030813781a63..c9543625dda2 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); +bool __read_mostly kvm_ad_enabled; + u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; @@ -414,6 +416,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { + kvm_ad_enabled = has_ad_bits; + shadow_user_mask = VMX_EPT_READABLE_MASK; shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; @@ -447,6 +451,8 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; + kvm_ad_enabled = true; + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 574ca9a1fcab..908cb672c4e1 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -167,6 +167,15 @@ static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); #define SHADOW_NONPRESENT_VALUE 0ULL #endif + +/* + * True if A/D bits are supported in hardware and are enabled by KVM. When + * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable + * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario + * where KVM is using A/D bits for L1, but not L2. 
+ */
+extern bool __read_mostly kvm_ad_enabled;
+
 extern u64 __read_mostly shadow_host_writable_mask;
 extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
@@ -285,17 +294,6 @@ static inline bool is_ept_ve_possible(u64 spte)
	       (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
 }

-/*
- * Returns true if A/D bits are supported in hardware and are enabled by KVM.
- * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
- * scenario where KVM is using A/D bits for L1, but not L2.
- */
-static inline bool kvm_ad_enabled(void)
-{
-	return !!shadow_accessed_mask;
-}
-
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 {
	return sp->role.ad_disabled;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bc9e2f50dc80..f77927b57962 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1075,7 +1075,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
 {
-	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
+	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
	int ret = 0;

	if (shared) {
@@ -1491,7 +1491,7 @@ static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
	 * from level, so it is valid to key off any shadow page to determine if
	 * write protection is needed for an entire tree.
	 */
-	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
+	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
 }

 static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,

From 3835819fb1b37bc736ef78c83f1ef275bc9e6565 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:42 -0700
Subject: [PATCH 10/28] KVM: x86/mmu: Set shadow_accessed_mask for EPT even if A/D bits disabled

Now that KVM doesn't use shadow_accessed_mask to detect if hardware A/D
bits are enabled, set shadow_accessed_mask for EPT even when A/D bits
are disabled in hardware.  This will allow using shadow_accessed_mask
for software purposes, e.g. to preserve accessed status in a non-present
SPTE across NUMA balancing, if something like that is ever desirable.

Link: https://lore.kernel.org/r/20241011021051.1557902-11-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/spte.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c9543625dda2..e352d1821319 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -419,7 +419,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
	kvm_ad_enabled = has_ad_bits;

	shadow_user_mask = VMX_EPT_READABLE_MASK;
-	shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull;
+	shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
	shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
	shadow_nx_mask = 0ull;
	shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;

From 53510b912518bd15dba9632ee8e539168166af3c Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:43 -0700
Subject: [PATCH 11/28] KVM: x86/mmu: Set shadow_dirty_mask for EPT even if A/D bits disabled

Set shadow_dirty_mask to the architectural EPT Dirty bit value even if
A/D bits are disabled at the module level, i.e. even if KVM will never
enable A/D bits in hardware.  Doing so provides consistent behavior for
Accessed and Dirty bits, i.e.
doesn't leave KVM in a state where it sets shadow_accessed_mask but not shadow_dirty_mask. Functionally, this should be one big nop, as consumption of shadow_dirty_mask is always guarded by a check that hardware A/D bits are enabled. Link: https://lore.kernel.org/r/20241011021051.1557902-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/spte.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index e352d1821319..54d8c9b76050 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -420,7 +420,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_user_mask = VMX_EPT_READABLE_MASK; shadow_accessed_mask = VMX_EPT_ACCESS_BIT; - shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ From 7971801b5618e37d7173f615300f640ab590ba6d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:44 -0700 Subject: [PATCH 12/28] KVM: x86/mmu: Use Accessed bit even when _hardware_ A/D bits are disabled Use the Accessed bit in SPTEs even when A/D bits are disabled in hardware, i.e. propagate accessed information to SPTE.Accessed even when KVM is doing manual tracking by making SPTEs not-present. In addition to eliminating a small amount of code in is_accessed_spte(), this also paves the way for preserving Accessed information when a SPTE is zapped in response to a mmu_notifier PROTECTION event, e.g. if a SPTE is zapped because NUMA balancing kicks in. Note, EPT is the only flavor of paging in which A/D bits are conditionally enabled, and the Accessed (and Dirty) bit is software-available when A/D bits are disabled. Note #2, there are currently no concrete plans to preserve Accessed information. Explorations on that front were the initial catalyst, but the cleanup is the motivation for the actual commit. Link: https://lore.kernel.org/r/20241011021051.1557902-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 ++- arch/x86/kvm/mmu/spte.c | 4 ++-- arch/x86/kvm/mmu/spte.h | 11 +---------- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 80b5a45ac317..36247b8d7476 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3493,7 +3493,8 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * enabled, the SPTE can't be an access-tracked SPTE. 
	 */
	if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
-		new_spte = restore_acc_track_spte(new_spte);
+		new_spte = restore_acc_track_spte(new_spte) |
+			   shadow_accessed_mask;

	/*
	 * To keep things simple, only SPTEs that are MMU-writable can
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 54d8c9b76050..617479efd127 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -175,7 +175,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
	spte |= shadow_present_mask;

	if (!prefetch || synchronizing)
-		spte |= spte_shadow_accessed_mask(spte);
+		spte |= shadow_accessed_mask;

	/*
	 * For simplicity, enforce the NX huge page mitigation even if not
@@ -346,7 +346,7 @@ u64 mark_spte_for_access_track(u64 spte)
	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;

-	spte &= ~shadow_acc_track_mask;
+	spte &= ~(shadow_acc_track_mask | shadow_accessed_mask);

	return spte;
 }
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 908cb672c4e1..c8dc75337c8b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -316,12 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte)
	return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
 }

-static inline u64 spte_shadow_accessed_mask(u64 spte)
-{
-	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
-	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
-}
-
 static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
@@ -355,10 +349,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)

 static inline bool is_accessed_spte(u64 spte)
 {
-	u64 accessed_mask = spte_shadow_accessed_mask(spte);
-
-	return accessed_mask ? spte & accessed_mask
-			     : !is_access_track_spte(spte);
+	return spte & shadow_accessed_mask;
 }

 static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,

From 526e609f05676c21aeed6c04115ae777c40081ec Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:45 -0700
Subject: [PATCH 13/28] KVM: x86/mmu: Process only valid TDP MMU roots when aging a gfn range

Skip invalid TDP MMU roots when aging a gfn range.  There is zero reason
to process invalid roots, as they by definition hold stale information.
E.g. if a root is invalid because it's from a previous memslot
generation, in the unlikely event the root has a SPTE for the gfn, then
odds are good that the gfn=>hva mapping is different, i.e. doesn't map
to the hva that is being aged by the primary MMU.

Link: https://lore.kernel.org/r/20241011021051.1557902-14-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/tdp_mmu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f77927b57962..e8c061bf94ec 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1205,9 +1205,11 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
-	 * into this helper allow blocking; it'd be dead, wasteful code.
+	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
+	 * this helper must NOT be used to unmap GFNs, as it processes only
+	 * valid roots!
*/ - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { + for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { rcu_read_lock(); tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) From 51192ebdd145f4808f440de5cbc6426afcaa0564 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:46 -0700 Subject: [PATCH 14/28] KVM: x86/mmu: Stop processing TDP MMU roots for test_age if young SPTE found Return immediately if a young SPTE is found when testing, but not updating, SPTEs. The return value is a boolean, i.e. whether there is one young SPTE or fifty is irrelevant (ignoring the fact that it's impossible for there to be fifty SPTEs, as KVM has a hard limit on the number of valid TDP MMU roots). Link: https://lore.kernel.org/r/20241011021051.1557902-15-seanjc@google.com [sean: use guard(rcu)(), as suggested by Paolo] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 79 +++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 44 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e8c061bf94ec..3df9fac273e1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1192,35 +1192,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, return flush; } -typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range); - -static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, - struct kvm_gfn_range *range, - tdp_handler_t handler) -{ - struct kvm_mmu_page *root; - struct tdp_iter iter; - bool ret = false; - - /* - * Don't support rescheduling, none of the MMU notifiers that funnel - * into this helper allow blocking; it'd be dead, wasteful code. Note, - * this helper must NOT be used to unmap GFNs, as it processes only - * valid roots! - */ - for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { - rcu_read_lock(); - - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) - ret |= handler(kvm, &iter, range); - - rcu_read_unlock(); - } - - return ret; -} - /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. @@ -1229,15 +1200,10 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) { u64 new_spte; - /* If we have a non-accessed entry we don't need to change the pte. */ - if (!is_accessed_spte(iter->old_spte)) - return false; - if (spte_ad_enabled(iter->old_spte)) { iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte, @@ -1253,23 +1219,48 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, iter->old_spte, new_spte); - return true; +} + +static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) +{ + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; + + /* + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. Note, + * this helper must NOT be used to unmap GFNs, as it processes only + * valid roots! 
+ */ + for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) { + guard(rcu)(); + + tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) { + if (!is_accessed_spte(iter.old_spte)) + continue; + + if (test_only) + return true; + + ret = true; + kvm_tdp_mmu_age_spte(&iter); + } + } + + return ret; } bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); -} - -static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) -{ - return is_accessed_spte(iter->old_spte); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, false); } bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, true); } /* From c9b625625ba37d4b493f5fc878d8fce3d38dab2d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 19:10:47 -0700 Subject: [PATCH 15/28] KVM: x86/mmu: Dedup logic for detecting TLB flushes on leaf SPTE changes Now that the shadow MMU and TDP MMU have identical logic for detecting required TLB flushes when updating SPTEs, move said logic to a helper so that the TDP MMU code can benefit from the comments that are currently exclusive to the shadow MMU. No functional change intended. Link: https://lore.kernel.org/r/20241011021051.1557902-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 19 +------------------ arch/x86/kvm/mmu/spte.h | 28 ++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.c | 3 +-- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 36247b8d7476..266a25d96512 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -488,23 +488,6 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * - * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for - * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, - * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM - * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, - * not whether or not SPTEs were modified, i.e. only the write-tracking case - * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. - * - * Don't flush if the Accessed bit is cleared, as access tracking tolerates - * false negatives, and the one path that does care about TLB flushes, - * kvm_mmu_notifier_clear_flush_young(), flushes if a young SPTE is found, i.e. - * doesn't rely on lower helpers to detect the need to flush. - * - * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally - * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and - * when clearing dirty logs, KVM flushes based on whether or not dirty entries - * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. 
- *
  * Returns true if the TLB needs to be flushed
  */
 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
@@ -527,7 +510,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
	WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
		     spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

-	return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
+	return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
 }

 /*
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index c8dc75337c8b..1d83614baac5 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -467,6 +467,34 @@ static inline bool is_mmu_writable_spte(u64 spte)
	return spte & shadow_mmu_writable_mask;
 }

+/*
+ * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
+ * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
+ * as KVM allows stale Writable TLB entries to exist.  When dirty logging, KVM
+ * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
+ * not whether or not SPTEs were modified, i.e. only the write-tracking case
+ * needs to flush at the time the SPTEs is modified, before dropping mmu_lock.
+ *
+ * Don't flush if the Accessed bit is cleared, as access tracking tolerates
+ * false negatives, and the one path that does care about TLB flushes,
+ * kvm_mmu_notifier_clear_flush_young(), flushes if a young SPTE is found, i.e.
+ * doesn't rely on lower helpers to detect the need to flush.
+ *
+ * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally
+ * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and
+ * when clearing dirty logs, KVM flushes based on whether or not dirty entries
+ * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found.
+ *
+ * Note, this logic only applies to shadow-present leaf SPTEs.  The caller is
+ * responsible for checking that the old SPTE is shadow-present, and is also
+ * responsible for determining whether or not a TLB flush is required when
+ * modifying a shadow-present non-leaf SPTE.
+ */
+static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
+{
+	return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
+}
+
 static inline u64 get_mmio_spte_generation(u64 spte)
 {
	u64 gen;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3df9fac273e1..672babbfe2a5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1034,8 +1034,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 (!is_last_spte(iter->old_spte, iter->level) ||
-		  WARN_ON_ONCE(is_mmu_writable_spte(iter->old_spte) &&
-			       !is_mmu_writable_spte(new_spte))))
+		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*

From 85649117511ddf9971e4759fe8ac9fbb69ad5064 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:48 -0700
Subject: [PATCH 16/28] KVM: x86/mmu: Set Dirty bit for new SPTEs, even if _hardware_ A/D bits are disabled

When making a SPTE, set the Dirty bit in the SPTE as appropriate, even if
hardware A/D bits are disabled.  Only EPT allows A/D bits to be disabled,
and for EPT, the bits are software-available (ignored by hardware) when
A/D bits are disabled, i.e. it is perfectly legal for KVM to use the
Dirty bit to track dirty pages in software.
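As a rough sketch of what "software-available" means here (illustrative
only; the bit position matches VMX_EPT_DIRTY_BIT, but the helper below is
hypothetical):

  #include <stdint.h>

  #define VMX_EPT_DIRTY_BIT (1ull << 9) /* ignored by HW when EPT A/D is off */

  /* KVM can still stash "this page was written" state in the ignored bit. */
  static inline uint64_t spte_set_sw_dirty(uint64_t spte)
  {
          return spte | VMX_EPT_DIRTY_BIT;
  }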
Link: https://lore.kernel.org/r/20241011021051.1557902-17-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/mmu/spte.c | 2 +-
 arch/x86/kvm/mmu/spte.h | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 617479efd127..fd8c3c92ade0 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -237,7 +237,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			wrprot = true;
		else
			spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask |
-				spte_shadow_dirty_mask(spte);
+				shadow_dirty_mask;
	}

	if (prefetch && !synchronizing)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1d83614baac5..ccfd5a091b7a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -316,12 +316,6 @@ static inline bool spte_ad_need_write_protect(u64 spte)
	return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
 }

-static inline u64 spte_shadow_dirty_mask(u64 spte)
-{
-	KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
-	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
-}
-
 static inline bool is_access_track_spte(u64 spte)
 {
	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;

From 2ebbe0308c29287914eeabe6ed83a6e21f9962bc Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:49 -0700
Subject: [PATCH 17/28] KVM: Allow arch code to elide TLB flushes when aging a young page

Add a Kconfig to allow architectures to opt out of a TLB flush when a
young page is aged, as invalidating TLB entries is not functionally
required on most KVM-supported architectures.  Stale TLB entries can
result in false negatives and theoretically lead to suboptimal reclaim,
but in practice all observations have been that the performance gained
by skipping TLB flushes outweighs any performance lost by reclaiming
hot pages.

E.g. the primary MMUs for x86, RISC-V, s390, and PPC Book3S elide the
TLB flush for ptep_clear_flush_young(), and arm64's MMU skips the
trailing DSB that's required for ordering (presumably because there are
optimizations related to eliding other TLB flushes when doing
make-before-break).
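To illustrate why eliding the flush is benign, a standalone sketch (not
kernel code; SPTE_ACCESSED is an illustrative bit, real masks are
per-architecture): a stale TLB entry can only make a hot page look cold
until a later aging pass, it can never corrupt guest state.

  #include <stdbool.h>
  #include <stdint.h>

  #define SPTE_ACCESSED (1ull << 8) /* illustrative Accessed bit */

  /* Clear Accessed bits without flushing; report if anything was young. */
  static bool age_sptes_no_flush(uint64_t *sptes, int nr)
  {
          bool young = false;

          for (int i = 0; i < nr; i++) {
                  if (sptes[i] & SPTE_ACCESSED) {
                          sptes[i] &= ~SPTE_ACCESSED; /* no TLB flush here */
                          young = true;
                  }
          }
          return young; /* worst case: a stale entry delays the next re-mark */
  }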
Link: https://lore.kernel.org/r/20241011021051.1557902-18-seanjc@google.com
Signed-off-by: Sean Christopherson
---
 virt/kvm/Kconfig    |  4 ++++
 virt/kvm/kvm_main.c | 20 ++++++--------------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index fd6a3010afa8..54e959e7d68f 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -100,6 +100,10 @@ config KVM_GENERIC_MMU_NOTIFIER
	select MMU_NOTIFIER
	bool

+config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+	depends on KVM_GENERIC_MMU_NOTIFIER
+	bool
+
 config KVM_GENERIC_MEMORY_ATTRIBUTES
	depends on KVM_GENERIC_MMU_NOTIFIER
	bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1b10dc408a0..83b525d16b61 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -630,7 +630,8 @@ mmu_unlock:
 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
-						gfn_handler_t handler)
+						gfn_handler_t handler,
+						bool flush_on_ret)
 {
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
@@ -638,7 +639,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
		.end = end,
		.handler = handler,
		.on_lock = (void *)kvm_null_fn,
-		.flush_on_ret = true,
+		.flush_on_ret = flush_on_ret,
		.may_block = false,
	};

@@ -650,17 +651,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
						unsigned long end,
						gfn_handler_t handler)
 {
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	const struct kvm_mmu_notifier_range range = {
-		.start = start,
-		.end = end,
-		.handler = handler,
-		.on_lock = (void *)kvm_null_fn,
-		.flush_on_ret = false,
-		.may_block = false,
-	};
-
-	return __kvm_handle_hva_range(kvm, &range).ret;
+	return kvm_handle_hva_range(mn, start, end, handler, false);
 }

 void kvm_mmu_invalidate_begin(struct kvm *kvm)
@@ -825,7 +816,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 {
	trace_kvm_age_hva(start, end);

-	return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
+	return kvm_handle_hva_range(mn, start, end, kvm_age_gfn,
+				    !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
 }

 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,

From b9883ee40d7e6c3f3fb59c247283b152fdde955c Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Thu, 10 Oct 2024 19:10:50 -0700
Subject: [PATCH 18/28] KVM: x86: Don't emit TLB flushes when aging SPTEs for mmu_notifiers

Follow x86's primary MMU, which hasn't flushed TLBs when clearing
Accessed bits for 10+ years, and skip all TLB flushes when aging SPTEs
in response to a clear_flush_young() mmu_notifier event.  As documented
in x86's ptep_clear_flush_young(), the probability and impact of "bad"
reclaim due to stale A-bit information is relatively low, whereas the
performance cost of TLB flushes is relatively high.  I.e. the cost of
flushing TLBs outweighs the benefits.

On KVM x86, the cost of TLB flushes is even higher, as KVM doesn't batch
TLB flushes for mmu_notifier events (KVM's mmu_notifier contract with MM
makes it all but impossible), and sending IPIs forces all running vCPUs
to go through a VM-Exit => VM-Enter roundtrip.

Furthermore, MGLRU aging of secondary MMUs is expected to use flush-less
mmu_notifiers, i.e. flushing for the !MGLRU case will make even less
sense, and will be actively confusing as it wouldn't be clear why KVM
"needs" to flush TLBs for legacy LRU aging, but not for MGLRU aging.
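Sketched at the notifier boundary (hypothetical helper, simplified from
the kvm_main.c change in the previous patch), the decision to flush
reduces to:

  #include <stdbool.h>

  /* Whether aging must flush is now a per-arch opt-out; with x86 selecting
   * KVM_ELIDE_TLB_FLUSH_IF_YOUNG, young pages are aged with zero IPIs. */
  static bool aging_needs_flush(bool arch_elides_flush_if_young, bool young)
  {
          return young && !arch_elides_flush_if_young;
  }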
Cc: James Houghton Cc: Yan Zhao Link: https://lore.kernel.org/all/20240926013506.860253-18-jthoughton@google.com Link: https://lore.kernel.org/r/20241011021051.1557902-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/mmu/spte.h | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f09f13c01c6b..1ed1e4f5d51c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER + select KVM_ELIDE_TLB_FLUSH_IF_YOUNG select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ccfd5a091b7a..4d2b700c26dd 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -470,9 +470,8 @@ static inline bool is_mmu_writable_spte(u64 spte) * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. * * Don't flush if the Accessed bit is cleared, as access tracking tolerates - * false negatives, and the one path that does care about TLB flushes, - * kvm_mmu_notifier_clear_flush_young(), flushes if a young SPTE is found, i.e. - * doesn't rely on lower helpers to detect the need to flush. + * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a + * mmu_notifier.clear_flush_young() event. * * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and From 8ccd51cb5911df764f36cc7083bbc22bdd8a7aae Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 23 Aug 2024 16:56:43 -0700 Subject: [PATCH 19/28] KVM: x86/mmu: Drop @max_level from kvm_mmu_max_mapping_level() Drop the @max_level parameter from kvm_mmu_max_mapping_level(). All callers pass in PG_LEVEL_NUM, so @max_level can be replaced with PG_LEVEL_NUM in the function body. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240823235648.3236880-2-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 8 +++----- arch/x86/kvm/mmu/mmu_internal.h | 3 +-- arch/x86/kvm/mmu/tdp_mmu.c | 3 +-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 266a25d96512..3c2322440d9c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3114,13 +3114,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) + const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -6919,8 +6918,7 @@ restart: * mapping if the indirect sp has level = 1. 
*/ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - PG_LEVEL_NUM)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index fabbea504a69..b00abbe3f6cf 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -346,8 +346,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level); + const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 672babbfe2a5..07ffd34150cc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1626,8 +1626,7 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); if (max_mapping_level < iter.level) continue; From 35ef80eb29ab5f7b7c7264c7f21a64b3aa046921 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 23 Aug 2024 16:56:44 -0700 Subject: [PATCH 20/28] KVM: x86/mmu: Batch TLB flushes when zapping collapsible TDP MMU SPTEs Set SPTEs directly to SHADOW_NONPRESENT_VALUE and batch up TLB flushes when zapping collapsible SPTEs, rather than freezing them first. Freezing the SPTE first is not required. It is fine for another thread holding mmu_lock for read to immediately install a present entry before TLBs are flushed because the underlying mapping is not changing. vCPUs that translate through the stale 4K mappings or a new huge page mapping will still observe the same GPA->HPA translations. KVM must only flush TLBs before dropping RCU (to avoid use-after-free of the zapped page tables) and before dropping mmu_lock (to synchronize with mmu_notifiers invalidating mappings). In VMs backed with 2MiB pages, batching TLB flushes improves the time it takes to zap collapsible SPTEs to disable dirty logging: $ ./dirty_log_perf_test -s anonymous_hugetlb_2mb -v 64 -e -b 4g Before: Disabling dirty logging time: 14.334453428s (131072 flushes) After: Disabling dirty logging time: 4.794969689s (76 flushes) Skipping freezing SPTEs also avoids stalling vCPU threads on the frozen SPTE for the time it takes to perform a remote TLB flush. vCPUs faulting on the zapped mapping can now immediately install a new huge mapping and proceed with guest execution. 
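The resulting pattern, as a standalone sketch (hypothetical names; the
real loop is in the diff below): zap in place, remember that something
changed, and issue a single flush for the whole range.

  #include <stdbool.h>
  #include <stdint.h>

  #define NONPRESENT_SPTE 0ull /* illustrative non-present value */

  /* Zap a range of SPTEs directly (no FROZEN_SPTE), batching the flush. */
  static bool zap_collapsible_batched(uint64_t *sptes, int nr)
  {
          bool flush = false;

          for (int i = 0; i < nr; i++) {
                  if (sptes[i] == NONPRESENT_SPTE)
                          continue;
                  sptes[i] = NONPRESENT_SPTE; /* vCPUs may refault immediately */
                  flush = true;
          }
          return flush; /* caller does one remote TLB flush iff true */
  }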
Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240823235648.3236880-3-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 55 +++++++------------------------------- 1 file changed, 10 insertions(+), 45 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 07ffd34150cc..076343c3c8a7 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -583,48 +583,6 @@ static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) -{ - int ret; - - lockdep_assert_held_read(&kvm->mmu_lock); - - /* - * Freeze the SPTE by setting it to a special, non-present value. This - * will stop other threads from immediately installing a present entry - * in its place before the TLBs are flushed. - * - * Delay processing of the zapped SPTE until after TLBs are flushed and - * the FROZEN_SPTE is replaced (see below). - */ - ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); - if (ret) - return ret; - - kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); - - /* - * No other thread can overwrite the frozen SPTE as they must either - * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special frozen SPTE value. Use the raw write helper to - * avoid an unnecessary check on volatile bits. - */ - __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); - - /* - * Process the zapped SPTE after flushing TLBs, and after replacing - * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the FROZEN_SPTE and reduces contention on the child - * SPTEs. - */ - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - SHADOW_NONPRESENT_VALUE, iter->level, true); - - return 0; -} - - /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance @@ -1596,13 +1554,16 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; + bool flush = false; rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; + } if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte)) @@ -1630,11 +1591,15 @@ retry: if (max_mapping_level < iter.level) continue; - /* Note, a successful atomic zap also does a remote TLB flush. */ - if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) goto retry; + + flush = true; } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + rcu_read_unlock(); } From e287e43167139a6c644ba648be6b2bf39314eaae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 31 Oct 2024 10:06:32 -0700 Subject: [PATCH 21/28] KVM: x86/mmu: Check yielded_gfn for forward progress iff resched is needed Swap the order of the checks in tdp_mmu_iter_cond_resched() so that KVM checks to see if a resched is needed _before_ checking to see if yielding must be disallowed to guarantee forward progress. Iterating over TDP MMU SPTEs is a hot path, e.g. tearing down a root can touch millions of SPTEs, and not needing to reschedule is by far the common case. On the other hand, disallowing yielding because forward progress has not been made is a very rare case. 
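Concretely, the reordered checks look like this (mirroring the resulting code in the diff below); the common case now exits after the first test:

	/* Common case: no resched needed and no lock contention => one check. */
	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
		return false;

	/* Rare case: yielding is disallowed until forward progress is made. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;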
Returning early for the common case (no resched) effectively reduces the number of checks from two to one, and should make the code slightly more predictable for the CPU. To resolve a weird conundrum where the forward progress check currently returns false, but the need-resched check subtly returns iter->yielded, which _should_ be false (enforced by a WARN), return false unconditionally (which might also help make the sequence more predictable). If KVM has a bug where iter->yielded is left dangling, continuing to yield is neither right nor wrong, it was simply an artifact of how the original code was written. Unconditionally returning false when yielding is unnecessary or unwanted will also allow extracting the "should resched" logic to a separate helper in a future patch. Cc: David Matlack Reviewed-by: James Houghton Link: https://lore.kernel.org/r/20241031170633.1502783-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 076343c3c8a7..8170b16b91c3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -658,29 +658,29 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, { WARN_ON_ONCE(iter->yielded); + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush) - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); - rcu_read_unlock(); + rcu_read_unlock(); - if (shared) - cond_resched_rwlock_read(&kvm->mmu_lock); - else - cond_resched_rwlock_write(&kvm->mmu_lock); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); - rcu_read_lock(); + rcu_read_lock(); - WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); - iter->yielded = true; - } - - return iter->yielded; + iter->yielded = true; + return true; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) From 38b0ac47169b981bd40226f16f17d8a098c81309 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 31 Oct 2024 10:06:33 -0700 Subject: [PATCH 22/28] KVM: x86/mmu: Demote the WARN on yielded in xxx_cond_resched() to KVM_MMU_WARN_ON Convert the WARN in tdp_mmu_iter_cond_resched(), which asserts that the iterator hasn't already yielded, to a KVM_MMU_WARN_ON() so the code is compiled out for production kernels (assuming production kernels disable KVM_PROVE_MMU). Checking for a needed reschedule is a hot path, and KVM sanity checks iter->yielded in several other less-hot paths, i.e. the odds of KVM not flagging that something went sideways are quite low. Furthermore, the odds of KVM not noticing *and* the WARN detecting something worth investigating are even lower.
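For context, KVM_MMU_WARN_ON() is KVM's MMU-specific debug assertion. Its definition is along these lines (a sketch of the mmu_internal.h macro; exact details may differ):

	#ifdef CONFIG_KVM_PROVE_MMU
	#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
	#else
	#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
	#endif

I.e. with KVM_PROVE_MMU disabled, the expression is merely type-checked and generates no code.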
Link: https://lore.kernel.org/r/20241031170633.1502783-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8170b16b91c3..ef44dcba1ecb 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -656,7 +656,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON_ONCE(iter->yielded); + KVM_MMU_WARN_ON(iter->yielded); if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) return false; From dd2e7dbc4ae2497cd6eea2d7003fe60039ebae50 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Oct 2024 17:00:23 -0700 Subject: [PATCH 23/28] KVM: x86/mmu: Refactor TDP MMU iter need resched check Refactor the TDP MMU iterator "need resched" checks into a helper function so they can be called from a different code path in a subsequent commit. No functional change intended. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240823235648.3236880-4-dmatlack@google.com [sean: rebase on a swapped order of checks] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ef44dcba1ecb..3bc486537b39 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -638,6 +638,16 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) +static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, + struct tdp_iter *iter) +{ + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + + /* Ensure forward progress has been made before yielding. */ + return iter->next_last_level_gfn != iter->yielded_gfn; +} + /* * Yield if the MMU lock is contended or this thread needs to return control * to the scheduler. @@ -658,11 +668,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, { KVM_MMU_WARN_ON(iter->yielded); - if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) - return false; - - /* Ensure forward progress has been made before yielding. */ - if (iter->next_last_level_gfn == iter->yielded_gfn) + if (!tdp_mmu_iter_need_resched(kvm, iter)) return false; if (flush) From 13e2e4f62a4bb1288688e7218818f6f655600028 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 23 Aug 2024 16:56:46 -0700 Subject: [PATCH 24/28] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping Recover TDP MMU huge page mappings in-place instead of zapping them when dirty logging is disabled, and rename the huge page recovery functions to move away from the "zap collapsible spte" terminology. Before KVM flushes TLBs, guest accesses may be translated through either the (stale) small SPTE or the (new) huge SPTE. This is already possible when KVM is doing eager page splitting (where TLB flushes are also batched), and when vCPUs are faulting in huge mappings (where TLBs are flushed after the new huge SPTE is installed).
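Per eligible non-leaf SPTE, the recovery flow added below boils down to (simplified; the yield, retry, and error paths are elided):

	/* Derive a huge SPTE from any leaf SPTE currently mapped under the parent... */
	r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);

	/* ...and atomically replace the non-leaf parent SPTE in-place. */
	if (!r && !tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
		flush = true;	/* TLB flushes remain batched, as before */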
Recovering huge pages reduces the number of page faults when dirty logging is disabled: $ perf stat -e kvm:kvm_page_fault -- ./dirty_log_perf_test -s anonymous_hugetlb_2mb -v 64 -e -b 4g Before: 393,599 kvm:kvm_page_fault After: 262,575 kvm:kvm_page_fault vCPU throughput and the latency of disabling dirty-logging are about equal compared to zapping, but avoiding faults can be beneficial to remove vCPU jitter in extreme scenarios. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240823235648.3236880-5-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/kvm/mmu/mmu.c | 6 ++-- arch/x86/kvm/mmu/spte.c | 36 +++++++++++++++++++++-- arch/x86/kvm/mmu/spte.h | 1 + arch/x86/kvm/mmu/tdp_mmu.c | 52 +++++++++++++++++++++++++++------ arch/x86/kvm/mmu/tdp_mmu.h | 4 +-- arch/x86/kvm/x86.c | 18 +++++------- 7 files changed, 91 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6d9f763a7bb9..aa8007bc520a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1955,8 +1955,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3c2322440d9c..33542ee4bbb1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6946,8 +6946,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6957,7 +6957,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index fd8c3c92ade0..4bc107dc6b45 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -262,15 +262,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + KVM_MMU_WARN_ON(set & clear); + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -278,6 +278,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. 
@@ -314,6 +324,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 4d2b700c26dd..3a4147345e03 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -506,6 +506,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool host_writable, u64 *new_spte); u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 3bc486537b39..2b5d30190557 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1552,15 +1552,43 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; bool flush = false; + u64 huge_spte; + int r; rcu_read_lock(); @@ -1597,7 +1625,13 @@ retry: if (max_mapping_level < iter.level) continue; - if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) + goto retry; + else if (r) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) goto retry; flush = true; @@ -1610,17 +1644,17 @@ retry: } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could - * be replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. 
*/ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d842bfe103ab..f03ca0dd13d9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -40,8 +40,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 83fe0a78146f..8507b300ff43 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13104,19 +13104,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. */ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page, From 430e264b76538d371ff4e2d0d20801fa11b10198 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 23 Aug 2024 16:56:47 -0700 Subject: [PATCH 25/28] KVM: x86/mmu: Rename make_huge_page_split_spte() to make_small_spte() Rename make_huge_page_split_spte() to make_small_spte(). This ensures that the usage of "small_spte" and "huge_spte" are consistent between make_huge_spte() and make_small_spte(). This should also reduce some confusion as make_huge_page_split_spte() almost reads like it will create a huge SPTE, when in fact it is creating a small SPTE to split the huge SPTE. No functional change intended. 
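After the rename, the splitting and recovery helpers read as a symmetric pair (signatures as declared in spte.h):

	/* Splitting: huge SPTE in, constituent small SPTE out. */
	u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
			    union kvm_mmu_page_role role, int index);

	/* Recovery: constituent small SPTE in, huge SPTE out. */
	u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);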
Suggested-by: Sean Christopherson Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240823235648.3236880-6-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/spte.c | 4 ++-- arch/x86/kvm/mmu/spte.h | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 33542ee4bbb1..b522b076c2ce 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6735,7 +6735,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, continue; } - spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4bc107dc6b45..22551e2f1d00 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -295,8 +295,8 @@ static u64 make_spte_nonexecutable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index) +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { u64 child_spte = huge_spte; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 3a4147345e03..f332b33bc817 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -504,8 +504,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index); +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 2b5d30190557..233570e2cd35 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1314,7 +1314,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); + sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level From 06c4cd957b5cfc8ce995474d3dc935cf89bcf454 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 23 Aug 2024 16:56:48 -0700 Subject: [PATCH 26/28] KVM: x86/mmu: WARN if huge page recovery triggered during dirty logging WARN and bail out of recover_huge_pages_range() if dirty logging is enabled. KVM shouldn't be recovering huge pages during dirty logging anyway, since KVM needs to track writes at 4KiB granularity. However, it's not outside the realm of possibility that this changes in the future. If KVM wants to recover huge pages during dirty logging, make_huge_spte() must be updated to write-protect the new huge page mapping. Otherwise, writes through the newly recovered huge page mapping will not be tracked.
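E.g. supporting that would require something along these lines (a purely hypothetical sketch, not part of this series):

	/*
	 * Hypothetical: if write-tracking were active, a recovered huge SPTE
	 * would need to start out write-protected so guest writes still fault.
	 */
	if (kvm_slot_dirty_track_enabled(slot))
		huge_spte &= ~PT_WRITABLE_MASK;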
Note that this potential risk did not exist back when KVM zapped to recover huge page mappings, since subsequent accesses would just be faulted in at PG_LEVEL_4K if dirty logging was enabled. Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20240823235648.3236880-7-dmatlack@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 233570e2cd35..4508d868f1cd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1590,6 +1590,9 @@ static void recover_huge_pages_range(struct kvm *kvm, u64 huge_spte; int r; + if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot))) + return; + rcu_read_lock(); for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { From fe140e611d3450708a962d937546c7bd164183ea Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Fri, 1 Nov 2024 13:14:37 -0700 Subject: [PATCH 27/28] KVM: x86/mmu: Remove KVM's MMU shrinker Remove KVM's MMU shrinker and (almost) all of its related code, as the current implementation is very disruptive to VMs (if it ever runs), without providing any meaningful benefit[1]. Alternatively, KVM could repurpose its shrinker, e.g. to reclaim pages from the per-vCPU caches[2], but given that no one has complained about lack of TDP MMU support for the shrinker in the 3+ years since the TDP MMU was enabled by default, it's safe to say that there is likely no real use case for initiating reclaim of KVM's page tables from the shrinker. And while clever/cute, reclaiming the per-vCPU caches doesn't scale the same way that reclaiming in-use page table pages does. E.g. the amount of memory being used by a VM doesn't always directly correlate with the number of vCPUs, and even when it does, reclaiming a few pages from per-vCPU caches likely won't make much of a dent in the VM's total memory usage, especially for VMs with huge amounts of memory. Lastly, if it turns out that there is a strong use case for dropping the per-vCPU caches, re-introducing the shrinker registration is trivial compared to the complexity of actually reclaiming pages from the caches. [1] https://lore.kernel.org/lkml/Y45dldZnI6OIf+a5@google.com [2] https://lore.kernel.org/kvm/20241004195540.210396-3-vipinsh@google.com Suggested-by: Sean Christopherson Suggested-by: David Matlack Signed-off-by: Vipin Sharma Link: https://lore.kernel.org/r/20241101201437.1604321-2-vipinsh@google.com [sean: keep zapped_obsolete_pages for now, massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 105 +---------------------------------------- 1 file changed, 2 insertions(+), 103 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b522b076c2ce..8919ca104374 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator { static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -1622,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) #endif } -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values.
We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, +1); + kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, -1); + kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } @@ -6492,11 +6479,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_tdp_mmu_zap_invalidated_roots(kvm); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; @@ -7112,72 +7094,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long mmu_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) -{ - struct kvm *kvm; - int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; - - mutex_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - - /* - * Never scan more than sc->nr_to_scan VM instances. - * Will not hit this condition practically since we do not try - * to shrink more than one VM and it is very unlikely to see - * !n_used_mmu_pages so many times. - */ - if (!nr_to_scan--) - break; - /* - * n_used_mmu_pages is accessed without holding kvm->mmu_lock - * here. We may skip a VM instance errorneosly, but we do not - * want to shrink a VM that only started to populate its MMU - * anyway. 
- */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) - continue; - - idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); - - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - - freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); - -unlock: - write_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - /* - * unfair on small ones - * per-vm shrinkers cry out - * sadness comes quickly - */ - list_move_tail(&kvm->vm_list, &vm_list); - break; - } - - mutex_unlock(&kvm_lock); - return freed; -} - -static unsigned long mmu_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) -{ - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} - -static struct shrinker *mmu_shrinker; - static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); @@ -7304,23 +7220,8 @@ int kvm_mmu_vendor_module_init(void) if (!mmu_page_header_cache) goto out; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto out; - - mmu_shrinker = shrinker_alloc(0, "x86-mmu"); - if (!mmu_shrinker) - goto out_shrinker; - - mmu_shrinker->count_objects = mmu_shrink_count; - mmu_shrinker->scan_objects = mmu_shrink_scan; - mmu_shrinker->seeks = DEFAULT_SEEKS * 10; - - shrinker_register(mmu_shrinker); - return 0; -out_shrinker: - percpu_counter_destroy(&kvm_total_used_mmu_pages); out: mmu_destroy_caches(); return ret; @@ -7337,8 +7238,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - shrinker_free(mmu_shrinker); } /* From 4cf20d42543cff8778f70b0c29def984098641a5 Mon Sep 17 00:00:00 2001 From: Vipin Sharma Date: Fri, 1 Nov 2024 13:14:37 -0700 Subject: [PATCH 28/28] KVM: x86/mmu: Drop per-VM zapped_obsolete_pages list Drop the per-VM zapped_obsolete_pages list now that the usage from the defunct mmu_shrinker is gone, and instead use a local list to track pages in kvm_zap_obsolete_pages(), the sole remaining user of zapped_obsolete_pages. Opportunistically add an assertion to verify and document that slots_lock must be held, i.e. that there can only be one active instance of kvm_zap_obsolete_pages() at any given time, and by doing so also prove that using a local list instead of a per-VM list doesn't change any functionality (beyond trivialities like list initialization). Signed-off-by: Vipin Sharma Link: https://lore.kernel.org/r/20241101201437.1604321-2-vipinsh@google.com [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/mmu.c | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa8007bc520a..433e65974e2b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1306,7 +1306,6 @@ struct kvm_arch { bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; /* * A list of kvm_mmu_page structs that, if zapped, could possibly be * replaced by an NX huge page. 
A shadow page is on this list if its diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8919ca104374..38128e7b9af1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6367,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + LIST_HEAD(invalid_list); bool unstable; + lockdep_assert_held(&kvm->slots_lock); + restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { @@ -6400,7 +6403,7 @@ restart: } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped); + &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) @@ -6416,7 +6419,7 @@ restart: * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -6483,7 +6486,6 @@ void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);