Patchwork [05/14] Revert "KVM: MMU: drop kvm_mmu_zap_mmio_sptes"

login
register
mail settings
Submitter Christopherson, Sean J
Date Jan. 10, 2019, 6:06 p.m.
Message ID <20190110180706.24974-6-sean.j.christopherson@intel.com>
Download mbox | patch
Permalink /patch/697163/
State New
Headers show

Comments

Christopherson, Sean J - Jan. 10, 2019, 6:06 p.m.
Revert back to a dedicated (and slower) mechanism for handling the
scenario where all MMIO shadow PTEs need to be zapped due to overflowing
the MMIO generation number.  The MMIO generation scenario is almost
literally a one-in-a-million occurrence, i.e. is not a performance
sensitive scenario.

Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user
of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing
the fast invalidate mechanism altogether.

This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331.

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 22 +++++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)
Paolo Bonzini - Jan. 30, 2019, 4:32 p.m.
On 10/01/19 19:06, Sean Christopherson wrote:
> The MMIO generation scenario is almost
> literally

Almost also because of this

         /*
          * Generations must be different for each address space.
          * Init kvm generation close to the maximum to easily test the
          * code of handling generation number wrap-around.
          */
         slots->generation = i * 2 - 150;

which probably we can remove.

Paolo

 a one-in-a-million occurrence, i.e. is not a performance
> sensitive scenario.
Paolo Bonzini - Jan. 30, 2019, 4:43 p.m.
On 10/01/19 19:06, Sean Christopherson wrote:
> Revert back to a dedicated (and slower) mechanism for handling the
> scenario where all MMIO shadow PTEs need to be zapped due to overflowing
> the MMIO generation number.  The MMIO generation scenario is almost
> literally a one-in-a-million occurrence, i.e. is not a performance
> sensitive scenario.
> 
> Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user
> of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing
> the fast invalidate mechanism altogether.
> 
> This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331.
> 
> Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu.c              | 22 +++++++++++++++++++++-
>  2 files changed, 22 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 31d05f0ef6d0..c2d49ad19a25 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -318,6 +318,7 @@ struct kvm_mmu_page {
>  	struct list_head link;
>  	struct hlist_node hash_link;
>  	bool unsync;
> +	bool mmio_cached;
>  
>  	/*
>  	 * The following two entries are used to key the shadow page in the
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index bb56beb166e4..df41193aba38 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -386,6 +386,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
>  	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
>  		<< shadow_nonpresent_or_rsvd_mask_len;
>  
> +	page_header(__pa(sptep))->mmio_cached = true;
> +
>  	trace_mark_mmio_spte(sptep, gfn, access, gen);
>  	mmu_spte_set(sptep, mask);
>  }
> @@ -5932,6 +5934,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
>  	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
>  }
>  
> +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
> +{
> +	struct kvm_mmu_page *sp, *node;
> +	LIST_HEAD(invalid_list);
> +
> +	spin_lock(&kvm->mmu_lock);
> +restart:
> +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> +		if (!sp->mmio_cached)
> +			continue;
> +		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> +			goto restart;

This is potentially quadratic.  If the "- 150" hack is removed, I'd
rather have a zap-all instead.

I understand it is a huge headache to squash the change at this point of
the series precisely, so it's okay to have a patch on top at the end.

Paolo

> +	}
> +
> +	kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
>  {
>  	/*
> @@ -5940,7 +5960,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
>  	 */
>  	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
>  		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
> -		kvm_mmu_invalidate_zap_all_pages(kvm);
> +		kvm_mmu_zap_mmio_sptes(kvm);
>  	}
>  }
>  
>
Christopherson, Sean J - Jan. 30, 2019, 6:10 p.m.
On Wed, Jan 30, 2019 at 05:32:52PM +0100, Paolo Bonzini wrote:
> On 10/01/19 19:06, Sean Christopherson wrote:
> > The MMIO generation scenario is almost
> > literally
> 
> Almost also because of this
> 
>          /*
>           * Generations must be different for each address space.
>           * Init kvm generation close to the maximum to easily test the
>           * code of handling generation number wrap-around.
>           */
>          slots->generation = i * 2 - 150;
> 
> which probably we can remove.

I completely forgot about that little gem.
Christopherson, Sean J - Feb. 1, 2019, 6:17 p.m.
On Wed, Jan 30, 2019 at 05:43:53PM +0100, Paolo Bonzini wrote:
> On 10/01/19 19:06, Sean Christopherson wrote:
> > Revert back to a dedicated (and slower) mechanism for handling the
> > scenario where all MMIO shadow PTEs need to be zapped due to overflowing
> > the MMIO generation number.  The MMIO generation scenario is almost
> > literally a one-in-a-million occurrence, i.e. is not a performance
> > sensitive scenario.
> > 
> > Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user
> > of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing
> > the fast invalidate mechanism altogether.
> > 
> > This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331.
> > 
> > Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > ---

[...]

> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index bb56beb166e4..df41193aba38 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -386,6 +386,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
> >  	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
> >  		<< shadow_nonpresent_or_rsvd_mask_len;
> >  
> > +	page_header(__pa(sptep))->mmio_cached = true;
> > +
> >  	trace_mark_mmio_spte(sptep, gfn, access, gen);
> >  	mmu_spte_set(sptep, mask);
> >  }
> > @@ -5932,6 +5934,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> >  	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> >  }
> >  
> > +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
> > +{
> > +	struct kvm_mmu_page *sp, *node;
> > +	LIST_HEAD(invalid_list);
> > +
> > +	spin_lock(&kvm->mmu_lock);
> > +restart:
> > +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> > +		if (!sp->mmio_cached)
> > +			continue;
> > +		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> > +			goto restart;
> 
> This is potentially quadratic.  If the "- 150" hack is removed, I'd
> rather have a zap-all instead.

Why is the walk restarted after zapping children?  Commit 0738541396be
("KVM: MMU: awareness of new kvm_mmu_zap_page behaviour") added the
restart and simply states:

   kvm_mmu_zap_page will soon zap the unsynced children of a page.
   Restart list walk in such case.

Which isn't exactly illuminating, but it does imply that the MMIO case
should never need to restart since MMIO sptes can't have children.

The "unsynced children" behavior was added by commit 4731d4c7a077 ("KVM:
MMU: out of sync shadow core"), and at that time kvm_mmu_zap_page() only
returned the number of zapped unsynced children.

Commit 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the
number of pages it actually freed") extended the return value to include
the page itself, which makes sense for the general case, but it means
restarting the walk on a non-zero return value is at least partially
wrong, if not completely unnecessary.

> I understand it is a huge headache to squash the change at this point of
> the series precisely, so it's okay to have a patch on top at the end.
> 
> Paolo
> 
> > +	}
> > +
> > +	kvm_mmu_commit_zap_page(kvm, &invalid_list);
> > +	spin_unlock(&kvm->mmu_lock);
> > +}
> > +
> >  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
> >  {
> >  	/*
> > @@ -5940,7 +5960,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
> >  	 */
> >  	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
> >  		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
> > -		kvm_mmu_invalidate_zap_all_pages(kvm);
> > +		kvm_mmu_zap_mmio_sptes(kvm);
> >  	}
> >  }
> >  
> > 
>
Christopherson, Sean J - Feb. 1, 2019, 6:27 p.m.
On Fri, Feb 01, 2019 at 10:17:39AM -0800, Sean Christopherson wrote:
> On Wed, Jan 30, 2019 at 05:43:53PM +0100, Paolo Bonzini wrote:
> > On 10/01/19 19:06, Sean Christopherson wrote:
> > > Revert back to a dedicated (and slower) mechanism for handling the
> > > scenario where all MMIO shadow PTEs need to be zapped due to overflowing
> > > the MMIO generation number.  The MMIO generation scenario is almost
> > > literally a one-in-a-million occurrence, i.e. is not a performance
> > > sensitive scenario.
> > > 
> > > Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user
> > > of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing
> > > the fast invalidate mechanism altogether.
> > > 
> > > This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331.
> > > 
> > > Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
> > > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > > ---
> 
> [...]
> 
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index bb56beb166e4..df41193aba38 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -386,6 +386,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
> > >  	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
> > >  		<< shadow_nonpresent_or_rsvd_mask_len;
> > >  
> > > +	page_header(__pa(sptep))->mmio_cached = true;
> > > +
> > >  	trace_mark_mmio_spte(sptep, gfn, access, gen);
> > >  	mmu_spte_set(sptep, mask);
> > >  }
> > > @@ -5932,6 +5934,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
> > >  	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
> > >  }
> > >  
> > > +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
> > > +{
> > > +	struct kvm_mmu_page *sp, *node;
> > > +	LIST_HEAD(invalid_list);
> > > +
> > > +	spin_lock(&kvm->mmu_lock);
> > > +restart:
> > > +	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
> > > +		if (!sp->mmio_cached)
> > > +			continue;
> > > +		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
> > > +			goto restart;
> > 
> > This is potentially quadratic.  If the "- 150" hack is removed, I'd
> > rather have a zap-all instead.
> 
> Why is the walk restarted after zapping children?  Commit 0738541396be
> ("KVM: MMU: awareness of new kvm_mmu_zap_page behaviour") added the
> restart and simply states:
> 
>    kvm_mmu_zap_page will soon zap the unsynced children of a page.
>    Restart list walk in such case.

Never mind, brain finally clicked and realized the next entry in the
list could be modified when zapping a child.

> 
> Which isn't exactly illuminating, but it does imply that the MMIO case
> should never need to restart since MMIO sptes can't have children.
> 
> The "unsynced children" behavior was added by commit 4731d4c7a077 ("KVM:
> MMU: out of sync shadow core"), and at that time kvm_mmu_zap_page() only
> returned the number of zapped unsynced children.
> 
> Commit 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the
> number of pages it actually freed") extended the return value to include
> the page itself, which makes sense for the general case, but it means
> restarting the walk on a non-zero return value is at least partially
> wrong, if not completely unnecessary.
> 
> > I understand it is a huge headache to squash the change at this point of
> > the series precisely, so it's okay to have a patch on top at the end.
> > 
> > Paolo
> > 
> > > +	}
> > > +
> > > +	kvm_mmu_commit_zap_page(kvm, &invalid_list);
> > > +	spin_unlock(&kvm->mmu_lock);
> > > +}
> > > +
> > >  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
> > >  {
> > >  	/*
> > > @@ -5940,7 +5960,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
> > >  	 */
> > >  	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
> > >  		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
> > > -		kvm_mmu_invalidate_zap_all_pages(kvm);
> > > +		kvm_mmu_zap_mmio_sptes(kvm);
> > >  	}
> > >  }
> > >  
> > > 
> >

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 31d05f0ef6d0..c2d49ad19a25 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -318,6 +318,7 @@  struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 	bool unsync;
+	bool mmio_cached;
 
 	/*
 	 * The following two entries are used to key the shadow page in the
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bb56beb166e4..df41193aba38 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -386,6 +386,8 @@  static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
 		<< shadow_nonpresent_or_rsvd_mask_len;
 
+	page_header(__pa(sptep))->mmio_cached = true;
+
 	trace_mark_mmio_spte(sptep, gfn, access, gen);
 	mmu_spte_set(sptep, mask);
 }
@@ -5932,6 +5934,24 @@  static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
+static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+
+	spin_lock(&kvm->mmu_lock);
+restart:
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+		if (!sp->mmio_cached)
+			continue;
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+			goto restart;
+	}
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 {
 	/*
@@ -5940,7 +5960,7 @@  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 	 */
 	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
-		kvm_mmu_invalidate_zap_all_pages(kvm);
+		kvm_mmu_zap_mmio_sptes(kvm);
 	}
 }