Patchwork [RFC,v8,6/7] KVM: Enables the kernel to isolate and report free pages

login
register
mail settings
Submitter Nitesh Narayan Lal
Date Feb. 4, 2019, 8:18 p.m.
Message ID <20190204201854.2328-7-nitesh@redhat.com>
Download mbox | patch
Permalink /patch/717843/
State New
Headers show

Comments

Nitesh Narayan Lal - Feb. 4, 2019, 8:18 p.m.
This patch enables the kernel to scan the per cpu array and
compress it by removing the repetitive/re-allocated pages.
Once the per cpu array is completely filled with pages in the
buddy it wakes up the kernel per cpu thread which re-scans the
entire per cpu array by acquiring a zone lock corresponding to
the page which is being scanned. If the page is still free and
present in the buddy it tries to isolate the page and adds it
to another per cpu array.

Once this scanning process is complete and if there are any
isolated pages added to the new per cpu array kernel thread
invokes hyperlist_ready().

In hyperlist_ready() a hypercall is made to report these pages to
the host using the virtio-balloon framework. In order to do so
another virtqueue 'hinting_vq' is added to the balloon framework.
As the host frees all the reported pages, the kernel thread returns
them back to the buddy.

Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
---
 drivers/virtio/virtio_balloon.c     |  56 +++++++-
 include/linux/page_hinting.h        |  18 ++-
 include/uapi/linux/virtio_balloon.h |   1 +
 mm/page_alloc.c                     |   2 +-
 virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
 5 files changed, 269 insertions(+), 10 deletions(-)
Michael S. Tsirkin - Feb. 5, 2019, 8:45 p.m.
On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> This patch enables the kernel to scan the per cpu array and
> compress it by removing the repetitive/re-allocated pages.
> Once the per cpu array is completely filled with pages in the
> buddy it wakes up the kernel per cpu thread which re-scans the
> entire per cpu array by acquiring a zone lock corresponding to
> the page which is being scanned. If the page is still free and
> present in the buddy it tries to isolate the page and adds it
> to another per cpu array.
> 
> Once this scanning process is complete and if there are any
> isolated pages added to the new per cpu array kernel thread
> invokes hyperlist_ready().
> 
> In hyperlist_ready() a hypercall is made to report these pages to
> the host using the virtio-balloon framework. In order to do so
> another virtqueue 'hinting_vq' is added to the balloon framework.
> As the host frees all the reported pages, the kernel thread returns
> them back to the buddy.
> 
> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>


This looks kind of like what early iterations of Wei's patches did.

But this has lots of issues, for example you might end up with
a hypercall per a 4K page.

So in the end, he switched over to just reporting only
MAX_ORDER - 1 pages.

Would that be a good idea for you too?

An alternative would be a different much lighter weight
way to report these pages and to free them on the host.

> ---
>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
>  include/linux/page_hinting.h        |  18 ++-
>  include/uapi/linux/virtio_balloon.h |   1 +
>  mm/page_alloc.c                     |   2 +-
>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
>  5 files changed, 269 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 728ecd1eea30..8af34e0b9a32 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
>  	VIRTIO_BALLOON_VQ_INFLATE,
>  	VIRTIO_BALLOON_VQ_DEFLATE,
>  	VIRTIO_BALLOON_VQ_STATS,
> +	VIRTIO_BALLOON_VQ_HINTING,
>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
>  	VIRTIO_BALLOON_VQ_MAX
>  };
>  
>  struct virtio_balloon {
>  	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> +								*hinting_vq;
>  
>  	/* Balloon's own wq for cpu-intensive work items */
>  	struct workqueue_struct *balloon_wq;
> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
>  	{ 0 },
>  };
>  
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> +			      int hyper_entries)
> +{
> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
> +
> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> +	virtqueue_kick_sync(vb->hinting_vq);
> +}
> +
> +static void hinting_ack(struct virtqueue *vq)
> +{
> +	struct virtio_balloon *vb = vq->vdev->priv;
> +
> +	wake_up(&vb->acked);
> +}
> +
> +static void enable_hinting(struct virtio_balloon *vb)
> +{
> +	guest_page_hinting_flag = 1;
> +	static_branch_enable(&guest_page_hinting_key);
> +	request_hypercall = (void *)&virtballoon_page_hinting;
> +	balloon_ptr = vb;
> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> +}
> +
> +static void disable_hinting(void)
> +{
> +	guest_page_hinting_flag = 0;
> +	static_branch_enable(&guest_page_hinting_key);
> +	balloon_ptr = NULL;
> +}
> +#endif
> +
>  static u32 page_to_balloon_pfn(struct page *page)
>  {
>  	unsigned long pfn = page_to_pfn(page);
> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
>  
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>  	}
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> +	}
>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>  					 vqs, callbacks, names, NULL, NULL);
>  	if (err)
>  		return err;
>  
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> +
>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  		if (err)
>  			goto out_del_balloon_wq;
>  	}
> +
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		enable_hinting(vb);
> +#endif
>  	virtio_device_ready(vdev);
>  
>  	if (towards_target(vb))
> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  	cancel_work_sync(&vb->update_balloon_size_work);
>  	cancel_work_sync(&vb->update_balloon_stats_work);
>  
> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> +		disable_hinting();
> +#endif
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>  		cancel_work_sync(&vb->report_free_page_work);
>  		destroy_workqueue(vb->balloon_wq);
> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
>  	VIRTIO_BALLOON_F_STATS_VQ,
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> +	VIRTIO_BALLOON_F_HINTING,
>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>  	VIRTIO_BALLOON_F_PAGE_POISON,
>  };
> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> index e800c6b07561..3ba8c1f3b4a4 100644
> --- a/include/linux/page_hinting.h
> +++ b/include/linux/page_hinting.h
> @@ -1,15 +1,12 @@
>  #include <linux/smpboot.h>
>  
> -/*
> - * Size of the array which is used to store the freed pages is defined by
> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> - * we can get rid of the hardcoded array size.
> - */
>  #define MAX_FGPT_ENTRIES	1000
>  /*
>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> - * @pfn: page frame number for the page which needs to be sent to the host.
> - * @order: order of the page needs to be reported to the host.
> + * @pfn - page frame number for the page which is to be freed.
> + * @pages - number of pages which are supposed to be freed.
> + * A global array object is used to to hold the list of pfn and pages and is
> + * passed as part of the hypercall.
>   */
>  struct hypervisor_pages {
>  	unsigned long pfn;
> @@ -19,11 +16,18 @@ struct hypervisor_pages {
>  extern int guest_page_hinting_flag;
>  extern struct static_key_false guest_page_hinting_key;
>  extern struct smp_hotplug_thread hinting_threads;
> +extern void (*request_hypercall)(void *, u64, int);
> +extern void *balloon_ptr;
>  extern bool want_page_poisoning;
>  
>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
>  void guest_free_page(struct page *page, int order);
> +extern int __isolate_free_page(struct page *page, unsigned int order);
> +extern void free_one_page(struct zone *zone,
> +			  struct page *page, unsigned long pfn,
> +			  unsigned int order,
> +			  int migratetype);
>  
>  static inline void disable_page_poisoning(void)
>  {

I guess you will want to put this in some other header.  Function
declarations belong close to where they are implemented, not used.

> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index a1966cd7b677..2b0f62814e22 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -36,6 +36,7 @@
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d295c9bc01a8..93224cba9243 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>  	spin_unlock(&zone->lock);
>  }
>  
> -static void free_one_page(struct zone *zone,
> +void free_one_page(struct zone *zone,
>  				struct page *page, unsigned long pfn,
>  				unsigned int order,
>  				int migratetype)
> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> index be529f6f2bc0..315099fcda43 100644
> --- a/virt/kvm/page_hinting.c
> +++ b/virt/kvm/page_hinting.c
> @@ -1,6 +1,8 @@
>  #include <linux/gfp.h>
>  #include <linux/mm.h>
> +#include <linux/page_ref.h>
>  #include <linux/kvm_host.h>
> +#include <linux/sort.h>
>  #include <linux/kernel.h>
>  
>  /*
> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
>  EXPORT_SYMBOL(guest_page_hinting_flag);
>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
>  
> +void (*request_hypercall)(void *, u64, int);
> +EXPORT_SYMBOL(request_hypercall);
> +void *balloon_ptr;
> +EXPORT_SYMBOL(balloon_ptr);
> +
>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  			      void __user *buffer, size_t *lenp,
>  			      loff_t *ppos)
> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>  	return ret;
>  }
>  
> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> +{
> +	int i = 0;
> +	int mt = 0;
> +
> +	if (balloon_ptr)
> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> +				  entries);
> +
> +	while (i < entries) {
> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> +
> +		mt = get_pageblock_migratetype(page);
> +		free_one_page(page_zone(page), page, page_to_pfn(page),
> +			      guest_isolated_pages[i].order, mt);
> +		i++;
> +	}
> +}
> +
> +struct page *get_buddy_page(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned int order;
> +
> +	for (order = 0; order < MAX_ORDER; order++) {
> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> +
> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
> +			return page_head;
> +	}
> +	return NULL;
> +}
> +
>  static void hinting_fn(unsigned int cpu)
>  {
>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +	int idx = 0, ret = 0;
> +	struct zone *zone_cur;
> +	unsigned long flags = 0;
> +
> +	while (idx < MAX_FGPT_ENTRIES) {
> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> +
> +		while (pfn <= pfn_end) {
> +			struct page *page = pfn_to_page(pfn);
> +			struct page *buddy_page = NULL;
> +
> +			zone_cur = page_zone(page);
> +			spin_lock_irqsave(&zone_cur->lock, flags);
> +
> +			if (PageCompound(page)) {
> +				struct page *head_page = compound_head(page);
> +				unsigned long head_pfn = page_to_pfn(head_page);
> +				unsigned int alloc_pages =
> +					1 << compound_order(head_page);
> +
> +				pfn = head_pfn + alloc_pages;
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			if (page_ref_count(page)) {
> +				pfn++;
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			if (PageBuddy(page)) {
> +				int buddy_order = page_private(page);
>  
> +				ret = __isolate_free_page(page, buddy_order);
> +				if (!ret) {
> +				} else {
> +					int l_idx = page_hinting_obj->hyp_idx;
> +					struct hypervisor_pages *l_obj =
> +					page_hinting_obj->hypervisor_pagelist;
> +
> +					l_obj[l_idx].pfn = pfn;
> +					l_obj[l_idx].order = buddy_order;
> +					page_hinting_obj->hyp_idx += 1;
> +				}
> +				pfn = pfn + (1 << buddy_order);
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +
> +			buddy_page = get_buddy_page(page);
> +			if (buddy_page) {
> +				int buddy_order = page_private(buddy_page);
> +
> +				ret = __isolate_free_page(buddy_page,
> +							  buddy_order);
> +				if (!ret) {
> +				} else {
> +					int l_idx = page_hinting_obj->hyp_idx;
> +					struct hypervisor_pages *l_obj =
> +					page_hinting_obj->hypervisor_pagelist;
> +					unsigned long buddy_pfn =
> +						page_to_pfn(buddy_page);
> +
> +					l_obj[l_idx].pfn = buddy_pfn;
> +					l_obj[l_idx].order = buddy_order;
> +					page_hinting_obj->hyp_idx += 1;
> +				}
> +				pfn = page_to_pfn(buddy_page) +
> +					(1 << buddy_order);
> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> +				continue;
> +			}
> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
> +			pfn++;
> +		}
> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
> +		page_hinting_obj->kvm_pt[idx].order = -1;
> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
> +		idx++;
> +	}
> +	if (page_hinting_obj->hyp_idx > 0) {
> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> +				page_hinting_obj->hyp_idx);
> +		page_hinting_obj->hyp_idx = 0;
> +	}
>  	page_hinting_obj->kvm_pt_idx = 0;
>  	put_cpu_var(hinting_obj);
>  }
>  
> +int if_exist(struct page *page)
> +{
> +	int i = 0;
> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> +			return 1;
> +		i++;
> +	}
> +	return 0;
> +}
> +
> +void pack_array(void)
> +{
> +	int i = 0, j = 0;
> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +
> +	while (i < MAX_FGPT_ENTRIES) {
> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> +			if (i != j) {
> +				page_hinting_obj->kvm_pt[j].pfn =
> +					page_hinting_obj->kvm_pt[i].pfn;
> +				page_hinting_obj->kvm_pt[j].order =
> +					page_hinting_obj->kvm_pt[i].order;
> +				page_hinting_obj->kvm_pt[j].zonenum =
> +					page_hinting_obj->kvm_pt[i].zonenum;
> +			}
> +			j++;
> +		}
> +		i++;
> +	}
> +	i = j;
> +	page_hinting_obj->kvm_pt_idx = j;
> +	while (j < MAX_FGPT_ENTRIES) {
> +		page_hinting_obj->kvm_pt[j].pfn = 0;
> +		page_hinting_obj->kvm_pt[j].order = -1;
> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
> +		j++;
> +	}
> +}
> +
>  void scan_array(void)
>  {
>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> +	int i = 0;
>  
> +	while (i < MAX_FGPT_ENTRIES) {
> +		struct page *page =
> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> +		struct page *buddy_page = get_buddy_page(page);
> +
> +		if (!PageBuddy(page) && buddy_page) {
> +			if (if_exist(buddy_page)) {
> +				page_hinting_obj->kvm_pt[i].pfn = 0;
> +				page_hinting_obj->kvm_pt[i].order = -1;
> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
> +			} else {
> +				page_hinting_obj->kvm_pt[i].pfn =
> +					page_to_pfn(buddy_page);
> +				page_hinting_obj->kvm_pt[i].order =
> +					page_private(buddy_page);
> +			}
> +		}
> +		i++;
> +	}
> +	pack_array();
>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>  		wake_up_process(__this_cpu_read(hinting_task));
>  }
> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
>  							order;
>  		page_hinting_obj->kvm_pt_idx += 1;
> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> +			/*
> +			 * We are depending on the buddy free-list to identify
> +			 * if a page is free or not. Hence, we are dumping all
> +			 * the per-cpu pages back into the buddy allocator. This
> +			 * will ensure less failures when we try to isolate free
> +			 * captured pages and hence more memory reporting to the
> +			 * host.
> +			 */
> +			drain_local_pages(NULL);
>  			scan_array();
> +		}
>  	}
>  	local_irq_restore(flags);
>  }
> -- 
> 2.17.2
Nitesh Narayan Lal - Feb. 5, 2019, 9:54 p.m.
On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
>> This patch enables the kernel to scan the per cpu array and
>> compress it by removing the repetitive/re-allocated pages.
>> Once the per cpu array is completely filled with pages in the
>> buddy it wakes up the kernel per cpu thread which re-scans the
>> entire per cpu array by acquiring a zone lock corresponding to
>> the page which is being scanned. If the page is still free and
>> present in the buddy it tries to isolate the page and adds it
>> to another per cpu array.
>>
>> Once this scanning process is complete and if there are any
>> isolated pages added to the new per cpu array kernel thread
>> invokes hyperlist_ready().
>>
>> In hyperlist_ready() a hypercall is made to report these pages to
>> the host using the virtio-balloon framework. In order to do so
>> another virtqueue 'hinting_vq' is added to the balloon framework.
>> As the host frees all the reported pages, the kernel thread returns
>> them back to the buddy.
>>
>> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
>
> This looks kind of like what early iterations of Wei's patches did.
>
> But this has lots of issues, for example you might end up with
> a hypercall per a 4K page.
> So in the end, he switched over to just reporting only
> MAX_ORDER - 1 pages.
You mean that I should only capture/attempt to isolate pages with order
MAX_ORDER - 1?
>
> Would that be a good idea for you too?
Will it help if we have a threshold value based on the amount of memory
captured instead of the number of entries/pages in the array?
>
> An alternative would be a different much lighter weight
> way to report these pages and to free them on the host.
>
>> ---
>>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
>>  include/linux/page_hinting.h        |  18 ++-
>>  include/uapi/linux/virtio_balloon.h |   1 +
>>  mm/page_alloc.c                     |   2 +-
>>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
>>  5 files changed, 269 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index 728ecd1eea30..8af34e0b9a32 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
>>  	VIRTIO_BALLOON_VQ_INFLATE,
>>  	VIRTIO_BALLOON_VQ_DEFLATE,
>>  	VIRTIO_BALLOON_VQ_STATS,
>> +	VIRTIO_BALLOON_VQ_HINTING,
>>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
>>  	VIRTIO_BALLOON_VQ_MAX
>>  };
>>  
>>  struct virtio_balloon {
>>  	struct virtio_device *vdev;
>> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
>> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
>> +								*hinting_vq;
>>  
>>  	/* Balloon's own wq for cpu-intensive work items */
>>  	struct workqueue_struct *balloon_wq;
>> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
>>  	{ 0 },
>>  };
>>  
>> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
>> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
>> +			      int hyper_entries)
>> +{
>> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
>> +
>> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
>> +	virtqueue_kick_sync(vb->hinting_vq);
>> +}
>> +
>> +static void hinting_ack(struct virtqueue *vq)
>> +{
>> +	struct virtio_balloon *vb = vq->vdev->priv;
>> +
>> +	wake_up(&vb->acked);
>> +}
>> +
>> +static void enable_hinting(struct virtio_balloon *vb)
>> +{
>> +	guest_page_hinting_flag = 1;
>> +	static_branch_enable(&guest_page_hinting_key);
>> +	request_hypercall = (void *)&virtballoon_page_hinting;
>> +	balloon_ptr = vb;
>> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
>> +}
>> +
>> +static void disable_hinting(void)
>> +{
>> +	guest_page_hinting_flag = 0;
>> +	static_branch_enable(&guest_page_hinting_key);
>> +	balloon_ptr = NULL;
>> +}
>> +#endif
>> +
>>  static u32 page_to_balloon_pfn(struct page *page)
>>  {
>>  	unsigned long pfn = page_to_pfn(page);
>> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
>>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
>>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
>>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
>>  
>>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
>> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
>>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
>>  	}
>>  
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
>> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
>> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
>> +	}
>>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
>>  					 vqs, callbacks, names, NULL, NULL);
>>  	if (err)
>>  		return err;
>>  
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
>> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
>> +
>>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
>>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
>>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
>> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
>>  		if (err)
>>  			goto out_del_balloon_wq;
>>  	}
>> +
>> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
>> +		enable_hinting(vb);
>> +#endif
>>  	virtio_device_ready(vdev);
>>  
>>  	if (towards_target(vb))
>> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
>>  	cancel_work_sync(&vb->update_balloon_size_work);
>>  	cancel_work_sync(&vb->update_balloon_stats_work);
>>  
>> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
>> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
>> +		disable_hinting();
>> +#endif
>>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>>  		cancel_work_sync(&vb->report_free_page_work);
>>  		destroy_workqueue(vb->balloon_wq);
>> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
>>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
>>  	VIRTIO_BALLOON_F_STATS_VQ,
>>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>> +	VIRTIO_BALLOON_F_HINTING,
>>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
>>  	VIRTIO_BALLOON_F_PAGE_POISON,
>>  };
>> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
>> index e800c6b07561..3ba8c1f3b4a4 100644
>> --- a/include/linux/page_hinting.h
>> +++ b/include/linux/page_hinting.h
>> @@ -1,15 +1,12 @@
>>  #include <linux/smpboot.h>
>>  
>> -/*
>> - * Size of the array which is used to store the freed pages is defined by
>> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
>> - * we can get rid of the hardcoded array size.
>> - */
>>  #define MAX_FGPT_ENTRIES	1000
>>  /*
>>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
>> - * @pfn: page frame number for the page which needs to be sent to the host.
>> - * @order: order of the page needs to be reported to the host.
>> + * @pfn - page frame number for the page which is to be freed.
>> + * @pages - number of pages which are supposed to be freed.
>> + * A global array object is used to to hold the list of pfn and pages and is
>> + * passed as part of the hypercall.
>>   */
>>  struct hypervisor_pages {
>>  	unsigned long pfn;
>> @@ -19,11 +16,18 @@ struct hypervisor_pages {
>>  extern int guest_page_hinting_flag;
>>  extern struct static_key_false guest_page_hinting_key;
>>  extern struct smp_hotplug_thread hinting_threads;
>> +extern void (*request_hypercall)(void *, u64, int);
>> +extern void *balloon_ptr;
>>  extern bool want_page_poisoning;
>>  
>>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
>>  void guest_free_page(struct page *page, int order);
>> +extern int __isolate_free_page(struct page *page, unsigned int order);
>> +extern void free_one_page(struct zone *zone,
>> +			  struct page *page, unsigned long pfn,
>> +			  unsigned int order,
>> +			  int migratetype);
>>  
>>  static inline void disable_page_poisoning(void)
>>  {
> I guess you will want to put this in some other header.  Function
> declarations belong close to where they are implemented, not used.
I will find a better place.
>
>> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
>> index a1966cd7b677..2b0f62814e22 100644
>> --- a/include/uapi/linux/virtio_balloon.h
>> +++ b/include/uapi/linux/virtio_balloon.h
>> @@ -36,6 +36,7 @@
>>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
>>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
>> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
>>  
>>  /* Size of a PFN in the balloon interface. */
>>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index d295c9bc01a8..93224cba9243 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>>  	spin_unlock(&zone->lock);
>>  }
>>  
>> -static void free_one_page(struct zone *zone,
>> +void free_one_page(struct zone *zone,
>>  				struct page *page, unsigned long pfn,
>>  				unsigned int order,
>>  				int migratetype)
>> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
>> index be529f6f2bc0..315099fcda43 100644
>> --- a/virt/kvm/page_hinting.c
>> +++ b/virt/kvm/page_hinting.c
>> @@ -1,6 +1,8 @@
>>  #include <linux/gfp.h>
>>  #include <linux/mm.h>
>> +#include <linux/page_ref.h>
>>  #include <linux/kvm_host.h>
>> +#include <linux/sort.h>
>>  #include <linux/kernel.h>
>>  
>>  /*
>> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
>>  EXPORT_SYMBOL(guest_page_hinting_flag);
>>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
>>  
>> +void (*request_hypercall)(void *, u64, int);
>> +EXPORT_SYMBOL(request_hypercall);
>> +void *balloon_ptr;
>> +EXPORT_SYMBOL(balloon_ptr);
>> +
>>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>>  			      void __user *buffer, size_t *lenp,
>>  			      loff_t *ppos)
>> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
>>  	return ret;
>>  }
>>  
>> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
>> +{
>> +	int i = 0;
>> +	int mt = 0;
>> +
>> +	if (balloon_ptr)
>> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
>> +				  entries);
>> +
>> +	while (i < entries) {
>> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
>> +
>> +		mt = get_pageblock_migratetype(page);
>> +		free_one_page(page_zone(page), page, page_to_pfn(page),
>> +			      guest_isolated_pages[i].order, mt);
>> +		i++;
>> +	}
>> +}
>> +
>> +struct page *get_buddy_page(struct page *page)
>> +{
>> +	unsigned long pfn = page_to_pfn(page);
>> +	unsigned int order;
>> +
>> +	for (order = 0; order < MAX_ORDER; order++) {
>> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
>> +
>> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
>> +			return page_head;
>> +	}
>> +	return NULL;
>> +}
>> +
>>  static void hinting_fn(unsigned int cpu)
>>  {
>>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +	int idx = 0, ret = 0;
>> +	struct zone *zone_cur;
>> +	unsigned long flags = 0;
>> +
>> +	while (idx < MAX_FGPT_ENTRIES) {
>> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
>> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
>> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
>> +
>> +		while (pfn <= pfn_end) {
>> +			struct page *page = pfn_to_page(pfn);
>> +			struct page *buddy_page = NULL;
>> +
>> +			zone_cur = page_zone(page);
>> +			spin_lock_irqsave(&zone_cur->lock, flags);
>> +
>> +			if (PageCompound(page)) {
>> +				struct page *head_page = compound_head(page);
>> +				unsigned long head_pfn = page_to_pfn(head_page);
>> +				unsigned int alloc_pages =
>> +					1 << compound_order(head_page);
>> +
>> +				pfn = head_pfn + alloc_pages;
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +
>> +			if (page_ref_count(page)) {
>> +				pfn++;
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +
>> +			if (PageBuddy(page)) {
>> +				int buddy_order = page_private(page);
>>  
>> +				ret = __isolate_free_page(page, buddy_order);
>> +				if (!ret) {
>> +				} else {
>> +					int l_idx = page_hinting_obj->hyp_idx;
>> +					struct hypervisor_pages *l_obj =
>> +					page_hinting_obj->hypervisor_pagelist;
>> +
>> +					l_obj[l_idx].pfn = pfn;
>> +					l_obj[l_idx].order = buddy_order;
>> +					page_hinting_obj->hyp_idx += 1;
>> +				}
>> +				pfn = pfn + (1 << buddy_order);
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +
>> +			buddy_page = get_buddy_page(page);
>> +			if (buddy_page) {
>> +				int buddy_order = page_private(buddy_page);
>> +
>> +				ret = __isolate_free_page(buddy_page,
>> +							  buddy_order);
>> +				if (!ret) {
>> +				} else {
>> +					int l_idx = page_hinting_obj->hyp_idx;
>> +					struct hypervisor_pages *l_obj =
>> +					page_hinting_obj->hypervisor_pagelist;
>> +					unsigned long buddy_pfn =
>> +						page_to_pfn(buddy_page);
>> +
>> +					l_obj[l_idx].pfn = buddy_pfn;
>> +					l_obj[l_idx].order = buddy_order;
>> +					page_hinting_obj->hyp_idx += 1;
>> +				}
>> +				pfn = page_to_pfn(buddy_page) +
>> +					(1 << buddy_order);
>> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +				continue;
>> +			}
>> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
>> +			pfn++;
>> +		}
>> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
>> +		page_hinting_obj->kvm_pt[idx].order = -1;
>> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
>> +		idx++;
>> +	}
>> +	if (page_hinting_obj->hyp_idx > 0) {
>> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
>> +				page_hinting_obj->hyp_idx);
>> +		page_hinting_obj->hyp_idx = 0;
>> +	}
>>  	page_hinting_obj->kvm_pt_idx = 0;
>>  	put_cpu_var(hinting_obj);
>>  }
>>  
>> +int if_exist(struct page *page)
>> +{
>> +	int i = 0;
>> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
>> +			return 1;
>> +		i++;
>> +	}
>> +	return 0;
>> +}
>> +
>> +void pack_array(void)
>> +{
>> +	int i = 0, j = 0;
>> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
>> +			if (i != j) {
>> +				page_hinting_obj->kvm_pt[j].pfn =
>> +					page_hinting_obj->kvm_pt[i].pfn;
>> +				page_hinting_obj->kvm_pt[j].order =
>> +					page_hinting_obj->kvm_pt[i].order;
>> +				page_hinting_obj->kvm_pt[j].zonenum =
>> +					page_hinting_obj->kvm_pt[i].zonenum;
>> +			}
>> +			j++;
>> +		}
>> +		i++;
>> +	}
>> +	i = j;
>> +	page_hinting_obj->kvm_pt_idx = j;
>> +	while (j < MAX_FGPT_ENTRIES) {
>> +		page_hinting_obj->kvm_pt[j].pfn = 0;
>> +		page_hinting_obj->kvm_pt[j].order = -1;
>> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
>> +		j++;
>> +	}
>> +}
>> +
>>  void scan_array(void)
>>  {
>>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
>> +	int i = 0;
>>  
>> +	while (i < MAX_FGPT_ENTRIES) {
>> +		struct page *page =
>> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
>> +		struct page *buddy_page = get_buddy_page(page);
>> +
>> +		if (!PageBuddy(page) && buddy_page) {
>> +			if (if_exist(buddy_page)) {
>> +				page_hinting_obj->kvm_pt[i].pfn = 0;
>> +				page_hinting_obj->kvm_pt[i].order = -1;
>> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
>> +			} else {
>> +				page_hinting_obj->kvm_pt[i].pfn =
>> +					page_to_pfn(buddy_page);
>> +				page_hinting_obj->kvm_pt[i].order =
>> +					page_private(buddy_page);
>> +			}
>> +		}
>> +		i++;
>> +	}
>> +	pack_array();
>>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>>  		wake_up_process(__this_cpu_read(hinting_task));
>>  }
>> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
>>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
>>  							order;
>>  		page_hinting_obj->kvm_pt_idx += 1;
>> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
>> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
>> +			/*
>> +			 * We are depending on the buddy free-list to identify
>> +			 * if a page is free or not. Hence, we are dumping all
>> +			 * the per-cpu pages back into the buddy allocator. This
>> +			 * will ensure less failures when we try to isolate free
>> +			 * captured pages and hence more memory reporting to the
>> +			 * host.
>> +			 */
>> +			drain_local_pages(NULL);
>>  			scan_array();
>> +		}
>>  	}
>>  	local_irq_restore(flags);
>>  }
>> -- 
>> 2.17.2
Michael S. Tsirkin - Feb. 5, 2019, 9:55 p.m.
On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> 
> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> >> This patch enables the kernel to scan the per cpu array and
> >> compress it by removing the repetitive/re-allocated pages.
> >> Once the per cpu array is completely filled with pages in the
> >> buddy it wakes up the kernel per cpu thread which re-scans the
> >> entire per cpu array by acquiring a zone lock corresponding to
> >> the page which is being scanned. If the page is still free and
> >> present in the buddy it tries to isolate the page and adds it
> >> to another per cpu array.
> >>
> >> Once this scanning process is complete and if there are any
> >> isolated pages added to the new per cpu array kernel thread
> >> invokes hyperlist_ready().
> >>
> >> In hyperlist_ready() a hypercall is made to report these pages to
> >> the host using the virtio-balloon framework. In order to do so
> >> another virtqueue 'hinting_vq' is added to the balloon framework.
> >> As the host frees all the reported pages, the kernel thread returns
> >> them back to the buddy.
> >>
> >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> >
> > This looks kind of like what early iterations of Wei's patches did.
> >
> > But this has lots of issues, for example you might end up with
> > a hypercall per a 4K page.
> > So in the end, he switched over to just reporting only
> > MAX_ORDER - 1 pages.
> You mean that I should only capture/attempt to isolate pages with order
> MAX_ORDER - 1?
> >
> > Would that be a good idea for you too?
> Will it help if we have a threshold value based on the amount of memory
> captured instead of the number of entries/pages in the array?

This is what Wei's patches do at least.

> >
> > An alternative would be a different much lighter weight
> > way to report these pages and to free them on the host.
> >
> >> ---
> >>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
> >>  include/linux/page_hinting.h        |  18 ++-
> >>  include/uapi/linux/virtio_balloon.h |   1 +
> >>  mm/page_alloc.c                     |   2 +-
> >>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
> >>  5 files changed, 269 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> >> index 728ecd1eea30..8af34e0b9a32 100644
> >> --- a/drivers/virtio/virtio_balloon.c
> >> +++ b/drivers/virtio/virtio_balloon.c
> >> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
> >>  	VIRTIO_BALLOON_VQ_INFLATE,
> >>  	VIRTIO_BALLOON_VQ_DEFLATE,
> >>  	VIRTIO_BALLOON_VQ_STATS,
> >> +	VIRTIO_BALLOON_VQ_HINTING,
> >>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> >>  	VIRTIO_BALLOON_VQ_MAX
> >>  };
> >>  
> >>  struct virtio_balloon {
> >>  	struct virtio_device *vdev;
> >> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> >> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> >> +								*hinting_vq;
> >>  
> >>  	/* Balloon's own wq for cpu-intensive work items */
> >>  	struct workqueue_struct *balloon_wq;
> >> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
> >>  	{ 0 },
> >>  };
> >>  
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> >> +			      int hyper_entries)
> >> +{
> >> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
> >> +
> >> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> >> +	virtqueue_kick_sync(vb->hinting_vq);
> >> +}
> >> +
> >> +static void hinting_ack(struct virtqueue *vq)
> >> +{
> >> +	struct virtio_balloon *vb = vq->vdev->priv;
> >> +
> >> +	wake_up(&vb->acked);
> >> +}
> >> +
> >> +static void enable_hinting(struct virtio_balloon *vb)
> >> +{
> >> +	guest_page_hinting_flag = 1;
> >> +	static_branch_enable(&guest_page_hinting_key);
> >> +	request_hypercall = (void *)&virtballoon_page_hinting;
> >> +	balloon_ptr = vb;
> >> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> >> +}
> >> +
> >> +static void disable_hinting(void)
> >> +{
> >> +	guest_page_hinting_flag = 0;
> >> +	static_branch_enable(&guest_page_hinting_key);
> >> +	balloon_ptr = NULL;
> >> +}
> >> +#endif
> >> +
> >>  static u32 page_to_balloon_pfn(struct page *page)
> >>  {
> >>  	unsigned long pfn = page_to_pfn(page);
> >> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
> >>  
> >>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> >> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
> >>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >>  	}
> >>  
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> >> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> >> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> >> +	}
> >>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >>  					 vqs, callbacks, names, NULL, NULL);
> >>  	if (err)
> >>  		return err;
> >>  
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> >> +
> >>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >>  		if (err)
> >>  			goto out_del_balloon_wq;
> >>  	}
> >> +
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		enable_hinting(vb);
> >> +#endif
> >>  	virtio_device_ready(vdev);
> >>  
> >>  	if (towards_target(vb))
> >> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >>  	cancel_work_sync(&vb->update_balloon_size_work);
> >>  	cancel_work_sync(&vb->update_balloon_stats_work);
> >>  
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		disable_hinting();
> >> +#endif
> >>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> >>  		cancel_work_sync(&vb->report_free_page_work);
> >>  		destroy_workqueue(vb->balloon_wq);
> >> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
> >>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
> >>  	VIRTIO_BALLOON_F_STATS_VQ,
> >>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >> +	VIRTIO_BALLOON_F_HINTING,
> >>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >>  	VIRTIO_BALLOON_F_PAGE_POISON,
> >>  };
> >> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> >> index e800c6b07561..3ba8c1f3b4a4 100644
> >> --- a/include/linux/page_hinting.h
> >> +++ b/include/linux/page_hinting.h
> >> @@ -1,15 +1,12 @@
> >>  #include <linux/smpboot.h>
> >>  
> >> -/*
> >> - * Size of the array which is used to store the freed pages is defined by
> >> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> >> - * we can get rid of the hardcoded array size.
> >> - */
> >>  #define MAX_FGPT_ENTRIES	1000
> >>  /*
> >>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> >> - * @pfn: page frame number for the page which needs to be sent to the host.
> >> - * @order: order of the page needs to be reported to the host.
> >> + * @pfn - page frame number for the page which is to be freed.
> >> + * @pages - number of pages which are supposed to be freed.
> >> + * A global array object is used to to hold the list of pfn and pages and is
> >> + * passed as part of the hypercall.
> >>   */
> >>  struct hypervisor_pages {
> >>  	unsigned long pfn;
> >> @@ -19,11 +16,18 @@ struct hypervisor_pages {
> >>  extern int guest_page_hinting_flag;
> >>  extern struct static_key_false guest_page_hinting_key;
> >>  extern struct smp_hotplug_thread hinting_threads;
> >> +extern void (*request_hypercall)(void *, u64, int);
> >> +extern void *balloon_ptr;
> >>  extern bool want_page_poisoning;
> >>  
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
> >>  void guest_free_page(struct page *page, int order);
> >> +extern int __isolate_free_page(struct page *page, unsigned int order);
> >> +extern void free_one_page(struct zone *zone,
> >> +			  struct page *page, unsigned long pfn,
> >> +			  unsigned int order,
> >> +			  int migratetype);
> >>  
> >>  static inline void disable_page_poisoning(void)
> >>  {
> > I guess you will want to put this in some other header.  Function
> > declarations belong close to where they are implemented, not used.
> I will find a better place.
> >
> >> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> >> index a1966cd7b677..2b0f62814e22 100644
> >> --- a/include/uapi/linux/virtio_balloon.h
> >> +++ b/include/uapi/linux/virtio_balloon.h
> >> @@ -36,6 +36,7 @@
> >>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> >>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
> >>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> >> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
> >>  
> >>  /* Size of a PFN in the balloon interface. */
> >>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index d295c9bc01a8..93224cba9243 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
> >>  	spin_unlock(&zone->lock);
> >>  }
> >>  
> >> -static void free_one_page(struct zone *zone,
> >> +void free_one_page(struct zone *zone,
> >>  				struct page *page, unsigned long pfn,
> >>  				unsigned int order,
> >>  				int migratetype)
> >> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> >> index be529f6f2bc0..315099fcda43 100644
> >> --- a/virt/kvm/page_hinting.c
> >> +++ b/virt/kvm/page_hinting.c
> >> @@ -1,6 +1,8 @@
> >>  #include <linux/gfp.h>
> >>  #include <linux/mm.h>
> >> +#include <linux/page_ref.h>
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/sort.h>
> >>  #include <linux/kernel.h>
> >>  
> >>  /*
> >> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
> >>  EXPORT_SYMBOL(guest_page_hinting_flag);
> >>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
> >>  
> >> +void (*request_hypercall)(void *, u64, int);
> >> +EXPORT_SYMBOL(request_hypercall);
> >> +void *balloon_ptr;
> >> +EXPORT_SYMBOL(balloon_ptr);
> >> +
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  			      void __user *buffer, size_t *lenp,
> >>  			      loff_t *ppos)
> >> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  	return ret;
> >>  }
> >>  
> >> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> >> +{
> >> +	int i = 0;
> >> +	int mt = 0;
> >> +
> >> +	if (balloon_ptr)
> >> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> >> +				  entries);
> >> +
> >> +	while (i < entries) {
> >> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> >> +
> >> +		mt = get_pageblock_migratetype(page);
> >> +		free_one_page(page_zone(page), page, page_to_pfn(page),
> >> +			      guest_isolated_pages[i].order, mt);
> >> +		i++;
> >> +	}
> >> +}
> >> +
> >> +struct page *get_buddy_page(struct page *page)
> >> +{
> >> +	unsigned long pfn = page_to_pfn(page);
> >> +	unsigned int order;
> >> +
> >> +	for (order = 0; order < MAX_ORDER; order++) {
> >> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> >> +
> >> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
> >> +			return page_head;
> >> +	}
> >> +	return NULL;
> >> +}
> >> +
> >>  static void hinting_fn(unsigned int cpu)
> >>  {
> >>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +	int idx = 0, ret = 0;
> >> +	struct zone *zone_cur;
> >> +	unsigned long flags = 0;
> >> +
> >> +	while (idx < MAX_FGPT_ENTRIES) {
> >> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> >> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> >> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> >> +
> >> +		while (pfn <= pfn_end) {
> >> +			struct page *page = pfn_to_page(pfn);
> >> +			struct page *buddy_page = NULL;
> >> +
> >> +			zone_cur = page_zone(page);
> >> +			spin_lock_irqsave(&zone_cur->lock, flags);
> >> +
> >> +			if (PageCompound(page)) {
> >> +				struct page *head_page = compound_head(page);
> >> +				unsigned long head_pfn = page_to_pfn(head_page);
> >> +				unsigned int alloc_pages =
> >> +					1 << compound_order(head_page);
> >> +
> >> +				pfn = head_pfn + alloc_pages;
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			if (page_ref_count(page)) {
> >> +				pfn++;
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			if (PageBuddy(page)) {
> >> +				int buddy_order = page_private(page);
> >>  
> >> +				ret = __isolate_free_page(page, buddy_order);
> >> +				if (!ret) {
> >> +				} else {
> >> +					int l_idx = page_hinting_obj->hyp_idx;
> >> +					struct hypervisor_pages *l_obj =
> >> +					page_hinting_obj->hypervisor_pagelist;
> >> +
> >> +					l_obj[l_idx].pfn = pfn;
> >> +					l_obj[l_idx].order = buddy_order;
> >> +					page_hinting_obj->hyp_idx += 1;
> >> +				}
> >> +				pfn = pfn + (1 << buddy_order);
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			buddy_page = get_buddy_page(page);
> >> +			if (buddy_page) {
> >> +				int buddy_order = page_private(buddy_page);
> >> +
> >> +				ret = __isolate_free_page(buddy_page,
> >> +							  buddy_order);
> >> +				if (!ret) {
> >> +				} else {
> >> +					int l_idx = page_hinting_obj->hyp_idx;
> >> +					struct hypervisor_pages *l_obj =
> >> +					page_hinting_obj->hypervisor_pagelist;
> >> +					unsigned long buddy_pfn =
> >> +						page_to_pfn(buddy_page);
> >> +
> >> +					l_obj[l_idx].pfn = buddy_pfn;
> >> +					l_obj[l_idx].order = buddy_order;
> >> +					page_hinting_obj->hyp_idx += 1;
> >> +				}
> >> +				pfn = page_to_pfn(buddy_page) +
> >> +					(1 << buddy_order);
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +			pfn++;
> >> +		}
> >> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
> >> +		page_hinting_obj->kvm_pt[idx].order = -1;
> >> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
> >> +		idx++;
> >> +	}
> >> +	if (page_hinting_obj->hyp_idx > 0) {
> >> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> >> +				page_hinting_obj->hyp_idx);
> >> +		page_hinting_obj->hyp_idx = 0;
> >> +	}
> >>  	page_hinting_obj->kvm_pt_idx = 0;
> >>  	put_cpu_var(hinting_obj);
> >>  }
> >>  
> >> +int if_exist(struct page *page)
> >> +{
> >> +	int i = 0;
> >> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> >> +			return 1;
> >> +		i++;
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +void pack_array(void)
> >> +{
> >> +	int i = 0, j = 0;
> >> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> >> +			if (i != j) {
> >> +				page_hinting_obj->kvm_pt[j].pfn =
> >> +					page_hinting_obj->kvm_pt[i].pfn;
> >> +				page_hinting_obj->kvm_pt[j].order =
> >> +					page_hinting_obj->kvm_pt[i].order;
> >> +				page_hinting_obj->kvm_pt[j].zonenum =
> >> +					page_hinting_obj->kvm_pt[i].zonenum;
> >> +			}
> >> +			j++;
> >> +		}
> >> +		i++;
> >> +	}
> >> +	i = j;
> >> +	page_hinting_obj->kvm_pt_idx = j;
> >> +	while (j < MAX_FGPT_ENTRIES) {
> >> +		page_hinting_obj->kvm_pt[j].pfn = 0;
> >> +		page_hinting_obj->kvm_pt[j].order = -1;
> >> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
> >> +		j++;
> >> +	}
> >> +}
> >> +
> >>  void scan_array(void)
> >>  {
> >>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +	int i = 0;
> >>  
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		struct page *page =
> >> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> >> +		struct page *buddy_page = get_buddy_page(page);
> >> +
> >> +		if (!PageBuddy(page) && buddy_page) {
> >> +			if (if_exist(buddy_page)) {
> >> +				page_hinting_obj->kvm_pt[i].pfn = 0;
> >> +				page_hinting_obj->kvm_pt[i].order = -1;
> >> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
> >> +			} else {
> >> +				page_hinting_obj->kvm_pt[i].pfn =
> >> +					page_to_pfn(buddy_page);
> >> +				page_hinting_obj->kvm_pt[i].order =
> >> +					page_private(buddy_page);
> >> +			}
> >> +		}
> >> +		i++;
> >> +	}
> >> +	pack_array();
> >>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >>  		wake_up_process(__this_cpu_read(hinting_task));
> >>  }
> >> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
> >>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
> >>  							order;
> >>  		page_hinting_obj->kvm_pt_idx += 1;
> >> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> >> +			/*
> >> +			 * We are depending on the buddy free-list to identify
> >> +			 * if a page is free or not. Hence, we are dumping all
> >> +			 * the per-cpu pages back into the buddy allocator. This
> >> +			 * will ensure less failures when we try to isolate free
> >> +			 * captured pages and hence more memory reporting to the
> >> +			 * host.
> >> +			 */
> >> +			drain_local_pages(NULL);
> >>  			scan_array();
> >> +		}
> >>  	}
> >>  	local_irq_restore(flags);
> >>  }
> >> -- 
> >> 2.17.2
> -- 
> Regards
> Nitesh
>
Alexander Duyck - Feb. 7, 2019, 5:43 p.m.
On Tue, Feb 5, 2019 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> >
> > On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> > >> This patch enables the kernel to scan the per cpu array and
> > >> compress it by removing the repetitive/re-allocated pages.
> > >> Once the per cpu array is completely filled with pages in the
> > >> buddy it wakes up the kernel per cpu thread which re-scans the
> > >> entire per cpu array by acquiring a zone lock corresponding to
> > >> the page which is being scanned. If the page is still free and
> > >> present in the buddy it tries to isolate the page and adds it
> > >> to another per cpu array.
> > >>
> > >> Once this scanning process is complete and if there are any
> > >> isolated pages added to the new per cpu array kernel thread
> > >> invokes hyperlist_ready().
> > >>
> > >> In hyperlist_ready() a hypercall is made to report these pages to
> > >> the host using the virtio-balloon framework. In order to do so
> > >> another virtqueue 'hinting_vq' is added to the balloon framework.
> > >> As the host frees all the reported pages, the kernel thread returns
> > >> them back to the buddy.
> > >>
> > >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> > >
> > > This looks kind of like what early iterations of Wei's patches did.
> > >
> > > But this has lots of issues, for example you might end up with
> > > a hypercall per a 4K page.
> > > So in the end, he switched over to just reporting only
> > > MAX_ORDER - 1 pages.
> > You mean that I should only capture/attempt to isolate pages with order
> > MAX_ORDER - 1?
> > >
> > > Would that be a good idea for you too?
> > Will it help if we have a threshold value based on the amount of memory
> > captured instead of the number of entries/pages in the array?
>
> This is what Wei's patches do at least.

So in the solution I had posted I was looking more at
HUGETLB_PAGE_ORDER and above as the size of pages to provide the hints
on [1]. The advantage to doing that is that you can also avoid
fragmenting huge pages which in turn can cause what looks like a
memory leak as the memory subsystem attempts to reassemble huge
pages[2]. In my mind a 2MB page makes good sense in terms of the size
of things to be performing hints on as anything smaller than that is
going to just end up being a bunch of extra work and end up causing a
bunch of fragmentation.

The only issue with limiting things on an arbitrary boundary like that
is that you have to hook into the buddy allocator to catch the cases
where a page has been merged up into that range.

[1] https://lkml.org/lkml/2019/2/4/903
[2] https://blog.digitalocean.com/transparent-huge-pages-and-alternative-memory-allocators/
Michael S. Tsirkin - Feb. 7, 2019, 7:01 p.m.
On Thu, Feb 07, 2019 at 09:43:44AM -0800, Alexander Duyck wrote:
> On Tue, Feb 5, 2019 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> > >
> > > On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > > > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> > > >> This patch enables the kernel to scan the per cpu array and
> > > >> compress it by removing the repetitive/re-allocated pages.
> > > >> Once the per cpu array is completely filled with pages in the
> > > >> buddy it wakes up the kernel per cpu thread which re-scans the
> > > >> entire per cpu array by acquiring a zone lock corresponding to
> > > >> the page which is being scanned. If the page is still free and
> > > >> present in the buddy it tries to isolate the page and adds it
> > > >> to another per cpu array.
> > > >>
> > > >> Once this scanning process is complete and if there are any
> > > >> isolated pages added to the new per cpu array kernel thread
> > > >> invokes hyperlist_ready().
> > > >>
> > > >> In hyperlist_ready() a hypercall is made to report these pages to
> > > >> the host using the virtio-balloon framework. In order to do so
> > > >> another virtqueue 'hinting_vq' is added to the balloon framework.
> > > >> As the host frees all the reported pages, the kernel thread returns
> > > >> them back to the buddy.
> > > >>
> > > >> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> > > >
> > > > This looks kind of like what early iterations of Wei's patches did.
> > > >
> > > > But this has lots of issues, for example you might end up with
> > > > a hypercall per a 4K page.
> > > > So in the end, he switched over to just reporting only
> > > > MAX_ORDER - 1 pages.
> > > You mean that I should only capture/attempt to isolate pages with order
> > > MAX_ORDER - 1?
> > > >
> > > > Would that be a good idea for you too?
> > > Will it help if we have a threshold value based on the amount of memory
> > > captured instead of the number of entries/pages in the array?
> >
> > This is what Wei's patches do at least.
> 
> So in the solution I had posted I was looking more at
> HUGETLB_PAGE_ORDER and above as the size of pages to provide the hints
> on [1]. The advantage to doing that is that you can also avoid
> fragmenting huge pages which in turn can cause what looks like a
> memory leak as the memory subsystem attempts to reassemble huge
> pages[2]. In my mind a 2MB page makes good sense in terms of the size
> of things to be performing hints on as anything smaller than that is
> going to just end up being a bunch of extra work and end up causing a
> bunch of fragmentation.

Yes MAX_ORDER-1 is 4M. So not a lot of difference on x86.

The idea behind keying off MAX_ORDER is that CPU hugepages isn't
the only reason to avoid fragmentation, there's other
hardware that benefits from linear physical addresses.
And there are weird platforms where HUGETLB_PAGE_ORDER exceeds
MAX_ORDER - 1. So from that POV keying it off MAX_ORDER
makes more sense.


> The only issue with limiting things on an arbitrary boundary like that
> is that you have to hook into the buddy allocator to catch the cases
> where a page has been merged up into that range.
> 
> [1] https://lkml.org/lkml/2019/2/4/903
> [2] https://blog.digitalocean.com/transparent-huge-pages-and-alternative-memory-allocators/
Nitesh Narayan Lal - Feb. 7, 2019, 8:50 p.m.
On 2/7/19 12:43 PM, Alexander Duyck wrote:
> On Tue, Feb 5, 2019 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
>>> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
>>>> On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
>>>>> This patch enables the kernel to scan the per cpu array and
>>>>> compress it by removing the repetitive/re-allocated pages.
>>>>> Once the per cpu array is completely filled with pages in the
>>>>> buddy it wakes up the kernel per cpu thread which re-scans the
>>>>> entire per cpu array by acquiring a zone lock corresponding to
>>>>> the page which is being scanned. If the page is still free and
>>>>> present in the buddy it tries to isolate the page and adds it
>>>>> to another per cpu array.
>>>>>
>>>>> Once this scanning process is complete and if there are any
>>>>> isolated pages added to the new per cpu array kernel thread
>>>>> invokes hyperlist_ready().
>>>>>
>>>>> In hyperlist_ready() a hypercall is made to report these pages to
>>>>> the host using the virtio-balloon framework. In order to do so
>>>>> another virtqueue 'hinting_vq' is added to the balloon framework.
>>>>> As the host frees all the reported pages, the kernel thread returns
>>>>> them back to the buddy.
>>>>>
>>>>> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
>>>> This looks kind of like what early iterations of Wei's patches did.
>>>>
>>>> But this has lots of issues, for example you might end up with
>>>> a hypercall per a 4K page.
>>>> So in the end, he switched over to just reporting only
>>>> MAX_ORDER - 1 pages.
>>> You mean that I should only capture/attempt to isolate pages with order
>>> MAX_ORDER - 1?
>>>> Would that be a good idea for you too?
>>> Will it help if we have a threshold value based on the amount of memory
>>> captured instead of the number of entries/pages in the array?
>> This is what Wei's patches do at least.
> So in the solution I had posted I was looking more at
> HUGETLB_PAGE_ORDER and above as the size of pages to provide the hints
> on [1]. The advantage to doing that is that you can also avoid
> fragmenting huge pages which in turn can cause what looks like a
> memory leak as the memory subsystem attempts to reassemble huge
> pages[2]. In my mind a 2MB page makes good sense in terms of the size
> of things to be performing hints on as anything smaller than that is
> going to just end up being a bunch of extra work and end up causing a
> bunch of fragmentation.
As per my opinion, in any implementation which page size to store before
reporting depends on the allocation pattern of the workload running in
the guest.

I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
However I am still thinking about a workload which I can use to test its
effectiveness.

>
> The only issue with limiting things on an arbitrary boundary like that
> is that you have to hook into the buddy allocator to catch the cases
> where a page has been merged up into that range.
I don't think, I understood your comment completely. In any case, we
have to rely on the buddy for merging the pages.
>
> [1] https://lkml.org/lkml/2019/2/4/903
> [2] https://blog.digitalocean.com/transparent-huge-pages-and-alternative-memory-allocators/
Alexander Duyck - Feb. 8, 2019, 5:58 p.m.
On Thu, Feb 7, 2019 at 12:50 PM Nitesh Narayan Lal <nitesh@redhat.com> wrote:
>
>
> On 2/7/19 12:43 PM, Alexander Duyck wrote:
> > On Tue, Feb 5, 2019 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >> On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> >>> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> >>>> On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> >>>>> This patch enables the kernel to scan the per cpu array and
> >>>>> compress it by removing the repetitive/re-allocated pages.
> >>>>> Once the per cpu array is completely filled with pages in the
> >>>>> buddy it wakes up the kernel per cpu thread which re-scans the
> >>>>> entire per cpu array by acquiring a zone lock corresponding to
> >>>>> the page which is being scanned. If the page is still free and
> >>>>> present in the buddy it tries to isolate the page and adds it
> >>>>> to another per cpu array.
> >>>>>
> >>>>> Once this scanning process is complete and if there are any
> >>>>> isolated pages added to the new per cpu array kernel thread
> >>>>> invokes hyperlist_ready().
> >>>>>
> >>>>> In hyperlist_ready() a hypercall is made to report these pages to
> >>>>> the host using the virtio-balloon framework. In order to do so
> >>>>> another virtqueue 'hinting_vq' is added to the balloon framework.
> >>>>> As the host frees all the reported pages, the kernel thread returns
> >>>>> them back to the buddy.
> >>>>>
> >>>>> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> >>>> This looks kind of like what early iterations of Wei's patches did.
> >>>>
> >>>> But this has lots of issues, for example you might end up with
> >>>> a hypercall per a 4K page.
> >>>> So in the end, he switched over to just reporting only
> >>>> MAX_ORDER - 1 pages.
> >>> You mean that I should only capture/attempt to isolate pages with order
> >>> MAX_ORDER - 1?
> >>>> Would that be a good idea for you too?
> >>> Will it help if we have a threshold value based on the amount of memory
> >>> captured instead of the number of entries/pages in the array?
> >> This is what Wei's patches do at least.
> > So in the solution I had posted I was looking more at
> > HUGETLB_PAGE_ORDER and above as the size of pages to provide the hints
> > on [1]. The advantage to doing that is that you can also avoid
> > fragmenting huge pages which in turn can cause what looks like a
> > memory leak as the memory subsystem attempts to reassemble huge
> > pages[2]. In my mind a 2MB page makes good sense in terms of the size
> > of things to be performing hints on as anything smaller than that is
> > going to just end up being a bunch of extra work and end up causing a
> > bunch of fragmentation.
> As per my opinion, in any implementation which page size to store before
> reporting depends on the allocation pattern of the workload running in
> the guest.

I suggest you take a look at item 2 that I had called out in the
previous email. There are known issues with providing hints smaller
than THP using MADV_DONTNEED or MADV_FREE. Specifically what will
happen is that you end up breaking up a higher order transparent huge
page, backfilling a few holes with other pages, but then the memory
allocation subsystem attempts to reassemble the larger THP page
resulting in an application exhibiting behavior similar to a memory
leak while not actually allocating memory since it is sitting on
fragments of THP pages.

Also while I am thinking of it I haven't noticed anywhere that you are
handling the case of a device assigned to the guest. That seems like a
spot where we are going to have to stop hinting as well aren't we?
Otherwise we would need to redo the memory mapping of the guest in the
IOMMU every time a page is evicted and replaced.

> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
> However I am still thinking about a workload which I can use to test its
> effectiveness.

You might want to look at doing something like min(MAX_ORDER - 1,
HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
THP which is the most likely to be used page size with the guest.

> >
> > The only issue with limiting things on an arbitrary boundary like that
> > is that you have to hook into the buddy allocator to catch the cases
> > where a page has been merged up into that range.
> I don't think, I understood your comment completely. In any case, we
> have to rely on the buddy for merging the pages.
> >
> > [1] https://lkml.org/lkml/2019/2/4/903
> > [2] https://blog.digitalocean.com/transparent-huge-pages-and-alternative-memory-allocators/
> --
> Regards
> Nitesh
>
Nitesh Narayan Lal - Feb. 8, 2019, 8:41 p.m.
On 2/8/19 12:58 PM, Alexander Duyck wrote:
> On Thu, Feb 7, 2019 at 12:50 PM Nitesh Narayan Lal <nitesh@redhat.com> wrote:
>>
>> On 2/7/19 12:43 PM, Alexander Duyck wrote:
>>> On Tue, Feb 5, 2019 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
>>>>> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
>>>>>> On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
>>>>>>> This patch enables the kernel to scan the per cpu array and
>>>>>>> compress it by removing the repetitive/re-allocated pages.
>>>>>>> Once the per cpu array is completely filled with pages in the
>>>>>>> buddy it wakes up the kernel per cpu thread which re-scans the
>>>>>>> entire per cpu array by acquiring a zone lock corresponding to
>>>>>>> the page which is being scanned. If the page is still free and
>>>>>>> present in the buddy it tries to isolate the page and adds it
>>>>>>> to another per cpu array.
>>>>>>>
>>>>>>> Once this scanning process is complete and if there are any
>>>>>>> isolated pages added to the new per cpu array kernel thread
>>>>>>> invokes hyperlist_ready().
>>>>>>>
>>>>>>> In hyperlist_ready() a hypercall is made to report these pages to
>>>>>>> the host using the virtio-balloon framework. In order to do so
>>>>>>> another virtqueue 'hinting_vq' is added to the balloon framework.
>>>>>>> As the host frees all the reported pages, the kernel thread returns
>>>>>>> them back to the buddy.
>>>>>>>
>>>>>>> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
>>>>>> This looks kind of like what early iterations of Wei's patches did.
>>>>>>
>>>>>> But this has lots of issues, for example you might end up with
>>>>>> a hypercall per a 4K page.
>>>>>> So in the end, he switched over to just reporting only
>>>>>> MAX_ORDER - 1 pages.
>>>>> You mean that I should only capture/attempt to isolate pages with order
>>>>> MAX_ORDER - 1?
>>>>>> Would that be a good idea for you too?
>>>>> Will it help if we have a threshold value based on the amount of memory
>>>>> captured instead of the number of entries/pages in the array?
>>>> This is what Wei's patches do at least.
>>> So in the solution I had posted I was looking more at
>>> HUGETLB_PAGE_ORDER and above as the size of pages to provide the hints
>>> on [1]. The advantage to doing that is that you can also avoid
>>> fragmenting huge pages which in turn can cause what looks like a
>>> memory leak as the memory subsystem attempts to reassemble huge
>>> pages[2]. In my mind a 2MB page makes good sense in terms of the size
>>> of things to be performing hints on as anything smaller than that is
>>> going to just end up being a bunch of extra work and end up causing a
>>> bunch of fragmentation.
>> As per my opinion, in any implementation which page size to store before
>> reporting depends on the allocation pattern of the workload running in
>> the guest.
> I suggest you take a look at item 2 that I had called out in the
> previous email. There are known issues with providing hints smaller
> than THP using MADV_DONTNEED or MADV_FREE. Specifically what will
> happen is that you end up breaking up a higher order transparent huge
> page, backfilling a few holes with other pages, but then the memory
> allocation subsystem attempts to reassemble the larger THP page
> resulting in an application exhibiting behavior similar to a memory
> leak while not actually allocating memory since it is sitting on
> fragments of THP pages.
I will look into this.
>
> Also while I am thinking of it I haven't noticed anywhere that you are
> handling the case of a device assigned to the guest. That seems like a
> spot where we are going to have to stop hinting as well aren't we?
> Otherwise we would need to redo the memory mapping of the guest in the
> IOMMU every time a page is evicted and replaced.
 I haven't explored such a use-case as of now but will definitely
explore it.
>
>> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
>> However I am still thinking about a workload which I can use to test its
>> effectiveness.
> You might want to look at doing something like min(MAX_ORDER - 1,
> HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
> THP which is the most likely to be used page size with the guest.
Sure, thanks for the suggestion.
>
>>> The only issue with limiting things on an arbitrary boundary like that
>>> is that you have to hook into the buddy allocator to catch the cases
>>> where a page has been merged up into that range.
>> I don't think, I understood your comment completely. In any case, we
>> have to rely on the buddy for merging the pages.
>>> [1] https://lkml.org/lkml/2019/2/4/903
>>> [2] https://blog.digitalocean.com/transparent-huge-pages-and-alternative-memory-allocators/
>> --
>> Regards
>> Nitesh
>>
Michael S. Tsirkin - Feb. 8, 2019, 9:35 p.m.
On Fri, Feb 08, 2019 at 09:58:47AM -0800, Alexander Duyck wrote:
> On Thu, Feb 7, 2019 at 12:50 PM Nitesh Narayan Lal <nitesh@redhat.com> wrote:
> >
> >
> > On 2/7/19 12:43 PM, Alexander Duyck wrote:
> > > On Tue, Feb 5, 2019 at 3:21 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >> On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> > >>> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > >>>> On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> > >>>>> This patch enables the kernel to scan the per cpu array and
> > >>>>> compress it by removing the repetitive/re-allocated pages.
> > >>>>> Once the per cpu array is completely filled with pages in the
> > >>>>> buddy it wakes up the kernel per cpu thread which re-scans the
> > >>>>> entire per cpu array by acquiring a zone lock corresponding to
> > >>>>> the page which is being scanned. If the page is still free and
> > >>>>> present in the buddy it tries to isolate the page and adds it
> > >>>>> to another per cpu array.
> > >>>>>
> > >>>>> Once this scanning process is complete and if there are any
> > >>>>> isolated pages added to the new per cpu array kernel thread
> > >>>>> invokes hyperlist_ready().
> > >>>>>
> > >>>>> In hyperlist_ready() a hypercall is made to report these pages to
> > >>>>> the host using the virtio-balloon framework. In order to do so
> > >>>>> another virtqueue 'hinting_vq' is added to the balloon framework.
> > >>>>> As the host frees all the reported pages, the kernel thread returns
> > >>>>> them back to the buddy.
> > >>>>>
> > >>>>> Signed-off-by: Nitesh Narayan Lal <nitesh@redhat.com>
> > >>>> This looks kind of like what early iterations of Wei's patches did.
> > >>>>
> > >>>> But this has lots of issues, for example you might end up with
> > >>>> a hypercall per a 4K page.
> > >>>> So in the end, he switched over to just reporting only
> > >>>> MAX_ORDER - 1 pages.
> > >>> You mean that I should only capture/attempt to isolate pages with order
> > >>> MAX_ORDER - 1?
> > >>>> Would that be a good idea for you too?
> > >>> Will it help if we have a threshold value based on the amount of memory
> > >>> captured instead of the number of entries/pages in the array?
> > >> This is what Wei's patches do at least.
> > > So in the solution I had posted I was looking more at
> > > HUGETLB_PAGE_ORDER and above as the size of pages to provide the hints
> > > on [1]. The advantage to doing that is that you can also avoid
> > > fragmenting huge pages which in turn can cause what looks like a
> > > memory leak as the memory subsystem attempts to reassemble huge
> > > pages[2]. In my mind a 2MB page makes good sense in terms of the size
> > > of things to be performing hints on as anything smaller than that is
> > > going to just end up being a bunch of extra work and end up causing a
> > > bunch of fragmentation.
> > As per my opinion, in any implementation which page size to store before
> > reporting depends on the allocation pattern of the workload running in
> > the guest.
> 
> I suggest you take a look at item 2 that I had called out in the
> previous email. There are known issues with providing hints smaller
> than THP using MADV_DONTNEED or MADV_FREE. Specifically what will
> happen is that you end up breaking up a higher order transparent huge
> page, backfilling a few holes with other pages, but then the memory
> allocation subsystem attempts to reassemble the larger THP page
> resulting in an application exhibiting behavior similar to a memory
> leak while not actually allocating memory since it is sitting on
> fragments of THP pages.
> 
> Also while I am thinking of it I haven't noticed anywhere that you are
> handling the case of a device assigned to the guest. That seems like a
> spot where we are going to have to stop hinting as well aren't we?

That would be easy for the host to do, way easier than for the guest.

> Otherwise we would need to redo the memory mapping of the guest in the
> IOMMU every time a page is evicted and replaced.

I think that in fact we could in theory make it work.


The reason is that while Linux IOMMU APIs do not allow
this, in fact you can change a mapping just for a single
page within a huge mapping while others are used, as follows:

- create a new set of PTEs
- copy over all PTE mappings except the one
  we are changing
- change the required mapping in the new entry
- atomically update the PMD to point at new PTEs
- flush IOMMU translation cache

similarly for higher levels if there are no PTEs.

So we could come up with something like
        int (*remap)(struct iommu_domain *domain, unsigned long iova,
                   phys_addr_t paddr, size_t size, int prot);

that just tweaks a mapping for a specified range without
breaking others.



> > I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
> > However I am still thinking about a workload which I can use to test its
> > effectiveness.
> 
> You might want to look at doing something like min(MAX_ORDER - 1,
> HUGETLB_PAGE_ORDER).
> I know for x86 a 2MB page is the upper limit for
> THP which is the most likely to be used page size with the guest.

Did you mean max?

I just feel that a good order has much more to do with how
the buddy allocators works than with hardware.

And maybe TRT is to completely disable hinting for when
HUGETLB_PAGE_ORDER > MAX_ORDER since clearly using
buddy allocator for hinting when that breaks huge pages
isn't a good idea.


> > >
> > > The only issue with limiting things on an arbitrary boundary like that
> > > is that you have to hook into the buddy allocator to catch the cases
> > > where a page has been merged up into that range.
> > I don't think, I understood your comment completely. In any case, we
> > have to rely on the buddy for merging the pages.
> > >
> > > [1] https://lkml.org/lkml/2019/2/4/903
> > > [2] https://blog.digitalocean.com/transparent-huge-pages-and-alternative-memory-allocators/
> > --
> > Regards
> > Nitesh
> >
Michael S. Tsirkin - Feb. 8, 2019, 9:38 p.m.
On Fri, Feb 08, 2019 at 03:41:55PM -0500, Nitesh Narayan Lal wrote:
> >> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
> >> However I am still thinking about a workload which I can use to test its
> >> effectiveness.
> > You might want to look at doing something like min(MAX_ORDER - 1,
> > HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
> > THP which is the most likely to be used page size with the guest.
> Sure, thanks for the suggestion.

Given current hinting in balloon is MAX_ORDER I'd say
share code. If you feel a need to adjust down the road,
adjust both of them with actual testing showing gains.
Alexander Duyck - Feb. 8, 2019, 10:05 p.m.
On Fri, Feb 8, 2019 at 1:38 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Feb 08, 2019 at 03:41:55PM -0500, Nitesh Narayan Lal wrote:
> > >> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
> > >> However I am still thinking about a workload which I can use to test its
> > >> effectiveness.
> > > You might want to look at doing something like min(MAX_ORDER - 1,
> > > HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
> > > THP which is the most likely to be used page size with the guest.
> > Sure, thanks for the suggestion.
>
> Given current hinting in balloon is MAX_ORDER I'd say
> share code. If you feel a need to adjust down the road,
> adjust both of them with actual testing showing gains.

Actually I'm left kind of wondering why we are even going through
virtio-balloon for this? It seems like this would make much more sense
as core functionality of KVM itself for the specific architectures
rather than some side thing. In addition this could end up being
redundant when you start getting into either the s390 or PowerPC
architectures as they already have means of providing unused page
hints.

I have a set of patches I proposed that add similar functionality via
a KVM hypercall for x86 instead of doing it as a part of a Virtio
device[1].  I'm suspecting the overhead of doing things this way is
much less then having to make multiple madvise system calls from QEMU
back into the kernel.

One other concern that has been pointed out with my patchset that
would likely need to be addressed here as well is what do we do about
other hypervisors that decide to implement page hinting. We probably
should look at making this KVM/QEMU specific code run through the
paravirtual infrastructure instead of trying into the x86 arch code
directly.

[1] https://lkml.org/lkml/2019/2/4/903
Michael S. Tsirkin - Feb. 10, 2019, 12:38 a.m.
On Fri, Feb 08, 2019 at 02:05:09PM -0800, Alexander Duyck wrote:
> On Fri, Feb 8, 2019 at 1:38 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Feb 08, 2019 at 03:41:55PM -0500, Nitesh Narayan Lal wrote:
> > > >> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
> > > >> However I am still thinking about a workload which I can use to test its
> > > >> effectiveness.
> > > > You might want to look at doing something like min(MAX_ORDER - 1,
> > > > HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
> > > > THP which is the most likely to be used page size with the guest.
> > > Sure, thanks for the suggestion.
> >
> > Given current hinting in balloon is MAX_ORDER I'd say
> > share code. If you feel a need to adjust down the road,
> > adjust both of them with actual testing showing gains.
> 
> Actually I'm left kind of wondering why we are even going through
> virtio-balloon for this?

Just look at what does it do.

It improves memory overcommit if guests are cooperative, and it does
this by giving the hypervisor addresses of pages which it can discard.

It's just *exactly* like the balloon with all the same limitations.

> It seems like this would make much more sense
> as core functionality of KVM itself for the specific architectures
> rather than some side thing.

Well same as balloon: whether it's useful to you at all
would very much depend on your workloads.

This kind of cooperative functionality is good for co-located
single-tenant VMs. That's pretty niche.  The core things in KVM
generally don't trust guests.


> In addition this could end up being
> redundant when you start getting into either the s390 or PowerPC
> architectures as they already have means of providing unused page
> hints.

Interesting. Is there host support in kvm?


> I have a set of patches I proposed that add similar functionality via
> a KVM hypercall for x86 instead of doing it as a part of a Virtio
> device[1].  I'm suspecting the overhead of doing things this way is
> much less then having to make multiple madvise system calls from QEMU
> back into the kernel.

Well whether it's a virtio device is orthogonal to whether it's an
madvise call, right? You can build vhost-pagehint and that can
handle requests in a VQ within balloon and do it
within host kernel directly.

virtio rings let you pass multiple pages so it's really hard to
say which will win outright - maybe it's more important
to coalesce exits.

Nitesh, how about trying same tests and reporting performance?


> One other concern that has been pointed out with my patchset that
> would likely need to be addressed here as well is what do we do about
> other hypervisors that decide to implement page hinting. We probably
> should look at making this KVM/QEMU specific code run through the
> paravirtual infrastructure instead of trying into the x86 arch code
> directly.
> 
> [1] https://lkml.org/lkml/2019/2/4/903


So virtio is a paravirtual interface, that's an argument for
using it then.

In any case pls copy the Cc'd crowd on future version of your patches.
David Hildenbrand - Feb. 11, 2019, 9:28 a.m.
On 10.02.19 01:38, Michael S. Tsirkin wrote:
> On Fri, Feb 08, 2019 at 02:05:09PM -0800, Alexander Duyck wrote:
>> On Fri, Feb 8, 2019 at 1:38 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>>
>>> On Fri, Feb 08, 2019 at 03:41:55PM -0500, Nitesh Narayan Lal wrote:
>>>>>> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
>>>>>> However I am still thinking about a workload which I can use to test its
>>>>>> effectiveness.
>>>>> You might want to look at doing something like min(MAX_ORDER - 1,
>>>>> HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
>>>>> THP which is the most likely to be used page size with the guest.
>>>> Sure, thanks for the suggestion.
>>>
>>> Given current hinting in balloon is MAX_ORDER I'd say
>>> share code. If you feel a need to adjust down the road,
>>> adjust both of them with actual testing showing gains.
>>
>> Actually I'm left kind of wondering why we are even going through
>> virtio-balloon for this?
> 
> Just look at what does it do.
> 
> It improves memory overcommit if guests are cooperative, and it does
> this by giving the hypervisor addresses of pages which it can discard.
> 
> It's just *exactly* like the balloon with all the same limitations.

I agree, this belongs to virtio-balloon *unless* we run into real
problems implementing it via an asynchronous mechanism.

> 
>> It seems like this would make much more sense
>> as core functionality of KVM itself for the specific architectures
>> rather than some side thing.

Whatever can be handled in user space and does not have significant
performance impacts should be handled in user space. If we run into real
problems with that approach, fair enough. (e.g. vcpu yielding is a good
example where an implementation in KVM makes sense, not going via QEMU)

> 
> Well same as balloon: whether it's useful to you at all
> would very much depend on your workloads.
> 
> This kind of cooperative functionality is good for co-located
> single-tenant VMs. That's pretty niche.  The core things in KVM
> generally don't trust guests.
> 
> 
>> In addition this could end up being
>> redundant when you start getting into either the s390 or PowerPC
>> architectures as they already have means of providing unused page
>> hints.

I'd like to note that on s390x the functionality is not provided when
running nested guests. And there are real problems getting it ever
supported. (see description below how it works on s390x, the issue for
nested guests are the bits in the guest -> host page tables we cannot
support for nested guests).

Hinting only works for guests running one level under LPAR (with a
recent machine), but not nested guests.

(LPAR -> KVM1 works, LPAR - KVM1 -> KVM2 foes not work for the latter)

So an implementation for s390 would still make sense for this scenario.

> 
> Interesting. Is there host support in kvm?

On s390x there is. It works on page granularity and synchronization
between guest/host ("don't drop a page in the host while the guest is
reusing it") is done via special bits in the host->guest page table.
Instructions in the guest are able to modify these bits. A guest can
configure a "usage state" of it's backed PTEs. E.g. "unused" or "stable".

Whenever a page in the guest is freed/reused, the ESSA instruction is
triggered in the guest. It will modify the page table bits and add the
guest phyical pfn to a buffer in the host. Once that buffer is full,
ESSA will trigger an intercept to the hypervisor. Here, all these
"unused" pages can be zapped.

Also, when swapping a page out in the hypervisor, if it was masked by
the guest as unused or logically zero, instead of swapping out the page,
it can simply be dropped and a fresh zero page can be supplied when the
guest tries to access it.

"ESSA" is implemented in KVM in arch/s390/kvm/priv.c:handle_essa().

So on s390x, it works because the synchronization with the hypervisor is
directly built into hw vitualization support (guest->host page tables +
instruction) and ESSA will not intercept on every call (due to the buffer).
> 
> 
>> I have a set of patches I proposed that add similar functionality via
>> a KVM hypercall for x86 instead of doing it as a part of a Virtio
>> device[1].  I'm suspecting the overhead of doing things this way is
>> much less then having to make multiple madvise system calls from QEMU
>> back into the kernel.
> 
> Well whether it's a virtio device is orthogonal to whether it's an
> madvise call, right? You can build vhost-pagehint and that can
> handle requests in a VQ within balloon and do it
> within host kernel directly.
> 
> virtio rings let you pass multiple pages so it's really hard to
> say which will win outright - maybe it's more important
> to coalesce exits.

We don't know until we measure it.
Michael S. Tsirkin - Feb. 12, 2019, 5:16 a.m.
On Mon, Feb 11, 2019 at 10:28:31AM +0100, David Hildenbrand wrote:
> On 10.02.19 01:38, Michael S. Tsirkin wrote:
> > On Fri, Feb 08, 2019 at 02:05:09PM -0800, Alexander Duyck wrote:
> >> On Fri, Feb 8, 2019 at 1:38 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>>
> >>> On Fri, Feb 08, 2019 at 03:41:55PM -0500, Nitesh Narayan Lal wrote:
> >>>>>> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
> >>>>>> However I am still thinking about a workload which I can use to test its
> >>>>>> effectiveness.
> >>>>> You might want to look at doing something like min(MAX_ORDER - 1,
> >>>>> HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
> >>>>> THP which is the most likely to be used page size with the guest.
> >>>> Sure, thanks for the suggestion.
> >>>
> >>> Given current hinting in balloon is MAX_ORDER I'd say
> >>> share code. If you feel a need to adjust down the road,
> >>> adjust both of them with actual testing showing gains.
> >>
> >> Actually I'm left kind of wondering why we are even going through
> >> virtio-balloon for this?
> > 
> > Just look at what does it do.
> > 
> > It improves memory overcommit if guests are cooperative, and it does
> > this by giving the hypervisor addresses of pages which it can discard.
> > 
> > It's just *exactly* like the balloon with all the same limitations.
> 
> I agree, this belongs to virtio-balloon *unless* we run into real
> problems implementing it via an asynchronous mechanism.
> 
> > 
> >> It seems like this would make much more sense
> >> as core functionality of KVM itself for the specific architectures
> >> rather than some side thing.
> 
> Whatever can be handled in user space and does not have significant
> performance impacts should be handled in user space. If we run into real
> problems with that approach, fair enough. (e.g. vcpu yielding is a good
> example where an implementation in KVM makes sense, not going via QEMU)

Just to note, if we wanted to we could add a special kind of VQ where
e.g. kick yields the VCPU. You don't necessarily need a hypercall for
this. A virtio-cpu, yay!


> > 
> > Well same as balloon: whether it's useful to you at all
> > would very much depend on your workloads.
> > 
> > This kind of cooperative functionality is good for co-located
> > single-tenant VMs. That's pretty niche.  The core things in KVM
> > generally don't trust guests.
> > 
> > 
> >> In addition this could end up being
> >> redundant when you start getting into either the s390 or PowerPC
> >> architectures as they already have means of providing unused page
> >> hints.
> 
> I'd like to note that on s390x the functionality is not provided when
> running nested guests. And there are real problems getting it ever
> supported. (see description below how it works on s390x, the issue for
> nested guests are the bits in the guest -> host page tables we cannot
> support for nested guests).
> 
> Hinting only works for guests running one level under LPAR (with a
> recent machine), but not nested guests.
> 
> (LPAR -> KVM1 works, LPAR - KVM1 -> KVM2 foes not work for the latter)
> 
> So an implementation for s390 would still make sense for this scenario.
> 
> > 
> > Interesting. Is there host support in kvm?
> 
> On s390x there is. It works on page granularity and synchronization
> between guest/host ("don't drop a page in the host while the guest is
> reusing it") is done via special bits in the host->guest page table.
> Instructions in the guest are able to modify these bits. A guest can
> configure a "usage state" of it's backed PTEs. E.g. "unused" or "stable".
> 
> Whenever a page in the guest is freed/reused, the ESSA instruction is
> triggered in the guest. It will modify the page table bits and add the
> guest phyical pfn to a buffer in the host. Once that buffer is full,
> ESSA will trigger an intercept to the hypervisor. Here, all these
> "unused" pages can be zapped.
> 
> Also, when swapping a page out in the hypervisor, if it was masked by
> the guest as unused or logically zero, instead of swapping out the page,
> it can simply be dropped and a fresh zero page can be supplied when the
> guest tries to access it.
> 
> "ESSA" is implemented in KVM in arch/s390/kvm/priv.c:handle_essa().
> 
> So on s390x, it works because the synchronization with the hypervisor is
> directly built into hw vitualization support (guest->host page tables +
> instruction) and ESSA will not intercept on every call (due to the buffer).
> > 
> > 
> >> I have a set of patches I proposed that add similar functionality via
> >> a KVM hypercall for x86 instead of doing it as a part of a Virtio
> >> device[1].  I'm suspecting the overhead of doing things this way is
> >> much less then having to make multiple madvise system calls from QEMU
> >> back into the kernel.
> > 
> > Well whether it's a virtio device is orthogonal to whether it's an
> > madvise call, right? You can build vhost-pagehint and that can
> > handle requests in a VQ within balloon and do it
> > within host kernel directly.
> > 
> > virtio rings let you pass multiple pages so it's really hard to
> > say which will win outright - maybe it's more important
> > to coalesce exits.
> 
> We don't know until we measure it.

So to measure, I think we can start with traces that show how often do
specific workloads allocate/free pages of specific size.  We don't
necessarily need hypercall/host support.  We might want "mm: Add merge
page notifier" so we can count merges.

> -- 
> 
> Thanks,
> 
> David / dhildenb
Nitesh Narayan Lal - Feb. 12, 2019, 5:10 p.m.
On 2/9/19 7:38 PM, Michael S. Tsirkin wrote:
> On Fri, Feb 08, 2019 at 02:05:09PM -0800, Alexander Duyck wrote:
>> On Fri, Feb 8, 2019 at 1:38 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Fri, Feb 08, 2019 at 03:41:55PM -0500, Nitesh Narayan Lal wrote:
>>>>>> I am also planning to try Michael's suggestion of using MAX_ORDER - 1.
>>>>>> However I am still thinking about a workload which I can use to test its
>>>>>> effectiveness.
>>>>> You might want to look at doing something like min(MAX_ORDER - 1,
>>>>> HUGETLB_PAGE_ORDER). I know for x86 a 2MB page is the upper limit for
>>>>> THP which is the most likely to be used page size with the guest.
>>>> Sure, thanks for the suggestion.
>>> Given current hinting in balloon is MAX_ORDER I'd say
>>> share code. If you feel a need to adjust down the road,
>>> adjust both of them with actual testing showing gains.
>> Actually I'm left kind of wondering why we are even going through
>> virtio-balloon for this?
> Just look at what does it do.
>
> It improves memory overcommit if guests are cooperative, and it does
> this by giving the hypervisor addresses of pages which it can discard.
>
> It's just *exactly* like the balloon with all the same limitations.
>
>> It seems like this would make much more sense
>> as core functionality of KVM itself for the specific architectures
>> rather than some side thing.
> Well same as balloon: whether it's useful to you at all
> would very much depend on your workloads.
>
> This kind of cooperative functionality is good for co-located
> single-tenant VMs. That's pretty niche.  The core things in KVM
> generally don't trust guests.
>
>
>> In addition this could end up being
>> redundant when you start getting into either the s390 or PowerPC
>> architectures as they already have means of providing unused page
>> hints.
> Interesting. Is there host support in kvm?
>
>
>> I have a set of patches I proposed that add similar functionality via
>> a KVM hypercall for x86 instead of doing it as a part of a Virtio
>> device[1].  I'm suspecting the overhead of doing things this way is
>> much less then having to make multiple madvise system calls from QEMU
>> back into the kernel.
> Well whether it's a virtio device is orthogonal to whether it's an
> madvise call, right? You can build vhost-pagehint and that can
> handle requests in a VQ within balloon and do it
> within host kernel directly.
>
> virtio rings let you pass multiple pages so it's really hard to
> say which will win outright - maybe it's more important
> to coalesce exits.
>
> Nitesh, how about trying same tests and reporting performance?
Noted, I can give it a try before my next positing.
>
>
>> One other concern that has been pointed out with my patchset that
>> would likely need to be addressed here as well is what do we do about
>> other hypervisors that decide to implement page hinting. We probably
>> should look at making this KVM/QEMU specific code run through the
>> paravirtual infrastructure instead of trying into the x86 arch code
>> directly.
>>
>> [1] https://lkml.org/lkml/2019/2/4/903
>
> So virtio is a paravirtual interface, that's an argument for
> using it then.
>
> In any case pls copy the Cc'd crowd on future version of your patches.
>

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 728ecd1eea30..8af34e0b9a32 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -57,13 +57,15 @@  enum virtio_balloon_vq {
 	VIRTIO_BALLOON_VQ_INFLATE,
 	VIRTIO_BALLOON_VQ_DEFLATE,
 	VIRTIO_BALLOON_VQ_STATS,
+	VIRTIO_BALLOON_VQ_HINTING,
 	VIRTIO_BALLOON_VQ_FREE_PAGE,
 	VIRTIO_BALLOON_VQ_MAX
 };
 
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
+								*hinting_vq;
 
 	/* Balloon's own wq for cpu-intensive work items */
 	struct workqueue_struct *balloon_wq;
@@ -122,6 +124,40 @@  static struct virtio_device_id id_table[] = {
 	{ 0 },
 };
 
+#ifdef CONFIG_KVM_FREE_PAGE_HINTING
+void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
+			      int hyper_entries)
+{
+	u64 gpaddr = virt_to_phys((void *)gvaddr);
+
+	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
+	virtqueue_kick_sync(vb->hinting_vq);
+}
+
+static void hinting_ack(struct virtqueue *vq)
+{
+	struct virtio_balloon *vb = vq->vdev->priv;
+
+	wake_up(&vb->acked);
+}
+
+static void enable_hinting(struct virtio_balloon *vb)
+{
+	guest_page_hinting_flag = 1;
+	static_branch_enable(&guest_page_hinting_key);
+	request_hypercall = (void *)&virtballoon_page_hinting;
+	balloon_ptr = vb;
+	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
+}
+
+static void disable_hinting(void)
+{
+	guest_page_hinting_flag = 0;
+	static_branch_enable(&guest_page_hinting_key);
+	balloon_ptr = NULL;
+}
+#endif
+
 static u32 page_to_balloon_pfn(struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
@@ -481,6 +517,7 @@  static int init_vqs(struct virtio_balloon *vb)
 	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
 	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
 	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
 
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
 		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
@@ -492,11 +529,18 @@  static int init_vqs(struct virtio_balloon *vb)
 		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
 	}
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
+		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
+		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
+	}
 	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
 					 vqs, callbacks, names, NULL, NULL);
 	if (err)
 		return err;
 
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
+		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
+
 	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
 	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
@@ -908,6 +952,11 @@  static int virtballoon_probe(struct virtio_device *vdev)
 		if (err)
 			goto out_del_balloon_wq;
 	}
+
+#ifdef CONFIG_KVM_FREE_PAGE_HINTING
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
+		enable_hinting(vb);
+#endif
 	virtio_device_ready(vdev);
 
 	if (towards_target(vb))
@@ -950,6 +999,10 @@  static void virtballoon_remove(struct virtio_device *vdev)
 	cancel_work_sync(&vb->update_balloon_size_work);
 	cancel_work_sync(&vb->update_balloon_stats_work);
 
+#ifdef CONFIG_KVM_FREE_PAGE_HINTING
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
+		disable_hinting();
+#endif
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
 		cancel_work_sync(&vb->report_free_page_work);
 		destroy_workqueue(vb->balloon_wq);
@@ -1009,6 +1062,7 @@  static unsigned int features[] = {
 	VIRTIO_BALLOON_F_MUST_TELL_HOST,
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
+	VIRTIO_BALLOON_F_HINTING,
 	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
 	VIRTIO_BALLOON_F_PAGE_POISON,
 };
diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
index e800c6b07561..3ba8c1f3b4a4 100644
--- a/include/linux/page_hinting.h
+++ b/include/linux/page_hinting.h
@@ -1,15 +1,12 @@ 
 #include <linux/smpboot.h>
 
-/*
- * Size of the array which is used to store the freed pages is defined by
- * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
- * we can get rid of the hardcoded array size.
- */
 #define MAX_FGPT_ENTRIES	1000
 /*
  * hypervisor_pages - It is a dummy structure passed with the hypercall.
- * @pfn: page frame number for the page which needs to be sent to the host.
- * @order: order of the page needs to be reported to the host.
+ * @pfn - page frame number for the page which is to be freed.
+ * @pages - number of pages which are supposed to be freed.
+ * A global array object is used to to hold the list of pfn and pages and is
+ * passed as part of the hypercall.
  */
 struct hypervisor_pages {
 	unsigned long pfn;
@@ -19,11 +16,18 @@  struct hypervisor_pages {
 extern int guest_page_hinting_flag;
 extern struct static_key_false guest_page_hinting_key;
 extern struct smp_hotplug_thread hinting_threads;
+extern void (*request_hypercall)(void *, u64, int);
+extern void *balloon_ptr;
 extern bool want_page_poisoning;
 
 int guest_page_hinting_sysctl(struct ctl_table *table, int write,
 			      void __user *buffer, size_t *lenp, loff_t *ppos);
 void guest_free_page(struct page *page, int order);
+extern int __isolate_free_page(struct page *page, unsigned int order);
+extern void free_one_page(struct zone *zone,
+			  struct page *page, unsigned long pfn,
+			  unsigned int order,
+			  int migratetype);
 
 static inline void disable_page_poisoning(void)
 {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index a1966cd7b677..2b0f62814e22 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -36,6 +36,7 @@ 
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
 #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d295c9bc01a8..93224cba9243 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1199,7 +1199,7 @@  static void free_pcppages_bulk(struct zone *zone, int count,
 	spin_unlock(&zone->lock);
 }
 
-static void free_one_page(struct zone *zone,
+void free_one_page(struct zone *zone,
 				struct page *page, unsigned long pfn,
 				unsigned int order,
 				int migratetype)
diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
index be529f6f2bc0..315099fcda43 100644
--- a/virt/kvm/page_hinting.c
+++ b/virt/kvm/page_hinting.c
@@ -1,6 +1,8 @@ 
 #include <linux/gfp.h>
 #include <linux/mm.h>
+#include <linux/page_ref.h>
 #include <linux/kvm_host.h>
+#include <linux/sort.h>
 #include <linux/kernel.h>
 
 /*
@@ -39,6 +41,11 @@  int guest_page_hinting_flag;
 EXPORT_SYMBOL(guest_page_hinting_flag);
 static DEFINE_PER_CPU(struct task_struct *, hinting_task);
 
+void (*request_hypercall)(void *, u64, int);
+EXPORT_SYMBOL(request_hypercall);
+void *balloon_ptr;
+EXPORT_SYMBOL(balloon_ptr);
+
 int guest_page_hinting_sysctl(struct ctl_table *table, int write,
 			      void __user *buffer, size_t *lenp,
 			      loff_t *ppos)
@@ -55,18 +62,201 @@  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
+void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
+{
+	int i = 0;
+	int mt = 0;
+
+	if (balloon_ptr)
+		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
+				  entries);
+
+	while (i < entries) {
+		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
+
+		mt = get_pageblock_migratetype(page);
+		free_one_page(page_zone(page), page, page_to_pfn(page),
+			      guest_isolated_pages[i].order, mt);
+		i++;
+	}
+}
+
+struct page *get_buddy_page(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned int order;
+
+	for (order = 0; order < MAX_ORDER; order++) {
+		struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+		if (PageBuddy(page_head) && page_private(page_head) >= order)
+			return page_head;
+	}
+	return NULL;
+}
+
 static void hinting_fn(unsigned int cpu)
 {
 	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
+	int idx = 0, ret = 0;
+	struct zone *zone_cur;
+	unsigned long flags = 0;
+
+	while (idx < MAX_FGPT_ENTRIES) {
+		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
+		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
+			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
+
+		while (pfn <= pfn_end) {
+			struct page *page = pfn_to_page(pfn);
+			struct page *buddy_page = NULL;
+
+			zone_cur = page_zone(page);
+			spin_lock_irqsave(&zone_cur->lock, flags);
+
+			if (PageCompound(page)) {
+				struct page *head_page = compound_head(page);
+				unsigned long head_pfn = page_to_pfn(head_page);
+				unsigned int alloc_pages =
+					1 << compound_order(head_page);
+
+				pfn = head_pfn + alloc_pages;
+				spin_unlock_irqrestore(&zone_cur->lock, flags);
+				continue;
+			}
+
+			if (page_ref_count(page)) {
+				pfn++;
+				spin_unlock_irqrestore(&zone_cur->lock, flags);
+				continue;
+			}
+
+			if (PageBuddy(page)) {
+				int buddy_order = page_private(page);
 
+				ret = __isolate_free_page(page, buddy_order);
+				if (!ret) {
+				} else {
+					int l_idx = page_hinting_obj->hyp_idx;
+					struct hypervisor_pages *l_obj =
+					page_hinting_obj->hypervisor_pagelist;
+
+					l_obj[l_idx].pfn = pfn;
+					l_obj[l_idx].order = buddy_order;
+					page_hinting_obj->hyp_idx += 1;
+				}
+				pfn = pfn + (1 << buddy_order);
+				spin_unlock_irqrestore(&zone_cur->lock, flags);
+				continue;
+			}
+
+			buddy_page = get_buddy_page(page);
+			if (buddy_page) {
+				int buddy_order = page_private(buddy_page);
+
+				ret = __isolate_free_page(buddy_page,
+							  buddy_order);
+				if (!ret) {
+				} else {
+					int l_idx = page_hinting_obj->hyp_idx;
+					struct hypervisor_pages *l_obj =
+					page_hinting_obj->hypervisor_pagelist;
+					unsigned long buddy_pfn =
+						page_to_pfn(buddy_page);
+
+					l_obj[l_idx].pfn = buddy_pfn;
+					l_obj[l_idx].order = buddy_order;
+					page_hinting_obj->hyp_idx += 1;
+				}
+				pfn = page_to_pfn(buddy_page) +
+					(1 << buddy_order);
+				spin_unlock_irqrestore(&zone_cur->lock, flags);
+				continue;
+			}
+			spin_unlock_irqrestore(&zone_cur->lock, flags);
+			pfn++;
+		}
+		page_hinting_obj->kvm_pt[idx].pfn = 0;
+		page_hinting_obj->kvm_pt[idx].order = -1;
+		page_hinting_obj->kvm_pt[idx].zonenum = -1;
+		idx++;
+	}
+	if (page_hinting_obj->hyp_idx > 0) {
+		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
+				page_hinting_obj->hyp_idx);
+		page_hinting_obj->hyp_idx = 0;
+	}
 	page_hinting_obj->kvm_pt_idx = 0;
 	put_cpu_var(hinting_obj);
 }
 
+int if_exist(struct page *page)
+{
+	int i = 0;
+	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
+
+	while (i < MAX_FGPT_ENTRIES) {
+		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
+			return 1;
+		i++;
+	}
+	return 0;
+}
+
+void pack_array(void)
+{
+	int i = 0, j = 0;
+	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
+
+	while (i < MAX_FGPT_ENTRIES) {
+		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
+			if (i != j) {
+				page_hinting_obj->kvm_pt[j].pfn =
+					page_hinting_obj->kvm_pt[i].pfn;
+				page_hinting_obj->kvm_pt[j].order =
+					page_hinting_obj->kvm_pt[i].order;
+				page_hinting_obj->kvm_pt[j].zonenum =
+					page_hinting_obj->kvm_pt[i].zonenum;
+			}
+			j++;
+		}
+		i++;
+	}
+	i = j;
+	page_hinting_obj->kvm_pt_idx = j;
+	while (j < MAX_FGPT_ENTRIES) {
+		page_hinting_obj->kvm_pt[j].pfn = 0;
+		page_hinting_obj->kvm_pt[j].order = -1;
+		page_hinting_obj->kvm_pt[j].zonenum = -1;
+		j++;
+	}
+}
+
 void scan_array(void)
 {
 	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
+	int i = 0;
 
+	while (i < MAX_FGPT_ENTRIES) {
+		struct page *page =
+			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
+		struct page *buddy_page = get_buddy_page(page);
+
+		if (!PageBuddy(page) && buddy_page) {
+			if (if_exist(buddy_page)) {
+				page_hinting_obj->kvm_pt[i].pfn = 0;
+				page_hinting_obj->kvm_pt[i].order = -1;
+				page_hinting_obj->kvm_pt[i].zonenum = -1;
+			} else {
+				page_hinting_obj->kvm_pt[i].pfn =
+					page_to_pfn(buddy_page);
+				page_hinting_obj->kvm_pt[i].order =
+					page_private(buddy_page);
+			}
+		}
+		i++;
+	}
+	pack_array();
 	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
 		wake_up_process(__this_cpu_read(hinting_task));
 }
@@ -111,8 +301,18 @@  void guest_free_page(struct page *page, int order)
 		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
 							order;
 		page_hinting_obj->kvm_pt_idx += 1;
-		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
+		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
+			/*
+			 * We are depending on the buddy free-list to identify
+			 * if a page is free or not. Hence, we are dumping all
+			 * the per-cpu pages back into the buddy allocator. This
+			 * will ensure less failures when we try to isolate free
+			 * captured pages and hence more memory reporting to the
+			 * host.
+			 */
+			drain_local_pages(NULL);
 			scan_array();
+		}
 	}
 	local_irq_restore(flags);
 }