Patchwork [RFT,Update,2/2] cpufreq: intel_pstate: Update max CPU frequency on global turbo changes

Submitter Rafael J. Wysocki
Date March 1, 2019, 12:57 p.m.
Message ID <3017597.CVkTxgYgAs@aspire.rjw.lan>
Download mbox | patch
Permalink /patch/739311/
State New
Headers show

Comments

Rafael J. Wysocki - March 1, 2019, 12:57 p.m.
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

While the cpuinfo.max_freq value doesn't really matter for
intel_pstate in the active mode, in the passive mode it is used by
governors as the maximum physical frequency of the CPU and the
results of governor computations generally depend on it.  Also it
is made available to user space via sysfs and it should match the
current HW configuration.

For this reason, make intel_pstate update cpuinfo.max_freq for all
CPUs if it detects a global change of turbo frequency settings from
"disable" to "enable" or the other way associated with a _PPC change
notification from the platform firmware.

Note that policy_is_inactive() and cpufreq_set_policy() need to be
made available to it for this purpose.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759
Reported-by: Gabriele Mazzotta <gabriele.mzt@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---

Update, because the patch sent previously doesn't build, due to an extra
arg declared for intel_pstate_update_max_freq().

---
 drivers/cpufreq/cpufreq.c      |   12 ++----------
 drivers/cpufreq/intel_pstate.c |   33 ++++++++++++++++++++++++++++++++-
 include/linux/cpufreq.h        |    7 +++++++
 3 files changed, 41 insertions(+), 11 deletions(-)
Chen, Yu C - March 4, 2019, 2:39 p.m.
On Fri, Mar 01, 2019 at 01:57:06PM +0100, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> While the cpuinfo.max_freq value doesn't really matter for
> intel_pstate in the active mode, in the passive mode it is used by
> governors as the maximum physical frequency of the CPU and the
> results of governor computations generally depend on it.  Also it
> is made available to user space via sysfs and it should match the
> current HW configuration.
> 
> For this reason, make intel_pstate update cpuinfo.max_freq for all
> CPUs if it detects a global change of turbo frequency settings from
> "disable" to "enable" or the other way associated with a _PPC change
> notification from the platform firmware.
> 
> Note that policy_is_inactive() and cpufreq_set_policy() need to be
> made available to it for this purpose.
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759
> Reported-by: Gabriele Mazzotta <gabriele.mzt@gmail.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> 
> Update, because the patch sent previously doesn't build, due to an extra
> arg declared for intel_pstate_update_max_freq().
> 
> ---
>  drivers/cpufreq/cpufreq.c      |   12 ++----------
>  drivers/cpufreq/intel_pstate.c |   33 ++++++++++++++++++++++++++++++++-
>  include/linux/cpufreq.h        |    7 +++++++
>  3 files changed, 41 insertions(+), 11 deletions(-)
> 
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -897,6 +897,36 @@ static void intel_pstate_update_policies
>  		cpufreq_update_policy(cpu);
>  }
>  
> +static void intel_pstate_update_max_freq(unsigned int cpu)
> +{
> +	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> +	struct cpufreq_policy new_policy;
> +	struct cpudata *cpudata;
> +
> +	if (!policy)
> +		return;
> +
> +	down_write(&policy->rwsem);
> +
> +	if (policy_is_inactive(policy))
> +		goto unlock;
> +
> +	cpudata = all_cpu_data[cpu];
> +	policy->cpuinfo.max_freq = global.turbo_disabled_upd ?
> +			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
> +
> +	memcpy(&new_policy, policy, sizeof(*policy));
> +	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
> +	new_policy.min = min(policy->user_policy.min, new_policy.max);
> +
> +	cpufreq_set_policy(policy, &new_policy);
> +
> +unlock:
> +	up_write(&policy->rwsem);
> +
> +	cpufreq_cpu_put(policy);
> +}
> +
I tried to test this on a MacBook I have at hand; however, I did not see
the _PPC notification on this machine, so it might not cover the code path
in this patch. I checked cpufreq with this patch under different loads and
the frequency scales well.

>  static void intel_pstate_update_limits(unsigned int cpu)
>  {
>  	mutex_lock(&intel_pstate_driver_lock);
> @@ -908,7 +938,8 @@ static void intel_pstate_update_limits(u
>  	 */
>  	if (global.turbo_disabled_upd != global.turbo_disabled) {
>  		global.turbo_disabled_upd = global.turbo_disabled;
> -		intel_pstate_update_policies();
> +		for_each_possible_cpu(cpu)
> +			intel_pstate_update_max_freq(cpu);
>  	} else {
>  		cpufreq_update_policy(cpu);
>  	}
> Index: linux-pm/drivers/cpufreq/cpufreq.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/cpufreq.c
> +++ linux-pm/drivers/cpufreq/cpufreq.c
> @@ -34,11 +34,6 @@
>  
>  static LIST_HEAD(cpufreq_policy_list);
>  
> -static inline bool policy_is_inactive(struct cpufreq_policy *policy)
> -{
> -	return cpumask_empty(policy->cpus);
> -}
> -
>  /* Macros to iterate over CPU policies */
>  #define for_each_suitable_policy(__policy, __active)			 \
>  	list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> @@ -675,9 +670,6 @@ static ssize_t show_scaling_cur_freq(str
>  	return ret;
>  }
>  
> -static int cpufreq_set_policy(struct cpufreq_policy *policy,
> -				struct cpufreq_policy *new_policy);
> -
>  /**
>   * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access
>   */
> @@ -2235,8 +2227,8 @@ EXPORT_SYMBOL(cpufreq_get_policy);
>   *
>   * The cpuinfo part of @policy is not updated by this function.
>   */
At first there seemed to be a patching error when applying this on
top of upstream 5.0, but then I realized that this patch is based on
intel-next.

Thanks,
Ryan

> -static int cpufreq_set_policy(struct cpufreq_policy *policy,
> -			      struct cpufreq_policy *new_policy)
> +int cpufreq_set_policy(struct cpufreq_policy *policy,
> +		       struct cpufreq_policy *new_policy)
>  {
>  	struct cpufreq_governor *old_gov;
>  	int ret;
> Index: linux-pm/include/linux/cpufreq.h
> ===================================================================
> --- linux-pm.orig/include/linux/cpufreq.h
> +++ linux-pm/include/linux/cpufreq.h
> @@ -178,6 +178,11 @@ static inline struct cpufreq_policy *cpu
>  static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { }
>  #endif
>  
> +static inline bool policy_is_inactive(struct cpufreq_policy *policy)
> +{
> +	return cpumask_empty(policy->cpus);
> +}
> +
>  static inline bool policy_is_shared(struct cpufreq_policy *policy)
>  {
>  	return cpumask_weight(policy->cpus) > 1;
> @@ -194,6 +199,8 @@ void disable_cpufreq(void);
>  
>  u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
>  int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
> +int cpufreq_set_policy(struct cpufreq_policy *policy,
> +		       struct cpufreq_policy *new_policy);
>  void cpufreq_update_policy(unsigned int cpu);
>  void cpufreq_update_limits(unsigned int cpu);
>  bool have_governor_per_policy(void);
>
Quentin Perret - March 5, 2019, 10:42 a.m.
Hi Rafael,

+CC Peter since we were talking about cpuinfo.*_freq recently.

On Friday 01 Mar 2019 at 13:57:06 (+0100), Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> While the cpuinfo.max_freq value doesn't really matter for
> intel_pstate in the active mode, in the passive mode it is used by
> governors as the maximum physical frequency of the CPU and the
> results of governor computations generally depend on it. Also it
> is made available to user space via sysfs and it should match the
> current HW configuration.
> 
> For this reason, make intel_pstate update cpuinfo.max_freq for all
> CPUs if it detects a global change of turbo frequency settings from
> "disable" to "enable" or the other way associated with a _PPC change
> notification from the platform firmware.
> 
> Note that policy_is_inactive() and cpufreq_set_policy() need to be
> made available to it for this purpose.
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759
> Reported-by: Gabriele Mazzotta <gabriele.mzt@gmail.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
> 
> Update, because the patch sent previously doesn't build, due to an extra
> arg declared for intel_pstate_update_max_freq().
> 
> ---
>  drivers/cpufreq/cpufreq.c      |   12 ++----------
>  drivers/cpufreq/intel_pstate.c |   33 ++++++++++++++++++++++++++++++++-
>  include/linux/cpufreq.h        |    7 +++++++
>  3 files changed, 41 insertions(+), 11 deletions(-)
> 
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===================================================================
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -897,6 +897,36 @@ static void intel_pstate_update_policies
>  		cpufreq_update_policy(cpu);
>  }
>  
> +static void intel_pstate_update_max_freq(unsigned int cpu)
> +{
> +	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> +	struct cpufreq_policy new_policy;
> +	struct cpudata *cpudata;
> +
> +	if (!policy)
> +		return;
> +
> +	down_write(&policy->rwsem);
> +
> +	if (policy_is_inactive(policy))
> +		goto unlock;
> +
> +	cpudata = all_cpu_data[cpu];
> +	policy->cpuinfo.max_freq = global.turbo_disabled_upd ?
> +			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
> +
> +	memcpy(&new_policy, policy, sizeof(*policy));
> +	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
> +	new_policy.min = min(policy->user_policy.min, new_policy.max);
> +
> +	cpufreq_set_policy(policy, &new_policy);

Do you want to force-restart the governor here ? Schedutil caches
cpuinfo.max_freq for the iowait stuff in sugov_start() [1]. I'm not sure
about the other governors.

And just removing sg_cpu->iowait_boost_max to use the cpuinfo struct
instead will conflict with [2], I think.

Thanks,
Quentin

[1] https://elixir.bootlin.com/linux/latest/source/kernel/sched/cpufreq_schedutil.c#L840
[2] https://lore.kernel.org/lkml/1550831866-32749-1-git-send-email-chunyan.zhang@unisoc.com/
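
For reference, this is roughly the caching in question -- a minimal sketch
loosely following the 5.0-era schedutil code, with the surrounding setup
elided and the exact field handling assumed rather than copied verbatim.
It shows why a cpuinfo.max_freq change made after the governor has started
is not picked up without an explicit refresh:

static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	/* ... rate-limit and flag initialization elided ... */

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu = cpu;
		sg_cpu->sg_policy = sg_policy;
		/* Snapshot taken once here; later max_freq changes are not seen. */
		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
	}

	/* ... update-hook registration elided ... */
	return 0;
}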

> +
> +unlock:
> +	up_write(&policy->rwsem);
> +
> +	cpufreq_cpu_put(policy);
> +}
Rafael J. Wysocki - March 5, 2019, 10:50 a.m.
On Tuesday, March 5, 2019 11:42:59 AM CET Quentin Perret wrote:
> Hi Rafael,
> 
> +CC Peter since we were talking about cpuinfo.*_freq recently.
> 
> On Friday 01 Mar 2019 at 13:57:06 (+0100), Rafael J. Wysocki wrote:
> > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > 
> > While the cpuinfo.max_freq value doesn't really matter for
> > intel_pstate in the active mode, in the passive mode it is used by
> > governors as the maximum physical frequency of the CPU and the
> > results of governor computations generally depend on it. Also it
> > is made available to user space via sysfs and it should match the
> > current HW configuration.
> > 
> > For this reason, make intel_pstate update cpuinfo.max_freq for all
> > CPUs if it detects a global change of turbo frequency settings from
> > "disable" to "enable" or the other way associated with a _PPC change
> > notification from the platform firmware.
> > 
> > Note that policy_is_inactive() and cpufreq_set_policy() need to be
> > made available to it for this purpose.
> > 
> > Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759
> > Reported-by: Gabriele Mazzotta <gabriele.mzt@gmail.com>
> > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > ---
> > 
> > Update, because the patch sent previously doesn't build, due to an extra
> > arg declared for intel_pstate_update_max_freq().
> > 
> > ---
> >  drivers/cpufreq/cpufreq.c      |   12 ++----------
> >  drivers/cpufreq/intel_pstate.c |   33 ++++++++++++++++++++++++++++++++-
> >  include/linux/cpufreq.h        |    7 +++++++
> >  3 files changed, 41 insertions(+), 11 deletions(-)
> > 
> > Index: linux-pm/drivers/cpufreq/intel_pstate.c
> > ===================================================================
> > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> > +++ linux-pm/drivers/cpufreq/intel_pstate.c
> > @@ -897,6 +897,36 @@ static void intel_pstate_update_policies
> >  		cpufreq_update_policy(cpu);
> >  }
> >  
> > +static void intel_pstate_update_max_freq(unsigned int cpu)
> > +{
> > +	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > +	struct cpufreq_policy new_policy;
> > +	struct cpudata *cpudata;
> > +
> > +	if (!policy)
> > +		return;
> > +
> > +	down_write(&policy->rwsem);
> > +
> > +	if (policy_is_inactive(policy))
> > +		goto unlock;
> > +
> > +	cpudata = all_cpu_data[cpu];
> > +	policy->cpuinfo.max_freq = global.turbo_disabled_upd ?
> > +			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
> > +
> > +	memcpy(&new_policy, policy, sizeof(*policy));
> > +	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
> > +	new_policy.min = min(policy->user_policy.min, new_policy.max);
> > +
> > +	cpufreq_set_policy(policy, &new_policy);
> 
> Do you want to force-restart the governor here ?

cpufreq_set_policy() is expected to take care of the governor.
If it doesn't, there is a bug somewhere.

> Schedutil caches cpuinfo.max_freq for the iowait stuff in sugov_start() [1].

If it does so, it should update the cached value in sugov_limits().

I guess I can add a patch updating it to this series.

> I'm not sure about the other governors.

They don't do that AFAICS.

> And just removing sg_cpu->iowait_boost_max to use the cpuinfo struct
> instead will conflict with [2], I think.

Thanks for pointing this out.

Cheers,
Rafael
Rafael J. Wysocki - March 5, 2019, 10:58 a.m.
On Tuesday, March 5, 2019 11:50:56 AM CET Rafael J. Wysocki wrote:
> On Tuesday, March 5, 2019 11:42:59 AM CET Quentin Perret wrote:
> > Hi Rafael,
> > 
> > +CC Peter since we were talking about cpuinfo.*_freq recently.
> > 
> > On Friday 01 Mar 2019 at 13:57:06 (+0100), Rafael J. Wysocki wrote:
> > > From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > 
> > > While the cpuinfo.max_freq value doesn't really matter for
> > > intel_pstate in the active mode, in the passive mode it is used by
> > > governors as the maximum physical frequency of the CPU and the
> > > results of governor computations generally depend on it. Also it
> > > is made available to user space via sysfs and it should match the
> > > current HW configuration.
> > > 
> > > For this reason, make intel_pstate update cpuinfo.max_freq for all
> > > CPUs if it detects a global change of turbo frequency settings from
> > > "disable" to "enable" or the other way associated with a _PPC change
> > > notification from the platform firmware.
> > > 
> > > Note that policy_is_inactive() and cpufreq_set_policy() need to be
> > > made available to it for this purpose.
> > > 
> > > Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759
> > > Reported-by: Gabriele Mazzotta <gabriele.mzt@gmail.com>
> > > Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > > ---
> > > 
> > > Update, because the patch sent previously doesn't build, due to an extra
> > > arg declared for intel_pstate_update_max_freq().
> > > 
> > > ---
> > >  drivers/cpufreq/cpufreq.c      |   12 ++----------
> > >  drivers/cpufreq/intel_pstate.c |   33 ++++++++++++++++++++++++++++++++-
> > >  include/linux/cpufreq.h        |    7 +++++++
> > >  3 files changed, 41 insertions(+), 11 deletions(-)
> > > 
> > > Index: linux-pm/drivers/cpufreq/intel_pstate.c
> > > ===================================================================
> > > --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> > > +++ linux-pm/drivers/cpufreq/intel_pstate.c
> > > @@ -897,6 +897,36 @@ static void intel_pstate_update_policies
> > >  		cpufreq_update_policy(cpu);
> > >  }
> > >  
> > > +static void intel_pstate_update_max_freq(unsigned int cpu)
> > > +{
> > > +	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > > +	struct cpufreq_policy new_policy;
> > > +	struct cpudata *cpudata;
> > > +
> > > +	if (!policy)
> > > +		return;
> > > +
> > > +	down_write(&policy->rwsem);
> > > +
> > > +	if (policy_is_inactive(policy))
> > > +		goto unlock;
> > > +
> > > +	cpudata = all_cpu_data[cpu];
> > > +	policy->cpuinfo.max_freq = global.turbo_disabled_upd ?
> > > +			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
> > > +
> > > +	memcpy(&new_policy, policy, sizeof(*policy));
> > > +	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
> > > +	new_policy.min = min(policy->user_policy.min, new_policy.max);
> > > +
> > > +	cpufreq_set_policy(policy, &new_policy);
> > 
> > Do you want to force-restart the governor here ?
> 
> cpufreq_set_policy() is expected to take care of the governor.
> If it doesn't, there is a bug somewhere.
> 
> > Schedutil caches cpuinfo.max_freq for the iowait stuff in sugov_start() [1].
> 
> If it does so, it should update the cached value in sugov_limits().
> 
> I guess I can add a patch updating it to this series.

So after the Peter's patch "sched/cpufreq: Fix 32bit math overflow"
I will need to recompute sg_cpu->min in sugov_limits().

I'll wait for that patch from Peter to go in and repost the series on
top of it.
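
A minimal sketch of what that recomputation might look like, assuming the
sg_cpu->min scaling introduced by "sched/cpufreq: Fix 32bit math overflow";
the helper name here is made up for illustration and this is not the actual
follow-up patch:

/* Refresh the scaled minimum so a new cpuinfo.max_freq is taken into account. */
static void sugov_update_min(struct cpufreq_policy *policy)
{
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		/* Same formula as used when the governor starts. */
		sg_cpu->min = (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
				policy->cpuinfo.max_freq;
	}
}

A call to it would then go into sugov_limits(), next to the existing limit
handling.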
Quentin Perret - March 5, 2019, 11:01 a.m.
On Tuesday 05 Mar 2019 at 11:50:56 (+0100), Rafael J. Wysocki wrote:
> On Tuesday, March 5, 2019 11:42:59 AM CET Quentin Perret wrote:
> > > +static void intel_pstate_update_max_freq(unsigned int cpu)
> > > +{
> > > +	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
> > > +	struct cpufreq_policy new_policy;
> > > +	struct cpudata *cpudata;
> > > +
> > > +	if (!policy)
> > > +		return;
> > > +
> > > +	down_write(&policy->rwsem);
> > > +
> > > +	if (policy_is_inactive(policy))
> > > +		goto unlock;
> > > +
> > > +	cpudata = all_cpu_data[cpu];
> > > +	policy->cpuinfo.max_freq = global.turbo_disabled_upd ?
> > > +			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
> > > +
> > > +	memcpy(&new_policy, policy, sizeof(*policy));
> > > +	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
> > > +	new_policy.min = min(policy->user_policy.min, new_policy.max);
> > > +
> > > +	cpufreq_set_policy(policy, &new_policy);
> > 
> > Do you want to force-restart the governor here ?
> 
> cpufreq_set_policy() is expected to take care of the governor.
> If it doesn't, there is a bug somewhere.

Yes, it does something when there is a governor change, but that's not
the case here IIUC.

> > Schedutil caches cpuinfo.max_freq for the iowait stuff in sugov_start() [1].
> 
> If it does so, it should update the cached value in sugov_limits().
> 
> I guess I can add a patch updating it to this series.

Right, I think we've been under the assumption that unlike policy->max,
cpuinfo.max_freq should be constant, so there was no need to update the
value at run time. But if that assumption doesn't hold any more, then
yeah we'll need something more dynamic I guess.

> 
> > I'm not sure about the other governors.
> 
> They don't do that AFAICS.

OK

> > And just removing sg_cpu->iowait_boost_max to use the cpuinfo struct
> > instead will conflict with [2], I think.
> 
> Thanks for pointing this out.

N/p :-)

Thanks,
Quentin
Peter Zijlstra - March 5, 2019, 11:44 a.m.
On Tue, Mar 05, 2019 at 11:58:37AM +0100, Rafael J. Wysocki wrote:
> So after the Peter's patch "sched/cpufreq: Fix 32bit math overflow"
> I will need to recompute sg_cpu->min in sugov_limits().

So there's still an open question; do we want that ->min thing to depend
on available frequencies _at_all_ ?

I'm thinking it might be a good thing to have the iowait boost curve be
independent of all that.

Like I said; if we set it at 128 (static), it takes 9 consecutive wake-ups
for it to reach 1024 (max). While now the curve depends on how wide the
gap is between min_freq and max_freq. And it seems weird to have this
behaviour depend on that. To me at least.

Now, I don't know if 128/9 is the right curve, it is just a random
number I pulled out of a hat. But it seems to make more sense than
depending on frequencies.
Rafael J. Wysocki - March 5, 2019, 11:52 a.m.
On Tue, Mar 5, 2019 at 12:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Mar 05, 2019 at 11:58:37AM +0100, Rafael J. Wysocki wrote:
> > So after the Peter's patch "sched/cpufreq: Fix 32bit math overflow"
> > I will need to recompute sg_cpu->min in sugov_limits().
>
> So there's still an open question; do we want that ->min thing to depend
> on available frequencies _at_all_ ?
>
> I'm thinking it might be a good thing to have the iowait boost curve be
> independent of all that.
>
> Like I said; if we set it at 128 (static), it takes 9 consecutive wake-ups
> for it to reach 1024 (max). While now the curve depends on how wide the
> gap is between min_freq and max_freq. And it seems weird to have this
> behaviour depend on that. To me at least.
>
> Now, I don't know if 128/9 is the right curve, it is just a random
> number I pulled out of a hat. But it seems to make more sense than
> depending on frequencies.

I agree.
Quentin Perret - March 5, 2019, noon
On Tuesday 05 Mar 2019 at 12:44:06 (+0100), Peter Zijlstra wrote:
> On Tue, Mar 05, 2019 at 11:58:37AM +0100, Rafael J. Wysocki wrote:
> > So after the Peter's patch "sched/cpufreq: Fix 32bit math overflow"
> > I will need to recompute sg_cpu->min in sugov_limits().
> 
> So there's still an open question; do we want that ->min thing to depend
> on available frequencies _at_all_ ?
> 
> I'm thinking it might be a good thing to have the iowait boost curve be
> independent of all that.
> 
> Like I said; if we set it at 128 (static), it takes 9 consecutive wake-ups
> for it to reach 1024 (max). While now the curve depends on how wide the
> gap is between min_freq and max_freq. And it seems weird to have this
> behaviour depend on that. To me at least.

I'm not conceptually against it, but that really wants testing I think.
I can already see how we're gonna see regressions: 128 is much lower
than 'min' on my juno for ex, so with the approach you suggest it's
gonna take several wake-up before the iowait stuff does anything at all.
The first steps will all be below the min freq, so they'll just be
transparent, while right now the iowait boost kicks in much faster :/

OTOH, you also have platforms like the recent Snapdragons with 30+ OPPs,
and for them starting at 128 will speed things up.

So maybe what you want is to start at max(min, 128) ?

> Now, I don't know if 128/9 is the right curve, it is just a random
> number I pulled out of a hat. But it seems to make more sense than
> depending on frequencies.

And btw why do you need 9 steps to reach MAX starting from 128 ? If we
double the boost at each wakeup you'll do 128 -> 256 -> 512 -> 1024 no ?

Thanks,
Quentin
Peter Zijlstra - March 5, 2019, 12:24 p.m.
On Tue, Mar 05, 2019 at 12:00:43PM +0000, Quentin Perret wrote:

> And btw why do you need 9 steps to reach MAX starting from 128 ? If we
> double the boost at each wakeup you'll do 128 -> 256 -> 512 -> 1024 no ?

Argh; I'm so full of fail this week; 10-7=3.
Rafael J. Wysocki - March 5, 2019, 5:02 p.m.
On Tue, Mar 5, 2019 at 1:00 PM Quentin Perret <quentin.perret@arm.com> wrote:
>
> On Tuesday 05 Mar 2019 at 12:44:06 (+0100), Peter Zijlstra wrote:
> > On Tue, Mar 05, 2019 at 11:58:37AM +0100, Rafael J. Wysocki wrote:
> > > So after the Peter's patch "sched/cpufreq: Fix 32bit math overflow"
> > > I will need to recompute sg_cpu->min in sugov_limits().
> >
> > So there's still an open question; do we want that ->min thing to depend
> > on available frequencies _at_all_ ?
> >
> > I'm thinking it might be a good thing to have the iowait boost curve be
> > independent of all that.
> >
> > Like I said; if we set it at 128 (static), it takes 9 consecutive wake-ups
> > for it to reach 1024 (max). While now the curve depends on how wide the
> > gap is between min_freq and max_freq. And it seems weird to have this
> > behaviour depend on that. To me at least.
>
> I'm not conceptually against it, but that really wants testing I think.
> I can already see how we're gonna see regressions: 128 is much lower
> than 'min' on my juno for ex, so with the approach you suggest it's
> gonna take several wake-ups before the iowait stuff does anything at all.

But that 128 needs to be compared to

(SCHED_CAPACITY_SCALE * cpuinfo.min_freq) / cpuinfo.max_freq

so with SCHED_CAPACITY_SCALE equal to 1024 this means max_freq 8x
higher than min_freq.  That is not totally unreasonable IMO and
because sg_cpu->iowait_boost grows exponentially, the difference
between 8x and, say, 4x is one iteration.

> The first steps will all be below the min freq, so they'll just be
> transparent, while right now the iowait boost kicks in much faster :/

There can be one iteration of a difference this way or that way AFAICS
and I'm not even sure how much of a performance difference that makes
in practice.

OTOH I fundamentally don't see why the iowait boost should ramp up
faster on CPUs having a higher max_freq to min_freq ratio.  Say you
have two platforms, both with max_freq of 2 GHz and with min_freq
equal to 250 MHz and 500 MHz, respectively.  The ratios in question
will be 8 and 4 then, so the first one will reliably react 50% slower
to iowait than the second one for no particular reason at all.
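
To put rough numbers on that, here is a quick standalone illustration (plain
userspace C, not kernel code) of the current "start at the scaled min and
double on every further wakeup" behaviour for those two hypothetical
platforms; the frequencies are in kHz, as cpufreq usually reports them:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024

/* Number of iowait wakeups for the boost to reach SCHED_CAPACITY_SCALE. */
static int wakeups_to_max(unsigned int min_freq, unsigned int max_freq)
{
	unsigned int boost = SCHED_CAPACITY_SCALE * min_freq / max_freq;
	int wakeups = 1;		/* the first wakeup sets the initial boost */

	while (boost < SCHED_CAPACITY_SCALE) {
		boost *= 2;		/* each further wakeup doubles the boost */
		wakeups++;
	}
	return wakeups;
}

int main(void)
{
	/* 2 GHz max, 250 MHz min: ratio 8, boost starts at 128 */
	printf("8x ratio: %d wakeups\n", wakeups_to_max(250000, 2000000));
	/* 2 GHz max, 500 MHz min: ratio 4, boost starts at 256 */
	printf("4x ratio: %d wakeups\n", wakeups_to_max(500000, 2000000));
	return 0;
}

This prints 4 and 3 wakeups respectively, i.e. the one iteration of
difference mentioned above.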

> OTOH, you also have platforms like the recent Snapdragons with 30+ OPPs,
> and for them starting at 128 will speed things up.
>
> So maybe what you want is to start at max(min, 128) ?

That's not just min, though, or is it?
Quentin Perret - March 5, 2019, 5:37 p.m.
On Tuesday 05 Mar 2019 at 18:02:25 (+0100), Rafael J. Wysocki wrote:
> But that 128 needs to be compared to
> 
> (SCHED_CAPACITY_SCALE * cpuinfo.min_freq) / cpuinfo.max_freq
> 
> so with SCHED_CAPACITY_SCALE equal to 1024 this means max_freq 8x
> higher than min_freq.  That is not totally unreasonable IMO and
> because sg_cpu->iowait_boost grows exponentially, the difference
> between 8x and, say, 4x is one iteration.
> 
> > The first steps will all be below the min freq, so they'll just be
> > transparent, while right now the iowait boost kicks in much faster :/
> 
> There can be one iteration of a difference this way or that way AFAICS
> and I'm not even sure how much of a performance difference that makes
> in practice.

Yeah I don't expect that to have a huge impact TBH but it'd be nice to
actually get numbers to verify that, that's all I'm saying :-)

You have 'funny' platforms like Juno r0 out there where the min/max
frequencies are 450 MHz/850 MHz. In this case, starting from 128 you'll
need 3 wake-ups to reach what is currently the starting point. I'm not
sure if the impact is visible or not, but it's worth checking.

> OTOH I fundamentally don't see why the iowait boost should ramp up
> faster on CPUs having a higher max_freq to min_freq ratio.  Say you
> have two platforms, both with max_freq of 2 GHz and with min_freq
> equal to 250 MHz and 500 MHz, respectively.  The ratios in question
> will be 8 and 4 then, so the first one will reliably react 50% slower
> to iowait than the second one for no particular reason at all.
>
> > OTOH, you also have platforms like the recent Snapdragons with 30+ OPPs,
> > and for them starting at 128 will speed things up.
> >
> > So maybe what you want is to start at max(min, 128) ?
> 
> That's not just min, though, or is it?

I'm not sure I get the question, so just to make sure it's clear, I
was suggesting to do something along the lines of:

	sg_cpu->min = max(min_freq * 1024 / max_freq, 128);

That basically just prevents you from starting too low -- some boards,
unlike juno, have tons of OPPs on the lower end of the curve so these
might benefit from getting a higher starting point. But then perhaps
this is in fact a good illustration of the issue of having different
ramp-up speeds depending on the min_freq so ... :-)

Thanks,
Quentin
Rafael J. Wysocki - March 6, 2019, 10:05 a.m.
On Tue, Mar 5, 2019 at 6:37 PM Quentin Perret <quentin.perret@arm.com> wrote:
>
> On Tuesday 05 Mar 2019 at 18:02:25 (+0100), Rafael J. Wysocki wrote:
> > But that 128 needs to be compared to
> >
> > (SCHED_CAPACITY_SCALE * cpuinfo.min_freq) / cpuinfo.max_freq
> >
> > so with SCHED_CAPACITY_SCALE equal to 1024 this means max_freq 8x
> > higher than min_freq.  That is not totally unreasonable IMO and
> > because sg_cpu->iowait_boost grows exponentially, the difference
> > between 8x and, say, 4x is one iteration.
> >
> > > The first steps will all be below the min freq, so they'll just be
> > > transparent, while right now the iowait boost kicks in much faster :/
> >
> > There can be one iteration of a difference this way or that way AFAICS
> > and I'm not even sure how much of a performance difference that makes
> > in practice.
>
> Yeah I don't expect that to have a huge impact TBH but it'd be nice to
> actually get numbers to verify that, that's all I'm saying :-)
>
> You have 'funny' platforms like Juno r0 out there where the min/max
> frequencies are 450 MHz/850 MHz. In this case, starting from 128 you'll
> need 3 wake-ups to reach what is currently the starting point. I'm not
> sure if the impact is visible or not, but it's worth checking.

Please recall that the iowait boosting algo was different to start
with, though: it jumped to the max right away and then backed off.
That turned out to be overly aggressive in general and led to some
undesired "jittery" behavior, which is why it was changed.

Thus it looks like the platforms on which it still jumps to the max
right away may actually benefit from changing it to more steps. :-)

In turn, the platforms where it takes more than 3 steps for the boost
to get to the max would get a slight performance improvement from this
change and I'm not sure why that could be bad.

Moreover, it didn't depend on the min originally, just on the max and
just because I wanted the number of backoff steps needed to go back
down to zero to be independent of the platform IIRC.  The dependency
on the min is sort of artificial here and leads to arbitrary
differences in behavior between different platforms which isn't
particularly fortunate.

With all of that in mind, I would go right away to making the boost
independent of min and max (the final number of steps to reach the max
is TBD in practice, but 3 looks like a good enough compromise to me).

FWIW, the internal governor in intel_pstate has been changed along
these lines already (the change is waiting for Linus to pull it ATM).
Quentin Perret - March 7, 2019, 11:02 a.m.
Hi Rafael,

On Wednesday 06 Mar 2019 at 11:05:47 (+0100), Rafael J. Wysocki wrote:
> Please recall that the iowait boosting algo was different to start
> with, though: it jumped to the max right away and then backed off.
> That turned out to be overly aggressive in general and led to some
> undesired "jittery" behavior, which is why it was changed.
> 
> Thus it looks like the platforms on which it still jumps to the max
> right away may actually benefit from changing it to more steps. :-)

On the energy side of things at least ... ;-)

> In turn, the platforms where it takes more than 3 steps for the boost
> to get to the max would get a slight performance improvement from this
> change and I'm not sure why that could be bad.

For energy possibly ? IIUC this thread:

    https://patchwork.kernel.org/patch/9735885/

the original intent of the ramp thing for the iowait boost was to reduce
power consumption.

> Moreover, it didn't depend on the min originally, just on the max and
> just because I wanted the number of backoff steps needed to go back
> down to zero to be independent of the platform IIRC.  The dependency
> on the min is sort of artificial here and leads to arbitrary
> differences in behavior between different platforms which isn't
> particularly fortunate.

It's a question of perspective I would say. Right now you can say the
behaviour is somewhat coherent across platforms: getting an IOWAIT boost
means you'll run twice as fast regardless of your board. With the '128
approach', you may or may not run faster, depending on your set of OPPs.

Also on recent big little SoCs, the capacity ratio can be pretty high.
You can get little CPUs with 300 of capacity or so. The arbitrary 128
thing is basically gonna go near max freq in one step, although the CPUs
actually have 20 available OPPs or so. And I guess that's a shame.

For these reasons, I feel like it's not completely idiotic to have a
platform-dependent starting point, although arguably min_freq might not
always be the best choice.

> With all of that in mind, I would go right away to making the boost
> independent of min and max (the final number of steps to reach the max
> is TBD in practice, but 3 looks like a good enough compromise to me).

Perhaps the energy model could help find a good starting point, and a
good number of steps ?

FWIW there should be a slot at OSPM to discuss how sugov could be made
smarter using the EM :-).

> FWIW, the internal governor in intel_pstate has been changed along
> these lines already (the change is waiting for Linus to pull it ATM).

Thanks,
Quentin
Peter Zijlstra - March 7, 2019, 11:23 a.m.
On Thu, Mar 07, 2019 at 11:02:58AM +0000, Quentin Perret wrote:
> Also on recent big little SoCs, the capacity ratio can be pretty high.
> You can get little CPUs with 300 of capacity or so. The arbitrary 128
> thing is basically gonna go near max freq in one step, although the CPUs
> actually have 20 available OPPs or so. And I guess that's a shame.

Ah, but remember; with the latest patch the iowait_boost is scaled on
max. So in order to hit that 300, you first have to hit 1024.

+       boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
+       return max(boost, util);

So iowait_boost goes from 128 to 1024 in 4 wakeups (the first sets 128,
the next 3 get it to 1024), but the effective value is, given @max ==
300:

  128 -> 37
  256 -> 75
  512 -> 150
 1024 -> 300

so irrespective of the number of OPPs and their spread, it takes 4
wakeups to get to max.
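
The table above can be reproduced with a trivial standalone snippet (plain
userspace C, assuming SCHED_CAPACITY_SHIFT == 10 as implied by the quoted
patch):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10

int main(void)
{
	unsigned long max = 300;	/* little-CPU capacity from the example */
	unsigned long boost;

	for (boost = 128; boost <= 1024; boost *= 2)
		printf("%4lu -> %3lu\n", boost, (boost * max) >> SCHED_CAPACITY_SHIFT);

	return 0;
}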
Rafael J. Wysocki - March 7, 2019, 11:25 a.m.
On Thu, Mar 7, 2019 at 12:03 PM Quentin Perret <quentin.perret@arm.com> wrote:
>
> Hi Rafael,
>
> On Wednesday 06 Mar 2019 at 11:05:47 (+0100), Rafael J. Wysocki wrote:
> > Please recall that the iowait boosting algo was different to start
> > with, though: it jumped to the max right away and then backed off.
> > That turned out to be overly aggressive in general and led to some
> > undesired "jittery" behavior, which is why it was changed.
> >
> > Thus it looks like the platforms on which it still jumps to the max
> > right away may actually benefit from changing it to more steps. :-)
>
> On the energy side of things at least ... ;-)
>
> > In turn, the platforms where it takes more than 3 steps for the boost
> > to get to the max would get a slight performance improvement from this
> > change and I'm not sure why that could be bad.
>
> For energy possibly ? IIUC this thread:
>
>     https://patchwork.kernel.org/patch/9735885/
>
> the original intent of the ramp thing for the iowait boost was to reduce
> power consumption.
>
> > Moreover, it didn't depend on the min originally, just on the max and
> > just because I wanted the number of backoff steps needed to go back
> > down to zero to be independent of the platform IIRC.  The dependency
> > on the min is sort of artificial here and leads to arbitrary
> > differences in behavior between different platforms which isn't
> > particularly fortunate.
>
> It's a question of perspective I would say. Right now you can say the
> behaviour is somewhat coherent across platforms: getting an IOWAIT boost
> means you'll run twice as fast regardless of your board. With the '128
> approach', you may or may not run faster, depending on your set of OPPs.
>
> Also on recent big little SoCs, the capacity ratio can be pretty high.
> You can get little CPUs with 300 of capacity or so. The arbitrary 128
> thing is basically gonna go near max freq in one step, although the CPUs
> actually have 20 available OPPs or so. And I guess that's a shame.

OK, you seem to be arguing that on some platforms there is a little
difference between 128 and 1024 in terms of power, while there may be
a lot of difference between, say, 64 and 128.

I can buy that, but then it also makes sense to boost quickly enough,
so maybe it could start at the min and jump from there to 256 right
away in the first step?

> For these reasons, I feel like it's not completely idiotic to have a
> platform-dependent starting point, although arguably min_freq might not
> always be the best choice.
>
> > With all of that in mind, I would go right away to making the boost
> > independent of min and max (the final number of steps to reach the max
> > is TBD in practice, but 3 looks like a good enough compromise to me).
>
> Perhaps the energy model could help find a good starting point, and a
> good number of steps ?
>
> FWIW there should be a slot at OSPM to discuss how sugov could be made
> smarter using the EM :-).

Well, if you can make a case for that. :-)
Quentin Perret - March 7, 2019, 11:49 a.m.
On Thursday 07 Mar 2019 at 12:23:44 (+0100), Peter Zijlstra wrote:
> Ah, but remember; with the latest patch the iowait_boost is scaled on
> max. So in order to hit that 300, you first have to hit 1024.

Argh ! Right ... So yeah, my whole point about big.little platforms was
totally wrong :(

/me goes back to sleep
Quentin Perret - March 7, 2019, 11:59 a.m.
On Thursday 07 Mar 2019 at 12:25:10 (+0100), Rafael J. Wysocki wrote:
> On Thu, Mar 7, 2019 at 12:03 PM Quentin Perret <quentin.perret@arm.com> wrote:
> >
> > Hi Rafael,
> >
> > On Wednesday 06 Mar 2019 at 11:05:47 (+0100), Rafael J. Wysocki wrote:
> > > Please recall that the iowait boosting algo was different to start
> > > with, though: it jumped to the max right away and then backed off.
> > > That turned out to be overly aggressive in general and led to some
> > > undesired "jittery" behavior, which is why it was changed.
> > >
> > > Thus it looks like the platforms on which it still jumps to the max
> > > right away may actually benefit from changing it to more steps. :-)
> >
> > On the energy side of things at least ... ;-)
> >
> > > In turn, the platforms where it takes more than 3 steps for the boost
> > > to get to the max would get a slight performance improvement from this
> > > change and I'm not sure why that could be bad.
> >
> > For energy possibly ? IIUC this thread:
> >
> >     https://patchwork.kernel.org/patch/9735885/
> >
> > the original intent of the ramp thing for the iowait boost was to reduce
> > power consumption.
> >
> > > Moreover, it didn't depend on the min originally, just on the max and
> > > just because I wanted the number of backoff steps needed to go back
> > > down to zero to be independent of the platform IIRC.  The dependency
> > > on the min is sort of artificial here and leads to arbitrary
> > > differences in behavior between different platforms which isn't
> > > particularly fortunate.
> >
> > It's a question of perspective I would say. Right now you can say the
> > behaviour is somewhat coherent across platforms: getting an IOWAIT boost
> > means you'll run twice as fast regardless of your board. With the '128
> > approach', you may or may not run faster, depending on your set of OPPs.
> >
> > Also on recent big little SoCs, the capacity ratio can be pretty high.
> > You can get little CPUs with 300 of capacity or so. The arbitrary 128
> > thing is basically gonna go near max freq in one step, although the CPUs
> > actually have 20 available OPPs or so. And I guess that's a shame.
> 
> OK, you seem to be arguing that on some platforms there is a little
> difference between 128 and 1024 in terms of power, while there may be
> a lot of difference between, say, 64 and 128.

Well, in fact what I was saying here is wrong. As Peter said in another
email, we'll scale the boost to the CPU's cap, so even on little CPUs it
would take 4 steps to go to max ... So the problem I was trying to
highlight here simply doesn't exist.

So yes, please ignore that point :/

> I can buy that, but then it also makes sense to boost quickly enough,
> so maybe it could start at the min and jump from there to 256 right
> away in the first step?
> 
> > For these reasons, I feel like it's not completely idiotic to have a
> > platform-dependent starting point, although arguably min_freq might not
> > always be the best choice.
> >
> > > With all of that in mind, I would go right away to making the boost
> > > independent of min and max (the final number of steps to reach the max
> > > is TBD in practice, but 3 looks like a good enough compromise to me).
> >
> > Perhaps the energy model could help find a good starting point, and a
> > good number of steps ?
> >
> > FWIW there should be a slot at OSPM to discuss how sugov could be made
> > smarter using the EM :-).
> 
> Well, if you can make a case for that. :-)

:-)

Thanks,
Quentin

Patch

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -897,6 +897,36 @@  static void intel_pstate_update_policies
 		cpufreq_update_policy(cpu);
 }
 
+static void intel_pstate_update_max_freq(unsigned int cpu)
+{
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	struct cpufreq_policy new_policy;
+	struct cpudata *cpudata;
+
+	if (!policy)
+		return;
+
+	down_write(&policy->rwsem);
+
+	if (policy_is_inactive(policy))
+		goto unlock;
+
+	cpudata = all_cpu_data[cpu];
+	policy->cpuinfo.max_freq = global.turbo_disabled_upd ?
+			cpudata->pstate.max_freq : cpudata->pstate.turbo_freq;
+
+	memcpy(&new_policy, policy, sizeof(*policy));
+	new_policy.max = min(policy->user_policy.max, policy->cpuinfo.max_freq);
+	new_policy.min = min(policy->user_policy.min, new_policy.max);
+
+	cpufreq_set_policy(policy, &new_policy);
+
+unlock:
+	up_write(&policy->rwsem);
+
+	cpufreq_cpu_put(policy);
+}
+
 static void intel_pstate_update_limits(unsigned int cpu)
 {
 	mutex_lock(&intel_pstate_driver_lock);
@@ -908,7 +938,8 @@  static void intel_pstate_update_limits(u
 	 */
 	if (global.turbo_disabled_upd != global.turbo_disabled) {
 		global.turbo_disabled_upd = global.turbo_disabled;
-		intel_pstate_update_policies();
+		for_each_possible_cpu(cpu)
+			intel_pstate_update_max_freq(cpu);
 	} else {
 		cpufreq_update_policy(cpu);
 	}
Index: linux-pm/drivers/cpufreq/cpufreq.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq.c
+++ linux-pm/drivers/cpufreq/cpufreq.c
@@ -34,11 +34,6 @@ 
 
 static LIST_HEAD(cpufreq_policy_list);
 
-static inline bool policy_is_inactive(struct cpufreq_policy *policy)
-{
-	return cpumask_empty(policy->cpus);
-}
-
 /* Macros to iterate over CPU policies */
 #define for_each_suitable_policy(__policy, __active)			 \
 	list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
@@ -675,9 +670,6 @@  static ssize_t show_scaling_cur_freq(str
 	return ret;
 }
 
-static int cpufreq_set_policy(struct cpufreq_policy *policy,
-				struct cpufreq_policy *new_policy);
-
 /**
  * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access
  */
@@ -2235,8 +2227,8 @@  EXPORT_SYMBOL(cpufreq_get_policy);
  *
  * The cpuinfo part of @policy is not updated by this function.
  */
-static int cpufreq_set_policy(struct cpufreq_policy *policy,
-			      struct cpufreq_policy *new_policy)
+int cpufreq_set_policy(struct cpufreq_policy *policy,
+		       struct cpufreq_policy *new_policy)
 {
 	struct cpufreq_governor *old_gov;
 	int ret;
Index: linux-pm/include/linux/cpufreq.h
===================================================================
--- linux-pm.orig/include/linux/cpufreq.h
+++ linux-pm/include/linux/cpufreq.h
@@ -178,6 +178,11 @@  static inline struct cpufreq_policy *cpu
 static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { }
 #endif
 
+static inline bool policy_is_inactive(struct cpufreq_policy *policy)
+{
+	return cpumask_empty(policy->cpus);
+}
+
 static inline bool policy_is_shared(struct cpufreq_policy *policy)
 {
 	return cpumask_weight(policy->cpus) > 1;
@@ -194,6 +199,8 @@  void disable_cpufreq(void);
 
 u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
+int cpufreq_set_policy(struct cpufreq_policy *policy,
+		       struct cpufreq_policy *new_policy);
 void cpufreq_update_policy(unsigned int cpu);
 void cpufreq_update_limits(unsigned int cpu);
 bool have_governor_per_policy(void);