Patchwork [V2] cpufreq: Call transition notifier only once for each policy

login
register
mail settings
Submitter Viresh Kumar
Date March 15, 2019, 9:13 a.m.
Message ID <e2b56be446eeb67f1905d2ac6f8d82dd4389d0c5.1552640968.git.viresh.kumar@linaro.org>
Download mbox | patch
Permalink /patch/749409/
State New
Headers show

Comments

Viresh Kumar - March 15, 2019, 9:13 a.m.
Currently we call these notifiers once for each CPU of the policy->cpus
cpumask. It would be more optimal if the notifier can be called only
once and all the relevant information be provided to it. Out of the 24
drivers that register for the transition notifiers today, only 5 of them
do per-cpu updates and the callback for the rest can be called only once
for the policy without any impact.

This would also avoid multiple function calls to the notifier callbacks
and reduce multiple iterations of notifier core's code (which does
locking as well).

This patch adds pointer to the cpufreq policy to the struct
cpufreq_freqs, so the notifier callback has all the information
available to it with a single call. The five drivers which perform
per-cpu updates are updated to use the cpufreq policy. The freqs->cpu
field is redundant now and is removed.

Acked-by: David S. Miller <davem@davemloft.net> (sparc)
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
V1->V2:
- Add cpufreq policy instead of cpus in struct cpufreq_freqs.
- Use policy->cpus instead of related_cpus everywhere in order not to
  change the existing behavior.
- Merged all 7 patches into a single patch.
- Updated changlog and included Ack from David.

 arch/arm/kernel/smp.c       | 24 +++++++++++++++---------
 arch/arm/kernel/smp_twd.c   |  9 ++++++---
 arch/sparc/kernel/time_64.c | 28 ++++++++++++++++------------
 arch/x86/kernel/tsc.c       | 32 +++++++++++++++++++++-----------
 arch/x86/kvm/x86.c          | 31 ++++++++++++++++++++-----------
 drivers/cpufreq/cpufreq.c   | 19 ++++++++++---------
 include/linux/cpufreq.h     | 14 +++++++-------
 7 files changed, 95 insertions(+), 62 deletions(-)
Peter Zijlstra - March 15, 2019, 12:29 p.m.
On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 3fae23834069..cff8779fc0d2 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
>  				void *data)
>  {
>  	struct cpufreq_freqs *freq = data;
> -	unsigned long *lpj;
> -
> -	lpj = &boot_cpu_data.loops_per_jiffy;
> -#ifdef CONFIG_SMP
> -	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> -		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> -#endif
> +	struct cpumask *cpus = freq->policy->cpus;
> +	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> +	unsigned long lpj;
> +	int cpu;
>  
>  	if (!ref_freq) {
>  		ref_freq = freq->old;
> -		loops_per_jiffy_ref = *lpj;
>  		tsc_khz_ref = tsc_khz;
> +
> +		if (boot_cpu)
> +			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> +		else
> +			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
>  	}
> +
>  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
>  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> -		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> -
> +		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
>  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> +
>  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
>  			mark_tsc_unstable("cpufreq changes");
>  
> -		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> +		if (boot_cpu) {
> +			boot_cpu_data.loops_per_jiffy = lpj;
> +		} else {
> +			for_each_cpu(cpu, cpus)
> +				cpu_data(cpu).loops_per_jiffy = lpj;
> +		}
> +
> +		for_each_cpu(cpu, cpus)
> +			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());

This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
question. That's part of the whole problem here, TSC isn't sync'ed when
it's subject to CPUFREQ.

>  	}
>  
>  	return 0;
Viresh Kumar - March 18, 2019, 2:35 a.m.
On 15-03-19, 13:29, Peter Zijlstra wrote:
> On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index 3fae23834069..cff8779fc0d2 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> >  				void *data)
> >  {
> >  	struct cpufreq_freqs *freq = data;
> > -	unsigned long *lpj;
> > -
> > -	lpj = &boot_cpu_data.loops_per_jiffy;
> > -#ifdef CONFIG_SMP
> > -	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > -		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > -#endif
> > +	struct cpumask *cpus = freq->policy->cpus;
> > +	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > +	unsigned long lpj;
> > +	int cpu;
> >  
> >  	if (!ref_freq) {
> >  		ref_freq = freq->old;
> > -		loops_per_jiffy_ref = *lpj;
> >  		tsc_khz_ref = tsc_khz;
> > +
> > +		if (boot_cpu)
> > +			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > +		else
> > +			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> >  	}
> > +
> >  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> >  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > -		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > -
> > +		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> >  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > +
> >  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> >  			mark_tsc_unstable("cpufreq changes");
> >  
> > -		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > +		if (boot_cpu) {
> > +			boot_cpu_data.loops_per_jiffy = lpj;
> > +		} else {
> > +			for_each_cpu(cpu, cpus)
> > +				cpu_data(cpu).loops_per_jiffy = lpj;
> > +		}
> > +
> > +		for_each_cpu(cpu, cpus)
> > +			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> 
> This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> question.

You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
that and it was left for the notifier to do. This patch doesn't change the
behavior at all, just that it moves the for-loop to the notifier instead of the
cpufreq core.

> That's part of the whole problem here, TSC isn't sync'ed when
> it's subject to CPUFREQ.
> 
> >  	}
> >  
> >  	return 0;
Rafael J. Wysocki - March 18, 2019, 10:45 a.m.
On Fri, Mar 15, 2019 at 1:30 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > index 3fae23834069..cff8779fc0d2 100644
> > --- a/arch/x86/kernel/tsc.c
> > +++ b/arch/x86/kernel/tsc.c
> > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> >                               void *data)
> >  {
> >       struct cpufreq_freqs *freq = data;
> > -     unsigned long *lpj;
> > -
> > -     lpj = &boot_cpu_data.loops_per_jiffy;
> > -#ifdef CONFIG_SMP
> > -     if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > -             lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > -#endif
> > +     struct cpumask *cpus = freq->policy->cpus;
> > +     bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > +     unsigned long lpj;
> > +     int cpu;
> >
> >       if (!ref_freq) {
> >               ref_freq = freq->old;
> > -             loops_per_jiffy_ref = *lpj;
> >               tsc_khz_ref = tsc_khz;
> > +
> > +             if (boot_cpu)
> > +                     loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > +             else
> > +                     loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> >       }
> > +
> >       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> >                       (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > -             *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > -
> > +             lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> >               tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > +
> >               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> >                       mark_tsc_unstable("cpufreq changes");
> >
> > -             set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > +             if (boot_cpu) {
> > +                     boot_cpu_data.loops_per_jiffy = lpj;
> > +             } else {
> > +                     for_each_cpu(cpu, cpus)
> > +                             cpu_data(cpu).loops_per_jiffy = lpj;
> > +             }
> > +
> > +             for_each_cpu(cpu, cpus)
> > +                     set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
>
> This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> question.

Well, strictly speaking the TSC value here comes from the CPU running the code.

The original code has this problem too, though (as Viresh said), so
the patch really doesn't make it worse in that respect. :-)

I'm not going to defend the original code (I ldidn't invent it
anyway), but it clearly assumes that different CPUs cannot run at
different frequencies and that kind of explains what happens in it.

> That's part of the whole problem here, TSC isn't sync'ed when
> it's subject to CPUFREQ.

So what would you recommend us to do here?

Obviously, this won't run on any new hardware.  Frankly, I'm not even
sure what the most recent HW where this hack would make a difference
is (the comment talking about Opterons suggests early 2000s), so this
clearly falls into the "legacy" bucket to me.

Does it make sense to try to preserve it, or can we simply make
cpufreq init fail on the systems where the TSC rate depends on the
frequency?
Peter Zijlstra - March 18, 2019, 10:53 a.m.
On Mon, Mar 18, 2019 at 08:05:14AM +0530, Viresh Kumar wrote:
> On 15-03-19, 13:29, Peter Zijlstra wrote:
> > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > index 3fae23834069..cff8779fc0d2 100644
> > > --- a/arch/x86/kernel/tsc.c
> > > +++ b/arch/x86/kernel/tsc.c
> > > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > >  				void *data)
> > >  {
> > >  	struct cpufreq_freqs *freq = data;
> > > -	unsigned long *lpj;
> > > -
> > > -	lpj = &boot_cpu_data.loops_per_jiffy;
> > > -#ifdef CONFIG_SMP
> > > -	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > -		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > -#endif
> > > +	struct cpumask *cpus = freq->policy->cpus;
> > > +	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > > +	unsigned long lpj;
> > > +	int cpu;
> > >  
> > >  	if (!ref_freq) {
> > >  		ref_freq = freq->old;
> > > -		loops_per_jiffy_ref = *lpj;
> > >  		tsc_khz_ref = tsc_khz;
> > > +
> > > +		if (boot_cpu)
> > > +			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > > +		else
> > > +			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> > >  	}
> > > +
> > >  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> > >  			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > > -		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > -
> > > +		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > >  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > > +
> > >  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > >  			mark_tsc_unstable("cpufreq changes");
> > >  
> > > -		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > +		if (boot_cpu) {
> > > +			boot_cpu_data.loops_per_jiffy = lpj;
> > > +		} else {
> > > +			for_each_cpu(cpu, cpus)
> > > +				cpu_data(cpu).loops_per_jiffy = lpj;
> > > +		}
> > > +
> > > +		for_each_cpu(cpu, cpus)
> > > +			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> > 
> > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > question.
> 
> You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
> that and it was left for the notifier to do. This patch doesn't change the
> behavior at all, just that it moves the for-loop to the notifier instead of the
> cpufreq core.

Yuck..

Rafael; how does this work in practise? Earlier you said that on x86 the
policies typically have a single cpu in them anyway. Is the freq change
also notified from _that_ cpu?

I don't think I have old enough hardware around anymore to test any of
this. This was truly ancient p6 era stuff IIRC.

Because in that case, I'm all for not doing the changes to this notifier
Viresh is proposing but simply adding something like:


	WARN_ON_ONCE(cpumask_weight(cpuc) != 1);
	WARN_ON_ONCE(cpumask_first(cpuc) != smp_processor_id());

And leave it at that.
Peter Zijlstra - March 18, 2019, 11:01 a.m.
On Mon, Mar 18, 2019 at 11:45:00AM +0100, Rafael J. Wysocki wrote:
> On Fri, Mar 15, 2019 at 1:30 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:

> > > +             for_each_cpu(cpu, cpus)
> > > +                     set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> >
> > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > question.
> 
> Well, strictly speaking the TSC value here comes from the CPU running the code.
> 
> The original code has this problem too, though (as Viresh said), so
> the patch really doesn't make it worse in that respect. :-)
> 
> I'm not going to defend the original code (I ldidn't invent it
> anyway), but it clearly assumes that different CPUs cannot run at
> different frequencies and that kind of explains what happens in it.

The assumption was always that if CPUs ran at different frequencies, the
notifier would run on the affected CPU. After all, only that CPU would
know its frequency changed.

> > That's part of the whole problem here, TSC isn't sync'ed when
> > it's subject to CPUFREQ.
> 
> So what would you recommend us to do here?
> 
> Obviously, this won't run on any new hardware.  Frankly, I'm not even
> sure what the most recent HW where this hack would make a difference
> is (the comment talking about Opterons suggests early 2000s), so this
> clearly falls into the "legacy" bucket to me.
> 
> Does it make sense to try to preserve it, or can we simply make
> cpufreq init fail on the systems where the TSC rate depends on the
> frequency?

I'm all for deleting this and basically dropping support for anything
that needs this, but I suspect some people digging their legacy systems
(*cough* Pavel *cough*) might object to that.

Heck, I'm even ok with just calling panic() when TSC goes wobbly :-) I'm
fed up with all that broken crap. And yes, I know, that's systems sold
today :-(
Rafael J. Wysocki - March 18, 2019, 11:09 a.m.
On Mon, Mar 18, 2019 at 11:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Mar 18, 2019 at 08:05:14AM +0530, Viresh Kumar wrote:
> > On 15-03-19, 13:29, Peter Zijlstra wrote:
> > > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > > index 3fae23834069..cff8779fc0d2 100644
> > > > --- a/arch/x86/kernel/tsc.c
> > > > +++ b/arch/x86/kernel/tsc.c
> > > > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > > >                           void *data)
> > > >  {
> > > >   struct cpufreq_freqs *freq = data;
> > > > - unsigned long *lpj;
> > > > -
> > > > - lpj = &boot_cpu_data.loops_per_jiffy;
> > > > -#ifdef CONFIG_SMP
> > > > - if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > > -         lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > > -#endif
> > > > + struct cpumask *cpus = freq->policy->cpus;
> > > > + bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > > > + unsigned long lpj;
> > > > + int cpu;
> > > >
> > > >   if (!ref_freq) {
> > > >           ref_freq = freq->old;
> > > > -         loops_per_jiffy_ref = *lpj;
> > > >           tsc_khz_ref = tsc_khz;
> > > > +
> > > > +         if (boot_cpu)
> > > > +                 loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > > > +         else
> > > > +                 loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> > > >   }
> > > > +
> > > >   if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> > > >                   (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > > > -         *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > > -
> > > > +         lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > >           tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > > > +
> > > >           if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > >                   mark_tsc_unstable("cpufreq changes");
> > > >
> > > > -         set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > > +         if (boot_cpu) {
> > > > +                 boot_cpu_data.loops_per_jiffy = lpj;
> > > > +         } else {
> > > > +                 for_each_cpu(cpu, cpus)
> > > > +                         cpu_data(cpu).loops_per_jiffy = lpj;
> > > > +         }
> > > > +
> > > > +         for_each_cpu(cpu, cpus)
> > > > +                 set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> > >
> > > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > > question.
> >
> > You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
> > that and it was left for the notifier to do. This patch doesn't change the
> > behavior at all, just that it moves the for-loop to the notifier instead of the
> > cpufreq core.
>
> Yuck..
>
> Rafael; how does this work in practise? Earlier you said that on x86 the
> policies typically have a single cpu in them anyway.

Yes.

> Is the freq change also notified from _that_ cpu?

May not be, depending on what CPU runs the work item/thread changing
the freq.  It generally is not guaranteed to always be the same as the
target CPU.

> I don't think I have old enough hardware around anymore to test any of
> this. This was truly ancient p6 era stuff IIRC.
>
> Because in that case, I'm all for not doing the changes to this notifier
> Viresh is proposing but simply adding something like:
>
>
>         WARN_ON_ONCE(cpumask_weight(cpuc) != 1);
>         WARN_ON_ONCE(cpumask_first(cpuc) != smp_processor_id());
>
> And leave it at that.

That may not work I'm afraid.
Rafael J. Wysocki - March 18, 2019, 11:20 a.m.
On Mon, Mar 18, 2019 at 12:09 PM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Mon, Mar 18, 2019 at 11:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, Mar 18, 2019 at 08:05:14AM +0530, Viresh Kumar wrote:
> > > On 15-03-19, 13:29, Peter Zijlstra wrote:
> > > > On Fri, Mar 15, 2019 at 02:43:07PM +0530, Viresh Kumar wrote:
> > > > > diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> > > > > index 3fae23834069..cff8779fc0d2 100644
> > > > > --- a/arch/x86/kernel/tsc.c
> > > > > +++ b/arch/x86/kernel/tsc.c
> > > > > @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
> > > > >                           void *data)
> > > > >  {
> > > > >   struct cpufreq_freqs *freq = data;
> > > > > - unsigned long *lpj;
> > > > > -
> > > > > - lpj = &boot_cpu_data.loops_per_jiffy;
> > > > > -#ifdef CONFIG_SMP
> > > > > - if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > > > -         lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> > > > > -#endif
> > > > > + struct cpumask *cpus = freq->policy->cpus;
> > > > > + bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> > > > > + unsigned long lpj;
> > > > > + int cpu;
> > > > >
> > > > >   if (!ref_freq) {
> > > > >           ref_freq = freq->old;
> > > > > -         loops_per_jiffy_ref = *lpj;
> > > > >           tsc_khz_ref = tsc_khz;
> > > > > +
> > > > > +         if (boot_cpu)
> > > > > +                 loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> > > > > +         else
> > > > > +                 loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
> > > > >   }
> > > > > +
> > > > >   if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> > > > >                   (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> > > > > -         *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > > > -
> > > > > +         lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> > > > >           tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> > > > > +
> > > > >           if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> > > > >                   mark_tsc_unstable("cpufreq changes");
> > > > >
> > > > > -         set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> > > > > +         if (boot_cpu) {
> > > > > +                 boot_cpu_data.loops_per_jiffy = lpj;
> > > > > +         } else {
> > > > > +                 for_each_cpu(cpu, cpus)
> > > > > +                         cpu_data(cpu).loops_per_jiffy = lpj;
> > > > > +         }
> > > > > +
> > > > > +         for_each_cpu(cpu, cpus)
> > > > > +                 set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
> > > >
> > > > This code doesn't make sense, the rdtsc() _must_ be called on the CPU in
> > > > question.
> > >
> > > You mean rdtsc() must be locally on that CPU? The cpufreq core never guaranteed
> > > that and it was left for the notifier to do. This patch doesn't change the
> > > behavior at all, just that it moves the for-loop to the notifier instead of the
> > > cpufreq core.
> >
> > Yuck..
> >
> > Rafael; how does this work in practise? Earlier you said that on x86 the
> > policies typically have a single cpu in them anyway.
>
> Yes.
>
> > Is the freq change also notified from _that_ cpu?
>
> May not be, depending on what CPU runs the work item/thread changing
> the freq.  It generally is not guaranteed to always be the same as the
> target CPU.

Actually, scratch that.

On x86, with one CPU per cpufreq policy, that will always be the target CPU.

> > I don't think I have old enough hardware around anymore to test any of
> > this. This was truly ancient p6 era stuff IIRC.
> >
> > Because in that case, I'm all for not doing the changes to this notifier
> > Viresh is proposing but simply adding something like:
> >
> >
> >         WARN_ON_ONCE(cpumask_weight(cpuc) != 1);
> >         WARN_ON_ONCE(cpumask_first(cpuc) != smp_processor_id());
> >
> > And leave it at that.
>
> That may not work I'm afraid.

So something like that could work.
Rafael J. Wysocki - March 18, 2019, 11:49 a.m.
On Fri, Mar 15, 2019 at 10:13 AM Viresh Kumar <viresh.kumar@linaro.org> wrote:
>
> Currently we call these notifiers once for each CPU of the policy->cpus
> cpumask. It would be more optimal if the notifier can be called only
> once and all the relevant information be provided to it. Out of the 24
> drivers that register for the transition notifiers today, only 5 of them
> do per-cpu updates and the callback for the rest can be called only once
> for the policy without any impact.
>
> This would also avoid multiple function calls to the notifier callbacks
> and reduce multiple iterations of notifier core's code (which does
> locking as well).
>
> This patch adds pointer to the cpufreq policy to the struct
> cpufreq_freqs, so the notifier callback has all the information
> available to it with a single call. The five drivers which perform
> per-cpu updates are updated to use the cpufreq policy. The freqs->cpu
> field is redundant now and is removed.
>
> Acked-by: David S. Miller <davem@davemloft.net> (sparc)
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
> V1->V2:
> - Add cpufreq policy instead of cpus in struct cpufreq_freqs.
> - Use policy->cpus instead of related_cpus everywhere in order not to
>   change the existing behavior.
> - Merged all 7 patches into a single patch.
> - Updated changlog and included Ack from David.
>
>  arch/arm/kernel/smp.c       | 24 +++++++++++++++---------
>  arch/arm/kernel/smp_twd.c   |  9 ++++++---
>  arch/sparc/kernel/time_64.c | 28 ++++++++++++++++------------
>  arch/x86/kernel/tsc.c       | 32 +++++++++++++++++++++-----------
>  arch/x86/kvm/x86.c          | 31 ++++++++++++++++++++-----------
>  drivers/cpufreq/cpufreq.c   | 19 ++++++++++---------
>  include/linux/cpufreq.h     | 14 +++++++-------
>  7 files changed, 95 insertions(+), 62 deletions(-)
>
> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> index 1d6f5ea522f4..6f6b981fecda 100644
> --- a/arch/arm/kernel/smp.c
> +++ b/arch/arm/kernel/smp.c
> @@ -758,15 +758,20 @@ static int cpufreq_callback(struct notifier_block *nb,
>                                         unsigned long val, void *data)
>  {
>         struct cpufreq_freqs *freq = data;
> -       int cpu = freq->cpu;
> +       struct cpumask *cpus = freq->policy->cpus;
> +       int cpu, first = cpumask_first(cpus);
> +       unsigned int lpj;
>
>         if (freq->flags & CPUFREQ_CONST_LOOPS)
>                 return NOTIFY_OK;
>
> -       if (!per_cpu(l_p_j_ref, cpu)) {
> -               per_cpu(l_p_j_ref, cpu) =
> -                       per_cpu(cpu_data, cpu).loops_per_jiffy;
> -               per_cpu(l_p_j_ref_freq, cpu) = freq->old;
> +       if (!per_cpu(l_p_j_ref, first)) {
> +               for_each_cpu(cpu, cpus) {
> +                       per_cpu(l_p_j_ref, cpu) =
> +                               per_cpu(cpu_data, cpu).loops_per_jiffy;
> +                       per_cpu(l_p_j_ref_freq, cpu) = freq->old;
> +               }
> +
>                 if (!global_l_p_j_ref) {
>                         global_l_p_j_ref = loops_per_jiffy;
>                         global_l_p_j_ref_freq = freq->old;
> @@ -778,10 +783,11 @@ static int cpufreq_callback(struct notifier_block *nb,
>                 loops_per_jiffy = cpufreq_scale(global_l_p_j_ref,
>                                                 global_l_p_j_ref_freq,
>                                                 freq->new);
> -               per_cpu(cpu_data, cpu).loops_per_jiffy =
> -                       cpufreq_scale(per_cpu(l_p_j_ref, cpu),
> -                                       per_cpu(l_p_j_ref_freq, cpu),
> -                                       freq->new);
> +
> +               lpj = cpufreq_scale(per_cpu(l_p_j_ref, first),
> +                                   per_cpu(l_p_j_ref_freq, first), freq->new);
> +               for_each_cpu(cpu, cpus)
> +                       per_cpu(cpu_data, cpu).loops_per_jiffy = lpj;
>         }
>         return NOTIFY_OK;
>  }
> diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
> index b30eafeef096..495cc7282096 100644
> --- a/arch/arm/kernel/smp_twd.c
> +++ b/arch/arm/kernel/smp_twd.c
> @@ -162,15 +162,18 @@ static int twd_cpufreq_transition(struct notifier_block *nb,
>         unsigned long state, void *data)
>  {
>         struct cpufreq_freqs *freqs = data;
> +       int cpu;
>
>         /*
>          * The twd clock events must be reprogrammed to account for the new
>          * frequency.  The timer is local to a cpu, so cross-call to the
>          * changing cpu.
>          */
> -       if (state == CPUFREQ_POSTCHANGE)
> -               smp_call_function_single(freqs->cpu, twd_update_frequency,
> -                       NULL, 1);
> +       if (state != CPUFREQ_POSTCHANGE)
> +               return NOTIFY_OK;
> +
> +       for_each_cpu(cpu, freqs->policy->cpus)
> +               smp_call_function_single(cpu, twd_update_frequency, NULL, 1);
>
>         return NOTIFY_OK;
>  }
> diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
> index 3eb77943ce12..89fb05f90609 100644
> --- a/arch/sparc/kernel/time_64.c
> +++ b/arch/sparc/kernel/time_64.c
> @@ -653,19 +653,23 @@ static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val
>                                     void *data)
>  {
>         struct cpufreq_freqs *freq = data;
> -       unsigned int cpu = freq->cpu;
> -       struct freq_table *ft = &per_cpu(sparc64_freq_table, cpu);
> +       unsigned int cpu;
> +       struct freq_table *ft;
>
> -       if (!ft->ref_freq) {
> -               ft->ref_freq = freq->old;
> -               ft->clock_tick_ref = cpu_data(cpu).clock_tick;
> -       }
> -       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> -           (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> -               cpu_data(cpu).clock_tick =
> -                       cpufreq_scale(ft->clock_tick_ref,
> -                                     ft->ref_freq,
> -                                     freq->new);
> +       for_each_cpu(cpu, freq->policy->cpus) {
> +               ft = &per_cpu(sparc64_freq_table, cpu);
> +
> +               if (!ft->ref_freq) {
> +                       ft->ref_freq = freq->old;
> +                       ft->clock_tick_ref = cpu_data(cpu).clock_tick;
> +               }
> +
> +               if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
> +                   (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> +                       cpu_data(cpu).clock_tick =
> +                               cpufreq_scale(ft->clock_tick_ref, ft->ref_freq,
> +                                             freq->new);
> +               }
>         }
>
>         return 0;
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 3fae23834069..cff8779fc0d2 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -956,28 +956,38 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
>                                 void *data)
>  {
>         struct cpufreq_freqs *freq = data;
> -       unsigned long *lpj;
> -
> -       lpj = &boot_cpu_data.loops_per_jiffy;
> -#ifdef CONFIG_SMP
> -       if (!(freq->flags & CPUFREQ_CONST_LOOPS))
> -               lpj = &cpu_data(freq->cpu).loops_per_jiffy;
> -#endif
> +       struct cpumask *cpus = freq->policy->cpus;
> +       bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
> +       unsigned long lpj;
> +       int cpu;
>
>         if (!ref_freq) {
>                 ref_freq = freq->old;
> -               loops_per_jiffy_ref = *lpj;
>                 tsc_khz_ref = tsc_khz;
> +
> +               if (boot_cpu)
> +                       loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
> +               else
> +                       loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
>         }
> +
>         if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
>                         (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
> -               *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
> -
> +               lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
>                 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
> +
>                 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
>                         mark_tsc_unstable("cpufreq changes");
>
> -               set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
> +               if (boot_cpu) {
> +                       boot_cpu_data.loops_per_jiffy = lpj;
> +               } else {
> +                       for_each_cpu(cpu, cpus)
> +                               cpu_data(cpu).loops_per_jiffy = lpj;
> +               }
> +
> +               for_each_cpu(cpu, cpus)
> +                       set_cyc2ns_scale(tsc_khz, cpu, rdtsc());

To summarize, I think that it would be sufficient to do this just for
policy->cpu and, as Peter said, warn once if there are more CPUs in
the policy or policy->cpu is not the CPU running this code.  And mark
the TSC as unstable in both of these cases.

Patch

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 1d6f5ea522f4..6f6b981fecda 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -758,15 +758,20 @@  static int cpufreq_callback(struct notifier_block *nb,
 					unsigned long val, void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	int cpu = freq->cpu;
+	struct cpumask *cpus = freq->policy->cpus;
+	int cpu, first = cpumask_first(cpus);
+	unsigned int lpj;
 
 	if (freq->flags & CPUFREQ_CONST_LOOPS)
 		return NOTIFY_OK;
 
-	if (!per_cpu(l_p_j_ref, cpu)) {
-		per_cpu(l_p_j_ref, cpu) =
-			per_cpu(cpu_data, cpu).loops_per_jiffy;
-		per_cpu(l_p_j_ref_freq, cpu) = freq->old;
+	if (!per_cpu(l_p_j_ref, first)) {
+		for_each_cpu(cpu, cpus) {
+			per_cpu(l_p_j_ref, cpu) =
+				per_cpu(cpu_data, cpu).loops_per_jiffy;
+			per_cpu(l_p_j_ref_freq, cpu) = freq->old;
+		}
+
 		if (!global_l_p_j_ref) {
 			global_l_p_j_ref = loops_per_jiffy;
 			global_l_p_j_ref_freq = freq->old;
@@ -778,10 +783,11 @@  static int cpufreq_callback(struct notifier_block *nb,
 		loops_per_jiffy = cpufreq_scale(global_l_p_j_ref,
 						global_l_p_j_ref_freq,
 						freq->new);
-		per_cpu(cpu_data, cpu).loops_per_jiffy =
-			cpufreq_scale(per_cpu(l_p_j_ref, cpu),
-					per_cpu(l_p_j_ref_freq, cpu),
-					freq->new);
+
+		lpj = cpufreq_scale(per_cpu(l_p_j_ref, first),
+				    per_cpu(l_p_j_ref_freq, first), freq->new);
+		for_each_cpu(cpu, cpus)
+			per_cpu(cpu_data, cpu).loops_per_jiffy = lpj;
 	}
 	return NOTIFY_OK;
 }
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index b30eafeef096..495cc7282096 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -162,15 +162,18 @@  static int twd_cpufreq_transition(struct notifier_block *nb,
 	unsigned long state, void *data)
 {
 	struct cpufreq_freqs *freqs = data;
+	int cpu;
 
 	/*
 	 * The twd clock events must be reprogrammed to account for the new
 	 * frequency.  The timer is local to a cpu, so cross-call to the
 	 * changing cpu.
 	 */
-	if (state == CPUFREQ_POSTCHANGE)
-		smp_call_function_single(freqs->cpu, twd_update_frequency,
-			NULL, 1);
+	if (state != CPUFREQ_POSTCHANGE)
+		return NOTIFY_OK;
+
+	for_each_cpu(cpu, freqs->policy->cpus)
+		smp_call_function_single(cpu, twd_update_frequency, NULL, 1);
 
 	return NOTIFY_OK;
 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 3eb77943ce12..89fb05f90609 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -653,19 +653,23 @@  static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val
 				    void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	unsigned int cpu = freq->cpu;
-	struct freq_table *ft = &per_cpu(sparc64_freq_table, cpu);
+	unsigned int cpu;
+	struct freq_table *ft;
 
-	if (!ft->ref_freq) {
-		ft->ref_freq = freq->old;
-		ft->clock_tick_ref = cpu_data(cpu).clock_tick;
-	}
-	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
-		cpu_data(cpu).clock_tick =
-			cpufreq_scale(ft->clock_tick_ref,
-				      ft->ref_freq,
-				      freq->new);
+	for_each_cpu(cpu, freq->policy->cpus) {
+		ft = &per_cpu(sparc64_freq_table, cpu);
+
+		if (!ft->ref_freq) {
+			ft->ref_freq = freq->old;
+			ft->clock_tick_ref = cpu_data(cpu).clock_tick;
+		}
+
+		if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+		    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+			cpu_data(cpu).clock_tick =
+				cpufreq_scale(ft->clock_tick_ref, ft->ref_freq,
+					      freq->new);
+		}
 	}
 
 	return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 3fae23834069..cff8779fc0d2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -956,28 +956,38 @@  static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	unsigned long *lpj;
-
-	lpj = &boot_cpu_data.loops_per_jiffy;
-#ifdef CONFIG_SMP
-	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#endif
+	struct cpumask *cpus = freq->policy->cpus;
+	bool boot_cpu = !IS_ENABLED(CONFIG_SMP) || freq->flags & CPUFREQ_CONST_LOOPS;
+	unsigned long lpj;
+	int cpu;
 
 	if (!ref_freq) {
 		ref_freq = freq->old;
-		loops_per_jiffy_ref = *lpj;
 		tsc_khz_ref = tsc_khz;
+
+		if (boot_cpu)
+			loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
+		else
+			loops_per_jiffy_ref = cpu_data(cpumask_first(cpus)).loops_per_jiffy;
 	}
+
 	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
 			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
-		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-
+		lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 			mark_tsc_unstable("cpufreq changes");
 
-		set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
+		if (boot_cpu) {
+			boot_cpu_data.loops_per_jiffy = lpj;
+		} else {
+			for_each_cpu(cpu, cpus)
+				cpu_data(cpu).loops_per_jiffy = lpj;
+		}
+
+		for_each_cpu(cpu, cpus)
+			set_cyc2ns_scale(tsc_khz, cpu, rdtsc());
 	}
 
 	return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 941f932373d0..653c7da11647 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6648,10 +6648,8 @@  static void kvm_hyperv_tsc_notifier(void)
 }
 #endif
 
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
-				     void *data)
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 {
-	struct cpufreq_freqs *freq = data;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int i, send_ipi = 0;
@@ -6695,17 +6693,12 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	 *
 	 */
 
-	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
-		return 0;
-	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
-		return 0;
-
-	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
 	spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (vcpu->cpu != freq->cpu)
+			if (vcpu->cpu != cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 			if (vcpu->cpu != smp_processor_id())
@@ -6727,8 +6720,24 @@  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 		 * guest context is entered kvmclock will be updated,
 		 * so the guest will not see stale values.
 		 */
-		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+		smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 	}
+}
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				     void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	int cpu;
+
+	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+		return 0;
+	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+		return 0;
+
+	for_each_cpu(cpu, freq->policy->cpus)
+		__kvmclock_cpufreq_notifier(freq, cpu);
+
 	return 0;
 }
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e10922709d13..fba38bf27d26 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -300,11 +300,14 @@  static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 				      struct cpufreq_freqs *freqs,
 				      unsigned int state)
 {
+	int cpu;
+
 	BUG_ON(irqs_disabled());
 
 	if (cpufreq_disabled())
 		return;
 
+	freqs->policy = policy;
 	freqs->flags = cpufreq_driver->flags;
 	pr_debug("notification %u of frequency transition to %u kHz\n",
 		 state, freqs->new);
@@ -324,10 +327,8 @@  static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 			}
 		}
 
-		for_each_cpu(freqs->cpu, policy->cpus) {
-			srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
-						 CPUFREQ_PRECHANGE, freqs);
-		}
+		srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
+					 CPUFREQ_PRECHANGE, freqs);
 
 		adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
 		break;
@@ -337,11 +338,11 @@  static void cpufreq_notify_transition(struct cpufreq_policy *policy,
 		pr_debug("FREQ: %u - CPUs: %*pbl\n", freqs->new,
 			 cpumask_pr_args(policy->cpus));
 
-		for_each_cpu(freqs->cpu, policy->cpus) {
-			trace_cpu_frequency(freqs->new, freqs->cpu);
-			srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
-						 CPUFREQ_POSTCHANGE, freqs);
-		}
+		for_each_cpu(cpu, policy->cpus)
+			trace_cpu_frequency(freqs->new, cpu);
+
+		srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
+					 CPUFREQ_POSTCHANGE, freqs);
 
 		cpufreq_stats_record_transition(policy, freqs->new);
 		policy->cur = freqs->new;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index b160e98076e3..e327523ddff2 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -42,13 +42,6 @@  enum cpufreq_table_sorting {
 	CPUFREQ_TABLE_SORTED_DESCENDING
 };
 
-struct cpufreq_freqs {
-	unsigned int cpu;	/* cpu nr */
-	unsigned int old;
-	unsigned int new;
-	u8 flags;		/* flags of cpufreq_driver, see below. */
-};
-
 struct cpufreq_cpuinfo {
 	unsigned int		max_freq;
 	unsigned int		min_freq;
@@ -156,6 +149,13 @@  struct cpufreq_policy {
 	struct thermal_cooling_device *cdev;
 };
 
+struct cpufreq_freqs {
+	struct cpufreq_policy *policy;
+	unsigned int old;
+	unsigned int new;
+	u8 flags;		/* flags of cpufreq_driver, see below. */
+};
+
 /* Only for ACPI */
 #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
 #define CPUFREQ_SHARED_TYPE_HW	 (1) /* HW does needed coordination */