Patchwork [v4,bpf-next,01/15] bpf: split read liveness into REG_LIVE_READ64 and REG_LIVE_READ32

login
register
mail settings
Submitter Jiong Wang
Date April 15, 2019, 5:26 p.m.
Message ID <1555349185-12508-2-git-send-email-jiong.wang@netronome.com>
Download mbox | patch
Permalink /patch/773505/
State New
Headers show

Comments

Jiong Wang - April 15, 2019, 5:26 p.m.
Register liveness infrastructure doesn't track register read width at the
moment, while the width information will be needed for the later 32-bit
safety analysis pass.

This patch take the first step to split read liveness into REG_LIVE_READ64
and REG_LIVE_READ32.

Liveness propagation code are updated accordingly. They are taught to
understand how to propagate REG_LIVE_READ64 and REG_LIVE_READ32 at the same
propagation iteration. For example, "mark_reg_read" now propagate "flags"
which could be multiple read bits instead of the single REG_LIVE_READ64.

A write still screen off all width of reads.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
---
 include/linux/bpf_verifier.h |   8 +--
 kernel/bpf/verifier.c        | 119 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 115 insertions(+), 12 deletions(-)
Jakub Kicinski - April 15, 2019, 11:03 p.m.
On Mon, 15 Apr 2019 18:26:11 +0100, Jiong Wang wrote:
> Register liveness infrastructure doesn't track register read width at the
> moment, while the width information will be needed for the later 32-bit
> safety analysis pass.
> 
> This patch take the first step to split read liveness into REG_LIVE_READ64
> and REG_LIVE_READ32.
> 
> Liveness propagation code are updated accordingly. They are taught to
> understand how to propagate REG_LIVE_READ64 and REG_LIVE_READ32 at the same
> propagation iteration. For example, "mark_reg_read" now propagate "flags"
> which could be multiple read bits instead of the single REG_LIVE_READ64.
> 
> A write still screen off all width of reads.
> 
> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Thanks!
Alexei Starovoitov - April 16, 2019, 1:26 a.m.
On Mon, Apr 15, 2019 at 06:26:11PM +0100, Jiong Wang wrote:
> Register liveness infrastructure doesn't track register read width at the
> moment, while the width information will be needed for the later 32-bit
> safety analysis pass.
> 
> This patch take the first step to split read liveness into REG_LIVE_READ64
> and REG_LIVE_READ32.
> 
> Liveness propagation code are updated accordingly. They are taught to
> understand how to propagate REG_LIVE_READ64 and REG_LIVE_READ32 at the same
> propagation iteration. For example, "mark_reg_read" now propagate "flags"
> which could be multiple read bits instead of the single REG_LIVE_READ64.
> 
> A write still screen off all width of reads.
> 
> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
> ---
>  include/linux/bpf_verifier.h |   8 +--
>  kernel/bpf/verifier.c        | 119 +++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 115 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index b3ab61f..fba0ebb 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -36,9 +36,11 @@
>   */
>  enum bpf_reg_liveness {
>  	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
>  };
>  
>  struct bpf_reg_state {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index c722015..5784b279 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1135,7 +1135,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
>   */
>  static int mark_reg_read(struct bpf_verifier_env *env,
>  			 const struct bpf_reg_state *state,
> -			 struct bpf_reg_state *parent)
> +			 struct bpf_reg_state *parent, u8 flags)
>  {
>  	bool writes = parent == state->parent; /* Observe write marks */
>  	int cnt = 0;
> @@ -1150,17 +1150,23 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>  				parent->var_off.value, parent->off);
>  			return -EFAULT;
>  		}
> -		if (parent->live & REG_LIVE_READ)
> +		/* The first condition is much more likely to be true than the
> +		 * second, make it checked first.
> +		 */
> +		if ((parent->live & REG_LIVE_READ) == flags ||
> +		    parent->live & REG_LIVE_READ64)
>  			/* The parentage chain never changes and
>  			 * this parent was already marked as LIVE_READ.
>  			 * There is no need to keep walking the chain again and
>  			 * keep re-marking all parents as LIVE_READ.
>  			 * This case happens when the same register is read
>  			 * multiple times without writes into it in-between.
> +			 * Also, if parent has REG_LIVE_READ64 set, then no need
> +			 * to set the weak REG_LIVE_READ32.
>  			 */
>  			break;
>  		/* ... then we depend on parent's value */
> -		parent->live |= REG_LIVE_READ;
> +		parent->live |= flags;
>  		state = parent;
>  		parent = state->parent;
>  		writes = true;
> @@ -1172,12 +1178,95 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>  	return 0;
>  }
>  
> +/* This function is supposed to be used by the following 32-bit optimization
> + * code only. It returns TRUE if the source or destination register operates
> + * on 64-bit, otherwise return FALSE.
> + */
> +static bool is_reg64(struct bpf_insn *insn, u32 regno,
> +		     struct bpf_reg_state *reg, enum reg_arg_type t)
> +{
> +	u8 code, class, op;
> +

why is it called for case when t != SRC_OP ?
this patch is using the return value only in t == SRC_OP case
and other patches don't use is_reg64() at all.

> +	code = insn->code;
> +	class = BPF_CLASS(code);
> +	op = BPF_OP(code);
> +	if (class == BPF_JMP) {
> +		/* BPF_EXIT will reach here because of return value readability
> +		 * test for "main" which has s32 return value.
> +		 */
> +		if (op == BPF_EXIT)
> +			return false;

That's not incorrect. bpf2bpf calls return 64-bit values.
bpf abi is such that all bpf progs return 64-bit values.
Historically we truncate to 32-bit in BPF_PROG_RUN,
but some future bpf hook might use all 64 bits of return value.

> +		if (op == BPF_CALL) {
> +			/* BPF to BPF call will reach here because of marking
> +			 * caller saved clobber with DST_OP_NO_MARK for which we
> +			 * don't care the register def because they are anyway
> +			 * marked as NOT_INIT already.
> +			 */

the comment doesn't seem to match the code. why return anything here?
The return value won't be used anyway.

If is_reg64() is inside check_reg_arg() under if (t == SRC_OP)
all these corner cases wouldn't cause review headaches and can be converted
to WARN_ONCE.

What am I missing?
Jiong Wang - April 16, 2019, 7:39 a.m.
Alexei Starovoitov writes:

> On Mon, Apr 15, 2019 at 06:26:11PM +0100, Jiong Wang wrote:
>> Register liveness infrastructure doesn't track register read width at the
>> moment, while the width information will be needed for the later 32-bit
>> safety analysis pass.
>> 
>> This patch take the first step to split read liveness into REG_LIVE_READ64
>> and REG_LIVE_READ32.
>> 
>> Liveness propagation code are updated accordingly. They are taught to
>> understand how to propagate REG_LIVE_READ64 and REG_LIVE_READ32 at the same
>> propagation iteration. For example, "mark_reg_read" now propagate "flags"
>> which could be multiple read bits instead of the single REG_LIVE_READ64.
>> 
>> A write still screen off all width of reads.
>> 
>> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
>> ---
>>  include/linux/bpf_verifier.h |   8 +--
>>  kernel/bpf/verifier.c        | 119 +++++++++++++++++++++++++++++++++++++++----
>>  2 files changed, 115 insertions(+), 12 deletions(-)
>> 
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index b3ab61f..fba0ebb 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -36,9 +36,11 @@
>>   */
>>  enum bpf_reg_liveness {
>>  	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
>> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
>> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
>> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
>> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
>> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
>> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
>> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
>> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
>>  };
>>  
>>  struct bpf_reg_state {
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index c722015..5784b279 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -1135,7 +1135,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
>>   */
>>  static int mark_reg_read(struct bpf_verifier_env *env,
>>  			 const struct bpf_reg_state *state,
>> -			 struct bpf_reg_state *parent)
>> +			 struct bpf_reg_state *parent, u8 flags)
>>  {
>>  	bool writes = parent == state->parent; /* Observe write marks */
>>  	int cnt = 0;
>> @@ -1150,17 +1150,23 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>>  				parent->var_off.value, parent->off);
>>  			return -EFAULT;
>>  		}
>> -		if (parent->live & REG_LIVE_READ)
>> +		/* The first condition is much more likely to be true than the
>> +		 * second, make it checked first.
>> +		 */
>> +		if ((parent->live & REG_LIVE_READ) == flags ||
>> +		    parent->live & REG_LIVE_READ64)
>>  			/* The parentage chain never changes and
>>  			 * this parent was already marked as LIVE_READ.
>>  			 * There is no need to keep walking the chain again and
>>  			 * keep re-marking all parents as LIVE_READ.
>>  			 * This case happens when the same register is read
>>  			 * multiple times without writes into it in-between.
>> +			 * Also, if parent has REG_LIVE_READ64 set, then no need
>> +			 * to set the weak REG_LIVE_READ32.
>>  			 */
>>  			break;
>>  		/* ... then we depend on parent's value */
>> -		parent->live |= REG_LIVE_READ;
>> +		parent->live |= flags;
>>  		state = parent;
>>  		parent = state->parent;
>>  		writes = true;
>> @@ -1172,12 +1178,95 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>>  	return 0;
>>  }
>>  
>> +/* This function is supposed to be used by the following 32-bit optimization
>> + * code only. It returns TRUE if the source or destination register operates
>> + * on 64-bit, otherwise return FALSE.
>> + */
>> +static bool is_reg64(struct bpf_insn *insn, u32 regno,
>> +		     struct bpf_reg_state *reg, enum reg_arg_type t)
>> +{
>> +	u8 code, class, op;
>> +
>
> why is it called for case when t != SRC_OP ?
> this patch is using the return value only in t == SRC_OP case
> and other patches don't use is_reg64() at all.

It is used for case when t == DST*, in patch 2/15, please search "rw64" in
that patch.

And "is_reg64" aims to return TRUE if it is 64-bit read when T == SRC_OP,
and return TRUE if it is 64-bit write when T = DST*. 

>> +	code = insn->code;
>> +	class = BPF_CLASS(code);
>> +	op = BPF_OP(code);
>> +	if (class == BPF_JMP) {
>> +		/* BPF_EXIT will reach here because of return value readability
>> +		 * test for "main" which has s32 return value.
>> +		 */
>> +		if (op == BPF_EXIT)
>> +			return false;
>
> That's not incorrect. bpf2bpf calls return 64-bit values.

bpf2bpf calls has all instructions exposed to insn walker, so the data-flow
is naturally tracked. For example:
  
callee:
  w0 = w2
  exit

caller:
  call callee2
  r2 = r0
  
insn walker should have marked REG_0 is a sub-register define in callee's
frame, and such marker is naturally propagetd back to caller's frame inside
"prepare_func_exit" which is doing:

  /* return to the caller whatever r0 had in the callee */                 
  caller->regs[BPF_REG_0] = *r0;

This copies parentage chain, and also copies the sub-register definition
information as we have merged it into reg state.

> bpf abi is such that all bpf progs return 64-bit values.
> Historically we truncate to 32-bit in BPF_PROG_RUN,
> but some future bpf hook might use all 64 bits of return value.

hmm, was thinking bpf main always return 32-bit, so safe to return FALSE
here. If we could have 64-bit main return, then perhaps for main, always do
zero-extension on r0 if it comes from sub-register def, this seems a little
bit unnecessary, given there is prog_type available during the check, using
which we can known whether whether return value is 64-bit or not.

thoughts?

>> +		if (op == BPF_CALL) {
>> +			/* BPF to BPF call will reach here because of marking
>> +			 * caller saved clobber with DST_OP_NO_MARK for which we
>> +			 * don't care the register def because they are anyway
>> +			 * marked as NOT_INIT already.
>> +			 */
>
> the comment doesn't seem to match the code. why return anything here?

It is to handle the case like:

  check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);

which is called inside "check_func_call" for call insn. I think for caller
saved register, return anything is fine. They must have new def inside
callee before used. And when returned to caller, they must be restored
before used otherwise the prog will be rejected due to their status is
marked as NOT_INIT.

So, for all cases, they are tracked insn walker accurately.

> The return value won't be used anyway.
>
> If is_reg64() is inside check_reg_arg() under if (t == SRC_OP)
> all these corner cases wouldn't cause review headaches and can be converted
> to WARN_ONCE.
>
> What am I missing?

As explained, is_reg64 could be used by both t == SRC_OP and DST*. And is
will be called for instructions with implicit register usage for example
CALL, and we need to return access width for those implicitly used register
here as well.

Make sense?

Regards,
Jiong
Alexei Starovoitov - April 16, 2019, 4:20 p.m.
On Tue, Apr 16, 2019 at 08:39:30AM +0100, Jiong Wang wrote:
> 
> Alexei Starovoitov writes:
> 
> > On Mon, Apr 15, 2019 at 06:26:11PM +0100, Jiong Wang wrote:
> >> Register liveness infrastructure doesn't track register read width at the
> >> moment, while the width information will be needed for the later 32-bit
> >> safety analysis pass.
> >> 
> >> This patch take the first step to split read liveness into REG_LIVE_READ64
> >> and REG_LIVE_READ32.
> >> 
> >> Liveness propagation code are updated accordingly. They are taught to
> >> understand how to propagate REG_LIVE_READ64 and REG_LIVE_READ32 at the same
> >> propagation iteration. For example, "mark_reg_read" now propagate "flags"
> >> which could be multiple read bits instead of the single REG_LIVE_READ64.
> >> 
> >> A write still screen off all width of reads.
> >> 
> >> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
> >> ---
> >>  include/linux/bpf_verifier.h |   8 +--
> >>  kernel/bpf/verifier.c        | 119 +++++++++++++++++++++++++++++++++++++++----
> >>  2 files changed, 115 insertions(+), 12 deletions(-)
> >> 
> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> >> index b3ab61f..fba0ebb 100644
> >> --- a/include/linux/bpf_verifier.h
> >> +++ b/include/linux/bpf_verifier.h
> >> @@ -36,9 +36,11 @@
> >>   */
> >>  enum bpf_reg_liveness {
> >>  	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
> >> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
> >> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
> >> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
> >> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
> >> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
> >> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
> >> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
> >> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
> >>  };
> >>  
> >>  struct bpf_reg_state {
> >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >> index c722015..5784b279 100644
> >> --- a/kernel/bpf/verifier.c
> >> +++ b/kernel/bpf/verifier.c
> >> @@ -1135,7 +1135,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
> >>   */
> >>  static int mark_reg_read(struct bpf_verifier_env *env,
> >>  			 const struct bpf_reg_state *state,
> >> -			 struct bpf_reg_state *parent)
> >> +			 struct bpf_reg_state *parent, u8 flags)
> >>  {
> >>  	bool writes = parent == state->parent; /* Observe write marks */
> >>  	int cnt = 0;
> >> @@ -1150,17 +1150,23 @@ static int mark_reg_read(struct bpf_verifier_env *env,
> >>  				parent->var_off.value, parent->off);
> >>  			return -EFAULT;
> >>  		}
> >> -		if (parent->live & REG_LIVE_READ)
> >> +		/* The first condition is much more likely to be true than the
> >> +		 * second, make it checked first.
> >> +		 */
> >> +		if ((parent->live & REG_LIVE_READ) == flags ||
> >> +		    parent->live & REG_LIVE_READ64)
> >>  			/* The parentage chain never changes and
> >>  			 * this parent was already marked as LIVE_READ.
> >>  			 * There is no need to keep walking the chain again and
> >>  			 * keep re-marking all parents as LIVE_READ.
> >>  			 * This case happens when the same register is read
> >>  			 * multiple times without writes into it in-between.
> >> +			 * Also, if parent has REG_LIVE_READ64 set, then no need
> >> +			 * to set the weak REG_LIVE_READ32.
> >>  			 */
> >>  			break;
> >>  		/* ... then we depend on parent's value */
> >> -		parent->live |= REG_LIVE_READ;
> >> +		parent->live |= flags;
> >>  		state = parent;
> >>  		parent = state->parent;
> >>  		writes = true;
> >> @@ -1172,12 +1178,95 @@ static int mark_reg_read(struct bpf_verifier_env *env,
> >>  	return 0;
> >>  }
> >>  
> >> +/* This function is supposed to be used by the following 32-bit optimization
> >> + * code only. It returns TRUE if the source or destination register operates
> >> + * on 64-bit, otherwise return FALSE.
> >> + */
> >> +static bool is_reg64(struct bpf_insn *insn, u32 regno,
> >> +		     struct bpf_reg_state *reg, enum reg_arg_type t)
> >> +{
> >> +	u8 code, class, op;
> >> +
> >
> > why is it called for case when t != SRC_OP ?
> > this patch is using the return value only in t == SRC_OP case
> > and other patches don't use is_reg64() at all.
> 
> It is used for case when t == DST*, in patch 2/15, please search "rw64" in
> that patch.

argh. such patch split didn't make it easy to review.

> And "is_reg64" aims to return TRUE if it is 64-bit read when T == SRC_OP,
> and return TRUE if it is 64-bit write when T = DST*. 
> 
> >> +	code = insn->code;
> >> +	class = BPF_CLASS(code);
> >> +	op = BPF_OP(code);
> >> +	if (class == BPF_JMP) {
> >> +		/* BPF_EXIT will reach here because of return value readability
> >> +		 * test for "main" which has s32 return value.
> >> +		 */
> >> +		if (op == BPF_EXIT)
> >> +			return false;
> >
> > That's not incorrect. bpf2bpf calls return 64-bit values.
> 
> bpf2bpf calls has all instructions exposed to insn walker, so the data-flow
> is naturally tracked. For example:
>   
> callee:
>   w0 = w2
>   exit

So then it needs to be different code? Like read32_maybe ?

> caller:
>   call callee2
>   r2 = r0
>   
> insn walker should have marked REG_0 is a sub-register define in callee's
> frame, and such marker is naturally propagetd back to caller's frame inside
> "prepare_func_exit" which is doing:
> 
>   /* return to the caller whatever r0 had in the callee */                 
>   caller->regs[BPF_REG_0] = *r0;
> 
> This copies parentage chain, and also copies the sub-register definition
> information as we have merged it into reg state.
> 
> > bpf abi is such that all bpf progs return 64-bit values.
> > Historically we truncate to 32-bit in BPF_PROG_RUN,
> > but some future bpf hook might use all 64 bits of return value.
> 
> hmm, was thinking bpf main always return 32-bit, so safe to return FALSE
> here. If we could have 64-bit main return, then perhaps for main, always do
> zero-extension on r0 if it comes from sub-register def, this seems a little
> bit unnecessary, given there is prog_type available during the check, using
> which we can known whether whether return value is 64-bit or not.
> 
> thoughts?

I think it's safer to go with read64 for now and optimize later
if really necessary.

> >> +		if (op == BPF_CALL) {
> >> +			/* BPF to BPF call will reach here because of marking
> >> +			 * caller saved clobber with DST_OP_NO_MARK for which we
> >> +			 * don't care the register def because they are anyway
> >> +			 * marked as NOT_INIT already.
> >> +			 */
> >
> > the comment doesn't seem to match the code. why return anything here?
> 
> It is to handle the case like:
> 
>   check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
> 
> which is called inside "check_func_call" for call insn. I think for caller
> saved register, return anything is fine. They must have new def inside
> callee before used. And when returned to caller, they must be restored
> before used otherwise the prog will be rejected due to their status is
> marked as NOT_INIT.

but neither this patch nor patch 2 is using the return value.

> 
> So, for all cases, they are tracked insn walker accurately.
> 
> > The return value won't be used anyway.
> >
> > If is_reg64() is inside check_reg_arg() under if (t == SRC_OP)
> > all these corner cases wouldn't cause review headaches and can be converted
> > to WARN_ONCE.
> >
> > What am I missing?
> 
> As explained, is_reg64 could be used by both t == SRC_OP and DST*. And is
> will be called for instructions with implicit register usage for example
> CALL, and we need to return access width for those implicitly used register
> here as well.
> 
> Make sense?

Kinda. I'll take a look again. My first reaction is that patch 1 and 2
should be one patch or split differently.
Jiong Wang - April 16, 2019, 8:19 p.m.
Alexei Starovoitov writes:

> On Tue, Apr 16, 2019 at 08:39:30AM +0100, Jiong Wang wrote:
>> 
>> Alexei Starovoitov writes:
>> 
>> > On Mon, Apr 15, 2019 at 06:26:11PM +0100, Jiong Wang wrote:
>> >> Register liveness infrastructure doesn't track register read width at the
>> >> moment, while the width information will be needed for the later 32-bit
>> >> safety analysis pass.
>> >> 
>> >> This patch take the first step to split read liveness into REG_LIVE_READ64
>> >> and REG_LIVE_READ32.
>> >> 
>> >> Liveness propagation code are updated accordingly. They are taught to
>> >> understand how to propagate REG_LIVE_READ64 and REG_LIVE_READ32 at the same
>> >> propagation iteration. For example, "mark_reg_read" now propagate "flags"
>> >> which could be multiple read bits instead of the single REG_LIVE_READ64.
>> >> 
>> >> A write still screen off all width of reads.
>> >> 
>> >> Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
>> >> ---
>> >>  include/linux/bpf_verifier.h |   8 +--
>> >>  kernel/bpf/verifier.c        | 119 +++++++++++++++++++++++++++++++++++++++----
>> >>  2 files changed, 115 insertions(+), 12 deletions(-)
>> >> 
>> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> >> index b3ab61f..fba0ebb 100644
>> >> --- a/include/linux/bpf_verifier.h
>> >> +++ b/include/linux/bpf_verifier.h
>> >> @@ -36,9 +36,11 @@
>> >>   */
>> >>  enum bpf_reg_liveness {
>> >>  	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
>> >> -	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
>> >> -	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
>> >> -	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
>> >> +	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
>> >> +	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
>> >> +	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
>> >> +	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
>> >> +	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
>> >>  };
>> >>  
>> >>  struct bpf_reg_state {
>> >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> >> index c722015..5784b279 100644
>> >> --- a/kernel/bpf/verifier.c
>> >> +++ b/kernel/bpf/verifier.c
>> >> @@ -1135,7 +1135,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
>> >>   */
>> >>  static int mark_reg_read(struct bpf_verifier_env *env,
>> >>  			 const struct bpf_reg_state *state,
>> >> -			 struct bpf_reg_state *parent)
>> >> +			 struct bpf_reg_state *parent, u8 flags)
>> >>  {
>> >>  	bool writes = parent == state->parent; /* Observe write marks */
>> >>  	int cnt = 0;
>> >> @@ -1150,17 +1150,23 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>> >>  				parent->var_off.value, parent->off);
>> >>  			return -EFAULT;
>> >>  		}
>> >> -		if (parent->live & REG_LIVE_READ)
>> >> +		/* The first condition is much more likely to be true than the
>> >> +		 * second, make it checked first.
>> >> +		 */
>> >> +		if ((parent->live & REG_LIVE_READ) == flags ||
>> >> +		    parent->live & REG_LIVE_READ64)
>> >>  			/* The parentage chain never changes and
>> >>  			 * this parent was already marked as LIVE_READ.
>> >>  			 * There is no need to keep walking the chain again and
>> >>  			 * keep re-marking all parents as LIVE_READ.
>> >>  			 * This case happens when the same register is read
>> >>  			 * multiple times without writes into it in-between.
>> >> +			 * Also, if parent has REG_LIVE_READ64 set, then no need
>> >> +			 * to set the weak REG_LIVE_READ32.
>> >>  			 */
>> >>  			break;
>> >>  		/* ... then we depend on parent's value */
>> >> -		parent->live |= REG_LIVE_READ;
>> >> +		parent->live |= flags;
>> >>  		state = parent;
>> >>  		parent = state->parent;
>> >>  		writes = true;
>> >> @@ -1172,12 +1178,95 @@ static int mark_reg_read(struct bpf_verifier_env *env,
>> >>  	return 0;
>> >>  }
>> >>  
>> >> +/* This function is supposed to be used by the following 32-bit optimization
>> >> + * code only. It returns TRUE if the source or destination register operates
>> >> + * on 64-bit, otherwise return FALSE.
>> >> + */
>> >> +static bool is_reg64(struct bpf_insn *insn, u32 regno,
>> >> +		     struct bpf_reg_state *reg, enum reg_arg_type t)
>> >> +{
>> >> +	u8 code, class, op;
>> >> +
>> >
>> > why is it called for case when t != SRC_OP ?
>> > this patch is using the return value only in t == SRC_OP case
>> > and other patches don't use is_reg64() at all.
>> 
>> It is used for case when t == DST*, in patch 2/15, please search "rw64" in
>> that patch.
>
> argh. such patch split didn't make it easy to review.

Was trying to make each patch smaller, and was thinking split READ into
READ64 and READ32 doesn't affect currect read check, so could be treated as
an independent first step.

On the other hand, zero-extension candidates are identified through from
read and write info, so perhaps merge them into one patch is more logically
sane, especially when reviews are done.

Will go ahead to merge patch 1 and 2 into a single patch in next version.

>> And "is_reg64" aims to return TRUE if it is 64-bit read when T == SRC_OP,
>> and return TRUE if it is 64-bit write when T = DST*. 
>> 
>> >> +	code = insn->code;
>> >> +	class = BPF_CLASS(code);
>> >> +	op = BPF_OP(code);
>> >> +	if (class == BPF_JMP) {
>> >> +		/* BPF_EXIT will reach here because of return value readability
>> >> +		 * test for "main" which has s32 return value.
>> >> +		 */
>> >> +		if (op == BPF_EXIT)
>> >> +			return false;
>> >
>> > That's not incorrect. bpf2bpf calls return 64-bit values.
>> 
>> bpf2bpf calls has all instructions exposed to insn walker, so the data-flow
>> is naturally tracked. For example:
>>   
>> callee:
>>   w0 = w2
>>   exit
>
> So then it needs to be different code? Like read32_maybe ?

Read this question several times, still can't get the meaning, could you
please be more clear on this?

>> caller:
>>   call callee2
>>   r2 = r0
>>   
>> insn walker should have marked REG_0 is a sub-register define in callee's
>> frame, and such marker is naturally propagetd back to caller's frame inside
>> "prepare_func_exit" which is doing:
>> 
>>   /* return to the caller whatever r0 had in the callee */                 
>>   caller->regs[BPF_REG_0] = *r0;
>> 
>> This copies parentage chain, and also copies the sub-register definition
>> information as we have merged it into reg state.
>> 
>> > bpf abi is such that all bpf progs return 64-bit values.
>> > Historically we truncate to 32-bit in BPF_PROG_RUN,
>> > but some future bpf hook might use all 64 bits of return value.
>> 
>> hmm, was thinking bpf main always return 32-bit, so safe to return FALSE
>> here. If we could have 64-bit main return, then perhaps for main, always do
>> zero-extension on r0 if it comes from sub-register def, this seems a little
>> bit unnecessary, given there is prog_type available during the check, using
>> which we can known whether whether return value is 64-bit or not.
>> 
>> thoughts?
>
> I think it's safer to go with read64 for now and optimize later
> if really necessary.

OK, will do in next version.

>> >> +		if (op == BPF_CALL) {
>> >> +			/* BPF to BPF call will reach here because of marking
>> >> +			 * caller saved clobber with DST_OP_NO_MARK for which we
>> >> +			 * don't care the register def because they are anyway
>> >> +			 * marked as NOT_INIT already.
>> >> +			 */
>> >
>> > the comment doesn't seem to match the code. why return anything here?
>> 
>> It is to handle the case like:
>> 
>>   check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
>> 
>> which is called inside "check_func_call" for call insn. I think for caller
>> saved register, return anything is fine. They must have new def inside
>> callee before used. And when returned to caller, they must be restored
>> before used otherwise the prog will be rejected due to their status is
>> marked as NOT_INIT.
>
> but neither this patch nor patch 2 is using the return value.

You mean "neither this patch nor patch 2 is using the return value of
"is_reg64" when we reached it from BPF_CALL insn through above caller saved
registr check, right?

patch 2 is using it inside check_reg_arg:

  reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx;

I think DST_OP_NO_MARK was mostly for customized value range calculation,
so when reg_arg_type is it, subreg_def will still be set.

And as explained, here for caller saved registers, return either TRUE or
FALSE from is_reg64 is fine. TRUE or FALSE will cause different marking for
sub-register def, but because caller saved registers have also been marked
as NOT_INIT, so there must be later restore instruction to override the
sub-register def to correct one.

Let me know if I am answering wrong question.

>> 
>> So, for all cases, they are tracked insn walker accurately.
>> 
>> > The return value won't be used anyway.
>> >
>> > If is_reg64() is inside check_reg_arg() under if (t == SRC_OP)
>> > all these corner cases wouldn't cause review headaches and can be converted
>> > to WARN_ONCE.
>> >
>> > What am I missing?
>> 
>> As explained, is_reg64 could be used by both t == SRC_OP and DST*. And is
>> will be called for instructions with implicit register usage for example
>> CALL, and we need to return access width for those implicitly used register
>> here as well.
>> 
>> Make sense?
>
> Kinda. I'll take a look again.

Thanks.

> My first reaction is that patch 1 and 2 should be one patch or split
> differently.

As replied, will merge patch 1 and patch 2.

Regards,
Jiong

Patch

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b3ab61f..fba0ebb 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -36,9 +36,11 @@ 
  */
 enum bpf_reg_liveness {
 	REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
-	REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
-	REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
-	REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
+	REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
+	REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
+	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
+	REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
+	REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
 };
 
 struct bpf_reg_state {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c722015..5784b279 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1135,7 +1135,7 @@  static int check_subprogs(struct bpf_verifier_env *env)
  */
 static int mark_reg_read(struct bpf_verifier_env *env,
 			 const struct bpf_reg_state *state,
-			 struct bpf_reg_state *parent)
+			 struct bpf_reg_state *parent, u8 flags)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 	int cnt = 0;
@@ -1150,17 +1150,23 @@  static int mark_reg_read(struct bpf_verifier_env *env,
 				parent->var_off.value, parent->off);
 			return -EFAULT;
 		}
-		if (parent->live & REG_LIVE_READ)
+		/* The first condition is much more likely to be true than the
+		 * second, make it checked first.
+		 */
+		if ((parent->live & REG_LIVE_READ) == flags ||
+		    parent->live & REG_LIVE_READ64)
 			/* The parentage chain never changes and
 			 * this parent was already marked as LIVE_READ.
 			 * There is no need to keep walking the chain again and
 			 * keep re-marking all parents as LIVE_READ.
 			 * This case happens when the same register is read
 			 * multiple times without writes into it in-between.
+			 * Also, if parent has REG_LIVE_READ64 set, then no need
+			 * to set the weak REG_LIVE_READ32.
 			 */
 			break;
 		/* ... then we depend on parent's value */
-		parent->live |= REG_LIVE_READ;
+		parent->live |= flags;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1172,12 +1178,95 @@  static int mark_reg_read(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_insn *insn, u32 regno,
+		     struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+	u8 code, class, op;
+
+	code = insn->code;
+	class = BPF_CLASS(code);
+	op = BPF_OP(code);
+	if (class == BPF_JMP) {
+		/* BPF_EXIT will reach here because of return value readability
+		 * test for "main" which has s32 return value.
+		 */
+		if (op == BPF_EXIT)
+			return false;
+		if (op == BPF_CALL) {
+			/* BPF to BPF call will reach here because of marking
+			 * caller saved clobber with DST_OP_NO_MARK for which we
+			 * don't care the register def because they are anyway
+			 * marked as NOT_INIT already.
+			 */
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				return false;
+			/* Helper call will reach here because of arg type
+			 * check. Conservatively marking all args as 64-bit.
+			 */
+			return true;
+		}
+	}
+
+	if (class == BPF_ALU64 || class == BPF_JMP ||
+	    /* BPF_END always use BPF_ALU class. */
+	    (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+		return true;
+
+	if (class == BPF_ALU || class == BPF_JMP32)
+		return false;
+
+	if (class == BPF_LDX) {
+		if (t != SRC_OP)
+			return BPF_SIZE(code) == BPF_DW;
+		/* LDX source must be ptr. */
+		return true;
+	}
+
+	if (class == BPF_STX) {
+		if (reg->type != SCALAR_VALUE)
+			return true;
+		return BPF_SIZE(code) == BPF_DW;
+	}
+
+	if (class == BPF_LD) {
+		u8 mode = BPF_MODE(code);
+
+		/* LD_IMM64 */
+		if (mode == BPF_IMM)
+			return true;
+
+		/* Both LD_IND and LD_ABS return 32-bit data. */
+		if (t != SRC_OP)
+			return  false;
+
+		/* Implicit ctx ptr. */
+		if (regno == BPF_REG_6)
+			return true;
+
+		/* Explicit source could be any width. */
+		return true;
+	}
+
+	if (class == BPF_ST)
+		/* The only source register for BPF_ST is a ptr. */
+		return true;
+
+	/* Conservatively return true at default. */
+	return true;
+}
+
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
 	struct bpf_reg_state *reg, *regs = state->regs;
+	bool rw64;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
@@ -1185,6 +1274,7 @@  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	reg = &regs[regno];
+	rw64 = is_reg64(insn, regno, reg, t);
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (reg->type == NOT_INIT) {
@@ -1195,7 +1285,8 @@  static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 		if (regno == BPF_REG_FP)
 			return 0;
 
-		return mark_reg_read(env, reg, reg->parent);
+		return mark_reg_read(env, reg, reg->parent,
+				     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -1382,7 +1473,8 @@  static int check_stack_read(struct bpf_verifier_env *env,
 			state->regs[value_regno].live |= REG_LIVE_WRITTEN;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      REG_LIVE_READ64);
 		return 0;
 	} else {
 		int zeros = 0;
@@ -1399,7 +1491,9 @@  static int check_stack_read(struct bpf_verifier_env *env,
 			return -EACCES;
 		}
 		mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
-			      reg_state->stack[spi].spilled_ptr.parent);
+			      reg_state->stack[spi].spilled_ptr.parent,
+			      size == BPF_REG_SIZE
+			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
 		if (value_regno >= 0) {
 			if (zeros == size) {
 				/* any size read into register is zero extended,
@@ -2337,7 +2431,9 @@  static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		 * the whole slot to be marked as 'read'
 		 */
 		mark_reg_read(env, &state->stack[spi].spilled_ptr,
-			      state->stack[spi].spilled_ptr.parent);
+			      state->stack[spi].spilled_ptr.parent,
+			      access_size == BPF_REG_SIZE
+			      ? REG_LIVE_READ64 : REG_LIVE_READ32);
 	}
 	return update_stack_depth(env, state, min_off);
 }
@@ -6227,12 +6323,17 @@  static int propagate_liveness_reg(struct bpf_verifier_env *env,
 				  struct bpf_reg_state *reg,
 				  struct bpf_reg_state *parent_reg)
 {
+	u8 parent_bits = parent_reg->live & REG_LIVE_READ;
+	u8 bits = reg->live & REG_LIVE_READ;
+	u8 bits_diff = parent_bits ^ bits;
+	u8 bits_prop = bits_diff & bits;
 	int err;
 
-	if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ))
+	/* No diff bit comes from "reg". */
+	if (!bits_prop)
 		return 0;
 
-	err = mark_reg_read(env, reg, parent_reg);
+	err = mark_reg_read(env, reg, parent_reg, bits_prop);
 	if (err)
 		return err;