Patchwork [v2] iommu: io-pgtable: Add ARM Mali midgard MMU page table format

login
register
mail settings
Submitter Robin Murphy
Date March 1, 2019, 4:28 p.m.
Message ID <e7a36bc9-e425-064b-ca12-3c655c8e5d4a@arm.com>
Download mbox | patch
Permalink /patch/739473/
State New
Headers show

Comments

Robin Murphy - March 1, 2019, 4:28 p.m.
On 27/02/2019 23:22, Rob Herring wrote:
> ARM Mali midgard GPU page tables are similar to standard 64-bit stage 1
> page tables, but have a few differences. Add a new format type to
> represent the format. The input address size is 48-bits and the output
> address size is 40-bits (and possibly less?). Note that the later
> bifrost GPUs follow the standard 64-bit stage 1 format.
> 
> The differences in the format compared to 64-bit stage 1 format are:
> 
> The 3rd level page entry bits are 0x1 instead of 0x3 for page entries.
> 
> The access flags are not read-only and unprivileged, but read and write.
> This is similar to stage 2 entries, but the memory attributes field matches
> stage 1 being an index.
> 
> The nG bit is not set by the vendor driver. This one didn't seem to matter,
> but we'll keep it aligned to the vendor driver.
> 
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: iommu@lists.linux-foundation.org
> Signed-off-by: Rob Herring <robh@kernel.org>
> ---
> Robin, Hopefully this is what you had in mind.

Getting there, for sure :)

>   drivers/iommu/io-pgtable-arm.c | 70 +++++++++++++++++++++++++++-------
>   drivers/iommu/io-pgtable.c     |  1 +
>   include/linux/io-pgtable.h     |  2 +
>   3 files changed, 59 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index d3700ec15cbd..84beea1f47a7 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -180,11 +180,6 @@
>   
>   #define iopte_prot(pte)	((pte) & ARM_LPAE_PTE_ATTR_MASK)
>   
> -#define iopte_leaf(pte,l)					\
> -	(l == (ARM_LPAE_MAX_LEVELS - 1) ?			\
> -		(iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_PAGE) :	\
> -		(iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_BLOCK))
> -
>   struct arm_lpae_io_pgtable {
>   	struct io_pgtable	iop;
>   
> @@ -198,6 +193,14 @@ struct arm_lpae_io_pgtable {
>   
>   typedef u64 arm_lpae_iopte;
>   
> +static inline bool iopte_leaf(arm_lpae_iopte pte, int l, enum io_pgtable_fmt fmt)
> +{
> +	if ((l == (ARM_LPAE_MAX_LEVELS - 1)) && (fmt != ARM_MALI_LPAE))
> +		return iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_PAGE;
> +
> +	return iopte_type(pte,l) == ARM_LPAE_PTE_TYPE_BLOCK;
> +}
> +
>   static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr,
>   				     struct arm_lpae_io_pgtable *data)
>   {
> @@ -304,11 +307,14 @@ static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
>   		pte |= ARM_LPAE_PTE_NS;
>   
>   	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
> -		pte |= ARM_LPAE_PTE_TYPE_PAGE;
> +		pte |= (data->iop.fmt == ARM_MALI_LPAE) ?
> +			ARM_LPAE_PTE_TYPE_BLOCK : ARM_LPAE_PTE_TYPE_PAGE;

Yuck at that ternary... Made worse by the previous hunk which proves you 
already know the nice reasonable way to structure this logic ;)

>   	else
>   		pte |= ARM_LPAE_PTE_TYPE_BLOCK;
>   
> -	pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS;
> +	if (data->iop.fmt != ARM_MALI_LPAE)
> +		pte |= ARM_LPAE_PTE_AF;

So ENTRY_ACCESS_BIT is something different from the VMSA access flag then?

> +	pte |= ARM_LPAE_PTE_SH_IS;
>   	pte |= paddr_to_iopte(paddr, data);
>   
>   	__arm_lpae_set_pte(ptep, pte, &data->iop.cfg);
> @@ -321,7 +327,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
>   {
>   	arm_lpae_iopte pte = *ptep;
>   
> -	if (iopte_leaf(pte, lvl)) {
> +	if (iopte_leaf(pte, lvl, data->iop.fmt)) {
>   		/* We require an unmap first */
>   		WARN_ON(!selftest_running);
>   		return -EEXIST;
> @@ -409,7 +415,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
>   		__arm_lpae_sync_pte(ptep, cfg);
>   	}
>   
> -	if (pte && !iopte_leaf(pte, lvl)) {
> +	if (pte && !iopte_leaf(pte, lvl, data->iop.fmt)) {
>   		cptep = iopte_deref(pte, data);
>   	} else if (pte) {
>   		/* We require an unmap first */
> @@ -426,7 +432,22 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
>   {
>   	arm_lpae_iopte pte;
>   
> -	if (data->iop.fmt == ARM_64_LPAE_S1 ||
> +	if (data->iop.fmt == ARM_MALI_LPAE) {
> +		pte = 0;
> +
> +		if (prot & IOMMU_WRITE)
> +			pte |= ARM_LPAE_PTE_AP_RDONLY;

This one in particular I will never be able to look at without 
instinctively thinking "hang on, how did I ever let that bug slip 
through?"...

> +
> +		if (prot & IOMMU_READ)
> +			pte |= ARM_LPAE_PTE_AP_UNPRIV;

...while this one only looks utterly insane. Can we please at least use 
the stage 2 permission definitions for these so that they're actually 
readable.

> +
> +		if (prot & IOMMU_MMIO)
> +			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
> +				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
> +		else if (prot & IOMMU_CACHE)
> +			pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
> +				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
> +	} else if (data->iop.fmt == ARM_64_LPAE_S1 ||

In fact, looking at it now, I think it only takes a little (untested) 
refactor to split permissions vs. attributes to get the Mali version for 
free:

----->8-----
  	if (prot & IOMMU_NOEXEC)
-----8<-----

>   	    data->iop.fmt == ARM_32_LPAE_S1) {
>   		pte = ARM_LPAE_PTE_nG;
>   
> @@ -511,7 +532,7 @@ static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
>   	while (ptep != end) {
>   		arm_lpae_iopte pte = *ptep++;
>   
> -		if (!pte || iopte_leaf(pte, lvl))
> +		if (!pte || iopte_leaf(pte, lvl, data->iop.fmt))
>   			continue;
>   
>   		__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
> @@ -602,7 +623,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>   	if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
>   		__arm_lpae_set_pte(ptep, 0, &iop->cfg);
>   
> -		if (!iopte_leaf(pte, lvl)) {
> +		if (!iopte_leaf(pte, lvl, iop->fmt)) {
>   			/* Also flush any partial walks */
>   			io_pgtable_tlb_add_flush(iop, iova, size,
>   						ARM_LPAE_GRANULE(data), false);
> @@ -621,7 +642,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>   		}
>   
>   		return size;
> -	} else if (iopte_leaf(pte, lvl)) {
> +	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
>   		/*
>   		 * Insert a table at the next level to map the old region,
>   		 * minus the part we want to unmap
> @@ -669,7 +690,7 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
>   			return 0;
>   
>   		/* Leaf entry? */
> -		if (iopte_leaf(pte,lvl))
> +		if (iopte_leaf(pte,lvl, data->iop.fmt))

Super-nit: If you're touching this line anyway, would you mind adding 
the currently-missing space as well?

>   			goto found_translation;
>   
>   		/* Take it to the next level */
> @@ -995,6 +1016,22 @@ arm_32_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
>   	return iop;
>   }
>   
> +static struct io_pgtable *
> +arm_mali_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
> +{
> +	struct io_pgtable *iop;
> +
> +	if (cfg->ias != 48 || cfg->oas > 40)
> +		return NULL;
> +
> +	cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
> +	iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
> +	if (iop)
> +		cfg->arm_lpae_s1_cfg.tcr = 0;

The general design intent is that we return ready-to-go register values 
here (much like mmu_get_as_setup() in mali_kbase, it seems), so I think 
it's worth adding an arm_mali_cfg to the io_pgtable_cfg union and 
transforming the VMSA output into final transtab/transcfg/memattr format 
at this point, rather than callers needing to care (e.g. some of those
AS_TRANSTAB_LPAE_* bits look reminiscent of the walk attributes we set 
up for a VMSA TCR).

Robin.

> +
> +	return iop;
> +}
> +
>   struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
>   	.alloc	= arm_64_lpae_alloc_pgtable_s1,
>   	.free	= arm_lpae_free_pgtable,
> @@ -1015,6 +1052,11 @@ struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns = {
>   	.free	= arm_lpae_free_pgtable,
>   };
>   
> +struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = {
> +	.alloc	= arm_mali_lpae_alloc_pgtable,
> +	.free	= arm_lpae_free_pgtable,
> +};
> +
>   #ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST
>   
>   static struct io_pgtable_cfg *cfg_cookie;
> diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
> index 93f2880be6c6..5227cfdbb65b 100644
> --- a/drivers/iommu/io-pgtable.c
> +++ b/drivers/iommu/io-pgtable.c
> @@ -30,6 +30,7 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
>   	[ARM_32_LPAE_S2] = &io_pgtable_arm_32_lpae_s2_init_fns,
>   	[ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
>   	[ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
> +	[ARM_MALI_LPAE] = &io_pgtable_arm_mali_lpae_init_fns,
>   #endif
>   #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
>   	[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
> diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
> index 47d5ae559329..5f0be2471651 100644
> --- a/include/linux/io-pgtable.h
> +++ b/include/linux/io-pgtable.h
> @@ -12,6 +12,7 @@ enum io_pgtable_fmt {
>   	ARM_64_LPAE_S1,
>   	ARM_64_LPAE_S2,
>   	ARM_V7S,
> +	ARM_MALI_LPAE,
>   	IO_PGTABLE_NUM_FMTS,
>   };
>   
> @@ -209,5 +210,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
>   extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
>   extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
>   extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
> +extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns;
>   
>   #endif /* __IO_PGTABLE_H */
>
Rob Herring - March 4, 2019, 3:32 p.m.
On Fri, Mar 1, 2019 at 10:28 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 27/02/2019 23:22, Rob Herring wrote:
> > ARM Mali midgard GPU page tables are similar to standard 64-bit stage 1
> > page tables, but have a few differences. Add a new format type to
> > represent the format. The input address size is 48-bits and the output
> > address size is 40-bits (and possibly less?). Note that the later
> > bifrost GPUs follow the standard 64-bit stage 1 format.
> >
> > The differences in the format compared to 64-bit stage 1 format are:
> >
> > The 3rd level page entry bits are 0x1 instead of 0x3 for page entries.
> >
> > The access flags are not read-only and unprivileged, but read and write.
> > This is similar to stage 2 entries, but the memory attributes field matches
> > stage 1 being an index.
> >
> > The nG bit is not set by the vendor driver. This one didn't seem to matter,
> > but we'll keep it aligned to the vendor driver.

[...]

> > +     cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
> > +     iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
> > +     if (iop)
> > +             cfg->arm_lpae_s1_cfg.tcr = 0;
>
> The general design intent is that we return ready-to-go register values
> here (much like mmu_get_as_setup() in mali_kbase, it seems), so I think
> it's worth adding an arm_mali_cfg to the io_pgtable_cfg union and
> transforming the VMSA output into final transtab/transcfg/memattr format
> at this point, rather then callers needing to care (e.g. some of those
> AS_TRANSTAB_LPAE_* bits look reminiscent of the walk attributes we set
> up for a VMSA TCR).

I agree with returning ready-to-go values, but I'm not so sure the
bits are the same. Bits 0-1 are enable/mode bits which are pretty
specific to mali. Bit 2 is 'read inner' for whatever that means.
Perhaps it is read allocate cacheability, but that's a bit different
than RGN bits. Bit 4 is 'share outer'. Maybe it's the same as SH0 if
bit 3 is included, but I have no evidence of that. So I don't think
there's really anything to transform. We just set the bits needed. So
here's what I have in mind:

iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
if (iop) {
  u64 mair, ttbr;

  /* Copy values as union fields overlap */
  mair = cfg->arm_lpae_s1_cfg.mair[0];
  ttbr = cfg->arm_lpae_s1_cfg.ttbr[0];

  cfg->arm_mali_lpae_cfg.mair = mair;
  cfg->arm_mali_lpae_cfg.ttbr = ttbr;
  cfg->arm_mali_lpae_cfg.ttbr |= ARM_MALI_LPAE_TTBR_READ_INNER |
    ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
}

Rob
Robin Murphy - March 4, 2019, 5:31 p.m.
On 04/03/2019 15:32, Rob Herring wrote:
> On Fri, Mar 1, 2019 at 10:28 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 27/02/2019 23:22, Rob Herring wrote:
>>> ARM Mali midgard GPU page tables are similar to standard 64-bit stage 1
>>> page tables, but have a few differences. Add a new format type to
>>> represent the format. The input address size is 48-bits and the output
>>> address size is 40-bits (and possibly less?). Note that the later
>>> bifrost GPUs follow the standard 64-bit stage 1 format.
>>>
>>> The differences in the format compared to 64-bit stage 1 format are:
>>>
>>> The 3rd level page entry bits are 0x1 instead of 0x3 for page entries.
>>>
>>> The access flags are not read-only and unprivileged, but read and write.
>>> This is similar to stage 2 entries, but the memory attributes field matches
>>> stage 1 being an index.
>>>
>>> The nG bit is not set by the vendor driver. This one didn't seem to matter,
>>> but we'll keep it aligned to the vendor driver.
> 
> [...]
> 
>>> +     cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
>>> +     iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
>>> +     if (iop)
>>> +             cfg->arm_lpae_s1_cfg.tcr = 0;
>>
>> The general design intent is that we return ready-to-go register values
>> here (much like mmu_get_as_setup() in mali_kbase, it seems), so I think
>> it's worth adding an arm_mali_cfg to the io_pgtable_cfg union and
>> transforming the VMSA output into final transtab/transcfg/memattr format
>> at this point, rather then callers needing to care (e.g. some of those
>> AS_TRANSTAB_LPAE_* bits look reminiscent of the walk attributes we set
>> up for a VMSA TCR).
> 
> I agree with returning ready-to-go values, but I'm not so sure the
> bits are the same. Bits 0-1 are enable/mode bits which are pretty
> specific to mali. Bit 2 is 'read inner' for whatever that means.
> Perhaps it is read allocate cacheability, but that's a bit different
> than RGN bits. Bit 4 is 'share outer'. Maybe it's the same as SH0 if
> bit 3 is included, but I have no evidence of that. So I don't think
> there's really anything to transform. We just set the bits needed. So
> here's what I have in mind:

Right, my Friday-afternoon wording perhaps wasn't the best - by 
"transform" I didn't mean to imply trying to reinterpret the default 
settings we configure for a VMSA TCR, merely applying some 
similarly-appropriate defaults to make a Mali TRANSTAB out of the VMSA TTBR.

> iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
> if (iop) {
>    u64 mair, ttbr;
> 
>    /* Copy values as union fields overlap */
>    mair = cfg->arm_lpae_s1_cfg.mair[0];
>    ttbr = cfg->arm_lpae_s1_cfg.ttbr[0];
> 
>    cfg->arm_mali_lpae_cfg.mair = mair;
>    cfg->arm_mali_lpae_cfg.ttbr = ttbr;
>    cfg->arm_mali_lpae_cfg.ttbr |= ARM_MALI_LPAE_TTBR_READ_INNER |
>      ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
> }

...pretty much exactly like that (although I'd still prefer to use the 
Mali register names for clarity, and presumably you'll still explicitly 
initialise TRANSCFG too in the real patch).

Robin.
Rob Herring - March 4, 2019, 6:48 p.m.
On Mon, Mar 4, 2019 at 11:31 AM Robin Murphy <robin.murphy@arm.com> wrote:
>
> On 04/03/2019 15:32, Rob Herring wrote:
> > On Fri, Mar 1, 2019 at 10:28 AM Robin Murphy <robin.murphy@arm.com> wrote:
> >>
> >> On 27/02/2019 23:22, Rob Herring wrote:
> >>> ARM Mali midgard GPU page tables are similar to standard 64-bit stage 1
> >>> page tables, but have a few differences. Add a new format type to
> >>> represent the format. The input address size is 48-bits and the output
> >>> address size is 40-bits (and possibly less?). Note that the later
> >>> bifrost GPUs follow the standard 64-bit stage 1 format.
> >>>
> >>> The differences in the format compared to 64-bit stage 1 format are:
> >>>
> >>> The 3rd level page entry bits are 0x1 instead of 0x3 for page entries.
> >>>
> >>> The access flags are not read-only and unprivileged, but read and write.
> >>> This is similar to stage 2 entries, but the memory attributes field matches
> >>> stage 1 being an index.
> >>>
> >>> The nG bit is not set by the vendor driver. This one didn't seem to matter,
> >>> but we'll keep it aligned to the vendor driver.
> >
> > [...]
> >
> >>> +     cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
> >>> +     iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
> >>> +     if (iop)
> >>> +             cfg->arm_lpae_s1_cfg.tcr = 0;
> >>
> >> The general design intent is that we return ready-to-go register values
> >> here (much like mmu_get_as_setup() in mali_kbase, it seems), so I think
> >> it's worth adding an arm_mali_cfg to the io_pgtable_cfg union and
> >> transforming the VMSA output into final transtab/transcfg/memattr format
> >> at this point, rather then callers needing to care (e.g. some of those
> >> AS_TRANSTAB_LPAE_* bits look reminiscent of the walk attributes we set
> >> up for a VMSA TCR).
> >
> > I agree with returning ready-to-go values, but I'm not so sure the
> > bits are the same. Bits 0-1 are enable/mode bits which are pretty
> > specific to mali. Bit 2 is 'read inner' for whatever that means.
> > Perhaps it is read allocate cacheability, but that's a bit different
> > than RGN bits. Bit 4 is 'share outer'. Maybe it's the same as SH0 if
> > bit 3 is included, but I have no evidence of that. So I don't think
> > there's really anything to transform. We just set the bits needed. So
> > here's what I have in mind:
>
> Right, my Friday-afternoon wording perhaps wasn't the best - by
> "transform" I didn't mean to imply trying to reinterpret the default
> settings we configure for a VMSA TCR, merely applying some
> similarly-appropriate defaults to make a Mali TRANSTAB out of the VMSA TTBR.
>
> > iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
> > if (iop) {
> >    u64 mair, ttbr;
> >
> >    /* Copy values as union fields overlap */
> >    mair = cfg->arm_lpae_s1_cfg.mair[0];
> >    ttbr = cfg->arm_lpae_s1_cfg.ttbr[0];
> >
> >    cfg->arm_mali_lpae_cfg.mair = mair;
> >    cfg->arm_mali_lpae_cfg.ttbr = ttbr;
> >    cfg->arm_mali_lpae_cfg.ttbr |= ARM_MALI_LPAE_TTBR_READ_INNER |
> >      ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
> > }
>
> ...pretty much exactly like that (although I'd still prefer to use the
> Mali register names for clarity, and presumably you'll still explicitly
> initialise TRANSCFG too in the real patch).

No, TRANSCFG is only on Bifrost. While the page table format seems to
be standard stage 1 64-bit, the registers are still different from
anything else. So I guess we'll need yet another format definition
when we get there. Also, we may still have to massage the register
values outside of this code. It's not going to know the
'system_coherency' value the kbase driver uses (And I'm not sure how
we want to express that upstream either).

Rob
Robin Murphy - March 5, 2019, 4:04 p.m.
On 04/03/2019 18:48, Rob Herring wrote:
> On Mon, Mar 4, 2019 at 11:31 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>
>> On 04/03/2019 15:32, Rob Herring wrote:
>>> On Fri, Mar 1, 2019 at 10:28 AM Robin Murphy <robin.murphy@arm.com> wrote:
>>>>
>>>> On 27/02/2019 23:22, Rob Herring wrote:
>>>>> ARM Mali midgard GPU page tables are similar to standard 64-bit stage 1
>>>>> page tables, but have a few differences. Add a new format type to
>>>>> represent the format. The input address size is 48-bits and the output
>>>>> address size is 40-bits (and possibly less?). Note that the later
>>>>> bifrost GPUs follow the standard 64-bit stage 1 format.
>>>>>
>>>>> The differences in the format compared to 64-bit stage 1 format are:
>>>>>
>>>>> The 3rd level page entry bits are 0x1 instead of 0x3 for page entries.
>>>>>
>>>>> The access flags are not read-only and unprivileged, but read and write.
>>>>> This is similar to stage 2 entries, but the memory attributes field matches
>>>>> stage 1 being an index.
>>>>>
>>>>> The nG bit is not set by the vendor driver. This one didn't seem to matter,
>>>>> but we'll keep it aligned to the vendor driver.
>>>
>>> [...]
>>>
>>>>> +     cfg->pgsize_bitmap &= (SZ_4K | SZ_2M | SZ_1G);
>>>>> +     iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
>>>>> +     if (iop)
>>>>> +             cfg->arm_lpae_s1_cfg.tcr = 0;
>>>>
>>>> The general design intent is that we return ready-to-go register values
>>>> here (much like mmu_get_as_setup() in mali_kbase, it seems), so I think
>>>> it's worth adding an arm_mali_cfg to the io_pgtable_cfg union and
>>>> transforming the VMSA output into final transtab/transcfg/memattr format
>>>> at this point, rather then callers needing to care (e.g. some of those
>>>> AS_TRANSTAB_LPAE_* bits look reminiscent of the walk attributes we set
>>>> up for a VMSA TCR).
>>>
>>> I agree with returning ready-to-go values, but I'm not so sure the
>>> bits are the same. Bits 0-1 are enable/mode bits which are pretty
>>> specific to mali. Bit 2 is 'read inner' for whatever that means.
>>> Perhaps it is read allocate cacheability, but that's a bit different
>>> than RGN bits. Bit 4 is 'share outer'. Maybe it's the same as SH0 if
>>> bit 3 is included, but I have no evidence of that. So I don't think
>>> there's really anything to transform. We just set the bits needed. So
>>> here's what I have in mind:
>>
>> Right, my Friday-afternoon wording perhaps wasn't the best - by
>> "transform" I didn't mean to imply trying to reinterpret the default
>> settings we configure for a VMSA TCR, merely applying some
>> similarly-appropriate defaults to make a Mali TRANSTAB out of the VMSA TTBR.
>>
>>> iop = arm_64_lpae_alloc_pgtable_s1(cfg, cookie);
>>> if (iop) {
>>>     u64 mair, ttbr;
>>>
>>>     /* Copy values as union fields overlap */
>>>     mair = cfg->arm_lpae_s1_cfg.mair[0];
>>>     ttbr = cfg->arm_lpae_s1_cfg.ttbr[0];
>>>
>>>     cfg->arm_mali_lpae_cfg.mair = mair;
>>>     cfg->arm_mali_lpae_cfg.ttbr = ttbr;
>>>     cfg->arm_mali_lpae_cfg.ttbr |= ARM_MALI_LPAE_TTBR_READ_INNER |
>>>       ARM_MALI_LPAE_TTBR_ADRMODE_TABLE;
>>> }
>>
>> ...pretty much exactly like that (although I'd still prefer to use the
>> Mali register names for clarity, and presumably you'll still explicitly
>> initialise TRANSCFG too in the real patch).
> 
> No, TRANSCFG is only on Bifrost.

Ah, fair enough - I thought that your "cfg->arm_lpae_s1_cfg.tcr = 0;" 
was deliberately echoing the "setup->transcfg = 0;" in mali_kbase to 
imply that AS_TRANSCFG_ADRMODE_LEGACY was significant, but I suppose 
maybe Midgard *is* the legacy in this case. I'll admit I've not gone 
looking for the actual register-poking to see what's consumed by which 
devices.

> While the page table format seems to
> be standard stage 1 64-bit, the registers are still different from
> anything else. So I guess we'll need yet another format definition
> when we get there.

Hmm, at that point I'd be inclined to use standard AArch64 format and 
handle the rest in the driver, similar to how we repack TCR values into 
Context Descriptors for SMMUv3. Those TRANSCFG_PTW_* fields even look 
like they're pretending to be TCR.SH1 and TCR.IRGN1 (albeit with the 
latter having a wonky encoding, and the fact that there's no TTBR1 anyway).

I appreciate that somewhat undermines my argument for having io-pgtable 
fill in complete LPAE-format registers, so if you wanted the driver to 
handle both cases itself for consistency I wouldn't really mind - as 
long as LPAE still has its own init_fn where we can fine-tune the 
relevant constraints and sanity checks I'll be happy.

> Also, we may still have to massage the register
> values outside of this code. It's not going to know the
> 'system_coherency' value the kbase driver uses (And I'm not sure how
> we want to express that upstream either).

AFAICS there are two possible aspects to coherency. One is I/O coherency 
(i.e. "can the GPU snoop CPU caches"), which is already controlled by 
the "dma-coherent" property. The other is whether the GPU cache itself 
is coherent with the other caches in the system (i.e. "can CPUs snoop 
the GPU cache; how much GPU cache maintenance is necessary") which 
should be something we can infer from the integration-specific 
compatible string, because we should always have an integration-specific 
compatible string, right? ;)

And yes, the existing io-pgtable implementations don't really account 
for I/O coherency very well in terms of TCR values at the moment - we 
simply set cacheable walk attributes all the time on the assumption that 
non-coherent interconnects will ignore them (so if you ever did want a 
coherent SMMU to make non-cacheable walks for some reason, tough luck). 
It's a known issue, and there have been some Qcom patches flying around 
attempting to make it a bit better.

Robin.

Patch

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 237cacd4a62b..3cbc08bb7acd 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -430,31 +430,33 @@  static arm_lpae_iopte arm_lpae_prot_to_pte(struct 
arm_lpae_io_pgtable *data,
  	if (data->iop.fmt == ARM_64_LPAE_S1 ||
  	    data->iop.fmt == ARM_32_LPAE_S1) {
  		pte = ARM_LPAE_PTE_nG;
-
  		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
  			pte |= ARM_LPAE_PTE_AP_RDONLY;
-
  		if (!(prot & IOMMU_PRIV))
  			pte |= ARM_LPAE_PTE_AP_UNPRIV;
-
-		if (prot & IOMMU_MMIO)
-			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
-				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
-		else if (prot & IOMMU_CACHE)
-			pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
-				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
  	} else {
  		pte = ARM_LPAE_PTE_HAP_FAULT;
  		if (prot & IOMMU_READ)
  			pte |= ARM_LPAE_PTE_HAP_READ;
  		if (prot & IOMMU_WRITE)
  			pte |= ARM_LPAE_PTE_HAP_WRITE;
+	}
+
+	if (data->iop.fmt == ARM_64_LPAE_S2 ||
+	    data->iop.fmt == ARM_32_LPAE_S2) {
  		if (prot & IOMMU_MMIO)
  			pte |= ARM_LPAE_PTE_MEMATTR_DEV;
  		else if (prot & IOMMU_CACHE)
  			pte |= ARM_LPAE_PTE_MEMATTR_OIWB;
  		else
  			pte |= ARM_LPAE_PTE_MEMATTR_NC;
+	} else {
+		if (prot & IOMMU_MMIO)
+			pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV
+				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+		else if (prot & IOMMU_CACHE)
+			pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
+				<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
  	}