Patchwork [PATCH-for-4.0,v2,2/2] hw/arm/virt-acpi-build: IORT Update for revision D

login
register
mail settings
Submitter Auger Eric
Date Dec. 6, 2018, 5:07 p.m.
Message ID <20181206170733.7469-3-eric.auger@redhat.com>
Download mbox | patch
Permalink /patch/674423/
State New
Headers show

Comments

Auger Eric - Dec. 6, 2018, 5:07 p.m.
Let's update the structs according to revision D of the IORT
specification and set the IORT node revision fields.

In IORT smmuv3 node: the new proximity field is not used as
the proximity domain valid flag is not set. The new DeviceId
mapping index is not used either as all the SMMU control
interrupts are GSIV based.

In IORT RC node: the new memory address size limit field is
set to 64 bits.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

---

v1 -> v2:
- separate patches for SMMUv3 DMA coherency issue and struct
  update to revision D.
- revision fields set
---
 hw/arm/virt-acpi-build.c    |  4 ++++
 include/hw/acpi/acpi-defs.h | 10 +++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)
Andrew Jones - Dec. 17, 2018, 4:27 p.m.
On Thu, Dec 06, 2018 at 06:07:33PM +0100, Eric Auger wrote:
> Let's update the structs according to revision D of the IORT
> specification and set the IORT node revision fields.
> 
> In IORT smmuv3 node: the new proximity field is not used as
> the proximity domain valid flag is not set. The new DeviceId
> mapping index is not used either as all the SMMU control
> interrupts are GSIV based.
> 
> In IORT RC node: the new memory address size limit field is
> set to 64 bits.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> 
> ---
> 
> v1 -> v2:
> - separate patches for SMMUv3 DMA coherency issue and struct
>   update to revision D.
> - revision fields set
> ---
>  hw/arm/virt-acpi-build.c    |  4 ++++
>  include/hw/acpi/acpi-defs.h | 10 +++++++++-
>  2 files changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index aa177ba64d..a34a0958a5 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -423,6 +423,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>       */
>      iort_node_offset = sizeof(*iort);
>      iort->node_offset = cpu_to_le32(iort_node_offset);
> +    iort->revision = 0;
>  
>      /* ITS group node */
>      node_size =  sizeof(*its) + sizeof(uint32_t);
> @@ -445,6 +446,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>  
>          smmu->type = ACPI_IORT_NODE_SMMU_V3;
>          smmu->length = cpu_to_le16(node_size);
> +        smmu->revision = cpu_to_le32(2);
>          smmu->mapping_count = cpu_to_le32(1);
>          smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
>          smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
> @@ -470,6 +472,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>  
>      rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
>      rc->length = cpu_to_le16(node_size);
> +    rc->revision = cpu_to_le32(1);
>      rc->mapping_count = cpu_to_le32(1);
>      rc->mapping_offset = cpu_to_le32(sizeof(*rc));
>  
> @@ -477,6 +480,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>      rc->memory_properties.cache_coherency = cpu_to_le32(1);
>      rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
>      rc->pci_segment_number = 0; /* MCFG pci_segment */
> +    rc->memory_address_limit = 64;

Hmmm... IIUC this provides the size of the IOVA space. When it's 64 the
size of the space is U64_MAX, which gets added to the DMA base (also 64
bits) when calculating things like the last PFN. It seems strange to use
a value that will surely overflow those calculations. Is it common to
put 64 here? Can you elaborate on this?

>  
>      /* Identity RID mapping covering the whole input RID range */
>      idmap = &rc->id_mapping_array[0];
> diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
> index 532eaf79bd..b4a5104367 100644
> --- a/include/hw/acpi/acpi-defs.h
> +++ b/include/hw/acpi/acpi-defs.h
> @@ -628,7 +628,11 @@ struct AcpiIortItsGroup {
>  } QEMU_PACKED;
>  typedef struct AcpiIortItsGroup AcpiIortItsGroup;
>  
> -#define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE 1
> +enum {
> +    ACPI_IORT_SMMU_V3_COHACC_OVERRIDE = 1 << 0,
> +    ACPI_IORT_SMMU_V3_HTTU_OVERRIDE   = 3 << 1,
> +    ACPI_IORT_SMMU_V3_PXM_VALID       = 1 << 3
> +};

We don't usually add flag definitions until we need them. Indeed it'll
just be more code to delete when switching to the aml_append* API.

>  
>  struct AcpiIortSmmu3 {
>      ACPI_IORT_NODE_HEADER_DEF
> @@ -641,6 +645,8 @@ struct AcpiIortSmmu3 {
>      uint32_t pri_gsiv;
>      uint32_t gerr_gsiv;
>      uint32_t sync_gsiv;
> +    uint32_t pxm;
> +    uint32_t id_mapping_index;
>      AcpiIortIdMapping id_mapping_array[0];
>  } QEMU_PACKED;
>  typedef struct AcpiIortSmmu3 AcpiIortSmmu3;
> @@ -650,6 +656,8 @@ struct AcpiIortRC {
>      AcpiIortMemoryAccess memory_properties;
>      uint32_t ats_attribute;
>      uint32_t pci_segment_number;
> +    uint8_t memory_address_limit;
> +    uint8_t reserved2[3];
>      AcpiIortIdMapping id_mapping_array[0];
>  } QEMU_PACKED;
>  typedef struct AcpiIortRC AcpiIortRC;
> -- 
> 2.17.2
> 
>

Thanks,
drew
Auger Eric - Dec. 17, 2018, 4:49 p.m.
Hi Drew,

On 12/17/18 5:27 PM, Andrew Jones wrote:
> On Thu, Dec 06, 2018 at 06:07:33PM +0100, Eric Auger wrote:
>> Let's update the structs according to revision D of the IORT
>> specification and set the IORT node revision fields.
>>
>> In IORT smmuv3 node: the new proximity field is not used as
>> the proximity domain valid flag is not set. The new DeviceId
>> mapping index is not used either as all the SMMU control
>> interrupts are GSIV based.
>>
>> In IORT RC node: the new memory address size limit field is
>> set to 64 bits.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>
>> ---
>>
>> v1 -> v2:
>> - separate patches for SMMUv3 DMA coherency issue and struct
>>   update to revision D.
>> - revision fields set
>> ---
>>  hw/arm/virt-acpi-build.c    |  4 ++++
>>  include/hw/acpi/acpi-defs.h | 10 +++++++++-
>>  2 files changed, 13 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
>> index aa177ba64d..a34a0958a5 100644
>> --- a/hw/arm/virt-acpi-build.c
>> +++ b/hw/arm/virt-acpi-build.c
>> @@ -423,6 +423,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>       */
>>      iort_node_offset = sizeof(*iort);
>>      iort->node_offset = cpu_to_le32(iort_node_offset);
>> +    iort->revision = 0;
>>  
>>      /* ITS group node */
>>      node_size =  sizeof(*its) + sizeof(uint32_t);
>> @@ -445,6 +446,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>  
>>          smmu->type = ACPI_IORT_NODE_SMMU_V3;
>>          smmu->length = cpu_to_le16(node_size);
>> +        smmu->revision = cpu_to_le32(2);
>>          smmu->mapping_count = cpu_to_le32(1);
>>          smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
>>          smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
>> @@ -470,6 +472,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>  
>>      rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
>>      rc->length = cpu_to_le16(node_size);
>> +    rc->revision = cpu_to_le32(1);
>>      rc->mapping_count = cpu_to_le32(1);
>>      rc->mapping_offset = cpu_to_le32(sizeof(*rc));
>>  
>> @@ -477,6 +480,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>      rc->memory_properties.cache_coherency = cpu_to_le32(1);
>>      rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
>>      rc->pci_segment_number = 0; /* MCFG pci_segment */
>> +    rc->memory_address_limit = 64;
> 
> Hmmm... IIUC this provides the size of the IOVA space. When it's 64 the
> size of the space is U64_MAX, which gets added to the DMA base (also 64
> bits) when calculating things like the last PFN. It seems strange to use
> a value that will surely overflow those calculations. Is it common to
> put 64 here? Can you elaborate on this?

I looked at drivers/acpi/arm64/iort.c nc_dma_get_range.
There you can find

        *size = ncomp->memory_address_limit >= 64 ? U64_MAX :
                        1ULL<<ncomp->memory_address_limit;

So I was expecting the value "64" to be properly handled by the kernel.
If one decides to select something less than the whole range, which
value would you suggest?
> 
>>  
>>      /* Identity RID mapping covering the whole input RID range */
>>      idmap = &rc->id_mapping_array[0];
>> diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
>> index 532eaf79bd..b4a5104367 100644
>> --- a/include/hw/acpi/acpi-defs.h
>> +++ b/include/hw/acpi/acpi-defs.h
>> @@ -628,7 +628,11 @@ struct AcpiIortItsGroup {
>>  } QEMU_PACKED;
>>  typedef struct AcpiIortItsGroup AcpiIortItsGroup;
>>  
>> -#define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE 1
>> +enum {
>> +    ACPI_IORT_SMMU_V3_COHACC_OVERRIDE = 1 << 0,
>> +    ACPI_IORT_SMMU_V3_HTTU_OVERRIDE   = 3 << 1,
>> +    ACPI_IORT_SMMU_V3_PXM_VALID       = 1 << 3
>> +};
> 
> We don't usually add flag definitions until we need them. Indeed it'll
> just be more code to delete when switching to the aml_append* API.

The rationale was that those flags becomes usable with this revision so
I decided to expose them. Now I don't have a strong opinion here.

Thanks

Eric
> 
>>  
>>  struct AcpiIortSmmu3 {
>>      ACPI_IORT_NODE_HEADER_DEF
>> @@ -641,6 +645,8 @@ struct AcpiIortSmmu3 {
>>      uint32_t pri_gsiv;
>>      uint32_t gerr_gsiv;
>>      uint32_t sync_gsiv;
>> +    uint32_t pxm;
>> +    uint32_t id_mapping_index;
>>      AcpiIortIdMapping id_mapping_array[0];
>>  } QEMU_PACKED;
>>  typedef struct AcpiIortSmmu3 AcpiIortSmmu3;
>> @@ -650,6 +656,8 @@ struct AcpiIortRC {
>>      AcpiIortMemoryAccess memory_properties;
>>      uint32_t ats_attribute;
>>      uint32_t pci_segment_number;
>> +    uint8_t memory_address_limit;
>> +    uint8_t reserved2[3];
>>      AcpiIortIdMapping id_mapping_array[0];
>>  } QEMU_PACKED;
>>  typedef struct AcpiIortRC AcpiIortRC;
>> -- 
>> 2.17.2
>>
>>
> 
> Thanks,
> drew
>
Andrew Jones - Dec. 17, 2018, 6:25 p.m.
On Mon, Dec 17, 2018 at 05:49:02PM +0100, Auger Eric wrote:
> Hi Drew,
> 
> On 12/17/18 5:27 PM, Andrew Jones wrote:
> > On Thu, Dec 06, 2018 at 06:07:33PM +0100, Eric Auger wrote:
> >> Let's update the structs according to revision D of the IORT
> >> specification and set the IORT node revision fields.
> >>
> >> In IORT smmuv3 node: the new proximity field is not used as
> >> the proximity domain valid flag is not set. The new DeviceId
> >> mapping index is not used either as all the SMMU control
> >> interrupts are GSIV based.
> >>
> >> In IORT RC node: the new memory address size limit field is
> >> set to 64 bits.
> >>
> >> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>
> >> ---
> >>
> >> v1 -> v2:
> >> - separate patches for SMMUv3 DMA coherency issue and struct
> >>   update to revision D.
> >> - revision fields set
> >> ---
> >>  hw/arm/virt-acpi-build.c    |  4 ++++
> >>  include/hw/acpi/acpi-defs.h | 10 +++++++++-
> >>  2 files changed, 13 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> >> index aa177ba64d..a34a0958a5 100644
> >> --- a/hw/arm/virt-acpi-build.c
> >> +++ b/hw/arm/virt-acpi-build.c
> >> @@ -423,6 +423,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>       */
> >>      iort_node_offset = sizeof(*iort);
> >>      iort->node_offset = cpu_to_le32(iort_node_offset);
> >> +    iort->revision = 0;
> >>  
> >>      /* ITS group node */
> >>      node_size =  sizeof(*its) + sizeof(uint32_t);
> >> @@ -445,6 +446,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>  
> >>          smmu->type = ACPI_IORT_NODE_SMMU_V3;
> >>          smmu->length = cpu_to_le16(node_size);
> >> +        smmu->revision = cpu_to_le32(2);
> >>          smmu->mapping_count = cpu_to_le32(1);
> >>          smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
> >>          smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
> >> @@ -470,6 +472,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>  
> >>      rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
> >>      rc->length = cpu_to_le16(node_size);
> >> +    rc->revision = cpu_to_le32(1);
> >>      rc->mapping_count = cpu_to_le32(1);
> >>      rc->mapping_offset = cpu_to_le32(sizeof(*rc));
> >>  
> >> @@ -477,6 +480,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>      rc->memory_properties.cache_coherency = cpu_to_le32(1);
> >>      rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
> >>      rc->pci_segment_number = 0; /* MCFG pci_segment */
> >> +    rc->memory_address_limit = 64;
> > 
> > Hmmm... IIUC this provides the size of the IOVA space. When it's 64 the
> > size of the space is U64_MAX, which gets added to the DMA base (also 64
> > bits) when calculating things like the last PFN. It seems strange to use
> > a value that will surely overflow those calculations. Is it common to
> > put 64 here? Can you elaborate on this?
> 
> I looked at drivers/acpi/arm64/iort.c nc_dma_get_range.
> There you can find
> 
>         *size = ncomp->memory_address_limit >= 64 ? U64_MAX :
>                         1ULL<<ncomp->memory_address_limit;
> 
> So I was expecting the value "64" to be properly handled by the kernel.

I traced the code further and see that the size gets added to the u64
dma base without any special checks in both iort_dma_setup() and
iommu_dma_init_domain(). Also iommu_dma_init_domain() function has this
comment

 * @base and @size should be exact multiples of IOMMU page granularity to
 * avoid rounding surprises. If necessary, we reserve the page at address 0
 * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
 * any change which could make prior IOVAs invalid will fail.

U64_MAX is not a multiple of any page size. Maybe that U64_MAX assignment
is a kernel bug?


> If one decides to select something less than the whole range, which
> value would you suggest?

I don't know. Isn't it host/guest/device specific? Should we ask KVM what
the supported IPA is first? What kind of values are showing up in the
IORTs of real hardware?

> > 
> >>  
> >>      /* Identity RID mapping covering the whole input RID range */
> >>      idmap = &rc->id_mapping_array[0];
> >> diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
> >> index 532eaf79bd..b4a5104367 100644
> >> --- a/include/hw/acpi/acpi-defs.h
> >> +++ b/include/hw/acpi/acpi-defs.h
> >> @@ -628,7 +628,11 @@ struct AcpiIortItsGroup {
> >>  } QEMU_PACKED;
> >>  typedef struct AcpiIortItsGroup AcpiIortItsGroup;
> >>  
> >> -#define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE 1
> >> +enum {
> >> +    ACPI_IORT_SMMU_V3_COHACC_OVERRIDE = 1 << 0,
> >> +    ACPI_IORT_SMMU_V3_HTTU_OVERRIDE   = 3 << 1,
> >> +    ACPI_IORT_SMMU_V3_PXM_VALID       = 1 << 3
> >> +};
> > 
> > We don't usually add flag definitions until we need them. Indeed it'll
> > just be more code to delete when switching to the aml_append* API.
> 
> The rationale was that those flags becomes usable with this revision so
> I decided to expose them. Now I don't have a strong opinion here.

I'd drop the change then.

Thanks,
drew
Auger Eric - Dec. 18, 2018, 10:54 a.m.
Hi Drew,

On 12/17/18 7:25 PM, Andrew Jones wrote:
> On Mon, Dec 17, 2018 at 05:49:02PM +0100, Auger Eric wrote:
>> Hi Drew,
>>
>> On 12/17/18 5:27 PM, Andrew Jones wrote:
>>> On Thu, Dec 06, 2018 at 06:07:33PM +0100, Eric Auger wrote:
>>>> Let's update the structs according to revision D of the IORT
>>>> specification and set the IORT node revision fields.
>>>>
>>>> In IORT smmuv3 node: the new proximity field is not used as
>>>> the proximity domain valid flag is not set. The new DeviceId
>>>> mapping index is not used either as all the SMMU control
>>>> interrupts are GSIV based.
>>>>
>>>> In IORT RC node: the new memory address size limit field is
>>>> set to 64 bits.
>>>>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>
>>>> ---
>>>>
>>>> v1 -> v2:
>>>> - separate patches for SMMUv3 DMA coherency issue and struct
>>>>   update to revision D.
>>>> - revision fields set
>>>> ---
>>>>  hw/arm/virt-acpi-build.c    |  4 ++++
>>>>  include/hw/acpi/acpi-defs.h | 10 +++++++++-
>>>>  2 files changed, 13 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
>>>> index aa177ba64d..a34a0958a5 100644
>>>> --- a/hw/arm/virt-acpi-build.c
>>>> +++ b/hw/arm/virt-acpi-build.c
>>>> @@ -423,6 +423,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>       */
>>>>      iort_node_offset = sizeof(*iort);
>>>>      iort->node_offset = cpu_to_le32(iort_node_offset);
>>>> +    iort->revision = 0;
>>>>  
>>>>      /* ITS group node */
>>>>      node_size =  sizeof(*its) + sizeof(uint32_t);
>>>> @@ -445,6 +446,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>  
>>>>          smmu->type = ACPI_IORT_NODE_SMMU_V3;
>>>>          smmu->length = cpu_to_le16(node_size);
>>>> +        smmu->revision = cpu_to_le32(2);
>>>>          smmu->mapping_count = cpu_to_le32(1);
>>>>          smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
>>>>          smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
>>>> @@ -470,6 +472,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>  
>>>>      rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
>>>>      rc->length = cpu_to_le16(node_size);
>>>> +    rc->revision = cpu_to_le32(1);
>>>>      rc->mapping_count = cpu_to_le32(1);
>>>>      rc->mapping_offset = cpu_to_le32(sizeof(*rc));
>>>>  
>>>> @@ -477,6 +480,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>      rc->memory_properties.cache_coherency = cpu_to_le32(1);
>>>>      rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
>>>>      rc->pci_segment_number = 0; /* MCFG pci_segment */
>>>> +    rc->memory_address_limit = 64;
>>>
>>> Hmmm... IIUC this provides the size of the IOVA space. When it's 64 the
>>> size of the space is U64_MAX, which gets added to the DMA base (also 64
>>> bits) when calculating things like the last PFN. It seems strange to use
>>> a value that will surely overflow those calculations. Is it common to
>>> put 64 here? Can you elaborate on this?
>>
>> I looked at drivers/acpi/arm64/iort.c nc_dma_get_range.
>> There you can find
>>
>>         *size = ncomp->memory_address_limit >= 64 ? U64_MAX :
>>                         1ULL<<ncomp->memory_address_limit;
>>
>> So I was expecting the value "64" to be properly handled by the kernel.
> 
> I traced the code further and see that the size gets added to the u64
> dma base without any special checks in both iort_dma_setup() and
> iommu_dma_init_domain(). Also iommu_dma_init_domain() function has this
> comment
> 
>  * @base and @size should be exact multiples of IOMMU page granularity to
>  * avoid rounding surprises. If necessary, we reserve the page at address 0
>  * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
>  * any change which could make prior IOVAs invalid will fail.
> 
> U64_MAX is not a multiple of any page size. Maybe that U64_MAX assignment
> is a kernel bug?
Yes most probably the kernel should check address wraps and alignment.
Do you want to send a kernel patch yourself or shall I do?
> 
> 
>> If one decides to select something less than the whole range, which
>> value would you suggest?
> 
> I don't know. Isn't it host/guest/device specific? Should we ask KVM what
> the supported IPA is first? What kind of values are showing up in the
> IORTs of real hardware?
Digging into https://lkml.org/lkml/2017/7/31/472, the field is motivated
for cases where the bus connecting devices to the IOMMU is smaller in
size than the IOMMU input address bits. Architecturally the SMMU input
address space is 64 bits. As the vSMMUv3 only implements stage 1, the
input address is treated as a VA and when bypassed as intermediate
physical address.

My understanding is the VAS (VA size) is max 2x52bits=53 bits for
ARMv8.2. IPA is max 52 bits for 8.2.

But there are cases where the 64b upper byte is used (when TBI is set)
in the translation. I still feel difficult to understand SMMU spec 3.4.1
chapter (Input address size and Virtual Address size).

But anyway I think the kernel should support setting the value to 64bits.

The machines I have access to have Rev 0 IORT table so this field is not
used :-(

Thanks

Eric



> 
>>>
>>>>  
>>>>      /* Identity RID mapping covering the whole input RID range */
>>>>      idmap = &rc->id_mapping_array[0];
>>>> diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
>>>> index 532eaf79bd..b4a5104367 100644
>>>> --- a/include/hw/acpi/acpi-defs.h
>>>> +++ b/include/hw/acpi/acpi-defs.h
>>>> @@ -628,7 +628,11 @@ struct AcpiIortItsGroup {
>>>>  } QEMU_PACKED;
>>>>  typedef struct AcpiIortItsGroup AcpiIortItsGroup;
>>>>  
>>>> -#define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE 1
>>>> +enum {
>>>> +    ACPI_IORT_SMMU_V3_COHACC_OVERRIDE = 1 << 0,
>>>> +    ACPI_IORT_SMMU_V3_HTTU_OVERRIDE   = 3 << 1,
>>>> +    ACPI_IORT_SMMU_V3_PXM_VALID       = 1 << 3
>>>> +};
>>>
>>> We don't usually add flag definitions until we need them. Indeed it'll
>>> just be more code to delete when switching to the aml_append* API.
>>
>> The rationale was that those flags becomes usable with this revision so
>> I decided to expose them. Now I don't have a strong opinion here.
> 
> I'd drop the change then.
> 
> Thanks,
> drew
>
Andrew Jones - Dec. 18, 2018, 2:31 p.m.
On Tue, Dec 18, 2018 at 11:54:32AM +0100, Auger Eric wrote:
> Hi Drew,
> 
> On 12/17/18 7:25 PM, Andrew Jones wrote:
> > On Mon, Dec 17, 2018 at 05:49:02PM +0100, Auger Eric wrote:
> >> Hi Drew,
> >>
> >> On 12/17/18 5:27 PM, Andrew Jones wrote:
> >>> On Thu, Dec 06, 2018 at 06:07:33PM +0100, Eric Auger wrote:
> >>>> Let's update the structs according to revision D of the IORT
> >>>> specification and set the IORT node revision fields.
> >>>>
> >>>> In IORT smmuv3 node: the new proximity field is not used as
> >>>> the proximity domain valid flag is not set. The new DeviceId
> >>>> mapping index is not used either as all the SMMU control
> >>>> interrupts are GSIV based.
> >>>>
> >>>> In IORT RC node: the new memory address size limit field is
> >>>> set to 64 bits.
> >>>>
> >>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>
> >>>> ---
> >>>>
> >>>> v1 -> v2:
> >>>> - separate patches for SMMUv3 DMA coherency issue and struct
> >>>>   update to revision D.
> >>>> - revision fields set
> >>>> ---
> >>>>  hw/arm/virt-acpi-build.c    |  4 ++++
> >>>>  include/hw/acpi/acpi-defs.h | 10 +++++++++-
> >>>>  2 files changed, 13 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> >>>> index aa177ba64d..a34a0958a5 100644
> >>>> --- a/hw/arm/virt-acpi-build.c
> >>>> +++ b/hw/arm/virt-acpi-build.c
> >>>> @@ -423,6 +423,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>>>       */
> >>>>      iort_node_offset = sizeof(*iort);
> >>>>      iort->node_offset = cpu_to_le32(iort_node_offset);
> >>>> +    iort->revision = 0;
> >>>>  
> >>>>      /* ITS group node */
> >>>>      node_size =  sizeof(*its) + sizeof(uint32_t);
> >>>> @@ -445,6 +446,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>>>  
> >>>>          smmu->type = ACPI_IORT_NODE_SMMU_V3;
> >>>>          smmu->length = cpu_to_le16(node_size);
> >>>> +        smmu->revision = cpu_to_le32(2);
> >>>>          smmu->mapping_count = cpu_to_le32(1);
> >>>>          smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
> >>>>          smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
> >>>> @@ -470,6 +472,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>>>  
> >>>>      rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
> >>>>      rc->length = cpu_to_le16(node_size);
> >>>> +    rc->revision = cpu_to_le32(1);
> >>>>      rc->mapping_count = cpu_to_le32(1);
> >>>>      rc->mapping_offset = cpu_to_le32(sizeof(*rc));
> >>>>  
> >>>> @@ -477,6 +480,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
> >>>>      rc->memory_properties.cache_coherency = cpu_to_le32(1);
> >>>>      rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
> >>>>      rc->pci_segment_number = 0; /* MCFG pci_segment */
> >>>> +    rc->memory_address_limit = 64;
> >>>
> >>> Hmmm... IIUC this provides the size of the IOVA space. When it's 64 the
> >>> size of the space is U64_MAX, which gets added to the DMA base (also 64
> >>> bits) when calculating things like the last PFN. It seems strange to use
> >>> a value that will surely overflow those calculations. Is it common to
> >>> put 64 here? Can you elaborate on this?
> >>
> >> I looked at drivers/acpi/arm64/iort.c nc_dma_get_range.
> >> There you can find
> >>
> >>         *size = ncomp->memory_address_limit >= 64 ? U64_MAX :
> >>                         1ULL<<ncomp->memory_address_limit;
> >>
> >> So I was expecting the value "64" to be properly handled by the kernel.
> > 
> > I traced the code further and see that the size gets added to the u64
> > dma base without any special checks in both iort_dma_setup() and
> > iommu_dma_init_domain(). Also iommu_dma_init_domain() function has this
> > comment
> > 
> >  * @base and @size should be exact multiples of IOMMU page granularity to
> >  * avoid rounding surprises. If necessary, we reserve the page at address 0
> >  * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
> >  * any change which could make prior IOVAs invalid will fail.
> > 
> > U64_MAX is not a multiple of any page size. Maybe that U64_MAX assignment
> > is a kernel bug?
> Yes most probably the kernel should check address wraps and alignment.
> Do you want to send a kernel patch yourself or shall I do?

I could write a patch, but I'm not equipped to test it. I'm also not that
familiar with it's purpose, or even with IOVA ranges in general...

FWIW, I'd probably leave the U64_MAX assignment as is, but then check the
addition everywhere it's used. E.g. in iommu_dma_init_domain() we could
replace all occurrences of 'base + size' with 'max_addr', where 'max_addr'
is defined as

  dma_addr_t max_addr;

  if (base != ALIGN(base, iommu_page_size)) {
     pr_warn("specified DMA base is not on a %d boundary\n", iommu_page_size);
     return -EINVAL;
  }

  max_addr = base + size;

  if (max_addr < base)
     max_addr = U64_MAX;

And I'd remove the '@size' from the 'should be exact multiples of IOMMU
page granularity' comment.

And at least iort_dma_setup() also needs an overflow check.

If you agree, then I can send that myself, otherwise feel free to send
what you think is best.

> > 
> > 
> >> If one decides to select something less than the whole range, which
> >> value would you suggest?
> > 
> > I don't know. Isn't it host/guest/device specific? Should we ask KVM what
> > the supported IPA is first? What kind of values are showing up in the
> > IORTs of real hardware?
> Digging into https://lkml.org/lkml/2017/7/31/472, the field is motivated
> for cases where the bus connecting devices to the IOMMU is smaller in
> size than the IOMMU input address bits. Architecturally the SMMU input
> address space is 64 bits. As the vSMMUv3 only implements stage 1, the
> input address is treated as a VA and when bypassed as intermediate
> physical address.
> 
> My understanding is the VAS (VA size) is max 2x52bits=53 bits for
> ARMv8.2. IPA is max 52 bits for 8.2.
> 
> But there are cases where the 64b upper byte is used (when TBI is set)
> in the translation. I still feel difficult to understand SMMU spec 3.4.1
> chapter (Input address size and Virtual Address size).
> 
> But anyway I think the kernel should support setting the value to 64bits.
> 
> The machines I have access to have Rev 0 IORT table so this field is not
> used :-(

I'll take your word for it :-)

Thanks,
drew
Auger Eric - Dec. 18, 2018, 2:54 p.m.
Hi Drew,

On 12/18/18 3:31 PM, Andrew Jones wrote:
> On Tue, Dec 18, 2018 at 11:54:32AM +0100, Auger Eric wrote:
>> Hi Drew,
>>
>> On 12/17/18 7:25 PM, Andrew Jones wrote:
>>> On Mon, Dec 17, 2018 at 05:49:02PM +0100, Auger Eric wrote:
>>>> Hi Drew,
>>>>
>>>> On 12/17/18 5:27 PM, Andrew Jones wrote:
>>>>> On Thu, Dec 06, 2018 at 06:07:33PM +0100, Eric Auger wrote:
>>>>>> Let's update the structs according to revision D of the IORT
>>>>>> specification and set the IORT node revision fields.
>>>>>>
>>>>>> In IORT smmuv3 node: the new proximity field is not used as
>>>>>> the proximity domain valid flag is not set. The new DeviceId
>>>>>> mapping index is not used either as all the SMMU control
>>>>>> interrupts are GSIV based.
>>>>>>
>>>>>> In IORT RC node: the new memory address size limit field is
>>>>>> set to 64 bits.
>>>>>>
>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>
>>>>>> ---
>>>>>>
>>>>>> v1 -> v2:
>>>>>> - separate patches for SMMUv3 DMA coherency issue and struct
>>>>>>   update to revision D.
>>>>>> - revision fields set
>>>>>> ---
>>>>>>  hw/arm/virt-acpi-build.c    |  4 ++++
>>>>>>  include/hw/acpi/acpi-defs.h | 10 +++++++++-
>>>>>>  2 files changed, 13 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
>>>>>> index aa177ba64d..a34a0958a5 100644
>>>>>> --- a/hw/arm/virt-acpi-build.c
>>>>>> +++ b/hw/arm/virt-acpi-build.c
>>>>>> @@ -423,6 +423,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>>>       */
>>>>>>      iort_node_offset = sizeof(*iort);
>>>>>>      iort->node_offset = cpu_to_le32(iort_node_offset);
>>>>>> +    iort->revision = 0;
>>>>>>  
>>>>>>      /* ITS group node */
>>>>>>      node_size =  sizeof(*its) + sizeof(uint32_t);
>>>>>> @@ -445,6 +446,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>>>  
>>>>>>          smmu->type = ACPI_IORT_NODE_SMMU_V3;
>>>>>>          smmu->length = cpu_to_le16(node_size);
>>>>>> +        smmu->revision = cpu_to_le32(2);
>>>>>>          smmu->mapping_count = cpu_to_le32(1);
>>>>>>          smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
>>>>>>          smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
>>>>>> @@ -470,6 +472,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>>>  
>>>>>>      rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
>>>>>>      rc->length = cpu_to_le16(node_size);
>>>>>> +    rc->revision = cpu_to_le32(1);
>>>>>>      rc->mapping_count = cpu_to_le32(1);
>>>>>>      rc->mapping_offset = cpu_to_le32(sizeof(*rc));
>>>>>>  
>>>>>> @@ -477,6 +480,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>>>>>>      rc->memory_properties.cache_coherency = cpu_to_le32(1);
>>>>>>      rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
>>>>>>      rc->pci_segment_number = 0; /* MCFG pci_segment */
>>>>>> +    rc->memory_address_limit = 64;
>>>>>
>>>>> Hmmm... IIUC this provides the size of the IOVA space. When it's 64 the
>>>>> size of the space is U64_MAX, which gets added to the DMA base (also 64
>>>>> bits) when calculating things like the last PFN. It seems strange to use
>>>>> a value that will surely overflow those calculations. Is it common to
>>>>> put 64 here? Can you elaborate on this?
>>>>
>>>> I looked at drivers/acpi/arm64/iort.c nc_dma_get_range.
>>>> There you can find
>>>>
>>>>         *size = ncomp->memory_address_limit >= 64 ? U64_MAX :
>>>>                         1ULL<<ncomp->memory_address_limit;
>>>>
>>>> So I was expecting the value "64" to be properly handled by the kernel.
>>>
>>> I traced the code further and see that the size gets added to the u64
>>> dma base without any special checks in both iort_dma_setup() and
>>> iommu_dma_init_domain(). Also iommu_dma_init_domain() function has this
>>> comment
>>>
>>>  * @base and @size should be exact multiples of IOMMU page granularity to
>>>  * avoid rounding surprises. If necessary, we reserve the page at address 0
>>>  * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
>>>  * any change which could make prior IOVAs invalid will fail.
>>>
>>> U64_MAX is not a multiple of any page size. Maybe that U64_MAX assignment
>>> is a kernel bug?
>> Yes most probably the kernel should check address wraps and alignment.
>> Do you want to send a kernel patch yourself or shall I do?
> 
> I could write a patch, but I'm not equipped to test it. I'm also not that
> familiar with it's purpose, or even with IOVA ranges in general...
> 
> FWIW, I'd probably leave the U64_MAX assignment as is, but then check the
> addition everywhere it's used. E.g. in iommu_dma_init_domain() we could
> replace all occurrences of 'base + size' with 'max_addr', where 'max_addr'
> is defined as
> 
>   dma_addr_t max_addr;
> 
>   if (base != ALIGN(base, iommu_page_size)) {
>      pr_warn("specified DMA base is not on a %d boundary\n", iommu_page_size);
>      return -EINVAL;
>   }
> 
>   max_addr = base + size;
> 
>   if (max_addr < base)
>      max_addr = U64_MAX;
> 
> And I'd remove the '@size' from the 'should be exact multiples of IOMMU
> page granularity' comment.
> 
> And at least iort_dma_setup() also needs an overflow check.
> 
> If you agree, then I can send that myself, otherwise feel free to send
> what you think is best.

looks sensible to me. Looking forward to reviewing your patch then.

With respect to this series I plan to re-post patch 1 separately and
wait for those kernel uncertainties to be discussed before re-posting
patch 2. I will also work on the aml_append rework.

Thanks

Eric
> 
>>>
>>>
>>>> If one decides to select something less than the whole range, which
>>>> value would you suggest?
>>>
>>> I don't know. Isn't it host/guest/device specific? Should we ask KVM what
>>> the supported IPA is first? What kind of values are showing up in the
>>> IORTs of real hardware?
>> Digging into https://lkml.org/lkml/2017/7/31/472, the field is motivated
>> for cases where the bus connecting devices to the IOMMU is smaller in
>> size than the IOMMU input address bits. Architecturally the SMMU input
>> address space is 64 bits. As the vSMMUv3 only implements stage 1, the
>> input address is treated as a VA and when bypassed as intermediate
>> physical address.
>>
>> My understanding is the VAS (VA size) is max 2x52bits=53 bits for
>> ARMv8.2. IPA is max 52 bits for 8.2.
>>
>> But there are cases where the 64b upper byte is used (when TBI is set)
>> in the translation. I still feel difficult to understand SMMU spec 3.4.1
>> chapter (Input address size and Virtual Address size).
>>
>> But anyway I think the kernel should support setting the value to 64bits.
>>
>> The machines I have access to have Rev 0 IORT table so this field is not
>> used :-(
> 
> I'll take your word for it :-)
> 
> Thanks,
> drew
>

Patch

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index aa177ba64d..a34a0958a5 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -423,6 +423,7 @@  build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
      */
     iort_node_offset = sizeof(*iort);
     iort->node_offset = cpu_to_le32(iort_node_offset);
+    iort->revision = 0;
 
     /* ITS group node */
     node_size =  sizeof(*its) + sizeof(uint32_t);
@@ -445,6 +446,7 @@  build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 
         smmu->type = ACPI_IORT_NODE_SMMU_V3;
         smmu->length = cpu_to_le16(node_size);
+        smmu->revision = cpu_to_le32(2);
         smmu->mapping_count = cpu_to_le32(1);
         smmu->mapping_offset = cpu_to_le32(sizeof(*smmu));
         smmu->base_address = cpu_to_le64(vms->memmap[VIRT_SMMU].base);
@@ -470,6 +472,7 @@  build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 
     rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
     rc->length = cpu_to_le16(node_size);
+    rc->revision = cpu_to_le32(1);
     rc->mapping_count = cpu_to_le32(1);
     rc->mapping_offset = cpu_to_le32(sizeof(*rc));
 
@@ -477,6 +480,7 @@  build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     rc->memory_properties.cache_coherency = cpu_to_le32(1);
     rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
     rc->pci_segment_number = 0; /* MCFG pci_segment */
+    rc->memory_address_limit = 64;
 
     /* Identity RID mapping covering the whole input RID range */
     idmap = &rc->id_mapping_array[0];
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 532eaf79bd..b4a5104367 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -628,7 +628,11 @@  struct AcpiIortItsGroup {
 } QEMU_PACKED;
 typedef struct AcpiIortItsGroup AcpiIortItsGroup;
 
-#define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE 1
+enum {
+    ACPI_IORT_SMMU_V3_COHACC_OVERRIDE = 1 << 0,
+    ACPI_IORT_SMMU_V3_HTTU_OVERRIDE   = 3 << 1,
+    ACPI_IORT_SMMU_V3_PXM_VALID       = 1 << 3
+};
 
 struct AcpiIortSmmu3 {
     ACPI_IORT_NODE_HEADER_DEF
@@ -641,6 +645,8 @@  struct AcpiIortSmmu3 {
     uint32_t pri_gsiv;
     uint32_t gerr_gsiv;
     uint32_t sync_gsiv;
+    uint32_t pxm;
+    uint32_t id_mapping_index;
     AcpiIortIdMapping id_mapping_array[0];
 } QEMU_PACKED;
 typedef struct AcpiIortSmmu3 AcpiIortSmmu3;
@@ -650,6 +656,8 @@  struct AcpiIortRC {
     AcpiIortMemoryAccess memory_properties;
     uint32_t ats_attribute;
     uint32_t pci_segment_number;
+    uint8_t memory_address_limit;
+    uint8_t reserved2[3];
     AcpiIortIdMapping id_mapping_array[0];
 } QEMU_PACKED;
 typedef struct AcpiIortRC AcpiIortRC;