[dpdk-dev] vfio: fix sPAPR IOMMU DMA window size
Checks
Commit Message
DMA window size needs to be big enough to span all memory segments'
physical addresses. We do not need multiple levels of IOMMU tables
as we already span ~70TB of physical memory with 16MB hugepages.
Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
---
lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
Comments
On 08/08/17 01:11, Jonas Pfefferle wrote:
> DMA window size needs to be big enough to span all memory segment's
> physical addresses. We do not need multiple levels of IOMMU tables
> as we already span ~70TB of physical memory with 16MB hugepages.
>
> Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
> ---
> lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 ++++++++++++++++++++++---
> 1 file changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 946df7e..8502216 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -722,6 +722,18 @@ vfio_type1_dma_map(int vfio_container_fd)
> return 0;
> }
>
> +static uint64_t
> +roundup_next_pow2(uint64_t n)
> +{
> + uint32_t i;
> +
> + n--;
> + for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
> + n |= n >> i;
> +
> + return ++n;
> +}
> +
wow :)
QEMU does it using __builtin_ctzll() (used below for the page_shift)
without a loop:
https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/host-utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382
Anyway, seems working.
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> static int
> vfio_spapr_dma_map(int vfio_container_fd)
> {
> @@ -759,10 +771,12 @@ vfio_spapr_dma_map(int vfio_container_fd)
> return -1;
> }
>
> - /* calculate window size based on number of hugepages configured */
> - create.window_size = rte_eal_get_physmem_size();
> + /* physicaly pages are sorted descending i.e. ms[0].phys_addr is max */
> + /* create DMA window from 0 to max(phys_addr + len) */
> + /* sPAPR requires window size to be a power of 2 */
> + create.window_size = roundup_next_pow2(ms[0].phys_addr + ms[0].len);
> create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> - create.levels = 2;
> + create.levels = 1;
>
> ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> if (ret) {
> @@ -771,6 +785,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
> return -1;
> }
>
> + if (create.start_addr != 0) {
> + RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
> + return -1;
> + }
> +
> /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> struct vfio_iommu_type1_dma_map dma_map;
>
Alexey Kardashevskiy <aik@ozlabs.ru> wrote on 08/08/2017 09:38:00 AM:
> From: Alexey Kardashevskiy <aik@ozlabs.ru>
> To: Jonas Pfefferle <jpf@zurich.ibm.com>, anatoly.burakov@intel.com
> Cc: dev@dpdk.org
> Date: 08/08/2017 09:38 AM
> Subject: Re: [PATCH] vfio: fix sPAPR IOMMU DMA window size
>
> On 08/08/17 01:11, Jonas Pfefferle wrote:
> > DMA window size needs to be big enough to span all memory segment's
> > physical addresses. We do not need multiple levels of IOMMU tables
> > as we already span ~70TB of physical memory with 16MB hugepages.
> >
> > Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
> > ---
> > lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 ++++++++++++++++++++++---
> > 1 file changed, 22 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/
> librte_eal/linuxapp/eal/eal_vfio.c
> > index 946df7e..8502216 100644
> > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > @@ -722,6 +722,18 @@ vfio_type1_dma_map(int vfio_container_fd)
> > return 0;
> > }
> >
> > +static uint64_t
> > +roundup_next_pow2(uint64_t n)
> > +{
> > + uint32_t i;
> > +
> > + n--;
> > + for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
> > + n |= n >> i;
> > +
> > + return ++n;
> > +}
> > +
>
> wow :)
>
> QEMU does it using __builtin_ctzll() (used below for the page_shift)
> without a loop:
>
> https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/
> host-
>
utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382
>
>
> Anyway, seems working.
Ok let me fix that :)
>
>
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
>
>
>
> > static int
> > vfio_spapr_dma_map(int vfio_container_fd)
> > {
> > @@ -759,10 +771,12 @@ vfio_spapr_dma_map(int vfio_container_fd)
> > return -1;
> > }
> >
> > - /* calculate window size based on number of hugepages configured */
> > - create.window_size = rte_eal_get_physmem_size();
> > + /* physicaly pages are sorted descending i.e. ms[0].phys_addr is
max */
> > + /* create DMA window from 0 to max(phys_addr + len) */
> > + /* sPAPR requires window size to be a power of 2 */
> > + create.window_size = roundup_next_pow2(ms[0].phys_addr + ms
[0].len);
> > create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> > - create.levels = 2;
> > + create.levels = 1;
> >
> > ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE,
&create);
> > if (ret) {
> > @@ -771,6 +785,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
> > return -1;
> > }
> >
> > + if (create.start_addr != 0) {
> > + RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
> > + return -1;
> > + }
> > +
> > /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> > for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> > struct vfio_iommu_type1_dma_map dma_map;
> >
>
>
> --
> Alexey
>
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Alexey Kardashevskiy
> Sent: Tuesday, August 8, 2017 10:38 AM
> To: Jonas Pfefferle <jpf@zurich.ibm.com>; Burakov, Anatoly <anatoly.burakov@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] vfio: fix sPAPR IOMMU DMA window size
>
> On 08/08/17 01:11, Jonas Pfefferle wrote:
> > DMA window size needs to be big enough to span all memory segment's
> > physical addresses. We do not need multiple levels of IOMMU tables
> > as we already span ~70TB of physical memory with 16MB hugepages.
> >
> > Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
> > ---
> > lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 ++++++++++++++++++++++---
> > 1 file changed, 22 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > index 946df7e..8502216 100644
> > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > @@ -722,6 +722,18 @@ vfio_type1_dma_map(int vfio_container_fd)
> > return 0;
> > }
> >
> > +static uint64_t
> > +roundup_next_pow2(uint64_t n)
> > +{
> > + uint32_t i;
> > +
> > + n--;
> > + for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
> > + n |= n >> i;
> > +
> > + return ++n;
> > +}
> > +
>
> wow :)
>
> QEMU does it using __builtin_ctzll() (used below for the page_shift)
> without a loop:
>
> https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/host-
> utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382
>
>
> Anyway, seems working.
As I remember, there already exists rte_align64pow2().
Konstantin
>
>
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
>
>
>
> > static int
> > vfio_spapr_dma_map(int vfio_container_fd)
> > {
> > @@ -759,10 +771,12 @@ vfio_spapr_dma_map(int vfio_container_fd)
> > return -1;
> > }
> >
> > - /* calculate window size based on number of hugepages configured */
> > - create.window_size = rte_eal_get_physmem_size();
> > + /* physicaly pages are sorted descending i.e. ms[0].phys_addr is max */
> > + /* create DMA window from 0 to max(phys_addr + len) */
> > + /* sPAPR requires window size to be a power of 2 */
> > + create.window_size = roundup_next_pow2(ms[0].phys_addr + ms[0].len);
> > create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> > - create.levels = 2;
> > + create.levels = 1;
> >
> > ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> > if (ret) {
> > @@ -771,6 +785,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
> > return -1;
> > }
> >
> > + if (create.start_addr != 0) {
> > + RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
> > + return -1;
> > + }
> > +
> > /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> > for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> > struct vfio_iommu_type1_dma_map dma_map;
> >
>
>
> --
> Alexey
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote on 08/08/2017
10:27:28 AM:
> From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> To: Alexey Kardashevskiy <aik@ozlabs.ru>, Jonas Pfefferle
> <jpf@zurich.ibm.com>, "Burakov, Anatoly" <anatoly.burakov@intel.com>
> Cc: "dev@dpdk.org" <dev@dpdk.org>
> Date: 08/08/2017 10:27 AM
> Subject: RE: [dpdk-dev] [PATCH] vfio: fix sPAPR IOMMU DMA window size
>
>
>
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Alexey
Kardashevskiy
> > Sent: Tuesday, August 8, 2017 10:38 AM
> > To: Jonas Pfefferle <jpf@zurich.ibm.com>; Burakov, Anatoly
> <anatoly.burakov@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] vfio: fix sPAPR IOMMU DMA window size
> >
> > On 08/08/17 01:11, Jonas Pfefferle wrote:
> > > DMA window size needs to be big enough to span all memory segment's
> > > physical addresses. We do not need multiple levels of IOMMU tables
> > > as we already span ~70TB of physical memory with 16MB hugepages.
> > >
> > > Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
> > > ---
> > > lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 +++++++++++++++++++++
+---
> > > 1 file changed, 22 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/
> librte_eal/linuxapp/eal/eal_vfio.c
> > > index 946df7e..8502216 100644
> > > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > > @@ -722,6 +722,18 @@ vfio_type1_dma_map(int vfio_container_fd)
> > > return 0;
> > > }
> > >
> > > +static uint64_t
> > > +roundup_next_pow2(uint64_t n)
> > > +{
> > > + uint32_t i;
> > > +
> > > + n--;
> > > + for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
> > > + n |= n >> i;
> > > +
> > > + return ++n;
> > > +}
> > > +
> >
> > wow :)
> >
> > QEMU does it using __builtin_ctzll() (used below for the page_shift)
> > without a loop:
> >
> > https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/host-
> >
>
utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382
> >
> >
> > Anyway, seems working.
>
> As I remember, there already exists rte_align64pow2().
> Konstantin
Thanks. Fixed it.
>
> >
> >
> > Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >
> >
> >
> >
> > > static int
> > > vfio_spapr_dma_map(int vfio_container_fd)
> > > {
> > > @@ -759,10 +771,12 @@ vfio_spapr_dma_map(int vfio_container_fd)
> > > return -1;
> > > }
> > >
> > > - /* calculate window size based on number of hugepages configured
*/
> > > - create.window_size = rte_eal_get_physmem_size();
> > > + /* physicaly pages are sorted descending i.e. ms
> [0].phys_addr is max */
> > > + /* create DMA window from 0 to max(phys_addr + len) */
> > > + /* sPAPR requires window size to be a power of 2 */
> > > + create.window_size = roundup_next_pow2(ms[0].phys_addr + ms
[0].len);
> > > create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> > > - create.levels = 2;
> > > + create.levels = 1;
> > >
> > > ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE,
&create);
> > > if (ret) {
> > > @@ -771,6 +785,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
> > > return -1;
> > > }
> > >
> > > + if (create.start_addr != 0) {
> > > + RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
> > > + return -1;
> > > + }
> > > +
> > > /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> > > for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> > > struct vfio_iommu_type1_dma_map dma_map;
> > >
> >
> >
> > --
> > Alexey
@@ -722,6 +722,18 @@ vfio_type1_dma_map(int vfio_container_fd)
return 0;
}
+static uint64_t
+roundup_next_pow2(uint64_t n)
+{
+ uint32_t i;
+
+ n--;
+ for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
+ n |= n >> i;
+
+ return ++n;
+}
+
static int
vfio_spapr_dma_map(int vfio_container_fd)
{
@@ -759,10 +771,12 @@ vfio_spapr_dma_map(int vfio_container_fd)
return -1;
}
- /* calculate window size based on number of hugepages configured */
- create.window_size = rte_eal_get_physmem_size();
+ /* physical pages are sorted in descending order, i.e. ms[0].phys_addr is max */
+ /* create DMA window from 0 to max(phys_addr + len) */
+ /* sPAPR requires window size to be a power of 2 */
+ create.window_size = roundup_next_pow2(ms[0].phys_addr + ms[0].len);
create.page_shift = __builtin_ctzll(ms->hugepage_sz);
- create.levels = 2;
+ create.levels = 1;
ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
if (ret) {
@@ -771,6 +785,11 @@ vfio_spapr_dma_map(int vfio_container_fd)
return -1;
}
+ if (create.start_addr != 0) {
+ RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
+ return -1;
+ }
+
/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
for (i = 0; i < RTE_MAX_MEMSEG; i++) {
struct vfio_iommu_type1_dma_map dma_map;