[dpdk-dev] net/mlx5: remmap UAR address for multiple process

Message ID 20180119150854.89828-1-xuemingl@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues

Commit Message

Xueming Li Jan. 19, 2018, 3:08 p.m. UTC
  UAR(doorbell) is hw resources that have to be same address between
primary and secondary process, failed to mmap UAR will make TX packets
invisible to HW.
Today, UAR address returned from verbs api is mixed in heap and loaded
library address space, prone to be occupied in secondary process.
This patch reserves a dedicate UAR address space, both primary and
secondary process re-mmap UAR pages into this space.
Below is a brief picture of dpdk app address space allocation:
	Before			This patch
	------			----------
	[stack]			[stack]
	[.so, uar, heap]	[.so, heap]
	[(empty)]		[(empty)]
	[hugepage]		[hugepage]
	[? others]		[? others]
	[(empty)]		[(empty)]
				[uar]
				[(empty)]
To minimize conflicts, UAR address space comes after hugepage space with
an offset to skip potential usage from other drivers.

Once UAR space reserved successfully, UAR pages are re-mmapped into new
area to keep UAR address aligned between primary and secondary process.

Signed-off-by: Xueming Li <xuemingl@mellanox.com>
---
 drivers/net/mlx5/mlx5.c         | 107 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5.h         |   1 +
 drivers/net/mlx5/mlx5_defs.h    |  10 ++++
 drivers/net/mlx5/mlx5_rxtx.h    |   3 +-
 drivers/net/mlx5/mlx5_trigger.c |   7 ++-
 drivers/net/mlx5/mlx5_txq.c     |  51 +++++++++++++------
 6 files changed, 163 insertions(+), 16 deletions(-)
  

Comments

Nélio Laranjeiro Jan. 22, 2018, 2:53 p.m. UTC | #1
Hi Xueming,

On Fri, Jan 19, 2018 at 11:08:54PM +0800, Xueming Li wrote:
> UAR(doorbell) is hw resources that have to be same address between
> primary and secondary process, failed to mmap UAR will make TX packets
> invisible to HW.
> Today, UAR address returned from verbs api is mixed in heap and loaded
> library address space, prone to be occupied in secondary process.
> This patch reserves a dedicate UAR address space, both primary and
> secondary process re-mmap UAR pages into this space.
> Below is a brief picture of dpdk app address space allocation:
> 	Before			This patch
> 	------			----------
> 	[stack]			[stack]
> 	[.so, uar, heap]	[.so, heap]
> 	[(empty)]		[(empty)]
> 	[hugepage]		[hugepage]
> 	[? others]		[? others]
> 	[(empty)]		[(empty)]
> 				[uar]
> 				[(empty)]
> To minimize conflicts, UAR address space comes after hugepage space with
> an offset to skip potential usage from other drivers.

Seems it is not the case when the memory is contiguous, according to
what I see in my testpmd /proc/<pid>/maps:

 PMD: mlx5.c:523: mlx5_uar_init_primary(): Reserved UAR address space: 0x0x7f4da5800000

And the fist huge page is at address 0x7f4fa5800000, new UAR space is
before and not after.

With this patch I still have the situation described as "before".

> Once UAR space reserved successfully, UAR pages are re-mmapped into new
> area to keep UAR address aligned between primary and secondary process.
> 
> Signed-off-by: Xueming Li <xuemingl@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c         | 107 ++++++++++++++++++++++++++++++++++++++++
>  drivers/net/mlx5/mlx5.h         |   1 +
>  drivers/net/mlx5/mlx5_defs.h    |  10 ++++
>  drivers/net/mlx5/mlx5_rxtx.h    |   3 +-
>  drivers/net/mlx5/mlx5_trigger.c |   7 ++-
>  drivers/net/mlx5/mlx5_txq.c     |  51 +++++++++++++------
>  6 files changed, 163 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index fc2d59fee..1539ef608 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -39,6 +39,7 @@
>  #include <stdlib.h>
>  #include <errno.h>
>  #include <net/if.h>
> +#include <sys/mman.h>
>  
>  /* Verbs header. */
>  /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
> @@ -56,6 +57,7 @@
>  #include <rte_pci.h>
>  #include <rte_bus_pci.h>
>  #include <rte_common.h>
> +#include <rte_eal_memconfig.h>
>  #include <rte_kvargs.h>
>  
>  #include "mlx5.h"
> @@ -466,6 +468,101 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
>  
>  static struct rte_pci_driver mlx5_driver;
>  
> +/*
> + * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process
> + * local resource used by both primary and secondary to avoid duplicate
> + * reservation.
> + * The space has to be available on both primary and secondary process,
> + * TXQ UAR maps to this area using fixed mmap w/o double check.
> + */
> +static void *uar_base;
> +
> +/**
> + * Reserve UAR address space for primary process
> + *
> + * @param[in] priv
> + *   Pointer to private structure.
> + *
> + * @return
> + *   0 on success, negative errno value on failure.
> + */
> +static int
> +mlx5_uar_init_primary(struct priv *priv)
> +{
> +	void *addr = (void *)0;
> +	int i;
> +	const struct rte_mem_config *mcfg;
> +
> +	if (uar_base) { /* UAR address space mapped */
> +		priv->uar_base = uar_base;
> +		return 0;
> +	}
> +	/* find out lower bound of hugepage segments */
> +	mcfg = rte_eal_get_configuration()->mem_config;
> +	for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {
> +		if (addr)
> +			addr = RTE_MIN(addr, mcfg->memseg[i].addr);
> +		else
> +			addr = mcfg->memseg[i].addr;

This if/else is useless as addr is already initialised with the smallest
possible value.

> +	}
> +	/* offset down UAR area */
> +	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);

Seems the error is here, the loops get the address of the memseg with
the smallest address and then it subtract the UAR size, addr cannot be
after the huge pages unless if this subtraction overflows.

> +	/* anonymous mmap, no real memory consumption */
> +	addr = mmap(addr, MLX5_UAR_SIZE,
> +		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +	if (addr == MAP_FAILED) {
> +		ERROR("Failed to reserve UAR address space, please adjust "
> +		      "MLX5_UAR_SIZE or try --base-virtaddr");

How does a user knows the UAR memory space the NIC needs to adjust the
MLX5_UAR_SIZE?

> +		return -ENOMEM;
> +	}
> +	/* Accept either same addr or a new addr returned from mmap if target
> +	 * range occupied.
> +	 */
> +	INFO("Reserved UAR address space: 0x%p", addr);

The '%p' already prefix the address with the 0x.

> +	priv->uar_base = addr; /* for primary and secondary UAR re-mmap */
> +	uar_base = addr; /* process local, don't reserve again */
> +	return 0;
> +}
> +
<snip/>

Regards,
  
Xueming Li Jan. 23, 2018, 9:50 a.m. UTC | #2
Hi Nelio,

> -----Original Message-----

> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]

> Sent: Monday, January 22, 2018 10:53 PM

> To: Xueming(Steven) Li <xuemingl@mellanox.com>

> Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org

> Subject: Re: [PATCH] net/mlx5: remmap UAR address for multiple process

> 

> Hi Xueming,

> 

> On Fri, Jan 19, 2018 at 11:08:54PM +0800, Xueming Li wrote:

> > UAR(doorbell) is hw resources that have to be same address between

> > primary and secondary process, failed to mmap UAR will make TX packets

> > invisible to HW.

> > Today, UAR address returned from verbs api is mixed in heap and loaded

> > library address space, prone to be occupied in secondary process.

> > This patch reserves a dedicate UAR address space, both primary and

> > secondary process re-mmap UAR pages into this space.

> > Below is a brief picture of dpdk app address space allocation:

> > 	Before			This patch

> > 	------			----------

> > 	[stack]			[stack]

> > 	[.so, uar, heap]	[.so, heap]

> > 	[(empty)]		[(empty)]

> > 	[hugepage]		[hugepage]

> > 	[? others]		[? others]

> > 	[(empty)]		[(empty)]

> > 				[uar]

> > 				[(empty)]

> > To minimize conflicts, UAR address space comes after hugepage space

> > with an offset to skip potential usage from other drivers.

> 

> Seems it is not the case when the memory is contiguous, according to what

> I see in my testpmd /proc/<pid>/maps:

> 

>  PMD: mlx5.c:523: mlx5_uar_init_primary(): Reserved UAR address space:

> 0x0x7f4da5800000

> 

> And the fist huge page is at address 0x7f4fa5800000, new UAR space is

> before and not after.

> 

> With this patch I still have the situation described as "before".

> 


Your observation is correct, system is allocating address in a high-to-low
manner like stack. UAR address range 0x0x7f4da5800000 - 0x0x7f4ea5800000, 
4GB size, With another 4G offset, hugepage range start is 0x7f4fa5800000.

> > Once UAR space reserved successfully, UAR pages are re-mmapped into

> > new area to keep UAR address aligned between primary and secondary

> process.

> >

> > Signed-off-by: Xueming Li <xuemingl@mellanox.com>

> > ---

> >  drivers/net/mlx5/mlx5.c         | 107

> ++++++++++++++++++++++++++++++++++++++++

> >  drivers/net/mlx5/mlx5.h         |   1 +

> >  drivers/net/mlx5/mlx5_defs.h    |  10 ++++

> >  drivers/net/mlx5/mlx5_rxtx.h    |   3 +-

> >  drivers/net/mlx5/mlx5_trigger.c |   7 ++-

> >  drivers/net/mlx5/mlx5_txq.c     |  51 +++++++++++++------

> >  6 files changed, 163 insertions(+), 16 deletions(-)

> >

> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index

> > fc2d59fee..1539ef608 100644

> > --- a/drivers/net/mlx5/mlx5.c

> > +++ b/drivers/net/mlx5/mlx5.c

> > @@ -39,6 +39,7 @@

> >  #include <stdlib.h>

> >  #include <errno.h>

> >  #include <net/if.h>

> > +#include <sys/mman.h>

> >

> >  /* Verbs header. */

> >  /* ISO C doesn't support unnamed structs/unions, disabling -pedantic.

> > */ @@ -56,6 +57,7 @@  #include <rte_pci.h>  #include <rte_bus_pci.h>

> > #include <rte_common.h>

> > +#include <rte_eal_memconfig.h>

> >  #include <rte_kvargs.h>

> >

> >  #include "mlx5.h"

> > @@ -466,6 +468,101 @@ mlx5_args(struct mlx5_dev_config *config, struct

> > rte_devargs *devargs)

> >

> >  static struct rte_pci_driver mlx5_driver;

> >

> > +/*

> > + * Reserved UAR address space for TXQ UAR(hw doorbell) mapping,

> > +process

> > + * local resource used by both primary and secondary to avoid

> > +duplicate

> > + * reservation.

> > + * The space has to be available on both primary and secondary

> > +process,

> > + * TXQ UAR maps to this area using fixed mmap w/o double check.

> > + */

> > +static void *uar_base;

> > +

> > +/**

> > + * Reserve UAR address space for primary process

> > + *

> > + * @param[in] priv

> > + *   Pointer to private structure.

> > + *

> > + * @return

> > + *   0 on success, negative errno value on failure.

> > + */

> > +static int

> > +mlx5_uar_init_primary(struct priv *priv) {

> > +	void *addr = (void *)0;

> > +	int i;

> > +	const struct rte_mem_config *mcfg;

> > +

> > +	if (uar_base) { /* UAR address space mapped */

> > +		priv->uar_base = uar_base;

> > +		return 0;

> > +	}

> > +	/* find out lower bound of hugepage segments */

> > +	mcfg = rte_eal_get_configuration()->mem_config;

> > +	for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {

> > +		if (addr)

> > +			addr = RTE_MIN(addr, mcfg->memseg[i].addr);

> > +		else

> > +			addr = mcfg->memseg[i].addr;

> 

> This if/else is useless as addr is already initialised with the smallest

> possible value.


That's my original code :-) and I always get addr zero then. 
Addr here is the lower bound of hugepage, we don't want addr to keep zero.

> 

> > +	}

> > +	/* offset down UAR area */

> > +	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);

> 

> Seems the error is here, the loops get the address of the memseg with the

> smallest address and then it subtract the UAR size, addr cannot be after

> the huge pages unless if this subtraction overflows.


Thanks, my word "after" is something like address alloction order, the UAR block 
under "hugepage" on the overall picture.

> 

> > +	/* anonymous mmap, no real memory consumption */

> > +	addr = mmap(addr, MLX5_UAR_SIZE,

> > +		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

> > +	if (addr == MAP_FAILED) {

> > +		ERROR("Failed to reserve UAR address space, please adjust "

> > +		      "MLX5_UAR_SIZE or try --base-virtaddr");

> 

> How does a user knows the UAR memory space the NIC needs to adjust the

> MLX5_UAR_SIZE?

> 

> > +		return -ENOMEM;

> > +	}

> > +	/* Accept either same addr or a new addr returned from mmap if

> target

> > +	 * range occupied.

> > +	 */

> > +	INFO("Reserved UAR address space: 0x%p", addr);

> 

> The '%p' already prefix the address with the 0x.

> 

> > +	priv->uar_base = addr; /* for primary and secondary UAR re-mmap */

> > +	uar_base = addr; /* process local, don't reserve again */

> > +	return 0;

> > +}

> > +

> <snip/>

> 

> Regards,

> 

> --

> Nélio Laranjeiro

> 6WIND
  
Nélio Laranjeiro Jan. 23, 2018, 1:31 p.m. UTC | #3
Hi Xueming,

My lonely comments are more related to the commit log which should be
re-written to be more accurate to the issue you try to address, even if
this patch does not solves it.

Please see below,

On Tue, Jan 23, 2018 at 09:50:42AM +0000, Xueming(Steven) Li wrote:
> Hi Nelio,
> 
> > -----Original Message-----
> > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > Sent: Monday, January 22, 2018 10:53 PM
> > To: Xueming(Steven) Li <xuemingl@mellanox.com>
> > Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> > Subject: Re: [PATCH] net/mlx5: remmap UAR address for multiple process
> > 
> > Hi Xueming,
> > 
> > On Fri, Jan 19, 2018 at 11:08:54PM +0800, Xueming Li wrote:
> > > UAR(doorbell) is hw resources that have to be same address between
> > > primary and secondary process, failed to mmap UAR will make TX packets
> > > invisible to HW.
> > > Today, UAR address returned from verbs api is mixed in heap and loaded
> > > library address space, prone to be occupied in secondary process.
> > > This patch reserves a dedicate UAR address space, both primary and
> > > secondary process re-mmap UAR pages into this space.
> > > Below is a brief picture of dpdk app address space allocation:
> > > 	Before			This patch
> > > 	------			----------
> > > 	[stack]			[stack]
> > > 	[.so, uar, heap]	[.so, heap]
> > > 	[(empty)]		[(empty)]
> > > 	[hugepage]		[hugepage]
> > > 	[? others]		[? others]
> > > 	[(empty)]		[(empty)]
> > > 				[uar]
> > > 				[(empty)]
> > > To minimize conflicts, UAR address space comes after hugepage space
> > > with an offset to skip potential usage from other drivers.
> > 
> > Seems it is not the case when the memory is contiguous, according to what
> > I see in my testpmd /proc/<pid>/maps:
> > 
> >  PMD: mlx5.c:523: mlx5_uar_init_primary(): Reserved UAR address space:
> > 0x0x7f4da5800000
> > 
> > And the fist huge page is at address 0x7f4fa5800000, new UAR space is
> > before and not after.
> > 
> > With this patch I still have the situation described as "before".
> > 
> 
> Your observation is correct, system is allocating address in a high-to-low
> manner like stack. UAR address range 0x0x7f4da5800000 - 0x0x7f4ea5800000, 
> 4GB size, With another 4G offset, hugepage range start is 0x7f4fa5800000.

From what I understand, remapping the UAR pages to an address before
the huge pages reduce the situation where the secondaries process cannot
start.  This patch does not fix the fact it may fail.

Your small display of the memory mapping between before and after seems
not accurate depending on the OS being run, on Linux v4.14 from debian9
S.I.D. I am still on the situation before no matter how many time I
restart the process.  For that I'll suggest you to remove it.

> > > Once UAR space reserved successfully, UAR pages are re-mmapped into
> > > new area to keep UAR address aligned between primary and secondary
> > process.
> > >
> > > Signed-off-by: Xueming Li <xuemingl@mellanox.com>
> > > ---
> > >  drivers/net/mlx5/mlx5.c         | 107
> > ++++++++++++++++++++++++++++++++++++++++
> > >  drivers/net/mlx5/mlx5.h         |   1 +
> > >  drivers/net/mlx5/mlx5_defs.h    |  10 ++++
> > >  drivers/net/mlx5/mlx5_rxtx.h    |   3 +-
> > >  drivers/net/mlx5/mlx5_trigger.c |   7 ++-
> > >  drivers/net/mlx5/mlx5_txq.c     |  51 +++++++++++++------
> > >  6 files changed, 163 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > > fc2d59fee..1539ef608 100644
> > > --- a/drivers/net/mlx5/mlx5.c
> > > +++ b/drivers/net/mlx5/mlx5.c
> > > @@ -39,6 +39,7 @@
> > >  #include <stdlib.h>
> > >  #include <errno.h>
> > >  #include <net/if.h>
> > > +#include <sys/mman.h>
> > >
> > >  /* Verbs header. */
> > >  /* ISO C doesn't support unnamed structs/unions, disabling -pedantic.
> > > */ @@ -56,6 +57,7 @@  #include <rte_pci.h>  #include <rte_bus_pci.h>
> > > #include <rte_common.h>
> > > +#include <rte_eal_memconfig.h>
> > >  #include <rte_kvargs.h>
> > >
> > >  #include "mlx5.h"
> > > @@ -466,6 +468,101 @@ mlx5_args(struct mlx5_dev_config *config, struct
> > > rte_devargs *devargs)
> > >
> > >  static struct rte_pci_driver mlx5_driver;
> > >
> > > +/*
> > > + * Reserved UAR address space for TXQ UAR(hw doorbell) mapping,
> > > +process
> > > + * local resource used by both primary and secondary to avoid
> > > +duplicate
> > > + * reservation.
> > > + * The space has to be available on both primary and secondary
> > > +process,
> > > + * TXQ UAR maps to this area using fixed mmap w/o double check.
> > > + */
> > > +static void *uar_base;
> > > +
> > > +/**
> > > + * Reserve UAR address space for primary process
> > > + *
> > > + * @param[in] priv
> > > + *   Pointer to private structure.
> > > + *
> > > + * @return
> > > + *   0 on success, negative errno value on failure.
> > > + */
> > > +static int
> > > +mlx5_uar_init_primary(struct priv *priv) {
> > > +	void *addr = (void *)0;
> > > +	int i;
> > > +	const struct rte_mem_config *mcfg;
> > > +
> > > +	if (uar_base) { /* UAR address space mapped */
> > > +		priv->uar_base = uar_base;
> > > +		return 0;
> > > +	}
> > > +	/* find out lower bound of hugepage segments */
> > > +	mcfg = rte_eal_get_configuration()->mem_config;
> > > +	for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {
> > > +		if (addr)
> > > +			addr = RTE_MIN(addr, mcfg->memseg[i].addr);
> > > +		else
> > > +			addr = mcfg->memseg[i].addr;
> > 
> > This if/else is useless as addr is already initialised with the smallest
> > possible value.
> 
> That's my original code :-) and I always get addr zero then. 
> Addr here is the lower bound of hugepage, we don't want addr to keep zero.

Indeed, I mix my mind the min and max.

> > > +	}
> > > +	/* offset down UAR area */
> > > +	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
> > 
> > Seems the error is here, the loops get the address of the memseg with the
> > smallest address and then it subtract the UAR size, addr cannot be after
> > the huge pages unless if this subtraction overflows.
> 
> Thanks, my word "after" is something like address alloction order, the UAR block 
> under "hugepage" on the overall picture.

There is no guarantee that the system will allocate from the end to the
beginning.  After means having an address higher than the reference,
otherwise it is not after but before.

> > > +	/* anonymous mmap, no real memory consumption */
> > > +	addr = mmap(addr, MLX5_UAR_SIZE,
> > > +		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> > > +	if (addr == MAP_FAILED) {
> > > +		ERROR("Failed to reserve UAR address space, please adjust "
> > > +		      "MLX5_UAR_SIZE or try --base-virtaddr");
> > 
> > How does a user knows the UAR memory space the NIC needs to adjust the
> > MLX5_UAR_SIZE?
> > 
> > > +		return -ENOMEM;
> > > +	}
> > > +	/* Accept either same addr or a new addr returned from mmap if
> > target
> > > +	 * range occupied.
> > > +	 */
> > > +	INFO("Reserved UAR address space: 0x%p", addr);
> > 
> > The '%p' already prefix the address with the 0x.
> > 
> > > +	priv->uar_base = addr; /* for primary and secondary UAR re-mmap */
> > > +	uar_base = addr; /* process local, don't reserve again */
> > > +	return 0;
> > > +}
> > > +
> > <snip/>
> > 
> > Regards,
> > 
> > --
> > Nélio Laranjeiro
> > 6WIND

Thanks,
  
Xueming Li Jan. 23, 2018, 2:16 p.m. UTC | #4
> -----Original Message-----

> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]

> Sent: Tuesday, January 23, 2018 9:31 PM

> To: Xueming(Steven) Li <xuemingl@mellanox.com>

> Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org

> Subject: Re: [PATCH] net/mlx5: remmap UAR address for multiple process

> 

> Hi Xueming,

> 

> My lonely comments are more related to the commit log which should be re-

> written to be more accurate to the issue you try to address, even if this

> patch does not solves it.


The current situation is that UAR locates inside top area of address space,
VERY easy to conflicts in secondary process, this patch address that with 
less conflicts.

Similar to hugepage mapping in secondary, no good solution to resolve it 
completely, correct? But it always good to add this warning in commit log.

> 

> Please see below,

> 

> On Tue, Jan 23, 2018 at 09:50:42AM +0000, Xueming(Steven) Li wrote:

> > Hi Nelio,

> >

> > > -----Original Message-----

> > > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]

> > > Sent: Monday, January 22, 2018 10:53 PM

> > > To: Xueming(Steven) Li <xuemingl@mellanox.com>

> > > Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org

> > > Subject: Re: [PATCH] net/mlx5: remmap UAR address for multiple

> > > process

> > >

> > > Hi Xueming,

> > >

> > > On Fri, Jan 19, 2018 at 11:08:54PM +0800, Xueming Li wrote:

> > > > UAR(doorbell) is hw resources that have to be same address between

> > > > primary and secondary process, failed to mmap UAR will make TX

> > > > packets invisible to HW.

> > > > Today, UAR address returned from verbs api is mixed in heap and

> > > > loaded library address space, prone to be occupied in secondary

> process.

> > > > This patch reserves a dedicate UAR address space, both primary and

> > > > secondary process re-mmap UAR pages into this space.

> > > > Below is a brief picture of dpdk app address space allocation:

> > > > 	Before			This patch

> > > > 	------			----------

> > > > 	[stack]			[stack]

> > > > 	[.so, uar, heap]	[.so, heap]

> > > > 	[(empty)]		[(empty)]

> > > > 	[hugepage]		[hugepage]

> > > > 	[? others]		[? others]

> > > > 	[(empty)]		[(empty)]

> > > > 				[uar]

> > > > 				[(empty)]

> > > > To minimize conflicts, UAR address space comes after hugepage

> > > > space with an offset to skip potential usage from other drivers.

> > >

> > > Seems it is not the case when the memory is contiguous, according to

> > > what I see in my testpmd /proc/<pid>/maps:

> > >

> > >  PMD: mlx5.c:523: mlx5_uar_init_primary(): Reserved UAR address space:

> > > 0x0x7f4da5800000

> > >

> > > And the fist huge page is at address 0x7f4fa5800000, new UAR space

> > > is before and not after.

> > >

> > > With this patch I still have the situation described as "before".

> > >

> >

> > Your observation is correct, system is allocating address in a

> > high-to-low manner like stack. UAR address range 0x0x7f4da5800000 -

> > 0x0x7f4ea5800000, 4GB size, With another 4G offset, hugepage range start

> is 0x7f4fa5800000.

> 

> From what I understand, remapping the UAR pages to an address before the

> huge pages reduce the situation where the secondaries process cannot start.

> This patch does not fix the fact it may fail.

> 

> Your small display of the memory mapping between before and after seems

> not accurate depending on the OS being run, on Linux v4.14 from debian9

> S.I.D. I am still on the situation before no matter how many time I

> restart the process.  For that I'll suggest you to remove it.

> 

> > > > Once UAR space reserved successfully, UAR pages are re-mmapped

> > > > into new area to keep UAR address aligned between primary and

> > > > secondary

> > > process.

> > > >

> > > > Signed-off-by: Xueming Li <xuemingl@mellanox.com>

> > > > ---

> > > >  drivers/net/mlx5/mlx5.c         | 107

> > > ++++++++++++++++++++++++++++++++++++++++

> > > >  drivers/net/mlx5/mlx5.h         |   1 +

> > > >  drivers/net/mlx5/mlx5_defs.h    |  10 ++++

> > > >  drivers/net/mlx5/mlx5_rxtx.h    |   3 +-

> > > >  drivers/net/mlx5/mlx5_trigger.c |   7 ++-

> > > >  drivers/net/mlx5/mlx5_txq.c     |  51 +++++++++++++------

> > > >  6 files changed, 163 insertions(+), 16 deletions(-)

> > > >

> > > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c

> > > > index

> > > > fc2d59fee..1539ef608 100644

> > > > --- a/drivers/net/mlx5/mlx5.c

> > > > +++ b/drivers/net/mlx5/mlx5.c

> > > > @@ -39,6 +39,7 @@

> > > >  #include <stdlib.h>

> > > >  #include <errno.h>

> > > >  #include <net/if.h>

> > > > +#include <sys/mman.h>

> > > >

> > > >  /* Verbs header. */

> > > >  /* ISO C doesn't support unnamed structs/unions, disabling -

> pedantic.

> > > > */ @@ -56,6 +57,7 @@  #include <rte_pci.h>  #include

> > > > <rte_bus_pci.h> #include <rte_common.h>

> > > > +#include <rte_eal_memconfig.h>

> > > >  #include <rte_kvargs.h>

> > > >

> > > >  #include "mlx5.h"

> > > > @@ -466,6 +468,101 @@ mlx5_args(struct mlx5_dev_config *config,

> > > > struct rte_devargs *devargs)

> > > >

> > > >  static struct rte_pci_driver mlx5_driver;

> > > >

> > > > +/*

> > > > + * Reserved UAR address space for TXQ UAR(hw doorbell) mapping,

> > > > +process

> > > > + * local resource used by both primary and secondary to avoid

> > > > +duplicate

> > > > + * reservation.

> > > > + * The space has to be available on both primary and secondary

> > > > +process,

> > > > + * TXQ UAR maps to this area using fixed mmap w/o double check.

> > > > + */

> > > > +static void *uar_base;

> > > > +

> > > > +/**

> > > > + * Reserve UAR address space for primary process

> > > > + *

> > > > + * @param[in] priv

> > > > + *   Pointer to private structure.

> > > > + *

> > > > + * @return

> > > > + *   0 on success, negative errno value on failure.

> > > > + */

> > > > +static int

> > > > +mlx5_uar_init_primary(struct priv *priv) {

> > > > +	void *addr = (void *)0;

> > > > +	int i;

> > > > +	const struct rte_mem_config *mcfg;

> > > > +

> > > > +	if (uar_base) { /* UAR address space mapped */

> > > > +		priv->uar_base = uar_base;

> > > > +		return 0;

> > > > +	}

> > > > +	/* find out lower bound of hugepage segments */

> > > > +	mcfg = rte_eal_get_configuration()->mem_config;

> > > > +	for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {

> > > > +		if (addr)

> > > > +			addr = RTE_MIN(addr, mcfg->memseg[i].addr);

> > > > +		else

> > > > +			addr = mcfg->memseg[i].addr;

> > >

> > > This if/else is useless as addr is already initialised with the

> > > smallest possible value.

> >

> > That's my original code :-) and I always get addr zero then.

> > Addr here is the lower bound of hugepage, we don't want addr to keep

> zero.

> 

> Indeed, I mix my mind the min and max.

> 

> > > > +	}

> > > > +	/* offset down UAR area */

> > > > +	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);

> > >

> > > Seems the error is here, the loops get the address of the memseg

> > > with the smallest address and then it subtract the UAR size, addr

> > > cannot be after the huge pages unless if this subtraction overflows.

> >

> > Thanks, my word "after" is something like address alloction order, the

> > UAR block under "hugepage" on the overall picture.

> 

> There is no guarantee that the system will allocate from the end to the

> beginning.  After means having an address higher than the reference,

> otherwise it is not after but before.

> 

> > > > +	/* anonymous mmap, no real memory consumption */

> > > > +	addr = mmap(addr, MLX5_UAR_SIZE,

> > > > +		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

> > > > +	if (addr == MAP_FAILED) {

> > > > +		ERROR("Failed to reserve UAR address space, please

> adjust "

> > > > +		      "MLX5_UAR_SIZE or try --base-virtaddr");

> > >

> > > How does a user knows the UAR memory space the NIC needs to adjust

> > > the MLX5_UAR_SIZE?

> > >

> > > > +		return -ENOMEM;

> > > > +	}

> > > > +	/* Accept either same addr or a new addr returned from mmap if

> > > target

> > > > +	 * range occupied.

> > > > +	 */

> > > > +	INFO("Reserved UAR address space: 0x%p", addr);

> > >

> > > The '%p' already prefix the address with the 0x.

> > >

> > > > +	priv->uar_base = addr; /* for primary and secondary UAR re-

> mmap */

> > > > +	uar_base = addr; /* process local, don't reserve again */

> > > > +	return 0;

> > > > +}

> > > > +

> > > <snip/>

> > >

> > > Regards,

> > >

> > > --

> > > Nélio Laranjeiro

> > > 6WIND

> 

> Thanks,

> 

> --

> Nélio Laranjeiro

> 6WIND
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index fc2d59fee..1539ef608 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -39,6 +39,7 @@ 
 #include <stdlib.h>
 #include <errno.h>
 #include <net/if.h>
+#include <sys/mman.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -56,6 +57,7 @@ 
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
 #include <rte_common.h>
+#include <rte_eal_memconfig.h>
 #include <rte_kvargs.h>
 
 #include "mlx5.h"
@@ -466,6 +468,101 @@  mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 
 static struct rte_pci_driver mlx5_driver;
 
+/*
+ * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process
+ * local resource used by both primary and secondary to avoid duplicate
+ * reservation.
+ * The space has to be available on both primary and secondary process,
+ * TXQ UAR maps to this area using fixed mmap w/o double check.
+ */
+static void *uar_base;
+
+/**
+ * Reserve UAR address space for primary process
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+static int
+mlx5_uar_init_primary(struct priv *priv)
+{
+	void *addr = (void *)0;
+	int i;
+	const struct rte_mem_config *mcfg;
+
+	if (uar_base) { /* UAR address space mapped */
+		priv->uar_base = uar_base;
+		return 0;
+	}
+	/* find out lower bound of hugepage segments */
+	mcfg = rte_eal_get_configuration()->mem_config;
+	for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {
+		if (addr)
+			addr = RTE_MIN(addr, mcfg->memseg[i].addr);
+		else
+			addr = mcfg->memseg[i].addr;
+	}
+	/* offset down UAR area */
+	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
+	/* anonymous mmap, no real memory consumption */
+	addr = mmap(addr, MLX5_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("Failed to reserve UAR address space, please adjust "
+		      "MLX5_UAR_SIZE or try --base-virtaddr");
+		return -ENOMEM;
+	}
+	/* Accept either same addr or a new addr returned from mmap if target
+	 * range occupied.
+	 */
+	INFO("Reserved UAR address space: 0x%p", addr);
+	priv->uar_base = addr; /* for primary and secondary UAR re-mmap */
+	uar_base = addr; /* process local, don't reserve again */
+	return 0;
+}
+
+/**
+ * Reserve UAR address space for secondary process, align with
+ * primary process.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+static int
+mlx5_uar_init_secondary(struct priv *priv)
+{
+	void *addr;
+
+	assert(priv->uar_base);
+	if (uar_base) { /* already reserved */
+		assert(uar_base == priv->uar_base);
+		return 0;
+	}
+	/* anonymous mmap, no real memory consumption */
+	addr = mmap(priv->uar_base, MLX5_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("UAR mmap failed: 0x%p size: %llu",
+		      priv->uar_base, MLX5_UAR_SIZE);
+		return -ENXIO;
+	}
+	if (priv->uar_base != addr) {
+		ERROR("UAR address 0x%p size %llu occupied, please adjust "
+		      "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr",
+		      priv->uar_base, MLX5_UAR_SIZE);
+		return -ENXIO;
+	}
+	uar_base = addr; /* process local, don't reserve again */
+	INFO("Reserved UAR address space: 0x%p", addr);
+	return 0;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -648,6 +745,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			eth_dev->device = &pci_dev->device;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			priv = eth_dev->data->dev_private;
+			err = mlx5_uar_init_secondary(priv);
+			if (err < 0) {
+				err = -err;
+				goto error;
+			}
 			/* Receive command fd from primary process */
 			err = priv_socket_connect(priv);
 			if (err < 0) {
@@ -805,6 +907,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			WARN("Rx CQE compression isn't supported");
 			config.cqe_comp = 0;
 		}
+		err = mlx5_uar_init_primary(priv);
+		if (err < 0) {
+			err = -err;
+			goto port_error;
+		}
 		/* Configure the first MAC address by default. */
 		if (priv_get_mac(priv, &mac.addr_bytes)) {
 			ERROR("cannot get MAC address, is mlx5_en loaded?"
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 9a4a59bd4..6a234c14f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -162,6 +162,7 @@  struct priv {
 	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
 	rte_spinlock_t lock; /* Lock for control functions. */
 	int primary_socket; /* Unix socket for primary process. */
+	void *uar_base; /* Reserved address space for UAR mapping */
 	struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */
 	struct mlx5_dev_config config; /* Device configuration. */
 };
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 64bb0712e..26bb39720 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -110,4 +110,14 @@ 
 /* Supported RSS */
 #define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP))
 
+/* Reserved address space for UAR mapping. */
+#define MLX5_UAR_SIZE (1ULL << 32)
+
+/* Offset of reserved UAR address space to hugepage memory. Offset is used here
+ * to minimize possibility of address next to hugepage being used by other code
+ * in either primary or secondary process, failing to map TX UAR would make TX
+ * packets invisible to HW.
+ */
+#define MLX5_UAR_OFFSET (1ULL << 32)
+
 #endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index a239642ac..be53dd662 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -205,7 +205,7 @@  struct mlx5_txq_data {
 	volatile void *wqes; /* Work queue (use volatile to write into). */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	volatile void *bf_reg; /* Blueflame register. */
+	volatile void *bf_reg; /* Blueflame register remapped. */
 	struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
@@ -230,6 +230,7 @@  struct mlx5_txq_ctrl {
 	struct mlx5_txq_ibv *ibv; /* Verbs queue object. */
 	struct mlx5_txq_data txq; /* Data path structure. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
+	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
 };
 
 /* mlx5_rxq.c */
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 6f5c799b5..9acfa394f 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -76,7 +76,12 @@  priv_txq_start(struct priv *priv)
 			goto error;
 		}
 	}
-	return -ret;
+	ret = priv_tx_uar_remap(priv, priv->ctx->cmd_fd);
+	if (ret) {
+		ret = -ret;
+		goto error;
+	}
+	return ret;
 error:
 	priv_txq_stop(priv);
 	return -ret;
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 26db15a4f..82f1b5ae8 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -288,7 +288,9 @@  mlx5_tx_queue_release(void *dpdk_txq)
 
 
 /**
- * Map locally UAR used in Tx queues for BlueFlame doorbell.
+ * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
+ * Both primary and secondary process do mmap to make UAR address
+ * aligned.
  *
  * @param[in] priv
  *   Pointer to private structure.
@@ -296,7 +298,7 @@  mlx5_tx_queue_release(void *dpdk_txq)
  *   Verbs file descriptor to map UAR pages.
  *
  * @return
- *   0 on success, errno value on failure.
+ *   0 on success, negative errno value on failure.
  */
 int
 priv_tx_uar_remap(struct priv *priv, int fd)
@@ -305,7 +307,9 @@  priv_tx_uar_remap(struct priv *priv, int fd)
 	uintptr_t pages[priv->txqs_n];
 	unsigned int pages_n = 0;
 	uintptr_t uar_va;
+	uintptr_t off;
 	void *addr;
+	void *ret;
 	struct mlx5_txq_data *txq;
 	struct mlx5_txq_ctrl *txq_ctrl;
 	int already_mapped;
@@ -320,8 +324,10 @@  priv_tx_uar_remap(struct priv *priv, int fd)
 	for (i = 0; i != priv->txqs_n; ++i) {
 		txq = (*priv->txqs)[i];
 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
-		uar_va = (uintptr_t)txq_ctrl->txq.bf_reg;
-		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size);
+		/* UAR addr form verbs used to find dup and offset in page */
+		uar_va = (uintptr_t)txq_ctrl->bf_reg_orig;
+		off = uar_va & (page_size - 1); /* offset in page */
+		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr */
 		already_mapped = 0;
 		for (j = 0; j != pages_n; ++j) {
 			if (pages[j] == uar_va) {
@@ -329,16 +335,29 @@  priv_tx_uar_remap(struct priv *priv, int fd)
 				break;
 			}
 		}
-		if (already_mapped)
-			continue;
-		pages[pages_n++] = uar_va;
-		addr = mmap((void *)uar_va, page_size,
-			    PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
-			    txq_ctrl->uar_mmap_offset);
-		if (addr != (void *)uar_va) {
-			ERROR("call to mmap failed on UAR for txq %d\n", i);
-			return -1;
+		/* new address in reserved UAR address space */
+		addr = RTE_PTR_ADD(priv->uar_base,
+				   uar_va & (MLX5_UAR_SIZE - 1));
+		if (!already_mapped) {
+			pages[pages_n++] = uar_va;
+			/* fixed mmap to specified address in reserved
+			 * address space
+			 */
+			ret = mmap(addr, page_size,
+				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+				   txq_ctrl->uar_mmap_offset);
+			if (ret != addr) {
+				/* fixed mmap have to return same address */
+				ERROR("call to mmap failed on UAR for txq %d\n",
+				      i);
+				return -ENXIO;
+			}
 		}
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once */
+			txq_ctrl->txq.bf_reg = RTE_PTR_ADD((void *)addr, off);
+		else
+			assert(txq_ctrl->txq.bf_reg ==
+			       RTE_PTR_ADD((void *)addr, off));
 	}
 	return 0;
 }
@@ -503,7 +522,7 @@  mlx5_priv_txq_ibv_new(struct priv *priv, uint16_t idx)
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
-	txq_data->bf_reg = qp.bf.reg;
+	txq_ctrl->bf_reg_orig = qp.bf.reg;
 	txq_data->cq_db = cq_info.dbrec;
 	txq_data->cqes =
 		(volatile struct mlx5_cqe (*)[])
@@ -836,6 +855,7 @@  mlx5_priv_txq_release(struct priv *priv, uint16_t idx)
 {
 	unsigned int i;
 	struct mlx5_txq_ctrl *txq;
+	size_t page_size = sysconf(_SC_PAGESIZE);
 
 	if (!(*priv->txqs)[idx])
 		return 0;
@@ -855,6 +875,9 @@  mlx5_priv_txq_release(struct priv *priv, uint16_t idx)
 			txq->txq.mp2mr[i] = NULL;
 		}
 	}
+	if (priv->uar_base)
+		munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg,
+		       page_size), page_size);
 	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
 		txq_free_elts(txq);
 		LIST_REMOVE(txq, next);