[dpdk-dev,v2,1/5] mbuf: fix clone support when application uses private mbuf data

Message ID 1427385595-15011-2-git-send-email-olivier.matz@6wind.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Olivier Matz March 26, 2015, 3:59 p.m. UTC
Add a new private_size field in mbuf structure that should
be initialized at mbuf pool creation. This field contains the
size of the application private data in mbufs.

Introduce new static inline functions rte_mbuf_from_indirect()
and rte_mbuf_to_baddr() to replace the existing macros, which
take the private size in account when attaching and detaching
mbufs.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 app/test-pmd/testpmd.c     |  1 +
 examples/vhost/main.c      |  2 +-
 lib/librte_mbuf/rte_mbuf.c |  1 +
 lib/librte_mbuf/rte_mbuf.h | 44 ++++++++++++++++++++++++++++++++++----------
 4 files changed, 37 insertions(+), 11 deletions(-)
  

Comments

Zoltan Kiss March 26, 2015, 5:13 p.m. UTC | #1
On 26/03/15 15:59, Olivier Matz wrote:
> Add a new private_size field in mbuf structure that should
> be initialized at mbuf pool creation. This field contains the
> size of the application private data in mbufs.
>
> Introduce new static inline functions rte_mbuf_from_indirect()
> and rte_mbuf_to_baddr() to replace the existing macros, which
> take the private size in account when attaching and detaching
> mbufs.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   app/test-pmd/testpmd.c     |  1 +
>   examples/vhost/main.c      |  2 +-
>   lib/librte_mbuf/rte_mbuf.c |  1 +
>   lib/librte_mbuf/rte_mbuf.h | 44 ++++++++++++++++++++++++++++++++++----------
>   4 files changed, 37 insertions(+), 11 deletions(-)
>
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index 3057791..c5a195a 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -425,6 +425,7 @@ testpmd_mbuf_ctor(struct rte_mempool *mp,
>   	mb->tx_offload   = 0;
>   	mb->vlan_tci     = 0;
>   	mb->hash.rss     = 0;
> +	mb->priv_size    = 0;
>   }
>
>   static void
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index c3fcb80..d542461 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -139,7 +139,7 @@
>   /* Number of descriptors per cacheline. */
>   #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
>
> -#define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
> +#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
>
>   /* mask of enabled ports */
>   static uint32_t enabled_port_mask = 0;
You would need to fix pktmbuf_detach_zcp as well, but see my reply to 
Konstantin.

> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 526b18d..e095999 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -125,6 +125,7 @@ rte_pktmbuf_init(struct rte_mempool *mp,
>   	m->pool = mp;
>   	m->nb_segs = 1;
>   	m->port = 0xff;
> +	m->priv_size = 0;
>   }
>
>   /* do some sanity checks on a mbuf: panic if it fails */
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 17ba791..45ac948 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -317,18 +317,42 @@ struct rte_mbuf {
>   			/* uint64_t unused:8; */
>   		};
>   	};
> +
> +	uint16_t priv_size;       /**< size of the application private data */
>   } __rte_cache_aligned;
>
>   /**
> - * Given the buf_addr returns the pointer to corresponding mbuf.
> + * Return the mbuf owning the data buffer address of an indirect mbuf.
> + *
> + * @param mi
> + *   The pointer to the indirect mbuf.
> + * @return
> + *   The address of the direct mbuf corresponding to buffer_addr.
>    */
> -#define RTE_MBUF_FROM_BADDR(ba)     (((struct rte_mbuf *)(ba)) - 1)
> +static inline struct rte_mbuf *
> +rte_mbuf_from_indirect(struct rte_mbuf *mi)
> +{
> +       struct rte_mbuf *md;
> +       md = (struct rte_mbuf *)((char *)mi->buf_addr - sizeof(*mi) -
> +	       mi->priv_size);
> +       return md;
> +}
>
>   /**
> - * Given the pointer to mbuf returns an address where it's  buf_addr
> - * should point to.
> + * Return the buffer address embedded in the given mbuf.
> + *
> + * @param md
> + *   The pointer to the mbuf.
> + * @return
> + *   The address of the data buffer owned by the mbuf.
>    */
> -#define RTE_MBUF_TO_BADDR(mb)       (((struct rte_mbuf *)(mb)) + 1)
> +static inline char *
> +rte_mbuf_to_baddr(struct rte_mbuf *md)
> +{
> +       char *buffer_addr;
> +       buffer_addr = (char *)md + sizeof(*md) + md->priv_size;
> +       return buffer_addr;
> +}
>
>   /**
>    * Returns TRUE if given mbuf is indirect, or FALSE otherwise.
> @@ -744,12 +768,12 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md)
>   static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>   {
>   	const struct rte_mempool *mp = m->pool;
> -	void *buf = RTE_MBUF_TO_BADDR(m);
> -	uint32_t buf_len = mp->elt_size - sizeof(*m);
> -	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof (*m);
> +	void *buf = rte_mbuf_to_baddr(m);
> +	unsigned mhdr_size = (char *)buf - (char *)m;
Minor nit: I think "sizeof(*m) + m->priv_size" would be much more clear, 
like you did above. In fact, this might worth its own inline function, 
and then you can substitute mhdr_size below.

>
> +	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + mhdr_size;
>   	m->buf_addr = buf;
> -	m->buf_len = (uint16_t)buf_len;
> +	m->buf_len = (uint16_t)(mp->elt_size - mhdr_size);
>
>   	m->data_off = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
>   			RTE_PKTMBUF_HEADROOM : m->buf_len;
> @@ -774,7 +798,7 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>   		 *  - free attached mbuf segment
>   		 */
>   		if (RTE_MBUF_INDIRECT(m)) {
> -			struct rte_mbuf *md = RTE_MBUF_FROM_BADDR(m->buf_addr);
> +			struct rte_mbuf *md = rte_mbuf_from_indirect(m);
>   			rte_pktmbuf_detach(m);
>   			if (rte_mbuf_refcnt_update(md, -1) == 0)
>   				__rte_mbuf_raw_free(md);
>
  
Ananyev, Konstantin March 27, 2015, 12:24 a.m. UTC | #2
Hi Olivier,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Olivier Matz
> Sent: Thursday, March 26, 2015 4:00 PM
> To: dev@dpdk.org
> Cc: zoltan.kiss@linaro.org
> Subject: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> 
> Add a new private_size field in mbuf structure that should
> be initialized at mbuf pool creation. This field contains the
> size of the application private data in mbufs.
> 
> Introduce new static inline functions rte_mbuf_from_indirect()
> and rte_mbuf_to_baddr() to replace the existing macros, which
> take the private size in account when attaching and detaching
> mbufs.
> 
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  app/test-pmd/testpmd.c     |  1 +
>  examples/vhost/main.c      |  2 +-
>  lib/librte_mbuf/rte_mbuf.c |  1 +
>  lib/librte_mbuf/rte_mbuf.h | 44 ++++++++++++++++++++++++++++++++++----------
>  4 files changed, 37 insertions(+), 11 deletions(-)
> 
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
> index 3057791..c5a195a 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -425,6 +425,7 @@ testpmd_mbuf_ctor(struct rte_mempool *mp,
>  	mb->tx_offload   = 0;
>  	mb->vlan_tci     = 0;
>  	mb->hash.rss     = 0;
> +	mb->priv_size    = 0;
>  }
> 
>  static void
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index c3fcb80..d542461 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -139,7 +139,7 @@
>  /* Number of descriptors per cacheline. */
>  #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
> 
> -#define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
> +#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
> 
>  /* mask of enabled ports */
>  static uint32_t enabled_port_mask = 0;
> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
> index 526b18d..e095999 100644
> --- a/lib/librte_mbuf/rte_mbuf.c
> +++ b/lib/librte_mbuf/rte_mbuf.c
> @@ -125,6 +125,7 @@ rte_pktmbuf_init(struct rte_mempool *mp,
>  	m->pool = mp;
>  	m->nb_segs = 1;
>  	m->port = 0xff;
> +	m->priv_size = 0;
>  }
> 
>  /* do some sanity checks on a mbuf: panic if it fails */
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 17ba791..45ac948 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -317,18 +317,42 @@ struct rte_mbuf {
>  			/* uint64_t unused:8; */
>  		};
>  	};
> +
> +	uint16_t priv_size;       /**< size of the application private data */
>  } __rte_cache_aligned;
> 
>  /**
> - * Given the buf_addr returns the pointer to corresponding mbuf.
> + * Return the mbuf owning the data buffer address of an indirect mbuf.
> + *
> + * @param mi
> + *   The pointer to the indirect mbuf.
> + * @return
> + *   The address of the direct mbuf corresponding to buffer_addr.
>   */
> -#define RTE_MBUF_FROM_BADDR(ba)     (((struct rte_mbuf *)(ba)) - 1)
> +static inline struct rte_mbuf *
> +rte_mbuf_from_indirect(struct rte_mbuf *mi)
> +{
> +       struct rte_mbuf *md;
> +       md = (struct rte_mbuf *)((char *)mi->buf_addr - sizeof(*mi) -
> +	       mi->priv_size);
> +       return md;
> +}
> 
>  /**
> - * Given the pointer to mbuf returns an address where it's  buf_addr
> - * should point to.
> + * Return the buffer address embedded in the given mbuf.
> + *
> + * @param md
> + *   The pointer to the mbuf.
> + * @return
> + *   The address of the data buffer owned by the mbuf.
>   */
> -#define RTE_MBUF_TO_BADDR(mb)       (((struct rte_mbuf *)(mb)) + 1)
> +static inline char *
> +rte_mbuf_to_baddr(struct rte_mbuf *md)
> +{
> +       char *buffer_addr;
> +       buffer_addr = (char *)md + sizeof(*md) + md->priv_size;
> +       return buffer_addr;
> +}
> 

I am a bit puzzled here, so for indirect mbuf, what value priv_size should hold?
Is that priv_size of indirect mfuf, or priv_size of direct mbuf, that mbuf is attached to?
If it is first one, then your changes in __rte_pktmbuf_prefree_seg() wouldn't work properly,
If second one then  changes in rte_pktmbuf_detach() looks wrong to me.
Unless, of course priv_size for all mbufs in the system should always be the same value.
But I suppose, that's not what your intention was, otherwise we don't need priv_size inside mbuf at all -
just a new macro in config file seems enough, plus it would be easier and faster.

I think that to support ability to setup priv_size on a mempool basis,
and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
we need to: 

1. Store priv_size both inside the mempool and inside the mbuf.

2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}

3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
rte_pktmbuf_detach(struct rte_mbuf *m) 
{
  ...
   m->priv_size = rte_mempool_get_privsize(m->pool);
   m->buf _addr= rte_mbuf_to_baddr(m);
   ...
}

Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
- either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
- or provide separate function that could be called straight after rte_mempool_create() , that
would setup priv_size for the  pool and for all its mbufs.
- or some sort of combination of these 2 approaches - introduce a wrapper function 
(rte_mbuf_pool_create() or something) that would take priv_size as parameter, 
create a new mempool and then setup priv_size.

Though, I still think that the better approach is to reserve private space before struct rte_mbuf, not after.
In that case, user got his private space, and we keep buf_addr straight after  rte_mbuf, without any whole.
So we don't need steps 2 and 3, above,
plus we don't need rte_mbuf_to_baddr() and rte_mbuf_from_indirect() - 
RTE_MBUF_TO_BADDR/ RTE_MBUF_FROM_BADDR would keep working correctly.
In fact, with this scheme - we don't even need priv_size for mbuf management (attach/detach/free).

Wonder did you try that approach?
Thanks
Konstantin


>  /**
>   * Returns TRUE if given mbuf is indirect, or FALSE otherwise.
> @@ -744,12 +768,12 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md)
>  static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>  {
>  	const struct rte_mempool *mp = m->pool;
> -	void *buf = RTE_MBUF_TO_BADDR(m);
> -	uint32_t buf_len = mp->elt_size - sizeof(*m);
> -	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof (*m);
> +	void *buf = rte_mbuf_to_baddr(m);
> +	unsigned mhdr_size = (char *)buf - (char *)m;
> 
> +	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + mhdr_size;
>  	m->buf_addr = buf;
> -	m->buf_len = (uint16_t)buf_len;
> +	m->buf_len = (uint16_t)(mp->elt_size - mhdr_size);
> 
>  	m->data_off = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
>  			RTE_PKTMBUF_HEADROOM : m->buf_len;
> @@ -774,7 +798,7 @@ __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>  		 *  - free attached mbuf segment
>  		 */
>  		if (RTE_MBUF_INDIRECT(m)) {
> -			struct rte_mbuf *md = RTE_MBUF_FROM_BADDR(m->buf_addr);
> +			struct rte_mbuf *md = rte_mbuf_from_indirect(m);
>  			rte_pktmbuf_detach(m);
>  			if (rte_mbuf_refcnt_update(md, -1) == 0)
>  				__rte_mbuf_raw_free(md);
> --
> 2.1.4
  
Olivier Matz March 27, 2015, 9:07 a.m. UTC | #3
Hi Konstantin,

Thank you for your comments.

On 03/27/2015 01:24 AM, Ananyev, Konstantin wrote:
> Hi Olivier,
> 
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Olivier Matz
>> Sent: Thursday, March 26, 2015 4:00 PM
>> To: dev@dpdk.org
>> Cc: zoltan.kiss@linaro.org
>> Subject: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
>>
>> Add a new private_size field in mbuf structure that should
>> be initialized at mbuf pool creation. This field contains the
>> size of the application private data in mbufs.
>>
>> Introduce new static inline functions rte_mbuf_from_indirect()
>> and rte_mbuf_to_baddr() to replace the existing macros, which
>> take the private size in account when attaching and detaching
>> mbufs.
>>
>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>> ---
>>  app/test-pmd/testpmd.c     |  1 +
>>  examples/vhost/main.c      |  2 +-
>>  lib/librte_mbuf/rte_mbuf.c |  1 +
>>  lib/librte_mbuf/rte_mbuf.h | 44 ++++++++++++++++++++++++++++++++++----------
>>  4 files changed, 37 insertions(+), 11 deletions(-)
>>
>> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
>> index 3057791..c5a195a 100644
>> --- a/app/test-pmd/testpmd.c
>> +++ b/app/test-pmd/testpmd.c
>> @@ -425,6 +425,7 @@ testpmd_mbuf_ctor(struct rte_mempool *mp,
>>  	mb->tx_offload   = 0;
>>  	mb->vlan_tci     = 0;
>>  	mb->hash.rss     = 0;
>> +	mb->priv_size    = 0;
>>  }
>>
>>  static void
>> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
>> index c3fcb80..d542461 100644
>> --- a/examples/vhost/main.c
>> +++ b/examples/vhost/main.c
>> @@ -139,7 +139,7 @@
>>  /* Number of descriptors per cacheline. */
>>  #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
>>
>> -#define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
>> +#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
>>
>>  /* mask of enabled ports */
>>  static uint32_t enabled_port_mask = 0;
>> diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
>> index 526b18d..e095999 100644
>> --- a/lib/librte_mbuf/rte_mbuf.c
>> +++ b/lib/librte_mbuf/rte_mbuf.c
>> @@ -125,6 +125,7 @@ rte_pktmbuf_init(struct rte_mempool *mp,
>>  	m->pool = mp;
>>  	m->nb_segs = 1;
>>  	m->port = 0xff;
>> +	m->priv_size = 0;
>>  }
>>
>>  /* do some sanity checks on a mbuf: panic if it fails */
>> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
>> index 17ba791..45ac948 100644
>> --- a/lib/librte_mbuf/rte_mbuf.h
>> +++ b/lib/librte_mbuf/rte_mbuf.h
>> @@ -317,18 +317,42 @@ struct rte_mbuf {
>>  			/* uint64_t unused:8; */
>>  		};
>>  	};
>> +
>> +	uint16_t priv_size;       /**< size of the application private data */
>>  } __rte_cache_aligned;
>>
>>  /**
>> - * Given the buf_addr returns the pointer to corresponding mbuf.
>> + * Return the mbuf owning the data buffer address of an indirect mbuf.
>> + *
>> + * @param mi
>> + *   The pointer to the indirect mbuf.
>> + * @return
>> + *   The address of the direct mbuf corresponding to buffer_addr.
>>   */
>> -#define RTE_MBUF_FROM_BADDR(ba)     (((struct rte_mbuf *)(ba)) - 1)
>> +static inline struct rte_mbuf *
>> +rte_mbuf_from_indirect(struct rte_mbuf *mi)
>> +{
>> +       struct rte_mbuf *md;
>> +       md = (struct rte_mbuf *)((char *)mi->buf_addr - sizeof(*mi) -
>> +	       mi->priv_size);
>> +       return md;
>> +}
>>
>>  /**
>> - * Given the pointer to mbuf returns an address where it's  buf_addr
>> - * should point to.
>> + * Return the buffer address embedded in the given mbuf.
>> + *
>> + * @param md
>> + *   The pointer to the mbuf.
>> + * @return
>> + *   The address of the data buffer owned by the mbuf.
>>   */
>> -#define RTE_MBUF_TO_BADDR(mb)       (((struct rte_mbuf *)(mb)) + 1)
>> +static inline char *
>> +rte_mbuf_to_baddr(struct rte_mbuf *md)
>> +{
>> +       char *buffer_addr;
>> +       buffer_addr = (char *)md + sizeof(*md) + md->priv_size;
>> +       return buffer_addr;
>> +}
>>
> 
> I am a bit puzzled here, so for indirect mbuf, what value priv_size should hold?
> Is that priv_size of indirect mfuf, or priv_size of direct mbuf, that mbuf is attached to?
> If it is first one, then your changes in __rte_pktmbuf_prefree_seg() wouldn't work properly,
> If second one then  changes in rte_pktmbuf_detach() looks wrong to me.
> Unless, of course priv_size for all mbufs in the system should always be the same value.
> But I suppose, that's not what your intention was, otherwise we don't need priv_size inside mbuf at all -
> just a new macro in config file seems enough, plus it would be easier and faster.

Yes, my assumption was that the priv_size isi the same on all mbufs
of a pool, and probably in most cases all mbufs of the system.
To me, a config macro is not th best solution because it prevents the
application to choose the proper size at run-time, especially if the
dpdk is distributed in binary format.

That's why I decided to have priv_size in mbufs, although having it in
the mempool private area is possible but it adds an additional
indirection (see the cover letter).

> I think that to support ability to setup priv_size on a mempool basis,
> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
> we need to: 
> 
> 1. Store priv_size both inside the mempool and inside the mbuf.
> 
> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
> 
> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
> rte_pktmbuf_detach(struct rte_mbuf *m) 
> {
>   ...
>    m->priv_size = rte_mempool_get_privsize(m->pool);
>    m->buf _addr= rte_mbuf_to_baddr(m);
>    ...
> }
> 
> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
> - or provide separate function that could be called straight after rte_mempool_create() , that
> would setup priv_size for the  pool and for all its mbufs.
> - or some sort of combination of these 2 approaches - introduce a wrapper function 
> (rte_mbuf_pool_create() or something) that would take priv_size as parameter, 
> create a new mempool and then setup priv_size.

Your solution looks good to me, and even if I'm not absolutely
convinced that there is a use case where a packet is attached to
another one with a different priv_size, it's stronger from an
API perspective to support this. Maybe it could happen in a
virtualization or secondary process context where there are 2
different mbuf pools.

The mbuf_priv_size could go in struct rte_pktmbuf_pool_private, but
it can already be deducted from what we have today without changing
anything:

priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
	pool_private->mbuf_data_room_size -
	sizeof(rte_mbuf)

Introducing rte_mbuf_pool_create() seems a good idea to me, it
would hide 'rte_pktmbuf_pool_private' from the user and force
to initialize all the required fields (mbuf_data_room_size only
today, and maybe mbuf_priv_size).

The API would be:

struct rte_mempool *
rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
	unsigned cache_size, size_t mbuf_priv_size,
	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
	int socket_id, unsigned flags)

I can give it a try and send a patch for this.

> Though, I still think that the better approach is to reserve private space before struct rte_mbuf, not after.
> In that case, user got his private space, and we keep buf_addr straight after  rte_mbuf, without any whole.
> So we don't need steps 2 and 3, above,
> plus we don't need rte_mbuf_to_baddr() and rte_mbuf_from_indirect() - 
> RTE_MBUF_TO_BADDR/ RTE_MBUF_FROM_BADDR would keep working correctly.
> In fact, with this scheme - we don't even need priv_size for mbuf management (attach/detach/free).
> 
> Wonder did you try that approach?

Yes, I though about this approach too. But I think it would require
more changes. When an application or a driver allocate a mbuf with
mempool_get(), it expects that the returned pointer is the rte_mbuf *.

With this solution, there are 2 options:
- no mempool modification, so each application/driver has to add
  priv_size bytes to the object to get the mbuf pointer. This does not
  look feasible.
- change the mempool to support private area before each object. I
  think mempool API is already quite complex, and I think it's not
  the role of the mempool library to provide such features.

In any case, I think it would require more modifications (in terms of
lines of code, but also in terms of "allocation logic"). At the end,
the patch I suggested does not break any paradygm and just add ~10 lines
of code.

Regards,
Olivier
  
Olivier Matz March 27, 2015, 1:56 p.m. UTC | #4
Hi Konstantin,

On 03/27/2015 10:07 AM, Olivier MATZ wrote:
>> I think that to support ability to setup priv_size on a mempool basis,
>> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
>> we need to: 
>>
>> 1. Store priv_size both inside the mempool and inside the mbuf.
>>
>> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
>> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
>>
>> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
>> rte_pktmbuf_detach(struct rte_mbuf *m) 
>> {
>>   ...
>>    m->priv_size = rte_mempool_get_privsize(m->pool);
>>    m->buf _addr= rte_mbuf_to_baddr(m);
>>    ...
>> }
>>
>> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
>> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
>> - or provide separate function that could be called straight after rte_mempool_create() , that
>> would setup priv_size for the  pool and for all its mbufs.
>> - or some sort of combination of these 2 approaches - introduce a wrapper function 
>> (rte_mbuf_pool_create() or something) that would take priv_size as parameter, 
>> create a new mempool and then setup priv_size.

I though a bit more about this solution, and I realized that doing
mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
a good idea, as there is no garantee that the size of mi is large enough
to store the priv of md.

Having the same priv_size for mi and md is maybe a good constraint.
I can add this in the API comments and assertions in the code to
check this condition, what do you think?


> Introducing rte_mbuf_pool_create() seems a good idea to me, it
> would hide 'rte_pktmbuf_pool_private' from the user and force
> to initialize all the required fields (mbuf_data_room_size only
> today, and maybe mbuf_priv_size).
> 
> The API would be:
> 
> struct rte_mempool *
> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
> 	unsigned cache_size, size_t mbuf_priv_size,
> 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
> 	int socket_id, unsigned flags)
> 
> I can give it a try and send a patch for this.

About this, it is not required anymore for this patch series if we
agree with my comment above.

I'll send a separate patch for that. It's probably a good occasion
to get rid of the pointer casted into an integer for
mbuf_data_room_size.

Regards,
Olivier
  
Ananyev, Konstantin March 27, 2015, 2:25 p.m. UTC | #5
Hi Olivier,

> -----Original Message-----
> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> Sent: Friday, March 27, 2015 1:56 PM
> To: Ananyev, Konstantin; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> 
> Hi Konstantin,
> 
> On 03/27/2015 10:07 AM, Olivier MATZ wrote:
> >> I think that to support ability to setup priv_size on a mempool basis,
> >> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
> >> we need to:
> >>
> >> 1. Store priv_size both inside the mempool and inside the mbuf.
> >>
> >> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
> >> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
> >>
> >> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
> >> rte_pktmbuf_detach(struct rte_mbuf *m)
> >> {
> >>   ...
> >>    m->priv_size = rte_mempool_get_privsize(m->pool);
> >>    m->buf _addr= rte_mbuf_to_baddr(m);
> >>    ...
> >> }
> >>
> >> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
> >> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
> >> - or provide separate function that could be called straight after rte_mempool_create() , that
> >> would setup priv_size for the  pool and for all its mbufs.
> >> - or some sort of combination of these 2 approaches - introduce a wrapper function
> >> (rte_mbuf_pool_create() or something) that would take priv_size as parameter,
> >> create a new mempool and then setup priv_size.
> 
> I though a bit more about this solution, and I realized that doing
> mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
> a good idea, as there is no garantee that the size of mi is large enough
> to store the priv of md.
> 
> Having the same priv_size for mi and md is maybe a good constraint.
> I can add this in the API comments and assertions in the code to
> check this condition, what do you think?

Probably we have a different concepts of what is mbuf's  private space in mind.
From my point, even indirect buffer should use it's own private space and
leave contents of direct mbuf it attached to unmodified.  
After attach() operation, only contents of the buffer are shared between mbufs,
but not the mbuf's metadata. 
Otherwise on detach(), you'll have to copy contents of private space back, from direct to indirect mbuf? 
Again how to deal with the case, when 2 or more mbufs will attach to the same direct one?

So let say, if we'll have a macro:

#define RTE_MBUF_PRIV_PTR(mb)	((void *)((struct rte_mbuf *)(mb)) + 1))

No matter is mb  a direct or indirect mbuf.
Do you have something else in mind here?

> 
> 
> > Introducing rte_mbuf_pool_create() seems a good idea to me, it
> > would hide 'rte_pktmbuf_pool_private' from the user and force
> > to initialize all the required fields (mbuf_data_room_size only
> > today, and maybe mbuf_priv_size).
> >
> > The API would be:
> >
> > struct rte_mempool *
> > rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
> > 	unsigned cache_size, size_t mbuf_priv_size,
> > 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
> > 	int socket_id, unsigned flags)
> >
> > I can give it a try and send a patch for this.
> 
> About this, it is not required anymore for this patch series if we
> agree with my comment above.

I still think we need some way to setup priv_size on a per-mempool basis.
Doing that in rte_mbuf_pool_create() seems like a good thing to me.
Not sure, why you decided to drop it?

Konstantin

> 
> I'll send a separate patch for that. It's probably a good occasion
> to get rid of the pointer casted into an integer for
> mbuf_data_room_size.
> 
> Regards,
> Olivier
  
Olivier Matz March 27, 2015, 3:17 p.m. UTC | #6
Hi Konstantin,

On 03/27/2015 03:25 PM, Ananyev, Konstantin wrote:
> Hi Olivier,
> 
>> -----Original Message-----
>> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
>> Sent: Friday, March 27, 2015 1:56 PM
>> To: Ananyev, Konstantin; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
>>
>> Hi Konstantin,
>>
>> On 03/27/2015 10:07 AM, Olivier MATZ wrote:
>>>> I think that to support ability to setup priv_size on a mempool basis,
>>>> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
>>>> we need to:
>>>>
>>>> 1. Store priv_size both inside the mempool and inside the mbuf.
>>>>
>>>> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
>>>> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
>>>>
>>>> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
>>>> rte_pktmbuf_detach(struct rte_mbuf *m)
>>>> {
>>>>   ...
>>>>    m->priv_size = rte_mempool_get_privsize(m->pool);
>>>>    m->buf _addr= rte_mbuf_to_baddr(m);
>>>>    ...
>>>> }
>>>>
>>>> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
>>>> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
>>>> - or provide separate function that could be called straight after rte_mempool_create() , that
>>>> would setup priv_size for the  pool and for all its mbufs.
>>>> - or some sort of combination of these 2 approaches - introduce a wrapper function
>>>> (rte_mbuf_pool_create() or something) that would take priv_size as parameter,
>>>> create a new mempool and then setup priv_size.
>>
>> I though a bit more about this solution, and I realized that doing
>> mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
>> a good idea, as there is no garantee that the size of mi is large enough
>> to store the priv of md.
>>
>> Having the same priv_size for mi and md is maybe a good constraint.
>> I can add this in the API comments and assertions in the code to
>> check this condition, what do you think?
> 
> Probably we have a different concepts of what is mbuf's  private space in mind.
> From my point, even indirect buffer should use it's own private space and
> leave contents of direct mbuf it attached to unmodified.  
> After attach() operation, only contents of the buffer are shared between mbufs,
> but not the mbuf's metadata. 

Sorry if it was not clear in my previous messages, but I agree
with your description. When attaching a mbuf, only data, not
metadata should be shared.

In the solution you are suggesting (quoted above), you say we need
to set mi->priv_size to md->priv_size in rte_pktmbuf_attach(). I felt
this was not possible, but it depends on the meaning we give to
priv_size:

1. If the meaning is "the size of the private data embedded in this
   mbuf", which is the most logical meaning, we cannot do this
   affectation

2. If the meaning is "the size of the private data embedded in the
   mbuf the buf_addr is pointing to" (which is harder to get), the
   affectation makes sense.

From what I understand, you feel we should use 2/ as priv_size
definition. Is it correct?

In my previous message, the definition of m->priv_size was 1/,
so that's why I felt assigning mi->priv_size to md->priv_size was
not possible.

I agree 2/ is probably a good choice, as it would allow to attach
to a mbuf with a different priv_size. It may require some additional
comments above the field in the structure to explain that.


> Otherwise on detach(), you'll have to copy contents of private space back, from direct to indirect mbuf? 
> Again how to deal with the case, when 2 or more mbufs will attach to the same direct one?
> 
> So let say, if we'll have a macro:
> 
> #define RTE_MBUF_PRIV_PTR(mb)	((void *)((struct rte_mbuf *)(mb)) + 1))
> 
> No matter is mb  a direct or indirect mbuf.
> Do you have something else in mind here?

I completely agree with this macro. We should consider the private data
as an extension of the mbuf structure.


>>> Introducing rte_mbuf_pool_create() seems a good idea to me, it
>>> would hide 'rte_pktmbuf_pool_private' from the user and force
>>> to initialize all the required fields (mbuf_data_room_size only
>>> today, and maybe mbuf_priv_size).
>>>
>>> The API would be:
>>>
>>> struct rte_mempool *
>>> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
>>> 	unsigned cache_size, size_t mbuf_priv_size,
>>> 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
>>> 	int socket_id, unsigned flags)
>>>
>>> I can give it a try and send a patch for this.
>>
>> About this, it is not required anymore for this patch series if we
>> agree with my comment above.
> 
> I still think we need some way to setup priv_size on a per-mempool basis.
> Doing that in rte_mbuf_pool_create() seems like a good thing to me.
> Not sure, why you decided to drop it?

I think we can already do it without changing the API by providing
our own rte_pktmbuf_init and rte_pktmbuf_pool_init.

rte_pktmbuf_init() has to set:
  m->buf_len = mp->elt_size - sizeof(struct mbuf);
  m->priv_size = sizeof(struct mbuf) - sizeof(struct rte_mbuf);

rte_pktmbuf_pool_init() has to set:
  /* we can use the default function */
  mbp_priv->mbuf_data_room_size = MBUF_RXDATA_SIZE +
  	RTE_PKTMBUF_HEADROOM;

In this case, it is possible to calculate the mbuf_priv_size only
from the pool object:

  mbuf_priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
	pool_private->mbuf_data_room_size -
	sizeof(rte_mbuf)


I agree it's not ideal, but I think the mbuf pool initialization
is another problem. That's why I suggested to change this in a
separate series that will add rte_mbuf_pool_create() with the
API described above. Thoughts?


Thanks,
Olivier


> 
> Konstantin
> 
>>
>> I'll send a separate patch for that. It's probably a good occasion
>> to get rid of the pointer casted into an integer for
>> mbuf_data_room_size.
>>
>> Regards,
>> Olivier
  
Zoltan Kiss March 27, 2015, 6:11 p.m. UTC | #7
On 27/03/15 15:17, Olivier MATZ wrote:
> Hi Konstantin,
>
> On 03/27/2015 03:25 PM, Ananyev, Konstantin wrote:
>> Hi Olivier,
>>
>>> -----Original Message-----
>>> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
>>> Sent: Friday, March 27, 2015 1:56 PM
>>> To: Ananyev, Konstantin; dev@dpdk.org
>>> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
>>>
>>> Hi Konstantin,
>>>
>>> On 03/27/2015 10:07 AM, Olivier MATZ wrote:
>>>>> I think that to support ability to setup priv_size on a mempool basis,
>>>>> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
>>>>> we need to:
>>>>>
>>>>> 1. Store priv_size both inside the mempool and inside the mbuf.
>>>>>
>>>>> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
>>>>> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
>>>>>
>>>>> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
>>>>> rte_pktmbuf_detach(struct rte_mbuf *m)
>>>>> {
>>>>>    ...
>>>>>     m->priv_size = rte_mempool_get_privsize(m->pool);
>>>>>     m->buf _addr= rte_mbuf_to_baddr(m);
>>>>>     ...
>>>>> }
>>>>>
>>>>> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
>>>>> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
>>>>> - or provide separate function that could be called straight after rte_mempool_create() , that
>>>>> would setup priv_size for the  pool and for all its mbufs.
>>>>> - or some sort of combination of these 2 approaches - introduce a wrapper function
>>>>> (rte_mbuf_pool_create() or something) that would take priv_size as parameter,
>>>>> create a new mempool and then setup priv_size.
>>>
>>> I though a bit more about this solution, and I realized that doing
>>> mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
>>> a good idea, as there is no garantee that the size of mi is large enough
>>> to store the priv of md.
>>>
>>> Having the same priv_size for mi and md is maybe a good constraint.
>>> I can add this in the API comments and assertions in the code to
>>> check this condition, what do you think?
>>
>> Probably we have a different concepts of what is mbuf's  private space in mind.
>>  From my point, even indirect buffer should use it's own private space and
>> leave contents of direct mbuf it attached to unmodified.
>> After attach() operation, only contents of the buffer are shared between mbufs,
>> but not the mbuf's metadata.
>
> Sorry if it was not clear in my previous messages, but I agree
> with your description. When attaching a mbuf, only data, not
> metadata should be shared.
>
> In the solution you are suggesting (quoted above), you say we need
> to set mi->priv_size to md->priv_size in rte_pktmbuf_attach(). I felt
> this was not possible, but it depends on the meaning we give to
> priv_size:
>
> 1. If the meaning is "the size of the private data embedded in this
>     mbuf", which is the most logical meaning, we cannot do this
>     affectation
>
> 2. If the meaning is "the size of the private data embedded in the
>     mbuf the buf_addr is pointing to" (which is harder to get), the
>     affectation makes sense.
>
>  From what I understand, you feel we should use 2/ as priv_size
> definition. Is it correct?
>
> In my previous message, the definition of m->priv_size was 1/,
> so that's why I felt assigning mi->priv_size to md->priv_size was
> not possible.
>
> I agree 2/ is probably a good choice, as it would allow to attach
> to a mbuf with a different priv_size. It may require some additional
> comments above the field in the structure to explain that.
I think we need to document it in the comments very well that you can 
attach mbuf's to each other with different private area sizes, and 
applications should care about this difference. And we should give a 
macro to get the private area size, which will get rte_mbuf.mp->priv_size.
Actually we should give some better name to rte_mbuf.priv_size, it's a 
bit misleading now. Maybe direct_priv_size?
>
>
>> Otherwise on detach(), you'll have to copy contents of private space back, from direct to indirect mbuf?
>> Again how to deal with the case, when 2 or more mbufs will attach to the same direct one?
>>
>> So let say, if we'll have a macro:
>>
>> #define RTE_MBUF_PRIV_PTR(mb)	((void *)((struct rte_mbuf *)(mb)) + 1))
>>
>> No matter is mb  a direct or indirect mbuf.
>> Do you have something else in mind here?
>
> I completely agree with this macro. We should consider the private data
> as an extension of the mbuf structure.
>
>
>>>> Introducing rte_mbuf_pool_create() seems a good idea to me, it
>>>> would hide 'rte_pktmbuf_pool_private' from the user and force
>>>> to initialize all the required fields (mbuf_data_room_size only
>>>> today, and maybe mbuf_priv_size).
>>>>
>>>> The API would be:
>>>>
>>>> struct rte_mempool *
>>>> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
>>>> 	unsigned cache_size, size_t mbuf_priv_size,
>>>> 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
>>>> 	int socket_id, unsigned flags)
>>>>
>>>> I can give it a try and send a patch for this.
>>>
>>> About this, it is not required anymore for this patch series if we
>>> agree with my comment above.
>>
>> I still think we need some way to setup priv_size on a per-mempool basis.
>> Doing that in rte_mbuf_pool_create() seems like a good thing to me.
>> Not sure, why you decided to drop it?
>
> I think we can already do it without changing the API by providing
> our own rte_pktmbuf_init and rte_pktmbuf_pool_init.
>
> rte_pktmbuf_init() has to set:
>    m->buf_len = mp->elt_size - sizeof(struct mbuf);
>    m->priv_size = sizeof(struct mbuf) - sizeof(struct rte_mbuf);
What's struct mbuf? If we take my assumption above, direct_priv_size 
could go uninitalized, and we can set it when attaching.
>
> rte_pktmbuf_pool_init() has to set:
>    /* we can use the default function */
>    mbp_priv->mbuf_data_room_size = MBUF_RXDATA_SIZE +
>    	RTE_PKTMBUF_HEADROOM;
>
> In this case, it is possible to calculate the mbuf_priv_size only
> from the pool object:
>
>    mbuf_priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
> 	pool_private->mbuf_data_room_size -
> 	sizeof(rte_mbuf)
My understanding is that the pool private date is something completely 
different than the private data of the mbufs. I think 
rte_mempool.priv_size should be initialized in *mp_init.

>
>
> I agree it's not ideal, but I think the mbuf pool initialization
> is another problem. That's why I suggested to change this in a
> separate series that will add rte_mbuf_pool_create() with the
> API described above. Thoughts?


>
>
> Thanks,
> Olivier
>
>
>>
>> Konstantin
>>
>>>
>>> I'll send a separate patch for that. It's probably a good occasion
>>> to get rid of the pointer casted into an integer for
>>> mbuf_data_room_size.
>>>
>>> Regards,
>>> Olivier
  
Olivier Matz March 28, 2015, 9:19 p.m. UTC | #8
Hi Zoltan,

On 03/27/2015 07:11 PM, Zoltan Kiss wrote:
>> Sorry if it was not clear in my previous messages, but I agree
>> with your description. When attaching a mbuf, only data, not
>> metadata should be shared.
>>
>> In the solution you are suggesting (quoted above), you say we need
>> to set mi->priv_size to md->priv_size in rte_pktmbuf_attach(). I felt
>> this was not possible, but it depends on the meaning we give to
>> priv_size:
>>
>> 1. If the meaning is "the size of the private data embedded in this
>>     mbuf", which is the most logical meaning, we cannot do this
>>     affectation
>>
>> 2. If the meaning is "the size of the private data embedded in the
>>     mbuf the buf_addr is pointing to" (which is harder to get), the
>>     affectation makes sense.
>>
>>  From what I understand, you feel we should use 2/ as priv_size
>> definition. Is it correct?
>>
>> In my previous message, the definition of m->priv_size was 1/,
>> so that's why I felt assigning mi->priv_size to md->priv_size was
>> not possible.
>>
>> I agree 2/ is probably a good choice, as it would allow to attach
>> to a mbuf with a different priv_size. It may require some additional
>> comments above the field in the structure to explain that.
> I think we need to document it in the comments very well that you can
> attach mbuf's to each other with different private area sizes, and
> applications should care about this difference. And we should give a
> macro to get the private area size, which will get rte_mbuf.mp->priv_size.
> Actually we should give some better name to rte_mbuf.priv_size, it's a
> bit misleading now. Maybe direct_priv_size?

I agree it should be well documented in the API comments, but I'm
not sure it's worth changing the name of the field. After all,
m->buf_addr and m->buf_len are also related to the buffer of the
direct mbuf without beeing named "direct_*".


>>>>> Introducing rte_mbuf_pool_create() seems a good idea to me, it
>>>>> would hide 'rte_pktmbuf_pool_private' from the user and force
>>>>> to initialize all the required fields (mbuf_data_room_size only
>>>>> today, and maybe mbuf_priv_size).
>>>>>
>>>>> The API would be:
>>>>>
>>>>> struct rte_mempool *
>>>>> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
>>>>>     unsigned cache_size, size_t mbuf_priv_size,
>>>>>     rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
>>>>>     int socket_id, unsigned flags)
>>>>>
>>>>> I can give it a try and send a patch for this.
>>>>
>>>> About this, it is not required anymore for this patch series if we
>>>> agree with my comment above.
>>>
>>> I still think we need some way to setup priv_size on a per-mempool
>>> basis.
>>> Doing that in rte_mbuf_pool_create() seems like a good thing to me.
>>> Not sure, why you decided to drop it?
>>
>> I think we can already do it without changing the API by providing
>> our own rte_pktmbuf_init and rte_pktmbuf_pool_init.
>>
>> rte_pktmbuf_init() has to set:
>>    m->buf_len = mp->elt_size - sizeof(struct mbuf);
>>    m->priv_size = sizeof(struct mbuf) - sizeof(struct rte_mbuf);
> What's struct mbuf? If we take my assumption above, direct_priv_size
> could go uninitalized, and we can set it when attaching.

In this example, "struct mbuf" is the mbuf used by the application:

struct mbuf {
	struct rte_mbuf rte_mb;
	struct app_private priv;
};

Therefore, we have:

sizeof(struct mbuf) = sizeof(struct rte_mbuf) +
	sizeof(struct app_private);


About keeping the m->priv_size field not initialized, I'm not really
convinced because we would always have to use the pool->mbuf_priv_size
when we want to get the address of data buffer embedded in a mbuf. This
implies several indirections, and we have only one if the info is
already in the mbuf.


>> rte_pktmbuf_pool_init() has to set:
>>    /* we can use the default function */
>>    mbp_priv->mbuf_data_room_size = MBUF_RXDATA_SIZE +
>>        RTE_PKTMBUF_HEADROOM;
>>
>> In this case, it is possible to calculate the mbuf_priv_size only
>> from the pool object:
>>
>>    mbuf_priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
>>     pool_private->mbuf_data_room_size -
>>     sizeof(rte_mbuf)
> My understanding is that the pool private date is something completely
> different than the private data of the mbufs. I think
> rte_mempool.priv_size should be initialized in *mp_init.

As I said in my previous mail, I think we don't need to have
pool_private->mbuf_priv_size for this series, as it can be
calculated from what we already have in pool_private. I'll send
a v3 patch that implement this solution, and it can be a base for
discussions.

However, I think the way mbuf pools are initialized may need some
rework, maybe using rte_mbuf_pool_create() as Konstantin suggested.
I'll try to do some reworks in this area in another series.

Regards,
Olivier



> 
>>
>>
>> I agree it's not ideal, but I think the mbuf pool initialization
>> is another problem. That's why I suggested to change this in a
>> separate series that will add rte_mbuf_pool_create() with the
>> API described above. Thoughts?
> 
> 
>>
>>
>> Thanks,
>> Olivier
>>
>>
>>>
>>> Konstantin
>>>
>>>>
>>>> I'll send a separate patch for that. It's probably a good occasion
>>>> to get rid of the pointer casted into an integer for
>>>> mbuf_data_room_size.
>>>>
>>>> Regards,
>>>> Olivier
  
Ananyev, Konstantin March 30, 2015, 12:34 p.m. UTC | #9
Hi Olivier,

> -----Original Message-----
> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> Sent: Friday, March 27, 2015 3:17 PM
> To: Ananyev, Konstantin; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> 
> Hi Konstantin,
> 
> On 03/27/2015 03:25 PM, Ananyev, Konstantin wrote:
> > Hi Olivier,
> >
> >> -----Original Message-----
> >> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> >> Sent: Friday, March 27, 2015 1:56 PM
> >> To: Ananyev, Konstantin; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> >>
> >> Hi Konstantin,
> >>
> >> On 03/27/2015 10:07 AM, Olivier MATZ wrote:
> >>>> I think that to support ability to setup priv_size on a mempool basis,
> >>>> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
> >>>> we need to:
> >>>>
> >>>> 1. Store priv_size both inside the mempool and inside the mbuf.
> >>>>
> >>>> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
> >>>> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
> >>>>
> >>>> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
> >>>> rte_pktmbuf_detach(struct rte_mbuf *m)
> >>>> {
> >>>>   ...
> >>>>    m->priv_size = rte_mempool_get_privsize(m->pool);
> >>>>    m->buf _addr= rte_mbuf_to_baddr(m);
> >>>>    ...
> >>>> }
> >>>>
> >>>> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
> >>>> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
> >>>> - or provide separate function that could be called straight after rte_mempool_create() , that
> >>>> would setup priv_size for the  pool and for all its mbufs.
> >>>> - or some sort of combination of these 2 approaches - introduce a wrapper function
> >>>> (rte_mbuf_pool_create() or something) that would take priv_size as parameter,
> >>>> create a new mempool and then setup priv_size.
> >>
> >> I though a bit more about this solution, and I realized that doing
> >> mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
> >> a good idea, as there is no garantee that the size of mi is large enough
> >> to store the priv of md.
> >>
> >> Having the same priv_size for mi and md is maybe a good constraint.
> >> I can add this in the API comments and assertions in the code to
> >> check this condition, what do you think?
> >
> > Probably we have a different concepts of what is mbuf's  private space in mind.
> > From my point, even indirect buffer should use it's own private space and
> > leave contents of direct mbuf it attached to unmodified.
> > After attach() operation, only contents of the buffer are shared between mbufs,
> > but not the mbuf's metadata.
> 
> Sorry if it was not clear in my previous messages, but I agree
> with your description. When attaching a mbuf, only data, not
> metadata should be shared.
> 
> In the solution you are suggesting (quoted above), you say we need
> to set mi->priv_size to md->priv_size in rte_pktmbuf_attach(). I felt
> this was not possible, but it depends on the meaning we give to
> priv_size:
> 
> 1. If the meaning is "the size of the private data embedded in this
>    mbuf", which is the most logical meaning, we cannot do this
>    affectation
> 
> 2. If the meaning is "the size of the private data embedded in the
>    mbuf the buf_addr is pointing to" (which is harder to get), the
>    affectation makes sense.
> 
> From what I understand, you feel we should use 2/ as priv_size
> definition. Is it correct?

Yes, I meant 2.
From my point priv_size inside mbuf is more like 'buf_ofs'.
It's main purpose is for internal use - to help our mbuf manipulation routinies
(attach/detach/free) to work correctly.
If the user wants to query size of private space for the mbuf, he probably should
use the value from mempool.

> 
> In my previous message, the definition of m->priv_size was 1/,
> so that's why I felt assigning mi->priv_size to md->priv_size was
> not possible.
> 
> I agree 2/ is probably a good choice, as it would allow to attach
> to a mbuf with a different priv_size. It may require some additional
> comments above the field in the structure to explain that.
> 
> 
> > Otherwise on detach(), you'll have to copy contents of private space back, from direct to indirect mbuf?
> > Again how to deal with the case, when 2 or more mbufs will attach to the same direct one?
> >
> > So let say, if we'll have a macro:
> >
> > #define RTE_MBUF_PRIV_PTR(mb)	((void *)((struct rte_mbuf *)(mb)) + 1))
> >
> > No matter is mb  a direct or indirect mbuf.
> > Do you have something else in mind here?
> 
> I completely agree with this macro. We should consider the private data
> as an extension of the mbuf structure.
> 
> 
> >>> Introducing rte_mbuf_pool_create() seems a good idea to me, it
> >>> would hide 'rte_pktmbuf_pool_private' from the user and force
> >>> to initialize all the required fields (mbuf_data_room_size only
> >>> today, and maybe mbuf_priv_size).
> >>>
> >>> The API would be:
> >>>
> >>> struct rte_mempool *
> >>> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
> >>> 	unsigned cache_size, size_t mbuf_priv_size,
> >>> 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
> >>> 	int socket_id, unsigned flags)
> >>>
> >>> I can give it a try and send a patch for this.
> >>
> >> About this, it is not required anymore for this patch series if we
> >> agree with my comment above.
> >
> > I still think we need some way to setup priv_size on a per-mempool basis.
> > Doing that in rte_mbuf_pool_create() seems like a good thing to me.
> > Not sure, why you decided to drop it?
> 
> I think we can already do it without changing the API by providing
> our own rte_pktmbuf_init and rte_pktmbuf_pool_init.
> 
> rte_pktmbuf_init() has to set:
>   m->buf_len = mp->elt_size - sizeof(struct mbuf);
>   m->priv_size = sizeof(struct mbuf) - sizeof(struct rte_mbuf);
> 
> rte_pktmbuf_pool_init() has to set:
>   /* we can use the default function */
>   mbp_priv->mbuf_data_room_size = MBUF_RXDATA_SIZE +
>   	RTE_PKTMBUF_HEADROOM;

Yeh, when  arg==NULL for rte_pktmbuf_pool_init() we always set up
mbuf_data_room_size to the hardcoded value.
Which always looked strange to me.
I think it should be set to:
mp->elt_size - sizeof(struct rte_mbuf) - RTE_PKTMBUF_HEADROOM;
for that case.

> 
> In this case, it is possible to calculate the mbuf_priv_size only
> from the pool object:
> 
>   mbuf_priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
> 	pool_private->mbuf_data_room_size -
> 	sizeof(rte_mbuf)
> 

So if I understand your idea correctly:
If  second parameter for rte_pktmbuf_pool_init() is NULL, then 
we setup

mbp_priv->mbuf_data_room_size = mp->elt_size - sizeof(struct rte_mbuf) - RTE_PKTMBUF_HEADROOM;

Which means that  priv_size ==0  for all  mbufs in the pool 
Otherwise we setup:

 mbp_priv->mbuf_data_room_size = opaque_arg;

As we are doing now, and priv_size for all mbufs in the pool will be:
pool->elt_size - pool_private->mbuf_data_room_size - sizeof(rte_mbuf);

And in that case, all users have to do to specify priv_size for mempool is to pass a proper argument
for rte_pktmbuf_pool_init() at  mempool_create().
Correct? 

> 
> I agree it's not ideal, but I think the mbuf pool initialization
> is another problem. That's why I suggested to change this in a
> separate series that will add rte_mbuf_pool_create() with the
> API described above. Thoughts?
> 

I also put answers to another mail below.
Just to keep all discussion in one place.

> > Though, I still think that the better approach is to reserve private space before struct rte_mbuf, not after.
> > In that case, user got his private space, and we keep buf_addr straight after  rte_mbuf, without any whole.
> > So we don't need steps 2 and 3, above,
> > plus we don't need rte_mbuf_to_baddr() and rte_mbuf_from_indirect() -
> > RTE_MBUF_TO_BADDR/ RTE_MBUF_FROM_BADDR would keep working correctly.
> > In fact, with this scheme - we don't even need priv_size for mbuf management (attach/detach/free).
> >
> > Wonder did you try that approach?
> 
> Yes, I though about this approach too. But I think it would require
> more changes. When an application or a driver allocate a mbuf with
> mempool_get(), it expects that the returned pointer is the rte_mbuf *.

Yep, sure it will still get the pointer to the rte_mbuf *.
Though later, if upper layer would like to convert from  rte_mbuf* to app_specific_mbuf *,
it would have to use a macro:

#define RTE_MBUF_TO_PRIV(m)	((void *)((uintptr_t)(m) - (m)->priv_size)) 

> 
> With this solution, there are 2 options:
> - no mempool modification, so each application/driver has to add
>   priv_size bytes to the object to get the mbuf pointer. This does not
>   look feasible.
> - change the mempool to support private area before each object. I
>   think mempool API is already quite complex, and I think it's not
>   the role of the mempool library to provide such features.


In fact, I think the changes would be minimal.
All we have to do, is to make changes in rte_pktmbuf_init():

void
rte_pktmbuf_init(struct rte_mempool *mp,
                 __attribute__((unused)) void *opaque_arg,
                 void *_m,
                 __attribute__((unused)) unsigned i)
{

     //extract priv_size from mempool   (discussed above).
      uint16_t priv_size = rte_mbufpool_get_priv_size(mp); 

      struct rte_mbuf *m = _m + priv_size;
      uint32_t buf_len = mp->elt_size - sizeof(struct rte_mbuf) - priv_size;

....


With that way priv_size inside mbuf would always be the size its own private space,
for both direct and indirect mbus., so we don't require to set priv_size at attach/detach.
Again RTE_MBUF_TO_BADDR() and RTE_MBUF_FROM_BADDR() would just work without any modifications.

Konstantin
  
Olivier Matz March 30, 2015, 7:55 p.m. UTC | #10
Hi Konstantin,

On 03/30/2015 02:34 PM, Ananyev, Konstantin wrote:
> Hi Olivier,
> 
>> -----Original Message-----
>> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
>> Sent: Friday, March 27, 2015 3:17 PM
>> To: Ananyev, Konstantin; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
>>
>> Hi Konstantin,
>>
>> On 03/27/2015 03:25 PM, Ananyev, Konstantin wrote:
>>> Hi Olivier,
>>>
>>>> -----Original Message-----
>>>> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
>>>> Sent: Friday, March 27, 2015 1:56 PM
>>>> To: Ananyev, Konstantin; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
>>>>
>>>> Hi Konstantin,
>>>>
>>>> On 03/27/2015 10:07 AM, Olivier MATZ wrote:
>>>>>> I think that to support ability to setup priv_size on a mempool basis,
>>>>>> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
>>>>>> we need to:
>>>>>>
>>>>>> 1. Store priv_size both inside the mempool and inside the mbuf.
>>>>>>
>>>>>> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
>>>>>> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
>>>>>>
>>>>>> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
>>>>>> rte_pktmbuf_detach(struct rte_mbuf *m)
>>>>>> {
>>>>>>   ...
>>>>>>    m->priv_size = rte_mempool_get_privsize(m->pool);
>>>>>>    m->buf _addr= rte_mbuf_to_baddr(m);
>>>>>>    ...
>>>>>> }
>>>>>>
>>>>>> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
>>>>>> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
>>>>>> - or provide separate function that could be called straight after rte_mempool_create() , that
>>>>>> would setup priv_size for the  pool and for all its mbufs.
>>>>>> - or some sort of combination of these 2 approaches - introduce a wrapper function
>>>>>> (rte_mbuf_pool_create() or something) that would take priv_size as parameter,
>>>>>> create a new mempool and then setup priv_size.
>>>>
>>>> I though a bit more about this solution, and I realized that doing
>>>> mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
>>>> a good idea, as there is no garantee that the size of mi is large enough
>>>> to store the priv of md.
>>>>
>>>> Having the same priv_size for mi and md is maybe a good constraint.
>>>> I can add this in the API comments and assertions in the code to
>>>> check this condition, what do you think?
>>>
>>> Probably we have a different concepts of what is mbuf's  private space in mind.
>>> From my point, even indirect buffer should use it's own private space and
>>> leave contents of direct mbuf it attached to unmodified.
>>> After attach() operation, only contents of the buffer are shared between mbufs,
>>> but not the mbuf's metadata.
>>
>> Sorry if it was not clear in my previous messages, but I agree
>> with your description. When attaching a mbuf, only data, not
>> metadata should be shared.
>>
>> In the solution you are suggesting (quoted above), you say we need
>> to set mi->priv_size to md->priv_size in rte_pktmbuf_attach(). I felt
>> this was not possible, but it depends on the meaning we give to
>> priv_size:
>>
>> 1. If the meaning is "the size of the private data embedded in this
>>    mbuf", which is the most logical meaning, we cannot do this
>>    affectation
>>
>> 2. If the meaning is "the size of the private data embedded in the
>>    mbuf the buf_addr is pointing to" (which is harder to get), the
>>    affectation makes sense.
>>
>> From what I understand, you feel we should use 2/ as priv_size
>> definition. Is it correct?
> 
> Yes, I meant 2.
> From my point priv_size inside mbuf is more like 'buf_ofs'.
> It's main purpose is for internal use - to help our mbuf manipulation routinies
> (attach/detach/free) to work correctly.
> If the user wants to query size of private space for the mbuf, he probably should
> use the value from mempool.

Agree.


>> In my previous message, the definition of m->priv_size was 1/,
>> so that's why I felt assigning mi->priv_size to md->priv_size was
>> not possible.
>>
>> I agree 2/ is probably a good choice, as it would allow to attach
>> to a mbuf with a different priv_size. It may require some additional
>> comments above the field in the structure to explain that.
>>
>>
>>> Otherwise on detach(), you'll have to copy contents of private space back, from direct to indirect mbuf?
>>> Again how to deal with the case, when 2 or more mbufs will attach to the same direct one?
>>>
>>> So let say, if we'll have a macro:
>>>
>>> #define RTE_MBUF_PRIV_PTR(mb)	((void *)((struct rte_mbuf *)(mb)) + 1))
>>>
>>> No matter is mb  a direct or indirect mbuf.
>>> Do you have something else in mind here?
>>
>> I completely agree with this macro. We should consider the private data
>> as an extension of the mbuf structure.
>>
>>
>>>>> Introducing rte_mbuf_pool_create() seems a good idea to me, it
>>>>> would hide 'rte_pktmbuf_pool_private' from the user and force
>>>>> to initialize all the required fields (mbuf_data_room_size only
>>>>> today, and maybe mbuf_priv_size).
>>>>>
>>>>> The API would be:
>>>>>
>>>>> struct rte_mempool *
>>>>> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
>>>>> 	unsigned cache_size, size_t mbuf_priv_size,
>>>>> 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
>>>>> 	int socket_id, unsigned flags)
>>>>>
>>>>> I can give it a try and send a patch for this.
>>>>
>>>> About this, it is not required anymore for this patch series if we
>>>> agree with my comment above.
>>>
>>> I still think we need some way to setup priv_size on a per-mempool basis.
>>> Doing that in rte_mbuf_pool_create() seems like a good thing to me.
>>> Not sure, why you decided to drop it?
>>
>> I think we can already do it without changing the API by providing
>> our own rte_pktmbuf_init and rte_pktmbuf_pool_init.
>>
>> rte_pktmbuf_init() has to set:
>>   m->buf_len = mp->elt_size - sizeof(struct mbuf);
>>   m->priv_size = sizeof(struct mbuf) - sizeof(struct rte_mbuf);
>>
>> rte_pktmbuf_pool_init() has to set:
>>   /* we can use the default function */
>>   mbp_priv->mbuf_data_room_size = MBUF_RXDATA_SIZE +
>>   	RTE_PKTMBUF_HEADROOM;
> 
> Yeh, when  arg==NULL for rte_pktmbuf_pool_init() we always set up
> mbuf_data_room_size to the hardcoded value.
> Which always looked strange to me.
> I think it should be set to:
> mp->elt_size - sizeof(struct rte_mbuf) - RTE_PKTMBUF_HEADROOM;
> for that case.

Yes, that would make more sense instead of the hardcoded value.
But I'm not sure it should be part of this series as the clone
patches won't change this behavior. I would prefer to have it in
another series that reworks mbuf pool initialization. I can also
work on it.

On the other hand if you really feel this patch is needed in
this series, it's not a problem as it's a one-liner.

> 
>>
>> In this case, it is possible to calculate the mbuf_priv_size only
>> from the pool object:
>>
>>   mbuf_priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
>> 	pool_private->mbuf_data_room_size -
>> 	sizeof(rte_mbuf)
>>
> 
> So if I understand your idea correctly:
> If  second parameter for rte_pktmbuf_pool_init() is NULL, then 
> we setup
> 
> mbp_priv->mbuf_data_room_size = mp->elt_size - sizeof(struct rte_mbuf) - RTE_PKTMBUF_HEADROOM;
> 
> Which means that  priv_size ==0  for all  mbufs in the pool 
> Otherwise we setup:
> 
>  mbp_priv->mbuf_data_room_size = opaque_arg;
> 
> As we are doing now, and priv_size for all mbufs in the pool will be:
> pool->elt_size - pool_private->mbuf_data_room_size - sizeof(rte_mbuf);
> 
> And in that case, all users have to do to specify priv_size for mempool is to pass a proper argument
> for rte_pktmbuf_pool_init() at  mempool_create().
> Correct? 

Correct.



> 
>>
>> I agree it's not ideal, but I think the mbuf pool initialization
>> is another problem. That's why I suggested to change this in a
>> separate series that will add rte_mbuf_pool_create() with the
>> API described above. Thoughts?
>>
> 
> I also put answers to another mail below.
> Just to keep all discussion in one place.
> 
>>> Though, I still think that the better approach is to reserve private space before struct rte_mbuf, not after.
>>> In that case, user got his private space, and we keep buf_addr straight after  rte_mbuf, without any whole.
>>> So we don't need steps 2 and 3, above,
>>> plus we don't need rte_mbuf_to_baddr() and rte_mbuf_from_indirect() -
>>> RTE_MBUF_TO_BADDR/ RTE_MBUF_FROM_BADDR would keep working correctly.
>>> In fact, with this scheme - we don't even need priv_size for mbuf management (attach/detach/free).
>>>
>>> Wonder did you try that approach?
>>
>> Yes, I though about this approach too. But I think it would require
>> more changes. When an application or a driver allocate a mbuf with
>> mempool_get(), it expects that the returned pointer is the rte_mbuf *.
> 
> Yep, sure it will still get the pointer to the rte_mbuf *.
> Though later, if upper layer would like to convert from  rte_mbuf* to app_specific_mbuf *,
> it would have to use a macro:
> 
> #define RTE_MBUF_TO_PRIV(m)	((void *)((uintptr_t)(m) - (m)->priv_size)) 
> 
>>
>> With this solution, there are 2 options:
>> - no mempool modification, so each application/driver has to add
>>   priv_size bytes to the object to get the mbuf pointer. This does not
>>   look feasible.
>> - change the mempool to support private area before each object. I
>>   think mempool API is already quite complex, and I think it's not
>>   the role of the mempool library to provide such features.
> 
> 
> In fact, I think the changes would be minimal.
> All we have to do, is to make changes in rte_pktmbuf_init():
> 
> void
> rte_pktmbuf_init(struct rte_mempool *mp,
>                  __attribute__((unused)) void *opaque_arg,
>                  void *_m,
>                  __attribute__((unused)) unsigned i)
> {
> 
>      //extract priv_size from mempool   (discussed above).
>       uint16_t priv_size = rte_mbufpool_get_priv_size(mp); 
> 
>       struct rte_mbuf *m = _m + priv_size;
>       uint32_t buf_len = mp->elt_size - sizeof(struct rte_mbuf) - priv_size;
> 
> ....
> 
> 
> With that way priv_size inside mbuf would always be the size its own private space,
> for both direct and indirect mbus., so we don't require to set priv_size at attach/detach.
> Again RTE_MBUF_TO_BADDR() and RTE_MBUF_FROM_BADDR() would just work without any modifications.

I'm not sure I'm getting it. The argument '_m' of your
rte_pktmbuf_init() is the pointer to the priv data, right?
So it means that the mbuf is located priv_size bytes after.

The rte_pktmbuf_init() function is called by mempool_create(),
and the _m parameter is a pointer to a mempool object. So
in my understanding, mempool_get() would not return a rte_mbuf
but a pointer to the application private data.


Regards,
Olivier



> 
> Konstantin
>  
>
  
Ananyev, Konstantin March 30, 2015, 11:17 p.m. UTC | #11
Hi Olivier,

> -----Original Message-----
> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> Sent: Monday, March 30, 2015 8:56 PM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> 
> Hi Konstantin,
> 
> On 03/30/2015 02:34 PM, Ananyev, Konstantin wrote:
> > Hi Olivier,
> >
> >> -----Original Message-----
> >> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> >> Sent: Friday, March 27, 2015 3:17 PM
> >> To: Ananyev, Konstantin; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> >>
> >> Hi Konstantin,
> >>
> >> On 03/27/2015 03:25 PM, Ananyev, Konstantin wrote:
> >>> Hi Olivier,
> >>>
> >>>> -----Original Message-----
> >>>> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> >>>> Sent: Friday, March 27, 2015 1:56 PM
> >>>> To: Ananyev, Konstantin; dev@dpdk.org
> >>>> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> >>>>
> >>>> Hi Konstantin,
> >>>>
> >>>> On 03/27/2015 10:07 AM, Olivier MATZ wrote:
> >>>>>> I think that to support ability to setup priv_size on a mempool basis,
> >>>>>> and reserve private space between struct rte_mbuf and rte_mbuf. buf_addr,
> >>>>>> we need to:
> >>>>>>
> >>>>>> 1. Store priv_size both inside the mempool and inside the mbuf.
> >>>>>>
> >>>>>> 2. rte_pktmbuf_attach() should change the value of priv_size to the priv_size of the direct mbuf we are going to attach to:
> >>>>>> rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md) {... mi->priv_size = md->priv_size; ...}
> >>>>>>
> >>>>>> 3. rte_pktmbuf_detach() should restore original value of mbuf's priv_size:
> >>>>>> rte_pktmbuf_detach(struct rte_mbuf *m)
> >>>>>> {
> >>>>>>   ...
> >>>>>>    m->priv_size = rte_mempool_get_privsize(m->pool);
> >>>>>>    m->buf _addr= rte_mbuf_to_baddr(m);
> >>>>>>    ...
> >>>>>> }
> >>>>>>
> >>>>>> Also I think we need to provide a way to specify priv_size for all mbufs of the mepool at init time:
> >>>>>> - either force people to specify it at rte_mempool_create() time (probably use init_arg for that),
> >>>>>> - or provide separate function that could be called straight after rte_mempool_create() , that
> >>>>>> would setup priv_size for the  pool and for all its mbufs.
> >>>>>> - or some sort of combination of these 2 approaches - introduce a wrapper function
> >>>>>> (rte_mbuf_pool_create() or something) that would take priv_size as parameter,
> >>>>>> create a new mempool and then setup priv_size.
> >>>>
> >>>> I though a bit more about this solution, and I realized that doing
> >>>> mi->priv_size = md->priv_size in rte_pktmbuf_attach() is probably not
> >>>> a good idea, as there is no garantee that the size of mi is large enough
> >>>> to store the priv of md.
> >>>>
> >>>> Having the same priv_size for mi and md is maybe a good constraint.
> >>>> I can add this in the API comments and assertions in the code to
> >>>> check this condition, what do you think?
> >>>
> >>> Probably we have a different concepts of what is mbuf's  private space in mind.
> >>> From my point, even indirect buffer should use it's own private space and
> >>> leave contents of direct mbuf it attached to unmodified.
> >>> After attach() operation, only contents of the buffer are shared between mbufs,
> >>> but not the mbuf's metadata.
> >>
> >> Sorry if it was not clear in my previous messages, but I agree
> >> with your description. When attaching a mbuf, only data, not
> >> metadata should be shared.
> >>
> >> In the solution you are suggesting (quoted above), you say we need
> >> to set mi->priv_size to md->priv_size in rte_pktmbuf_attach(). I felt
> >> this was not possible, but it depends on the meaning we give to
> >> priv_size:
> >>
> >> 1. If the meaning is "the size of the private data embedded in this
> >>    mbuf", which is the most logical meaning, we cannot do this
> >>    affectation
> >>
> >> 2. If the meaning is "the size of the private data embedded in the
> >>    mbuf the buf_addr is pointing to" (which is harder to get), the
> >>    affectation makes sense.
> >>
> >> From what I understand, you feel we should use 2/ as priv_size
> >> definition. Is it correct?
> >
> > Yes, I meant 2.
> > From my point priv_size inside mbuf is more like 'buf_ofs'.
> > It's main purpose is for internal use - to help our mbuf manipulation routinies
> > (attach/detach/free) to work correctly.
> > If the user wants to query size of private space for the mbuf, he probably should
> > use the value from mempool.
> 
> Agree.
> 
> 
> >> In my previous message, the definition of m->priv_size was 1/,
> >> so that's why I felt assigning mi->priv_size to md->priv_size was
> >> not possible.
> >>
> >> I agree 2/ is probably a good choice, as it would allow to attach
> >> to a mbuf with a different priv_size. It may require some additional
> >> comments above the field in the structure to explain that.
> >>
> >>
> >>> Otherwise on detach(), you'll have to copy contents of private space back, from direct to indirect mbuf?
> >>> Again how to deal with the case, when 2 or more mbufs will attach to the same direct one?
> >>>
> >>> So let say, if we'll have a macro:
> >>>
> >>> #define RTE_MBUF_PRIV_PTR(mb)	((void *)((struct rte_mbuf *)(mb)) + 1))
> >>>
> >>> No matter is mb  a direct or indirect mbuf.
> >>> Do you have something else in mind here?
> >>
> >> I completely agree with this macro. We should consider the private data
> >> as an extension of the mbuf structure.
> >>
> >>
> >>>>> Introducing rte_mbuf_pool_create() seems a good idea to me, it
> >>>>> would hide 'rte_pktmbuf_pool_private' from the user and force
> >>>>> to initialize all the required fields (mbuf_data_room_size only
> >>>>> today, and maybe mbuf_priv_size).
> >>>>>
> >>>>> The API would be:
> >>>>>
> >>>>> struct rte_mempool *
> >>>>> rte_mbuf_pool_create(const char *name, unsigned n, unsigned elt_size,
> >>>>> 	unsigned cache_size, size_t mbuf_priv_size,
> >>>>> 	rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
> >>>>> 	int socket_id, unsigned flags)
> >>>>>
> >>>>> I can give it a try and send a patch for this.
> >>>>
> >>>> About this, it is not required anymore for this patch series if we
> >>>> agree with my comment above.
> >>>
> >>> I still think we need some way to setup priv_size on a per-mempool basis.
> >>> Doing that in rte_mbuf_pool_create() seems like a good thing to me.
> >>> Not sure, why you decided to drop it?
> >>
> >> I think we can already do it without changing the API by providing
> >> our own rte_pktmbuf_init and rte_pktmbuf_pool_init.
> >>
> >> rte_pktmbuf_init() has to set:
> >>   m->buf_len = mp->elt_size - sizeof(struct mbuf);
> >>   m->priv_size = sizeof(struct mbuf) - sizeof(struct rte_mbuf);
> >>
> >> rte_pktmbuf_pool_init() has to set:
> >>   /* we can use the default function */
> >>   mbp_priv->mbuf_data_room_size = MBUF_RXDATA_SIZE +
> >>   	RTE_PKTMBUF_HEADROOM;
> >
> > Yeh, when  arg==NULL for rte_pktmbuf_pool_init() we always set up
> > mbuf_data_room_size to the hardcoded value.
> > Which always looked strange to me.
> > I think it should be set to:
> > mp->elt_size - sizeof(struct rte_mbuf) - RTE_PKTMBUF_HEADROOM;
> > for that case.
> 
> Yes, that would make more sense instead of the hardcoded value.
> But I'm not sure it should be part of this series as the clone
> patches won't change this behavior. I would prefer to have it in
> another series that reworks mbuf pool initialization. I can also
> work on it.
> 
> On the other hand if you really feel this patch is needed in
> this series, it's not a problem as it's a one-liner.
> 
> >
> >>
> >> In this case, it is possible to calculate the mbuf_priv_size only
> >> from the pool object:
> >>
> >>   mbuf_priv_size = pool->elt_size - RTE_PKTMBUF_HEADROOM -
> >> 	pool_private->mbuf_data_room_size -
> >> 	sizeof(rte_mbuf)
> >>
> >
> > So if I understand your idea correctly:
> > If  second parameter for rte_pktmbuf_pool_init() is NULL, then
> > we setup
> >
> > mbp_priv->mbuf_data_room_size = mp->elt_size - sizeof(struct rte_mbuf) - RTE_PKTMBUF_HEADROOM;
> >
> > Which means that  priv_size ==0  for all  mbufs in the pool
> > Otherwise we setup:
> >
> >  mbp_priv->mbuf_data_room_size = opaque_arg;
> >
> > As we are doing now, and priv_size for all mbufs in the pool will be:
> > pool->elt_size - pool_private->mbuf_data_room_size - sizeof(rte_mbuf);
> >
> > And in that case, all users have to do to specify priv_size for mempool is to pass a proper argument
> > for rte_pktmbuf_pool_init() at  mempool_create().
> > Correct?
> 
> Correct.
> 
> 
> 
> >
> >>
> >> I agree it's not ideal, but I think the mbuf pool initialization
> >> is another problem. That's why I suggested to change this in a
> >> separate series that will add rte_mbuf_pool_create() with the
> >> API described above. Thoughts?
> >>
> >
> > I also put answers to another mail below.
> > Just to keep all discussion in one place.
> >
> >>> Though, I still think that the better approach is to reserve private space before struct rte_mbuf, not after.
> >>> In that case, user got his private space, and we keep buf_addr straight after  rte_mbuf, without any whole.
> >>> So we don't need steps 2 and 3, above,
> >>> plus we don't need rte_mbuf_to_baddr() and rte_mbuf_from_indirect() -
> >>> RTE_MBUF_TO_BADDR/ RTE_MBUF_FROM_BADDR would keep working correctly.
> >>> In fact, with this scheme - we don't even need priv_size for mbuf management (attach/detach/free).
> >>>
> >>> Wonder did you try that approach?
> >>
> >> Yes, I though about this approach too. But I think it would require
> >> more changes. When an application or a driver allocate a mbuf with
> >> mempool_get(), it expects that the returned pointer is the rte_mbuf *.
> >
> > Yep, sure it will still get the pointer to the rte_mbuf *.
> > Though later, if upper layer would like to convert from  rte_mbuf* to app_specific_mbuf *,
> > it would have to use a macro:
> >
> > #define RTE_MBUF_TO_PRIV(m)	((void *)((uintptr_t)(m) - (m)->priv_size))
> >
> >>
> >> With this solution, there are 2 options:
> >> - no mempool modification, so each application/driver has to add
> >>   priv_size bytes to the object to get the mbuf pointer. This does not
> >>   look feasible.
> >> - change the mempool to support private area before each object. I
> >>   think mempool API is already quite complex, and I think it's not
> >>   the role of the mempool library to provide such features.
> >
> >
> > In fact, I think the changes would be minimal.
> > All we have to do, is to make changes in rte_pktmbuf_init():
> >
> > void
> > rte_pktmbuf_init(struct rte_mempool *mp,
> >                  __attribute__((unused)) void *opaque_arg,
> >                  void *_m,
> >                  __attribute__((unused)) unsigned i)
> > {
> >
> >      //extract priv_size from mempool   (discussed above).
> >       uint16_t priv_size = rte_mbufpool_get_priv_size(mp);
> >
> >       struct rte_mbuf *m = _m + priv_size;
> >       uint32_t buf_len = mp->elt_size - sizeof(struct rte_mbuf) - priv_size;
> >
> > ....
> >
> >
> > With that way priv_size inside mbuf would always be the size its own private space,
> > for both direct and indirect mbus., so we don't require to set priv_size at attach/detach.
> > Again RTE_MBUF_TO_BADDR() and RTE_MBUF_FROM_BADDR() would just work without any modifications.
> 
> I'm not sure I'm getting it. The argument '_m' of your
> rte_pktmbuf_init() is the pointer to the priv data, right?
> So it means that the mbuf is located priv_size bytes after.
> 
> The rte_pktmbuf_init() function is called by mempool_create(),
> and the _m parameter is a pointer to a mempool object. So
> in my understanding, mempool_get() would not return a rte_mbuf
> but a pointer to the application private data.

Ah my bad, forgot that mempool's obj_init() returns void now :(
To make this approach work also need to change it, so it return a pointer to the new object. 
So, rte_pktmbuf_init() would return m and then in mempool_add_elem():

if (obj_init)
                obj = obj_init(mp, obj_init_arg, obj, obj_idx);

rte_ring_sp_enqueue(mp->ring, obj); 

Konstantin

> 
> 
> Regards,
> Olivier
> 
> 
> 
> >
> > Konstantin
> >
> >
  
Olivier Matz March 31, 2015, 7:01 p.m. UTC | #12
Hi Konstantin,

On 03/31/2015 01:17 AM, Ananyev, Konstantin wrote:
>>>> With this solution, there are 2 options:
>>>> - no mempool modification, so each application/driver has to add
>>>>   priv_size bytes to the object to get the mbuf pointer. This does not
>>>>   look feasible.
>>>> - change the mempool to support private area before each object. I
>>>>   think mempool API is already quite complex, and I think it's not
>>>>   the role of the mempool library to provide such features.
>>>
>>>
>>> In fact, I think the changes would be minimal.
>>> All we have to do, is to make changes in rte_pktmbuf_init():
>>>
>>> void
>>> rte_pktmbuf_init(struct rte_mempool *mp,
>>>                  __attribute__((unused)) void *opaque_arg,
>>>                  void *_m,
>>>                  __attribute__((unused)) unsigned i)
>>> {
>>>
>>>      //extract priv_size from mempool   (discussed above).
>>>       uint16_t priv_size = rte_mbufpool_get_priv_size(mp);
>>>
>>>       struct rte_mbuf *m = _m + priv_size;
>>>       uint32_t buf_len = mp->elt_size - sizeof(struct rte_mbuf) - priv_size;
>>>
>>> ....
>>>
>>>
>>> With that way priv_size inside mbuf would always be the size its own private space,
>>> for both direct and indirect mbus., so we don't require to set priv_size at attach/detach.
>>> Again RTE_MBUF_TO_BADDR() and RTE_MBUF_FROM_BADDR() would just work without any modifications.
>>
>> I'm not sure I'm getting it. The argument '_m' of your
>> rte_pktmbuf_init() is the pointer to the priv data, right?
>> So it means that the mbuf is located priv_size bytes after.
>>
>> The rte_pktmbuf_init() function is called by mempool_create(),
>> and the _m parameter is a pointer to a mempool object. So
>> in my understanding, mempool_get() would not return a rte_mbuf
>> but a pointer to the application private data.
> 
> Ah my bad, forgot that mempool's obj_init() returns void now :(
> To make this approach work also need to change it, so it return a pointer to the new object. 
> So, rte_pktmbuf_init() would return m and then in mempool_add_elem():
> 
> if (obj_init)
>                 obj = obj_init(mp, obj_init_arg, obj, obj_idx);
> 
> rte_ring_sp_enqueue(mp->ring, obj); 

Yes, but modififying mempool is what I wanted to avoid for several
reasons. First, I think that changing the mempool_create() API here
(actually the obj_init prototype) would impact existing applications.

Also, I'm afraid that storing a different address than the start
address of the object would have additional impacts. For instance,
some data is supposed to be stored before the object, see
__mempool_from_obj() or mempool_obj_audit().

Finally, I believe that mempool is not the proper place to do
modifications that are only needed for mbufs. If we really want
to implement mbuf private data in that way, maybe it would be
better to add a new layer above mempool (a mbuf pool layer).


Regards
Olivier
  
Ananyev, Konstantin April 1, 2015, 1:48 p.m. UTC | #13
> -----Original Message-----
> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> Sent: Tuesday, March 31, 2015 8:01 PM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2 1/5] mbuf: fix clone support when application uses private mbuf data
> 
> Hi Konstantin,
> 
> On 03/31/2015 01:17 AM, Ananyev, Konstantin wrote:
> >>>> With this solution, there are 2 options:
> >>>> - no mempool modification, so each application/driver has to add
> >>>>   priv_size bytes to the object to get the mbuf pointer. This does not
> >>>>   look feasible.
> >>>> - change the mempool to support private area before each object. I
> >>>>   think mempool API is already quite complex, and I think it's not
> >>>>   the role of the mempool library to provide such features.
> >>>
> >>>
> >>> In fact, I think the changes would be minimal.
> >>> All we have to do, is to make changes in rte_pktmbuf_init():
> >>>
> >>> void
> >>> rte_pktmbuf_init(struct rte_mempool *mp,
> >>>                  __attribute__((unused)) void *opaque_arg,
> >>>                  void *_m,
> >>>                  __attribute__((unused)) unsigned i)
> >>> {
> >>>
> >>>      //extract priv_size from mempool   (discussed above).
> >>>       uint16_t priv_size = rte_mbufpool_get_priv_size(mp);
> >>>
> >>>       struct rte_mbuf *m = _m + priv_size;
> >>>       uint32_t buf_len = mp->elt_size - sizeof(struct rte_mbuf) - priv_size;
> >>>
> >>> ....
> >>>
> >>>
> >>> With that way priv_size inside mbuf would always be the size its own private space,
> >>> for both direct and indirect mbus., so we don't require to set priv_size at attach/detach.
> >>> Again RTE_MBUF_TO_BADDR() and RTE_MBUF_FROM_BADDR() would just work without any modifications.
> >>
> >> I'm not sure I'm getting it. The argument '_m' of your
> >> rte_pktmbuf_init() is the pointer to the priv data, right?
> >> So it means that the mbuf is located priv_size bytes after.
> >>
> >> The rte_pktmbuf_init() function is called by mempool_create(),
> >> and the _m parameter is a pointer to a mempool object. So
> >> in my understanding, mempool_get() would not return a rte_mbuf
> >> but a pointer to the application private data.
> >
> > Ah my bad, forgot that mempool's obj_init() returns void now :(
> > To make this approach work also need to change it, so it return a pointer to the new object.
> > So, rte_pktmbuf_init() would return m and then in mempool_add_elem():
> >
> > if (obj_init)
> >                 obj = obj_init(mp, obj_init_arg, obj, obj_idx);
> >
> > rte_ring_sp_enqueue(mp->ring, obj);
> 
> Yes, but modififying mempool is what I wanted to avoid for several
> reasons. First, I think that changing the mempool_create() API here
> (actually the obj_init prototype) would impact existing applications.
> 
> Also, I'm afraid that storing a different address than the start
> address of the object would have additional impacts. For instance,
> some data is supposed to be stored before the object, see
> __mempool_from_obj() or mempool_obj_audit().

mempool_obj_audit() should be ok, I think, but yes -
rte_mempool_from_obj() would change the behaviour and
can't be used by mempool_obj_audit() anymore.

Ok, I am convinced - let's stick with private space between rte_mbuf and buf_adddr, as you suggested.  

> 
> Finally, I believe that mempool is not the proper place to do
> modifications that are only needed for mbufs. If we really want
> to implement mbuf private data in that way, maybe it would be
> better to add a new layer above mempool (a mbuf pool layer).

Not that I am against it, but seems like even more massive change -
every application would need to be changed to use rte_mbufpool_create(), no?

Konstantin

> 
> 
> Regards
> Olivier
  
Olivier Matz April 1, 2015, 3:18 p.m. UTC | #14
Hi,

On 04/01/2015 03:48 PM, Ananyev, Konstantin wrote:
>>>>>> With this solution, there are 2 options:
>>>>>> - no mempool modification, so each application/driver has to add
>>>>>>   priv_size bytes to the object to get the mbuf pointer. This does not
>>>>>>   look feasible.
>>>>>> - change the mempool to support private area before each object. I
>>>>>>   think mempool API is already quite complex, and I think it's not
>>>>>>   the role of the mempool library to provide such features.
>>>>>
>>>>>
>>>>> In fact, I think the changes would be minimal.
>>>>> All we have to do, is to make changes in rte_pktmbuf_init():
>>>>>
>>>>> void
>>>>> rte_pktmbuf_init(struct rte_mempool *mp,
>>>>>                  __attribute__((unused)) void *opaque_arg,
>>>>>                  void *_m,
>>>>>                  __attribute__((unused)) unsigned i)
>>>>> {
>>>>>
>>>>>      //extract priv_size from mempool   (discussed above).
>>>>>       uint16_t priv_size = rte_mbufpool_get_priv_size(mp);
>>>>>
>>>>>       struct rte_mbuf *m = _m + priv_size;
>>>>>       uint32_t buf_len = mp->elt_size - sizeof(struct rte_mbuf) - priv_size;
>>>>>
>>>>> ....
>>>>>
>>>>>
>>>>> With that way priv_size inside mbuf would always be the size its own private space,
>>>>> for both direct and indirect mbus., so we don't require to set priv_size at attach/detach.
>>>>> Again RTE_MBUF_TO_BADDR() and RTE_MBUF_FROM_BADDR() would just work without any modifications.
>>>>
>>>> I'm not sure I'm getting it. The argument '_m' of your
>>>> rte_pktmbuf_init() is the pointer to the priv data, right?
>>>> So it means that the mbuf is located priv_size bytes after.
>>>>
>>>> The rte_pktmbuf_init() function is called by mempool_create(),
>>>> and the _m parameter is a pointer to a mempool object. So
>>>> in my understanding, mempool_get() would not return a rte_mbuf
>>>> but a pointer to the application private data.
>>>
>>> Ah my bad, forgot that mempool's obj_init() returns void now :(
>>> To make this approach work also need to change it, so it return a pointer to the new object.
>>> So, rte_pktmbuf_init() would return m and then in mempool_add_elem():
>>>
>>> if (obj_init)
>>>                 obj = obj_init(mp, obj_init_arg, obj, obj_idx);
>>>
>>> rte_ring_sp_enqueue(mp->ring, obj);
>>
>> Yes, but modififying mempool is what I wanted to avoid for several
>> reasons. First, I think that changing the mempool_create() API here
>> (actually the obj_init prototype) would impact existing applications.
>>
>> Also, I'm afraid that storing a different address than the start
>> address of the object would have additional impacts. For instance,
>> some data is supposed to be stored before the object, see
>> __mempool_from_obj() or mempool_obj_audit().
> 
> mempool_obj_audit() should be ok, I think, but yes -
> rte_mempool_from_obj() would change the behaviour and
> can't be used by mempool_obj_audit() anymore.
> 
> Ok, I am convinced - let's stick with private space between rte_mbuf and buf_adddr, as you suggested.  
> 
>>
>> Finally, I believe that mempool is not the proper place to do
>> modifications that are only needed for mbufs. If we really want
>> to implement mbuf private data in that way, maybe it would be
>> better to add a new layer above mempool (a mbuf pool layer).
> 
> Not that I am against it, but seems like even more massive change -
> every application would need to be changed to use rte_mbufpool_create(), no?

Yes, indeed that would be a significant change for the applications,
which is probably not what we want, except if we can keep backward
compatibility. Maybe it's possible. That's something to keep in mind
if I send a patch series that introduce a new rte_mbuf_pool_create()
function.

Regards,
Olivier
  

Patch

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 3057791..c5a195a 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -425,6 +425,7 @@  testpmd_mbuf_ctor(struct rte_mempool *mp,
 	mb->tx_offload   = 0;
 	mb->vlan_tci     = 0;
 	mb->hash.rss     = 0;
+	mb->priv_size    = 0;
 }
 
 static void
diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index c3fcb80..d542461 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -139,7 +139,7 @@ 
 /* Number of descriptors per cacheline. */
 #define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
 
-#define MBUF_EXT_MEM(mb)   (RTE_MBUF_FROM_BADDR((mb)->buf_addr) != (mb))
+#define MBUF_EXT_MEM(mb)   (rte_mbuf_from_indirect(mb) != (mb))
 
 /* mask of enabled ports */
 static uint32_t enabled_port_mask = 0;
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
index 526b18d..e095999 100644
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -125,6 +125,7 @@  rte_pktmbuf_init(struct rte_mempool *mp,
 	m->pool = mp;
 	m->nb_segs = 1;
 	m->port = 0xff;
+	m->priv_size = 0;
 }
 
 /* do some sanity checks on a mbuf: panic if it fails */
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 17ba791..45ac948 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -317,18 +317,42 @@  struct rte_mbuf {
 			/* uint64_t unused:8; */
 		};
 	};
+
+	uint16_t priv_size;       /**< size of the application private data */
 } __rte_cache_aligned;
 
 /**
- * Given the buf_addr returns the pointer to corresponding mbuf.
+ * Return the mbuf owning the data buffer address of an indirect mbuf.
+ *
+ * @param mi
+ *   The pointer to the indirect mbuf.
+ * @return
+ *   The address of the direct mbuf corresponding to buffer_addr.
  */
-#define RTE_MBUF_FROM_BADDR(ba)     (((struct rte_mbuf *)(ba)) - 1)
+static inline struct rte_mbuf *
+rte_mbuf_from_indirect(struct rte_mbuf *mi)
+{
+       struct rte_mbuf *md;
+       md = (struct rte_mbuf *)((char *)mi->buf_addr - sizeof(*mi) -
+	       mi->priv_size);
+       return md;
+}
 
 /**
- * Given the pointer to mbuf returns an address where it's  buf_addr
- * should point to.
+ * Return the buffer address embedded in the given mbuf.
+ *
+ * @param md
+ *   The pointer to the mbuf.
+ * @return
+ *   The address of the data buffer owned by the mbuf.
  */
-#define RTE_MBUF_TO_BADDR(mb)       (((struct rte_mbuf *)(mb)) + 1)
+static inline char *
+rte_mbuf_to_baddr(struct rte_mbuf *md)
+{
+       char *buffer_addr;
+       buffer_addr = (char *)md + sizeof(*md) + md->priv_size;
+       return buffer_addr;
+}
 
 /**
  * Returns TRUE if given mbuf is indirect, or FALSE otherwise.
@@ -744,12 +768,12 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *md)
 static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
 {
 	const struct rte_mempool *mp = m->pool;
-	void *buf = RTE_MBUF_TO_BADDR(m);
-	uint32_t buf_len = mp->elt_size - sizeof(*m);
-	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof (*m);
+	void *buf = rte_mbuf_to_baddr(m);
+	unsigned mhdr_size = (char *)buf - (char *)m;
 
+	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + mhdr_size;
 	m->buf_addr = buf;
-	m->buf_len = (uint16_t)buf_len;
+	m->buf_len = (uint16_t)(mp->elt_size - mhdr_size);
 
 	m->data_off = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
 			RTE_PKTMBUF_HEADROOM : m->buf_len;
@@ -774,7 +798,7 @@  __rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 		 *  - free attached mbuf segment
 		 */
 		if (RTE_MBUF_INDIRECT(m)) {
-			struct rte_mbuf *md = RTE_MBUF_FROM_BADDR(m->buf_addr);
+			struct rte_mbuf *md = rte_mbuf_from_indirect(m);
 			rte_pktmbuf_detach(m);
 			if (rte_mbuf_refcnt_update(md, -1) == 0)
 				__rte_mbuf_raw_free(md);