[v1,1/2] net/memif: add a Rx fast path

Message ID 20220517105109.1086090-2-joyce.kong@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Series: add a fast path for memif Rx/Tx

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Joyce Kong May 17, 2022, 10:51 a.m. UTC
  For memif non-zero-copy mode, there is a branch comparing
the mbuf and memif buffer sizes during memory copying. Add
a fast memory copy path that removes this branch, with the
mbuf and memif buffer sizes defined at compile time. The
removal of the branch leads to a considerable performance
uplift.

When the memif buffer size is less than or equal to the
mbuf size, Rx chooses the fast memcpy path; otherwise it
chooses the original path.

Test with 1p1q on Ampere Altra AArch64 server,
--------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
--------------------------------------------
non-zc gain |     4.30%     |    -0.52%    |
--------------------------------------------
   zc gain  |     2.46%     |     0.70%    |
--------------------------------------------

Test with 1p1q on Cascade Lake Xeon x86 server,
-------------------------------------------
  buf size  | memif <= mbuf | memif > mbuf |
-------------------------------------------
non-zc gain |     2.13%     |    -1.40%    |
-------------------------------------------
   zc gain  |     0.18%     |     0.48%    |
-------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
---
 drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 40 deletions(-)
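
To make the two copy paths in the commit message concrete, here is a
hypothetical, self-contained sketch (plain C, not driver code; the buffer
sizes, names and helpers are made up for illustration). When every memif
descriptor is known to fit into one mbuf, the per-chunk dst_len/src_len
bookkeeping collapses into a single memcpy per descriptor; in the driver
this choice is made once per burst by the
'if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size))' block visible in the
patch below.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MBUF_ROOM 2048	/* stands in for the mbuf data room */
#define MEMIF_BUF 2048	/* stands in for pmd->cfg.pkt_buffer_size */

/* fast path: the whole descriptor fits, one copy, no length comparison */
static void copy_fast(uint8_t *dst, const uint8_t *src, size_t len)
{
	memcpy(dst, src, len);
}

/* original-style path: copy in chunks bounded by the destination size */
static void copy_chunked(uint8_t *dst, const uint8_t *src, size_t src_len)
{
	size_t src_off = 0, dst_off = 0;

	while (src_len) {
		size_t dst_len = MBUF_ROOM - dst_off;
		size_t cp_len = src_len < dst_len ? src_len : dst_len;

		memcpy(dst + dst_off, src + src_off, cp_len);
		src_off += cp_len;
		dst_off += cp_len;
		src_len -= cp_len;
		if (dst_off == MBUF_ROOM)
			dst_off = 0;	/* the real code chains a new mbuf here */
	}
}

int main(void)
{
	uint8_t src[MEMIF_BUF] = { 1, 2, 3 };
	uint8_t dst[MBUF_ROOM] = { 0 };

	if (MEMIF_BUF <= MBUF_ROOM)	/* done once per Rx burst in the patch */
		copy_fast(dst, src, sizeof(src));
	else
		copy_chunked(dst, src, sizeof(src));

	printf("%d %d %d\n", dst[0], dst[1], dst[2]);
	return 0;
}

The gain reported in the tables above essentially comes from dropping the
inner comparison and offset arithmetic from the per-packet loop.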
  

Comments

Ferruh Yigit May 18, 2022, 4:53 p.m. UTC | #1
On 5/17/2022 11:51 AM, Joyce Kong wrote:
> For memif non-zero-copy mode, there is a branch to compare
> the mbuf and memif buffer size during memory copying. Add
> a fast memory copy path by removing this branch with mbuf
> and memif buffer size defined at compile time. The removal
> of the branch leads to considerable performance uplift.
> 
> When memif <= buffer size, Rx chooses the fast memcpy path,
> otherwise it would choose the original path.
> 
> Test with 1p1q on Ampere Altra AArch64 server,
> --------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> --------------------------------------------
> non-zc gain |     4.30%     |    -0.52%    |
> --------------------------------------------
>     zc gain  |     2.46%     |     0.70%    |
> --------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86server,
> -------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> -------------------------------------------
> non-zc gain |     2.13%     |    -1.40%    |
> -------------------------------------------
>     zc gain  |     0.18%     |     0.48%    |
> -------------------------------------------
> 


Hi Joyce,

I have multiple questions,

1) The patch updates only the non-zero-copy mode Rx path ('eth_memif_rx'), 
why is the zero-copy path performance also impacted?

2) As far as I can see there is a behavior change, more details below

3) The patch talks about the memif buffer size being defined at compile 
time; is the big "memif <= mbuf" if block optimized out?
Since 'pkt_buffer_size' is a devarg, it can change from run to run 
and is not known at compile time, so I doubt that it is optimized out.
Does having 'pkt_buffer_size' as a devarg break your logic?

4) One option gains performance and the other loses performance, do you 
think the performance-gain case is the more common use case? Is there 
any data around it?


Jakub,

Do you want to test this patch first before progressing with it?

> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> ---
>   drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
>   1 file changed, 84 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
> index 587ad45576..f55776ca46 100644
> --- a/drivers/net/memif/rte_eth_memif.c
> +++ b/drivers/net/memif/rte_eth_memif.c
> @@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
>   		goto refill;
>   	n_slots = last_slot - cur_slot;
>   
> -	while (n_slots && n_rx_pkts < nb_pkts) {
> -		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> -		if (unlikely(mbuf_head == NULL))
> -			goto no_free_bufs;
> -		mbuf = mbuf_head;
> -		mbuf->port = mq->in_port;
> +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> +		while (n_slots && n_rx_pkts < nb_pkts) {
> +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> +			if (unlikely(mbuf_head == NULL))
> +				goto no_free_bufs;
> +			mbuf = mbuf_head;
> +			mbuf->port = mq->in_port;
> +
> +next_slot1:
> +			s0 = cur_slot & mask;
> +			d0 = &ring->desc[s0];
>   
> -next_slot:
> -		s0 = cur_slot & mask;
> -		d0 = &ring->desc[s0];
> +			cp_len = d0->length;
>   
> -		src_len = d0->length;
> -		dst_off = 0;
> -		src_off = 0;
> +			rte_pktmbuf_data_len(mbuf) = cp_len;
> +			rte_pktmbuf_pkt_len(mbuf) = cp_len;
> +			if (mbuf != mbuf_head)
> +				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
>   
> -		do {
> -			dst_len = mbuf_size - dst_off;
> -			if (dst_len == 0) {
> -				dst_off = 0;
> -				dst_len = mbuf_size;
> +			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
> +				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
> +
> +			cur_slot++;
> +			n_slots--;
>   
> -				/* store pointer to tail */
> +			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
>   				mbuf_tail = mbuf;
>   				mbuf = rte_pktmbuf_alloc(mq->mempool);
>   				if (unlikely(mbuf == NULL))
>   					goto no_free_bufs;
> -				mbuf->port = mq->in_port;
>   				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
>   				if (unlikely(ret < 0)) {
>   					MIF_LOG(ERR, "number-of-segments-overflow");
>   					rte_pktmbuf_free(mbuf);
>   					goto no_free_bufs;
>   				}
> +				goto next_slot1;
>   			}

It is very hard to comment on the correct part of the patch, since it is 
mixed up a lot, but
- previously, when the memif buffer is segmented and its size is less than 
the mbuf size, each mbuf is filled with as much memif data as possible 
before switching to the next mbuf, like:

   memif buffer
+-+  +-+  +-+  +-+
|a|->|b|->|c|->|d|
+-+  +-+  +-+  +-+

+---+  +---+
|abc|->|d  |
+---+  +---+
   mbuf


- Now each memif segment is copied into its own mbuf,

   memif buffer
+-+  +-+  +-+  +-+
|a|->|b|->|c|->|d|
+-+  +-+  +-+  +-+

+---+  +---+  +---+  +---+
|a  |->|b  |->|c  |->|d  |
+---+  +---+  +---+  +---+
   mbuf

Can you please confirm this behavior change? If so, can you please 
highlight it more in the commit log?
And is this tradeoff something preferred?
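
As a rough, made-up illustration of the memory side of this tradeoff (not
driver code): with four 32-byte memif segments and a 2048-byte mbuf data
room, the old packing needs one mbuf while the per-descriptor packing
needs four.

#include <stdio.h>

int main(void)
{
	const int memif_seg_len = 32;	/* hypothetical memif buffer size */
	const int memif_segs = 4;	/* the a, b, c, d segments above */
	const int mbuf_room = 2048;	/* hypothetical mbuf data room */

	/* old path: each mbuf is filled with as much memif data as fits */
	int total = memif_seg_len * memif_segs;
	int old_mbufs = (total + mbuf_room - 1) / mbuf_room;

	/* new fast path: one mbuf per memif descriptor */
	int new_mbufs = memif_segs;

	printf("old path: %d mbuf(s), fast path: %d mbuf(s)\n",
	       old_mbufs, new_mbufs);
	return 0;
}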
  
Ferruh Yigit May 18, 2022, 5:06 p.m. UTC | #2
On 5/17/2022 11:51 AM, Joyce Kong wrote:
> For memif non-zero-copy mode, there is a branch to compare
> the mbuf and memif buffer size during memory copying. Add
> a fast memory copy path by removing this branch with mbuf
> and memif buffer size defined at compile time. The removal
> of the branch leads to considerable performance uplift.
> 
> When memif <= buffer size, Rx chooses the fast memcpy path,
> otherwise it would choose the original path.
> 
> Test with 1p1q on Ampere Altra AArch64 server,
> --------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> --------------------------------------------
> non-zc gain |     4.30%     |    -0.52%    |
> --------------------------------------------
>     zc gain  |     2.46%     |     0.70%    |
> --------------------------------------------
> 
> Test with 1p1q on Cascade Lake Xeon X86server,
> -------------------------------------------
>    buf size  | memif <= mbuf | memif > mbuf |
> -------------------------------------------
> non-zc gain |     2.13%     |    -1.40%    |
> -------------------------------------------
>     zc gain  |     0.18%     |     0.48%    |
> -------------------------------------------
> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>

<...>

> +	} else {
> +		while (n_slots && n_rx_pkts < nb_pkts) {
> +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> +			if (unlikely(mbuf_head == NULL))
> +				goto no_free_bufs;
> +			mbuf = mbuf_head;
> +			mbuf->port = mq->in_port;
> +
> +next_slot2:
> +			s0 = cur_slot & mask;
> +			d0 = &ring->desc[s0];
>   
> -			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
> -							   dst_off),
> -				(uint8_t *)memif_get_buffer(proc_private, d0) +
> -				src_off, cp_len);
> +			src_len = d0->length;
> +			dst_off = 0;
> +			src_off = 0;

Hi Joyce, Jakub,

Something doesn't look right in the original code (not in this patch), 
can you please help me check if I am missing something?

For the segmented memif buffer case, the first buffer is copied to the 
mbuf, 'dst_off' is increased, and we jump back to process the next memif 
segment:

  + d0
  |
  v
+++  +-+
|a+->+b|
+-+  +-+

+---+
|a  |
+-+-+
   ^
   |
   + dst_off

"
     if (d0->flags & MEMIF_DESC_FLAG_NEXT)
          goto next_slot;
"

But here 'dst_off' is set back to '0'; won't this cause the next memif 
buffer segment to be written to the beginning of the mbuf, overwriting 
the previous data?

Thanks,
ferruh
  
Joyce Kong May 19, 2022, 7 a.m. UTC | #3
Hi Ferruh,

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> Sent: Thursday, May 19, 2022 12:53 AM
> To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar <jgrajcia@cisco.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
> 
> On 5/17/2022 11:51 AM, Joyce Kong wrote:
> > For memif non-zero-copy mode, there is a branch to compare the mbuf
> > and memif buffer size during memory copying. Add a fast memory copy
> > path by removing this branch with mbuf and memif buffer size defined
> > at compile time. The removal of the branch leads to considerable
> > performance uplift.
> >
> > When memif <= buffer size, Rx chooses the fast memcpy path, otherwise
> > it would choose the original path.
> >
> > Test with 1p1q on Ampere Altra AArch64 server,
> > --------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > --------------------------------------------
> > non-zc gain |     4.30%     |    -0.52%    |
> > --------------------------------------------
> >     zc gain  |     2.46%     |     0.70%    |
> > --------------------------------------------
> >
> > Test with 1p1q on Cascade Lake Xeon X86server,
> > -------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > -------------------------------------------
> > non-zc gain |     2.13%     |    -1.40%    |
> > -------------------------------------------
> >     zc gain  |     0.18%     |     0.48%    |
> > -------------------------------------------
> >
> 
> 
> Hi Joyce,
> 
> I have multiple questions,
> 
> 1) The patch updates only non-zero-copy mode Rx path ('eth_memif_rx'), why
> zero-copy path performance also impacted?
> 
For memif zero-copy mode, only the client runs 'eth_memif_rx_zc', and the server still
runs 'eth_memif_rx', so the patch also impacts zero-copy mode.

> 2) As far as I can see there is a behavior change, more details below
> 
> 3) patch talking about memif buffer size being defined in compile time, is the
> big "memif <= mbuf" if block optimized out?
> Since 'pkt_buffer_size' is a devarg, so it can change from run to run and it is not
> known in compile time, I doubt that it is optimized out.
> Is having  'pkt_buffer_size' as devarg breaks your logic?
> 
From memif run to run, run.pkt_buffer_size may change, while cfg.pkt_buffer_size,
which is the reserved maximum buffer size, does not change. In this patch I use
cfg.pkt_buffer_size to implement the logic.

> 4) One option gains performance and other loose performance, do you think
> gain performance case is more common use case? Is there any data around it?
> 
Yes, I think the performance-gain case is the more common use case, as the default
memif buffer size equals the mbuf size. In theory, when the memif buffer size is
larger than the mbuf size, Rx runs the original path, so it should not lead to an
obvious impact.

> 
> Jakub,
> 
> Do you want to test this patch first before progressing with it?
> 
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > ---
> >   drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
> >   1 file changed, 84 insertions(+), 40 deletions(-)
> >
> > diff --git a/drivers/net/memif/rte_eth_memif.c
> > b/drivers/net/memif/rte_eth_memif.c
> > index 587ad45576..f55776ca46 100644
> > --- a/drivers/net/memif/rte_eth_memif.c
> > +++ b/drivers/net/memif/rte_eth_memif.c
> > @@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf
> **bufs, uint16_t nb_pkts)
> >   		goto refill;
> >   	n_slots = last_slot - cur_slot;
> >
> > -	while (n_slots && n_rx_pkts < nb_pkts) {
> > -		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > -		if (unlikely(mbuf_head == NULL))
> > -			goto no_free_bufs;
> > -		mbuf = mbuf_head;
> > -		mbuf->port = mq->in_port;
> > +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> > +		while (n_slots && n_rx_pkts < nb_pkts) {
> > +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > +			if (unlikely(mbuf_head == NULL))
> > +				goto no_free_bufs;
> > +			mbuf = mbuf_head;
> > +			mbuf->port = mq->in_port;
> > +
> > +next_slot1:
> > +			s0 = cur_slot & mask;
> > +			d0 = &ring->desc[s0];
> >
> > -next_slot:
> > -		s0 = cur_slot & mask;
> > -		d0 = &ring->desc[s0];
> > +			cp_len = d0->length;
> >
> > -		src_len = d0->length;
> > -		dst_off = 0;
> > -		src_off = 0;
> > +			rte_pktmbuf_data_len(mbuf) = cp_len;
> > +			rte_pktmbuf_pkt_len(mbuf) = cp_len;
> > +			if (mbuf != mbuf_head)
> > +				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
> >
> > -		do {
> > -			dst_len = mbuf_size - dst_off;
> > -			if (dst_len == 0) {
> > -				dst_off = 0;
> > -				dst_len = mbuf_size;
> > +			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
> > +				(uint8_t *)memif_get_buffer(proc_private, d0),
> cp_len);
> > +
> > +			cur_slot++;
> > +			n_slots--;
> >
> > -				/* store pointer to tail */
> > +			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
> >   				mbuf_tail = mbuf;
> >   				mbuf = rte_pktmbuf_alloc(mq->mempool);
> >   				if (unlikely(mbuf == NULL))
> >   					goto no_free_bufs;
> > -				mbuf->port = mq->in_port;
> >   				ret = memif_pktmbuf_chain(mbuf_head,
> mbuf_tail, mbuf);
> >   				if (unlikely(ret < 0)) {
> >   					MIF_LOG(ERR, "number-of-segments-
> overflow");
> >   					rte_pktmbuf_free(mbuf);
> >   					goto no_free_bufs;
> >   				}
> > +				goto next_slot1;
> >   			}
> 
> It is very hard to comment on the correct part of the patch, since it is mixed a
> lot, but
> - previously when memif buffer is segmented, and its size is less than mbuf;
> mbuf is filled with as much memif data as possible and later switched to next
> mbuf, like:
> 
>    memif buffer
> +-+  +-+  +-+  +-+
> |a|->|b|->|c|->|d|
> +-+  +-+  +-+  +-+
> 
> +---+  +---+
> |abc|->|d  |
> +---+  +---+
>    mbuf
> 
> 
> - Now each memif segment is a mbuf,
> 
>    memif buffer
> +-+  +-+  +-+  +-+
> |a|->|b|->|c|->|d|
> +-+  +-+  +-+  +-+
> 
> +---+  +---+  +---+  +---+
> |a  |->|b  |->|c  |->|d  |
> +---+  +---+  +---+  +---+
>    mbuf
> 
> Can you please confirm this behavior change? If so can you please highlight is
> more in the commit log?
> And is this tradeoff something preferred?
  
Joyce Kong May 19, 2022, 8:44 a.m. UTC | #4
> -----Original Message-----
> From: Joyce Kong
> Sent: Thursday, May 19, 2022 3:00 PM
> To: Ferruh Yigit <ferruh.yigit@xilinx.com>; Jakub Grajciar <jgrajcia@cisco.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> Subject: RE: [PATCH v1 1/2] net/memif: add a Rx fast path
> 
> Hi Ferruh,
> 
> > -----Original Message-----
> > From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> > Sent: Thursday, May 19, 2022 12:53 AM
> > To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar
> > <jgrajcia@cisco.com>
> > Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> > Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
> >
> > On 5/17/2022 11:51 AM, Joyce Kong wrote:
> > > For memif non-zero-copy mode, there is a branch to compare the mbuf
> > > and memif buffer size during memory copying. Add a fast memory copy
> > > path by removing this branch with mbuf and memif buffer size defined
> > > at compile time. The removal of the branch leads to considerable
> > > performance uplift.
> > >
> > > When memif <= buffer size, Rx chooses the fast memcpy path,
> > > otherwise it would choose the original path.
> > >
> > > Test with 1p1q on Ampere Altra AArch64 server,
> > > --------------------------------------------
> > >    buf size  | memif <= mbuf | memif > mbuf |
> > > --------------------------------------------
> > > non-zc gain |     4.30%     |    -0.52%    |
> > > --------------------------------------------
> > >     zc gain  |     2.46%     |     0.70%    |
> > > --------------------------------------------
> > >
> > > Test with 1p1q on Cascade Lake Xeon X86server,
> > > -------------------------------------------
> > >    buf size  | memif <= mbuf | memif > mbuf |
> > > -------------------------------------------
> > > non-zc gain |     2.13%     |    -1.40%    |
> > > -------------------------------------------
> > >     zc gain  |     0.18%     |     0.48%    |
> > > -------------------------------------------
> > >
> >
> >
> > Hi Joyce,
> >
> > I have multiple questions,
> >
> > 1) The patch updates only non-zero-copy mode Rx path ('eth_memif_rx'),
> > why zero-copy path performance also impacted?
> >
> For memif zero-copy mode, only client runs 'eth_memif_rx_zc', and server still
> runs 'eth_memif_rx', so the patch would impacts zero-copy mode.
> 
> > 2) As far as I can see there is a behavior change, more details below
> >
> > 3) patch talking about memif buffer size being defined in compile
> > time, is the big "memif <= mbuf" if block optimized out?
> > Since 'pkt_buffer_size' is a devarg, so it can change from run to run
> > and it is not known in compile time, I doubt that it is optimized out.
> > Is having  'pkt_buffer_size' as devarg breaks your logic?
> >
> From memif run to run, run.pkt_buffer_size would change, and
> cfg.pkt_buffer_size which is the reserved max buffer size would not change.
> For patch details, I use cfg.pkt_buffer_size to implement the logic.
> 
> > 4) One option gains performance and other loose performance, do you
> > think gain performance case is more common use case? Is there any data
> around it?
> >
> Yes, I think the gain performance case is more common case, as the default
> memif buffer size equals to mbuf size. In theory, when memif buf size >= mbuf
> size, the Rx runs the original path, it would not lead to obvious impact.
> 
> >
> > Jakub,
> >
> > Do you want to test this patch first before progressing with it?
> >
> > > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > > ---
> > >   drivers/net/memif/rte_eth_memif.c | 124 ++++++++++++++++++++----------
> > >   1 file changed, 84 insertions(+), 40 deletions(-)
> > >
> > > diff --git a/drivers/net/memif/rte_eth_memif.c
> > > b/drivers/net/memif/rte_eth_memif.c
> > > index 587ad45576..f55776ca46 100644
> > > --- a/drivers/net/memif/rte_eth_memif.c
> > > +++ b/drivers/net/memif/rte_eth_memif.c
> > > @@ -342,66 +342,111 @@ eth_memif_rx(void *queue, struct rte_mbuf
> > **bufs, uint16_t nb_pkts)
> > >   		goto refill;
> > >   	n_slots = last_slot - cur_slot;
> > >
> > > -	while (n_slots && n_rx_pkts < nb_pkts) {
> > > -		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > > -		if (unlikely(mbuf_head == NULL))
> > > -			goto no_free_bufs;
> > > -		mbuf = mbuf_head;
> > > -		mbuf->port = mq->in_port;
> > > +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> > > +		while (n_slots && n_rx_pkts < nb_pkts) {
> > > +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > > +			if (unlikely(mbuf_head == NULL))
> > > +				goto no_free_bufs;
> > > +			mbuf = mbuf_head;
> > > +			mbuf->port = mq->in_port;
> > > +
> > > +next_slot1:
> > > +			s0 = cur_slot & mask;
> > > +			d0 = &ring->desc[s0];
> > >
> > > -next_slot:
> > > -		s0 = cur_slot & mask;
> > > -		d0 = &ring->desc[s0];
> > > +			cp_len = d0->length;
> > >
> > > -		src_len = d0->length;
> > > -		dst_off = 0;
> > > -		src_off = 0;
> > > +			rte_pktmbuf_data_len(mbuf) = cp_len;
> > > +			rte_pktmbuf_pkt_len(mbuf) = cp_len;
> > > +			if (mbuf != mbuf_head)
> > > +				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
> > >
> > > -		do {
> > > -			dst_len = mbuf_size - dst_off;
> > > -			if (dst_len == 0) {
> > > -				dst_off = 0;
> > > -				dst_len = mbuf_size;
> > > +			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
> > > +				(uint8_t *)memif_get_buffer(proc_private, d0),
> > cp_len);
> > > +
> > > +			cur_slot++;
> > > +			n_slots--;
> > >
> > > -				/* store pointer to tail */
> > > +			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
> > >   				mbuf_tail = mbuf;
> > >   				mbuf = rte_pktmbuf_alloc(mq->mempool);
> > >   				if (unlikely(mbuf == NULL))
> > >   					goto no_free_bufs;
> > > -				mbuf->port = mq->in_port;
> > >   				ret = memif_pktmbuf_chain(mbuf_head,
> > mbuf_tail, mbuf);
> > >   				if (unlikely(ret < 0)) {
> > >   					MIF_LOG(ERR, "number-of-segments-
> > overflow");
> > >   					rte_pktmbuf_free(mbuf);
> > >   					goto no_free_bufs;
> > >   				}
> > > +				goto next_slot1;
> > >   			}
> >
> > It is very hard to comment on the correct part of the patch, since it
> > is mixed a lot, but
> > - previously when memif buffer is segmented, and its size is less than
> > mbuf; mbuf is filled with as much memif data as possible and later
> > switched to next mbuf, like:
> >
> >    memif buffer
> > +-+  +-+  +-+  +-+
> > |a|->|b|->|c|->|d|
> > +-+  +-+  +-+  +-+
> >
> > +---+  +---+
> > |abc|->|d  |
> > +---+  +---+
> >    mbuf
> >
> >
> > - Now each memif segment is a mbuf,
> >
> >    memif buffer
> > +-+  +-+  +-+  +-+
> > |a|->|b|->|c|->|d|
> > +-+  +-+  +-+  +-+
> >
> > +---+  +---+  +---+  +---+
> > |a  |->|b  |->|c  |->|d  |
> > +---+  +---+  +---+  +---+
> >    mbuf
> >
> > Can you please confirm this behavior change? If so can you please
> > highlight is more in the commit log?
> > And is this tradeoff something preferred?

Yes, the patch leads to this behavior change, and I will highlight it more in the
commit log for the next version.
This change is the same as what zero-copy mode already does: it reduces the complex
comparison at the cost of more memory space. I am also looking forward to getting
some feedback from the community about the tradeoff.

Thanks,
Joyce
  
Joyce Kong May 19, 2022, 3:09 p.m. UTC | #5
> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
> Sent: Thursday, May 19, 2022 1:06 AM
> To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar <jgrajcia@cisco.com>
> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>
> Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
> 
> On 5/17/2022 11:51 AM, Joyce Kong wrote:
> > For memif non-zero-copy mode, there is a branch to compare
> > the mbuf and memif buffer size during memory copying. Add
> > a fast memory copy path by removing this branch with mbuf
> > and memif buffer size defined at compile time. The removal
> > of the branch leads to considerable performance uplift.
> >
> > When memif <= buffer size, Rx chooses the fast memcpy path,
> > otherwise it would choose the original path.
> >
> > Test with 1p1q on Ampere Altra AArch64 server,
> > --------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > --------------------------------------------
> > non-zc gain |     4.30%     |    -0.52%    |
> > --------------------------------------------
> >     zc gain  |     2.46%     |     0.70%    |
> > --------------------------------------------
> >
> > Test with 1p1q on Cascade Lake Xeon X86server,
> > -------------------------------------------
> >    buf size  | memif <= mbuf | memif > mbuf |
> > -------------------------------------------
> > non-zc gain |     2.13%     |    -1.40%    |
> > -------------------------------------------
> >     zc gain  |     0.18%     |     0.48%    |
> > -------------------------------------------
> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> 
> <...>
> 
> > +	} else {
> > +		while (n_slots && n_rx_pkts < nb_pkts) {
> > +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> > +			if (unlikely(mbuf_head == NULL))
> > +				goto no_free_bufs;
> > +			mbuf = mbuf_head;
> > +			mbuf->port = mq->in_port;
> > +
> > +next_slot2:
> > +			s0 = cur_slot & mask;
> > +			d0 = &ring->desc[s0];
> >
> > -			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
> > -							   dst_off),
> > -				(uint8_t *)memif_get_buffer(proc_private, d0)
> +
> > -				src_off, cp_len);
> > +			src_len = d0->length;
> > +			dst_off = 0;
> > +			src_off = 0;
> 
> Hi Joyce, Jakub,
> 
> Something doesn't look right in the original code (not in this patch),
> can you please help me check if I am missing something?
> 
> For the memif buffer segmented case, first buffer will be copied to
> mbuf, 'dst_off' increased and jump back to process next memif segment:
> 
>   + d0
>   |
>   v
> +++  +-+
> |a+->+b|
> +-+  +-+
> 
> +---+
> |a  |
> +-+-+
>    ^
>    |
>    + dst_off
> 
> "
>      if (d0->flags & MEMIF_DESC_FLAG_NEXT)
>           goto next_slot;
> "
> 
> But here 'dst_off' set back to '0', wont this cause next memif buffer
> segment to write to beginning of mbuf overwriting previous data?
> 
> Thanks,
> Ferruh

Hi Ferruh,

Agree with you here, and sorry I didn’t notice it before. Perhaps moving
'dst_off = 0' to the line above 'next_slot' would solve the overwriting?

Best Regards,
Joyce
  
Ferruh Yigit May 19, 2022, 4:38 p.m. UTC | #6
On 5/19/2022 4:09 PM, Joyce Kong wrote:
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@xilinx.com>
>> Sent: Thursday, May 19, 2022 1:06 AM
>> To: Joyce Kong <Joyce.Kong@arm.com>; Jakub Grajciar <jgrajcia@cisco.com>
>> Cc: Ruifeng Wang <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd
>> <nd@arm.com>
>> Subject: Re: [PATCH v1 1/2] net/memif: add a Rx fast path
>>
>> On 5/17/2022 11:51 AM, Joyce Kong wrote:
>>> For memif non-zero-copy mode, there is a branch to compare
>>> the mbuf and memif buffer size during memory copying. Add
>>> a fast memory copy path by removing this branch with mbuf
>>> and memif buffer size defined at compile time. The removal
>>> of the branch leads to considerable performance uplift.
>>>
>>> When memif <= buffer size, Rx chooses the fast memcpy path,
>>> otherwise it would choose the original path.
>>>
>>> Test with 1p1q on Ampere Altra AArch64 server,
>>> --------------------------------------------
>>>     buf size  | memif <= mbuf | memif > mbuf |
>>> --------------------------------------------
>>> non-zc gain |     4.30%     |    -0.52%    |
>>> --------------------------------------------
>>>      zc gain  |     2.46%     |     0.70%    |
>>> --------------------------------------------
>>>
>>> Test with 1p1q on Cascade Lake Xeon X86server,
>>> -------------------------------------------
>>>     buf size  | memif <= mbuf | memif > mbuf |
>>> -------------------------------------------
>>> non-zc gain |     2.13%     |    -1.40%    |
>>> -------------------------------------------
>>>      zc gain  |     0.18%     |     0.48%    |
>>> -------------------------------------------
>>>
>>> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
>>
>> <...>
>>
>>> +   } else {
>>> +           while (n_slots && n_rx_pkts < nb_pkts) {
>>> +                   mbuf_head = rte_pktmbuf_alloc(mq->mempool);
>>> +                   if (unlikely(mbuf_head == NULL))
>>> +                           goto no_free_bufs;
>>> +                   mbuf = mbuf_head;
>>> +                   mbuf->port = mq->in_port;
>>> +
>>> +next_slot2:
>>> +                   s0 = cur_slot & mask;
>>> +                   d0 = &ring->desc[s0];
>>>
>>> -                   rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
>>> -                                                      dst_off),
>>> -                           (uint8_t *)memif_get_buffer(proc_private, d0)
>> +
>>> -                           src_off, cp_len);
>>> +                   src_len = d0->length;
>>> +                   dst_off = 0;
>>> +                   src_off = 0;
>>
>> Hi Joyce, Jakub,
>>
>> Something doesn't look right in the original code (not in this patch),
>> can you please help me check if I am missing something?
>>
>> For the memif buffer segmented case, first buffer will be copied to
>> mbuf, 'dst_off' increased and jump back to process next memif segment:
>>
>>    + d0
>>    |
>>    v
>> +++  +-+
>> |a+->+b|
>> +-+  +-+
>>
>> +---+
>> |a  |
>> +-+-+
>>     ^
>>     |
>>     + dst_off
>>
>> "
>>       if (d0->flags & MEMIF_DESC_FLAG_NEXT)
>>            goto next_slot;
>> "
>>
>> But here 'dst_off' set back to '0', wont this cause next memif buffer
>> segment to write to beginning of mbuf overwriting previous data?
>>
>> Thanks,
>> Ferruh
> 
> Hi Ferruh,
> 
> Agree with you here, and sorry I didn’t notice it before. Perhaps moving
> 'dst_off = 0' to the line above 'next_slot' would solve the overwriting?
> 

Yes, I think this solves the issue.


And I wonder why this was not caught by testing. @Jakub, are segmented 
memif buffers not a common use case?

I was able to reproduce the issue as follows (and confirmed the suggested 
change fixes it):

server
./build/app/dpdk-testpmd --proc-type=primary --file-prefix=pmd1 
--vdev=net_memif0,role=server,bsize=32 -- -i --txpkts=512
 > set fwd txonly
 > start

client
./build/app/dpdk-testpmd --proc-type=primary --file-prefix=pmd2 
--vdev=net_memif1,bsize=32 -- -i
 > set fwd rxonly
 > set verbose 3
 > start

The 'client' will display the packet info incorrectly; it will be all '0'. 
It is also possible to capture packets on the client side and confirm.
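
For reference, a minimal standalone sketch of the overwrite (plain C with
made-up sizes, not the driver loop): resetting the destination offset for
every source segment clobbers earlier data, while resetting it only once
per packet, as suggested above, preserves it.

#include <stdio.h>
#include <string.h>

#define MBUF_ROOM 8	/* hypothetical mbuf data room, larger than one segment */

/* copy two small source segments into one destination buffer;
 * reset_per_segment mimics the pre-patch 'dst_off = 0' placed after the
 * next_slot label, the other mode mimics moving it above the label */
static void copy_segments(char *dst, int reset_per_segment)
{
	const char *segs[] = { "abc", "def" };
	size_t dst_off = 0;

	for (int i = 0; i < 2; i++) {
		if (reset_per_segment)
			dst_off = 0;	/* second segment overwrites the first */
		memcpy(dst + dst_off, segs[i], strlen(segs[i]));
		dst_off += strlen(segs[i]);
	}
}

int main(void)
{
	char buggy[MBUF_ROOM] = { 0 };
	char fixed[MBUF_ROOM] = { 0 };

	copy_segments(buggy, 1);	/* ends up holding only "def" */
	copy_segments(fixed, 0);	/* holds "abcdef" as expected */
	printf("reset per segment: %s\nreset per packet:  %s\n", buggy, fixed);
	return 0;
}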
  

Patch

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index 587ad45576..f55776ca46 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -342,66 +342,111 @@  eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		goto refill;
 	n_slots = last_slot - cur_slot;
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot1:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			cp_len = d0->length;
 
-		src_len = d0->length;
-		dst_off = 0;
-		src_off = 0;
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
+
+			cur_slot++;
+			n_slots--;
 
-				/* store pointer to tail */
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
 				if (unlikely(mbuf == NULL))
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		cur_slot++;
-		n_slots--;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +739,6 @@  eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,