[v2,1/2] net/memif: add a Rx fast path

Message ID 20220701102815.1444223-2-joyce.kong@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Series: add a fast path for memif Rx/Tx

Checks

Context       Check    Description
ci/checkpatch success  coding style OK

Commit Message

Joyce Kong July 1, 2022, 10:28 a.m. UTC
  For memif non-zero-copy mode, there is a branch which compares
the mbuf and memif buffer sizes during memory copying. Add a
fast memory copy path that removes this branch, with the mbuf
and memif buffer sizes defined at compile time. Removing the
branch leads to a considerable performance uplift. The Rx fast
path does not change how mbufs store memif buffers.

When the memif buffer size is less than or equal to the mbuf
buffer size, Rx takes the fast memcpy path; otherwise it takes
the original path.
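
In other words, the per-segment size check in the copy loop is hoisted
to a single per-burst check. A simplified sketch of the selection (the
full patch is below; this is not the literal driver code):

	/* Fast/original Rx path selection, simplified. */
	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
		/* Fast path: every memif descriptor fits in one mbuf
		 * segment, so each descriptor is copied with a single
		 * rte_memcpy() and no per-segment size comparison. */
	} else {
		/* Original path: a descriptor may exceed the mbuf data
		 * room, so the copy loop splits it across chained mbufs. */
	}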

Test results with 1p1q on an Ampere Altra AArch64 server:
----------------------------------------------
|  buf size   | memif <= mbuf | memif > mbuf |
----------------------------------------------
| non-zc gain |     4.30%     |    -0.52%    |
----------------------------------------------
|   zc gain   |     2.46%     |     0.70%    |
----------------------------------------------

Test results with 1p1q on a Cascade Lake Xeon x86 server:
----------------------------------------------
|   buf size  | memif <= mbuf | memif > mbuf |
----------------------------------------------
| non-zc gain |     2.13%     |    -1.40%    |
----------------------------------------------
|   zc gain   |     0.18%     |     0.48%    |
----------------------------------------------

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 drivers/net/memif/rte_eth_memif.c | 123 ++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 40 deletions(-)
  

Comments

Stephen Hemminger July 1, 2022, 4:51 p.m. UTC | #1
On Fri,  1 Jul 2022 10:28:14 +0000
Joyce Kong <joyce.kong@arm.com> wrote:

>  	n_slots = last_slot - cur_slot;
> +	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
> +		while (n_slots && n_rx_pkts < nb_pkts) {
> +			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
> +			if (unlikely(mbuf_head == NULL))
> +				goto no_free_bufs;
> +			mbuf = mbuf_head;
> +
> +next_slot1:
> +			mbuf->port = mq->in_port;
> +			s0 = cur_slot & mask;
> +			d0 = &ring->desc[s0];
>  

You might get additional speedup by doing bulk allocation.
If you know you are going to get N packets, then rte_pktmbuf_alloc_bulk()
might speed it up?
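
For illustration, a hedged sketch of how rte_pktmbuf_alloc_bulk() could be
used in the fast-path loop. The burst-size computation, the scratch array
size and the single-segment assumption are not part of the posted patch:

	/* Hypothetical bulk-allocation variant of the Rx fast path,
	 * assuming single-segment packets (no MEMIF_DESC_FLAG_NEXT),
	 * so exactly one mbuf is consumed per slot. */
	struct rte_mbuf *mbufs[32];	/* arbitrary burst-sized scratch array */
	uint16_t n_alloc = RTE_MIN(n_slots, (uint16_t)(nb_pkts - n_rx_pkts));
	n_alloc = RTE_MIN(n_alloc, (uint16_t)RTE_DIM(mbufs));

	/* rte_pktmbuf_alloc_bulk() returns 0 on success and a negative
	 * value if the pool cannot supply n_alloc mbufs at once. */
	if (unlikely(rte_pktmbuf_alloc_bulk(mq->mempool, mbufs, n_alloc) != 0))
		goto no_free_bufs;

	for (uint16_t i = 0; i < n_alloc; i++) {
		mbuf_head = mbufs[i];
		mbuf_head->port = mq->in_port;
		/* ... copy one descriptor into mbuf_head as in the fast path ... */
	}

Packets spanning multiple descriptors would still need the per-packet
allocation path, so a real implementation would have to fall back for those.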
  

Patch

diff --git a/drivers/net/memif/rte_eth_memif.c b/drivers/net/memif/rte_eth_memif.c
index dd951b8296..24fc8b13fa 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -341,67 +341,111 @@  eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	if (cur_slot == last_slot)
 		goto refill;
 	n_slots = last_slot - cur_slot;
+	if (likely(mbuf_size >= pmd->cfg.pkt_buffer_size)) {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+
+next_slot1:
+			mbuf->port = mq->in_port;
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-	while (n_slots && n_rx_pkts < nb_pkts) {
-		mbuf_head = rte_pktmbuf_alloc(mq->mempool);
-		if (unlikely(mbuf_head == NULL))
-			goto no_free_bufs;
-		mbuf = mbuf_head;
-		mbuf->port = mq->in_port;
-		dst_off = 0;
+			cp_len = d0->length;
 
-next_slot:
-		s0 = cur_slot & mask;
-		d0 = &ring->desc[s0];
+			rte_pktmbuf_data_len(mbuf) = cp_len;
+			rte_pktmbuf_pkt_len(mbuf) = cp_len;
+			if (mbuf != mbuf_head)
+				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		src_len = d0->length;
-		src_off = 0;
+			rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				(uint8_t *)memif_get_buffer(proc_private, d0), cp_len);
 
-		do {
-			dst_len = mbuf_size - dst_off;
-			if (dst_len == 0) {
-				dst_off = 0;
-				dst_len = mbuf_size;
+			cur_slot++;
+			n_slots--;
 
-				/* store pointer to tail */
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT) {
 				mbuf_tail = mbuf;
 				mbuf = rte_pktmbuf_alloc(mq->mempool);
 				if (unlikely(mbuf == NULL))
 					goto no_free_bufs;
-				mbuf->port = mq->in_port;
 				ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
 				if (unlikely(ret < 0)) {
 					MIF_LOG(ERR, "number-of-segments-overflow");
 					rte_pktmbuf_free(mbuf);
 					goto no_free_bufs;
 				}
+				goto next_slot1;
 			}
-			cp_len = RTE_MIN(dst_len, src_len);
 
-			rte_pktmbuf_data_len(mbuf) += cp_len;
-			rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
-			if (mbuf != mbuf_head)
-				rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
+	} else {
+		while (n_slots && n_rx_pkts < nb_pkts) {
+			mbuf_head = rte_pktmbuf_alloc(mq->mempool);
+			if (unlikely(mbuf_head == NULL))
+				goto no_free_bufs;
+			mbuf = mbuf_head;
+			mbuf->port = mq->in_port;
+
+next_slot2:
+			s0 = cur_slot & mask;
+			d0 = &ring->desc[s0];
 
-			rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
-							   dst_off),
-				(uint8_t *)memif_get_buffer(proc_private, d0) +
-				src_off, cp_len);
+			src_len = d0->length;
+			dst_off = 0;
+			src_off = 0;
 
-			src_off += cp_len;
-			dst_off += cp_len;
-			src_len -= cp_len;
-		} while (src_len);
+			do {
+				dst_len = mbuf_size - dst_off;
+				if (dst_len == 0) {
+					dst_off = 0;
+					dst_len = mbuf_size;
+
+					/* store pointer to tail */
+					mbuf_tail = mbuf;
+					mbuf = rte_pktmbuf_alloc(mq->mempool);
+					if (unlikely(mbuf == NULL))
+						goto no_free_bufs;
+					mbuf->port = mq->in_port;
+					ret = memif_pktmbuf_chain(mbuf_head, mbuf_tail, mbuf);
+					if (unlikely(ret < 0)) {
+						MIF_LOG(ERR, "number-of-segments-overflow");
+						rte_pktmbuf_free(mbuf);
+						goto no_free_bufs;
+					}
+				}
+				cp_len = RTE_MIN(dst_len, src_len);
 
-		cur_slot++;
-		n_slots--;
+				rte_pktmbuf_data_len(mbuf) += cp_len;
+				rte_pktmbuf_pkt_len(mbuf) = rte_pktmbuf_data_len(mbuf);
+				if (mbuf != mbuf_head)
+					rte_pktmbuf_pkt_len(mbuf_head) += cp_len;
 
-		if (d0->flags & MEMIF_DESC_FLAG_NEXT)
-			goto next_slot;
+				rte_memcpy(rte_pktmbuf_mtod_offset(mbuf, void *,
+								   dst_off),
+					(uint8_t *)memif_get_buffer(proc_private, d0) +
+					src_off, cp_len);
 
-		mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
-		*bufs++ = mbuf_head;
-		n_rx_pkts++;
+				src_off += cp_len;
+				dst_off += cp_len;
+				src_len -= cp_len;
+			} while (src_len);
+
+			cur_slot++;
+			n_slots--;
+
+			if (d0->flags & MEMIF_DESC_FLAG_NEXT)
+				goto next_slot2;
+
+			mq->n_bytes += rte_pktmbuf_pkt_len(mbuf_head);
+			*bufs++ = mbuf_head;
+			n_rx_pkts++;
+		}
 	}
 
 no_free_bufs:
@@ -694,7 +738,6 @@  eth_memif_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return n_tx_pkts;
 }
 
-
 static int
 memif_tx_one_zc(struct pmd_process_private *proc_private, struct memif_queue *mq,
 		memif_ring_t *ring, struct rte_mbuf *mbuf, const uint16_t mask,