[v3] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs

Message ID 1706759150-6269-1-git-send-email-longli@linuxonhyperv.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series [v3] net/mana: use rte_pktmbuf_alloc_bulk for allocating RX WQEs

Checks

Context                        Check    Description
ci/checkpatch                  success  coding style OK
ci/loongarch-compilation       success  Compilation OK
ci/loongarch-unit-testing      success  Unit Testing PASS
ci/Intel-compilation           success  Compilation OK
ci/intel-Testing               success  Testing PASS
ci/iol-intel-Performance       success  Performance Testing PASS
ci/iol-broadcom-Performance    success  Performance Testing PASS
ci/iol-intel-Functional        success  Functional Testing PASS
ci/intel-Functional            success  Functional PASS
ci/iol-compile-amd64-testing   success  Testing PASS
ci/iol-broadcom-Functional     success  Functional Testing PASS
ci/iol-abi-testing             success  Testing PASS
ci/iol-unit-amd64-testing      success  Testing PASS
ci/iol-sample-apps-testing     success  Testing PASS
ci/iol-unit-arm64-testing      success  Testing PASS
ci/iol-mellanox-Performance    success  Performance Testing PASS
ci/iol-compile-arm64-testing   success  Testing PASS

Commit Message

Long Li Feb. 1, 2024, 3:45 a.m. UTC
  From: Long Li <longli@microsoft.com>

Instead of allocating mbufs one by one during RX, use
rte_pktmbuf_alloc_bulk() to allocate them in a batch.

There are no measurable performance improvements in benchmarks. However,
this patch should save CPU cycles on mbuf allocation and reduce potential
lock contention in real-world applications.

Signed-off-by: Long Li <longli@microsoft.com>
---
Changes in v2:
use rte_calloc_socket() in place of rte_calloc()

Changes in v3:
add more comments explaining the benefit of bulk allocation.
free mbufs that failed to post

 drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 24 deletions(-)
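
For context, the allocation pattern the patch adopts is sketched below. The sketch is illustrative only: the function and variable names are assumptions, not the driver's own code.

#include <errno.h>
#include <stdint.h>

#include <rte_malloc.h>
#include <rte_mbuf.h>

/*
 * Allocate "count" mbufs with one mempool operation instead of calling
 * rte_pktmbuf_alloc() in a per-mbuf loop (illustrative sketch).
 */
static int
alloc_rx_mbufs_bulk(struct rte_mempool *mp, uint32_t count, int socket_id)
{
	struct rte_mbuf **mbufs;
	int ret;

	/* Scratch array for the batch, placed on the RX queue's NUMA socket */
	mbufs = rte_calloc_socket("rx_mbufs", count, sizeof(struct rte_mbuf *),
				  0, socket_id);
	if (!mbufs)
		return -ENOMEM;

	/* One bulk dequeue from the mempool; returns 0 on success */
	ret = rte_pktmbuf_alloc_bulk(mp, mbufs, count);
	if (ret) {
		/* Nothing was allocated; the caller accounts the whole batch */
		rte_free(mbufs);
		return ret;
	}

	/* ... post each mbufs[i] as an RX WQE; free any that fail to post ... */

	rte_free(mbufs);
	return 0;
}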
  

Comments

Tyler Retzlaff Feb. 1, 2024, 4:16 p.m. UTC | #1
On Wed, Jan 31, 2024 at 07:45:50PM -0800, longli@linuxonhyperv.com wrote:
> From: Long Li <longli@microsoft.com>
> 
> Instead of allocating mbufs one by one during RX, use
> rte_pktmbuf_alloc_bulk() to allocate them in a batch.
> 
> There are no measurable performance improvements in benchmarks. However,
> this patch should save CPU cycles on mbuf allocation and reduce potential
> lock contention in real-world applications.
> 
> Signed-off-by: Long Li <longli@microsoft.com>
> ---
> Changes in v2:
> use rte_calloc_socket() in place of rte_calloc()
> 
> Changes in v3:
> add more comments explaining the benefit of bulk allocation.
> free mbufs that failed to post
> 
>  drivers/net/mana/rx.c | 74 +++++++++++++++++++++++++++++--------------
>  1 file changed, 50 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
> index acad5e26cd..6112db2219 100644
> --- a/drivers/net/mana/rx.c
> +++ b/drivers/net/mana/rx.c
> @@ -2,6 +2,7 @@
>   * Copyright 2022 Microsoft Corporation
>   */
>  #include <ethdev_driver.h>
> +#include <rte_malloc.h>
>  
>  #include <infiniband/verbs.h>
>  #include <infiniband/manadv.h>
> @@ -59,9 +60,8 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
>  }
>  
>  static int
> -mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
> +mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
>  {
> -	struct rte_mbuf *mbuf = NULL;
>  	struct gdma_sgl_element sgl[1];
>  	struct gdma_work_request request;
>  	uint32_t wqe_size_in_bu;
> @@ -69,12 +69,6 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>  	int ret;
>  	struct mana_mr_cache *mr;
>  
> -	mbuf = rte_pktmbuf_alloc(rxq->mp);
> -	if (!mbuf) {
> -		rxq->stats.nombuf++;
> -		return -ENOMEM;
> -	}
> -
>  	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
>  	if (!mr) {
>  		DP_LOG(ERR, "failed to register RX MR");
> @@ -121,19 +115,32 @@ mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
>   * Post work requests for a Rx queue.
>   */
>  static int
> -mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
> +mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
>  {
>  	int ret;
>  	uint32_t i;
> +	struct rte_mbuf **mbufs;
> +
> +	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
> +				  0, rxq->mp->socket_id);
> +	if (!mbufs)
> +		return -ENOMEM;
> +
> +	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
> +	if (ret) {
> +		DP_LOG(ERR, "failed to allocate mbufs for RX");
> +		rxq->stats.nombuf += count;
> +		goto fail;
> +	}
>  
>  #ifdef RTE_ARCH_32
>  	rxq->wqe_cnt_to_short_db = 0;
>  #endif
> -	for (i = 0; i < rxq->num_desc; i++) {
> -		ret = mana_alloc_and_post_rx_wqe(rxq);
> +	for (i = 0; i < count; i++) {
> +		ret = mana_post_rx_wqe(rxq, mbufs[i]);
>  		if (ret) {
>  			DP_LOG(ERR, "failed to post RX ret = %d", ret);
> -			return ret;
> +			break;
>  		}
>  
>  #ifdef RTE_ARCH_32
> @@ -144,8 +151,16 @@ mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
>  #endif
>  	}
>  
> +	/* Free the remaining mbufs that are not posted */
> +	while (i < count) {
> +		rte_pktmbuf_free(mbufs[i]);
> +		i++;
> +	}

there is also rte_pktmbuf_free_bulk() that could be used. probably won't
make any material difference to perf though so just an fyi.
  
Long Li Feb. 1, 2024, 7:41 p.m. UTC | #2
> > +	/* Free the remaining mbufs that are not posted */
> > +	while (i < count) {
> > +		rte_pktmbuf_free(mbufs[i]);
> > +		i++;
> > +	}
> 
> there is also rte_pktmbuf_free_bulk() that could be used. probably won't make
> any material difference to perf though so just an fyi.

Thank you! Will use rte_pktmbuf_free_bulk().
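
For reference, the cleanup loop discussed above can collapse into a single call along these lines (a sketch against the patch's mbufs, i and count variables, not the eventual v4 code):

	/* Free the remaining mbufs that were not posted, in one bulk call */
	if (i < count)
		rte_pktmbuf_free_bulk(&mbufs[i], count - i);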
  

Patch

diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index acad5e26cd..6112db2219 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -2,6 +2,7 @@ 
  * Copyright 2022 Microsoft Corporation
  */
 #include <ethdev_driver.h>
+#include <rte_malloc.h>
 
 #include <infiniband/verbs.h>
 #include <infiniband/manadv.h>
@@ -59,9 +60,8 @@  mana_rq_ring_doorbell(struct mana_rxq *rxq)
 }
 
 static int
-mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+mana_post_rx_wqe(struct mana_rxq *rxq, struct rte_mbuf *mbuf)
 {
-	struct rte_mbuf *mbuf = NULL;
 	struct gdma_sgl_element sgl[1];
 	struct gdma_work_request request;
 	uint32_t wqe_size_in_bu;
@@ -69,12 +69,6 @@  mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
 	int ret;
 	struct mana_mr_cache *mr;
 
-	mbuf = rte_pktmbuf_alloc(rxq->mp);
-	if (!mbuf) {
-		rxq->stats.nombuf++;
-		return -ENOMEM;
-	}
-
 	mr = mana_alloc_pmd_mr(&rxq->mr_btree, priv, mbuf);
 	if (!mr) {
 		DP_LOG(ERR, "failed to register RX MR");
@@ -121,19 +115,32 @@  mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
  * Post work requests for a Rx queue.
  */
 static int
-mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq, uint32_t count)
 {
 	int ret;
 	uint32_t i;
+	struct rte_mbuf **mbufs;
+
+	mbufs = rte_calloc_socket("mana_rx_mbufs", count, sizeof(struct rte_mbuf *),
+				  0, rxq->mp->socket_id);
+	if (!mbufs)
+		return -ENOMEM;
+
+	ret = rte_pktmbuf_alloc_bulk(rxq->mp, mbufs, count);
+	if (ret) {
+		DP_LOG(ERR, "failed to allocate mbufs for RX");
+		rxq->stats.nombuf += count;
+		goto fail;
+	}
 
 #ifdef RTE_ARCH_32
 	rxq->wqe_cnt_to_short_db = 0;
 #endif
-	for (i = 0; i < rxq->num_desc; i++) {
-		ret = mana_alloc_and_post_rx_wqe(rxq);
+	for (i = 0; i < count; i++) {
+		ret = mana_post_rx_wqe(rxq, mbufs[i]);
 		if (ret) {
 			DP_LOG(ERR, "failed to post RX ret = %d", ret);
-			return ret;
+			break;
 		}
 
 #ifdef RTE_ARCH_32
@@ -144,8 +151,16 @@  mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
 #endif
 	}
 
+	/* Free the remaining mbufs that are not posted */
+	while (i < count) {
+		rte_pktmbuf_free(mbufs[i]);
+		i++;
+	}
+
 	mana_rq_ring_doorbell(rxq);
 
+fail:
+	rte_free(mbufs);
 	return ret;
 }
 
@@ -404,7 +419,9 @@  mana_start_rx_queues(struct rte_eth_dev *dev)
 	}
 
 	for (i = 0; i < priv->num_queues; i++) {
-		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		ret = mana_alloc_and_post_rx_wqes(rxq, rxq->num_desc);
 		if (ret)
 			goto fail;
 	}
@@ -423,7 +440,7 @@  uint16_t
 mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	uint16_t pkt_received = 0;
-	uint16_t wqe_posted = 0;
+	uint16_t wqe_consumed = 0;
 	struct mana_rxq *rxq = dpdk_rxq;
 	struct mana_priv *priv = rxq->priv;
 	struct rte_mbuf *mbuf;
@@ -535,18 +552,23 @@  mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
 
-		/* Consume this request and post another request */
-		ret = mana_alloc_and_post_rx_wqe(rxq);
-		if (ret) {
-			DP_LOG(ERR, "failed to post rx wqe ret=%d", ret);
-			break;
-		}
-
-		wqe_posted++;
+		/* Record the number of the RX WQE we need to post to replenish
+		 * consumed RX requests
+		 */
+		wqe_consumed++;
 		if (pkt_received == pkts_n)
 			break;
 
 #ifdef RTE_ARCH_32
+		/* Always post WQE as soon as it's consumed for short DB */
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+			return pkt_received;
+		}
+		wqe_consumed = 0;
+
 		/* Ring short doorbell if approaching the wqe increment
 		 * limit.
 		 */
@@ -569,8 +591,12 @@  mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		goto repoll;
 	}
 
-	if (wqe_posted)
-		mana_rq_ring_doorbell(rxq);
+	if (wqe_consumed) {
+		ret = mana_alloc_and_post_rx_wqes(rxq, wqe_consumed);
+		if (ret)
+			DRV_LOG(ERR, "failed to post %d WQEs, ret %d",
+				wqe_consumed, ret);
+	}
 
 	return pkt_received;
 }
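
To make the receive-path change easier to follow, the pattern mana_rx_burst() moves to is sketched below in generic form: count the RX WQEs consumed during the burst and replenish them in one batch afterwards, instead of reposting one WQE per received packet. The replenish callback stands in for mana_alloc_and_post_rx_wqes() and is an assumption of the sketch, not a DPDK or driver API. On 32-bit builds the patch still replenishes inside the loop so the short doorbell stays within its WQE increment limit; the sketch omits that path.

#include <stdint.h>

typedef int (*replenish_fn)(void *rxq, uint32_t count);

static uint16_t
rx_burst_sketch(void *rxq, uint16_t pkts_n, replenish_fn replenish)
{
	uint16_t pkt_received = 0;
	uint16_t wqe_consumed = 0;

	while (pkt_received < pkts_n) {
		/* ... poll one completion and deliver one mbuf here ... */
		pkt_received++;
		wqe_consumed++;	/* count it instead of reposting immediately */
	}

	/* One bulk allocation and post for everything consumed in this burst */
	if (wqe_consumed)
		replenish(rxq, wqe_consumed);

	return pkt_received;
}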