[2/2] net/hns3: optimized Tx performance

Message ID 20211111133859.13705-3-humin29@huawei.com (mailing list archive)
State Changes Requested, archived
Delegated to: Ferruh Yigit
Series performance optimized for hns3 PMD

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/github-robot: build success github build: passed
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional fail Functional Testing issues

Commit Message

humin (Q) Nov. 11, 2021, 1:38 p.m. UTC
  From: Chengwen Feng <fengchengwen@huawei.com>

The PMD should check whether a descriptor has been completed by hardware
before freeing the corresponding mbuf. Currently, the common xmit path
tries to free mbufs every time it is invoked; because hardware may not
have finished sending yet, this results in many wasted queries of
descriptors that are not yet done.

This patch uses tx_free_thresh to control when mbufs are freed, and
frees tx_rs_thresh mbufs at a time.
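
To illustrate the idea (this is not the driver code), here is a minimal,
self-contained C sketch of threshold-gated, batched cleanup: mbufs are only
reclaimed when the number of free BDs drops below tx_free_thresh, and they
are released tx_rs_thresh at a time, only once hardware has cleared the VLD
bit for the whole batch. All names (sim_txq, sim_free_batch, ...) are
illustrative, not the PMD's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NB_TX_DESC     64
#define TX_RS_THRESH   8
#define TX_FREE_THRESH 16

struct sim_txq {
	bool     vld[NB_TX_DESC];   /* VLD still set => HW not done with BD */
	void    *mbuf[NB_TX_DESC];  /* software ring of mbuf pointers       */
	uint16_t next_to_clean;
	uint16_t tx_bd_ready;       /* number of free descriptors           */
};

/* Free one batch of TX_RS_THRESH mbufs, but only if every descriptor in
 * the batch has been completed by "hardware". Returns 0 on success.
 * (TX_RS_THRESH divides NB_TX_DESC, so a batch never wraps the ring.) */
static int
sim_free_batch(struct sim_txq *q)
{
	uint16_t start = q->next_to_clean;
	uint16_t i;

	for (i = 0; i < TX_RS_THRESH; i++)
		if (q->vld[start + i])      /* batch not fully done yet */
			return -1;

	for (i = 0; i < TX_RS_THRESH; i++)
		q->mbuf[start + i] = NULL;  /* stand-in for freeing the mbuf */

	q->tx_bd_ready += TX_RS_THRESH;
	q->next_to_clean = (start + TX_RS_THRESH) % NB_TX_DESC;
	return 0;
}

/* Burst entry point: cleanup is attempted only when free BDs run low. */
static void
sim_xmit_burst(struct sim_txq *q)
{
	if (q->tx_bd_ready < TX_FREE_THRESH)
		(void)sim_free_batch(q);
	/* ... descriptor filling would follow here ... */
}

int
main(void)
{
	struct sim_txq q;

	memset(&q, 0, sizeof(q));
	q.tx_bd_ready = TX_FREE_THRESH - 1;  /* force a cleanup attempt */
	sim_xmit_burst(&q);
	printf("free BDs after cleanup: %u\n", (unsigned)q.tx_bd_ready);
	return 0;
}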

This patch also reworks the PMD's tx_done_cleanup implementation, since
the mbuf free algorithm has changed.
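
For reference, applications reach this path through the ethdev API
rte_eth_tx_done_cleanup(). Below is a small usage sketch, to be called from
an already-initialized DPDK application; the helper name and port/queue IDs
are placeholders:

#include <stdio.h>
#include <rte_ethdev.h>

/* Ask the driver to free up to @free_cnt already-transmitted mbufs on the
 * given Tx queue. With this patch, hns3 releases them in tx_rs_thresh-sized
 * batches, so the count returned may differ from the count requested. */
static void
reclaim_tx_mbufs(uint16_t port_id, uint16_t queue_id, uint32_t free_cnt)
{
	int freed = rte_eth_tx_done_cleanup(port_id, queue_id, free_cnt);

	if (freed < 0)
		printf("tx_done_cleanup(port %u, queue %u) failed: %d\n",
		       port_id, queue_id, freed);
	else
		printf("freed %d Tx descriptors\n", freed);
}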

In the testpmd single-core MAC forwarding scenario, performance improves
by 10% for 64B packets on the Kunpeng920 platform.

Cc: stable@dpdk.org

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
---
 drivers/net/hns3/hns3_rxtx.c | 118 ++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 64 deletions(-)
  

Comments

Ferruh Yigit Nov. 15, 2021, 5:32 p.m. UTC | #1
On 11/11/2021 1:38 PM, Min Hu (Connor) wrote:
> From: Chengwen Feng <fengchengwen@huawei.com>
> 
> The PMD should check whether the descriptor is done by hardware before
> free the corresponding mbuf. Currently the common xmit algorithm will
> free mbuf every time when it's invoked. Because hardware may not have
> finished sending, this may lead to many invalid queries which are
> whether the descriptors are done.
> 

Hi Connor, Chengwen,

Since there will be a new version, can you please reword the above paragraph?

> This patch uses tx_free_thresh to control whether invoke free mbuf, and
> free tx_rs_thresh mbufs each time.
> 
> This patch also modifies the implementation of PMD's tx_done_cleanup
> because the mbuf free algorithm changed.
> 
> In the testpmd single core MAC forwarding scenario, the performance is
> improved by 10% at 64B on Kunpeng920 platform.
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
> Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
  

Patch

diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index 78227a139f..114b2f397f 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -3077,40 +3077,51 @@  hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	return 0;
 }
 
-static void
+static int
 hns3_tx_free_useless_buffer(struct hns3_tx_queue *txq)
 {
 	uint16_t tx_next_clean = txq->next_to_clean;
-	uint16_t tx_next_use   = txq->next_to_use;
-	uint16_t tx_bd_ready   = txq->tx_bd_ready;
-	uint16_t tx_bd_max     = txq->nb_tx_desc;
-	struct hns3_entry *tx_bak_pkt = &txq->sw_ring[tx_next_clean];
+	uint16_t tx_next_use = txq->next_to_use;
+	struct hns3_entry *tx_entry = &txq->sw_ring[tx_next_clean];
 	struct hns3_desc *desc = &txq->tx_ring[tx_next_clean];
-	struct rte_mbuf *mbuf;
+	int i;
 
-	while ((!(desc->tx.tp_fe_sc_vld_ra_ri &
-		rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))) &&
-		tx_next_use != tx_next_clean) {
-		mbuf = tx_bak_pkt->mbuf;
-		if (mbuf) {
-			rte_pktmbuf_free_seg(mbuf);
-			tx_bak_pkt->mbuf = NULL;
-		}
+	if (tx_next_use >= tx_next_clean &&
+	    tx_next_use < tx_next_clean + txq->tx_rs_thresh)
+		return -1;
 
-		desc++;
-		tx_bak_pkt++;
-		tx_next_clean++;
-		tx_bd_ready++;
-
-		if (tx_next_clean >= tx_bd_max) {
-			tx_next_clean = 0;
-			desc = txq->tx_ring;
-			tx_bak_pkt = txq->sw_ring;
-		}
+	/*
+	 * All mbufs can be released only when the VLD bits of all
+	 * descriptors in a batch are cleared.
+	 */
+	for (i = 0; i < txq->tx_rs_thresh; i++) {
+		if (desc[i].tx.tp_fe_sc_vld_ra_ri &
+			rte_le_to_cpu_16(BIT(HNS3_TXD_VLD_B)))
+			return -1;
 	}
 
-	txq->next_to_clean = tx_next_clean;
-	txq->tx_bd_ready   = tx_bd_ready;
+	for (i = 0; i < txq->tx_rs_thresh; i++) {
+		rte_pktmbuf_free_seg(tx_entry[i].mbuf);
+		tx_entry[i].mbuf = NULL;
+	}
+
+	/* Update numbers of available descriptor due to buffer freed */
+	txq->tx_bd_ready += txq->tx_rs_thresh;
+	txq->next_to_clean += txq->tx_rs_thresh;
+	if (txq->next_to_clean >= txq->nb_tx_desc)
+		txq->next_to_clean = 0;
+
+	return 0;
+}
+
+static inline int
+hns3_tx_free_required_buffer(struct hns3_tx_queue *txq, uint16_t required_bds)
+{
+	while (required_bds > txq->tx_bd_ready) {
+		if (hns3_tx_free_useless_buffer(txq) != 0)
+			return -1;
+	}
+	return 0;
 }
 
 int
@@ -4147,8 +4158,8 @@  hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	uint16_t nb_tx;
 	uint16_t i;
 
-	/* free useless buffer */
-	hns3_tx_free_useless_buffer(txq);
+	if (txq->tx_bd_ready < txq->tx_free_thresh)
+		(void)hns3_tx_free_useless_buffer(txq);
 
 	tx_next_use   = txq->next_to_use;
 	tx_bd_max     = txq->nb_tx_desc;
@@ -4163,11 +4174,14 @@  hns3_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		nb_buf = tx_pkt->nb_segs;
 
 		if (nb_buf > txq->tx_bd_ready) {
-			txq->dfx_stats.queue_full_cnt++;
-			if (nb_tx == 0)
-				return 0;
-
-			goto end_of_tx;
+			/* Try to release the required MBUF, but avoid releasing
+			 * all MBUFs, otherwise, the MBUFs will be released for
+			 * a long time and may cause jitter.
+			 */
+			if (hns3_tx_free_required_buffer(txq, nb_buf) != 0) {
+				txq->dfx_stats.queue_full_cnt++;
+				goto end_of_tx;
+			}
 		}
 
 		/*
@@ -4577,46 +4591,22 @@  hns3_dev_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id)
 static int
 hns3_tx_done_cleanup_full(struct hns3_tx_queue *txq, uint32_t free_cnt)
 {
-	uint16_t next_to_clean = txq->next_to_clean;
-	uint16_t next_to_use   = txq->next_to_use;
-	uint16_t tx_bd_ready   = txq->tx_bd_ready;
-	struct hns3_entry *tx_pkt = &txq->sw_ring[next_to_clean];
-	struct hns3_desc *desc = &txq->tx_ring[next_to_clean];
+	uint16_t round_free_cnt;
 	uint32_t idx;
 
 	if (free_cnt == 0 || free_cnt > txq->nb_tx_desc)
 		free_cnt = txq->nb_tx_desc;
 
-	for (idx = 0; idx < free_cnt; idx++) {
-		if (next_to_clean == next_to_use)
-			break;
+	if (txq->tx_rs_thresh == 0)
+		return 0;
 
-		if (desc->tx.tp_fe_sc_vld_ra_ri &
-		    rte_cpu_to_le_16(BIT(HNS3_TXD_VLD_B)))
+	round_free_cnt = roundup(free_cnt, txq->tx_rs_thresh);
+	for (idx = 0; idx < round_free_cnt; idx += txq->tx_rs_thresh) {
+		if (hns3_tx_free_useless_buffer(txq) != 0)
 			break;
-
-		if (tx_pkt->mbuf != NULL) {
-			rte_pktmbuf_free_seg(tx_pkt->mbuf);
-			tx_pkt->mbuf = NULL;
-		}
-
-		next_to_clean++;
-		tx_bd_ready++;
-		tx_pkt++;
-		desc++;
-		if (next_to_clean == txq->nb_tx_desc) {
-			tx_pkt = txq->sw_ring;
-			desc = txq->tx_ring;
-			next_to_clean = 0;
-		}
-	}
-
-	if (idx > 0) {
-		txq->next_to_clean = next_to_clean;
-		txq->tx_bd_ready = tx_bd_ready;
 	}
 
-	return (int)idx;
+	return RTE_MIN(idx, free_cnt);
 }
 
 int