net/iavf: remove extra copy step in Rx bulk path

Message ID 20220324221132.10055-1-kathleen.capella@arm.com (mailing list archive)
State Accepted, archived
Delegated to: Qi Zhang
Series: net/iavf: remove extra copy step in Rx bulk path

Checks

Context                           Check     Description
ci/checkpatch                     success   coding style OK
ci/iol-mellanox-Performance       success   Performance Testing PASS
ci/iol-intel-Performance          success   Performance Testing PASS
ci/iol-intel-Functional           success   Functional Testing PASS
ci/iol-abi-testing                success   Testing PASS
ci/github-robot: build            success   github build: passed
ci/iol-aarch64-unit-testing       success   Testing PASS
ci/iol-x86_64-compile-testing     success   Testing PASS
ci/iol-x86_64-unit-testing        success   Testing PASS
ci/iol-aarch64-compile-testing    success   Testing PASS
ci/iol-broadcom-Functional        fail      Functional Testing issues
ci/Intel-compilation              success   Compilation OK
ci/intel-Testing                  success   Testing PASS

Commit Message

Kathleen Capella March 24, 2022, 10:11 p.m. UTC
In the Rx bulk path, packets taken from the HW ring are first copied to the
stage data structure and only later copied from the stage to the rx_pkts
array. For the packets that the receiving function requests immediately,
this two-step process adds unnecessary overhead.

Instead, put the requested number of packets directly into the rx_pkts array
and stage only the excess packets. On N1SDP with 1 core/port, l3fwd saw up
to a 4% performance improvement. On x86, no difference in performance was
observed.

Signed-off-by: Kathleen Capella <kathleen.capella@arm.com>
Suggested-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 drivers/net/iavf/iavf_rxtx.c | 74 ++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 25 deletions(-)
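
To picture the change: the scan functions now split each group of completed
descriptors between the caller's rx_pkts[] array and the per-queue stage.
Below is a minimal sketch of that split in plain C; the helper name and the
flattened arguments are invented for illustration, and the driver's
descriptor-to-mbuf translation is omitted, so this is not the actual iavf
code.

#include <stdint.h>

/* The first nb_pkts ready packets go straight into the caller's rx_pkts[];
 * anything beyond that is kept back in the stage for a later call. */
uint16_t
scan_ring_sketch(void **ready, uint16_t nb_ready,
		 void **rx_pkts, uint16_t nb_pkts,
		 void **stage, uint16_t *nb_staged)
{
	uint16_t i, nb_rx = 0;

	*nb_staged = 0;
	for (i = 0; i < nb_ready; i++) {
		if (i < nb_pkts)
			rx_pkts[nb_rx++] = ready[i];       /* direct to caller */
		else
			stage[(*nb_staged)++] = ready[i];  /* excess, staged */
	}
	return nb_rx;
}

For instance, a burst request of 16 packets with 24 completed descriptors on
the ring returns 16 packets directly and stages 8 for the next call, instead
of staging all 24 and then copying 16 of them a second time.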
  

Comments

Kathleen Capella March 28, 2022, 5:31 p.m. UTC | #1
The ci/iol-broadcom-Functional failure reported here seems to be unrelated to this patch.

> -----Original Message-----
> From: Kathleen Capella <kathleen.capella@arm.com>
> Sent: Thursday, March 24, 2022 5:12 PM
> To: Jingjing Wu <jingjing.wu@intel.com>; Beilei Xing <beilei.xing@intel.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Dharmik Thakkar
> <Dharmik.Thakkar@arm.com>; Kathleen Capella
> <Kathleen.Capella@arm.com>
> Subject: [PATCH] net/iavf: remove extra copy step in Rx bulk path
> 
> In the Rx bulk path, packets which are taken from the HW ring, are first
> copied to the stage data structure and then later copied from the stage to the
> rx_pkts array. For the number of packets requested immediately by the
> receiving function, this two-step process adds extra overhead that is not
> necessary.
> 
> Instead, put requested number of packets directly into the rx_pkts array and
> only stage excess packets. On N1SDP with 1 core/port, l3fwd saw up to 4%
> performance improvement. On x86, no difference in performance was
> observed.
> 
> Signed-off-by: Kathleen Capella <kathleen.capella@arm.com>
> Suggested-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> ---
>  drivers/net/iavf/iavf_rxtx.c | 74 ++++++++++++++++++++++++------------
>  1 file changed, 49 insertions(+), 25 deletions(-)
> 
> [...]
  
Qi Zhang April 20, 2022, 12:47 p.m. UTC | #2
> -----Original Message-----
> From: Kathleen Capella <kathleen.capella@arm.com>
> Sent: Friday, March 25, 2022 6:12 AM
> To: Wu, Jingjing <jingjing.wu@intel.com>; Xing, Beilei <beilei.xing@intel.com>
> Cc: dev@dpdk.org; nd@arm.com; honnappa.nagarahalli@arm.com;
> dharmik.thakkar@arm.com; Kathleen Capella <kathleen.capella@arm.com>
> Subject: [PATCH] net/iavf: remove extra copy step in Rx bulk path
> 
> In the Rx bulk path, packets which are taken from the HW ring, are first
> copied to the stage data structure and then later copied from the stage to the
> rx_pkts array. For the number of packets requested immediately by the
> receiving function, this two-step process adds extra overhead that is not
> necessary.
> 
> Instead, put requested number of packets directly into the rx_pkts array and
> only stage excess packets. On N1SDP with 1 core/port, l3fwd saw up to 4%
> performance improvement. On x86, no difference in performance was
> observed.
> 
> Signed-off-by: Kathleen Capella <kathleen.capella@arm.com>
> Suggested-by: Dharmik Thakkar <dharmik.thakkar@arm.com>

Acked-by: Qi Zhang <qi.z.zhang@intel.com>

Applied to dpdk-next-net-intel.

Thanks
Qi
  

Patch

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 16e8d021f9..245dd225fd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -1813,7 +1813,9 @@  iavf_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 #define IAVF_LOOK_AHEAD 8
 static inline int
-iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
 {
 	volatile union iavf_rx_flex_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1822,6 +1824,7 @@  iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 	uint16_t pkt_len;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1867,8 +1870,6 @@  iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -1892,24 +1893,34 @@  iavf_rx_scan_hw_ring_flex_rxd(struct iavf_rx_queue *rxq)
 			pkt_flags = iavf_flex_rxd_error_to_pkt_flags(stat_err0);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else {
+				/* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
 }
 
 static inline int
-iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
+iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	volatile union iavf_rx_desc *rxdp;
 	struct rte_mbuf **rxep;
@@ -1919,6 +1930,7 @@  iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 	uint32_t rx_status;
 	int32_t s[IAVF_LOOK_AHEAD], var, nb_dd;
 	int32_t i, j, nb_rx = 0;
+	int32_t nb_staged = 0;
 	uint64_t pkt_flags;
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 
@@ -1970,8 +1982,6 @@  iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 #endif
 		}
 
-		nb_rx += nb_dd;
-
 		/* Translate descriptor info to mbuf parameters */
 		for (j = 0; j < nb_dd; j++) {
 			IAVF_DUMP_RX_DESC(rxq, &rxdp[j],
@@ -2000,17 +2010,26 @@  iavf_rx_scan_hw_ring(struct iavf_rx_queue *rxq)
 				pkt_flags |= iavf_rxd_build_fdir(&rxdp[j], mb);
 
 			mb->ol_flags |= pkt_flags;
-		}
 
-		for (j = 0; j < IAVF_LOOK_AHEAD; j++)
-			rxq->rx_stage[i + j] = rxep[j];
+			/* Put up to nb_pkts directly into buffers */
+			if ((i + j) < nb_pkts) {
+				rx_pkts[i + j] = rxep[j];
+				nb_rx++;
+			} else { /* Stage excess pkts received */
+				rxq->rx_stage[nb_staged] = rxep[j];
+				nb_staged++;
+			}
+		}
 
 		if (nb_dd != IAVF_LOOK_AHEAD)
 			break;
 	}
 
+	/* Update rxq->rx_nb_avail to reflect number of staged pkts */
+	rxq->rx_nb_avail = nb_staged;
+
 	/* Clear software ring entries */
-	for (i = 0; i < nb_rx; i++)
+	for (i = 0; i < (nb_rx + nb_staged); i++)
 		rxq->sw_ring[rxq->rx_tail + i] = NULL;
 
 	return nb_rx;
@@ -2098,23 +2117,31 @@  rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
 
 	if (rxq->rxdid >= IAVF_RXDID_FLEX_NIC && rxq->rxdid <= IAVF_RXDID_LAST)
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring_flex_rxd(rxq, rx_pkts, nb_pkts);
 	else
-		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq);
+		nb_rx = (uint16_t)iavf_rx_scan_hw_ring(rxq, rx_pkts, nb_pkts);
+
 	rxq->rx_next_avail = 0;
-	rxq->rx_nb_avail = nb_rx;
-	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_rx + rxq->rx_nb_avail);
 
 	if (rxq->rx_tail > rxq->rx_free_trigger) {
 		if (iavf_rx_alloc_bufs(rxq) != 0) {
-			uint16_t i, j;
+			uint16_t i, j, nb_staged;
 
 			/* TODO: count rx_mbuf_alloc_failed here */
 
+			nb_staged = rxq->rx_nb_avail;
 			rxq->rx_nb_avail = 0;
-			rxq->rx_tail = (uint16_t)(rxq->rx_tail - nb_rx);
-			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++)
+
+			rxq->rx_tail = (uint16_t)(rxq->rx_tail - (nb_rx + nb_staged));
+			for (i = 0, j = rxq->rx_tail; i < nb_rx; i++, j++) {
+				rxq->sw_ring[j] = rx_pkts[i];
+				rx_pkts[i] = NULL;
+			}
+			for (i = 0, j = rxq->rx_tail + nb_rx; i < nb_staged; i++, j++) {
 				rxq->sw_ring[j] = rxq->rx_stage[i];
+				rx_pkts[i] = NULL;
+			}
 
 			return 0;
 		}
@@ -2127,10 +2154,7 @@  rx_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		   rxq->port_id, rxq->queue_id,
 		   rxq->rx_tail, nb_rx);
 
-	if (rxq->rx_nb_avail)
-		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);
-
-	return 0;
+	return nb_rx;
 }
 
 static uint16_t
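
The staged packets are picked up on the following burst by the existing check
at the top of rx_recv_pkts(), which this patch leaves in place (shown here
only to complete the picture):

	if (rxq->rx_nb_avail)
		return iavf_rx_fill_from_stage(rxq, rx_pkts, nb_pkts);

So a call that finds more completed descriptors than it was asked for hands
back the requested packets immediately, and the surplus sits in rx_stage
until the next call drains it.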