[v3,1/2] net/octeon_ep: improve Rx performance

Message ID 20240202084324.9682-1-pbhagavatula@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Headers
Series [v3,1/2] net/octeon_ep: improve Rx performance |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Pavan Nikhilesh Bhagavatula Feb. 2, 2024, 8:43 a.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use mempool API instead of pktmbuf alloc to avoid mbuf reset
as it will be done by rearm on receive.
Reorder refill to avoid unnecessary write commits on mbuf data.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix compilation with distro gcc.
 v3 Changes:
 - Fix aarch32 compilation.

 drivers/net/octeon_ep/cnxk_ep_rx.c     |  4 +--
 drivers/net/octeon_ep/cnxk_ep_rx.h     | 13 ++++++---
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 20 +++++++-------
 drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 38 ++++++++++++++------------
 drivers/net/octeon_ep/otx_ep_rxtx.h    |  2 +-
 5 files changed, 42 insertions(+), 35 deletions(-)

--
2.25.1
  

Patch

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c
index f3e4fb27d1..7465e0a017 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.c
@@ -76,12 +76,12 @@  cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	uint16_t new_pkts;

 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
 		cnxk_ep_rx_refill(droq);

+	cnxk_ep_process_pkts_scalar(rx_pkts, droq, new_pkts);
+
 	return new_pkts;
 }

diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.h b/drivers/net/octeon_ep/cnxk_ep_rx.h
index e71fc0de5c..61263e651e 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx.h
+++ b/drivers/net/octeon_ep/cnxk_ep_rx.h
@@ -21,13 +21,16 @@  cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 	uint32_t i;
 	int rc;

-	rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count);
+	rc = rte_mempool_get_bulk(droq->mpool, (void **)&recv_buf_list[refill_idx], count);
 	if (unlikely(rc)) {
 		droq->stats.rx_alloc_failure++;
 		return rc;
 	}

 	for (i = 0; i < count; i++) {
+		rte_prefetch_non_temporal(&desc_ring[(refill_idx + 1) & 3]);
+		if (i < count - 1)
+			rte_prefetch_non_temporal(recv_buf_list[refill_idx + 1]);
 		buf = recv_buf_list[refill_idx];
 		desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf);
 		refill_idx++;
@@ -42,9 +45,9 @@  cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count)
 static inline void
 cnxk_ep_rx_refill(struct otx_ep_droq *droq)
 {
-	uint32_t desc_refilled = 0, count;
-	uint32_t nb_desc = droq->nb_desc;
+	const uint32_t nb_desc = droq->nb_desc;
 	uint32_t refill_idx = droq->refill_idx;
+	uint32_t desc_refilled = 0, count;
 	int rc;

 	if (unlikely(droq->read_idx == refill_idx))
@@ -128,6 +131,8 @@  cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts)
 	return RTE_MIN(nb_pkts, droq->pkts_pending);
 }

+#define cnxk_pktmbuf_mtod(m, t) ((t)(void *)((char *)(m)->buf_addr + RTE_PKTMBUF_HEADROOM))
+
 static __rte_always_inline void
 cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
@@ -147,7 +152,7 @@  cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq,
 			      void *));

 		mbuf = recv_buf_list[read_idx];
-		info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
+		info = cnxk_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *);
 		read_idx = otx_ep_incr_index(read_idx, 1, nb_desc);
 		pkt_len = rte_bswap16(info->length >> 48);
 		mbuf->pkt_len = pkt_len;
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
index ae4615e6da..47eb1d2ef7 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -49,7 +49,7 @@  cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		/* Load rearm data and packet length for shuffle. */
 		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
 			data[i] = _mm256_set_epi64x(0,
-				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				cnxk_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
 				0, rearm_data);

 		/* Shuffle data to its place and sum the packet length. */
@@ -81,15 +81,15 @@  cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
 	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -99,11 +99,6 @@  cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
-	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -119,5 +114,10 @@  cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
index 67c0c1c862..308c8b2288 100644
--- a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
@@ -18,13 +18,15 @@  static __rte_always_inline void
 cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
 {
 	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
-	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
-	uint32_t idx0, idx1, idx2, idx3;
+	uint32_t read_idx = droq->read_idx;
 	struct rte_mbuf *m0, *m1, *m2, *m3;
 	uint16_t nb_desc = droq->nb_desc;
+	uint32_t idx0, idx1, idx2, idx3;
 	uint16_t pkts = 0;
+	__m128i bytes;

 	idx0 = read_idx;
+	bytes = _mm_setzero_si128();
 	while (pkts < new_pkts) {
 		const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
 							0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
@@ -42,14 +44,14 @@  cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 		m3 = recv_buf_list[idx3];

 		/* Load packet size big-endian. */
-		s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
-				    rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
+		s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
+				    cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
 		/* Convert to little-endian. */
 		s01 = _mm_shuffle_epi8(s01, bswap_mask);
-		/* Horizontal add. */
-		bytes_rsvd += hadd(s01);
+		/* Vertical add, consolidate outside loop */
+		bytes = _mm_add_epi32(bytes, s01);
 		/* Segregate to packet length and data length. */
 		s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
 		s01 = _mm_shuffle_epi8(s01, cpy_mask);
@@ -79,7 +81,7 @@  cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq
 	droq->pkts_pending -= new_pkts;
 	/* Stats */
 	droq->stats.pkts_received += new_pkts;
-	droq->stats.bytes_received += bytes_rsvd;
+	droq->stats.bytes_received += hadd(bytes);
 }

 uint16_t __rte_noinline __rte_hot
@@ -88,15 +90,15 @@  cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
 	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
 	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
 	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
 	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);

-	/* Refill RX buffers */
-	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
-		cnxk_ep_rx_refill(droq);
-
 	return new_pkts;
 }

@@ -106,11 +108,6 @@  cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
 	uint16_t new_pkts, vpkts;

-	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
-	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
-	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
-	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
-
 	/* Refill RX buffers */
 	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
 		cnxk_ep_rx_refill(droq);
@@ -126,5 +123,10 @@  cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkt
 		rte_write32(0, droq->pkts_credit_reg);
 	}

+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
+	cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
 	return new_pkts;
 }
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index 0adcbc7814..8f306bd94e 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -21,7 +21,7 @@ 

 /* SDP_LENGTH_S specifies packet length and is of 8-byte size */
 #define OTX_EP_INFO_SIZE 8
-#define DROQ_REFILL_THRESHOLD 16
+#define DROQ_REFILL_THRESHOLD  64
 #define OTX2_SDP_REQUEST_ISM   (0x1ULL << 63)

 static inline uint32_t