[v3,18/18] net/bnxt: enable SSE mode for compressed CQE

Message ID 20231227042119.72469-19-ajit.khaparde@broadcom.com (mailing list archive)
State Accepted, archived
Delegated to: Ajit Khaparde
Series: bnxt patchset

Checks

Context                    Check    Description
ci/loongarch-compilation   success  Compilation OK
ci/checkpatch              success  coding style OK
ci/loongarch-unit-testing  success  Unit Testing PASS
ci/Intel-compilation       success  Compilation OK
ci/intel-Testing           success  Testing PASS
ci/intel-Functional        success  Functional PASS

Commit Message

Ajit Khaparde Dec. 27, 2023, 4:21 a.m. UTC
The P7 device family supports 16-byte Rx completions.
Enable SSE vector mode for compressed Rx CQE processing.

Signed-off-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
---
 drivers/net/bnxt/bnxt_ethdev.c       |  16 ++-
 drivers/net/bnxt/bnxt_rxr.h          |   2 +
 drivers/net/bnxt/bnxt_rxtx_vec_sse.c | 167 +++++++++++++++++++++++++--
 3 files changed, 173 insertions(+), 12 deletions(-)
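
The gist of the Rx-side change: a 16-byte compressed completion carries its
fields at different offsets than the regular 32-byte completion, so the new
crx_burst_vec_sse() supplies its own shuffle mask (RSS hash at descriptor
bytes 4-7 instead of 12-15) to the now-parameterized descs_to_mbufs(). As a
rough standalone illustration (editor's sketch, not part of the patch; the
sample descriptor bytes and the main() harness are invented), the mask
extracts the mbuf fields like this:

/*
 * Editor's sketch: how the compressed-CQE shuffle mask maps 16-byte
 * completion bytes onto mbuf descriptor fields. Sample contents are
 * made up for illustration. Build (x86): gcc -mssse3 crx_shuf_demo.c
 */
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>	/* _mm_shuffle_epi8 (SSSE3); pulls in SSE2 */

int main(void)
{
	/* Mask from crx_burst_vec_sse(): rss comes from descriptor
	 * bytes 4-7 in a 16-byte compressed completion, versus bytes
	 * 12-15 in the 32-byte completion path.
	 */
	const __m128i shuf_msk =
		_mm_set_epi8(7, 6, 5, 4,              /* rss */
			     0xFF, 0xFF,              /* vlan_tci (zeroes) */
			     3, 2,                    /* data_len */
			     0xFF, 0xFF, 3, 2,        /* pkt_len */
			     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */

	/* Fake completion: packet length 64 at bytes 2-3, RSS hash
	 * 0xdeadbeef at bytes 4-7 (little-endian). */
	uint8_t desc[16] = {0};
	desc[2] = 0x40;
	desc[4] = 0xef; desc[5] = 0xbe; desc[6] = 0xad; desc[7] = 0xde;

	__m128i fields = _mm_shuffle_epi8(_mm_loadu_si128((void *)desc),
					  shuf_msk);

	uint32_t w[4];
	_mm_storeu_si128((void *)w, fields);
	/* w[1] = pkt_len, w[2] low 16 bits = data_len, w[3] = rss */
	printf("pkt_len=%u data_len=%u rss=0x%08x\n",
	       w[1], w[2] & 0xffff, w[3]);
	return 0;
}

Hoisting the mask out of descs_to_mbufs() into its callers is what lets the
16-byte and 32-byte paths share the same field-extraction code.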
  

Patch

diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index bd8c7557dd..f9cd234bb6 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -1377,7 +1377,8 @@  bnxt_receive_function(struct rte_eth_dev *eth_dev)
 	 * asynchronous completions and receive completions can be placed in
 	 * the same completion ring.
 	 */
-	if (BNXT_TRUFLOW_EN(bp) || !BNXT_NUM_ASYNC_CPR(bp))
+	if ((BNXT_TRUFLOW_EN(bp) && !BNXT_CHIP_P7(bp)) ||
+	    !BNXT_NUM_ASYNC_CPR(bp))
 		goto use_scalar_rx;
 
 	/*
@@ -1410,12 +1411,19 @@  bnxt_receive_function(struct rte_eth_dev *eth_dev)
 			return bnxt_crx_pkts_vec_avx2;
 		return bnxt_recv_pkts_vec_avx2;
 	}
- #endif
+#endif
 	if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
 		PMD_DRV_LOG(INFO,
 			    "Using SSE vector mode receive for port %d\n",
 			    eth_dev->data->port_id);
 		bp->flags |= BNXT_FLAG_RX_VECTOR_PKT_MODE;
+		if (bnxt_compressed_rx_cqe_mode_enabled(bp)) {
+#if defined(RTE_ARCH_ARM64)
+			goto use_scalar_rx;
+#else
+			return bnxt_crx_pkts_vec;
+#endif
+		}
 		return bnxt_recv_pkts_vec;
 	}
 
@@ -1445,7 +1453,8 @@  bnxt_transmit_function(__rte_unused struct rte_eth_dev *eth_dev)
 	 */
 	if (eth_dev->data->scattered_rx ||
 	    (offloads & ~RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) ||
-	    BNXT_TRUFLOW_EN(bp) || bp->ieee_1588)
+	    (BNXT_TRUFLOW_EN(bp) && !BNXT_CHIP_P7(bp)) ||
+	    bp->ieee_1588)
 		goto use_scalar_tx;
 
 #if defined(RTE_ARCH_X86)
@@ -3125,6 +3134,7 @@  static const struct {
 } bnxt_rx_burst_info[] = {
 	{bnxt_recv_pkts,		"Scalar"},
 #if defined(RTE_ARCH_X86)
+	{bnxt_crx_pkts_vec,		"Vector SSE"},
 	{bnxt_recv_pkts_vec,		"Vector SSE"},
 #endif
 #if defined(RTE_ARCH_X86) && defined(CC_AVX2_SUPPORT)
diff --git a/drivers/net/bnxt/bnxt_rxr.h b/drivers/net/bnxt/bnxt_rxr.h
index a474a69ae3..d36cbded1d 100644
--- a/drivers/net/bnxt/bnxt_rxr.h
+++ b/drivers/net/bnxt/bnxt_rxr.h
@@ -156,6 +156,8 @@  int bnxt_flush_rx_cmp(struct bnxt_cp_ring_info *cpr);
 #if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM64)
 uint16_t bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts);
+uint16_t bnxt_crx_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			   uint16_t nb_pkts);
 int bnxt_rxq_vec_setup(struct bnxt_rx_queue *rxq);
 #endif
 
diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
index e99a547f58..220aa82073 100644
--- a/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
+++ b/drivers/net/bnxt/bnxt_rxtx_vec_sse.c
@@ -54,15 +54,9 @@ 
 
 static inline void
 descs_to_mbufs(__m128i mm_rxcmp[4], __m128i mm_rxcmp1[4],
-	       __m128i mbuf_init, struct rte_mbuf **mbuf,
-	       struct bnxt_rx_ring_info *rxr)
+	       __m128i mbuf_init, const __m128i shuf_msk,
+	       struct rte_mbuf **mbuf, struct bnxt_rx_ring_info *rxr)
 {
-	const __m128i shuf_msk =
-		_mm_set_epi8(15, 14, 13, 12,          /* rss */
-			     0xFF, 0xFF,              /* vlan_tci (zeroes) */
-			     3, 2,                    /* data_len */
-			     0xFF, 0xFF, 3, 2,        /* pkt_len */
-			     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
 	const __m128i flags_type_mask =
 		_mm_set1_epi32(RX_PKT_CMPL_FLAGS_ITYPE_MASK);
 	const __m128i flags2_mask1 =
@@ -166,6 +160,12 @@  recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	int nb_rx_pkts = 0;
 	const __m128i valid_target =
 		_mm_set1_epi32(!!(raw_cons & cp_ring_size));
+	const __m128i shuf_msk =
+		_mm_set_epi8(15, 14, 13, 12,          /* rss */
+			     0xFF, 0xFF,              /* vlan_tci (zeroes) */
+			     3, 2,                    /* data_len */
+			     0xFF, 0xFF, 3, 2,        /* pkt_len */
+			     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
 	int i;
 
 	/* If Rx Q was stopped return */
@@ -264,7 +264,7 @@  recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		if (num_valid == 0)
 			break;
 
-		descs_to_mbufs(rxcmp, rxcmp1, mbuf_init, &rx_pkts[nb_rx_pkts],
+		descs_to_mbufs(rxcmp, rxcmp1, mbuf_init, shuf_msk, &rx_pkts[nb_rx_pkts],
 			       rxr);
 		nb_rx_pkts += num_valid;
 
@@ -283,6 +283,134 @@  recv_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	return nb_rx_pkts;
 }
 
+static uint16_t
+crx_burst_vec_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct bnxt_rx_queue *rxq = rx_queue;
+	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
+	struct bnxt_cp_ring_info *cpr = rxq->cp_ring;
+	struct bnxt_rx_ring_info *rxr = rxq->rx_ring;
+	uint16_t cp_ring_size = cpr->cp_ring_struct->ring_size;
+	uint16_t rx_ring_size = rxr->rx_ring_struct->ring_size;
+	struct cmpl_base *cp_desc_ring = cpr->cp_desc_ring;
+	uint64_t valid, desc_valid_mask = ~0ULL;
+	const __m128i info3_v_mask = _mm_set1_epi32(CMPL_BASE_V);
+	uint32_t raw_cons = cpr->cp_raw_cons;
+	uint32_t cons, mbcons;
+	int nb_rx_pkts = 0;
+	const __m128i valid_target =
+		_mm_set1_epi32(!!(raw_cons & cp_ring_size));
+	const __m128i shuf_msk =
+		_mm_set_epi8(7, 6, 5, 4,          /* rss */
+			     0xFF, 0xFF,              /* vlan_tci (zeroes) */
+			     3, 2,                    /* data_len */
+			     0xFF, 0xFF, 3, 2,        /* pkt_len */
+			     0xFF, 0xFF, 0xFF, 0xFF); /* pkt_type (zeroes) */
+	int i;
+
+	/* If Rx Q was stopped return */
+	if (unlikely(!rxq->rx_started))
+		return 0;
+
+	if (rxq->rxrearm_nb >= rxq->rx_free_thresh)
+		bnxt_rxq_rearm(rxq, rxr);
+
+	cons = raw_cons & (cp_ring_size - 1);
+	mbcons = raw_cons & (rx_ring_size - 1);
+
+	/* Prefetch first four descriptor pairs. */
+	rte_prefetch0(&cp_desc_ring[cons]);
+
+	/* Ensure that we do not go past the ends of the rings. */
+	nb_pkts = RTE_MIN(nb_pkts, RTE_MIN(rx_ring_size - mbcons,
+					   cp_ring_size - cons));
+	/*
+	 * If we are at the end of the ring, ensure that descriptors after the
+	 * last valid entry are not treated as valid. Otherwise, force the
+	 * maximum number of packets to receive to be a multiple of the per-
+	 * loop count.
+	 */
+	if (nb_pkts < BNXT_RX_DESCS_PER_LOOP_VEC128) {
+		desc_valid_mask >>=
+			16 * (BNXT_RX_DESCS_PER_LOOP_VEC128 - nb_pkts);
+	} else {
+		nb_pkts =
+			RTE_ALIGN_FLOOR(nb_pkts, BNXT_RX_DESCS_PER_LOOP_VEC128);
+	}
+
+	/* Handle RX burst request */
+	for (i = 0; i < nb_pkts; i += BNXT_RX_DESCS_PER_LOOP_VEC128,
+				  cons += BNXT_RX_DESCS_PER_LOOP_VEC128,
+				  mbcons += BNXT_RX_DESCS_PER_LOOP_VEC128) {
+		__m128i rxcmp1[BNXT_RX_DESCS_PER_LOOP_VEC128];
+		__m128i rxcmp[BNXT_RX_DESCS_PER_LOOP_VEC128];
+		__m128i tmp0, tmp1, info3_v;
+		uint32_t num_valid;
+
+		/* Copy four mbuf pointers to output array. */
+		tmp0 = _mm_loadu_si128((void *)&rxr->rx_buf_ring[mbcons]);
+#ifdef RTE_ARCH_X86_64
+		tmp1 = _mm_loadu_si128((void *)&rxr->rx_buf_ring[mbcons + 2]);
+#endif
+		_mm_storeu_si128((void *)&rx_pkts[i], tmp0);
+#ifdef RTE_ARCH_X86_64
+		_mm_storeu_si128((void *)&rx_pkts[i + 2], tmp1);
+#endif
+
+		/* Prefetch four descriptor pairs for next iteration. */
+		if (i + BNXT_RX_DESCS_PER_LOOP_VEC128 < nb_pkts)
+			rte_prefetch0(&cp_desc_ring[cons + 4]);
+
+		/*
+		 * Load the four current descriptors into SSE registers in
+		 * reverse order to ensure consistent state.
+		 */
+		rxcmp[3] = _mm_load_si128((void *)&cp_desc_ring[cons + 3]);
+		rte_compiler_barrier();
+		rxcmp[2] = _mm_load_si128((void *)&cp_desc_ring[cons + 2]);
+		rte_compiler_barrier();
+		rxcmp[1] = _mm_load_si128((void *)&cp_desc_ring[cons + 1]);
+		rte_compiler_barrier();
+		rxcmp[0] = _mm_load_si128((void *)&cp_desc_ring[cons + 0]);
+
+		tmp1 = _mm_unpackhi_epi32(rxcmp[2], rxcmp[3]);
+		tmp0 = _mm_unpackhi_epi32(rxcmp[0], rxcmp[1]);
+
+		/* Isolate descriptor valid flags. */
+		info3_v = _mm_and_si128(_mm_unpacklo_epi64(tmp0, tmp1),
+					info3_v_mask);
+		info3_v = _mm_xor_si128(info3_v, valid_target);
+
+		/*
+		 * Pack the 128-bit array of valid descriptor flags into 64
+		 * bits and count the number of set bits in order to determine
+		 * the number of valid descriptors.
+		 */
+		valid = _mm_cvtsi128_si64(_mm_packs_epi32(info3_v, info3_v));
+		num_valid = rte_popcount64(valid & desc_valid_mask);
+
+		if (num_valid == 0)
+			break;
+
+		descs_to_mbufs(rxcmp, rxcmp1, mbuf_init, shuf_msk, &rx_pkts[nb_rx_pkts],
+			       rxr);
+		nb_rx_pkts += num_valid;
+
+		if (num_valid < BNXT_RX_DESCS_PER_LOOP_VEC128)
+			break;
+	}
+
+	if (nb_rx_pkts) {
+		rxr->rx_raw_prod = RING_ADV(rxr->rx_raw_prod, nb_rx_pkts);
+
+		rxq->rxrearm_nb += nb_rx_pkts;
+		cpr->cp_raw_cons += nb_rx_pkts;
+		bnxt_db_cq(cpr);
+	}
+
+	return nb_rx_pkts;
+}
+
 uint16_t
 bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
@@ -304,6 +432,27 @@  bnxt_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	return cnt + recv_burst_vec_sse(rx_queue, rx_pkts + cnt, nb_pkts);
 }
 
+uint16_t
+bnxt_crx_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	uint16_t cnt = 0;
+
+	while (nb_pkts > RTE_BNXT_MAX_RX_BURST) {
+		uint16_t burst;
+
+		burst = crx_burst_vec_sse(rx_queue, rx_pkts + cnt,
+					   RTE_BNXT_MAX_RX_BURST);
+
+		cnt += burst;
+		nb_pkts -= burst;
+
+		if (burst < RTE_BNXT_MAX_RX_BURST)
+			return cnt;
+	}
+
+	return cnt + crx_burst_vec_sse(rx_queue, rx_pkts + cnt, nb_pkts);
+}
+
 static void
 bnxt_handle_tx_cp_vec(struct bnxt_tx_queue *txq)
 {
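
For readers tracing the valid-descriptor accounting in crx_burst_vec_sse()
above, here is a minimal standalone sketch of the pack-and-popcount
reduction (editor's illustration, not part of the patch; the lane values
are invented and __builtin_popcountll stands in for rte_popcount64):

/*
 * Editor's sketch: each 32-bit lane holds a descriptor's valid bit
 * XORed against the expected phase; packing to 16-bit lanes and
 * popcounting the low 64 bits yields the number of valid completions.
 * Build (x86-64): gcc crx_valid_demo.c
 */
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

#define CMPL_BASE_V 0x1	/* completion valid bit */

int main(void)
{
	/* Descriptors 0 and 1 valid for this pass, 2 and 3 still stale. */
	__m128i info3_v = _mm_set_epi32(0, 0, 1, 1);
	/* Phase target: 0 on even passes around the ring, 1 on odd. */
	const __m128i valid_target = _mm_set1_epi32(0);
	uint64_t valid, desc_valid_mask = ~0ULL;

	info3_v = _mm_and_si128(info3_v, _mm_set1_epi32(CMPL_BASE_V));
	info3_v = _mm_xor_si128(info3_v, valid_target);

	/* Pack 4 x 32-bit flags into 4 x 16-bit lanes, then popcount. */
	valid = _mm_cvtsi128_si64(_mm_packs_epi32(info3_v, info3_v));
	printf("num_valid = %d\n",
	       __builtin_popcountll(valid & desc_valid_mask)); /* prints 2 */
	return 0;
}

The same reduction already drives the existing recv_burst_vec_sse() path;
the compressed path differs only in descriptor size and shuffle mask.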