[v3,4/4] net/mlx5: allow use allocated mbuf with external buffer
diff mbox series

Message ID 1578993305-15165-5-git-send-email-viacheslavo@mellanox.com
State Superseded, archived
Delegated to: Thomas Monjalon
Headers show
Series
  • mbuf: detach mbuf with pinned external buffer
Related show

Checks

Context Check Description
ci/travis-robot warning Travis build: failed
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Slava Ovsiienko Jan. 14, 2020, 9:15 a.m. UTC
In the Rx datapath the flags in the newly allocated mbufs
are all explicitly cleared but the EXT_ATTACHED_MBUF must be
preserved. It would allow to use mbuf pools with pre-attached
external data buffers.

The vectorized rx_burst routines are updated in order to
inherit the EXT_ATTACHED_MBUF from mbuf pool private
RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF flag.

Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxq.c              |  7 ++++++-
 drivers/net/mlx5/mlx5_rxtx.c             |  2 +-
 drivers/net/mlx5/mlx5_rxtx.h             |  2 +-
 drivers/net/mlx5/mlx5_rxtx_vec.h         | 14 ++++----------
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h |  5 ++---
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 29 +++++++++++++++--------------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     |  2 +-
 7 files changed, 30 insertions(+), 31 deletions(-)

Patch
diff mbox series

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index ca25e32..c87ce15 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -225,6 +225,9 @@ 
 	if (mlx5_rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
 		struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
 		struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
+		struct rte_pktmbuf_pool_private *priv =
+			(struct rte_pktmbuf_pool_private *)
+				rte_mempool_get_priv(rxq_ctrl->rxq.mp);
 		int j;
 
 		/* Initialize default rearm_data for vPMD. */
@@ -232,13 +235,15 @@ 
 		rte_mbuf_refcnt_set(mbuf_init, 1);
 		mbuf_init->nb_segs = 1;
 		mbuf_init->port = rxq->port_id;
+		if (priv->flags & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF)
+			mbuf_init->ol_flags = EXT_ATTACHED_MBUF;
 		/*
 		 * prevent compiler reordering:
 		 * rearm_data covers previous fields.
 		 */
 		rte_compiler_barrier();
 		rxq->mbuf_initializer =
-			*(uint64_t *)&mbuf_init->rearm_data;
+			*(rte_xmm_t *)&mbuf_init->rearm_data;
 		/* Padding with a fake mbuf for vectorized Rx. */
 		for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
 			(*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b11c5eb..fdc7529 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1337,7 +1337,7 @@  enum mlx5_txcmp_code {
 			}
 			pkt = seg;
 			assert(len >= (rxq->crc_present << 2));
-			pkt->ol_flags = 0;
+			pkt->ol_flags &= EXT_ATTACHED_MBUF;
 			/* If compressed, take hash result from mini-CQE. */
 			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
 							cqe->rx_hash_res :
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index e362b4a..24fa038 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -144,7 +144,7 @@  struct mlx5_rxq_data {
 	struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
 	uint16_t idx; /* Queue index. */
 	struct mlx5_rxq_stats stats;
-	uint64_t mbuf_initializer; /* Default rearm_data for vectorized Rx. */
+	rte_xmm_t mbuf_initializer; /* Default rearm/flags for vectorized Rx. */
 	struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */
 	void *cq_uar; /* CQ user access region. */
 	uint32_t cqn; /* CQ number. */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index 85e0bd5..d8c07f2 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -97,18 +97,12 @@ 
 		void *buf_addr;
 
 		/*
-		 * Load the virtual address for Rx WQE. non-x86 processors
-		 * (mostly RISC such as ARM and Power) are more vulnerable to
-		 * load stall. For x86, reducing the number of instructions
-		 * seems to matter most.
+		 * In order to support the mbufs with external attached
+		 * data buffer we should use the buf_addr pointer instead of
+		 * rte_mbuf_buf_addr(). It touches the mbuf itself and may
+		 * impact the performance.
 		 */
-#ifdef RTE_ARCH_X86_64
 		buf_addr = elts[i]->buf_addr;
-		assert(buf_addr == rte_mbuf_buf_addr(elts[i], rxq->mp));
-#else
-		buf_addr = rte_mbuf_buf_addr(elts[i], rxq->mp);
-		assert(buf_addr == elts[i]->buf_addr);
-#endif
 		wq[i].addr = rte_cpu_to_be_64((uintptr_t)buf_addr +
 					      RTE_PKTMBUF_HEADROOM);
 		/* If there's only one MR, no need to replace LKey in WQE. */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index 8e79883..9e5c6ee 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -344,9 +344,8 @@ 
 		PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 		PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED};
 	const vector unsigned char mbuf_init =
-		(vector unsigned char)(vector unsigned long){
-		*(__attribute__((__aligned__(8))) unsigned long *)
-		&rxq->mbuf_initializer, 0LL};
+		(vector unsigned char)vec_vsx_ld
+			(0, (vector unsigned char *)&rxq->mbuf_initializer);
 	const vector unsigned short rearm_sel_mask =
 		(vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
 	vector unsigned char rearm0, rearm1, rearm2, rearm3;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 86785c7..332e9ac 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -264,8 +264,8 @@ 
 	const uint32x4_t cv_mask =
 		vdupq_n_u32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 			    PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
-	const uint64x1_t mbuf_init = vld1_u64(&rxq->mbuf_initializer);
-	const uint64x1_t r32_mask = vcreate_u64(0xffffffff);
+	const uint64x2_t mbuf_init = vld1q_u64
+				((const uint64_t *)&rxq->mbuf_initializer);
 	uint64x2_t rearm0, rearm1, rearm2, rearm3;
 	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;
 
@@ -326,18 +326,19 @@ 
 	/* Merge to ol_flags. */
 	ol_flags = vorrq_u32(ol_flags, cv_flags);
 	/* Merge mbuf_init and ol_flags, and store. */
-	rearm0 = vcombine_u64(mbuf_init,
-			      vshr_n_u64(vget_high_u64(vreinterpretq_u64_u32(
-						       ol_flags)), 32));
-	rearm1 = vcombine_u64(mbuf_init,
-			      vand_u64(vget_high_u64(vreinterpretq_u64_u32(
-						     ol_flags)), r32_mask));
-	rearm2 = vcombine_u64(mbuf_init,
-			      vshr_n_u64(vget_low_u64(vreinterpretq_u64_u32(
-						      ol_flags)), 32));
-	rearm3 = vcombine_u64(mbuf_init,
-			      vand_u64(vget_low_u64(vreinterpretq_u64_u32(
-						    ol_flags)), r32_mask));
+	rearm0 = vreinterpretq_u64_u32(vsetq_lane_u32
+					(vgetq_lane_u32(ol_flags, 3),
+					 vreinterpretq_u32_u64(mbuf_init), 2));
+	rearm1 = vreinterpretq_u64_u32(vsetq_lane_u32
+					(vgetq_lane_u32(ol_flags, 2),
+					 vreinterpretq_u32_u64(mbuf_init), 2));
+	rearm2 = vreinterpretq_u64_u32(vsetq_lane_u32
+					(vgetq_lane_u32(ol_flags, 1),
+					 vreinterpretq_u32_u64(mbuf_init), 2));
+	rearm3 = vreinterpretq_u64_u32(vsetq_lane_u32
+					(vgetq_lane_u32(ol_flags, 0),
+					 vreinterpretq_u32_u64(mbuf_init), 2));
+
 	vst1q_u64((void *)&pkts[0]->rearm_data, rearm0);
 	vst1q_u64((void *)&pkts[1]->rearm_data, rearm1);
 	vst1q_u64((void *)&pkts[2]->rearm_data, rearm2);
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 35b7761..07d40d5 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -259,7 +259,7 @@ 
 			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
 	const __m128i mbuf_init =
-		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
+		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);
 	__m128i rearm0, rearm1, rearm2, rearm3;
 	uint8_t pt_idx0, pt_idx1, pt_idx2, pt_idx3;