[4/5] net/mlx5: support enhanced CQE zipping in vector Rx burst

Message ID 20230228164310.807594-5-akozyrev@nvidia.com (mailing list archive)
State Accepted, archived
Delegated to: Raslan Darawsheh
Series: net/mlx5: enhanced CQE compression layout

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Alexander Kozyrev Feb. 28, 2023, 4:43 p.m. UTC
  Add Enhanced CQE compression support to the vectorized Rx burst routines.
Adopt the same algorithm that the scalar Rx burst routines use today
(illustrated by the sketch after the diffstat):
1. Retrieve the validity_iteration_count from CQEs and use it, instead of
the owner_bit, to check whether a CQE is ready to be processed.
2. Do not invalidate reserved CQEs between miniCQE arrays.
3. Copy the title packet from the last processed uncompressed CQE,
since we will need it later to build packets from zipped CQEs.
4. Skip the regular CQE processing and go straight to the CQE unzip
function when the very first CQE is compressed, to save CPU time.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
 drivers/net/mlx5/mlx5_rx.h               |   1 +
 drivers/net/mlx5/mlx5_rxtx_vec.c         |  24 ++++-
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 108 ++++++++++++++++-------
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    |  91 +++++++++++++------
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     |  94 ++++++++++++++------
 5 files changed, 232 insertions(+), 86 deletions(-)
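
For reference, the readiness check and the chained-unzip loop described in the
commit message can be pictured with the short C sketch below. It is an
illustration only, not the mlx5 PMD code: the type and all names (toy_cqe,
toy_cqe_ready, toy_cqe_compressed, toy_unzip, TOY_FMT_COMPRESSED) and the field
layout are hypothetical, and ring wrap-around, mbuf construction and error
handling are omitted. The real logic lives in check_cqe_iteration() and in the
rxq_cq_decompress_v()/rxq_cq_process_v() changes in the patch below.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical, simplified CQE holding only the fields the sketch needs. */
struct toy_cqe {
	uint8_t num_minis;                /* miniCQEs in the array, minus one */
	uint8_t validity_iteration_count; /* written by HW on completion */
	uint8_t op_own;                   /* opcode/format byte */
};

#define TOY_FMT_COMPRESSED 0x3 /* hypothetical "compressed CQE" format value */

/*
 * A CQE is ready when the iteration count the HW wrote into it matches the
 * number of times the SW consumer index has wrapped the CQ, so no owner-bit
 * comparison and no CQE invalidation are needed.
 */
static inline bool
toy_cqe_ready(const struct toy_cqe *cqe, uint32_t cq_ci, unsigned int log_cqe_n)
{
	return cqe->validity_iteration_count == (uint8_t)(cq_ci >> log_cqe_n);
}

static inline bool
toy_cqe_compressed(const struct toy_cqe *cqe)
{
	return ((cqe->op_own >> 2) & 0x3) == TOY_FMT_COMPRESSED;
}

/*
 * Shape of the unzip loop: consume one miniCQE array, then keep going as long
 * as the next ready CQE is also compressed, instead of returning and
 * invalidating the reserved CQEs in between (ring wrap-around omitted).
 */
static uint16_t
toy_unzip(const struct toy_cqe *cq, uint32_t cq_ci, unsigned int log_cqe_n)
{
	uint16_t pkts_n = 0;

	for (;;) {
		uint16_t mcqe_n = (uint16_t)cq->num_minis + 1;

		/* ... build mcqe_n mbufs from the saved title packet ... */
		pkts_n += mcqe_n;
		cq += mcqe_n; /* reserved CQEs are simply skipped */
		if (!toy_cqe_ready(cq, cq_ci + pkts_n, log_cqe_n) ||
		    !toy_cqe_compressed(cq))
			break;
	}
	return pkts_n;
}

The title packet copied in rxq_burst_v()/rxq_burst_mprq_v() plays the role of
the "saved title packet" above: its rearm and descriptor fields are reused for
every mbuf rebuilt from a miniCQE, with per-packet fields such as the byte
count, RSS hash and flow tag taken from the miniCQE itself.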
  

Comments

Slava Ovsiienko March 6, 2023, 1:13 p.m. UTC | #1
> -----Original Message-----
> From: Alexander Kozyrev <akozyrev@nvidia.com>
> Sent: Tuesday, February 28, 2023 18:43
> To: dev@dpdk.org
> Cc: Raslan Darawsheh <rasland@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; Matan Azrad <matan@nvidia.com>
> Subject: [PATCH 4/5] net/mlx5: support enhanced CQE zipping in vector Rx
> burst
> 
> Add Enhanced CQE compression support to vectorized Rx burst routines.
> Adopt the same algorithm as scalar Rx burst routines have today.
> 1. Retrieve the validity_iteration_count from CQEs and use it to check if the CQE
> is ready to be processed instead of the owner_bit.
> 2. Do not invalidate reserved CQEs between miniCQE arrays.
> 3. Copy the title packet from the last processed uncompressed CQE since we will
> need it later to build packets from zipped CQEs.
> 4. Skip the regular CQE processing and go straight to the CQE unzip function in
> case the very first CQE is compressed to save CPU time.
> 
> Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
  

Patch

diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index 143685c6ab..8b87adad36 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -122,6 +122,7 @@  struct mlx5_rxq_data {
 	volatile struct mlx5_cqe(*cqes)[];
 	struct mlx5_cqe title_cqe; /* Title CQE for CQE compression. */
 	struct rte_mbuf *(*elts)[];
+	struct rte_mbuf title_pkt; /* Title packet for CQE compression. */
 	struct mlx5_mprq_buf *(*mprq_bufs)[];
 	struct rte_mempool *mp;
 	struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 667475a93e..2363d7ed27 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -290,13 +290,14 @@  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	const uint16_t q_mask = q_n - 1;
 	const uint16_t e_n = 1 << rxq->elts_n;
 	const uint16_t e_mask = e_n - 1;
-	volatile struct mlx5_cqe *cq;
+	volatile struct mlx5_cqe *cq, *next;
 	struct rte_mbuf **elts;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
 	uint16_t rcvd_pkt = 0;
 	unsigned int cq_idx = rxq->cq_ci & q_mask;
 	unsigned int elts_idx;
+	int ret;
 
 	MLX5_ASSERT(rxq->sges_n == 0);
 	MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
@@ -342,6 +343,15 @@  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	rxq->cq_ci += nocmp_n;
 	rxq->rq_pi += nocmp_n;
 	rcvd_pkt += nocmp_n;
+	/* Copy title packet for future compressed sessions. */
+	if (rxq->cqe_comp_layout) {
+		next = &(*rxq->cqes)[rxq->cq_ci & q_mask];
+		ret = check_cqe_iteration(next,	rxq->cqe_n, rxq->cq_ci);
+		if (ret != MLX5_CQE_STATUS_SW_OWN ||
+		    MLX5_CQE_FORMAT(next->op_own) == MLX5_COMPRESSED)
+			rte_memcpy(&rxq->title_pkt, elts[nocmp_n - 1],
+				   sizeof(struct rte_mbuf));
+	}
 	/* Decompress the last CQE if compressed. */
 	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
 		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
@@ -431,7 +441,7 @@  rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	const uint32_t strd_n = RTE_BIT32(rxq->log_strd_num);
 	const uint32_t elts_n = wqe_n * strd_n;
 	const uint32_t elts_mask = elts_n - 1;
-	volatile struct mlx5_cqe *cq;
+	volatile struct mlx5_cqe *cq, *next;
 	struct rte_mbuf **elts;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
@@ -439,6 +449,7 @@  rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	uint16_t cp_pkt = 0;
 	unsigned int cq_idx = rxq->cq_ci & q_mask;
 	unsigned int elts_idx;
+	int ret;
 
 	MLX5_ASSERT(rxq->sges_n == 0);
 	cq = &(*rxq->cqes)[cq_idx];
@@ -482,6 +493,15 @@  rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
 	MLX5_ASSERT(nocmp_n <= pkts_n);
 	cp_pkt = rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n);
 	rcvd_pkt += cp_pkt;
+	/* Copy title packet for future compressed sessions. */
+	if (rxq->cqe_comp_layout) {
+		next = &(*rxq->cqes)[rxq->cq_ci & q_mask];
+		ret = check_cqe_iteration(next,	rxq->cqe_n, rxq->cq_ci);
+		if (ret != MLX5_CQE_STATUS_SW_OWN ||
+		    MLX5_CQE_FORMAT(next->op_own) == MLX5_COMPRESSED)
+			rte_memcpy(&rxq->title_pkt, elts[nocmp_n - 1],
+				   sizeof(struct rte_mbuf));
+	}
 	/* Decompress the last CQE if compressed. */
 	if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP) {
 		MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index 204d17a8f2..14ffff26f4 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -76,8 +76,10 @@  static inline uint16_t
 rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		    struct rte_mbuf **elts)
 {
-	volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
-	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+	volatile struct mlx5_mini_cqe8 *mcq =
+		(void *)&(cq + !rxq->cqe_comp_layout)->pkt_info;
+	/* Title packet is pre-built. */
+	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
 	const __vector unsigned char zero = (__vector unsigned char){0};
 	/* Mask to shuffle from extracted mini CQE to mbuf. */
 	const __vector unsigned char shuf_mask1 = (__vector unsigned char){
@@ -93,8 +95,10 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			-1, -1,           /* skip vlan_tci */
 			11, 10,  9,  8};  /* bswap32, rss */
 	/* Restore the compressed count. Must be 16 bits. */
-	const uint16_t mcqe_n = t_pkt->data_len +
-		(rxq->crc_present * RTE_ETHER_CRC_LEN);
+	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
+		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
+		t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
+	uint16_t pkts_n = mcqe_n;
 	const __vector unsigned char rearm =
 		(__vector unsigned char)vec_vsx_ld(0,
 		(signed int const *)&t_pkt->rearm_data);
@@ -132,6 +136,9 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 * D. store rx_descriptor_fields1.
 	 * E. store flow tag (rte_flow mark).
 	 */
+cycle:
+	if (rxq->cqe_comp_layout)
+		rte_prefetch0((void *)(cq + mcqe_n));
 	for (pos = 0; pos < mcqe_n; ) {
 		__vector unsigned char mcqe1, mcqe2;
 		__vector unsigned char rxdf1, rxdf2;
@@ -154,9 +161,10 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		const __vector unsigned long shmax = {64, 64};
 #endif
 
-		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-			if (likely(pos + i < mcqe_n))
-				rte_prefetch0((void *)(cq + pos + i));
+		if (!rxq->cqe_comp_layout)
+			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+				if (likely(pos + i < mcqe_n))
+					rte_prefetch0((void *)(cq + pos + i));
 		/* A.1 load mCQEs into a 128bit register. */
 		mcqe1 = (__vector unsigned char)vec_vsx_ld(0,
 			(signed int const *)&mcq[pos % 8]);
@@ -488,25 +496,43 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 
 		pos += MLX5_VPMD_DESCS_PER_LOOP;
 		/* Move to next CQE and invalidate consumed CQEs. */
-		if (!(pos & 0x7) && pos < mcqe_n) {
-			if (pos + 8 < mcqe_n)
-				rte_prefetch0((void *)(cq + pos + 8));
-			mcq = (void *)&(cq + pos)->pkt_info;
-			for (i = 0; i < 8; ++i)
-				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+		if (!rxq->cqe_comp_layout) {
+			if (!(pos & 0x7) && pos < mcqe_n) {
+				if (pos + 8 < mcqe_n)
+					rte_prefetch0((void *)(cq + pos + 8));
+				mcq = (void *)&(cq + pos)->pkt_info;
+				for (i = 0; i < 8; ++i)
+					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+			}
 		}
 	}
 
-	/* Invalidate the rest of CQEs. */
-	for (; inv < mcqe_n; ++inv)
-		cq[inv].op_own = MLX5_CQE_INVALIDATE;
+	if (rxq->cqe_comp_layout) {
+		int ret;
+		/* Keep unzipping if the next CQE is the miniCQE array. */
+		cq = &cq[mcqe_n];
+		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
+		if (ret == MLX5_CQE_STATUS_SW_OWN &&
+		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
+			pos = 0;
+			elts = &elts[mcqe_n];
+			mcq = (void *)cq;
+			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
+			pkts_n += mcqe_n;
+			goto cycle;
+		}
+	} else {
+		/* Invalidate the rest of CQEs. */
+		for (; inv < pkts_n; ++inv)
+			cq[inv].op_own = MLX5_CQE_INVALIDATE;
+	}
 
 #ifdef MLX5_PMD_SOFT_COUNTERS
-	rxq->stats.ipackets += mcqe_n;
+	rxq->stats.ipackets += pkts_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
 
-	return mcqe_n;
+	return pkts_n;
 }
 
 /**
@@ -787,9 +813,13 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	uint64_t n = 0;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
-	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+	const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
+	unsigned int own = !(rxq->cq_ci & (q_mask + 1));
 	const __vector unsigned char zero = (__vector unsigned char){0};
 	const __vector unsigned char ones = vec_splat_u8(-1);
+	const __vector unsigned char vic_check =
+		(__vector unsigned char)(__vector unsigned long){
+		0x00ff000000ff0000LL, 0x00ff000000ff0000LL};
 	const __vector unsigned char owner_check =
 		(__vector unsigned char)(__vector unsigned long){
 		0x0100000001000000LL, 0x0100000001000000LL};
@@ -837,7 +867,16 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		(__vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
 	const __vector unsigned short cqe_sel_mask2 =
 		(__vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
-
+	const __vector unsigned char validity = (__vector unsigned char){
+		0,  0,  vic,  0,
+		0,  0,  vic,  0,
+		0,  0,  vic,  0,
+		0,  0,  vic,  0};
+	const __vector unsigned char ownership = (__vector unsigned char){
+		0,  0,  0,  own,
+		0,  0,  0,  own,
+		0,  0,  0,  own,
+		0,  0,  0,  own};
 	/*
 	 * A. load first Qword (8bytes) in one loop.
 	 * B. copy 4 mbuf pointers from elts ring to returning pkts.
@@ -848,7 +887,7 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 *          uint8_t  pkt_info;
 	 *          uint8_t  flow_tag[3];
 	 *          uint16_t byte_cnt;
-	 *          uint8_t  rsvd4;
+	 *          uint8_t  validity_iteration_count;
 	 *          uint8_t  op_own;
 	 *          uint16_t hdr_type_etc;
 	 *          uint16_t vlan_info;
@@ -1082,17 +1121,25 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		*(__vector unsigned char *)
 			&pkts[pos]->pkt_len = pkt_mb0;
 
-		/* E.2 flip owner bit to mark CQEs from last round. */
-		owner_mask = (__vector unsigned char)
-			vec_and((__vector unsigned long)op_own,
-			(__vector unsigned long)owner_check);
-		if (ownership)
+		/* E.2 mask out CQEs belonging to HW. */
+		if (rxq->cqe_comp_layout) {
+			owner_mask = (__vector unsigned char)
+				vec_and((__vector unsigned long)op_own,
+				(__vector unsigned long)vic_check);
+			owner_mask = (__vector unsigned char)
+				vec_cmpeq((__vector unsigned int)owner_mask,
+				(__vector unsigned int)validity);
 			owner_mask = (__vector unsigned char)
 				vec_xor((__vector unsigned long)owner_mask,
+				(__vector unsigned long)ones);
+		} else {
+			owner_mask = (__vector unsigned char)
+				vec_and((__vector unsigned long)op_own,
 				(__vector unsigned long)owner_check);
-		owner_mask = (__vector unsigned char)
-			vec_cmpeq((__vector unsigned int)owner_mask,
-			(__vector unsigned int)owner_check);
+			owner_mask = (__vector unsigned char)
+				vec_cmpeq((__vector unsigned int)owner_mask,
+				(__vector unsigned int)ownership);
+		}
 		owner_mask = (__vector unsigned char)
 			vec_packs((__vector unsigned int)owner_mask,
 			(__vector unsigned int)zero);
@@ -1174,7 +1221,8 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			(__vector unsigned long)mask);
 
 		/* D.3 check error in opcode. */
-		adj = (comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
+		adj = (!rxq->cqe_comp_layout &&
+		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
 		mask = (__vector unsigned char)(__vector unsigned long){
 			(adj * sizeof(uint16_t) * 8), 0};
 		lshift = vec_splat((__vector unsigned long)mask, 0);
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index 41b9cf5444..75e8ed7e5a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -71,8 +71,10 @@  static inline uint16_t
 rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		    struct rte_mbuf **elts)
 {
-	volatile struct mlx5_mini_cqe8 *mcq = (void *)&(cq + 1)->pkt_info;
-	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+	volatile struct mlx5_mini_cqe8 *mcq =
+		(void *)&(cq + !rxq->cqe_comp_layout)->pkt_info;
+	/* Title packet is pre-built. */
+	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
 	unsigned int pos;
 	unsigned int i;
 	unsigned int inv = 0;
@@ -92,8 +94,10 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		11, 10,  9,  8  /* hash.rss, bswap32 */
 	};
 	/* Restore the compressed count. Must be 16 bits. */
-	const uint16_t mcqe_n = t_pkt->data_len +
-				(rxq->crc_present * RTE_ETHER_CRC_LEN);
+	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
+		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
+		t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
+	uint16_t pkts_n = mcqe_n;
 	const uint64x2_t rearm =
 		vld1q_u64((void *)&t_pkt->rearm_data);
 	const uint32x4_t rxdf_mask = {
@@ -131,6 +135,9 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 * D. store rx_descriptor_fields1.
 	 * E. store flow tag (rte_flow mark).
 	 */
+cycle:
+	if (rxq->cqe_comp_layout)
+		rte_prefetch0((void *)(cq + mcqe_n));
 	for (pos = 0; pos < mcqe_n; ) {
 		uint8_t *p = (void *)&mcq[pos % 8];
 		uint8_t *e0 = (void *)&elts[pos]->rearm_data;
@@ -145,9 +152,10 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 					     sizeof(uint16_t) * 8) : 0);
 #endif
 
-		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-			if (likely(pos + i < mcqe_n))
-				rte_prefetch0((void *)(cq + pos + i));
+		if (!rxq->cqe_comp_layout)
+			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+				if (likely(pos + i < mcqe_n))
+					rte_prefetch0((void *)(cq + pos + i));
 		__asm__ volatile (
 		/* A.1 load mCQEs into a 128bit register. */
 		"ld1 {v16.16b - v17.16b}, [%[mcq]] \n\t"
@@ -354,22 +362,40 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		}
 		pos += MLX5_VPMD_DESCS_PER_LOOP;
 		/* Move to next CQE and invalidate consumed CQEs. */
-		if (!(pos & 0x7) && pos < mcqe_n) {
-			if (pos + 8 < mcqe_n)
-				rte_prefetch0((void *)(cq + pos + 8));
-			mcq = (void *)&(cq + pos)->pkt_info;
-			for (i = 0; i < 8; ++i)
-				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+		if (!rxq->cqe_comp_layout) {
+			if (!(pos & 0x7) && pos < mcqe_n) {
+				if (pos + 8 < mcqe_n)
+					rte_prefetch0((void *)(cq + pos + 8));
+				mcq = (void *)&(cq + pos)->pkt_info;
+				for (i = 0; i < 8; ++i)
+					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+			}
+		}
+	}
+	if (rxq->cqe_comp_layout) {
+		int ret;
+		/* Keep unzipping if the next CQE is the miniCQE array. */
+		cq = &cq[mcqe_n];
+		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
+		if (ret == MLX5_CQE_STATUS_SW_OWN &&
+		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
+			pos = 0;
+			elts = &elts[mcqe_n];
+			mcq = (void *)cq;
+			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
+			pkts_n += mcqe_n;
+			goto cycle;
 		}
+	} else {
+		/* Invalidate the rest of CQEs. */
+		for (; inv < pkts_n; ++inv)
+			cq[inv].op_own = MLX5_CQE_INVALIDATE;
 	}
-	/* Invalidate the rest of CQEs. */
-	for (; inv < mcqe_n; ++inv)
-		cq[inv].op_own = MLX5_CQE_INVALIDATE;
 #ifdef MLX5_PMD_SOFT_COUNTERS
-	rxq->stats.ipackets += mcqe_n;
+	rxq->stats.ipackets += pkts_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	return mcqe_n;
+	return pkts_n;
 }
 
 /**
@@ -528,7 +554,9 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	uint64_t n = 0;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
+	const uint16x4_t validity = vdup_n_u16((rxq->cq_ci >> rxq->cqe_n) << 8);
 	const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+	const uint16x4_t vic_check = vcreate_u16(0xff00ff00ff00ff00);
 	const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
 	const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
 	const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
@@ -547,7 +575,7 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	const uint8x16_t cqe_shuf_m = {
 		28, 29,         /* hdr_type_etc */
 		 0,             /* pkt_info */
-		-1,             /* null */
+		62,             /* validity_iteration_count */
 		47, 46,         /* byte_cnt, bswap16 */
 		31, 30,         /* vlan_info, bswap16 */
 		15, 14, 13, 12, /* rx_hash_res, bswap32 */
@@ -564,10 +592,10 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	};
 	/* Mask to generate 16B owner vector. */
 	const uint8x8_t owner_shuf_m = {
-		63, -1,         /* 4th CQE */
-		47, -1,         /* 3rd CQE */
-		31, -1,         /* 2nd CQE */
-		15, -1          /* 1st CQE */
+		63, 51,         /* 4th CQE */
+		47, 35,         /* 3rd CQE */
+		31, 19,         /* 2nd CQE */
+		15,  3          /* 1st CQE */
 	};
 	/* Mask to generate a vector having packet_type/ol_flags. */
 	const uint8x16_t ptype_shuf_m = {
@@ -600,7 +628,7 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 *        struct {
 	 *          uint16_t hdr_type_etc;
 	 *          uint8_t  pkt_info;
-	 *          uint8_t  rsvd;
+	 *          uint8_t  validity_iteration_count;
 	 *          uint16_t byte_cnt;
 	 *          uint16_t vlan_info;
 	 *          uint32_t rx_has_res;
@@ -748,9 +776,15 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		 "v16", "v17", "v18", "v19",
 		 "v20", "v21", "v22", "v23",
 		 "v24", "v25");
-		/* D.2 flip owner bit to mark CQEs from last round. */
-		owner_mask = vand_u16(op_own, owner_check);
-		owner_mask = vceq_u16(owner_mask, ownership);
+		/* D.2 mask out CQEs belonging to HW. */
+		if (rxq->cqe_comp_layout) {
+			owner_mask = vand_u16(op_own, vic_check);
+			owner_mask = vceq_u16(owner_mask, validity);
+			owner_mask = vmvn_u16(owner_mask);
+		} else {
+			owner_mask = vand_u16(op_own, owner_check);
+			owner_mask = vceq_u16(owner_mask, ownership);
+		}
 		/* D.3 get mask for invalidated CQEs. */
 		opcode = vand_u16(op_own, opcode_check);
 		invalid_mask = vceq_u16(opcode_check, opcode);
@@ -780,7 +814,8 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 				   -1UL >> (n * sizeof(uint16_t) * 8) : 0);
 		invalid_mask = vorr_u16(invalid_mask, mask);
 		/* D.3 check error in opcode. */
-		adj = (comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
+		adj = (!rxq->cqe_comp_layout &&
+		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
 		mask = vcreate_u16(adj ?
 			   -1UL >> ((n + 1) * sizeof(uint16_t) * 8) : -1UL);
 		mini_mask = vand_u16(invalid_mask, mask);
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index ab69af0c55..b282f8b8e6 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -73,8 +73,9 @@  static inline uint16_t
 rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		    struct rte_mbuf **elts)
 {
-	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + 1);
-	struct rte_mbuf *t_pkt = elts[0]; /* Title packet is pre-built. */
+	volatile struct mlx5_mini_cqe8 *mcq = (void *)(cq + !rxq->cqe_comp_layout);
+	/* Title packet is pre-built. */
+	struct rte_mbuf *t_pkt = rxq->cqe_comp_layout ? &rxq->title_pkt : elts[0];
 	unsigned int pos;
 	unsigned int i;
 	unsigned int inv = 0;
@@ -92,8 +93,10 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			    -1, -1, 14, 15, /* pkt_len, bswap16 */
 			    -1, -1, -1, -1  /* skip packet_type */);
 	/* Restore the compressed count. Must be 16 bits. */
-	const uint16_t mcqe_n = t_pkt->data_len +
-				(rxq->crc_present * RTE_ETHER_CRC_LEN);
+	uint16_t mcqe_n = (rxq->cqe_comp_layout) ?
+		(MLX5_CQE_NUM_MINIS(cq->op_own) + 1) :
+		t_pkt->data_len + (rxq->crc_present * RTE_ETHER_CRC_LEN);
+	uint16_t pkts_n = mcqe_n;
 	const __m128i rearm =
 		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
 	const __m128i rxdf =
@@ -124,6 +127,9 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 * D. store rx_descriptor_fields1.
 	 * E. store flow tag (rte_flow mark).
 	 */
+cycle:
+	if (rxq->cqe_comp_layout)
+		rte_prefetch0((void *)(cq + mcqe_n));
 	for (pos = 0; pos < mcqe_n; ) {
 		__m128i mcqe1, mcqe2;
 		__m128i rxdf1, rxdf2;
@@ -131,9 +137,10 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		__m128i byte_cnt, invalid_mask;
 #endif
 
-		for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
-			if (likely(pos + i < mcqe_n))
-				rte_prefetch0((void *)(cq + pos + i));
+		if (!rxq->cqe_comp_layout)
+			for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+				if (likely(pos + i < mcqe_n))
+					rte_prefetch0((void *)(cq + pos + i));
 		/* A.1 load mCQEs into a 128bit register. */
 		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
 		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
@@ -344,22 +351,40 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		}
 		pos += MLX5_VPMD_DESCS_PER_LOOP;
 		/* Move to next CQE and invalidate consumed CQEs. */
-		if (!(pos & 0x7) && pos < mcqe_n) {
-			if (pos + 8 < mcqe_n)
-				rte_prefetch0((void *)(cq + pos + 8));
-			mcq = (void *)(cq + pos);
-			for (i = 0; i < 8; ++i)
-				cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+		if (!rxq->cqe_comp_layout) {
+			if (!(pos & 0x7) && pos < mcqe_n) {
+				if (pos + 8 < mcqe_n)
+					rte_prefetch0((void *)(cq + pos + 8));
+				mcq = (void *)(cq + pos);
+				for (i = 0; i < 8; ++i)
+					cq[inv++].op_own = MLX5_CQE_INVALIDATE;
+			}
+		}
+	}
+	if (rxq->cqe_comp_layout) {
+		int ret;
+		/* Keep unzipping if the next CQE is the miniCQE array. */
+		cq = &cq[mcqe_n];
+		ret = check_cqe_iteration(cq, rxq->cqe_n, rxq->cq_ci + pkts_n);
+		if (ret == MLX5_CQE_STATUS_SW_OWN &&
+		    MLX5_CQE_FORMAT(cq->op_own) == MLX5_COMPRESSED) {
+			pos = 0;
+			elts = &elts[mcqe_n];
+			mcq = (void *)cq;
+			mcqe_n = MLX5_CQE_NUM_MINIS(cq->op_own) + 1;
+			pkts_n += mcqe_n;
+			goto cycle;
 		}
+	} else {
+		/* Invalidate the rest of CQEs. */
+		for (; inv < pkts_n; ++inv)
+			cq[inv].op_own = MLX5_CQE_INVALIDATE;
 	}
-	/* Invalidate the rest of CQEs. */
-	for (; inv < mcqe_n; ++inv)
-		cq[inv].op_own = MLX5_CQE_INVALIDATE;
 #ifdef MLX5_PMD_SOFT_COUNTERS
-	rxq->stats.ipackets += mcqe_n;
+	rxq->stats.ipackets += pkts_n;
 	rxq->stats.ibytes += rcvd_byte;
 #endif
-	return mcqe_n;
+	return pkts_n;
 }
 
 /**
@@ -527,7 +552,9 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	uint64_t n = 0;
 	uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
 	uint16_t nocmp_n = 0;
-	unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+	const uint8_t vic = rxq->cq_ci >> rxq->cqe_n;
+	const uint8_t own = !(rxq->cq_ci & (q_mask + 1));
+	const __m128i vic_check = _mm_set1_epi64x(0x00ff000000ff0000LL);
 	const __m128i owner_check =	_mm_set1_epi64x(0x0100000001000000LL);
 	const __m128i opcode_check = _mm_set1_epi64x(0xf0000000f0000000LL);
 	const __m128i format_check = _mm_set1_epi64x(0x0c0000000c000000LL);
@@ -541,6 +568,16 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 			     12, 13,  8,  9,
 			      4,  5,  0,  1);
 #endif
+	const __m128i validity =
+		_mm_set_epi8(0, vic, 0, 0,
+			     0, vic, 0, 0,
+			     0, vic, 0, 0,
+			     0, vic, 0, 0);
+	const __m128i ownership =
+		_mm_set_epi8(own, 0, 0, 0,
+			     own, 0, 0, 0,
+			     own, 0, 0, 0,
+			     own, 0, 0, 0);
 	/* Mask to shuffle from extracted CQE to mbuf. */
 	const __m128i shuf_mask =
 		_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */
@@ -573,7 +610,7 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	 *          uint8_t  pkt_info;
 	 *          uint8_t  flow_tag[3];
 	 *          uint16_t byte_cnt;
-	 *          uint8_t  rsvd4;
+	 *          uint8_t  validity_iteration_count;
 	 *          uint8_t  op_own;
 	 *          uint16_t hdr_type_etc;
 	 *          uint16_t vlan_info;
@@ -689,11 +726,15 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		/* D.1 fill in mbuf - rx_descriptor_fields1. */
 		_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
 		_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
-		/* E.2 flip owner bit to mark CQEs from last round. */
-		owner_mask = _mm_and_si128(op_own, owner_check);
-		if (ownership)
-			owner_mask = _mm_xor_si128(owner_mask, owner_check);
-		owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
+		/* E.2 mask out CQEs belonging to HW. */
+		if (rxq->cqe_comp_layout) {
+			owner_mask = _mm_and_si128(op_own, vic_check);
+			owner_mask = _mm_cmpeq_epi32(owner_mask, validity);
+			owner_mask = _mm_xor_si128(owner_mask, ones);
+		} else {
+			owner_mask = _mm_and_si128(op_own, owner_check);
+			owner_mask = _mm_cmpeq_epi32(owner_mask, ownership);
+		}
 		owner_mask = _mm_packs_epi32(owner_mask, zero);
 		/* E.3 get mask for invalidated CQEs. */
 		opcode = _mm_and_si128(op_own, opcode_check);
@@ -729,7 +770,8 @@  rxq_cq_process_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		mask = _mm_sll_epi64(ones, mask);
 		invalid_mask = _mm_or_si128(invalid_mask, mask);
 		/* D.3 check error in opcode. */
-		adj = (comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
+		adj = (!rxq->cqe_comp_layout &&
+		       comp_idx != MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n);
 		mask = _mm_set_epi64x(0, adj * sizeof(uint16_t) * 8);
 		mini_mask = _mm_sll_epi64(invalid_mask, mask);
 		opcode = _mm_cmpeq_epi32(resp_err_check, opcode);