[v1,2/5] net/i40e: enable direct rearm mode

Message ID 20220420081650.2043183-3-feifei.wang2@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Andrew Rybchenko
Series Direct re-arming of buffers on receive side

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Feifei Wang April 20, 2022, 8:16 a.m. UTC
  For the i40e driver, enable direct re-arm mode. This patch supports the case
where the mapped Rx and Tx queues are handled by the same single lcore.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
 drivers/net/i40e/i40e_rxtx.h            |   4 +
 drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  14 +-
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
 drivers/net/i40e/i40e_rxtx_vec_neon.c   | 141 ++++++++++++-
 drivers/net/i40e/i40e_rxtx_vec_sse.c    | 170 ++++++++++++++-
 6 files changed, 839 insertions(+), 8 deletions(-)
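
The Rx->Tx queue mapping itself is established at configuration time (the
ethdev-level API for that is added elsewhere in this series and is not shown
here). As a minimal, illustrative sketch of what the new i40e_rx_queue fields
added by this patch end up holding once a mapping is in place; the surrounding
variable names are placeholders, not part of the patch:

	/* illustrative only: record the mapped Tx queue and enable the mode */
	struct i40e_rx_queue *rxq = dev->data->rx_queues[rx_queue_id];

	rxq->direct_rxrearm_port   = tx_port_id;
	rxq->direct_rxrearm_queue  = tx_queue_id;
	rxq->direct_rxrearm_enable = 1;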
  

Comments

Konstantin Ananyev May 11, 2022, 10:28 p.m. UTC | #1
> For the i40e driver, enable direct re-arm mode. This patch supports the case
> where the mapped Rx and Tx queues are handled by the same single lcore.
> 
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
>   drivers/net/i40e/i40e_rxtx.h            |   4 +
>   drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
>   drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  14 +-
>   drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
>   drivers/net/i40e/i40e_rxtx_vec_neon.c   | 141 ++++++++++++-
>   drivers/net/i40e/i40e_rxtx_vec_sse.c    | 170 ++++++++++++++-
>   6 files changed, 839 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
> index 5e6eecc501..1fdf4305f4 100644
> --- a/drivers/net/i40e/i40e_rxtx.h
> +++ b/drivers/net/i40e/i40e_rxtx.h
> @@ -102,6 +102,8 @@ struct i40e_rx_queue {
>   
>   	uint16_t rxrearm_nb;	/**< number of remaining to be re-armed */
>   	uint16_t rxrearm_start;	/**< the idx we start the re-arming from */
> +	uint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */
> +	uint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */
>   	uint64_t mbuf_initializer; /**< value to init mbufs */
>   
>   	uint16_t port_id; /**< device port ID */
> @@ -121,6 +123,8 @@ struct i40e_rx_queue {
>   	uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
>   	uint8_t dcb_tc;         /**< Traffic class of rx queue */
>   	uint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */
> +	/**<  0 if direct re-arm mode disabled, 1 when enabled */
> +	bool direct_rxrearm_enable;
>   	const struct rte_memzone *mz;
>   };
>   
> diff --git a/drivers/net/i40e/i40e_rxtx_common_avx.h b/drivers/net/i40e/i40e_rxtx_common_avx.h
> index cfc1e63173..a742723e07 100644
> --- a/drivers/net/i40e/i40e_rxtx_common_avx.h
> +++ b/drivers/net/i40e/i40e_rxtx_common_avx.h
> @@ -209,6 +209,275 @@ i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
>   	/* Update the tail pointer on the NIC */
>   	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
>   }
> +
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
> +{
> +	struct rte_eth_dev *dev;
> +	struct i40e_tx_queue *txq;
> +	volatile union i40e_rx_desc *rxdp;
> +	struct i40e_tx_entry *txep;
> +	struct i40e_rx_entry *rxep;
> +	struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
> +	uint16_t tx_port_id, tx_queue_id;
> +	uint16_t rx_id;
> +	uint16_t i, n;
> +	uint16_t nb_rearm = 0;
> +
> +	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +	rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> +	tx_port_id = rxq->direct_rxrearm_port;
> +	tx_queue_id = rxq->direct_rxrearm_queue;
> +	dev = &rte_eth_devices[tx_port_id];
> +	txq = dev->data->tx_queues[tx_queue_id];
> +
> +	/* check Rx queue is able to take in the whole
> +	 * batch of free mbufs from Tx queue
> +	 */
> +	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> +		/* check DD bits on threshold descriptor */
> +		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> +				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> +				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> +			goto mempool_bulk;
> +		}
> +
> +		if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
> +			goto mempool_bulk;

I think all these checks (whether this mode can be enabled) should be done at
the config phase, not in the data path.
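
One possible shape for such a config-time check, purely as a sketch (the
function name and the exact hook point are illustrative, assuming it runs when
the Rx->Tx mapping is set up):

	static int
	i40e_rxq_direct_rearm_check(struct i40e_rx_queue *rxq,
			struct i40e_tx_queue *txq)
	{
		/* data path assumes one Tx free batch == one Rx rearm batch */
		if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
			return -EINVAL; /* reject mapping, keep normal rearm */

		rxq->direct_rxrearm_enable = 1;
		return 0;
	}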

> +
> +		n = txq->tx_rs_thresh;
> +
> +		/* first buffer to free from S/W ring is at index
> +		 * tx_next_dd - (tx_rs_thresh-1)
> +		 */
> +		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];


It really looks bad that the RX function accesses and modifies TXQ data
directly. It would be much better to hide the TXD checking/manipulation in a
separate TXQ function (txq_mbuf() or so) that the RX path can invoke.
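
A sketch of that split (the helper name is illustrative; the body is the same
logic as above, just relocated to the Tx-queue side, assuming n equals
txq->tx_rs_thresh as in the code above):

	/* Tx-side helper: check the DD bit, hand back the freed mbufs and
	 * update the Tx queue's own counters, so the Rx rearm path never
	 * touches txq internals directly.
	 */
	static uint16_t
	i40e_txq_collect_free_bufs(struct i40e_tx_queue *txq,
			struct rte_mbuf **bufs, uint16_t n)
	{
		struct i40e_tx_entry *txep;
		uint16_t i, nb = 0;

		/* check DD bits on threshold descriptor */
		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE))
			return 0;

		/* first buffer to free from S/W ring is at index
		 * tx_next_dd - (tx_rs_thresh-1)
		 */
		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
		for (i = 0; i < n; i++) {
			bufs[nb] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (bufs[nb] != NULL)
				nb++;
		}

		/* Tx bookkeeping stays inside the Tx queue code */
		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
		if (txq->tx_next_dd >= txq->nb_tx_desc)
			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

		return nb;
	}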

> +
> +		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> +			/* directly put mbufs from Tx to Rx,
> +			 * and initialize the mbufs in vector
> +			 */
> +			for (i = 0; i < n; i++)
> +				rxep[i].mbuf = txep[i].mbuf;
> +		} else {
> +			for (i = 0; i < n; i++) {
> +				m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> +				/* ensure each Tx freed buffer is valid */
> +				if (m[i] != NULL)
> +					nb_rearm++;
> +			}
> +
> +			if (nb_rearm != n) {
> +				txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> +				txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> +				if (txq->tx_next_dd >= txq->nb_tx_desc)
> +					txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);


So if nb_rearm != n, what happens to the mbufs already collected in m[]?
Are you just dropping/forgetting them?
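
If the fallback to the mempool is kept for that case, the mbufs already
collected in m[] would need to be handed back first rather than leaked, e.g.
(sketch only, reusing the bookkeeping from the code above plus
rte_mempool_put()):

	if (nb_rearm != n) {
		/* return what was already collected instead of leaking it */
		for (i = 0; i < n; i++)
			if (m[i] != NULL)
				rte_mempool_put(m[i]->pool, m[i]);

		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
		if (txq->tx_next_dd >= txq->nb_tx_desc)
			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

		goto mempool_bulk;
	}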


> +
> +				goto mempool_bulk;

> +			} else {
> +				for (i = 0; i < n; i++)
> +					rxep[i].mbuf = m[i];
> +			}
> +		}
> +
> +		/* update counters for Tx */
> +		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> +		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> +		if (txq->tx_next_dd >= txq->nb_tx_desc)
> +			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> +	} else {


I suppose the chunk of code below is just a copy&paste of the existing
i40e_rxq_rearm_common()?
If so, there is no point in duplicating it; better to just invoke it here
(I presume a bit of refactoring would be needed for that).

Pretty much the same thoughts apply to the other rearm functions below.
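
For reference, the AVX512 variant below already falls back to
i40e_rxq_rearm_common() when there is no mempool cache; the whole mempool
branch could collapse in the same way, e.g. (sketch only, 'avx512' being the
flag the common helper already takes):

	} else {
mempool_bulk:
		/* nothing usable from Tx: reuse the existing rearm path */
		i40e_rxq_rearm_common(rxq, avx512);
		return;
	}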

> +mempool_bulk:
> +		/* if TX did not free bufs into Rx sw-ring,
> +		 * get new bufs from mempool
> +		 */
> +		n = RTE_I40E_RXQ_REARM_THRESH;
> +
> +		/* Pull 'n' more MBUFs into the software ring */
> +		if (rte_mempool_get_bulk(rxq->mp,
> +					(void *)rxep,
> +					RTE_I40E_RXQ_REARM_THRESH) < 0) {
> +			if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
> +				rxq->nb_rx_desc) {
> +				__m128i dma_addr0;
> +				dma_addr0 = _mm_setzero_si128();
> +				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> +					rxep[i].mbuf = &rxq->fake_mbuf;
> +					_mm_store_si128((__m128i *)&rxdp[i].read,
> +							dma_addr0);
> +				}
> +			}
> +			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> +				RTE_I40E_RXQ_REARM_THRESH;
> +			return;
> +		}
> +	}
> +
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> +	struct rte_mbuf *mb0, *mb1;
> +	__m128i dma_addr0, dma_addr1;
> +	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
> +			RTE_PKTMBUF_HEADROOM);
> +	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
> +	for (i = 0; i < n; i += 2, rxep += 2) {
> +		__m128i vaddr0, vaddr1;
> +
> +		mb0 = rxep[0].mbuf;
> +		mb1 = rxep[1].mbuf;
> +
> +		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +				offsetof(struct rte_mbuf, buf_addr) + 8);
> +		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> +		/* convert pa to dma_addr hdr/data */
> +		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
> +
> +		/* add headroom to pa values */
> +		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
> +
> +		/* flush desc with pa dma_addr */
> +		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> +		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
> +	}
> +#else
> +#ifdef __AVX512VL__
> +	if (avx512) {
> +		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
> +		struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
> +		__m512i dma_addr0_3, dma_addr4_7;
> +		__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
> +		/* Initialize the mbufs in vector, process 8 mbufs in one loop */
> +		for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) {
> +			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
> +			__m128i vaddr4, vaddr5, vaddr6, vaddr7;
> +			__m256i vaddr0_1, vaddr2_3;
> +			__m256i vaddr4_5, vaddr6_7;
> +			__m512i vaddr0_3, vaddr4_7;
> +
> +			mb0 = rxep[0].mbuf;
> +			mb1 = rxep[1].mbuf;
> +			mb2 = rxep[2].mbuf;
> +			mb3 = rxep[3].mbuf;
> +			mb4 = rxep[4].mbuf;
> +			mb5 = rxep[5].mbuf;
> +			mb6 = rxep[6].mbuf;
> +			mb7 = rxep[7].mbuf;
> +
> +			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +					offsetof(struct rte_mbuf, buf_addr) + 8);
> +			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
> +			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
> +			vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
> +			vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
> +			vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
> +			vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
> +
> +			/**
> +			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
> +			 * into the high lanes. Similarly for 2 & 3, and so on.
> +			 */
> +			vaddr0_1 =
> +				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
> +							vaddr1, 1);
> +			vaddr2_3 =
> +				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
> +							vaddr3, 1);
> +			vaddr4_5 =
> +				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
> +							vaddr5, 1);
> +			vaddr6_7 =
> +				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
> +							vaddr7, 1);
> +			vaddr0_3 =
> +				_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
> +						   vaddr2_3, 1);
> +			vaddr4_7 =
> +				_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
> +						   vaddr6_7, 1);
> +
> +			/* convert pa to dma_addr hdr/data */
> +			dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
> +			dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
> +
> +			/* add headroom to pa values */
> +			dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
> +			dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
> +
> +			/* flush desc with pa dma_addr */
> +			_mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
> +			_mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
> +		}
> +	} else {
> +#endif /* __AVX512VL__*/
> +		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
> +		__m256i dma_addr0_1, dma_addr2_3;
> +		__m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
> +		/* Initialize the mbufs in vector, process 4 mbufs in one loop */
> +		for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) {
> +			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
> +			__m256i vaddr0_1, vaddr2_3;
> +
> +			mb0 = rxep[0].mbuf;
> +			mb1 = rxep[1].mbuf;
> +			mb2 = rxep[2].mbuf;
> +			mb3 = rxep[3].mbuf;
> +
> +			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +					offsetof(struct rte_mbuf, buf_addr) + 8);
> +			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
> +			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
> +
> +			/**
> +			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
> +			 * into the high lanes. Similarly for 2 & 3
> +			 */
> +			vaddr0_1 = _mm256_inserti128_si256
> +				(_mm256_castsi128_si256(vaddr0), vaddr1, 1);
> +			vaddr2_3 = _mm256_inserti128_si256
> +				(_mm256_castsi128_si256(vaddr2), vaddr3, 1);
> +
> +			/* convert pa to dma_addr hdr/data */
> +			dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
> +			dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
> +
> +			/* add headroom to pa values */
> +			dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
> +			dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);
> +
> +			/* flush desc with pa dma_addr */
> +			_mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);
> +			_mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);
> +		}
> +	}
> +
> +#endif
> +
> +	/* Update the descriptor initializer index */
> +	rxq->rxrearm_start += n;
> +	rx_id = rxq->rxrearm_start - 1;
> +
> +	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> +		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> +		if (!rxq->rxrearm_start)
> +			rx_id = rxq->nb_rx_desc - 1;
> +		else
> +			rx_id = rxq->rxrearm_start - 1;
> +	}
> +
> +	rxq->rxrearm_nb -= n;
> +
> +	/* Update the tail pointer on the NIC */
> +	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> +}
>   #endif /* __AVX2__*/
>   
>   #endif /*_I40E_RXTX_COMMON_AVX_H_*/
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> index c73b2a321b..fcb7ba0273 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> @@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>   	return i40e_rxq_rearm_common(rxq, false);
>   }
>   
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> +	return i40e_rxq_direct_rearm_common(rxq, false);
> +}
> +
>   #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
>   /* Handles 32B descriptor FDIR ID processing:
>    * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
> @@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>   	/* See if we need to rearm the RX queue - gives the prefetch a bit
>   	 * of time to act
>   	 */
> -	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> -		i40e_rxq_rearm(rxq);
> +	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> +		if (rxq->direct_rxrearm_enable)
> +			i40e_rxq_direct_rearm(rxq);
> +		else
> +			i40e_rxq_rearm(rxq);
> +	}
>   
>   	/* Before we start moving massive data around, check to see if
>   	 * there is actually a packet available
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 2e8a3f0df6..d967095edc 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> @@ -21,6 +21,12 @@
>   
>   #define RTE_I40E_DESCS_PER_LOOP_AVX 8
>   
> +enum i40e_direct_rearm_type_value {
> +	I40E_DIRECT_REARM_TYPE_NORMAL		= 0x0,
> +	I40E_DIRECT_REARM_TYPE_FAST_FREE	= 0x1,
> +	I40E_DIRECT_REARM_TYPE_PRE_FREE		= 0x2,
> +};
> +
>   static __rte_always_inline void
>   i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>   {
> @@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>   	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
>   }
>   
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> +	struct rte_eth_dev *dev;
> +	struct i40e_tx_queue *txq;
> +	volatile union i40e_rx_desc *rxdp;
> +	struct i40e_vec_tx_entry *txep;
> +	struct i40e_rx_entry *rxep;
> +	struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
> +	uint16_t tx_port_id, tx_queue_id;
> +	uint16_t rx_id;
> +	uint16_t i, n;
> +	uint16_t j = 0;
> +	uint16_t nb_rearm = 0;
> +	enum i40e_direct_rearm_type_value type;
> +	struct rte_mempool_cache *cache = NULL;
> +
> +	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +	rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> +	tx_port_id = rxq->direct_rxrearm_port;
> +	tx_queue_id = rxq->direct_rxrearm_queue;
> +	dev = &rte_eth_devices[tx_port_id];
> +	txq = dev->data->tx_queues[tx_queue_id];
> +
> +	/* check Rx queue is able to take in the whole
> +	 * batch of free mbufs from Tx queue
> +	 */
> +	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> +		/* check DD bits on threshold descriptor */
> +		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> +			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> +			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> +			goto mempool_bulk;
> +		}
> +
> +		if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
> +			goto mempool_bulk;
> +
> +		n = txq->tx_rs_thresh;
> +
> +		/* first buffer to free from S/W ring is at index
> +		 * tx_next_dd - (tx_rs_thresh-1)
> +		 */
> +		txep = (void *)txq->sw_ring;
> +		txep += txq->tx_next_dd - (n - 1);
> +
> +		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> +			/* directly put mbufs from Tx to Rx */
> +			uint32_t copied = 0;
> +			/* n is multiple of 32 */
> +			while (copied < n) {
> +				const __m512i a = _mm512_load_si512(&txep[copied]);
> +				const __m512i b = _mm512_load_si512(&txep[copied + 8]);
> +				const __m512i c = _mm512_load_si512(&txep[copied + 16]);
> +				const __m512i d = _mm512_load_si512(&txep[copied + 24]);
> +
> +				_mm512_storeu_si512(&rxep[copied], a);
> +				_mm512_storeu_si512(&rxep[copied + 8], b);
> +				_mm512_storeu_si512(&rxep[copied + 16], c);
> +				_mm512_storeu_si512(&rxep[copied + 24], d);
> +				copied += 32;
> +			}
> +			type = I40E_DIRECT_REARM_TYPE_FAST_FREE;
> +		} else {
> +			for (i = 0; i < n; i++) {
> +				m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> +				/* ensure each Tx freed buffer is valid */
> +				if (m[i] != NULL)
> +					nb_rearm++;
> +			}
> +
> +			if (nb_rearm != n) {
> +				txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> +				txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> +				if (txq->tx_next_dd >= txq->nb_tx_desc)
> +					txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> +				goto mempool_bulk;
> +			} else {
> +				type = I40E_DIRECT_REARM_TYPE_PRE_FREE;
> +			}
> +		}
> +
> +	/* update counters for Tx */
> +	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> +	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> +	if (txq->tx_next_dd >= txq->nb_tx_desc)
> +		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> +	} else {
> +mempool_bulk:
> +		cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());
> +
> +		if (unlikely(!cache))
> +			return i40e_rxq_rearm_common(rxq, true);
> +
> +		n = RTE_I40E_RXQ_REARM_THRESH;
> +
> +		/* We need to pull 'n' more MBUFs into the software ring from mempool
> +		 * We inline the mempool function here, so we can vectorize the copy
> +		 * from the cache into the shadow ring.
> +		 */
> +
> +		if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
> +			/* No. Backfill the cache first, and then fill from it */
> +			uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
> +					cache->len);
> +
> +			/* How many do we require
> +			 * i.e. number to fill the cache + the request
> +			 */
> +			int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
> +					&cache->objs[cache->len], req);
> +			if (ret == 0) {
> +				cache->len += req;
> +			} else {
> +				if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
> +						rxq->nb_rx_desc) {
> +					__m128i dma_addr0;
> +
> +					dma_addr0 = _mm_setzero_si128();
> +					for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> +						rxep[i].mbuf = &rxq->fake_mbuf;
> +						_mm_store_si128
> +							((__m128i *)&rxdp[i].read,
> +								dma_addr0);
> +					}
> +				}
> +				rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> +						RTE_I40E_RXQ_REARM_THRESH;
> +				return;
> +			}
> +		}
> +
> +		type = I40E_DIRECT_REARM_TYPE_NORMAL;
> +	}
> +
> +	const __m512i iova_offsets =  _mm512_set1_epi64
> +		(offsetof(struct rte_mbuf, buf_iova));
> +	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
> +
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> +	/* to shuffle the addresses to correct slots. Values 4-7 will contain
> +	 * zeros, so use 7 for a zero-value.
> +	 */
> +	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
> +#else
> +	const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
> +#endif
> +
> +	__m512i mbuf_ptrs;
> +
> +	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
> +	 * from mempool cache and populating both shadow and HW rings
> +	 */
> +	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
> +		switch (type) {
> +		case I40E_DIRECT_REARM_TYPE_FAST_FREE:
> +			mbuf_ptrs = _mm512_loadu_si512(rxep);
> +			break;
> +		case I40E_DIRECT_REARM_TYPE_PRE_FREE:
> +			mbuf_ptrs = _mm512_loadu_si512(&m[j]);
> +			_mm512_store_si512(rxep, mbuf_ptrs);
> +			j += 8;
> +			break;
> +		case I40E_DIRECT_REARM_TYPE_NORMAL:
> +			mbuf_ptrs = _mm512_loadu_si512
> +				(&cache->objs[cache->len - 8]);
> +			_mm512_store_si512(rxep, mbuf_ptrs);
> +			cache->len -= 8;
> +			break;
> +		}
> +
> +		/* gather iova of mbuf0-7 into one zmm reg */
> +		const __m512i iova_base_addrs = _mm512_i64gather_epi64
> +			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
> +				0, /* base */
> +				1 /* scale */);
> +		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
> +				headroom);
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> +		const __m512i iovas0 = _mm512_castsi256_si512
> +			(_mm512_extracti64x4_epi64(iova_addrs, 0));
> +		const __m512i iovas1 = _mm512_castsi256_si512
> +			(_mm512_extracti64x4_epi64(iova_addrs, 1));
> +
> +		/* permute leaves desc 2-3 addresses in header address slots 0-1
> +		 * but these are ignored by driver since header split not
> +		 * enabled. Similarly for desc 4 & 5.
> +		 */
> +		const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
> +			(permute_idx, iovas0);
> +		const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
> +
> +		const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
> +			(permute_idx, iovas1);
> +		const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
> +
> +		_mm512_store_si512((void *)rxdp, desc_rd_0_1);
> +		_mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
> +		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
> +		_mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
> +#else
> +		/* permute leaves desc 4-7 addresses in header address slots 0-3
> +		 * but these are ignored by driver since header split not
> +		 * enabled.
> +		 */
> +		const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
> +			(permute_idx, iova_addrs);
> +		const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
> +
> +		_mm512_store_si512((void *)rxdp, desc_rd_0_3);
> +		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
> +#endif
> +		rxdp += 8, rxep += 8;
> +	}
> +
> +	/* Update the descriptor initializer index */
> +	rxq->rxrearm_start += n;
> +	rx_id = rxq->rxrearm_start - 1;
> +
> +	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> +		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> +		if (!rxq->rxrearm_start)
> +			rx_id = rxq->nb_rx_desc - 1;
> +		else
> +			rx_id = rxq->rxrearm_start - 1;
> +	}
> +
> +	rxq->rxrearm_nb -= n;
> +
> +	/* Update the tail pointer on the NIC */
> +	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> +}
> +
>   #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
>   /* Handles 32B descriptor FDIR ID processing:
>    * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
> @@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>   	/* See if we need to rearm the RX queue - gives the prefetch a bit
>   	 * of time to act
>   	 */
> -	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> -		i40e_rxq_rearm(rxq);
> +	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> +		if (rxq->direct_rxrearm_enable)
> +			i40e_rxq_direct_rearm(rxq);
> +		else
> +			i40e_rxq_rearm(rxq);
> +	}
>   
>   	/* Before we start moving massive data around, check to see if
>   	 * there is actually a packet available
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> index fa9e6582c5..dc78e3c90b 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> @@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>   	I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
>   }
>   
> +static inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> +	struct rte_eth_dev *dev;
> +	struct i40e_tx_queue *txq;
> +	volatile union i40e_rx_desc *rxdp;
> +	struct i40e_tx_entry *txep;
> +	struct i40e_rx_entry *rxep;
> +	uint16_t tx_port_id, tx_queue_id;
> +	uint16_t rx_id;
> +	struct rte_mbuf *mb0, *mb1, *m;
> +	uint64x2_t dma_addr0, dma_addr1;
> +	uint64x2_t zero = vdupq_n_u64(0);
> +	uint64_t paddr;
> +	uint16_t i, n;
> +	uint16_t nb_rearm = 0;
> +
> +	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +	rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> +	tx_port_id = rxq->direct_rxrearm_port;
> +	tx_queue_id = rxq->direct_rxrearm_queue;
> +	dev = &rte_eth_devices[tx_port_id];
> +	txq = dev->data->tx_queues[tx_queue_id];
> +
> +	/* check Rx queue is able to take in the whole
> +	 * batch of free mbufs from Tx queue
> +	 */
> +	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> +		/* check DD bits on threshold descriptor */
> +		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> +				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> +				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> +			goto mempool_bulk;
> +		}
> +
> +		n = txq->tx_rs_thresh;
> +
> +		/* first buffer to free from S/W ring is at index
> +		 * tx_next_dd - (tx_rs_thresh-1)
> +		 */
> +		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
> +
> +		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> +			/* directly put mbufs from Tx to Rx,
> +			 * and initialize the mbufs in vector
> +			 */
> +			for (i = 0; i < n; i++, rxep++, txep++) {
> +				rxep[0].mbuf = txep[0].mbuf;
> +
> +				/* Initialize rxdp descs */
> +				mb0 = txep[0].mbuf;
> +
> +				paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
> +				dma_addr0 = vdupq_n_u64(paddr);
> +				/* flush desc with pa dma_addr */
> +				vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> +			}
> +		} else {
> +			for (i = 0; i < n; i++) {
> +				m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> +				if (m != NULL) {
> +					rxep[i].mbuf = m;
> +
> +					/* Initialize rxdp descs */
> +					paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM;
> +					dma_addr0 = vdupq_n_u64(paddr);
> +					/* flush desc with pa dma_addr */
> +					vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> +					nb_rearm++;
> +				}
> +			}
> +			n = nb_rearm;
> +		}
> +
> +		/* update counters for Tx */
> +		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> +		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> +		if (txq->tx_next_dd >= txq->nb_tx_desc)
> +			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> +	} else {
> +mempool_bulk:
> +		/* if TX did not free bufs into Rx sw-ring,
> +		 * get new bufs from mempool
> +		 */
> +		n = RTE_I40E_RXQ_REARM_THRESH;
> +		if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) {
> +			if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
> +				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> +					rxep[i].mbuf = &rxq->fake_mbuf;
> +					vst1q_u64((uint64_t *)&rxdp[i].read, zero);
> +				}
> +			}
> +			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n;
> +			return;
> +		}
> +
> +		/* Initialize the mbufs in vector, process 2 mbufs in one loop */
> +		for (i = 0; i < n; i += 2, rxep += 2) {
> +			mb0 = rxep[0].mbuf;
> +			mb1 = rxep[1].mbuf;
> +
> +			paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
> +			dma_addr0 = vdupq_n_u64(paddr);
> +			/* flush desc with pa dma_addr */
> +			vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> +
> +			paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
> +			dma_addr1 = vdupq_n_u64(paddr);
> +			/* flush desc with pa dma_addr */
> +			vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
> +		}
> +	}
> +
> +	/* Update the descriptor initializer index */
> +	rxq->rxrearm_start += n;
> +	rx_id = rxq->rxrearm_start - 1;
> +
> +	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> +		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> +		if (!rxq->rxrearm_start)
> +			rx_id = rxq->nb_rx_desc - 1;
> +		else
> +			rx_id = rxq->rxrearm_start - 1;
> +	}
> +
> +	rxq->rxrearm_nb -= n;
> +
> +	rte_io_wmb();
> +	/* Update the tail pointer on the NIC */
> +	I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
> +}
> +
>   #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
>   /* NEON version of FDIR mark extraction for 4 32B descriptors at a time */
>   static inline uint32x4_t
> @@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
>   	/* See if we need to rearm the RX queue - gives the prefetch a bit
>   	 * of time to act
>   	 */
> -	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> -		i40e_rxq_rearm(rxq);
> +	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> +		if (rxq->direct_rxrearm_enable)
> +			i40e_rxq_direct_rearm(rxq);
> +		else
> +			i40e_rxq_rearm(rxq);
> +	}
>   
>   	/* Before we start moving massive data around, check to see if
>   	 * there is actually a packet available
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
> index 3782e8052f..b2f1ab2c8d 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
> @@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
>   	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
>   }
>   
> +static inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> +	struct rte_eth_dev *dev;
> +	struct i40e_tx_queue *txq;
> +	volatile union i40e_rx_desc *rxdp;
> +	struct i40e_tx_entry *txep;
> +	struct i40e_rx_entry *rxep;
> +	uint16_t tx_port_id, tx_queue_id;
> +	uint16_t rx_id;
> +	struct rte_mbuf *mb0, *mb1, *m;
> +	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
> +			RTE_PKTMBUF_HEADROOM);
> +	__m128i dma_addr0, dma_addr1;
> +	__m128i vaddr0, vaddr1;
> +	uint16_t i, n;
> +	uint16_t nb_rearm = 0;
> +
> +	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> +	rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> +	tx_port_id = rxq->direct_rxrearm_port;
> +	tx_queue_id = rxq->direct_rxrearm_queue;
> +	dev = &rte_eth_devices[tx_port_id];
> +	txq = dev->data->tx_queues[tx_queue_id];
> +
> +	/* check Rx queue is able to take in the whole
> +	 * batch of free mbufs from Tx queue
> +	 */
> +	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> +		/* check DD bits on threshold descriptor */
> +		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> +				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> +				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> +			goto mempool_bulk;
> +		}
> +
> +		n = txq->tx_rs_thresh;
> +
> +		/* first buffer to free from S/W ring is at index
> +		 * tx_next_dd - (tx_rs_thresh-1)
> +		 */
> +		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
> +
> +		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> +			/* directly put mbufs from Tx to Rx,
> +			 * and initialize the mbufs in vector
> +			 */
> +			for (i = 0; i < n; i++, rxep++, txep++) {
> +				rxep[0].mbuf = txep[0].mbuf;
> +
> +				/* Initialize rxdp descs */
> +				mb0 = txep[0].mbuf;
> +
> +				/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +				RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +						offsetof(struct rte_mbuf, buf_addr) + 8);
> +				vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +
> +				/* convert pa to dma_addr hdr/data */
> +				dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +
> +				/* add headroom to pa values */
> +				dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +
> +				/* flush desc with pa dma_addr */
> +				_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> +			}
> +		} else {
> +			for (i = 0; i < n; i++) {
> +				m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> +				if (m != NULL) {
> +					rxep[i].mbuf = m;
> +
> +					/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +					RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +							offsetof(struct rte_mbuf, buf_addr) + 8);
> +					vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr);
> +
> +					/* convert pa to dma_addr hdr/data */
> +					dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +
> +					/* add headroom to pa values */
> +					dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +
> +					/* flush desc with pa dma_addr */
> +					_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> +					nb_rearm++;
> +				}
> +			}
> +			n = nb_rearm;
> +		}
> +
> +		/* update counters for Tx */
> +		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> +		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> +		if (txq->tx_next_dd >= txq->nb_tx_desc)
> +			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> +	} else {
> +mempool_bulk:
> +		/* if TX did not free bufs into Rx sw-ring,
> +		 * get new bufs from mempool
> +		 */
> +		n = RTE_I40E_RXQ_REARM_THRESH;
> +		/* Pull 'n' more MBUFs into the software ring */
> +		if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) {
> +			if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
> +				dma_addr0 = _mm_setzero_si128();
> +				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> +					rxep[i].mbuf = &rxq->fake_mbuf;
> +					_mm_store_si128((__m128i *)&rxdp[i].read,
> +							dma_addr0);
> +				}
> +			}
> +			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> +				RTE_I40E_RXQ_REARM_THRESH;
> +			return;
> +		}
> +
> +		/* Initialize the mbufs in vector, process 2 mbufs in one loop */
> +		for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
> +			mb0 = rxep[0].mbuf;
> +			mb1 = rxep[1].mbuf;
> +
> +			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> +			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +					offsetof(struct rte_mbuf, buf_addr) + 8);
> +			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> +			/* convert pa to dma_addr hdr/data */
> +			dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +			dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
> +
> +			/* add headroom to pa values */
> +			dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +			dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
> +
> +			/* flush desc with pa dma_addr */
> +			_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> +			_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
> +		}
> +	}
> +
> +	/* Update the descriptor initializer index */
> +	rxq->rxrearm_start += n;
> +	rx_id = rxq->rxrearm_start - 1;
> +
> +	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> +		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> +		if (!rxq->rxrearm_start)
> +			rx_id = rxq->nb_rx_desc - 1;
> +		else
> +			rx_id = rxq->rxrearm_start - 1;
> +	}
> +
> +	rxq->rxrearm_nb -= n;
> +
> +	/* Update the tail pointer on the NIC */
> +	I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
> +}
> +
>   #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
>   /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
>   static inline __m128i
> @@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>   	/* See if we need to rearm the RX queue - gives the prefetch a bit
>   	 * of time to act
>   	 */
> -	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> -		i40e_rxq_rearm(rxq);
> +	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> +		if (rxq->direct_rxrearm_enable)
> +			i40e_rxq_direct_rearm(rxq);
> +		else
> +			i40e_rxq_rearm(rxq);
> +	}
>   
>   	/* Before we start moving massive data around, check to see if
>   	 * there is actually a packet available
  

Patch

diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
index 5e6eecc501..1fdf4305f4 100644
--- a/drivers/net/i40e/i40e_rxtx.h
+++ b/drivers/net/i40e/i40e_rxtx.h
@@ -102,6 +102,8 @@  struct i40e_rx_queue {
 
 	uint16_t rxrearm_nb;	/**< number of remaining to be re-armed */
 	uint16_t rxrearm_start;	/**< the idx we start the re-arming from */
+	uint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */
+	uint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */
 	uint64_t mbuf_initializer; /**< value to init mbufs */
 
 	uint16_t port_id; /**< device port ID */
@@ -121,6 +123,8 @@  struct i40e_rx_queue {
 	uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
 	uint8_t dcb_tc;         /**< Traffic class of rx queue */
 	uint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */
+	/**<  0 if direct re-arm mode disabled, 1 when enabled */
+	bool direct_rxrearm_enable;
 	const struct rte_memzone *mz;
 };
 
diff --git a/drivers/net/i40e/i40e_rxtx_common_avx.h b/drivers/net/i40e/i40e_rxtx_common_avx.h
index cfc1e63173..a742723e07 100644
--- a/drivers/net/i40e/i40e_rxtx_common_avx.h
+++ b/drivers/net/i40e/i40e_rxtx_common_avx.h
@@ -209,6 +209,275 @@  i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
 	/* Update the tail pointer on the NIC */
 	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
+
+static __rte_always_inline void
+i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
+{
+	struct rte_eth_dev *dev;
+	struct i40e_tx_queue *txq;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_tx_entry *txep;
+	struct i40e_rx_entry *rxep;
+	struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+	uint16_t tx_port_id, tx_queue_id;
+	uint16_t rx_id;
+	uint16_t i, n;
+	uint16_t nb_rearm = 0;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+	rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	tx_port_id = rxq->direct_rxrearm_port;
+	tx_queue_id = rxq->direct_rxrearm_queue;
+	dev = &rte_eth_devices[tx_port_id];
+	txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check Rx queue is able to take in the whole
+	 * batch of free mbufs from Tx queue
+	 */
+	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+		/* check DD bits on threshold descriptor */
+		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+			goto mempool_bulk;
+		}
+
+		if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
+			goto mempool_bulk;
+
+		n = txq->tx_rs_thresh;
+
+		/* first buffer to free from S/W ring is at index
+		 * tx_next_dd - (tx_rs_thresh-1)
+		 */
+		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+			/* directly put mbufs from Tx to Rx,
+			 * and initialize the mbufs in vector
+			 */
+			for (i = 0; i < n; i++)
+				rxep[i].mbuf = txep[i].mbuf;
+		} else {
+			for (i = 0; i < n; i++) {
+				m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+				/* ensure each Tx freed buffer is valid */
+				if (m[i] != NULL)
+					nb_rearm++;
+			}
+
+			if (nb_rearm != n) {
+				txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+				txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+				if (txq->tx_next_dd >= txq->nb_tx_desc)
+					txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+				goto mempool_bulk;
+			} else {
+				for (i = 0; i < n; i++)
+					rxep[i].mbuf = m[i];
+			}
+		}
+
+		/* update counters for Tx */
+		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+		if (txq->tx_next_dd >= txq->nb_tx_desc)
+			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+	} else {
+mempool_bulk:
+		/* if TX did not free bufs into Rx sw-ring,
+		 * get new bufs from mempool
+		 */
+		n = RTE_I40E_RXQ_REARM_THRESH;
+
+		/* Pull 'n' more MBUFs into the software ring */
+		if (rte_mempool_get_bulk(rxq->mp,
+					(void *)rxep,
+					RTE_I40E_RXQ_REARM_THRESH) < 0) {
+			if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+				rxq->nb_rx_desc) {
+				__m128i dma_addr0;
+				dma_addr0 = _mm_setzero_si128();
+				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+					rxep[i].mbuf = &rxq->fake_mbuf;
+					_mm_store_si128((__m128i *)&rxdp[i].read,
+							dma_addr0);
+				}
+			}
+			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+				RTE_I40E_RXQ_REARM_THRESH;
+			return;
+		}
+	}
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+	struct rte_mbuf *mb0, *mb1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+			RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < n; i += 2, rxep += 2) {
+		__m128i vaddr0, vaddr1;
+
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* flush desc with pa dma_addr */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+#else
+#ifdef __AVX512VL__
+	if (avx512) {
+		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+		struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+		__m512i dma_addr0_3, dma_addr4_7;
+		__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+		/* Initialize the mbufs in vector, process 8 mbufs in one loop */
+		for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) {
+			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
+			__m128i vaddr4, vaddr5, vaddr6, vaddr7;
+			__m256i vaddr0_1, vaddr2_3;
+			__m256i vaddr4_5, vaddr6_7;
+			__m512i vaddr0_3, vaddr4_7;
+
+			mb0 = rxep[0].mbuf;
+			mb1 = rxep[1].mbuf;
+			mb2 = rxep[2].mbuf;
+			mb3 = rxep[3].mbuf;
+			mb4 = rxep[4].mbuf;
+			mb5 = rxep[5].mbuf;
+			mb6 = rxep[6].mbuf;
+			mb7 = rxep[7].mbuf;
+
+			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+					offsetof(struct rte_mbuf, buf_addr) + 8);
+			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+			vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+			vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+			vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
+			vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
+
+			/**
+			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+			 * into the high lanes. Similarly for 2 & 3, and so on.
+			 */
+			vaddr0_1 =
+				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
+							vaddr1, 1);
+			vaddr2_3 =
+				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
+							vaddr3, 1);
+			vaddr4_5 =
+				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
+							vaddr5, 1);
+			vaddr6_7 =
+				_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
+							vaddr7, 1);
+			vaddr0_3 =
+				_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
+						   vaddr2_3, 1);
+			vaddr4_7 =
+				_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
+						   vaddr6_7, 1);
+
+			/* convert pa to dma_addr hdr/data */
+			dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
+			dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
+
+			/* add headroom to pa values */
+			dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
+			dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
+
+			/* flush desc with pa dma_addr */
+			_mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
+			_mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
+		}
+	} else {
+#endif /* __AVX512VL__*/
+		struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+		__m256i dma_addr0_1, dma_addr2_3;
+		__m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
+		/* Initialize the mbufs in vector, process 4 mbufs in one loop */
+		for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) {
+			__m128i vaddr0, vaddr1, vaddr2, vaddr3;
+			__m256i vaddr0_1, vaddr2_3;
+
+			mb0 = rxep[0].mbuf;
+			mb1 = rxep[1].mbuf;
+			mb2 = rxep[2].mbuf;
+			mb3 = rxep[3].mbuf;
+
+			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+					offsetof(struct rte_mbuf, buf_addr) + 8);
+			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+			vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+			vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+
+			/**
+			 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+			 * into the high lanes. Similarly for 2 & 3
+			 */
+			vaddr0_1 = _mm256_inserti128_si256
+				(_mm256_castsi128_si256(vaddr0), vaddr1, 1);
+			vaddr2_3 = _mm256_inserti128_si256
+				(_mm256_castsi128_si256(vaddr2), vaddr3, 1);
+
+			/* convert pa to dma_addr hdr/data */
+			dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
+			dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
+
+			/* add headroom to pa values */
+			dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
+			dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);
+
+			/* flush desc with pa dma_addr */
+			_mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);
+			_mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);
+		}
+	}
+
+#endif
+
+	/* Update the descriptor initializer index */
+	rxq->rxrearm_start += n;
+	rx_id = rxq->rxrearm_start - 1;
+
+	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+		if (!rxq->rxrearm_start)
+			rx_id = rxq->nb_rx_desc - 1;
+		else
+			rx_id = rxq->rxrearm_start - 1;
+	}
+
+	rxq->rxrearm_nb -= n;
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
 #endif /* __AVX2__*/
 
 #endif /*_I40E_RXTX_COMMON_AVX_H_*/
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index c73b2a321b..fcb7ba0273 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -25,6 +25,12 @@  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	return i40e_rxq_rearm_common(rxq, false);
 }
 
+static __rte_always_inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+	return i40e_rxq_direct_rearm_common(rxq, false);
+}
+
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
 /* Handles 32B descriptor FDIR ID processing:
  * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
@@ -128,8 +134,12 @@  _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
 	 */
-	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-		i40e_rxq_rearm(rxq);
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+		if (rxq->direct_rxrearm_enable)
+			i40e_rxq_direct_rearm(rxq);
+		else
+			i40e_rxq_rearm(rxq);
+	}
 
 	/* Before we start moving massive data around, check to see if
 	 * there is actually a packet available
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 2e8a3f0df6..d967095edc 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -21,6 +21,12 @@ 
 
 #define RTE_I40E_DESCS_PER_LOOP_AVX 8
 
+enum i40e_direct_rearm_type_value {
+	I40E_DIRECT_REARM_TYPE_NORMAL		= 0x0,
+	I40E_DIRECT_REARM_TYPE_FAST_FREE	= 0x1,
+	I40E_DIRECT_REARM_TYPE_PRE_FREE		= 0x2,
+};
+
 static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
@@ -150,6 +156,241 @@  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
+static __rte_always_inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+	struct rte_eth_dev *dev;
+	struct i40e_tx_queue *txq;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_vec_tx_entry *txep;
+	struct i40e_rx_entry *rxep;
+	struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+	uint16_t tx_port_id, tx_queue_id;
+	uint16_t rx_id;
+	uint16_t i, n;
+	uint16_t j = 0;
+	uint16_t nb_rearm = 0;
+	enum i40e_direct_rearm_type_value type;
+	struct rte_mempool_cache *cache = NULL;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+	rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	tx_port_id = rxq->direct_rxrearm_port;
+	tx_queue_id = rxq->direct_rxrearm_queue;
+	dev = &rte_eth_devices[tx_port_id];
+	txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check Rx queue is able to take in the whole
+	 * batch of free mbufs from Tx queue
+	 */
+	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+		/* check DD bits on threshold descriptor */
+		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+			rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+			rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+			goto mempool_bulk;
+		}
+
+		if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
+			goto mempool_bulk;
+
+		n = txq->tx_rs_thresh;
+
+		/* first buffer to free from S/W ring is at index
+		 * tx_next_dd - (tx_rs_thresh-1)
+		 */
+		txep = (void *)txq->sw_ring;
+		txep += txq->tx_next_dd - (n - 1);
+
+		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+			/* directly put mbufs from Tx to Rx */
+			uint32_t copied = 0;
+			/* n is multiple of 32 */
+			while (copied < n) {
+				const __m512i a = _mm512_load_si512(&txep[copied]);
+				const __m512i b = _mm512_load_si512(&txep[copied + 8]);
+				const __m512i c = _mm512_load_si512(&txep[copied + 16]);
+				const __m512i d = _mm512_load_si512(&txep[copied + 24]);
+
+				_mm512_storeu_si512(&rxep[copied], a);
+				_mm512_storeu_si512(&rxep[copied + 8], b);
+				_mm512_storeu_si512(&rxep[copied + 16], c);
+				_mm512_storeu_si512(&rxep[copied + 24], d);
+				copied += 32;
+			}
+			type = I40E_DIRECT_REARM_TYPE_FAST_FREE;
+		} else {
+			for (i = 0; i < n; i++) {
+				m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+				/* ensure each Tx freed buffer is valid */
+				if (m[i] != NULL)
+					nb_rearm++;
+			}
+
+			if (nb_rearm != n) {
+				txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+				txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+				if (txq->tx_next_dd >= txq->nb_tx_desc)
+					txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+				goto mempool_bulk;
+			} else {
+				type = I40E_DIRECT_REARM_TYPE_PRE_FREE;
+			}
+		}
+
+	/* update counters for Tx */
+	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+	txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+	if (txq->tx_next_dd >= txq->nb_tx_desc)
+		txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+	} else {
+mempool_bulk:
+		cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());
+
+		if (unlikely(!cache))
+			return i40e_rxq_rearm_common(rxq, true);
+
+		n = RTE_I40E_RXQ_REARM_THRESH;
+
+		/* We need to pull 'n' more MBUFs into the software ring from mempool
+		 * We inline the mempool function here, so we can vectorize the copy
+		 * from the cache into the shadow ring.
+		 */
+
+		if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
+			/* No. Backfill the cache first, and then fill from it */
+			uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
+					cache->len);
+
+			/* How many do we require
+			 * i.e. number to fill the cache + the request
+			 */
+			int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
+					&cache->objs[cache->len], req);
+			if (ret == 0) {
+				cache->len += req;
+			} else {
+				if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+						rxq->nb_rx_desc) {
+					__m128i dma_addr0;
+
+					dma_addr0 = _mm_setzero_si128();
+					for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+						rxep[i].mbuf = &rxq->fake_mbuf;
+						_mm_store_si128
+							((__m128i *)&rxdp[i].read,
+								dma_addr0);
+					}
+				}
+				rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+						RTE_I40E_RXQ_REARM_THRESH;
+				return;
+			}
+		}
+
+		type = I40E_DIRECT_REARM_TYPE_NORMAL;
+	}
+
+	const __m512i iova_offsets =  _mm512_set1_epi64
+		(offsetof(struct rte_mbuf, buf_iova));
+	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+	/* to shuffle the addresses to correct slots. Values 4-7 will contain
+	 * zeros, so use 7 for a zero-value.
+	 */
+	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
+#else
+	const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
+#endif
+
+	__m512i mbuf_ptrs;
+
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+	 * from mempool cache and populating both shadow and HW rings
+	 */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
+		switch (type) {
+		case I40E_DIRECT_REARM_TYPE_FAST_FREE:
+			mbuf_ptrs = _mm512_loadu_si512(rxep);
+			break;
+		case I40E_DIRECT_REARM_TYPE_PRE_FREE:
+			mbuf_ptrs = _mm512_loadu_si512(&m[j]);
+			_mm512_store_si512(rxep, mbuf_ptrs);
+			j += 8;
+			break;
+		case I40E_DIRECT_REARM_TYPE_NORMAL:
+			mbuf_ptrs = _mm512_loadu_si512
+				(&cache->objs[cache->len - 8]);
+			_mm512_store_si512(rxep, mbuf_ptrs);
+			cache->len -= 8;
+			break;
+		}
+
+		/* gather iova of mbuf0-7 into one zmm reg */
+		const __m512i iova_base_addrs = _mm512_i64gather_epi64
+			(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+				0, /* base */
+				1 /* scale */);
+		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+				headroom);
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+		const __m512i iovas0 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 0));
+		const __m512i iovas1 = _mm512_castsi256_si512
+			(_mm512_extracti64x4_epi64(iova_addrs, 1));
+
+		/* permute leaves desc 2-3 addresses in header address slots 0-1
+		 * but these are ignored by driver since header split not
+		 * enabled. Similarly for desc 4 & 5.
+		 */
+		const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
+			(permute_idx, iovas0);
+		const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
+
+		const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
+			(permute_idx, iovas1);
+		const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_1);
+		_mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
+		_mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
+#else
+		/* permute leaves desc 4-7 addresses in header address slots 0-3
+		 * but these are ignored by driver since header split not
+		 * enabled.
+		 */
+		const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
+			(permute_idx, iova_addrs);
+		const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
+
+		_mm512_store_si512((void *)rxdp, desc_rd_0_3);
+		_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
+#endif
+		rxdp += 8, rxep += 8;
+	}
+
+	/* Update the descriptor initializer index */
+	rxq->rxrearm_start += n;
+	rx_id = rxq->rxrearm_start - 1;
+
+	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+		if (!rxq->rxrearm_start)
+			rx_id = rxq->nb_rx_desc - 1;
+		else
+			rx_id = rxq->rxrearm_start - 1;
+	}
+
+	rxq->rxrearm_nb -= n;
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
 /* Handles 32B descriptor FDIR ID processing:
  * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
@@ -252,8 +493,12 @@  _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
 	 */
-	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-		i40e_rxq_rearm(rxq);
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+		if (rxq->direct_rxrearm_enable)
+			i40e_rxq_direct_rearm(rxq);
+		else
+			i40e_rxq_rearm(rxq);
+	}
 
 	/* Before we start moving massive data around, check to see if
 	 * there is actually a packet available
diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
index fa9e6582c5..dc78e3c90b 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
@@ -77,6 +77,139 @@  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
 }
 
+static inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+	struct rte_eth_dev *dev;
+	struct i40e_tx_queue *txq;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_tx_entry *txep;
+	struct i40e_rx_entry *rxep;
+	uint16_t tx_port_id, tx_queue_id;
+	uint16_t rx_id;
+	struct rte_mbuf *mb0, *mb1, *m;
+	uint64x2_t dma_addr0, dma_addr1;
+	uint64x2_t zero = vdupq_n_u64(0);
+	uint64_t paddr;
+	uint16_t i, n;
+	uint16_t nb_rearm = 0;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+	rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	tx_port_id = rxq->direct_rxrearm_port;
+	tx_queue_id = rxq->direct_rxrearm_queue;
+	dev = &rte_eth_devices[tx_port_id];
+	txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check that the Rx queue can absorb the whole
+	 * batch of mbufs freed from the Tx queue
+	 */
+	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+		/* check DD bits on threshold descriptor */
+		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+			goto mempool_bulk;
+		}
+
+		n = txq->tx_rs_thresh;
+
+		/* first buffer to free from S/W ring is at index
+		 * tx_next_dd - (tx_rs_thresh-1)
+		 */
+		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+			/* move mbufs straight from Tx to Rx and fill
+			 * the Rx descriptors with their addresses
+			 */
+			for (i = 0; i < n; i++, rxep++, txep++) {
+				rxep[0].mbuf = txep[0].mbuf;
+
+				/* Initialize rxdp descs */
+				mb0 = txep[0].mbuf;
+
+				paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+				dma_addr0 = vdupq_n_u64(paddr);
+				/* flush desc with pa dma_addr */
+				vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+			}
+		} else {
+			for (i = 0; i < n; i++) {
+				m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+				if (m != NULL) {
+					rxep[nb_rearm].mbuf = m; /* pack entries to match rxdp */
+
+					/* Initialize rxdp descs */
+					paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM;
+					dma_addr0 = vdupq_n_u64(paddr);
+					/* flush desc with pa dma_addr */
+					vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+					nb_rearm++;
+				}
+			}
+			n = nb_rearm;
+		}
+
+		/* update counters for Tx */
+		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+		if (txq->tx_next_dd >= txq->nb_tx_desc)
+			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+	} else {
+mempool_bulk:
+		/* if Tx could not supply bufs to the Rx sw-ring,
+		 * get new bufs from the mempool
+		 */
+		n = RTE_I40E_RXQ_REARM_THRESH;
+		if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) {
+			if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
+				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+					rxep[i].mbuf = &rxq->fake_mbuf;
+					vst1q_u64((uint64_t *)&rxdp[i].read, zero);
+				}
+			}
+			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n;
+			return;
+		}
+
+		/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+		for (i = 0; i < n; i += 2, rxep += 2) {
+			mb0 = rxep[0].mbuf;
+			mb1 = rxep[1].mbuf;
+
+			paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+			dma_addr0 = vdupq_n_u64(paddr);
+			/* flush desc with pa dma_addr */
+			vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+
+			paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
+			dma_addr1 = vdupq_n_u64(paddr);
+			/* flush desc with pa dma_addr */
+			vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
+		}
+	}
+
+	/* Advance the rearm start index and compute the new tail (rx_id) */
+	rxq->rxrearm_start += n;
+	rx_id = rxq->rxrearm_start - 1;
+
+	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+		if (!rxq->rxrearm_start)
+			rx_id = rxq->nb_rx_desc - 1;
+		else
+			rx_id = rxq->rxrearm_start - 1;
+	}
+
+	rxq->rxrearm_nb -= n;
+
+	rte_io_wmb();
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
+}
+
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
 /* NEON version of FDIR mark extraction for 4 32B descriptors at a time */
 static inline uint32x4_t
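The NEON function above does two jobs: decide whether the paired Tx queue has a completed batch to recycle, and if so move those mbufs straight into the Rx sw_ring while rewriting the descriptors with their buffer addresses. A scalar sketch of that fast path (the MBUF_FAST_FREE case), using stand-in types instead of the driver structures:

#include <stdint.h>

/* Scalar sketch of the Tx-to-Rx recycle fast path, using stand-in types.
 * tx_desc_done represents the DD-bit check on the Tx threshold descriptor;
 * SKETCH_HEADROOM stands in for RTE_PKTMBUF_HEADROOM.
 */
#define SKETCH_HEADROOM 128

struct sketch_mbuf { uint64_t buf_iova; };
struct sketch_rx_desc { uint64_t pkt_addr; uint64_t hdr_addr; };

static uint16_t
sketch_recycle(struct sketch_mbuf **tx_bufs,   /* Tx sw_ring slice to free */
	       struct sketch_mbuf **rx_bufs,   /* Rx sw_ring slice to refill */
	       struct sketch_rx_desc *rxdp,    /* Rx descriptors to rewrite */
	       uint16_t n, int tx_desc_done)
{
	uint16_t i;
	uint64_t dma;

	if (!tx_desc_done)
		return 0;	/* caller falls back to the mempool path */

	for (i = 0; i < n; i++) {
		rx_bufs[i] = tx_bufs[i];
		dma = tx_bufs[i]->buf_iova + SKETCH_HEADROOM;
		rxdp[i].pkt_addr = dma;
		/* the vector code duplicates the address into the header
		 * slot as well; the NIC ignores it without header split
		 */
		rxdp[i].hdr_addr = dma;
	}
	return i;
}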
@@ -381,8 +514,12 @@  _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
 	 */
-	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-		i40e_rxq_rearm(rxq);
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+		if (rxq->direct_rxrearm_enable)
+			i40e_rxq_direct_rearm(rxq);
+		else
+			i40e_rxq_rearm(rxq);
+	}
 
 	/* Before we start moving massive data around, check to see if
 	 * there is actually a packet available
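When the Tx queue has nothing ready, both the NEON and SSE variants fall back to a plain mempool bulk allocation; on allocation failure they park a few descriptors on the queue's fake mbuf so the NIC never DMAs through a stale address once the ring runs dry, and record the failure. A standalone sketch of that fallback, with a stub allocator standing in for rte_mempool_get_bulk():

#include <stdint.h>
#include <string.h>

#define SKETCH_DESCS_PER_LOOP 4	/* stands in for RTE_I40E_DESCS_PER_LOOP */

struct sketch_mbuf { uint64_t buf_iova; };
struct sketch_rx_desc { uint64_t pkt_addr; uint64_t hdr_addr; };

/* stub allocator that always fails, standing in for rte_mempool_get_bulk() */
static int
sketch_pool_get_bulk(struct sketch_mbuf **objs, unsigned int n)
{
	(void)objs;
	(void)n;
	return -1;
}

/* Sketch of the mempool fallback: on allocation failure, zero a few
 * descriptors and point their sw_ring entries at the fake mbuf when the
 * ring is about to be exhausted, then count the failed allocation.
 */
static void
sketch_mempool_rearm(struct sketch_mbuf **rx_bufs, struct sketch_rx_desc *rxdp,
		     uint16_t n, uint16_t rearm_nb, uint16_t nb_rx_desc,
		     struct sketch_mbuf *fake_mbuf, uint64_t *alloc_failed)
{
	uint16_t i;

	if (sketch_pool_get_bulk(rx_bufs, n) < 0) {
		if (rearm_nb + n >= nb_rx_desc) {
			for (i = 0; i < SKETCH_DESCS_PER_LOOP; i++) {
				rx_bufs[i] = fake_mbuf;
				memset(&rxdp[i], 0, sizeof(rxdp[i]));
			}
		}
		*alloc_failed += n;
		return;
	}
	/* success: the descriptors would be filled from rx_bufs[] here */
}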
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index 3782e8052f..b2f1ab2c8d 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -89,6 +89,168 @@  i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
 }
 
+static inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+	struct rte_eth_dev *dev;
+	struct i40e_tx_queue *txq;
+	volatile union i40e_rx_desc *rxdp;
+	struct i40e_tx_entry *txep;
+	struct i40e_rx_entry *rxep;
+	uint16_t tx_port_id, tx_queue_id;
+	uint16_t rx_id;
+	struct rte_mbuf *mb0, *mb1, *m;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+			RTE_PKTMBUF_HEADROOM);
+	__m128i dma_addr0, dma_addr1;
+	__m128i vaddr0, vaddr1;
+	uint16_t i, n;
+	uint16_t nb_rearm = 0;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+	rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+	tx_port_id = rxq->direct_rxrearm_port;
+	tx_queue_id = rxq->direct_rxrearm_queue;
+	dev = &rte_eth_devices[tx_port_id];
+	txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check that the Rx queue can absorb the whole
+	 * batch of mbufs freed from the Tx queue
+	 */
+	if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+		/* check DD bits on threshold descriptor */
+		if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+				rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+				rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+			goto mempool_bulk;
+		}
+
+		n = txq->tx_rs_thresh;
+
+		/* first buffer to free from S/W ring is at index
+		 * tx_next_dd - (tx_rs_thresh-1)
+		 */
+		txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+		if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+			/* move mbufs straight from Tx to Rx and fill
+			 * the Rx descriptors with their addresses
+			 */
+			for (i = 0; i < n; i++, rxep++, txep++) {
+				rxep[0].mbuf = txep[0].mbuf;
+
+				/* Initialize rxdp descs */
+				mb0 = txep[0].mbuf;
+
+				/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+				RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+						offsetof(struct rte_mbuf, buf_addr) + 8);
+				vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+
+				/* convert pa to dma_addr hdr/data */
+				dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+
+				/* add headroom to pa values */
+				dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+
+				/* flush desc with pa dma_addr */
+				_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+			}
+		} else {
+			for (i = 0; i < n; i++) {
+				m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+				if (m != NULL) {
+					rxep[nb_rearm].mbuf = m; /* pack entries to match rxdp */
+
+					/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+					RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+							offsetof(struct rte_mbuf, buf_addr) + 8);
+					vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr);
+
+					/* convert pa to dma_addr hdr/data */
+					dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+
+					/* add headroom to pa values */
+					dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+
+					/* flush desc with pa dma_addr */
+					_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+					nb_rearm++;
+				}
+			}
+			n = nb_rearm;
+		}
+
+		/* update counters for Tx */
+		txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+		txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+		if (txq->tx_next_dd >= txq->nb_tx_desc)
+			txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+	} else {
+mempool_bulk:
+		/* if Tx could not supply bufs to the Rx sw-ring,
+		 * get new bufs from the mempool
+		 */
+		n = RTE_I40E_RXQ_REARM_THRESH;
+		/* Pull 'n' more MBUFs into the software ring */
+		if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) {
+			if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
+				dma_addr0 = _mm_setzero_si128();
+				for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+					rxep[i].mbuf = &rxq->fake_mbuf;
+					_mm_store_si128((__m128i *)&rxdp[i].read,
+							dma_addr0);
+				}
+			}
+			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+				RTE_I40E_RXQ_REARM_THRESH;
+			return;
+		}
+
+		/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+		for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+			mb0 = rxep[0].mbuf;
+			mb1 = rxep[1].mbuf;
+
+			/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+			RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+					offsetof(struct rte_mbuf, buf_addr) + 8);
+			vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+			vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+			/* convert pa to dma_addr hdr/data */
+			dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+			dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+			/* add headroom to pa values */
+			dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+			dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+			/* flush desc with pa dma_addr */
+			_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+			_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+		}
+	}
+
+	/* Advance the rearm start index and compute the new tail (rx_id) */
+	rxq->rxrearm_start += n;
+	rx_id = rxq->rxrearm_start - 1;
+
+	if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+		rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+		if (!rxq->rxrearm_start)
+			rx_id = rxq->nb_rx_desc - 1;
+		else
+			rx_id = rxq->rxrearm_start - 1;
+	}
+
+	rxq->rxrearm_nb -= n;
+
+	/* Update the tail pointer on the NIC */
+	I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
 #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
 /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
 static inline __m128i
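The SSE loads above lean on the rte_mbuf layout: RTE_BUILD_BUG_ON() guarantees buf_iova sits in the 8 bytes directly after buf_addr, so one 16-byte load fetches both and _mm_unpackhi_epi64() replicates the IOVA into both descriptor qwords. A scalar sketch of the same assumption and address computation, on a stand-in structure and assuming a 64-bit target:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for the first two fields of struct rte_mbuf; not the real layout,
 * just enough to show the adjacency the vector code depends on (64-bit only).
 */
struct sketch_mbuf {
	void *buf_addr;		/* low 64 bits of the 16-byte load */
	uint64_t buf_iova;	/* high 64 bits of the 16-byte load */
};

static_assert(offsetof(struct sketch_mbuf, buf_iova) ==
	      offsetof(struct sketch_mbuf, buf_addr) + 8,
	      "buf_iova must directly follow buf_addr");

/* scalar equivalent of unpackhi + add: take the IOVA and add the headroom */
static uint64_t
sketch_desc_addr(const struct sketch_mbuf *mb, uint64_t headroom)
{
	return mb->buf_iova + headroom;
}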
@@ -394,8 +556,12 @@  _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	/* See if we need to rearm the RX queue - gives the prefetch a bit
 	 * of time to act
 	 */
-	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
-		i40e_rxq_rearm(rxq);
+	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+		if (rxq->direct_rxrearm_enable)
+			i40e_rxq_direct_rearm(rxq);
+		else
+			i40e_rxq_rearm(rxq);
+	}
 
 	/* Before we start moving massive data around, check to see if
 	 * there is actually a packet available
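Finally, the mode targets the run-to-completion case described in the commit message, where a single lcore both receives on the mapped Rx queue and transmits on the paired Tx queue. A usage sketch of that loop, using the standard rte_eth_rx_burst()/rte_eth_tx_burst() calls; port, queue and mapping setup are assumed to have been done already and are not shown:

#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST 32

/* Single-lcore forwarding loop: Rx on (rx_port, queue), Tx on the
 * (tx_port, queue) that direct re-arm recycles buffers from.
 */
static __rte_noreturn void
fwd_loop(uint16_t rx_port, uint16_t tx_port, uint16_t queue)
{
	struct rte_mbuf *pkts[BURST];
	uint16_t nb_rx, nb_tx, i;

	for (;;) {
		nb_rx = rte_eth_rx_burst(rx_port, queue, pkts, BURST);
		if (nb_rx == 0)
			continue;

		nb_tx = rte_eth_tx_burst(tx_port, queue, pkts, nb_rx);

		/* drop whatever could not be sent */
		for (i = nb_tx; i < nb_rx; i++)
			rte_pktmbuf_free(pkts[i]);
	}
}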