[v1,2/5] net/i40e: enable direct rearm mode
Checks
Commit Message
For i40e driver, enable direct re-arm mode. This patch supports the case
of mapping Rx/Tx queues from the same single lcore.
Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
---
drivers/net/i40e/i40e_rxtx.h | 4 +
drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
drivers/net/i40e/i40e_rxtx_vec_avx2.c | 14 +-
drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
drivers/net/i40e/i40e_rxtx_vec_neon.c | 141 ++++++++++++-
drivers/net/i40e/i40e_rxtx_vec_sse.c | 170 ++++++++++++++-
6 files changed, 839 insertions(+), 8 deletions(-)
Comments
> For i40e driver, enable direct re-arm mode. This patch supports the case
> of mapping Rx/Tx queues from the same single lcore.
>
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> ---
> drivers/net/i40e/i40e_rxtx.h | 4 +
> drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++
> drivers/net/i40e/i40e_rxtx_vec_avx2.c | 14 +-
> drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-
> drivers/net/i40e/i40e_rxtx_vec_neon.c | 141 ++++++++++++-
> drivers/net/i40e/i40e_rxtx_vec_sse.c | 170 ++++++++++++++-
> 6 files changed, 839 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h
> index 5e6eecc501..1fdf4305f4 100644
> --- a/drivers/net/i40e/i40e_rxtx.h
> +++ b/drivers/net/i40e/i40e_rxtx.h
> @@ -102,6 +102,8 @@ struct i40e_rx_queue {
>
> uint16_t rxrearm_nb; /**< number of remaining to be re-armed */
> uint16_t rxrearm_start; /**< the idx we start the re-arming from */
> + uint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */
> + uint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */
> uint64_t mbuf_initializer; /**< value to init mbufs */
>
> uint16_t port_id; /**< device port ID */
> @@ -121,6 +123,8 @@ struct i40e_rx_queue {
> uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
> uint8_t dcb_tc; /**< Traffic class of rx queue */
> uint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */
> + /**< 0 if direct re-arm mode disabled, 1 when enabled */
> + bool direct_rxrearm_enable;
> const struct rte_memzone *mz;
> };
>
> diff --git a/drivers/net/i40e/i40e_rxtx_common_avx.h b/drivers/net/i40e/i40e_rxtx_common_avx.h
> index cfc1e63173..a742723e07 100644
> --- a/drivers/net/i40e/i40e_rxtx_common_avx.h
> +++ b/drivers/net/i40e/i40e_rxtx_common_avx.h
> @@ -209,6 +209,275 @@ i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
> /* Update the tail pointer on the NIC */
> I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> }
> +
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
> +{
> + struct rte_eth_dev *dev;
> + struct i40e_tx_queue *txq;
> + volatile union i40e_rx_desc *rxdp;
> + struct i40e_tx_entry *txep;
> + struct i40e_rx_entry *rxep;
> + struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
> + uint16_t tx_port_id, tx_queue_id;
> + uint16_t rx_id;
> + uint16_t i, n;
> + uint16_t nb_rearm = 0;
> +
> + rxdp = rxq->rx_ring + rxq->rxrearm_start;
> + rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> + tx_port_id = rxq->direct_rxrearm_port;
> + tx_queue_id = rxq->direct_rxrearm_queue;
> + dev = &rte_eth_devices[tx_port_id];
> + txq = dev->data->tx_queues[tx_queue_id];
> +
> + /* check Rx queue is able to take in the whole
> + * batch of free mbufs from Tx queue
> + */
> + if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> + /* check DD bits on threshold descriptor */
> + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> + goto mempool_bulk;
> + }
> +
> + if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
> + goto mempool_bulk;
I think all these checks (whether this mode can be enabled) should be done at
the config phase, not in the data-path.
> +
> + n = txq->tx_rs_thresh;
> +
> + /* first buffer to free from S/W ring is at index
> + * tx_next_dd - (tx_rs_thresh-1)
> + */
> + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
It really looks bad that the RX function accesses and modifies TXQ data
directly. It would be much better to hide the TXD checking/manipulation in a
separate TXQ function (txq_mbuf() or so) that the RX path can invoke.
> +
> + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> + /* directly put mbufs from Tx to Rx,
> + * and initialize the mbufs in vector
> + */
> + for (i = 0; i < n; i++)
> + rxep[i].mbuf = txep[i].mbuf;
> + } else {
> + for (i = 0; i < n; i++) {
> + m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> + /* ensure each Tx freed buffer is valid */
> + if (m[i] != NULL)
> + nb_rearm++;
> + }
> +
> + if (nb_rearm != n) {
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + if (txq->tx_next_dd >= txq->nb_tx_desc)
> + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
So if nb_rearm != 0 here, what would happen to the mbufs already collected in m[]?
Are you just dropping/forgetting them?
> +
> + goto mempool_bulk;
> + } else {
> + for (i = 0; i < n; i++)
> + rxep[i].mbuf = m[i];
> + }
> + }
> +
> + /* update counters for Tx */
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + if (txq->tx_next_dd >= txq->nb_tx_desc)
> + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> + } else {
I suppose the chunk of code below is just a copy&paste of the
existing i40e_rxq_rearm_common()?
If so, there is no point in duplicating it; better to just invoke it here
(I presume a bit of re-factoring would be needed for that).
Pretty much the same thoughts for the other rearm functions below.
> +mempool_bulk:
> + /* if TX did not free bufs into Rx sw-ring,
> + * get new bufs from mempool
> + */
> + n = RTE_I40E_RXQ_REARM_THRESH;
> +
> + /* Pull 'n' more MBUFs into the software ring */
> + if (rte_mempool_get_bulk(rxq->mp,
> + (void *)rxep,
> + RTE_I40E_RXQ_REARM_THRESH) < 0) {
> + if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
> + rxq->nb_rx_desc) {
> + __m128i dma_addr0;
> + dma_addr0 = _mm_setzero_si128();
> + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> + rxep[i].mbuf = &rxq->fake_mbuf;
> + _mm_store_si128((__m128i *)&rxdp[i].read,
> + dma_addr0);
> + }
> + }
> + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> + RTE_I40E_RXQ_REARM_THRESH;
> + return;
> + }
> + }
> +
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> + struct rte_mbuf *mb0, *mb1;
> + __m128i dma_addr0, dma_addr1;
> + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
> + RTE_PKTMBUF_HEADROOM);
> + /* Initialize the mbufs in vector, process 2 mbufs in one loop */
> + for (i = 0; i < n; i += 2, rxep += 2) {
> + __m128i vaddr0, vaddr1;
> +
> + mb0 = rxep[0].mbuf;
> + mb1 = rxep[1].mbuf;
> +
> + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> + offsetof(struct rte_mbuf, buf_addr) + 8);
> + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + /* convert pa to dma_addr hdr/data */
> + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
> +
> + /* add headroom to pa values */
> + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
> +
> + /* flush desc with pa dma_addr */
> + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
> + }
> +#else
> +#ifdef __AVX512VL__
> + if (avx512) {
> + struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
> + struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
> + __m512i dma_addr0_3, dma_addr4_7;
> + __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
> + /* Initialize the mbufs in vector, process 8 mbufs in one loop */
> + for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) {
> + __m128i vaddr0, vaddr1, vaddr2, vaddr3;
> + __m128i vaddr4, vaddr5, vaddr6, vaddr7;
> + __m256i vaddr0_1, vaddr2_3;
> + __m256i vaddr4_5, vaddr6_7;
> + __m512i vaddr0_3, vaddr4_7;
> +
> + mb0 = rxep[0].mbuf;
> + mb1 = rxep[1].mbuf;
> + mb2 = rxep[2].mbuf;
> + mb3 = rxep[3].mbuf;
> + mb4 = rxep[4].mbuf;
> + mb5 = rxep[5].mbuf;
> + mb6 = rxep[6].mbuf;
> + mb7 = rxep[7].mbuf;
> +
> + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> + offsetof(struct rte_mbuf, buf_addr) + 8);
> + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
> + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
> + vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
> + vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
> + vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
> + vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
> +
> + /**
> + * merge 0 & 1, by casting 0 to 256-bit and inserting 1
> + * into the high lanes. Similarly for 2 & 3, and so on.
> + */
> + vaddr0_1 =
> + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
> + vaddr1, 1);
> + vaddr2_3 =
> + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
> + vaddr3, 1);
> + vaddr4_5 =
> + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
> + vaddr5, 1);
> + vaddr6_7 =
> + _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
> + vaddr7, 1);
> + vaddr0_3 =
> + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
> + vaddr2_3, 1);
> + vaddr4_7 =
> + _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
> + vaddr6_7, 1);
> +
> + /* convert pa to dma_addr hdr/data */
> + dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
> + dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
> +
> + /* add headroom to pa values */
> + dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
> + dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
> +
> + /* flush desc with pa dma_addr */
> + _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
> + _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
> + }
> + } else {
> +#endif /* __AVX512VL__*/
> + struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
> + __m256i dma_addr0_1, dma_addr2_3;
> + __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
> + /* Initialize the mbufs in vector, process 4 mbufs in one loop */
> + for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) {
> + __m128i vaddr0, vaddr1, vaddr2, vaddr3;
> + __m256i vaddr0_1, vaddr2_3;
> +
> + mb0 = rxep[0].mbuf;
> + mb1 = rxep[1].mbuf;
> + mb2 = rxep[2].mbuf;
> + mb3 = rxep[3].mbuf;
> +
> + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> + offsetof(struct rte_mbuf, buf_addr) + 8);
> + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> + vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
> + vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
> +
> + /**
> + * merge 0 & 1, by casting 0 to 256-bit and inserting 1
> + * into the high lanes. Similarly for 2 & 3
> + */
> + vaddr0_1 = _mm256_inserti128_si256
> + (_mm256_castsi128_si256(vaddr0), vaddr1, 1);
> + vaddr2_3 = _mm256_inserti128_si256
> + (_mm256_castsi128_si256(vaddr2), vaddr3, 1);
> +
> + /* convert pa to dma_addr hdr/data */
> + dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
> + dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
> +
> + /* add headroom to pa values */
> + dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
> + dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);
> +
> + /* flush desc with pa dma_addr */
> + _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);
> + _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);
> + }
> + }
> +
> +#endif
> +
> + /* Update the descriptor initializer index */
> + rxq->rxrearm_start += n;
> + rx_id = rxq->rxrearm_start - 1;
> +
> + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> + if (!rxq->rxrearm_start)
> + rx_id = rxq->nb_rx_desc - 1;
> + else
> + rx_id = rxq->rxrearm_start - 1;
> + }
> +
> + rxq->rxrearm_nb -= n;
> +
> + /* Update the tail pointer on the NIC */
> + I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> +}
> #endif /* __AVX2__*/
>
> #endif /*_I40E_RXTX_COMMON_AVX_H_*/
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> index c73b2a321b..fcb7ba0273 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
> @@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
> return i40e_rxq_rearm_common(rxq, false);
> }
>
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> + return i40e_rxq_direct_rearm_common(rxq, false);
> +}
> +
> #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> /* Handles 32B descriptor FDIR ID processing:
> * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
> @@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> /* See if we need to rearm the RX queue - gives the prefetch a bit
> * of time to act
> */
> - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> - i40e_rxq_rearm(rxq);
> + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> + if (rxq->direct_rxrearm_enable)
> + i40e_rxq_direct_rearm(rxq);
> + else
> + i40e_rxq_rearm(rxq);
> + }
>
> /* Before we start moving massive data around, check to see if
> * there is actually a packet available
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 2e8a3f0df6..d967095edc 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> @@ -21,6 +21,12 @@
>
> #define RTE_I40E_DESCS_PER_LOOP_AVX 8
>
> +enum i40e_direct_rearm_type_value {
> + I40E_DIRECT_REARM_TYPE_NORMAL = 0x0,
> + I40E_DIRECT_REARM_TYPE_FAST_FREE = 0x1,
> + I40E_DIRECT_REARM_TYPE_PRE_FREE = 0x2,
> +};
> +
> static __rte_always_inline void
> i40e_rxq_rearm(struct i40e_rx_queue *rxq)
> {
> @@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
> I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> }
>
> +static __rte_always_inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> + struct rte_eth_dev *dev;
> + struct i40e_tx_queue *txq;
> + volatile union i40e_rx_desc *rxdp;
> + struct i40e_vec_tx_entry *txep;
> + struct i40e_rx_entry *rxep;
> + struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
> + uint16_t tx_port_id, tx_queue_id;
> + uint16_t rx_id;
> + uint16_t i, n;
> + uint16_t j = 0;
> + uint16_t nb_rearm = 0;
> + enum i40e_direct_rearm_type_value type;
> + struct rte_mempool_cache *cache = NULL;
> +
> + rxdp = rxq->rx_ring + rxq->rxrearm_start;
> + rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> + tx_port_id = rxq->direct_rxrearm_port;
> + tx_queue_id = rxq->direct_rxrearm_queue;
> + dev = &rte_eth_devices[tx_port_id];
> + txq = dev->data->tx_queues[tx_queue_id];
> +
> + /* check Rx queue is able to take in the whole
> + * batch of free mbufs from Tx queue
> + */
> + if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> + /* check DD bits on threshold descriptor */
> + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> + goto mempool_bulk;
> + }
> +
> + if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
> + goto mempool_bulk;
> +
> + n = txq->tx_rs_thresh;
> +
> + /* first buffer to free from S/W ring is at index
> + * tx_next_dd - (tx_rs_thresh-1)
> + */
> + txep = (void *)txq->sw_ring;
> + txep += txq->tx_next_dd - (n - 1);
> +
> + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> + /* directly put mbufs from Tx to Rx */
> + uint32_t copied = 0;
> + /* n is multiple of 32 */
> + while (copied < n) {
> + const __m512i a = _mm512_load_si512(&txep[copied]);
> + const __m512i b = _mm512_load_si512(&txep[copied + 8]);
> + const __m512i c = _mm512_load_si512(&txep[copied + 16]);
> + const __m512i d = _mm512_load_si512(&txep[copied + 24]);
> +
> + _mm512_storeu_si512(&rxep[copied], a);
> + _mm512_storeu_si512(&rxep[copied + 8], b);
> + _mm512_storeu_si512(&rxep[copied + 16], c);
> + _mm512_storeu_si512(&rxep[copied + 24], d);
> + copied += 32;
> + }
> + type = I40E_DIRECT_REARM_TYPE_FAST_FREE;
> + } else {
> + for (i = 0; i < n; i++) {
> + m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> + /* ensure each Tx freed buffer is valid */
> + if (m[i] != NULL)
> + nb_rearm++;
> + }
> +
> + if (nb_rearm != n) {
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + if (txq->tx_next_dd >= txq->nb_tx_desc)
> + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> +
> + goto mempool_bulk;
> + } else {
> + type = I40E_DIRECT_REARM_TYPE_PRE_FREE;
> + }
> + }
> +
> + /* update counters for Tx */
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + if (txq->tx_next_dd >= txq->nb_tx_desc)
> + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> + } else {
> +mempool_bulk:
> + cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());
> +
> + if (unlikely(!cache))
> + return i40e_rxq_rearm_common(rxq, true);
> +
> + n = RTE_I40E_RXQ_REARM_THRESH;
> +
> + /* We need to pull 'n' more MBUFs into the software ring from mempool
> + * We inline the mempool function here, so we can vectorize the copy
> + * from the cache into the shadow ring.
> + */
> +
> + if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
> + /* No. Backfill the cache first, and then fill from it */
> + uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
> + cache->len);
> +
> + /* How many do we require
> + * i.e. number to fill the cache + the request
> + */
> + int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
> + &cache->objs[cache->len], req);
> + if (ret == 0) {
> + cache->len += req;
> + } else {
> + if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
> + rxq->nb_rx_desc) {
> + __m128i dma_addr0;
> +
> + dma_addr0 = _mm_setzero_si128();
> + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> + rxep[i].mbuf = &rxq->fake_mbuf;
> + _mm_store_si128
> + ((__m128i *)&rxdp[i].read,
> + dma_addr0);
> + }
> + }
> + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> + RTE_I40E_RXQ_REARM_THRESH;
> + return;
> + }
> + }
> +
> + type = I40E_DIRECT_REARM_TYPE_NORMAL;
> + }
> +
> + const __m512i iova_offsets = _mm512_set1_epi64
> + (offsetof(struct rte_mbuf, buf_iova));
> + const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
> +
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> + /* to shuffle the addresses to correct slots. Values 4-7 will contain
> + * zeros, so use 7 for a zero-value.
> + */
> + const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
> +#else
> + const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
> +#endif
> +
> + __m512i mbuf_ptrs;
> +
> + /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
> + * from mempool cache and populating both shadow and HW rings
> + */
> + for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
> + switch (type) {
> + case I40E_DIRECT_REARM_TYPE_FAST_FREE:
> + mbuf_ptrs = _mm512_loadu_si512(rxep);
> + break;
> + case I40E_DIRECT_REARM_TYPE_PRE_FREE:
> + mbuf_ptrs = _mm512_loadu_si512(&m[j]);
> + _mm512_store_si512(rxep, mbuf_ptrs);
> + j += 8;
> + break;
> + case I40E_DIRECT_REARM_TYPE_NORMAL:
> + mbuf_ptrs = _mm512_loadu_si512
> + (&cache->objs[cache->len - 8]);
> + _mm512_store_si512(rxep, mbuf_ptrs);
> + cache->len -= 8;
> + break;
> + }
> +
> + /* gather iova of mbuf0-7 into one zmm reg */
> + const __m512i iova_base_addrs = _mm512_i64gather_epi64
> + (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
> + 0, /* base */
> + 1 /* scale */);
> + const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
> + headroom);
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> + const __m512i iovas0 = _mm512_castsi256_si512
> + (_mm512_extracti64x4_epi64(iova_addrs, 0));
> + const __m512i iovas1 = _mm512_castsi256_si512
> + (_mm512_extracti64x4_epi64(iova_addrs, 1));
> +
> + /* permute leaves desc 2-3 addresses in header address slots 0-1
> + * but these are ignored by driver since header split not
> + * enabled. Similarly for desc 4 & 5.
> + */
> + const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
> + (permute_idx, iovas0);
> + const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
> +
> + const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
> + (permute_idx, iovas1);
> + const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
> +
> + _mm512_store_si512((void *)rxdp, desc_rd_0_1);
> + _mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
> + _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
> + _mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
> +#else
> + /* permute leaves desc 4-7 addresses in header address slots 0-3
> + * but these are ignored by driver since header split not
> + * enabled.
> + */
> + const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
> + (permute_idx, iova_addrs);
> + const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
> +
> + _mm512_store_si512((void *)rxdp, desc_rd_0_3);
> + _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
> +#endif
> + rxdp += 8, rxep += 8;
> + }
> +
> + /* Update the descriptor initializer index */
> + rxq->rxrearm_start += n;
> + rx_id = rxq->rxrearm_start - 1;
> +
> + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> + if (!rxq->rxrearm_start)
> + rx_id = rxq->nb_rx_desc - 1;
> + else
> + rx_id = rxq->rxrearm_start - 1;
> + }
> +
> + rxq->rxrearm_nb -= n;
> +
> + /* Update the tail pointer on the NIC */
> + I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> +}
> +
> #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> /* Handles 32B descriptor FDIR ID processing:
> * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
> @@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> /* See if we need to rearm the RX queue - gives the prefetch a bit
> * of time to act
> */
> - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> - i40e_rxq_rearm(rxq);
> + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> + if (rxq->direct_rxrearm_enable)
> + i40e_rxq_direct_rearm(rxq);
> + else
> + i40e_rxq_rearm(rxq);
> + }
>
> /* Before we start moving massive data around, check to see if
> * there is actually a packet available
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> index fa9e6582c5..dc78e3c90b 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_neon.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c
> @@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
> I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
> }
>
> +static inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> + struct rte_eth_dev *dev;
> + struct i40e_tx_queue *txq;
> + volatile union i40e_rx_desc *rxdp;
> + struct i40e_tx_entry *txep;
> + struct i40e_rx_entry *rxep;
> + uint16_t tx_port_id, tx_queue_id;
> + uint16_t rx_id;
> + struct rte_mbuf *mb0, *mb1, *m;
> + uint64x2_t dma_addr0, dma_addr1;
> + uint64x2_t zero = vdupq_n_u64(0);
> + uint64_t paddr;
> + uint16_t i, n;
> + uint16_t nb_rearm = 0;
> +
> + rxdp = rxq->rx_ring + rxq->rxrearm_start;
> + rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> + tx_port_id = rxq->direct_rxrearm_port;
> + tx_queue_id = rxq->direct_rxrearm_queue;
> + dev = &rte_eth_devices[tx_port_id];
> + txq = dev->data->tx_queues[tx_queue_id];
> +
> + /* check Rx queue is able to take in the whole
> + * batch of free mbufs from Tx queue
> + */
> + if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> + /* check DD bits on threshold descriptor */
> + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> + goto mempool_bulk;
> + }
> +
> + n = txq->tx_rs_thresh;
> +
> + /* first buffer to free from S/W ring is at index
> + * tx_next_dd - (tx_rs_thresh-1)
> + */
> + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
> +
> + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> + /* directly put mbufs from Tx to Rx,
> + * and initialize the mbufs in vector
> + */
> + for (i = 0; i < n; i++, rxep++, txep++) {
> + rxep[0].mbuf = txep[0].mbuf;
> +
> + /* Initialize rxdp descs */
> + mb0 = txep[0].mbuf;
> +
> + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
> + dma_addr0 = vdupq_n_u64(paddr);
> + /* flush desc with pa dma_addr */
> + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> + }
> + } else {
> + for (i = 0; i < n; i++) {
> + m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> + if (m != NULL) {
> + rxep[i].mbuf = m;
> +
> + /* Initialize rxdp descs */
> + paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM;
> + dma_addr0 = vdupq_n_u64(paddr);
> + /* flush desc with pa dma_addr */
> + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> + nb_rearm++;
> + }
> + }
> + n = nb_rearm;
> + }
> +
> + /* update counters for Tx */
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + if (txq->tx_next_dd >= txq->nb_tx_desc)
> + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> + } else {
> +mempool_bulk:
> + /* if TX did not free bufs into Rx sw-ring,
> + * get new bufs from mempool
> + */
> + n = RTE_I40E_RXQ_REARM_THRESH;
> + if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) {
> + if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
> + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> + rxep[i].mbuf = &rxq->fake_mbuf;
> + vst1q_u64((uint64_t *)&rxdp[i].read, zero);
> + }
> + }
> + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n;
> + return;
> + }
> +
> + /* Initialize the mbufs in vector, process 2 mbufs in one loop */
> + for (i = 0; i < n; i += 2, rxep += 2) {
> + mb0 = rxep[0].mbuf;
> + mb1 = rxep[1].mbuf;
> +
> + paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
> + dma_addr0 = vdupq_n_u64(paddr);
> + /* flush desc with pa dma_addr */
> + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
> +
> + paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
> + dma_addr1 = vdupq_n_u64(paddr);
> + /* flush desc with pa dma_addr */
> + vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
> + }
> + }
> +
> + /* Update the descriptor initializer index */
> + rxq->rxrearm_start += n;
> + rx_id = rxq->rxrearm_start - 1;
> +
> + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> + if (!rxq->rxrearm_start)
> + rx_id = rxq->nb_rx_desc - 1;
> + else
> + rx_id = rxq->rxrearm_start - 1;
> + }
> +
> + rxq->rxrearm_nb -= n;
> +
> + rte_io_wmb();
> + /* Update the tail pointer on the NIC */
> + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
> +}
> +
> #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> /* NEON version of FDIR mark extraction for 4 32B descriptors at a time */
> static inline uint32x4_t
> @@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
> /* See if we need to rearm the RX queue - gives the prefetch a bit
> * of time to act
> */
> - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> - i40e_rxq_rearm(rxq);
> + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> + if (rxq->direct_rxrearm_enable)
> + i40e_rxq_direct_rearm(rxq);
> + else
> + i40e_rxq_rearm(rxq);
> + }
>
> /* Before we start moving massive data around, check to see if
> * there is actually a packet available
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c
> index 3782e8052f..b2f1ab2c8d 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
> @@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
> I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
> }
>
> +static inline void
> +i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
> +{
> + struct rte_eth_dev *dev;
> + struct i40e_tx_queue *txq;
> + volatile union i40e_rx_desc *rxdp;
> + struct i40e_tx_entry *txep;
> + struct i40e_rx_entry *rxep;
> + uint16_t tx_port_id, tx_queue_id;
> + uint16_t rx_id;
> + struct rte_mbuf *mb0, *mb1, *m;
> + __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
> + RTE_PKTMBUF_HEADROOM);
> + __m128i dma_addr0, dma_addr1;
> + __m128i vaddr0, vaddr1;
> + uint16_t i, n;
> + uint16_t nb_rearm = 0;
> +
> + rxdp = rxq->rx_ring + rxq->rxrearm_start;
> + rxep = &rxq->sw_ring[rxq->rxrearm_start];
> +
> + tx_port_id = rxq->direct_rxrearm_port;
> + tx_queue_id = rxq->direct_rxrearm_queue;
> + dev = &rte_eth_devices[tx_port_id];
> + txq = dev->data->tx_queues[tx_queue_id];
> +
> + /* check Rx queue is able to take in the whole
> + * batch of free mbufs from Tx queue
> + */
> + if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
> + /* check DD bits on threshold descriptor */
> + if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
> + rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
> + rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
> + goto mempool_bulk;
> + }
> +
> + n = txq->tx_rs_thresh;
> +
> + /* first buffer to free from S/W ring is at index
> + * tx_next_dd - (tx_rs_thresh-1)
> + */
> + txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
> +
> + if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
> + /* directly put mbufs from Tx to Rx,
> + * and initialize the mbufs in vector
> + */
> + for (i = 0; i < n; i++, rxep++, txep++) {
> + rxep[0].mbuf = txep[0].mbuf;
> +
> + /* Initialize rxdp descs */
> + mb0 = txep[0].mbuf;
> +
> + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> + offsetof(struct rte_mbuf, buf_addr) + 8);
> + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +
> + /* convert pa to dma_addr hdr/data */
> + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +
> + /* add headroom to pa values */
> + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +
> + /* flush desc with pa dma_addr */
> + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> + }
> + } else {
> + for (i = 0; i < n; i++) {
> + m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
> + if (m != NULL) {
> + rxep[i].mbuf = m;
> +
> + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> + offsetof(struct rte_mbuf, buf_addr) + 8);
> + vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr);
> +
> + /* convert pa to dma_addr hdr/data */
> + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +
> + /* add headroom to pa values */
> + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +
> + /* flush desc with pa dma_addr */
> + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> + nb_rearm++;
> + }
> + }
> + n = nb_rearm;
> + }
> +
> + /* update counters for Tx */
> + txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
> + txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
> + if (txq->tx_next_dd >= txq->nb_tx_desc)
> + txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
> + } else {
> +mempool_bulk:
> + /* if TX did not free bufs into Rx sw-ring,
> + * get new bufs from mempool
> + */
> + n = RTE_I40E_RXQ_REARM_THRESH;
> + /* Pull 'n' more MBUFs into the software ring */
> + if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) {
> + if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
> + dma_addr0 = _mm_setzero_si128();
> + for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> + rxep[i].mbuf = &rxq->fake_mbuf;
> + _mm_store_si128((__m128i *)&rxdp[i].read,
> + dma_addr0);
> + }
> + }
> + rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
> + RTE_I40E_RXQ_REARM_THRESH;
> + return;
> + }
> +
> + /* Initialize the mbufs in vector, process 2 mbufs in one loop */
> + for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
> + mb0 = rxep[0].mbuf;
> + mb1 = rxep[1].mbuf;
> +
> + /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
> + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> + offsetof(struct rte_mbuf, buf_addr) + 8);
> + vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> + vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> + /* convert pa to dma_addr hdr/data */
> + dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> + dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
> +
> + /* add headroom to pa values */
> + dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> + dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
> +
> + /* flush desc with pa dma_addr */
> + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> + _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
> + }
> + }
> +
> + /* Update the descriptor initializer index */
> + rxq->rxrearm_start += n;
> + rx_id = rxq->rxrearm_start - 1;
> +
> + if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> + rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
> + if (!rxq->rxrearm_start)
> + rx_id = rxq->nb_rx_desc - 1;
> + else
> + rx_id = rxq->rxrearm_start - 1;
> + }
> +
> + rxq->rxrearm_nb -= n;
> +
> + /* Update the tail pointer on the NIC */
> + I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
> +}
> +
> #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
> static inline __m128i
> @@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
> /* See if we need to rearm the RX queue - gives the prefetch a bit
> * of time to act
> */
> - if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
> - i40e_rxq_rearm(rxq);
> + if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
> + if (rxq->direct_rxrearm_enable)
> + i40e_rxq_direct_rearm(rxq);
> + else
> + i40e_rxq_rearm(rxq);
> + }
>
> /* Before we start moving massive data around, check to see if
> * there is actually a packet available
@@ -102,6 +102,8 @@ struct i40e_rx_queue {
uint16_t rxrearm_nb; /**< number of remaining to be re-armed */
uint16_t rxrearm_start; /**< the idx we start the re-arming from */
+	uint16_t direct_rxrearm_port; /**< device TX port ID for direct re-arm mode */
+	uint16_t direct_rxrearm_queue; /**< TX queue index for direct re-arm mode */
uint64_t mbuf_initializer; /**< value to init mbufs */
uint16_t port_id; /**< device port ID */
@@ -121,6 +123,8 @@ struct i40e_rx_queue {
uint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */
uint8_t dcb_tc; /**< Traffic class of rx queue */
uint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */
+	/** 0 if direct re-arm mode disabled, 1 when enabled */
+	bool direct_rxrearm_enable;
const struct rte_memzone *mz;
};
@@ -209,6 +209,275 @@ i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
/* Update the tail pointer on the NIC */
I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}
+
+static __rte_always_inline void
+i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)
+{
+ struct rte_eth_dev *dev;
+ struct i40e_tx_queue *txq;
+ volatile union i40e_rx_desc *rxdp;
+ struct i40e_tx_entry *txep;
+ struct i40e_rx_entry *rxep;
+ struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+ uint16_t tx_port_id, tx_queue_id;
+ uint16_t rx_id;
+ uint16_t i, n;
+ uint16_t nb_rearm = 0;
+
+ rxdp = rxq->rx_ring + rxq->rxrearm_start;
+ rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+ tx_port_id = rxq->direct_rxrearm_port;
+ tx_queue_id = rxq->direct_rxrearm_queue;
+ dev = &rte_eth_devices[tx_port_id];
+ txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check that the Rx queue can take in the whole
+	 * batch of mbufs freed by the Tx queue
+	 */
+ if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+ /* check DD bits on threshold descriptor */
+ if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+ rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+ rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+ goto mempool_bulk;
+ }
+
+ if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
+ goto mempool_bulk;
+
+ n = txq->tx_rs_thresh;
+
+ /* first buffer to free from S/W ring is at index
+ * tx_next_dd - (tx_rs_thresh-1)
+ */
+ txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+ if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+ /* directly put mbufs from Tx to Rx,
+ * and initialize the mbufs in vector
+ */
+ for (i = 0; i < n; i++)
+ rxep[i].mbuf = txep[i].mbuf;
+ } else {
+ for (i = 0; i < n; i++) {
+ m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ /* ensure each Tx freed buffer is valid */
+ if (m[i] != NULL)
+ nb_rearm++;
+ }
+
+ if (nb_rearm != n) {
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ goto mempool_bulk;
+ } else {
+ for (i = 0; i < n; i++)
+ rxep[i].mbuf = m[i];
+ }
+ }
+
+ /* update counters for Tx */
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+ } else {
+mempool_bulk:
+ /* if TX did not free bufs into Rx sw-ring,
+ * get new bufs from mempool
+ */
+ n = RTE_I40E_RXQ_REARM_THRESH;
+
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mempool_get_bulk(rxq->mp,
+ (void *)rxep,
+ RTE_I40E_RXQ_REARM_THRESH) < 0) {
+ if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+ rxq->nb_rx_desc) {
+ __m128i dma_addr0;
+ dma_addr0 = _mm_setzero_si128();
+ for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+ rxep[i].mbuf = &rxq->fake_mbuf;
+ _mm_store_si128((__m128i *)&rxdp[i].read,
+ dma_addr0);
+ }
+ }
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+ RTE_I40E_RXQ_REARM_THRESH;
+ return;
+ }
+ }
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+ struct rte_mbuf *mb0, *mb1;
+ __m128i dma_addr0, dma_addr1;
+ __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+ RTE_PKTMBUF_HEADROOM);
+ /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+ for (i = 0; i < n; i += 2, rxep += 2) {
+ __m128i vaddr0, vaddr1;
+
+ mb0 = rxep[0].mbuf;
+ mb1 = rxep[1].mbuf;
+
+ /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+ offsetof(struct rte_mbuf, buf_addr) + 8);
+ vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ /* convert pa to dma_addr hdr/data */
+ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+ dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+ /* add headroom to pa values */
+ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+ dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+ /* flush desc with pa dma_addr */
+ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+ }
+#else
+#ifdef __AVX512VL__
+ if (avx512) {
+ struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+ struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+ __m512i dma_addr0_3, dma_addr4_7;
+ __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop */
+ for (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) {
+ __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+ __m128i vaddr4, vaddr5, vaddr6, vaddr7;
+ __m256i vaddr0_1, vaddr2_3;
+ __m256i vaddr4_5, vaddr6_7;
+ __m512i vaddr0_3, vaddr4_7;
+
+ mb0 = rxep[0].mbuf;
+ mb1 = rxep[1].mbuf;
+ mb2 = rxep[2].mbuf;
+ mb3 = rxep[3].mbuf;
+ mb4 = rxep[4].mbuf;
+ mb5 = rxep[5].mbuf;
+ mb6 = rxep[6].mbuf;
+ mb7 = rxep[7].mbuf;
+
+ /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+ offsetof(struct rte_mbuf, buf_addr) + 8);
+ vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+ vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+ vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+ vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+ vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+ vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
+ vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
+
+ /**
+ * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+ * into the high lanes. Similarly for 2 & 3, and so on.
+ */
+ vaddr0_1 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
+ vaddr1, 1);
+ vaddr2_3 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
+ vaddr3, 1);
+ vaddr4_5 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
+ vaddr5, 1);
+ vaddr6_7 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
+ vaddr7, 1);
+ vaddr0_3 =
+ _mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
+ vaddr2_3, 1);
+ vaddr4_7 =
+ _mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
+ vaddr6_7, 1);
+
+ /* convert pa to dma_addr hdr/data */
+ dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
+ dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
+
+ /* add headroom to pa values */
+ dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
+ dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
+
+ /* flush desc with pa dma_addr */
+ _mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
+ _mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
+ }
+ } else {
+#endif /* __AVX512VL__*/
+ struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+ __m256i dma_addr0_1, dma_addr2_3;
+ __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
+ /* Initialize the mbufs in vector, process 4 mbufs in one loop */
+ for (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) {
+ __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+ __m256i vaddr0_1, vaddr2_3;
+
+ mb0 = rxep[0].mbuf;
+ mb1 = rxep[1].mbuf;
+ mb2 = rxep[2].mbuf;
+ mb3 = rxep[3].mbuf;
+
+ /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+ offsetof(struct rte_mbuf, buf_addr) + 8);
+ vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+ vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+ vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+
+ /**
+ * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+ * into the high lanes. Similarly for 2 & 3
+ */
+ vaddr0_1 = _mm256_inserti128_si256
+ (_mm256_castsi128_si256(vaddr0), vaddr1, 1);
+ vaddr2_3 = _mm256_inserti128_si256
+ (_mm256_castsi128_si256(vaddr2), vaddr3, 1);
+
+ /* convert pa to dma_addr hdr/data */
+ dma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);
+ dma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);
+
+ /* add headroom to pa values */
+ dma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);
+ dma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);
+
+ /* flush desc with pa dma_addr */
+ _mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);
+ _mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);
+ }
+ }
+
+#endif
+
+ /* Update the descriptor initializer index */
+ rxq->rxrearm_start += n;
+ rx_id = rxq->rxrearm_start - 1;
+
+ if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+ rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+ if (!rxq->rxrearm_start)
+ rx_id = rxq->nb_rx_desc - 1;
+ else
+ rx_id = rxq->rxrearm_start - 1;
+ }
+
+ rxq->rxrearm_nb -= n;
+
+ /* Update the tail pointer on the NIC */
+ I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
#endif /* __AVX2__*/
#endif /*_I40E_RXTX_COMMON_AVX_H_*/
@@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
return i40e_rxq_rearm_common(rxq, false);
}
+static __rte_always_inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+ return i40e_rxq_direct_rearm_common(rxq, false);
+}
+
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
/* Handles 32B descriptor FDIR ID processing:
* rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
@@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* See if we need to rearm the RX queue - gives the prefetch a bit
* of time to act
*/
- if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
- i40e_rxq_rearm(rxq);
+ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+ if (rxq->direct_rxrearm_enable)
+ i40e_rxq_direct_rearm(rxq);
+ else
+ i40e_rxq_rearm(rxq);
+ }
/* Before we start moving massive data around, check to see if
* there is actually a packet available
@@ -21,6 +21,12 @@
#define RTE_I40E_DESCS_PER_LOOP_AVX 8
+enum i40e_direct_rearm_type_value {
+ I40E_DIRECT_REARM_TYPE_NORMAL = 0x0,
+ I40E_DIRECT_REARM_TYPE_FAST_FREE = 0x1,
+ I40E_DIRECT_REARM_TYPE_PRE_FREE = 0x2,
+};
+
static __rte_always_inline void
i40e_rxq_rearm(struct i40e_rx_queue *rxq)
{
@@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}
+static __rte_always_inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+ struct rte_eth_dev *dev;
+ struct i40e_tx_queue *txq;
+ volatile union i40e_rx_desc *rxdp;
+ struct i40e_vec_tx_entry *txep;
+ struct i40e_rx_entry *rxep;
+ struct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];
+ uint16_t tx_port_id, tx_queue_id;
+ uint16_t rx_id;
+ uint16_t i, n;
+ uint16_t j = 0;
+ uint16_t nb_rearm = 0;
+ enum i40e_direct_rearm_type_value type;
+ struct rte_mempool_cache *cache = NULL;
+
+ rxdp = rxq->rx_ring + rxq->rxrearm_start;
+ rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+ tx_port_id = rxq->direct_rxrearm_port;
+ tx_queue_id = rxq->direct_rxrearm_queue;
+ dev = &rte_eth_devices[tx_port_id];
+ txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check that the Rx queue can take in the whole
+	 * batch of mbufs freed by the Tx queue
+	 */
+ if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+ /* check DD bits on threshold descriptor */
+ if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+ rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+ rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+ goto mempool_bulk;
+ }
+
+ if (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)
+ goto mempool_bulk;
+
+ n = txq->tx_rs_thresh;
+
+ /* first buffer to free from S/W ring is at index
+ * tx_next_dd - (tx_rs_thresh-1)
+ */
+ txep = (void *)txq->sw_ring;
+ txep += txq->tx_next_dd - (n - 1);
+
+ if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+ /* directly put mbufs from Tx to Rx */
+ uint32_t copied = 0;
+ /* n is multiple of 32 */
+ while (copied < n) {
+ const __m512i a = _mm512_load_si512(&txep[copied]);
+ const __m512i b = _mm512_load_si512(&txep[copied + 8]);
+ const __m512i c = _mm512_load_si512(&txep[copied + 16]);
+ const __m512i d = _mm512_load_si512(&txep[copied + 24]);
+
+ _mm512_storeu_si512(&rxep[copied], a);
+ _mm512_storeu_si512(&rxep[copied + 8], b);
+ _mm512_storeu_si512(&rxep[copied + 16], c);
+ _mm512_storeu_si512(&rxep[copied + 24], d);
+ copied += 32;
+ }
+ type = I40E_DIRECT_REARM_TYPE_FAST_FREE;
+ } else {
+ for (i = 0; i < n; i++) {
+ m[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ /* ensure each Tx freed buffer is valid */
+ if (m[i] != NULL)
+ nb_rearm++;
+ }
+
+ if (nb_rearm != n) {
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ goto mempool_bulk;
+ } else {
+ type = I40E_DIRECT_REARM_TYPE_PRE_FREE;
+ }
+ }
+
+ /* update counters for Tx */
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+ } else {
+mempool_bulk:
+ cache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());
+
+ if (unlikely(!cache))
+ return i40e_rxq_rearm_common(rxq, true);
+
+ n = RTE_I40E_RXQ_REARM_THRESH;
+
+ /* We need to pull 'n' more MBUFs into the software ring from mempool
+ * We inline the mempool function here, so we can vectorize the copy
+ * from the cache into the shadow ring.
+ */
+
+ if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
+			/* Not enough in cache: backfill the cache first, then fill from it */
+ uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
+ cache->len);
+
+ /* How many do we require
+ * i.e. number to fill the cache + the request
+ */
+ int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
+ &cache->objs[cache->len], req);
+ if (ret == 0) {
+ cache->len += req;
+ } else {
+ if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+ rxq->nb_rx_desc) {
+ __m128i dma_addr0;
+
+ dma_addr0 = _mm_setzero_si128();
+ for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+ rxep[i].mbuf = &rxq->fake_mbuf;
+ _mm_store_si128
+ ((__m128i *)&rxdp[i].read,
+ dma_addr0);
+ }
+ }
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+ RTE_I40E_RXQ_REARM_THRESH;
+ return;
+ }
+ }
+
+ type = I40E_DIRECT_REARM_TYPE_NORMAL;
+ }
+
+ const __m512i iova_offsets = _mm512_set1_epi64
+ (offsetof(struct rte_mbuf, buf_iova));
+ const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+ /* to shuffle the addresses to correct slots. Values 4-7 will contain
+ * zeros, so use 7 for a zero-value.
+ */
+ const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
+#else
+ const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
+#endif
+
+ __m512i mbuf_ptrs;
+
+ /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
+ * from mempool cache and populating both shadow and HW rings
+ */
+ for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
+ switch (type) {
+ case I40E_DIRECT_REARM_TYPE_FAST_FREE:
+ mbuf_ptrs = _mm512_loadu_si512(rxep);
+ break;
+ case I40E_DIRECT_REARM_TYPE_PRE_FREE:
+ mbuf_ptrs = _mm512_loadu_si512(&m[j]);
+ _mm512_store_si512(rxep, mbuf_ptrs);
+ j += 8;
+ break;
+ case I40E_DIRECT_REARM_TYPE_NORMAL:
+ mbuf_ptrs = _mm512_loadu_si512
+ (&cache->objs[cache->len - 8]);
+ _mm512_store_si512(rxep, mbuf_ptrs);
+ cache->len -= 8;
+ break;
+ }
+
+ /* gather iova of mbuf0-7 into one zmm reg */
+ const __m512i iova_base_addrs = _mm512_i64gather_epi64
+ (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
+ 0, /* base */
+ 1 /* scale */);
+ const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
+ headroom);
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+ const __m512i iovas0 = _mm512_castsi256_si512
+ (_mm512_extracti64x4_epi64(iova_addrs, 0));
+ const __m512i iovas1 = _mm512_castsi256_si512
+ (_mm512_extracti64x4_epi64(iova_addrs, 1));
+
+ /* permute leaves desc 2-3 addresses in header address slots 0-1
+ * but these are ignored by driver since header split not
+ * enabled. Similarly for desc 4 & 5.
+ */
+ const __m512i desc_rd_0_1 = _mm512_permutexvar_epi64
+ (permute_idx, iovas0);
+ const __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);
+
+ const __m512i desc_rd_4_5 = _mm512_permutexvar_epi64
+ (permute_idx, iovas1);
+ const __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);
+
+ _mm512_store_si512((void *)rxdp, desc_rd_0_1);
+ _mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);
+ _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);
+ _mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);
+#else
+ /* permute leaves desc 4-7 addresses in header address slots 0-3
+ * but these are ignored by driver since header split not
+ * enabled.
+ */
+ const __m512i desc_rd_0_3 = _mm512_permutexvar_epi64
+ (permute_idx, iova_addrs);
+ const __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);
+
+ _mm512_store_si512((void *)rxdp, desc_rd_0_3);
+ _mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);
+#endif
+ rxdp += 8, rxep += 8;
+ }
+
+ /* Update the descriptor initializer index */
+ rxq->rxrearm_start += n;
+ rx_id = rxq->rxrearm_start - 1;
+
+ if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+ rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+ if (!rxq->rxrearm_start)
+ rx_id = rxq->nb_rx_desc - 1;
+ else
+ rx_id = rxq->rxrearm_start - 1;
+ }
+
+ rxq->rxrearm_nb -= n;
+
+ /* Update the tail pointer on the NIC */
+ I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
+}
+
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
/* Handles 32B descriptor FDIR ID processing:
* rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
@@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* See if we need to rearm the RX queue - gives the prefetch a bit
* of time to act
*/
- if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
- i40e_rxq_rearm(rxq);
+ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+ if (rxq->direct_rxrearm_enable)
+ i40e_rxq_direct_rearm(rxq);
+ else
+ i40e_rxq_rearm(rxq);
+ }
/* Before we start moving massive data around, check to see if
* there is actually a packet available
@@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
}
+static inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+ struct rte_eth_dev *dev;
+ struct i40e_tx_queue *txq;
+ volatile union i40e_rx_desc *rxdp;
+ struct i40e_tx_entry *txep;
+ struct i40e_rx_entry *rxep;
+ uint16_t tx_port_id, tx_queue_id;
+ uint16_t rx_id;
+ struct rte_mbuf *mb0, *mb1, *m;
+ uint64x2_t dma_addr0, dma_addr1;
+ uint64x2_t zero = vdupq_n_u64(0);
+ uint64_t paddr;
+ uint16_t i, n;
+ uint16_t nb_rearm = 0;
+
+ rxdp = rxq->rx_ring + rxq->rxrearm_start;
+ rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+ tx_port_id = rxq->direct_rxrearm_port;
+ tx_queue_id = rxq->direct_rxrearm_queue;
+ dev = &rte_eth_devices[tx_port_id];
+ txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check that the Rx queue can take in the whole
+	 * batch of mbufs freed by the Tx queue
+	 */
+ if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+ /* check DD bits on threshold descriptor */
+ if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+ rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+ rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+ goto mempool_bulk;
+ }
+
+ n = txq->tx_rs_thresh;
+
+ /* first buffer to free from S/W ring is at index
+ * tx_next_dd - (tx_rs_thresh-1)
+ */
+ txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+ if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+ /* directly put mbufs from Tx to Rx,
+ * and initialize the mbufs in vector
+ */
+ for (i = 0; i < n; i++, rxep++, txep++) {
+ rxep[0].mbuf = txep[0].mbuf;
+
+ /* Initialize rxdp descs */
+ mb0 = txep[0].mbuf;
+
+ paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+ dma_addr0 = vdupq_n_u64(paddr);
+ /* flush desc with pa dma_addr */
+ vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+ }
+ } else {
+ for (i = 0; i < n; i++) {
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ if (m != NULL) {
+ rxep[i].mbuf = m;
+
+ /* Initialize rxdp descs */
+ paddr = m->buf_iova + RTE_PKTMBUF_HEADROOM;
+ dma_addr0 = vdupq_n_u64(paddr);
+ /* flush desc with pa dma_addr */
+ vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+ nb_rearm++;
+ }
+ }
+ n = nb_rearm;
+ }
+
+ /* update counters for Tx */
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+ } else {
+mempool_bulk:
+ /* if TX did not free bufs into Rx sw-ring,
+ * get new bufs from mempool
+ */
+ n = RTE_I40E_RXQ_REARM_THRESH;
+ if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) {
+ if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
+ for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+ rxep[i].mbuf = &rxq->fake_mbuf;
+ vst1q_u64((uint64_t *)&rxdp[i].read, zero);
+ }
+ }
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n;
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+ for (i = 0; i < n; i += 2, rxep += 2) {
+ mb0 = rxep[0].mbuf;
+ mb1 = rxep[1].mbuf;
+
+ paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
+ dma_addr0 = vdupq_n_u64(paddr);
+ /* flush desc with pa dma_addr */
+ vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);
+
+ paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
+ dma_addr1 = vdupq_n_u64(paddr);
+ /* flush desc with pa dma_addr */
+ vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
+ }
+ }
+
+ /* Update the descriptor initializer index */
+ rxq->rxrearm_start += n;
+ rx_id = rxq->rxrearm_start - 1;
+
+ if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+ rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+ if (!rxq->rxrearm_start)
+ rx_id = rxq->nb_rx_desc - 1;
+ else
+ rx_id = rxq->rxrearm_start - 1;
+ }
+
+ rxq->rxrearm_nb -= n;
+
+ rte_io_wmb();
+ /* Update the tail pointer on the NIC */
+ I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
+}
+
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
/* NEON version of FDIR mark extraction for 4 32B descriptors at a time */
static inline uint32x4_t
@@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
/* See if we need to rearm the RX queue - gives the prefetch a bit
* of time to act
*/
- if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
- i40e_rxq_rearm(rxq);
+ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+ if (rxq->direct_rxrearm_enable)
+ i40e_rxq_direct_rearm(rxq);
+ else
+ i40e_rxq_rearm(rxq);
+ }
/* Before we start moving massive data around, check to see if
* there is actually a packet available
@@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)
I40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}
+static inline void
+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)
+{
+ struct rte_eth_dev *dev;
+ struct i40e_tx_queue *txq;
+ volatile union i40e_rx_desc *rxdp;
+ struct i40e_tx_entry *txep;
+ struct i40e_rx_entry *rxep;
+ uint16_t tx_port_id, tx_queue_id;
+ uint16_t rx_id;
+ struct rte_mbuf *mb0, *mb1, *m;
+ __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+ RTE_PKTMBUF_HEADROOM);
+ __m128i dma_addr0, dma_addr1;
+ __m128i vaddr0, vaddr1;
+ uint16_t i, n;
+ uint16_t nb_rearm = 0;
+
+ rxdp = rxq->rx_ring + rxq->rxrearm_start;
+ rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+ tx_port_id = rxq->direct_rxrearm_port;
+ tx_queue_id = rxq->direct_rxrearm_queue;
+ dev = &rte_eth_devices[tx_port_id];
+ txq = dev->data->tx_queues[tx_queue_id];
+
+	/* check that the Rx queue can take in the whole
+	 * batch of mbufs freed by the Tx queue
+	 */
+ if (rxq->rxrearm_nb > txq->tx_rs_thresh) {
+ /* check DD bits on threshold descriptor */
+ if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+ rte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=
+ rte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {
+ goto mempool_bulk;
+ }
+
+ n = txq->tx_rs_thresh;
+
+ /* first buffer to free from S/W ring is at index
+ * tx_next_dd - (tx_rs_thresh-1)
+ */
+ txep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];
+
+ if (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {
+ /* directly put mbufs from Tx to Rx,
+ * and initialize the mbufs in vector
+ */
+ for (i = 0; i < n; i++, rxep++, txep++) {
+ rxep[0].mbuf = txep[0].mbuf;
+
+ /* Initialize rxdp descs */
+ mb0 = txep[0].mbuf;
+
+ /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+ offsetof(struct rte_mbuf, buf_addr) + 8);
+ vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+
+ /* convert pa to dma_addr hdr/data */
+ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+
+ /* add headroom to pa values */
+ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+
+ /* flush desc with pa dma_addr */
+ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+ }
+ } else {
+ for (i = 0; i < n; i++) {
+ m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+ if (m != NULL) {
+ rxep[i].mbuf = m;
+
+ /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+ offsetof(struct rte_mbuf, buf_addr) + 8);
+ vaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr);
+
+ /* convert pa to dma_addr hdr/data */
+ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+
+ /* add headroom to pa values */
+ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+
+ /* flush desc with pa dma_addr */
+ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+ nb_rearm++;
+ }
+ }
+ n = nb_rearm;
+ }
+
+ /* update counters for Tx */
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+ } else {
+mempool_bulk:
+ /* if TX did not free bufs into Rx sw-ring,
+ * get new bufs from mempool
+ */
+ n = RTE_I40E_RXQ_REARM_THRESH;
+ /* Pull 'n' more MBUFs into the software ring */
+ if (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) {
+ if (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {
+ dma_addr0 = _mm_setzero_si128();
+ for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+ rxep[i].mbuf = &rxq->fake_mbuf;
+ _mm_store_si128((__m128i *)&rxdp[i].read,
+ dma_addr0);
+ }
+ }
+ rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+ RTE_I40E_RXQ_REARM_THRESH;
+ return;
+ }
+
+ /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+ for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+ mb0 = rxep[0].mbuf;
+ mb1 = rxep[1].mbuf;
+
+ /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+ offsetof(struct rte_mbuf, buf_addr) + 8);
+ vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+ vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+ /* convert pa to dma_addr hdr/data */
+ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+ dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+ /* add headroom to pa values */
+ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+ dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+ /* flush desc with pa dma_addr */
+ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+ }
+ }
+
+ /* Update the descriptor initializer index */
+ rxq->rxrearm_start += n;
+ rx_id = rxq->rxrearm_start - 1;
+
+ if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
+ rxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;
+ if (!rxq->rxrearm_start)
+ rx_id = rxq->nb_rx_desc - 1;
+ else
+ rx_id = rxq->rxrearm_start - 1;
+ }
+
+ rxq->rxrearm_nb -= n;
+
+ /* Update the tail pointer on the NIC */
+ I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
+}
+
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
/* SSE version of FDIR mark extraction for 4 32B descriptors at a time */
static inline __m128i
@@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
/* See if we need to rearm the RX queue - gives the prefetch a bit
* of time to act
*/
- if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
- i40e_rxq_rearm(rxq);
+ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {
+ if (rxq->direct_rxrearm_enable)
+ i40e_rxq_direct_rearm(rxq);
+ else
+ i40e_rxq_rearm(rxq);
+ }
/* Before we start moving massive data around, check to see if
* there is actually a packet available