get:
Show a patch.

patch:
Partially update a patch (only the fields supplied in the request body are changed).

put:
Update a patch (replaces the writable fields as a whole).
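
Before the raw exchange below, here is a minimal client sketch of how the get/patch operations above map onto this endpoint. It is illustrative only: it assumes the Python `requests` library and Patchwork's token authentication, and the token value is hypothetical, not taken from this response.

# Minimal sketch: read and update this patch through the Patchwork REST API.
# GET is unauthenticated for public projects; PATCH needs an API token and
# sufficient permissions. The token below is a hypothetical placeholder.
import requests

BASE = "http://patchwork.dpdk.org/api"

# Fetch the patch shown in the response below.
patch = requests.get(f"{BASE}/patches/109908/").json()
print(patch["name"], "->", patch["state"])

# Partially update a writable field ("state", as seen in this response);
# PUT would instead replace the writable fields as a whole.
resp = requests.patch(
    f"{BASE}/patches/109908/",
    headers={"Authorization": "Token 0123456789abcdef"},  # hypothetical token
    json={"state": "superseded"},
)
resp.raise_for_status()
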

GET /api/patches/109908/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 109908,
    "url": "http://patchwork.dpdk.org/api/patches/109908/?format=api",
    "web_url": "http://patchwork.dpdk.org/project/dpdk/patch/20220420081650.2043183-3-feifei.wang2@arm.com/",
    "project": {
        "id": 1,
        "url": "http://patchwork.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20220420081650.2043183-3-feifei.wang2@arm.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20220420081650.2043183-3-feifei.wang2@arm.com",
    "date": "2022-04-20T08:16:47",
    "name": "[v1,2/5] net/i40e: enable direct rearm mode",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "425a1ebcf10be295333fdb062067f9f1b00f01db",
    "submitter": {
        "id": 1771,
        "url": "http://patchwork.dpdk.org/api/people/1771/?format=api",
        "name": "Feifei Wang",
        "email": "feifei.wang2@arm.com"
    },
    "delegate": {
        "id": 3961,
        "url": "http://patchwork.dpdk.org/api/users/3961/?format=api",
        "username": "arybchenko",
        "first_name": "Andrew",
        "last_name": "Rybchenko",
        "email": "andrew.rybchenko@oktetlabs.ru"
    },
    "mbox": "http://patchwork.dpdk.org/project/dpdk/patch/20220420081650.2043183-3-feifei.wang2@arm.com/mbox/",
    "series": [
        {
            "id": 22568,
            "url": "http://patchwork.dpdk.org/api/series/22568/?format=api",
            "web_url": "http://patchwork.dpdk.org/project/dpdk/list/?series=22568",
            "date": "2022-04-20T08:16:45",
            "name": "Direct re-arming of buffers on receive side",
            "version": 1,
            "mbox": "http://patchwork.dpdk.org/series/22568/mbox/"
        }
    ],
    "comments": "http://patchwork.dpdk.org/api/patches/109908/comments/",
    "check": "success",
    "checks": "http://patchwork.dpdk.org/api/patches/109908/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id C1D2DA00BE;\n\tWed, 20 Apr 2022 10:17:10 +0200 (CEST)",
            "from [217.70.189.124] (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id 5760C427E9;\n\tWed, 20 Apr 2022 10:17:06 +0200 (CEST)",
            "from foss.arm.com (foss.arm.com [217.140.110.172])\n by mails.dpdk.org (Postfix) with ESMTP id E9DE440687\n for <dev@dpdk.org>; Wed, 20 Apr 2022 10:17:04 +0200 (CEST)",
            "from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14])\n by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 63AC61FB;\n Wed, 20 Apr 2022 01:17:04 -0700 (PDT)",
            "from net-x86-dell-8268.shanghai.arm.com\n (net-x86-dell-8268.shanghai.arm.com [10.169.210.114])\n by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 9ED413F73B;\n Wed, 20 Apr 2022 01:17:01 -0700 (PDT)"
        ],
        "From": "Feifei Wang <feifei.wang2@arm.com>",
        "To": "Beilei Xing <beilei.xing@intel.com>,\n Bruce Richardson <bruce.richardson@intel.com>,\n Konstantin Ananyev <konstantin.ananyev@intel.com>,\n Ruifeng Wang <ruifeng.wang@arm.com>",
        "Cc": "dev@dpdk.org, nd@arm.com, Feifei Wang <feifei.wang2@arm.com>,\n Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>",
        "Subject": "[PATCH v1 2/5] net/i40e: enable direct rearm mode",
        "Date": "Wed, 20 Apr 2022 16:16:47 +0800",
        "Message-Id": "<20220420081650.2043183-3-feifei.wang2@arm.com>",
        "X-Mailer": "git-send-email 2.25.1",
        "In-Reply-To": "<20220420081650.2043183-1-feifei.wang2@arm.com>",
        "References": "<20220420081650.2043183-1-feifei.wang2@arm.com>",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.29",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org"
    },
    "content": "For i40e driver, enable direct re-arm mode. This patch supports the case\nof mapping Rx/Tx queues from the same single lcore.\n\nSuggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>\nSigned-off-by: Feifei Wang <feifei.wang2@arm.com>\nReviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>\nReviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>\n---\n drivers/net/i40e/i40e_rxtx.h            |   4 +\n drivers/net/i40e/i40e_rxtx_common_avx.h | 269 ++++++++++++++++++++++++\n drivers/net/i40e/i40e_rxtx_vec_avx2.c   |  14 +-\n drivers/net/i40e/i40e_rxtx_vec_avx512.c | 249 +++++++++++++++++++++-\n drivers/net/i40e/i40e_rxtx_vec_neon.c   | 141 ++++++++++++-\n drivers/net/i40e/i40e_rxtx_vec_sse.c    | 170 ++++++++++++++-\n 6 files changed, 839 insertions(+), 8 deletions(-)",
    "diff": "diff --git a/drivers/net/i40e/i40e_rxtx.h b/drivers/net/i40e/i40e_rxtx.h\nindex 5e6eecc501..1fdf4305f4 100644\n--- a/drivers/net/i40e/i40e_rxtx.h\n+++ b/drivers/net/i40e/i40e_rxtx.h\n@@ -102,6 +102,8 @@ struct i40e_rx_queue {\n \n \tuint16_t rxrearm_nb;\t/**< number of remaining to be re-armed */\n \tuint16_t rxrearm_start;\t/**< the idx we start the re-arming from */\n+\tuint16_t direct_rxrearm_port; /** device TX port ID for direct re-arm mode */\n+\tuint16_t direct_rxrearm_queue; /** TX queue index for direct re-arm mode */\n \tuint64_t mbuf_initializer; /**< value to init mbufs */\n \n \tuint16_t port_id; /**< device port ID */\n@@ -121,6 +123,8 @@ struct i40e_rx_queue {\n \tuint16_t rx_using_sse; /**<flag indicate the usage of vPMD for rx */\n \tuint8_t dcb_tc;         /**< Traffic class of rx queue */\n \tuint64_t offloads; /**< Rx offload flags of RTE_ETH_RX_OFFLOAD_* */\n+\t/**<  0 if direct re-arm mode disabled, 1 when enabled */\n+\tbool direct_rxrearm_enable;\n \tconst struct rte_memzone *mz;\n };\n \ndiff --git a/drivers/net/i40e/i40e_rxtx_common_avx.h b/drivers/net/i40e/i40e_rxtx_common_avx.h\nindex cfc1e63173..a742723e07 100644\n--- a/drivers/net/i40e/i40e_rxtx_common_avx.h\n+++ b/drivers/net/i40e/i40e_rxtx_common_avx.h\n@@ -209,6 +209,275 @@ i40e_rxq_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)\n \t/* Update the tail pointer on the NIC */\n \tI40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);\n }\n+\n+static __rte_always_inline void\n+i40e_rxq_direct_rearm_common(struct i40e_rx_queue *rxq, __rte_unused bool avx512)\n+{\n+\tstruct rte_eth_dev *dev;\n+\tstruct i40e_tx_queue *txq;\n+\tvolatile union i40e_rx_desc *rxdp;\n+\tstruct i40e_tx_entry *txep;\n+\tstruct i40e_rx_entry *rxep;\n+\tstruct rte_mbuf *m[RTE_I40E_RXQ_REARM_THRESH];\n+\tuint16_t tx_port_id, tx_queue_id;\n+\tuint16_t rx_id;\n+\tuint16_t i, n;\n+\tuint16_t nb_rearm = 0;\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\trxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\n+\ttx_port_id = rxq->direct_rxrearm_port;\n+\ttx_queue_id = rxq->direct_rxrearm_queue;\n+\tdev = &rte_eth_devices[tx_port_id];\n+\ttxq = dev->data->tx_queues[tx_queue_id];\n+\n+\t/* check Rx queue is able to take in the whole\n+\t * batch of free mbufs from Tx queue\n+\t */\n+\tif (rxq->rxrearm_nb > txq->tx_rs_thresh) {\n+\t\t/* check DD bits on threshold descriptor */\n+\t\tif ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &\n+\t\t\t\trte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=\n+\t\t\t\trte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {\n+\t\t\tgoto mempool_bulk;\n+\t\t}\n+\n+\t\tif (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)\n+\t\t\tgoto mempool_bulk;\n+\n+\t\tn = txq->tx_rs_thresh;\n+\n+\t\t/* first buffer to free from S/W ring is at index\n+\t\t * tx_next_dd - (tx_rs_thresh-1)\n+\t\t */\n+\t\ttxep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];\n+\n+\t\tif (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {\n+\t\t\t/* directly put mbufs from Tx to Rx,\n+\t\t\t * and initialize the mbufs in vector\n+\t\t\t */\n+\t\t\tfor (i = 0; i < n; i++)\n+\t\t\t\trxep[i].mbuf = txep[i].mbuf;\n+\t\t} else {\n+\t\t\tfor (i = 0; i < n; i++) {\n+\t\t\t\tm[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\t\t/* ensure each Tx freed buffer is valid */\n+\t\t\t\tif (m[i] != NULL)\n+\t\t\t\t\tnb_rearm++;\n+\t\t\t}\n+\n+\t\t\tif (nb_rearm != n) {\n+\t\t\t\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\t\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\t\t\t\tif (txq->tx_next_dd >= 
txq->nb_tx_desc)\n+\t\t\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\n+\t\t\t\tgoto mempool_bulk;\n+\t\t\t} else {\n+\t\t\t\tfor (i = 0; i < n; i++)\n+\t\t\t\t\trxep[i].mbuf = m[i];\n+\t\t\t}\n+\t\t}\n+\n+\t\t/* update counters for Tx */\n+\t\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\t\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\t} else {\n+mempool_bulk:\n+\t\t/* if TX did not free bufs into Rx sw-ring,\n+\t\t * get new bufs from mempool\n+\t\t */\n+\t\tn = RTE_I40E_RXQ_REARM_THRESH;\n+\n+\t\t/* Pull 'n' more MBUFs into the software ring */\n+\t\tif (rte_mempool_get_bulk(rxq->mp,\n+\t\t\t\t\t(void *)rxep,\n+\t\t\t\t\tRTE_I40E_RXQ_REARM_THRESH) < 0) {\n+\t\t\tif (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=\n+\t\t\t\trxq->nb_rx_desc) {\n+\t\t\t\t__m128i dma_addr0;\n+\t\t\t\tdma_addr0 = _mm_setzero_si128();\n+\t\t\t\tfor (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {\n+\t\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\t\t_mm_store_si128((__m128i *)&rxdp[i].read,\n+\t\t\t\t\t\t\tdma_addr0);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=\n+\t\t\t\tRTE_I40E_RXQ_REARM_THRESH;\n+\t\t\treturn;\n+\t\t}\n+\t}\n+\n+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n+\tstruct rte_mbuf *mb0, *mb1;\n+\t__m128i dma_addr0, dma_addr1;\n+\t__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,\n+\t\t\tRTE_PKTMBUF_HEADROOM);\n+\t/* Initialize the mbufs in vector, process 2 mbufs in one loop */\n+\tfor (i = 0; i < n; i += 2, rxep += 2) {\n+\t\t__m128i vaddr0, vaddr1;\n+\n+\t\tmb0 = rxep[0].mbuf;\n+\t\tmb1 = rxep[1].mbuf;\n+\n+\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=\n+\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\tvaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);\n+\t\tvaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);\n+\n+\t\t/* convert pa to dma_addr hdr/data */\n+\t\tdma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);\n+\t\tdma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);\n+\n+\t\t/* add headroom to pa values */\n+\t\tdma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);\n+\t\tdma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);\n+\n+\t\t/* flush desc with pa dma_addr */\n+\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);\n+\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);\n+\t}\n+#else\n+#ifdef __AVX512VL__\n+\tif (avx512) {\n+\t\tstruct rte_mbuf *mb0, *mb1, *mb2, *mb3;\n+\t\tstruct rte_mbuf *mb4, *mb5, *mb6, *mb7;\n+\t\t__m512i dma_addr0_3, dma_addr4_7;\n+\t\t__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);\n+\t\t/* Initialize the mbufs in vector, process 8 mbufs in one loop */\n+\t\tfor (i = 0; i < n; i += 8, rxep += 8, rxdp += 8) {\n+\t\t\t__m128i vaddr0, vaddr1, vaddr2, vaddr3;\n+\t\t\t__m128i vaddr4, vaddr5, vaddr6, vaddr7;\n+\t\t\t__m256i vaddr0_1, vaddr2_3;\n+\t\t\t__m256i vaddr4_5, vaddr6_7;\n+\t\t\t__m512i vaddr0_3, vaddr4_7;\n+\n+\t\t\tmb0 = rxep[0].mbuf;\n+\t\t\tmb1 = rxep[1].mbuf;\n+\t\t\tmb2 = rxep[2].mbuf;\n+\t\t\tmb3 = rxep[3].mbuf;\n+\t\t\tmb4 = rxep[4].mbuf;\n+\t\t\tmb5 = rxep[5].mbuf;\n+\t\t\tmb6 = rxep[6].mbuf;\n+\t\t\tmb7 = rxep[7].mbuf;\n+\n+\t\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\t\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=\n+\t\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\t\tvaddr0 = _mm_loadu_si128((__m128i 
*)&mb0->buf_addr);\n+\t\t\tvaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);\n+\t\t\tvaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);\n+\t\t\tvaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);\n+\t\t\tvaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);\n+\t\t\tvaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);\n+\t\t\tvaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);\n+\t\t\tvaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);\n+\n+\t\t\t/**\n+\t\t\t * merge 0 & 1, by casting 0 to 256-bit and inserting 1\n+\t\t\t * into the high lanes. Similarly for 2 & 3, and so on.\n+\t\t\t */\n+\t\t\tvaddr0_1 =\n+\t\t\t\t_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),\n+\t\t\t\t\t\t\tvaddr1, 1);\n+\t\t\tvaddr2_3 =\n+\t\t\t\t_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),\n+\t\t\t\t\t\t\tvaddr3, 1);\n+\t\t\tvaddr4_5 =\n+\t\t\t\t_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),\n+\t\t\t\t\t\t\tvaddr5, 1);\n+\t\t\tvaddr6_7 =\n+\t\t\t\t_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),\n+\t\t\t\t\t\t\tvaddr7, 1);\n+\t\t\tvaddr0_3 =\n+\t\t\t\t_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),\n+\t\t\t\t\t\t   vaddr2_3, 1);\n+\t\t\tvaddr4_7 =\n+\t\t\t\t_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),\n+\t\t\t\t\t\t   vaddr6_7, 1);\n+\n+\t\t\t/* convert pa to dma_addr hdr/data */\n+\t\t\tdma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);\n+\t\t\tdma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);\n+\n+\t\t\t/* add headroom to pa values */\n+\t\t\tdma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);\n+\t\t\tdma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);\n+\n+\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t_mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);\n+\t\t\t_mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);\n+\t\t}\n+\t} else {\n+#endif /* __AVX512VL__*/\n+\t\tstruct rte_mbuf *mb0, *mb1, *mb2, *mb3;\n+\t\t__m256i dma_addr0_1, dma_addr2_3;\n+\t\t__m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);\n+\t\t/* Initialize the mbufs in vector, process 4 mbufs in one loop */\n+\t\tfor (i = 0; i < n; i += 4, rxep += 4, rxdp += 4) {\n+\t\t\t__m128i vaddr0, vaddr1, vaddr2, vaddr3;\n+\t\t\t__m256i vaddr0_1, vaddr2_3;\n+\n+\t\t\tmb0 = rxep[0].mbuf;\n+\t\t\tmb1 = rxep[1].mbuf;\n+\t\t\tmb2 = rxep[2].mbuf;\n+\t\t\tmb3 = rxep[3].mbuf;\n+\n+\t\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\t\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=\n+\t\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\t\tvaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);\n+\t\t\tvaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);\n+\t\t\tvaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);\n+\t\t\tvaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);\n+\n+\t\t\t/**\n+\t\t\t * merge 0 & 1, by casting 0 to 256-bit and inserting 1\n+\t\t\t * into the high lanes. 
Similarly for 2 & 3\n+\t\t\t */\n+\t\t\tvaddr0_1 = _mm256_inserti128_si256\n+\t\t\t\t(_mm256_castsi128_si256(vaddr0), vaddr1, 1);\n+\t\t\tvaddr2_3 = _mm256_inserti128_si256\n+\t\t\t\t(_mm256_castsi128_si256(vaddr2), vaddr3, 1);\n+\n+\t\t\t/* convert pa to dma_addr hdr/data */\n+\t\t\tdma_addr0_1 = _mm256_unpackhi_epi64(vaddr0_1, vaddr0_1);\n+\t\t\tdma_addr2_3 = _mm256_unpackhi_epi64(vaddr2_3, vaddr2_3);\n+\n+\t\t\t/* add headroom to pa values */\n+\t\t\tdma_addr0_1 = _mm256_add_epi64(dma_addr0_1, hdr_room);\n+\t\t\tdma_addr2_3 = _mm256_add_epi64(dma_addr2_3, hdr_room);\n+\n+\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t_mm256_store_si256((__m256i *)&rxdp->read, dma_addr0_1);\n+\t\t\t_mm256_store_si256((__m256i *)&(rxdp + 2)->read, dma_addr2_3);\n+\t\t}\n+\t}\n+\n+#endif\n+\n+\t/* Update the descriptor initializer index */\n+\trxq->rxrearm_start += n;\n+\trx_id = rxq->rxrearm_start - 1;\n+\n+\tif (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {\n+\t\trxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;\n+\t\tif (!rxq->rxrearm_start)\n+\t\t\trx_id = rxq->nb_rx_desc - 1;\n+\t\telse\n+\t\t\trx_id = rxq->rxrearm_start - 1;\n+\t}\n+\n+\trxq->rxrearm_nb -= n;\n+\n+\t/* Update the tail pointer on the NIC */\n+\tI40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);\n+}\n #endif /* __AVX2__*/\n \n #endif /*_I40E_RXTX_COMMON_AVX_H_*/\ndiff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c\nindex c73b2a321b..fcb7ba0273 100644\n--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c\n+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c\n@@ -25,6 +25,12 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)\n \treturn i40e_rxq_rearm_common(rxq, false);\n }\n \n+static __rte_always_inline void\n+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)\n+{\n+\treturn i40e_rxq_direct_rearm_common(rxq, false);\n+}\n+\n #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n /* Handles 32B descriptor FDIR ID processing:\n  * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc\n@@ -128,8 +134,12 @@ _recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n \t/* See if we need to rearm the RX queue - gives the prefetch a bit\n \t * of time to act\n \t */\n-\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)\n-\t\ti40e_rxq_rearm(rxq);\n+\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {\n+\t\tif (rxq->direct_rxrearm_enable)\n+\t\t\ti40e_rxq_direct_rearm(rxq);\n+\t\telse\n+\t\t\ti40e_rxq_rearm(rxq);\n+\t}\n \n \t/* Before we start moving massive data around, check to see if\n \t * there is actually a packet available\ndiff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c\nindex 2e8a3f0df6..d967095edc 100644\n--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c\n+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c\n@@ -21,6 +21,12 @@\n \n #define RTE_I40E_DESCS_PER_LOOP_AVX 8\n \n+enum i40e_direct_rearm_type_value {\n+\tI40E_DIRECT_REARM_TYPE_NORMAL\t\t= 0x0,\n+\tI40E_DIRECT_REARM_TYPE_FAST_FREE\t= 0x1,\n+\tI40E_DIRECT_REARM_TYPE_PRE_FREE\t\t= 0x2,\n+};\n+\n static __rte_always_inline void\n i40e_rxq_rearm(struct i40e_rx_queue *rxq)\n {\n@@ -150,6 +156,241 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)\n \tI40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);\n }\n \n+static __rte_always_inline void\n+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)\n+{\n+\tstruct rte_eth_dev *dev;\n+\tstruct i40e_tx_queue *txq;\n+\tvolatile union i40e_rx_desc *rxdp;\n+\tstruct i40e_vec_tx_entry *txep;\n+\tstruct i40e_rx_entry *rxep;\n+\tstruct rte_mbuf 
*m[RTE_I40E_RXQ_REARM_THRESH];\n+\tuint16_t tx_port_id, tx_queue_id;\n+\tuint16_t rx_id;\n+\tuint16_t i, n;\n+\tuint16_t j = 0;\n+\tuint16_t nb_rearm = 0;\n+\tenum i40e_direct_rearm_type_value type;\n+\tstruct rte_mempool_cache *cache = NULL;\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\trxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\n+\ttx_port_id = rxq->direct_rxrearm_port;\n+\ttx_queue_id = rxq->direct_rxrearm_queue;\n+\tdev = &rte_eth_devices[tx_port_id];\n+\ttxq = dev->data->tx_queues[tx_queue_id];\n+\n+\t/* check Rx queue is able to take in the whole\n+\t * batch of free mbufs from Tx queue\n+\t */\n+\tif (rxq->rxrearm_nb > txq->tx_rs_thresh) {\n+\t\t/* check DD bits on threshold descriptor */\n+\t\tif ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &\n+\t\t\trte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=\n+\t\t\trte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {\n+\t\t\tgoto mempool_bulk;\n+\t\t}\n+\n+\t\tif (txq->tx_rs_thresh != RTE_I40E_RXQ_REARM_THRESH)\n+\t\t\tgoto mempool_bulk;\n+\n+\t\tn = txq->tx_rs_thresh;\n+\n+\t\t/* first buffer to free from S/W ring is at index\n+\t\t * tx_next_dd - (tx_rs_thresh-1)\n+\t\t */\n+\t\ttxep = (void *)txq->sw_ring;\n+\t\ttxep += txq->tx_next_dd - (n - 1);\n+\n+\t\tif (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {\n+\t\t\t/* directly put mbufs from Tx to Rx */\n+\t\t\tuint32_t copied = 0;\n+\t\t\t/* n is multiple of 32 */\n+\t\t\twhile (copied < n) {\n+\t\t\t\tconst __m512i a = _mm512_load_si512(&txep[copied]);\n+\t\t\t\tconst __m512i b = _mm512_load_si512(&txep[copied + 8]);\n+\t\t\t\tconst __m512i c = _mm512_load_si512(&txep[copied + 16]);\n+\t\t\t\tconst __m512i d = _mm512_load_si512(&txep[copied + 24]);\n+\n+\t\t\t\t_mm512_storeu_si512(&rxep[copied], a);\n+\t\t\t\t_mm512_storeu_si512(&rxep[copied + 8], b);\n+\t\t\t\t_mm512_storeu_si512(&rxep[copied + 16], c);\n+\t\t\t\t_mm512_storeu_si512(&rxep[copied + 24], d);\n+\t\t\t\tcopied += 32;\n+\t\t\t}\n+\t\t\ttype = I40E_DIRECT_REARM_TYPE_FAST_FREE;\n+\t\t} else {\n+\t\t\tfor (i = 0; i < n; i++) {\n+\t\t\t\tm[i] = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\t\t/* ensure each Tx freed buffer is valid */\n+\t\t\t\tif (m[i] != NULL)\n+\t\t\t\t\tnb_rearm++;\n+\t\t\t}\n+\n+\t\t\tif (nb_rearm != n) {\n+\t\t\t\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\t\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\t\t\t\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\t\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\n+\t\t\t\tgoto mempool_bulk;\n+\t\t\t} else {\n+\t\t\t\ttype = I40E_DIRECT_REARM_TYPE_PRE_FREE;\n+\t\t\t}\n+\t\t}\n+\n+\t/* update counters for Tx */\n+\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\t} else {\n+mempool_bulk:\n+\t\tcache = rte_mempool_default_cache(rxq->mp, rte_lcore_id());\n+\n+\t\tif (unlikely(!cache))\n+\t\t\treturn i40e_rxq_rearm_common(rxq, true);\n+\n+\t\tn = RTE_I40E_RXQ_REARM_THRESH;\n+\n+\t\t/* We need to pull 'n' more MBUFs into the software ring from mempool\n+\t\t * We inline the mempool function here, so we can vectorize the copy\n+\t\t * from the cache into the shadow ring.\n+\t\t */\n+\n+\t\tif (cache->len < RTE_I40E_RXQ_REARM_THRESH) {\n+\t\t\t/* No. 
Backfill the cache first, and then fill from it */\n+\t\t\tuint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -\n+\t\t\t\t\tcache->len);\n+\n+\t\t\t/* How many do we require\n+\t\t\t * i.e. number to fill the cache + the request\n+\t\t\t */\n+\t\t\tint ret = rte_mempool_ops_dequeue_bulk(rxq->mp,\n+\t\t\t\t\t&cache->objs[cache->len], req);\n+\t\t\tif (ret == 0) {\n+\t\t\t\tcache->len += req;\n+\t\t\t} else {\n+\t\t\t\tif (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=\n+\t\t\t\t\t\trxq->nb_rx_desc) {\n+\t\t\t\t\t__m128i dma_addr0;\n+\n+\t\t\t\t\tdma_addr0 = _mm_setzero_si128();\n+\t\t\t\t\tfor (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {\n+\t\t\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\t\t\t_mm_store_si128\n+\t\t\t\t\t\t\t((__m128i *)&rxdp[i].read,\n+\t\t\t\t\t\t\t\tdma_addr0);\n+\t\t\t\t\t}\n+\t\t\t\t}\n+\t\t\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=\n+\t\t\t\t\t\tRTE_I40E_RXQ_REARM_THRESH;\n+\t\t\t\treturn;\n+\t\t\t}\n+\t\t}\n+\n+\t\ttype = I40E_DIRECT_REARM_TYPE_NORMAL;\n+\t}\n+\n+\tconst __m512i iova_offsets =  _mm512_set1_epi64\n+\t\t(offsetof(struct rte_mbuf, buf_iova));\n+\tconst __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);\n+\n+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n+\t/* to shuffle the addresses to correct slots. Values 4-7 will contain\n+\t * zeros, so use 7 for a zero-value.\n+\t */\n+\tconst __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);\n+#else\n+\tconst __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);\n+#endif\n+\n+\t__m512i mbuf_ptrs;\n+\n+\t/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking\n+\t * from mempool cache and populating both shadow and HW rings\n+\t */\n+\tfor (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {\n+\t\tswitch (type) {\n+\t\tcase I40E_DIRECT_REARM_TYPE_FAST_FREE:\n+\t\t\tmbuf_ptrs = _mm512_loadu_si512(rxep);\n+\t\t\tbreak;\n+\t\tcase I40E_DIRECT_REARM_TYPE_PRE_FREE:\n+\t\t\tmbuf_ptrs = _mm512_loadu_si512(&m[j]);\n+\t\t\t_mm512_store_si512(rxep, mbuf_ptrs);\n+\t\t\tj += 8;\n+\t\t\tbreak;\n+\t\tcase I40E_DIRECT_REARM_TYPE_NORMAL:\n+\t\t\tmbuf_ptrs = _mm512_loadu_si512\n+\t\t\t\t(&cache->objs[cache->len - 8]);\n+\t\t\t_mm512_store_si512(rxep, mbuf_ptrs);\n+\t\t\tcache->len -= 8;\n+\t\t\tbreak;\n+\t\t}\n+\n+\t\t/* gather iova of mbuf0-7 into one zmm reg */\n+\t\tconst __m512i iova_base_addrs = _mm512_i64gather_epi64\n+\t\t\t(_mm512_add_epi64(mbuf_ptrs, iova_offsets),\n+\t\t\t\t0, /* base */\n+\t\t\t\t1 /* scale */);\n+\t\tconst __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,\n+\t\t\t\theadroom);\n+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n+\t\tconst __m512i iovas0 = _mm512_castsi256_si512\n+\t\t\t(_mm512_extracti64x4_epi64(iova_addrs, 0));\n+\t\tconst __m512i iovas1 = _mm512_castsi256_si512\n+\t\t\t(_mm512_extracti64x4_epi64(iova_addrs, 1));\n+\n+\t\t/* permute leaves desc 2-3 addresses in header address slots 0-1\n+\t\t * but these are ignored by driver since header split not\n+\t\t * enabled. 
Similarly for desc 4 & 5.\n+\t\t */\n+\t\tconst __m512i desc_rd_0_1 = _mm512_permutexvar_epi64\n+\t\t\t(permute_idx, iovas0);\n+\t\tconst __m512i desc_rd_2_3 = _mm512_bsrli_epi128(desc_rd_0_1, 8);\n+\n+\t\tconst __m512i desc_rd_4_5 = _mm512_permutexvar_epi64\n+\t\t\t(permute_idx, iovas1);\n+\t\tconst __m512i desc_rd_6_7 = _mm512_bsrli_epi128(desc_rd_4_5, 8);\n+\n+\t\t_mm512_store_si512((void *)rxdp, desc_rd_0_1);\n+\t\t_mm512_store_si512((void *)(rxdp + 2), desc_rd_2_3);\n+\t\t_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_5);\n+\t\t_mm512_store_si512((void *)(rxdp + 6), desc_rd_6_7);\n+#else\n+\t\t/* permute leaves desc 4-7 addresses in header address slots 0-3\n+\t\t * but these are ignored by driver since header split not\n+\t\t * enabled.\n+\t\t */\n+\t\tconst __m512i desc_rd_0_3 = _mm512_permutexvar_epi64\n+\t\t\t(permute_idx, iova_addrs);\n+\t\tconst __m512i desc_rd_4_7 = _mm512_bsrli_epi128(desc_rd_0_3, 8);\n+\n+\t\t_mm512_store_si512((void *)rxdp, desc_rd_0_3);\n+\t\t_mm512_store_si512((void *)(rxdp + 4), desc_rd_4_7);\n+#endif\n+\t\trxdp += 8, rxep += 8;\n+\t}\n+\n+\t/* Update the descriptor initializer index */\n+\trxq->rxrearm_start += n;\n+\trx_id = rxq->rxrearm_start - 1;\n+\n+\tif (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {\n+\t\trxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;\n+\t\tif (!rxq->rxrearm_start)\n+\t\t\trx_id = rxq->nb_rx_desc - 1;\n+\t\telse\n+\t\t\trx_id = rxq->rxrearm_start - 1;\n+\t}\n+\n+\trxq->rxrearm_nb -= n;\n+\n+\t/* Update the tail pointer on the NIC */\n+\tI40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);\n+}\n+\n #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n /* Handles 32B descriptor FDIR ID processing:\n  * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc\n@@ -252,8 +493,12 @@ _recv_raw_pkts_vec_avx512(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n \t/* See if we need to rearm the RX queue - gives the prefetch a bit\n \t * of time to act\n \t */\n-\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)\n-\t\ti40e_rxq_rearm(rxq);\n+\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {\n+\t\tif (rxq->direct_rxrearm_enable)\n+\t\t\ti40e_rxq_direct_rearm(rxq);\n+\t\telse\n+\t\t\ti40e_rxq_rearm(rxq);\n+\t}\n \n \t/* Before we start moving massive data around, check to see if\n \t * there is actually a packet available\ndiff --git a/drivers/net/i40e/i40e_rxtx_vec_neon.c b/drivers/net/i40e/i40e_rxtx_vec_neon.c\nindex fa9e6582c5..dc78e3c90b 100644\n--- a/drivers/net/i40e/i40e_rxtx_vec_neon.c\n+++ b/drivers/net/i40e/i40e_rxtx_vec_neon.c\n@@ -77,6 +77,139 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)\n \tI40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);\n }\n \n+static inline void\n+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)\n+{\n+\tstruct rte_eth_dev *dev;\n+\tstruct i40e_tx_queue *txq;\n+\tvolatile union i40e_rx_desc *rxdp;\n+\tstruct i40e_tx_entry *txep;\n+\tstruct i40e_rx_entry *rxep;\n+\tuint16_t tx_port_id, tx_queue_id;\n+\tuint16_t rx_id;\n+\tstruct rte_mbuf *mb0, *mb1, *m;\n+\tuint64x2_t dma_addr0, dma_addr1;\n+\tuint64x2_t zero = vdupq_n_u64(0);\n+\tuint64_t paddr;\n+\tuint16_t i, n;\n+\tuint16_t nb_rearm = 0;\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\trxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\n+\ttx_port_id = rxq->direct_rxrearm_port;\n+\ttx_queue_id = rxq->direct_rxrearm_queue;\n+\tdev = &rte_eth_devices[tx_port_id];\n+\ttxq = dev->data->tx_queues[tx_queue_id];\n+\n+\t/* check Rx queue is able to take in the whole\n+\t * batch of free mbufs from Tx queue\n+\t */\n+\tif (rxq->rxrearm_nb > 
txq->tx_rs_thresh) {\n+\t\t/* check DD bits on threshold descriptor */\n+\t\tif ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &\n+\t\t\t\trte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=\n+\t\t\t\trte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {\n+\t\t\tgoto mempool_bulk;\n+\t\t}\n+\n+\t\tn = txq->tx_rs_thresh;\n+\n+\t\t/* first buffer to free from S/W ring is at index\n+\t\t * tx_next_dd - (tx_rs_thresh-1)\n+\t\t */\n+\t\ttxep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];\n+\n+\t\tif (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {\n+\t\t\t/* directly put mbufs from Tx to Rx,\n+\t\t\t * and initialize the mbufs in vector\n+\t\t\t */\n+\t\t\tfor (i = 0; i < n; i++, rxep++, txep++) {\n+\t\t\t\trxep[0].mbuf = txep[0].mbuf;\n+\n+\t\t\t\t/* Initialize rxdp descs */\n+\t\t\t\tmb0 = txep[0].mbuf;\n+\n+\t\t\t\tpaddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;\n+\t\t\t\tdma_addr0 = vdupq_n_u64(paddr);\n+\t\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t\tvst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);\n+\t\t\t}\n+\t\t} else {\n+\t\t\tfor (i = 0; i < n; i++) {\n+\t\t\t\tm = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\t\tif (m != NULL) {\n+\t\t\t\t\trxep[i].mbuf = m;\n+\n+\t\t\t\t\t/* Initialize rxdp descs */\n+\t\t\t\t\tpaddr = m->buf_iova + RTE_PKTMBUF_HEADROOM;\n+\t\t\t\t\tdma_addr0 = vdupq_n_u64(paddr);\n+\t\t\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t\t\tvst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);\n+\t\t\t\t\tnb_rearm++;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tn = nb_rearm;\n+\t\t}\n+\n+\t\t/* update counters for Tx */\n+\t\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\t\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\t} else {\n+mempool_bulk:\n+\t\t/* if TX did not free bufs into Rx sw-ring,\n+\t\t * get new bufs from mempool\n+\t\t */\n+\t\tn = RTE_I40E_RXQ_REARM_THRESH;\n+\t\tif (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0)) {\n+\t\t\tif (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {\n+\t\t\t\tfor (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {\n+\t\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\t\tvst1q_u64((uint64_t *)&rxdp[i].read, zero);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += n;\n+\t\t\treturn;\n+\t\t}\n+\n+\t\t/* Initialize the mbufs in vector, process 2 mbufs in one loop */\n+\t\tfor (i = 0; i < n; i += 2, rxep += 2) {\n+\t\t\tmb0 = rxep[0].mbuf;\n+\t\t\tmb1 = rxep[1].mbuf;\n+\n+\t\t\tpaddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;\n+\t\t\tdma_addr0 = vdupq_n_u64(paddr);\n+\t\t\t/* flush desc with pa dma_addr */\n+\t\t\tvst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);\n+\n+\t\t\tpaddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;\n+\t\t\tdma_addr1 = vdupq_n_u64(paddr);\n+\t\t\t/* flush desc with pa dma_addr */\n+\t\t\tvst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);\n+\t\t}\n+\t}\n+\n+\t/* Update the descriptor initializer index */\n+\trxq->rxrearm_start += n;\n+\trx_id = rxq->rxrearm_start - 1;\n+\n+\tif (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {\n+\t\trxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;\n+\t\tif (!rxq->rxrearm_start)\n+\t\t\trx_id = rxq->nb_rx_desc - 1;\n+\t\telse\n+\t\t\trx_id = rxq->rxrearm_start - 1;\n+\t}\n+\n+\trxq->rxrearm_nb -= n;\n+\n+\trte_io_wmb();\n+\t/* Update the tail pointer on the NIC */\n+\tI40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);\n+}\n+\n #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n /* NEON version of FDIR mark extraction 
for 4 32B descriptors at a time */\n static inline uint32x4_t\n@@ -381,8 +514,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,\n \t/* See if we need to rearm the RX queue - gives the prefetch a bit\n \t * of time to act\n \t */\n-\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)\n-\t\ti40e_rxq_rearm(rxq);\n+\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {\n+\t\tif (rxq->direct_rxrearm_enable)\n+\t\t\ti40e_rxq_direct_rearm(rxq);\n+\t\telse\n+\t\t\ti40e_rxq_rearm(rxq);\n+\t}\n \n \t/* Before we start moving massive data around, check to see if\n \t * there is actually a packet available\ndiff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c b/drivers/net/i40e/i40e_rxtx_vec_sse.c\nindex 3782e8052f..b2f1ab2c8d 100644\n--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c\n+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c\n@@ -89,6 +89,168 @@ i40e_rxq_rearm(struct i40e_rx_queue *rxq)\n \tI40E_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);\n }\n \n+static inline void\n+i40e_rxq_direct_rearm(struct i40e_rx_queue *rxq)\n+{\n+\tstruct rte_eth_dev *dev;\n+\tstruct i40e_tx_queue *txq;\n+\tvolatile union i40e_rx_desc *rxdp;\n+\tstruct i40e_tx_entry *txep;\n+\tstruct i40e_rx_entry *rxep;\n+\tuint16_t tx_port_id, tx_queue_id;\n+\tuint16_t rx_id;\n+\tstruct rte_mbuf *mb0, *mb1, *m;\n+\t__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,\n+\t\t\tRTE_PKTMBUF_HEADROOM);\n+\t__m128i dma_addr0, dma_addr1;\n+\t__m128i vaddr0, vaddr1;\n+\tuint16_t i, n;\n+\tuint16_t nb_rearm = 0;\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\trxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\n+\ttx_port_id = rxq->direct_rxrearm_port;\n+\ttx_queue_id = rxq->direct_rxrearm_queue;\n+\tdev = &rte_eth_devices[tx_port_id];\n+\ttxq = dev->data->tx_queues[tx_queue_id];\n+\n+\t/* check Rx queue is able to take in the whole\n+\t * batch of free mbufs from Tx queue\n+\t */\n+\tif (rxq->rxrearm_nb > txq->tx_rs_thresh) {\n+\t\t/* check DD bits on threshold descriptor */\n+\t\tif ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &\n+\t\t\t\trte_cpu_to_le_64(I40E_TXD_QW1_DTYPE_MASK)) !=\n+\t\t\t\trte_cpu_to_le_64(I40E_TX_DESC_DTYPE_DESC_DONE)) {\n+\t\t\tgoto mempool_bulk;\n+\t\t}\n+\n+\t\tn = txq->tx_rs_thresh;\n+\n+\t\t/* first buffer to free from S/W ring is at index\n+\t\t * tx_next_dd - (tx_rs_thresh-1)\n+\t\t */\n+\t\ttxep = &txq->sw_ring[txq->tx_next_dd - (n - 1)];\n+\n+\t\tif (txq->offloads & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE) {\n+\t\t\t/* directly put mbufs from Tx to Rx,\n+\t\t\t * and initialize the mbufs in vector\n+\t\t\t */\n+\t\t\tfor (i = 0; i < n; i++, rxep++, txep++) {\n+\t\t\t\trxep[0].mbuf = txep[0].mbuf;\n+\n+\t\t\t\t/* Initialize rxdp descs */\n+\t\t\t\tmb0 = txep[0].mbuf;\n+\n+\t\t\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\t\t\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=\n+\t\t\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\t\t\tvaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);\n+\n+\t\t\t\t/* convert pa to dma_addr hdr/data */\n+\t\t\t\tdma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);\n+\n+\t\t\t\t/* add headroom to pa values */\n+\t\t\t\tdma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);\n+\n+\t\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);\n+\t\t\t}\n+\t\t} else {\n+\t\t\tfor (i = 0; i < n; i++) {\n+\t\t\t\tm = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\t\tif (m != NULL) {\n+\t\t\t\t\trxep[i].mbuf = m;\n+\n+\t\t\t\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\t\t\t\tRTE_BUILD_BUG_ON(offsetof(struct 
rte_mbuf, buf_iova) !=\n+\t\t\t\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\t\t\t\tvaddr0 = _mm_loadu_si128((__m128i *)&m->buf_addr);\n+\n+\t\t\t\t\t/* convert pa to dma_addr hdr/data */\n+\t\t\t\t\tdma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);\n+\n+\t\t\t\t\t/* add headroom to pa values */\n+\t\t\t\t\tdma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);\n+\n+\t\t\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);\n+\t\t\t\t\tnb_rearm++;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\tn = nb_rearm;\n+\t\t}\n+\n+\t\t/* update counters for Tx */\n+\t\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\t\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\t} else {\n+mempool_bulk:\n+\t\t/* if TX did not free bufs into Rx sw-ring,\n+\t\t * get new bufs from mempool\n+\t\t */\n+\t\tn = RTE_I40E_RXQ_REARM_THRESH;\n+\t\t/* Pull 'n' more MBUFs into the software ring */\n+\t\tif (rte_mempool_get_bulk(rxq->mp, (void *)rxep, n) < 0) {\n+\t\t\tif (rxq->rxrearm_nb + n >= rxq->nb_rx_desc) {\n+\t\t\t\tdma_addr0 = _mm_setzero_si128();\n+\t\t\t\tfor (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {\n+\t\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\t\t_mm_store_si128((__m128i *)&rxdp[i].read,\n+\t\t\t\t\t\t\tdma_addr0);\n+\t\t\t\t}\n+\t\t\t}\n+\t\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=\n+\t\t\t\tRTE_I40E_RXQ_REARM_THRESH;\n+\t\t\treturn;\n+\t\t}\n+\n+\t\t/* Initialize the mbufs in vector, process 2 mbufs in one loop */\n+\t\tfor (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {\n+\t\t\tmb0 = rxep[0].mbuf;\n+\t\t\tmb1 = rxep[1].mbuf;\n+\n+\t\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\t\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=\n+\t\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\t\tvaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);\n+\t\t\tvaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);\n+\n+\t\t\t/* convert pa to dma_addr hdr/data */\n+\t\t\tdma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);\n+\t\t\tdma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);\n+\n+\t\t\t/* add headroom to pa values */\n+\t\t\tdma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);\n+\t\t\tdma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);\n+\n+\t\t\t/* flush desc with pa dma_addr */\n+\t\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);\n+\t\t\t_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);\n+\t\t}\n+\t}\n+\n+\t/* Update the descriptor initializer index */\n+\trxq->rxrearm_start += n;\n+\trx_id = rxq->rxrearm_start - 1;\n+\n+\tif (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {\n+\t\trxq->rxrearm_start = rxq->rxrearm_start - rxq->nb_rx_desc;\n+\t\tif (!rxq->rxrearm_start)\n+\t\t\trx_id = rxq->nb_rx_desc - 1;\n+\t\telse\n+\t\t\trx_id = rxq->rxrearm_start - 1;\n+\t}\n+\n+\trxq->rxrearm_nb -= n;\n+\n+\t/* Update the tail pointer on the NIC */\n+\tI40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);\n+}\n+\n #ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC\n /* SSE version of FDIR mark extraction for 4 32B descriptors at a time */\n static inline __m128i\n@@ -394,8 +556,12 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n \t/* See if we need to rearm the RX queue - gives the prefetch a bit\n \t * of time to act\n \t */\n-\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)\n-\t\ti40e_rxq_rearm(rxq);\n+\tif (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) {\n+\t\tif 
(rxq->direct_rxrearm_enable)\n+\t\t\ti40e_rxq_direct_rearm(rxq);\n+\t\telse\n+\t\t\ti40e_rxq_rearm(rxq);\n+\t}\n \n \t/* Before we start moving massive data around, check to see if\n \t * there is actually a packet available\n",
    "prefixes": [
        "v1",
        "2/5"
    ]
}