get:
Show a patch.

patch:
Partially update a patch (only the supplied fields are changed).

put:
Update a patch.
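
For illustration, the record below can also be retrieved programmatically with any HTTP client. The following is a minimal sketch, assuming Python's requests library (any client works the same way); it fetches the JSON shown in the example response and then downloads the raw patch through the "mbox" field that response contains:

    import requests

    BASE = "http://patchwork.dpdk.org/api"
    PATCH_ID = 139208  # the patch shown in the example response below

    # GET /api/patches/{id}/ returns the JSON document reproduced below
    resp = requests.get(f"{BASE}/patches/{PATCH_ID}/")
    resp.raise_for_status()
    patch = resp.json()

    print(patch["name"], patch["state"])

    # The "mbox" field points at the raw patch, suitable for `git am`
    mbox = requests.get(patch["mbox"])
    mbox.raise_for_status()
    with open(f"{PATCH_ID}.mbox", "wb") as f:
        f.write(mbox.content)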

GET /api/patches/139208/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 139208,
    "url": "http://patchwork.dpdk.org/api/patches/139208/?format=api",
    "web_url": "http://patchwork.dpdk.org/project/dpdk/patch/20240409063135.21780-3-jiawenwu@trustnetic.com/",
    "project": {
        "id": 1,
        "url": "http://patchwork.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20240409063135.21780-3-jiawenwu@trustnetic.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20240409063135.21780-3-jiawenwu@trustnetic.com",
    "date": "2024-04-09T06:31:35",
    "name": "[v2,2/2] net/ngbe: add vectorized functions for Rx/Tx",
    "commit_ref": null,
    "pull_url": null,
    "state": "changes-requested",
    "archived": false,
    "hash": "685ae77e99292536266f7745d3771ae1d2a66358",
    "submitter": {
        "id": 1932,
        "url": "http://patchwork.dpdk.org/api/people/1932/?format=api",
        "name": "Jiawen Wu",
        "email": "jiawenwu@trustnetic.com"
    },
    "delegate": {
        "id": 319,
        "url": "http://patchwork.dpdk.org/api/users/319/?format=api",
        "username": "fyigit",
        "first_name": "Ferruh",
        "last_name": "Yigit",
        "email": "ferruh.yigit@amd.com"
    },
    "mbox": "http://patchwork.dpdk.org/project/dpdk/patch/20240409063135.21780-3-jiawenwu@trustnetic.com/mbox/",
    "series": [
        {
            "id": 31710,
            "url": "http://patchwork.dpdk.org/api/series/31710/?format=api",
            "web_url": "http://patchwork.dpdk.org/project/dpdk/list/?series=31710",
            "date": "2024-04-09T06:31:33",
            "name": "Wangxun support vector Rx/Tx",
            "version": 2,
            "mbox": "http://patchwork.dpdk.org/series/31710/mbox/"
        }
    ],
    "comments": "http://patchwork.dpdk.org/api/patches/139208/comments/",
    "check": "success",
    "checks": "http://patchwork.dpdk.org/api/patches/139208/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 038E743DDE;\n\tTue,  9 Apr 2024 08:31:53 +0200 (CEST)",
            "from mails.dpdk.org (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id E11E8402D9;\n\tTue,  9 Apr 2024 08:31:52 +0200 (CEST)",
            "from smtpbguseast3.qq.com (smtpbguseast3.qq.com [54.243.244.52])\n by mails.dpdk.org (Postfix) with ESMTP id 512DA402D9\n for <dev@dpdk.org>; Tue,  9 Apr 2024 08:31:49 +0200 (CEST)",
            "from lap-jiawenwu.trustnetic.com ( [183.128.132.155])\n by bizesmtp.qq.com (ESMTP) with\n id ; Tue, 09 Apr 2024 14:31:44 +0800 (CST)"
        ],
        "X-QQ-mid": "bizesmtp84t1712644304tfuj7vok",
        "X-QQ-Originating-IP": "ketFT+DAKfP1YdSMwDG7WrCskiAiR01OYIvJhv7LqLE=",
        "X-QQ-SSF": "01400000000000L0Z000000A0000000",
        "X-QQ-FEAT": "1KCaicv049ewg1JAdH/C77QnSfesu1OJEzgOi73LlCQpfpyBlcxAJ1jFx6brx\n FkJIZaZ+oxlWA41LqpSR8utS5mUTonTbDqJeATd5I9AsYIPJx1GBJ5O1nK1fgu3dk1cjVgR\n tiCz5H3jQlK7l79URavQbxxnJ5iULsmgVGaLRIPygnULvG+Bx0R4XqN3mDyYgVuXT38KdV4\n gKDawC3zDvbLQ5ZTRyDG0eZKYywLSHTAsb7I/53T4xY6HpJH4sZUq9Y7NjyCJ9istt9Z0cX\n m2mk3A+gWpI0mC4JZxPaUOuX+XIOf7QHKzB7EJoM1HLsaWq3jBHtUJPWNTJXHTOdO9c2kLj\n zAo/Dc6Croto4te1jHwI/c2vSY8Gbv3Ww9LzcCGqdpUgF0J9pao95LAVOtCfAKp3TMQBwVM\n 8bt02ok8l3sDczArsf+8Fw==",
        "X-QQ-GoodBg": "2",
        "X-BIZMAIL-ID": "9152158185628772707",
        "From": "Jiawen Wu <jiawenwu@trustnetic.com>",
        "To": "dev@dpdk.org",
        "Cc": "Jiawen Wu <jiawenwu@trustnetic.com>",
        "Subject": "[PATCH v2 2/2] net/ngbe: add vectorized functions for Rx/Tx",
        "Date": "Tue,  9 Apr 2024 14:31:35 +0800",
        "Message-Id": "<20240409063135.21780-3-jiawenwu@trustnetic.com>",
        "X-Mailer": "git-send-email 2.21.0.windows.1",
        "In-Reply-To": "<20240409063135.21780-1-jiawenwu@trustnetic.com>",
        "References": "<20240409063135.21780-1-jiawenwu@trustnetic.com>",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "X-QQ-SENDSIZE": "520",
        "Feedback-ID": "bizesmtp:trustnetic.com:qybglogicsvrgz:qybglogicsvrgz8a-1",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.29",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org"
    },
    "content": "To optimize Rx/Tx burst process, add SSE/NEON vector instructions on\nx86/arm architecture.\n\nThe performance test results on Phytium D2000(ARM64) show that,\nthroughput for 128-byte packets increased from 76.797% to 94.375%.\n\nSigned-off-by: Jiawen Wu <jiawenwu@trustnetic.com>\n---\n drivers/net/ngbe/meson.build            |   6 +\n drivers/net/ngbe/ngbe_ethdev.c          |   5 +\n drivers/net/ngbe/ngbe_ethdev.h          |   1 +\n drivers/net/ngbe/ngbe_rxtx.c            | 162 +++++-\n drivers/net/ngbe/ngbe_rxtx.h            |  32 +-\n drivers/net/ngbe/ngbe_rxtx_vec_common.h | 296 ++++++++++\n drivers/net/ngbe/ngbe_rxtx_vec_neon.c   | 602 +++++++++++++++++++++\n drivers/net/ngbe/ngbe_rxtx_vec_sse.c    | 688 ++++++++++++++++++++++++\n 8 files changed, 1788 insertions(+), 4 deletions(-)\n create mode 100644 drivers/net/ngbe/ngbe_rxtx_vec_common.h\n create mode 100644 drivers/net/ngbe/ngbe_rxtx_vec_neon.c\n create mode 100644 drivers/net/ngbe/ngbe_rxtx_vec_sse.c",
    "diff": "diff --git a/drivers/net/ngbe/meson.build b/drivers/net/ngbe/meson.build\nindex 8b5195aab3..5d395ee17f 100644\n--- a/drivers/net/ngbe/meson.build\n+++ b/drivers/net/ngbe/meson.build\n@@ -19,4 +19,10 @@ sources = files(\n \n deps += ['hash']\n \n+if arch_subdir == 'x86'\n+\tsources += files('ngbe_rxtx_vec_sse.c')\n+elif arch_subdir == 'arm'\n+\tsources += files('ngbe_rxtx_vec_neon.c')\n+endif\n+\n includes += include_directories('base')\ndiff --git a/drivers/net/ngbe/ngbe_ethdev.c b/drivers/net/ngbe/ngbe_ethdev.c\nindex 4cd07a0030..c2e186c3d6 100644\n--- a/drivers/net/ngbe/ngbe_ethdev.c\n+++ b/drivers/net/ngbe/ngbe_ethdev.c\n@@ -932,6 +932,7 @@ ngbe_dev_configure(struct rte_eth_dev *dev)\n \t * allocation Rx preconditions we will reset it.\n \t */\n \tadapter->rx_bulk_alloc_allowed = true;\n+\tadapter->rx_vec_allowed = true;\n \n \treturn 0;\n }\n@@ -1867,6 +1868,10 @@ const uint32_t *\n ngbe_dev_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)\n {\n \tif (dev->rx_pkt_burst == ngbe_recv_pkts ||\n+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)\n+\t    dev->rx_pkt_burst == ngbe_recv_pkts_vec ||\n+\t    dev->rx_pkt_burst == ngbe_recv_scattered_pkts_vec ||\n+#endif\n \t    dev->rx_pkt_burst == ngbe_recv_pkts_sc_single_alloc ||\n \t    dev->rx_pkt_burst == ngbe_recv_pkts_sc_bulk_alloc ||\n \t    dev->rx_pkt_burst == ngbe_recv_pkts_bulk_alloc)\ndiff --git a/drivers/net/ngbe/ngbe_ethdev.h b/drivers/net/ngbe/ngbe_ethdev.h\nindex 70ed1920dd..c748bfbe4d 100644\n--- a/drivers/net/ngbe/ngbe_ethdev.h\n+++ b/drivers/net/ngbe/ngbe_ethdev.h\n@@ -130,6 +130,7 @@ struct ngbe_adapter {\n \tstruct ngbe_vf_info        *vfdata;\n \tstruct ngbe_uta_info       uta_info;\n \tbool                       rx_bulk_alloc_allowed;\n+\tbool                       rx_vec_allowed;\n \tstruct rte_timecounter     systime_tc;\n \tstruct rte_timecounter     rx_tstamp_tc;\n \tstruct rte_timecounter     tx_tstamp_tc;\ndiff --git a/drivers/net/ngbe/ngbe_rxtx.c b/drivers/net/ngbe/ngbe_rxtx.c\nindex 8a873b858e..e92f241e46 100644\n--- a/drivers/net/ngbe/ngbe_rxtx.c\n+++ b/drivers/net/ngbe/ngbe_rxtx.c\n@@ -10,6 +10,7 @@\n #include <ethdev_driver.h>\n #include <rte_malloc.h>\n #include <rte_net.h>\n+#include <rte_vect.h>\n \n #include \"ngbe_logs.h\"\n #include \"base/ngbe.h\"\n@@ -267,6 +268,27 @@ ngbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,\n \treturn nb_tx;\n }\n \n+static uint16_t\n+ngbe_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,\n+\t\t   uint16_t nb_pkts)\n+{\n+\tstruct ngbe_tx_queue *txq = (struct ngbe_tx_queue *)tx_queue;\n+\tuint16_t nb_tx = 0;\n+\n+\twhile (nb_pkts) {\n+\t\tuint16_t ret, num;\n+\n+\t\tnum = (uint16_t)RTE_MIN(nb_pkts, txq->tx_free_thresh);\n+\t\tret = ngbe_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);\n+\t\tnb_tx += ret;\n+\t\tnb_pkts -= ret;\n+\t\tif (ret < num)\n+\t\t\tbreak;\n+\t}\n+\n+\treturn nb_tx;\n+}\n+\n static inline void\n ngbe_set_xmit_ctx(struct ngbe_tx_queue *txq,\n \t\tvolatile struct ngbe_tx_ctx_desc *ctx_txd,\n@@ -1858,8 +1880,16 @@ ngbe_set_tx_function(struct rte_eth_dev *dev, struct ngbe_tx_queue *txq)\n \tif (txq->offloads == 0 &&\n \t\t\ttxq->tx_free_thresh >= RTE_PMD_NGBE_TX_MAX_BURST) {\n \t\tPMD_INIT_LOG(DEBUG, \"Using simple tx code path\");\n-\t\tdev->tx_pkt_burst = ngbe_xmit_pkts_simple;\n \t\tdev->tx_pkt_prepare = NULL;\n+\t\tif (txq->tx_free_thresh <= RTE_NGBE_TX_MAX_FREE_BUF_SZ &&\n+\t\t\t\trte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128 &&\n+\t\t\t\t(rte_eal_process_type() != RTE_PROC_PRIMARY 
||\n+\t\t\t\t\tngbe_txq_vec_setup(txq) == 0)) {\n+\t\t\tPMD_INIT_LOG(DEBUG, \"Vector tx enabled.\");\n+\t\t\tdev->tx_pkt_burst = ngbe_xmit_pkts_vec;\n+\t\t} else {\n+\t\t\tdev->tx_pkt_burst = ngbe_xmit_pkts_simple;\n+\t\t}\n \t} else {\n \t\tPMD_INIT_LOG(DEBUG, \"Using full-featured tx code path\");\n \t\tPMD_INIT_LOG(DEBUG,\n@@ -1880,6 +1910,11 @@ static const struct {\n } ngbe_tx_burst_infos[] = {\n \t{ ngbe_xmit_pkts_simple,   \"Scalar Simple\"},\n \t{ ngbe_xmit_pkts,          \"Scalar\"},\n+#ifdef RTE_ARCH_X86\n+\t{ ngbe_xmit_pkts_vec,      \"Vector SSE\" },\n+#elif defined(RTE_ARCH_ARM)\n+\t{ ngbe_xmit_pkts_vec,      \"Vector Neon\" },\n+#endif\n };\n \n int\n@@ -2066,6 +2101,12 @@ ngbe_rx_queue_release_mbufs(struct ngbe_rx_queue *rxq)\n {\n \tunsigned int i;\n \n+\t/* SSE Vector driver has a different way of releasing mbufs. */\n+\tif (rxq->rx_using_sse) {\n+\t\tngbe_rx_queue_release_mbufs_vec(rxq);\n+\t\treturn;\n+\t}\n+\n \tif (rxq->sw_ring != NULL) {\n \t\tfor (i = 0; i < rxq->nb_rx_desc; i++) {\n \t\t\tif (rxq->sw_ring[i].mbuf != NULL) {\n@@ -2189,6 +2230,11 @@ ngbe_reset_rx_queue(struct ngbe_adapter *adapter, struct ngbe_rx_queue *rxq)\n \trxq->nb_rx_hold = 0;\n \trxq->pkt_first_seg = NULL;\n \trxq->pkt_last_seg = NULL;\n+\n+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)\n+\trxq->rxrearm_start = 0;\n+\trxq->rxrearm_nb = 0;\n+#endif\n }\n \n uint64_t\n@@ -2339,6 +2385,16 @@ ngbe_dev_rx_queue_setup(struct rte_eth_dev *dev,\n \t\t     rxq->sw_ring, rxq->sw_sc_ring, rxq->rx_ring,\n \t\t     rxq->rx_ring_phys_addr);\n \n+\tif (!rte_is_power_of_2(nb_desc)) {\n+\t\tPMD_INIT_LOG(DEBUG, \"queue[%d] doesn't meet Vector Rx \"\n+\t\t\t\t    \"preconditions - canceling the feature for \"\n+\t\t\t\t    \"the whole port[%d]\",\n+\t\t\t     rxq->queue_id, rxq->port_id);\n+\t\tadapter->rx_vec_allowed = false;\n+\t} else {\n+\t\tngbe_rxq_vec_setup(rxq);\n+\t}\n+\n \tdev->data->rx_queues[queue_idx] = rxq;\n \n \tngbe_reset_rx_queue(adapter, rxq);\n@@ -2379,7 +2435,12 @@ ngbe_dev_rx_descriptor_status(void *rx_queue, uint16_t offset)\n \tif (unlikely(offset >= rxq->nb_rx_desc))\n \t\treturn -EINVAL;\n \n-\tnb_hold = rxq->nb_rx_hold;\n+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)\n+\tif (rxq->rx_using_sse)\n+\t\tnb_hold = rxq->rxrearm_nb;\n+\telse\n+#endif\n+\t\tnb_hold = rxq->nb_rx_hold;\n \tif (offset >= rxq->nb_rx_desc - nb_hold)\n \t\treturn RTE_ETH_RX_DESC_UNAVAIL;\n \n@@ -2740,14 +2801,33 @@ ngbe_dev_mq_rx_configure(struct rte_eth_dev *dev)\n void\n ngbe_set_rx_function(struct rte_eth_dev *dev)\n {\n+\tuint16_t i, rx_using_sse;\n \tstruct ngbe_adapter *adapter = ngbe_dev_adapter(dev);\n \n+\t/*\n+\t * In order to allow Vector Rx there are a few configuration\n+\t * conditions to be met and Rx Bulk Allocation should be allowed.\n+\t */\n+\tif (ngbe_rx_vec_dev_conf_condition_check(dev) ||\n+\t    !adapter->rx_bulk_alloc_allowed ||\n+\t\t\trte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_128) {\n+\t\tPMD_INIT_LOG(DEBUG,\n+\t\t\t     \"Port[%d] doesn't meet Vector Rx preconditions\",\n+\t\t\t     dev->data->port_id);\n+\t\tadapter->rx_vec_allowed = false;\n+\t}\n+\n \tif (dev->data->scattered_rx) {\n \t\t/*\n \t\t * Set the scattered callback: there are bulk and\n \t\t * single allocation versions.\n \t\t */\n-\t\tif (adapter->rx_bulk_alloc_allowed) {\n+\t\tif (adapter->rx_vec_allowed) {\n+\t\t\tPMD_INIT_LOG(DEBUG,\n+\t\t\t\t     \"Using Vector Scattered Rx callback (port=%d).\",\n+\t\t\t\t     dev->data->port_id);\n+\t\t\tdev->rx_pkt_burst = ngbe_recv_scattered_pkts_vec;\n+\t\t} else if 
(adapter->rx_bulk_alloc_allowed) {\n \t\t\tPMD_INIT_LOG(DEBUG, \"Using a Scattered with bulk \"\n \t\t\t\t\t   \"allocation callback (port=%d).\",\n \t\t\t\t     dev->data->port_id);\n@@ -2765,9 +2845,16 @@ ngbe_set_rx_function(struct rte_eth_dev *dev)\n \t * Below we set \"simple\" callbacks according to port/queues parameters.\n \t * If parameters allow we are going to choose between the following\n \t * callbacks:\n+\t *    - Vector\n \t *    - Bulk Allocation\n \t *    - Single buffer allocation (the simplest one)\n \t */\n+\t} else if (adapter->rx_vec_allowed) {\n+\t\tPMD_INIT_LOG(DEBUG, \"Vector rx enabled, please make sure Rx \"\n+\t\t\t\t    \"burst size no less than %d (port=%d).\",\n+\t\t\t     RTE_NGBE_DESCS_PER_LOOP,\n+\t\t\t     dev->data->port_id);\n+\t\tdev->rx_pkt_burst = ngbe_recv_pkts_vec;\n \t} else if (adapter->rx_bulk_alloc_allowed) {\n \t\tPMD_INIT_LOG(DEBUG, \"Rx Burst Bulk Alloc Preconditions are \"\n \t\t\t\t    \"satisfied. Rx Burst Bulk Alloc function \"\n@@ -2783,6 +2870,15 @@ ngbe_set_rx_function(struct rte_eth_dev *dev)\n \n \t\tdev->rx_pkt_burst = ngbe_recv_pkts;\n \t}\n+\n+\trx_using_sse = (dev->rx_pkt_burst == ngbe_recv_scattered_pkts_vec ||\n+\t\t\tdev->rx_pkt_burst == ngbe_recv_pkts_vec);\n+\n+\tfor (i = 0; i < dev->data->nb_rx_queues; i++) {\n+\t\tstruct ngbe_rx_queue *rxq = dev->data->rx_queues[i];\n+\n+\t\trxq->rx_using_sse = rx_using_sse;\n+\t}\n }\n \n static const struct {\n@@ -2793,6 +2889,13 @@ static const struct {\n \t{ ngbe_recv_pkts_sc_bulk_alloc,      \"Scalar Scattered Bulk Alloc\"},\n \t{ ngbe_recv_pkts_bulk_alloc,         \"Scalar Bulk Alloc\"},\n \t{ ngbe_recv_pkts,                    \"Scalar\"},\n+#ifdef RTE_ARCH_X86\n+\t{ ngbe_recv_scattered_pkts_vec,      \"Vector SSE Scattered\" },\n+\t{ ngbe_recv_pkts_vec,                \"Vector SSE\" },\n+#elif defined(RTE_ARCH_ARM64)\n+\t{ ngbe_recv_scattered_pkts_vec,      \"Vector Neon Scattered\" },\n+\t{ ngbe_recv_pkts_vec,                \"Vector Neon\" },\n+#endif\n };\n \n int\n@@ -3311,3 +3414,56 @@ ngbe_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,\n \tqinfo->conf.offloads = txq->offloads;\n \tqinfo->conf.tx_deferred_start = txq->tx_deferred_start;\n }\n+\n+/* Stubs needed for linkage when RTE_ARCH_PPC_64, RTE_ARCH_RISCV or\n+ * RTE_ARCH_LOONGARCH is set.\n+ */\n+#if defined(RTE_ARCH_PPC_64) || defined(RTE_ARCH_RISCV) || \\\n+\tdefined(RTE_ARCH_LOONGARCH)\n+int\n+ngbe_rx_vec_dev_conf_condition_check(__rte_unused struct rte_eth_dev *dev)\n+{\n+\treturn -1;\n+}\n+\n+uint16_t\n+ngbe_recv_pkts_vec(__rte_unused void *rx_queue,\n+\t\t   __rte_unused struct rte_mbuf **rx_pkts,\n+\t\t   __rte_unused uint16_t nb_pkts)\n+{\n+\treturn 0;\n+}\n+\n+uint16_t\n+ngbe_recv_scattered_pkts_vec(__rte_unused void *rx_queue,\n+\t\t\t     __rte_unused struct rte_mbuf **rx_pkts,\n+\t\t\t     __rte_unused uint16_t nb_pkts)\n+{\n+\treturn 0;\n+}\n+\n+int\n+ngbe_rxq_vec_setup(__rte_unused struct ngbe_rx_queue *rxq)\n+{\n+\treturn -1;\n+}\n+\n+uint16_t\n+ngbe_xmit_fixed_burst_vec(__rte_unused void *tx_queue,\n+\t\t\t  __rte_unused struct rte_mbuf **tx_pkts,\n+\t\t\t  __rte_unused uint16_t nb_pkts)\n+{\n+\treturn 0;\n+}\n+\n+int\n+ngbe_txq_vec_setup(__rte_unused struct ngbe_tx_queue *txq)\n+{\n+\treturn -1;\n+}\n+\n+void\n+ngbe_rx_queue_release_mbufs_vec(__rte_unused struct ngbe_rx_queue *rxq)\n+{\n+}\n+#endif\ndiff --git a/drivers/net/ngbe/ngbe_rxtx.h b/drivers/net/ngbe/ngbe_rxtx.h\nindex 9130f9d0df..41580ba0b9 100644\n--- a/drivers/net/ngbe/ngbe_rxtx.h\n+++ b/drivers/net/ngbe/ngbe_rxtx.h\n@@ -203,6 
+203,8 @@ struct ngbe_tx_desc {\n #define RTE_PMD_NGBE_RX_MAX_BURST 32\n #define RTE_NGBE_TX_MAX_FREE_BUF_SZ 64\n \n+#define RTE_NGBE_DESCS_PER_LOOP    4\n+\n #define RX_RING_SZ ((NGBE_RING_DESC_MAX + RTE_PMD_NGBE_RX_MAX_BURST) * \\\n \t\t    sizeof(struct ngbe_rx_desc))\n \n@@ -237,6 +239,13 @@ struct ngbe_tx_entry {\n \tuint16_t last_id; /**< Index of last scattered descriptor. */\n };\n \n+/**\n+ * Structure associated with each descriptor of the Tx ring of a Tx queue.\n+ */\n+struct ngbe_tx_entry_v {\n+\tstruct rte_mbuf *mbuf; /**< mbuf associated with Tx desc, if any. */\n+};\n+\n /**\n  * Structure associated with each Rx queue.\n  */\n@@ -254,6 +263,7 @@ struct ngbe_rx_queue {\n \n \tstruct rte_mbuf *pkt_first_seg; /**< First segment of current packet */\n \tstruct rte_mbuf *pkt_last_seg; /**< Last segment of current packet */\n+\tuint64_t        mbuf_initializer; /**< value to init mbufs */\n \tuint16_t        nb_rx_desc; /**< number of Rx descriptors */\n \tuint16_t        rx_tail;  /**< current value of RDT register */\n \tuint16_t        nb_rx_hold; /**< number of held free Rx desc */\n@@ -262,6 +272,11 @@ struct ngbe_rx_queue {\n \tuint16_t rx_next_avail; /**< idx of next staged pkt to ret to app */\n \tuint16_t rx_free_trigger; /**< triggers rx buffer allocation */\n \n+\tuint8_t         rx_using_sse;   /**< indicates that vector Rx is in use */\n+#if defined(RTE_ARCH_X86) || defined(RTE_ARCH_ARM)\n+\tuint16_t        rxrearm_nb;     /**< number of remaining to be re-armed */\n+\tuint16_t        rxrearm_start;  /**< the idx we start the re-arming from */\n+#endif\n \tuint16_t        rx_free_thresh; /**< max free Rx desc to hold */\n \tuint16_t        queue_id; /**< RX queue index */\n \tuint16_t        reg_idx;  /**< RX queue register index */\n@@ -325,7 +340,12 @@ struct ngbe_tx_queue {\n \tvolatile struct ngbe_tx_desc *tx_ring;\n \n \tuint64_t             tx_ring_phys_addr; /**< Tx ring DMA address */\n-\tstruct ngbe_tx_entry *sw_ring; /**< address of SW ring for scalar PMD */\n+\tunion {\n+\t\t/**< address of SW ring for scalar PMD. 
*/\n+\t\tstruct ngbe_tx_entry *sw_ring;\n+\t\t/**< address of SW ring for vector PMD */\n+\t\tstruct ngbe_tx_entry_v *sw_ring_v;\n+\t};\n \tvolatile uint32_t    *tdt_reg_addr; /**< Address of TDT register */\n \tvolatile uint32_t    *tdc_reg_addr; /**< Address of TDC register */\n \tuint16_t             nb_tx_desc;    /**< number of Tx descriptors */\n@@ -368,6 +388,16 @@ struct ngbe_txq_ops {\n void ngbe_set_tx_function(struct rte_eth_dev *dev, struct ngbe_tx_queue *txq);\n \n void ngbe_set_rx_function(struct rte_eth_dev *dev);\n+uint16_t ngbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t    uint16_t nb_pkts);\n+uint16_t ngbe_recv_scattered_pkts_vec(void *rx_queue,\n+\t\tstruct rte_mbuf **rx_pkts, uint16_t nb_pkts);\n+int ngbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev);\n+int ngbe_rxq_vec_setup(struct ngbe_rx_queue *rxq);\n+void ngbe_rx_queue_release_mbufs_vec(struct ngbe_rx_queue *rxq);\n+uint16_t ngbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,\n+\t\t\t\t   uint16_t nb_pkts);\n+int ngbe_txq_vec_setup(struct ngbe_tx_queue *txq);\n int ngbe_dev_tx_done_cleanup(void *tx_queue, uint32_t free_cnt);\n \n uint64_t ngbe_get_tx_port_offloads(struct rte_eth_dev *dev);\ndiff --git a/drivers/net/ngbe/ngbe_rxtx_vec_common.h b/drivers/net/ngbe/ngbe_rxtx_vec_common.h\nnew file mode 100644\nindex 0000000000..1ce175de52\n--- /dev/null\n+++ b/drivers/net/ngbe/ngbe_rxtx_vec_common.h\n@@ -0,0 +1,296 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(c) 2015-2024 Beijing WangXun Technology Co., Ltd.\n+ * Copyright(c) 2010-2015 Intel Corporation\n+ */\n+\n+#ifndef _NGBE_RXTX_VEC_COMMON_H_\n+#define _NGBE_RXTX_VEC_COMMON_H_\n+#include <stdint.h>\n+\n+#include \"ngbe_ethdev.h\"\n+#include \"ngbe_rxtx.h\"\n+\n+#define NGBE_RXD_PTID_SHIFT 9\n+\n+#define RTE_NGBE_RXQ_REARM_THRESH      32\n+#define RTE_NGBE_MAX_RX_BURST          RTE_NGBE_RXQ_REARM_THRESH\n+\n+static inline uint16_t\n+reassemble_packets(struct ngbe_rx_queue *rxq, struct rte_mbuf **rx_bufs,\n+\t\t   uint16_t nb_bufs, uint8_t *split_flags)\n+{\n+\tstruct rte_mbuf *pkts[nb_bufs]; /*finished pkts*/\n+\tstruct rte_mbuf *start = rxq->pkt_first_seg;\n+\tstruct rte_mbuf *end =  rxq->pkt_last_seg;\n+\tunsigned int pkt_idx, buf_idx;\n+\n+\tfor (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {\n+\t\tif (end != NULL) {\n+\t\t\t/* processing a split packet */\n+\t\t\tend->next = rx_bufs[buf_idx];\n+\t\t\trx_bufs[buf_idx]->data_len += rxq->crc_len;\n+\n+\t\t\tstart->nb_segs++;\n+\t\t\tstart->pkt_len += rx_bufs[buf_idx]->data_len;\n+\t\t\tend = end->next;\n+\n+\t\t\tif (!split_flags[buf_idx]) {\n+\t\t\t\t/* it's the last packet of the set */\n+\t\t\t\tstart->hash = end->hash;\n+\t\t\t\tstart->ol_flags = end->ol_flags;\n+\t\t\t\t/* we need to strip crc for the whole packet */\n+\t\t\t\tstart->pkt_len -= rxq->crc_len;\n+\t\t\t\tif (end->data_len > rxq->crc_len) {\n+\t\t\t\t\tend->data_len -= rxq->crc_len;\n+\t\t\t\t} else {\n+\t\t\t\t\t/* free up last mbuf */\n+\t\t\t\t\tstruct rte_mbuf *secondlast = start;\n+\n+\t\t\t\t\tstart->nb_segs--;\n+\t\t\t\t\twhile (secondlast->next != end)\n+\t\t\t\t\t\tsecondlast = secondlast->next;\n+\t\t\t\t\tsecondlast->data_len -= (rxq->crc_len -\n+\t\t\t\t\t\t\tend->data_len);\n+\t\t\t\t\tsecondlast->next = NULL;\n+\t\t\t\t\trte_pktmbuf_free_seg(end);\n+\t\t\t\t}\n+\t\t\t\tpkts[pkt_idx++] = start;\n+\t\t\t\tstart = NULL;\n+\t\t\t\tend = NULL;\n+\t\t\t}\n+\t\t} else {\n+\t\t\t/* not processing a split packet */\n+\t\t\tif (!split_flags[buf_idx]) {\n+\t\t\t\t/* 
not a split packet, save and skip */\n+\t\t\t\tpkts[pkt_idx++] = rx_bufs[buf_idx];\n+\t\t\t\tcontinue;\n+\t\t\t}\n+\t\t\tstart = rx_bufs[buf_idx];\n+\t\t\tend = start;\n+\t\t\trx_bufs[buf_idx]->data_len += rxq->crc_len;\n+\t\t\trx_bufs[buf_idx]->pkt_len += rxq->crc_len;\n+\t\t}\n+\t}\n+\n+\t/* save the partial packet for next time */\n+\trxq->pkt_first_seg = start;\n+\trxq->pkt_last_seg = end;\n+\tmemcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));\n+\treturn pkt_idx;\n+}\n+\n+static __rte_always_inline int\n+ngbe_tx_free_bufs(struct ngbe_tx_queue *txq)\n+{\n+\tstruct ngbe_tx_entry_v *txep;\n+\tuint32_t status;\n+\tuint32_t n;\n+\tuint32_t i;\n+\tint nb_free = 0;\n+\tstruct rte_mbuf *m, *free[RTE_NGBE_TX_MAX_FREE_BUF_SZ];\n+\n+\t/* check DD bit on threshold descriptor */\n+\tstatus = txq->tx_ring[txq->tx_next_dd].dw3;\n+\tif (!(status & NGBE_TXD_DD)) {\n+\t\tif (txq->nb_tx_free >> 1 < txq->tx_free_thresh)\n+\t\t\tngbe_set32_masked(txq->tdc_reg_addr,\n+\t\t\t\tNGBE_TXCFG_FLUSH, NGBE_TXCFG_FLUSH);\n+\t\treturn 0;\n+\t}\n+\n+\tn = txq->tx_free_thresh;\n+\n+\t/*\n+\t * first buffer to free from S/W ring is at index\n+\t * tx_next_dd - (tx_rs_thresh-1)\n+\t */\n+\ttxep = &txq->sw_ring_v[txq->tx_next_dd - (n - 1)];\n+\tm = rte_pktmbuf_prefree_seg(txep[0].mbuf);\n+\tif (likely(m != NULL)) {\n+\t\tfree[0] = m;\n+\t\tnb_free = 1;\n+\t\tfor (i = 1; i < n; i++) {\n+\t\t\tm = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\tif (likely(m != NULL)) {\n+\t\t\t\tif (likely(m->pool == free[0]->pool)) {\n+\t\t\t\t\tfree[nb_free++] = m;\n+\t\t\t\t} else {\n+\t\t\t\t\trte_mempool_put_bulk(free[0]->pool,\n+\t\t\t\t\t\t\t(void *)free, nb_free);\n+\t\t\t\t\tfree[0] = m;\n+\t\t\t\t\tnb_free = 1;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\trte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);\n+\t} else {\n+\t\tfor (i = 1; i < n; i++) {\n+\t\t\tm = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\tif (m != NULL)\n+\t\t\t\trte_mempool_put(m->pool, m);\n+\t\t}\n+\t}\n+\n+\t/* buffers were freed, update counters */\n+\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_free_thresh);\n+\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_free_thresh);\n+\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_free_thresh - 1);\n+\n+\treturn txq->tx_free_thresh;\n+}\n+\n+static __rte_always_inline void\n+tx_backlog_entry(struct ngbe_tx_entry_v *txep,\n+\t\t struct rte_mbuf **tx_pkts, uint16_t nb_pkts)\n+{\n+\tint i;\n+\n+\tfor (i = 0; i < (int)nb_pkts; ++i)\n+\t\ttxep[i].mbuf = tx_pkts[i];\n+}\n+\n+static inline void\n+_ngbe_tx_queue_release_mbufs_vec(struct ngbe_tx_queue *txq)\n+{\n+\tunsigned int i;\n+\tstruct ngbe_tx_entry_v *txe;\n+\tconst uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);\n+\n+\tif (txq->sw_ring == NULL || txq->nb_tx_free == max_desc)\n+\t\treturn;\n+\n+\t/* release the used mbufs in sw_ring */\n+\tfor (i = txq->tx_next_dd - (txq->tx_free_thresh - 1);\n+\t     i != txq->tx_tail;\n+\t     i = (i + 1) % txq->nb_tx_desc) {\n+\t\ttxe = &txq->sw_ring_v[i];\n+\t\trte_pktmbuf_free_seg(txe->mbuf);\n+\t}\n+\ttxq->nb_tx_free = max_desc;\n+\n+\t/* reset tx_entry */\n+\tfor (i = 0; i < txq->nb_tx_desc; i++) {\n+\t\ttxe = &txq->sw_ring_v[i];\n+\t\ttxe->mbuf = NULL;\n+\t}\n+}\n+\n+static inline void\n+_ngbe_rx_queue_release_mbufs_vec(struct ngbe_rx_queue *rxq)\n+{\n+\tunsigned int i;\n+\n+\tif (rxq->sw_ring == NULL || rxq->rxrearm_nb >= rxq->nb_rx_desc)\n+\t\treturn;\n+\n+\t/* free all mbufs that are valid in the ring */\n+\tif (rxq->rxrearm_nb == 0) {\n+\t\tfor (i = 0; i < 
rxq->nb_rx_desc; i++) {\n+\t\t\tif (rxq->sw_ring[i].mbuf != NULL)\n+\t\t\t\trte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);\n+\t\t}\n+\t} else {\n+\t\tfor (i = rxq->rx_tail;\n+\t\t     i != rxq->rxrearm_start;\n+\t\t     i = (i + 1) % rxq->nb_rx_desc) {\n+\t\t\tif (rxq->sw_ring[i].mbuf != NULL)\n+\t\t\t\trte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);\n+\t\t}\n+\t}\n+\n+\trxq->rxrearm_nb = rxq->nb_rx_desc;\n+\n+\t/* set all entries to NULL */\n+\tmemset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);\n+}\n+\n+static inline void\n+_ngbe_tx_free_swring_vec(struct ngbe_tx_queue *txq)\n+{\n+\tif (txq == NULL)\n+\t\treturn;\n+\n+\tif (txq->sw_ring != NULL) {\n+\t\trte_free(txq->sw_ring_v - 1);\n+\t\ttxq->sw_ring_v = NULL;\n+\t}\n+}\n+\n+static inline void\n+_ngbe_reset_tx_queue_vec(struct ngbe_tx_queue *txq)\n+{\n+\tstatic const struct ngbe_tx_desc zeroed_desc = {0};\n+\tstruct ngbe_tx_entry_v *txe = txq->sw_ring_v;\n+\tuint16_t i;\n+\n+\t/* Zero out HW ring memory */\n+\tfor (i = 0; i < txq->nb_tx_desc; i++)\n+\t\ttxq->tx_ring[i] = zeroed_desc;\n+\n+\t/* Initialize SW ring entries */\n+\tfor (i = 0; i < txq->nb_tx_desc; i++) {\n+\t\tvolatile struct ngbe_tx_desc *txd = &txq->tx_ring[i];\n+\n+\t\ttxd->dw3 = NGBE_TXD_DD;\n+\t\ttxe[i].mbuf = NULL;\n+\t}\n+\n+\ttxq->tx_next_dd = (uint16_t)(txq->tx_free_thresh - 1);\n+\n+\ttxq->tx_tail = 0;\n+\t/*\n+\t * Always allow 1 descriptor to be un-allocated to avoid\n+\t * a H/W race condition\n+\t */\n+\ttxq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);\n+\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);\n+\ttxq->ctx_curr = 0;\n+\tmemset((void *)&txq->ctx_cache, 0,\n+\t\tNGBE_CTX_NUM * sizeof(struct ngbe_ctx_info));\n+}\n+\n+static inline int\n+ngbe_rxq_vec_setup_default(struct ngbe_rx_queue *rxq)\n+{\n+\tuintptr_t p;\n+\tstruct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */\n+\n+\tmb_def.nb_segs = 1;\n+\tmb_def.data_off = RTE_PKTMBUF_HEADROOM;\n+\tmb_def.port = rxq->port_id;\n+\trte_mbuf_refcnt_set(&mb_def, 1);\n+\n+\t/* prevent compiler reordering: rearm_data covers previous fields */\n+\trte_compiler_barrier();\n+\tp = (uintptr_t)&mb_def.rearm_data;\n+\trxq->mbuf_initializer = *(uint64_t *)p;\n+\treturn 0;\n+}\n+\n+static inline int\n+ngbe_txq_vec_setup_default(struct ngbe_tx_queue *txq,\n+\t\t\t    const struct ngbe_txq_ops *txq_ops)\n+{\n+\tif (txq->sw_ring_v == NULL)\n+\t\treturn -1;\n+\n+\t/* leave the first one for overflow */\n+\ttxq->sw_ring_v = txq->sw_ring_v + 1;\n+\ttxq->ops = txq_ops;\n+\n+\treturn 0;\n+}\n+\n+static inline int\n+ngbe_rx_vec_dev_conf_condition_check_default(struct rte_eth_dev *dev)\n+{\n+\tRTE_SET_USED(dev);\n+#ifndef RTE_LIBRTE_IEEE1588\n+\n+\treturn 0;\n+#else\n+\treturn -1;\n+#endif\n+}\n+#endif\ndiff --git a/drivers/net/ngbe/ngbe_rxtx_vec_neon.c b/drivers/net/ngbe/ngbe_rxtx_vec_neon.c\nnew file mode 100644\nindex 0000000000..dcf12b7070\n--- /dev/null\n+++ b/drivers/net/ngbe/ngbe_rxtx_vec_neon.c\n@@ -0,0 +1,602 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(c) 2015-2024 Beijing WangXun Technology Co., Ltd.\n+ * Copyright(c) 2010-2015 Intel Corporation\n+ */\n+\n+#include <ethdev_driver.h>\n+#include <rte_malloc.h>\n+#include <rte_vect.h>\n+\n+#include \"ngbe_type.h\"\n+#include \"ngbe_ethdev.h\"\n+#include \"ngbe_rxtx.h\"\n+#include \"ngbe_rxtx_vec_common.h\"\n+\n+static inline void\n+ngbe_rxq_rearm(struct ngbe_rx_queue *rxq)\n+{\n+\tint i;\n+\tuint16_t rx_id;\n+\tvolatile struct ngbe_rx_desc *rxdp;\n+\tstruct ngbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\tstruct rte_mbuf 
*mb0, *mb1;\n+\tuint64x2_t dma_addr0, dma_addr1;\n+\tuint64x2_t zero = vdupq_n_u64(0);\n+\tuint64_t paddr;\n+\tuint8x8_t p;\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\n+\t/* Pull 'n' more MBUFs into the software ring */\n+\tif (unlikely(rte_mempool_get_bulk(rxq->mb_pool,\n+\t\t\t\t\t  (void *)rxep,\n+\t\t\t\t\t  RTE_NGBE_RXQ_REARM_THRESH) < 0)) {\n+\t\tif (rxq->rxrearm_nb + RTE_NGBE_RXQ_REARM_THRESH >=\n+\t\t    rxq->nb_rx_desc) {\n+\t\t\tfor (i = 0; i < RTE_NGBE_DESCS_PER_LOOP; i++) {\n+\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\tvst1q_u64((uint64_t *)(uintptr_t)&rxdp[i], zero);\n+\t\t\t}\n+\t\t}\n+\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=\n+\t\t\tRTE_NGBE_RXQ_REARM_THRESH;\n+\t\treturn;\n+\t}\n+\n+\tp = vld1_u8((uint8_t *)&rxq->mbuf_initializer);\n+\n+\t/* Initialize the mbufs in vector, process 2 mbufs in one loop */\n+\tfor (i = 0; i < RTE_NGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {\n+\t\tmb0 = rxep[0].mbuf;\n+\t\tmb1 = rxep[1].mbuf;\n+\n+\t\t/*\n+\t\t * Flush mbuf with pkt template.\n+\t\t * Data to be rearmed is 6 bytes long.\n+\t\t */\n+\t\tvst1_u8((uint8_t *)&mb0->rearm_data, p);\n+\t\tpaddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;\n+\t\tdma_addr0 = vsetq_lane_u64(paddr, zero, 0);\n+\t\t/* flush desc with pa dma_addr */\n+\t\tvst1q_u64((uint64_t *)(uintptr_t)rxdp++, dma_addr0);\n+\n+\t\tvst1_u8((uint8_t *)&mb1->rearm_data, p);\n+\t\tpaddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;\n+\t\tdma_addr1 = vsetq_lane_u64(paddr, zero, 0);\n+\t\tvst1q_u64((uint64_t *)(uintptr_t)rxdp++, dma_addr1);\n+\t}\n+\n+\trxq->rxrearm_start += RTE_NGBE_RXQ_REARM_THRESH;\n+\tif (rxq->rxrearm_start >= rxq->nb_rx_desc)\n+\t\trxq->rxrearm_start = 0;\n+\n+\trxq->rxrearm_nb -= RTE_NGBE_RXQ_REARM_THRESH;\n+\n+\trx_id = (uint16_t)((rxq->rxrearm_start == 0) ?\n+\t\t\t     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));\n+\n+\t/* Update the tail pointer on the NIC */\n+\tngbe_set32(rxq->rdt_reg_addr, rx_id);\n+}\n+\n+static inline void\n+desc_to_olflags_v(uint8x16x2_t sterr_tmp1, uint8x16x2_t sterr_tmp2,\n+\t\t  uint8x16_t staterr, uint8_t vlan_flags,\n+\t\t  struct rte_mbuf **rx_pkts)\n+{\n+\tuint8x16_t ptype;\n+\tuint8x16_t vtag_lo, vtag_hi, vtag;\n+\tuint8x16_t temp_csum, temp_vp;\n+\tuint8x16_t vtag_mask = vdupq_n_u8(0x0F);\n+\tuint32x4_t csum = {0, 0, 0, 0};\n+\n+\tunion {\n+\t\tuint16_t e[4];\n+\t\tuint64_t word;\n+\t} vol;\n+\n+\tconst uint8x16_t rsstype_msk = {\n+\t\t\t0x0F, 0x0F, 0x0F, 0x0F,\n+\t\t\t0x00, 0x00, 0x00, 0x00,\n+\t\t\t0x00, 0x00, 0x00, 0x00,\n+\t\t\t0x00, 0x00, 0x00, 0x00};\n+\n+\tconst uint8x16_t rss_flags = {\n+\t\t\t0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH,\n+\t\t\t0, RTE_MBUF_F_RX_RSS_HASH, 0, RTE_MBUF_F_RX_RSS_HASH,\n+\t\t\tRTE_MBUF_F_RX_RSS_HASH, 0, 0, 0,\n+\t\t\t0, 0, 0, RTE_MBUF_F_RX_FDIR};\n+\n+\t/* mask everything except vlan present and l4/ip csum error */\n+\tconst uint8x16_t vlan_csum_msk = {\n+\t\t\tNGBE_RXD_STAT_VLAN, NGBE_RXD_STAT_VLAN,\n+\t\t\tNGBE_RXD_STAT_VLAN, NGBE_RXD_STAT_VLAN,\n+\t\t\t0, 0, 0, 0,\n+\t\t\t0, 0, 0, 0,\n+\t\t\t(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24,\n+\t\t\t(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24,\n+\t\t\t(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24,\n+\t\t\t(NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 24};\n+\n+\t/* map vlan present and l4/ip csum error to ol_flags */\n+\tconst uint8x16_t vlan_csum_map_lo = {\n+\t\t\tRTE_MBUF_F_RX_IP_CKSUM_GOOD,\n+\t\t\tRTE_MBUF_F_RX_IP_CKSUM_GOOD | 
RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\t\tRTE_MBUF_F_RX_IP_CKSUM_BAD,\n+\t\t\tRTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\t\t0, 0, 0, 0,\n+\t\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD,\n+\t\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD,\n+\t\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\t\t0, 0, 0, 0};\n+\n+\tconst uint8x16_t vlan_csum_map_hi = {\n+\t\t\tRTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,\n+\t\t\tRTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,\n+\t\t\t0, 0, 0, 0,\n+\t\t\tRTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,\n+\t\t\tRTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,\n+\t\t\t0, 0, 0, 0};\n+\n+\tptype = vzipq_u8(sterr_tmp1.val[0], sterr_tmp2.val[0]).val[0];\n+\tptype = vandq_u8(ptype, rsstype_msk);\n+\tptype = vqtbl1q_u8(rss_flags, ptype);\n+\n+\t/* extract vlan_flags and csum_error from staterr */\n+\tvtag = vandq_u8(staterr, vlan_csum_msk);\n+\n+\t/* csum bits are in the most significant, to use shuffle we need to\n+\t * shift them. Change mask from 0xc0 to 0x03.\n+\t */\n+\ttemp_csum = vshrq_n_u8(vtag, 6);\n+\n+\t/* Change vlan present mask from 0x20 to 0x08.\n+\t */\n+\ttemp_vp = vshrq_n_u8(vtag, 2);\n+\n+\t/* 'OR' the most significant 32 bits containing the checksum flags with\n+\t * the vlan present flags. Then bits layout of each lane(8bits) will be\n+\t * 'xxxx,VLAN,x,ERR_IPCS,ERR_L4CS'\n+\t */\n+\tcsum = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u8(temp_csum), 3), csum, 0);\n+\tvtag = vorrq_u8(vreinterpretq_u8_u32(csum), vtag);\n+\tvtag = vorrq_u8(vtag, temp_vp);\n+\tvtag = vandq_u8(vtag, vtag_mask);\n+\n+\t/* convert L4 checksum correct type to vtag_hi */\n+\tvtag_hi = vqtbl1q_u8(vlan_csum_map_hi, vtag);\n+\tvtag_hi = vshrq_n_u8(vtag_hi, 7);\n+\n+\t/* convert VP, IPE, L4E to vtag_lo */\n+\tvtag_lo = vqtbl1q_u8(vlan_csum_map_lo, vtag);\n+\tvtag_lo = vorrq_u8(ptype, vtag_lo);\n+\n+\tvtag = vzipq_u8(vtag_lo, vtag_hi).val[0];\n+\tvol.word = vgetq_lane_u64(vreinterpretq_u64_u8(vtag), 0);\n+\n+\trx_pkts[0]->ol_flags = vol.e[0];\n+\trx_pkts[1]->ol_flags = vol.e[1];\n+\trx_pkts[2]->ol_flags = vol.e[2];\n+\trx_pkts[3]->ol_flags = vol.e[3];\n+}\n+\n+#define NGBE_VPMD_DESC_EOP_MASK\t0x02020202\n+#define NGBE_UINT8_BIT\t\t\t(CHAR_BIT * sizeof(uint8_t))\n+\n+static inline void\n+desc_to_ptype_v(uint64x2_t descs[4], uint16_t pkt_type_mask,\n+\t\tstruct rte_mbuf **rx_pkts)\n+{\n+\tuint32x4_t ptype_mask = vdupq_n_u32((uint32_t)pkt_type_mask);\n+\tuint32x4_t ptype0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),\n+\t\t\t\tvreinterpretq_u32_u64(descs[2])).val[0];\n+\tuint32x4_t ptype1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),\n+\t\t\t\tvreinterpretq_u32_u64(descs[3])).val[0];\n+\n+\t/* interleave low 32 bits,\n+\t * now we have 4 ptypes in a NEON register\n+\t */\n+\tptype0 = vzipq_u32(ptype0, ptype1).val[0];\n+\n+\t/* shift right by NGBE_RXD_PTID_SHIFT, and apply ptype mask */\n+\tptype0 = vandq_u32(vshrq_n_u32(ptype0, NGBE_RXD_PTID_SHIFT), ptype_mask);\n+\n+\trx_pkts[0]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 0));\n+\trx_pkts[1]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 1));\n+\trx_pkts[2]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 2));\n+\trx_pkts[3]->packet_type = ngbe_decode_ptype(vgetq_lane_u32(ptype0, 3));\n+}\n+\n+/**\n+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_NGBE_DESCS_PER_LOOP)\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_NGBE_DESCS_PER_LOOP, just return no packet\n+ * - floor 
align nb_pkts to a RTE_NGBE_DESC_PER_LOOP power-of-two\n+ */\n+static inline uint16_t\n+_recv_raw_pkts_vec(struct ngbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n+\t\t   uint16_t nb_pkts, uint8_t *split_packet)\n+{\n+\tvolatile struct ngbe_rx_desc *rxdp;\n+\tstruct ngbe_rx_entry *sw_ring;\n+\tuint16_t nb_pkts_recd;\n+\tint pos;\n+\tuint8x16_t shuf_msk = {\n+\t\t0xFF, 0xFF,\n+\t\t0xFF, 0xFF,  /* skip 32 bits pkt_type */\n+\t\t12, 13,      /* octet 12~13, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */\n+\t\t12, 13,      /* octet 12~13, 16 bits data_len */\n+\t\t14, 15,      /* octet 14~15, low 16 bits vlan_macip */\n+\t\t4, 5, 6, 7  /* octet 4~7, 32bits rss */\n+\t\t};\n+\tuint16x8_t crc_adjust = {0, 0, rxq->crc_len, 0,\n+\t\t\t\t rxq->crc_len, 0, 0, 0};\n+\tuint8_t vlan_flags;\n+\n+\t/* nb_pkts has to be floor-aligned to RTE_NGBE_DESCS_PER_LOOP */\n+\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_NGBE_DESCS_PER_LOOP);\n+\n+\t/* Just the act of getting into the function from the application is\n+\t * going to cost about 7 cycles\n+\t */\n+\trxdp = rxq->rx_ring + rxq->rx_tail;\n+\n+\trte_prefetch_non_temporal(rxdp);\n+\n+\t/* See if we need to rearm the RX queue - gives the prefetch a bit\n+\t * of time to act\n+\t */\n+\tif (rxq->rxrearm_nb > RTE_NGBE_RXQ_REARM_THRESH)\n+\t\tngbe_rxq_rearm(rxq);\n+\n+\t/* Before we start moving massive data around, check to see if\n+\t * there is actually a packet available\n+\t */\n+\tif (!(rxdp->qw1.lo.status & rte_cpu_to_le_32(NGBE_RXD_STAT_DD)))\n+\t\treturn 0;\n+\n+\t/* Cache is empty -> need to scan the buffer rings, but first move\n+\t * the next 'n' mbufs into the cache\n+\t */\n+\tsw_ring = &rxq->sw_ring[rxq->rx_tail];\n+\n+\t/* ensure these 2 flags are in the lower 8 bits */\n+\tRTE_BUILD_BUG_ON((RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED) > UINT8_MAX);\n+\tvlan_flags = rxq->vlan_flags & UINT8_MAX;\n+\n+\t/* A. load 4 packet in one loop\n+\t * B. copy 4 mbuf point from swring to rx_pkts\n+\t * C. calc the number of DD bits among the 4 packets\n+\t * [C*. extract the end-of-packet bit, if requested]\n+\t * D. fill info. from desc to mbuf\n+\t */\n+\tfor (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;\n+\t\t\tpos += RTE_NGBE_DESCS_PER_LOOP,\n+\t\t\trxdp += RTE_NGBE_DESCS_PER_LOOP) {\n+\t\tuint64x2_t descs[RTE_NGBE_DESCS_PER_LOOP];\n+\t\tuint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;\n+\t\tuint8x16x2_t sterr_tmp1, sterr_tmp2;\n+\t\tuint64x2_t mbp1, mbp2;\n+\t\tuint8x16_t staterr;\n+\t\tuint16x8_t tmp;\n+\t\tuint32_t stat;\n+\n+\t\t/* B.1 load 2 mbuf point */\n+\t\tmbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);\n+\n+\t\t/* B.2 copy 2 mbuf point into rx_pkts  */\n+\t\tvst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);\n+\n+\t\t/* B.1 load 2 mbuf point */\n+\t\tmbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);\n+\n+\t\t/* A. 
load 4 pkts descs */\n+\t\tdescs[0] =  vld1q_u64((uint64_t *)(uintptr_t)(rxdp));\n+\t\tdescs[1] =  vld1q_u64((uint64_t *)(uintptr_t)(rxdp + 1));\n+\t\tdescs[2] =  vld1q_u64((uint64_t *)(uintptr_t)(rxdp + 2));\n+\t\tdescs[3] =  vld1q_u64((uint64_t *)(uintptr_t)(rxdp + 3));\n+\n+\t\t/* B.2 copy 2 mbuf point into rx_pkts  */\n+\t\tvst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);\n+\n+\t\tif (split_packet) {\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos]);\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos + 1]);\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos + 2]);\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos + 3]);\n+\t\t}\n+\n+\t\t/* D.1 pkt 3,4 convert format from desc to pktmbuf */\n+\t\tpkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);\n+\t\tpkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);\n+\n+\t\t/* D.1 pkt 1,2 convert format from desc to pktmbuf */\n+\t\tpkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);\n+\t\tpkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);\n+\n+\t\t/* C.1 4=>2 filter staterr info only */\n+\t\tsterr_tmp2 = vzipq_u8(vreinterpretq_u8_u64(descs[1]),\n+\t\t\t\t      vreinterpretq_u8_u64(descs[3]));\n+\t\t/* C.1 4=>2 filter staterr info only */\n+\t\tsterr_tmp1 = vzipq_u8(vreinterpretq_u8_u64(descs[0]),\n+\t\t\t\t      vreinterpretq_u8_u64(descs[2]));\n+\n+\t\t/* C.2 get 4 pkts staterr value  */\n+\t\tstaterr = vzipq_u8(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];\n+\n+\t\t/* set ol_flags with vlan packet type */\n+\t\tdesc_to_olflags_v(sterr_tmp1, sterr_tmp2, staterr, vlan_flags,\n+\t\t\t\t  &rx_pkts[pos]);\n+\n+\t\t/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */\n+\t\ttmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);\n+\t\tpkt_mb4 = vreinterpretq_u8_u16(tmp);\n+\t\ttmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);\n+\t\tpkt_mb3 = vreinterpretq_u8_u16(tmp);\n+\n+\t\t/* D.3 copy final 3,4 data to rx_pkts */\n+\t\tvst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,\n+\t\t\t pkt_mb4);\n+\t\tvst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,\n+\t\t\t pkt_mb3);\n+\n+\t\t/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */\n+\t\ttmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);\n+\t\tpkt_mb2 = vreinterpretq_u8_u16(tmp);\n+\t\ttmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);\n+\t\tpkt_mb1 = vreinterpretq_u8_u16(tmp);\n+\n+\t\t/* C* extract and record EOP bit */\n+\t\tif (split_packet) {\n+\t\t\tstat = vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);\n+\t\t\t/* and with mask to extract bits, flipping 1-0 */\n+\t\t\t*(int *)split_packet = ~stat & NGBE_VPMD_DESC_EOP_MASK;\n+\n+\t\t\tsplit_packet += RTE_NGBE_DESCS_PER_LOOP;\n+\t\t}\n+\n+\t\t/* C.4 expand DD bit to saturate UINT8 */\n+\t\tstaterr = vshlq_n_u8(staterr, NGBE_UINT8_BIT - 1);\n+\t\tstaterr = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(staterr),\n+\t\t\t\t\t      NGBE_UINT8_BIT - 1));\n+\t\tstat = ~vgetq_lane_u32(vreinterpretq_u32_u8(staterr), 0);\n+\n+\t\trte_prefetch_non_temporal(rxdp + RTE_NGBE_DESCS_PER_LOOP);\n+\n+\t\t/* D.3 copy final 1,2 data to rx_pkts */\n+\t\tvst1q_u8((uint8_t *)&rx_pkts[pos + 1]->rx_descriptor_fields1,\n+\t\t\t pkt_mb2);\n+\t\tvst1q_u8((uint8_t *)&rx_pkts[pos]->rx_descriptor_fields1,\n+\t\t\t pkt_mb1);\n+\n+\t\tdesc_to_ptype_v(descs, NGBE_PTID_MASK, &rx_pkts[pos]);\n+\n+\t\t/* C.5 calc available number of desc */\n+\t\tif (unlikely(stat == 0)) {\n+\t\t\tnb_pkts_recd += RTE_NGBE_DESCS_PER_LOOP;\n+\t\t} else {\n+\t\t\tnb_pkts_recd += rte_ctz32(stat) / 
NGBE_UINT8_BIT;\n+\t\t\tbreak;\n+\t\t}\n+\t}\n+\n+\t/* Update our internal tail pointer */\n+\trxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);\n+\trxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));\n+\trxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);\n+\n+\treturn nb_pkts_recd;\n+}\n+\n+/**\n+ * vPMD receive routine, only accept(nb_pkts >= RTE_NGBE_DESCS_PER_LOOP)\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_NGBE_DESCS_PER_LOOP, just return no packet\n+ * - floor align nb_pkts to a RTE_NGBE_DESC_PER_LOOP power-of-two\n+ */\n+uint16_t\n+ngbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\tuint16_t nb_pkts)\n+{\n+\treturn _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);\n+}\n+\n+/**\n+ * vPMD receive routine that reassembles scattered packets\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_NGBE_DESCS_PER_LOOP, just return no packet\n+ * - floor align nb_pkts to a RTE_NGBE_DESC_PER_LOOP power-of-two\n+ */\n+static uint16_t\n+ngbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t      uint16_t nb_pkts)\n+{\n+\tstruct ngbe_rx_queue *rxq = rx_queue;\n+\tuint8_t split_flags[RTE_NGBE_MAX_RX_BURST] = {0};\n+\n+\t/* get some new buffers */\n+\tuint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,\n+\t\t\tsplit_flags);\n+\tif (nb_bufs == 0)\n+\t\treturn 0;\n+\n+\t/* happy day case, full burst + no packets to be joined */\n+\tconst uint64_t *split_fl64 = (uint64_t *)split_flags;\n+\tif (rxq->pkt_first_seg == NULL &&\n+\t\t\tsplit_fl64[0] == 0 && split_fl64[1] == 0 &&\n+\t\t\tsplit_fl64[2] == 0 && split_fl64[3] == 0)\n+\t\treturn nb_bufs;\n+\n+\t/* reassemble any packets that need reassembly*/\n+\tunsigned int i = 0;\n+\tif (rxq->pkt_first_seg == NULL) {\n+\t\t/* find the first split flag, and only reassemble then*/\n+\t\twhile (i < nb_bufs && !split_flags[i])\n+\t\t\ti++;\n+\t\tif (i == nb_bufs)\n+\t\t\treturn nb_bufs;\n+\t\trxq->pkt_first_seg = rx_pkts[i];\n+\t}\n+\treturn i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,\n+\t\t&split_flags[i]);\n+}\n+\n+/**\n+ * vPMD receive routine that reassembles scattered packets.\n+ */\n+uint16_t\n+ngbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t     uint16_t nb_pkts)\n+{\n+\tuint16_t retval = 0;\n+\n+\twhile (nb_pkts > RTE_NGBE_MAX_RX_BURST) {\n+\t\tuint16_t burst;\n+\n+\t\tburst = ngbe_recv_scattered_burst_vec(rx_queue,\n+\t\t\t\t\t\t      rx_pkts + retval,\n+\t\t\t\t\t\t      RTE_NGBE_MAX_RX_BURST);\n+\t\tretval += burst;\n+\t\tnb_pkts -= burst;\n+\t\tif (burst < RTE_NGBE_MAX_RX_BURST)\n+\t\t\treturn retval;\n+\t}\n+\n+\treturn retval + ngbe_recv_scattered_burst_vec(rx_queue,\n+\t\t\t\t\t\t      rx_pkts + retval,\n+\t\t\t\t\t\t      nb_pkts);\n+}\n+\n+static inline void\n+vtx1(volatile struct ngbe_tx_desc *txdp,\n+\t\tstruct rte_mbuf *pkt, uint64_t flags)\n+{\n+\tuint64x2_t descriptor = {\n+\t\t\tpkt->buf_iova + pkt->data_off,\n+\t\t\t(uint64_t)pkt->pkt_len << 45 | flags | pkt->data_len};\n+\n+\tvst1q_u64((uint64_t *)(uintptr_t)txdp, descriptor);\n+}\n+\n+static inline void\n+vtx(volatile struct ngbe_tx_desc *txdp,\n+\t\tstruct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)\n+{\n+\tint i;\n+\n+\tfor (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)\n+\t\tvtx1(txdp, *pkt, flags);\n+}\n+\n+uint16_t\n+ngbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,\n+\t\t\t  uint16_t nb_pkts)\n+{\n+\tstruct ngbe_tx_queue *txq = (struct ngbe_tx_queue *)tx_queue;\n+\tvolatile struct ngbe_tx_desc *txdp;\n+\tstruct ngbe_tx_entry_v *txep;\n+\tuint16_t n, 
nb_commit, tx_id;\n+\tuint64_t flags = NGBE_TXD_FLAGS;\n+\tuint64_t rs = NGBE_TXD_FLAGS;\n+\tint i;\n+\n+\t/* cross rx_thresh boundary is not allowed */\n+\tnb_pkts = RTE_MIN(nb_pkts, txq->tx_free_thresh);\n+\n+\tif (txq->nb_tx_free < txq->tx_free_thresh)\n+\t\tngbe_tx_free_bufs(txq);\n+\n+\tnb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);\n+\tif (unlikely(nb_pkts == 0))\n+\t\treturn 0;\n+\n+\ttx_id = txq->tx_tail;\n+\ttxdp = &txq->tx_ring[tx_id];\n+\ttxep = &txq->sw_ring_v[tx_id];\n+\n+\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);\n+\n+\tn = (uint16_t)(txq->nb_tx_desc - tx_id);\n+\tnb_commit = nb_pkts;\n+\tif (nb_commit >= n) {\n+\t\ttx_backlog_entry(txep, tx_pkts, n);\n+\n+\t\tfor (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)\n+\t\t\tvtx1(txdp, *tx_pkts, flags);\n+\n+\t\tvtx1(txdp, *tx_pkts++, rs);\n+\n+\t\tnb_commit = (uint16_t)(nb_commit - n);\n+\n+\t\ttx_id = 0;\n+\n+\t\t/* avoid reach the end of ring */\n+\t\ttxdp = &txq->tx_ring[tx_id];\n+\t\ttxep = &txq->sw_ring_v[tx_id];\n+\t}\n+\n+\ttx_backlog_entry(txep, tx_pkts, nb_commit);\n+\n+\tvtx(txdp, tx_pkts, nb_commit, flags);\n+\n+\ttx_id = (uint16_t)(tx_id + nb_commit);\n+\n+\ttxq->tx_tail = tx_id;\n+\n+\tngbe_set32(txq->tdt_reg_addr, txq->tx_tail);\n+\n+\treturn nb_pkts;\n+}\n+\n+static void __rte_cold\n+ngbe_tx_queue_release_mbufs_vec(struct ngbe_tx_queue *txq)\n+{\n+\t_ngbe_tx_queue_release_mbufs_vec(txq);\n+}\n+\n+void __rte_cold\n+ngbe_rx_queue_release_mbufs_vec(struct ngbe_rx_queue *rxq)\n+{\n+\t_ngbe_rx_queue_release_mbufs_vec(rxq);\n+}\n+\n+static void __rte_cold\n+ngbe_tx_free_swring(struct ngbe_tx_queue *txq)\n+{\n+\t_ngbe_tx_free_swring_vec(txq);\n+}\n+\n+static void __rte_cold\n+ngbe_reset_tx_queue(struct ngbe_tx_queue *txq)\n+{\n+\t_ngbe_reset_tx_queue_vec(txq);\n+}\n+\n+static const struct ngbe_txq_ops vec_txq_ops = {\n+\t.release_mbufs = ngbe_tx_queue_release_mbufs_vec,\n+\t.free_swring = ngbe_tx_free_swring,\n+\t.reset = ngbe_reset_tx_queue,\n+};\n+\n+int __rte_cold\n+ngbe_rxq_vec_setup(struct ngbe_rx_queue *rxq)\n+{\n+\treturn ngbe_rxq_vec_setup_default(rxq);\n+}\n+\n+int __rte_cold\n+ngbe_txq_vec_setup(struct ngbe_tx_queue *txq)\n+{\n+\treturn ngbe_txq_vec_setup_default(txq, &vec_txq_ops);\n+}\n+\n+int __rte_cold\n+ngbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)\n+{\n+\treturn ngbe_rx_vec_dev_conf_condition_check_default(dev);\n+}\ndiff --git a/drivers/net/ngbe/ngbe_rxtx_vec_sse.c b/drivers/net/ngbe/ngbe_rxtx_vec_sse.c\nnew file mode 100644\nindex 0000000000..f703d0ea15\n--- /dev/null\n+++ b/drivers/net/ngbe/ngbe_rxtx_vec_sse.c\n@@ -0,0 +1,688 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(c) 2015-2024 Beijing WangXun Technology Co., Ltd.\n+ * Copyright(c) 2010-2015 Intel Corporation\n+ */\n+\n+#include <ethdev_driver.h>\n+#include <rte_malloc.h>\n+\n+#include \"ngbe_type.h\"\n+#include \"ngbe_ethdev.h\"\n+#include \"ngbe_rxtx.h\"\n+#include \"ngbe_rxtx_vec_common.h\"\n+\n+#include <tmmintrin.h>\n+\n+static inline void\n+ngbe_rxq_rearm(struct ngbe_rx_queue *rxq)\n+{\n+\tint i;\n+\tuint16_t rx_id;\n+\tvolatile struct ngbe_rx_desc *rxdp;\n+\tstruct ngbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];\n+\tstruct rte_mbuf *mb0, *mb1;\n+\t__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,\n+\t\t\tRTE_PKTMBUF_HEADROOM);\n+\t__m128i dma_addr0, dma_addr1;\n+\n+\tconst __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX);\n+\n+\trxdp = rxq->rx_ring + rxq->rxrearm_start;\n+\n+\t/* Pull 'n' more MBUFs into the software ring */\n+\tif (rte_mempool_get_bulk(rxq->mb_pool,\n+\t\t\t\t 
(void *)rxep,\n+\t\t\t\t RTE_NGBE_RXQ_REARM_THRESH) < 0) {\n+\t\tif (rxq->rxrearm_nb + RTE_NGBE_RXQ_REARM_THRESH >=\n+\t\t    rxq->nb_rx_desc) {\n+\t\t\tdma_addr0 = _mm_setzero_si128();\n+\t\t\tfor (i = 0; i < RTE_NGBE_DESCS_PER_LOOP; i++) {\n+\t\t\t\trxep[i].mbuf = &rxq->fake_mbuf;\n+\t\t\t\t_mm_store_si128((__m128i *)(uintptr_t)&rxdp[i],\n+\t\t\t\t\t\tdma_addr0);\n+\t\t\t}\n+\t\t}\n+\t\trte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=\n+\t\t\tRTE_NGBE_RXQ_REARM_THRESH;\n+\t\treturn;\n+\t}\n+\n+\t/* Initialize the mbufs in vector, process 2 mbufs in one loop */\n+\tfor (i = 0; i < RTE_NGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {\n+\t\t__m128i vaddr0, vaddr1;\n+\n+\t\tmb0 = rxep[0].mbuf;\n+\t\tmb1 = rxep[1].mbuf;\n+\n+\t\t/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */\n+\t\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=\n+\t\t\t\toffsetof(struct rte_mbuf, buf_addr) + 8);\n+\t\tvaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);\n+\t\tvaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);\n+\n+\t\t/* convert pa to dma_addr hdr/data */\n+\t\tdma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);\n+\t\tdma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);\n+\n+\t\t/* add headroom to pa values */\n+\t\tdma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);\n+\t\tdma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);\n+\n+\t\t/* set Header Buffer Address to zero */\n+\t\tdma_addr0 =  _mm_and_si128(dma_addr0, hba_msk);\n+\t\tdma_addr1 =  _mm_and_si128(dma_addr1, hba_msk);\n+\n+\t\t/* flush desc with pa dma_addr */\n+\t\t_mm_store_si128((__m128i *)(uintptr_t)rxdp++, dma_addr0);\n+\t\t_mm_store_si128((__m128i *)(uintptr_t)rxdp++, dma_addr1);\n+\t}\n+\n+\trxq->rxrearm_start += RTE_NGBE_RXQ_REARM_THRESH;\n+\tif (rxq->rxrearm_start >= rxq->nb_rx_desc)\n+\t\trxq->rxrearm_start = 0;\n+\n+\trxq->rxrearm_nb -= RTE_NGBE_RXQ_REARM_THRESH;\n+\n+\trx_id = (uint16_t)((rxq->rxrearm_start == 0) ?\n+\t\t\t   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));\n+\n+\t/* Update the tail pointer on the NIC */\n+\tngbe_set32(rxq->rdt_reg_addr, rx_id);\n+}\n+\n+static inline void\n+desc_to_olflags_v(__m128i descs[4], __m128i mbuf_init, uint8_t vlan_flags,\n+\tstruct rte_mbuf **rx_pkts)\n+{\n+\t__m128i ptype0, ptype1, vtag0, vtag1, csum, vp;\n+\t__m128i rearm0, rearm1, rearm2, rearm3;\n+\n+\t/* mask everything except rss type */\n+\tconst __m128i rsstype_msk = _mm_set_epi16(0x0000, 0x0000, 0x0000, 0x0000,\n+\t\t\t\t\t\t  0x000F, 0x000F, 0x000F, 0x000F);\n+\n+\t/* mask the lower byte of ol_flags */\n+\tconst __m128i ol_flags_msk = _mm_set_epi16(0x0000, 0x0000, 0x0000, 0x0000,\n+\t\t\t\t\t\t   0x00FF, 0x00FF, 0x00FF, 0x00FF);\n+\n+\t/* map rss type to rss hash flag */\n+\tconst __m128i rss_flags = _mm_set_epi8(RTE_MBUF_F_RX_FDIR, 0, 0, 0,\n+\t\t\t0, 0, 0, RTE_MBUF_F_RX_RSS_HASH,\n+\t\t\tRTE_MBUF_F_RX_RSS_HASH, 0, RTE_MBUF_F_RX_RSS_HASH, 0,\n+\t\t\tRTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH, 0);\n+\n+\t/* mask everything except vlan present and l4/ip csum error */\n+\tconst __m128i vlan_csum_msk =\n+\t\t_mm_set_epi16((NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 16,\n+\t\t\t      (NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 16,\n+\t\t\t      (NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 16,\n+\t\t\t      (NGBE_RXD_ERR_L4CS | NGBE_RXD_ERR_IPCS) >> 16,\n+\t\t\t      NGBE_RXD_STAT_VLAN, NGBE_RXD_STAT_VLAN,\n+\t\t\t      NGBE_RXD_STAT_VLAN, NGBE_RXD_STAT_VLAN);\n+\n+\t/* map vlan present and l4/ip csum error to ol_flags */\n+\tconst __m128i vlan_csum_map_lo = _mm_set_epi8(0, 0, 0, 0,\n+\t\tvlan_flags | 
RTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_BAD,\n+\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\tvlan_flags | RTE_MBUF_F_RX_IP_CKSUM_GOOD,\n+\t\t0, 0, 0, 0,\n+\t\tRTE_MBUF_F_RX_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\tRTE_MBUF_F_RX_IP_CKSUM_BAD,\n+\t\tRTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD,\n+\t\tRTE_MBUF_F_RX_IP_CKSUM_GOOD);\n+\n+\tconst __m128i vlan_csum_map_hi = _mm_set_epi8(0, 0, 0, 0,\n+\t\t0, RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,\n+\t\tRTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t),\n+\t\t0, 0, 0, 0,\n+\t\t0, RTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,\n+\t\tRTE_MBUF_F_RX_L4_CKSUM_GOOD >> sizeof(uint8_t));\n+\n+\tconst __m128i vtag_msk = _mm_set_epi16(0x0000, 0x0000, 0x0000, 0x0000,\n+\t\t\t\t\t       0x000F, 0x000F, 0x000F, 0x000F);\n+\n+\tptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);\n+\tptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);\n+\tvtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);\n+\tvtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);\n+\n+\tptype0 = _mm_unpacklo_epi32(ptype0, ptype1);\n+\tptype0 = _mm_and_si128(ptype0, rsstype_msk);\n+\tptype0 = _mm_shuffle_epi8(rss_flags, ptype0);\n+\n+\tvtag1 = _mm_unpacklo_epi32(vtag0, vtag1);\n+\tvtag1 = _mm_and_si128(vtag1, vlan_csum_msk);\n+\n+\t/* csum bits are in the most significant, to use shuffle we need to\n+\t * shift them. Change mask to 0xc000 to 0x0003.\n+\t */\n+\tcsum = _mm_srli_epi16(vtag1, 14);\n+\n+\t/* Change mask to 0x20 to 0x08. */\n+\tvp = _mm_srli_epi16(vtag1, 2);\n+\n+\t/* now or the most significant 64 bits containing the checksum\n+\t * flags with the vlan present flags.\n+\t */\n+\tcsum = _mm_srli_si128(csum, 8);\n+\tvtag1 = _mm_or_si128(csum, vtag1);\n+\tvtag1 = _mm_or_si128(vtag1, vp);\n+\tvtag1 = _mm_and_si128(vtag1, vtag_msk);\n+\n+\t/* convert STAT_VLAN, ERR_IPCS, ERR_L4CS to ol_flags */\n+\tvtag0 = _mm_shuffle_epi8(vlan_csum_map_hi, vtag1);\n+\tvtag0 = _mm_slli_epi16(vtag0, sizeof(uint8_t));\n+\n+\tvtag1 = _mm_shuffle_epi8(vlan_csum_map_lo, vtag1);\n+\tvtag1 = _mm_and_si128(vtag1, ol_flags_msk);\n+\tvtag1 = _mm_or_si128(vtag0, vtag1);\n+\n+\tvtag1 = _mm_or_si128(ptype0, vtag1);\n+\n+\t/*\n+\t * At this point, we have the 4 sets of flags in the low 64-bits\n+\t * of vtag1 (4x16).\n+\t * We want to extract these, and merge them with the mbuf init data\n+\t * so we can do a single 16-byte write to the mbuf to set the flags\n+\t * and all the other initialization fields. 
Extracting the\n+\t * appropriate flags means that we have to do a shift and blend for\n+\t * each mbuf before we do the write.\n+\t */\n+\trearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtag1, 8), 0x10);\n+\trearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtag1, 6), 0x10);\n+\trearm2 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtag1, 4), 0x10);\n+\trearm3 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vtag1, 2), 0x10);\n+\n+\t/* write the rearm data and the olflags in one write */\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=\n+\t\t\toffsetof(struct rte_mbuf, rearm_data) + 8);\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=\n+\t\t\tRTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));\n+\t_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);\n+\t_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);\n+\t_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);\n+\t_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);\n+}\n+\n+static inline void\n+desc_to_ptype_v(__m128i descs[4], uint16_t pkt_type_mask,\n+\t\tstruct rte_mbuf **rx_pkts)\n+{\n+\t__m128i ptype_mask = _mm_set_epi32(pkt_type_mask, pkt_type_mask,\n+\t\t\t\t\tpkt_type_mask, pkt_type_mask);\n+\n+\t__m128i ptype0 = _mm_unpacklo_epi32(descs[0], descs[2]);\n+\t__m128i ptype1 = _mm_unpacklo_epi32(descs[1], descs[3]);\n+\n+\t/* interleave low 32 bits,\n+\t * now we have 4 ptypes in a XMM register\n+\t */\n+\tptype0 = _mm_unpacklo_epi32(ptype0, ptype1);\n+\n+\t/* shift left by NGBE_RXD_PTID_SHIFT, and apply ptype mask */\n+\tptype0 = _mm_and_si128(_mm_srli_epi32(ptype0, NGBE_RXD_PTID_SHIFT),\n+\t\t\t       ptype_mask);\n+\n+\trx_pkts[0]->packet_type = ngbe_decode_ptype(_mm_extract_epi32(ptype0, 0));\n+\trx_pkts[1]->packet_type = ngbe_decode_ptype(_mm_extract_epi32(ptype0, 1));\n+\trx_pkts[2]->packet_type = ngbe_decode_ptype(_mm_extract_epi32(ptype0, 2));\n+\trx_pkts[3]->packet_type = ngbe_decode_ptype(_mm_extract_epi32(ptype0, 3));\n+}\n+\n+/*\n+ * vPMD raw receive routine, only accept(nb_pkts >= RTE_NGBE_DESCS_PER_LOOP)\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_NGBE_DESCS_PER_LOOP, just return no packet\n+ * - floor align nb_pkts to a RTE_NGBE_DESC_PER_LOOP power-of-two\n+ */\n+static inline uint16_t\n+_recv_raw_pkts_vec(struct ngbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n+\t\tuint16_t nb_pkts, uint8_t *split_packet)\n+{\n+\tvolatile struct ngbe_rx_desc *rxdp;\n+\tstruct ngbe_rx_entry *sw_ring;\n+\tuint16_t nb_pkts_recd;\n+\tint pos;\n+\tuint64_t var;\n+\t__m128i shuf_msk;\n+\t__m128i crc_adjust = _mm_set_epi16(0, 0, 0, /* ignore non-length fields */\n+\t\t\t\t-rxq->crc_len, /* sub crc on data_len */\n+\t\t\t\t0,             /* ignore high-16bits of pkt_len */\n+\t\t\t\t-rxq->crc_len, /* sub crc on pkt_len */\n+\t\t\t\t0, 0);         /* ignore pkt_type field */\n+\n+\t/*\n+\t * compile-time check the above crc_adjust layout is correct.\n+\t * NOTE: the first field (lowest address) is given last in set_epi16\n+\t * call above.\n+\t */\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=\n+\t\t\toffsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=\n+\t\t\toffsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);\n+\t__m128i dd_check, eop_check;\n+\t__m128i mbuf_init;\n+\tuint8_t vlan_flags;\n+\n+\t/*\n+\t * Under the circumstance that `rx_tail` wrap back to zero\n+\t * and the advance speed of `rx_tail` is greater than `rxrearm_start`,\n+\t * `rx_tail` will catch up with `rxrearm_start` and surpass it.\n+\t * This may 
cause some mbufs be reused by application.\n+\t *\n+\t * So we need to make some restrictions to ensure that\n+\t * `rx_tail` will not exceed `rxrearm_start`.\n+\t */\n+\tnb_pkts = RTE_MIN(nb_pkts, RTE_NGBE_RXQ_REARM_THRESH);\n+\n+\t/* nb_pkts has to be floor-aligned to RTE_NGBE_DESCS_PER_LOOP */\n+\tnb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_NGBE_DESCS_PER_LOOP);\n+\n+\t/* Just the act of getting into the function from the application is\n+\t * going to cost about 7 cycles\n+\t */\n+\trxdp = rxq->rx_ring + rxq->rx_tail;\n+\n+\trte_prefetch0(rxdp);\n+\n+\t/* See if we need to rearm the RX queue - gives the prefetch a bit\n+\t * of time to act\n+\t */\n+\tif (rxq->rxrearm_nb > RTE_NGBE_RXQ_REARM_THRESH)\n+\t\tngbe_rxq_rearm(rxq);\n+\n+\t/* Before we start moving massive data around, check to see if\n+\t * there is actually a packet available\n+\t */\n+\tif (!(rxdp->qw1.lo.status &\n+\t\t\t\trte_cpu_to_le_32(NGBE_RXD_STAT_DD)))\n+\t\treturn 0;\n+\n+\t/* 4 packets DD mask */\n+\tdd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);\n+\n+\t/* 4 packets EOP mask */\n+\teop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);\n+\n+\t/* mask to shuffle from desc. to mbuf */\n+\tshuf_msk = _mm_set_epi8(7, 6, 5, 4,  /* octet 4~7, 32bits rss */\n+\t\t15, 14,      /* octet 14~15, low 16 bits vlan_macip */\n+\t\t13, 12,      /* octet 12~13, 16 bits data_len */\n+\t\t0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */\n+\t\t13, 12,      /* octet 12~13, low 16 bits pkt_len */\n+\t\t0xFF, 0xFF,  /* skip 32 bit pkt_type */\n+\t\t0xFF, 0xFF);\n+\t/*\n+\t * Compile-time verify the shuffle mask\n+\t * NOTE: some field positions already verified above, but duplicated\n+\t * here for completeness in case of future modifications.\n+\t */\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=\n+\t\t\toffsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=\n+\t\t\toffsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=\n+\t\t\toffsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);\n+\tRTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=\n+\t\t\toffsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);\n+\n+\tmbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);\n+\n+\t/* Cache is empty -> need to scan the buffer rings, but first move\n+\t * the next 'n' mbufs into the cache\n+\t */\n+\tsw_ring = &rxq->sw_ring[rxq->rx_tail];\n+\n+\t/* ensure these 2 flags are in the lower 8 bits */\n+\tRTE_BUILD_BUG_ON((RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED) > UINT8_MAX);\n+\tvlan_flags = rxq->vlan_flags & UINT8_MAX;\n+\n+\t/* A. load 4 packet in one loop\n+\t * [A*. mask out 4 unused dirty field in desc]\n+\t * B. copy 4 mbuf point from swring to rx_pkts\n+\t * C. calc the number of DD bits among the 4 packets\n+\t * [C*. extract the end-of-packet bit, if requested]\n+\t * D. fill info. from desc to mbuf\n+\t */\n+\tfor (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;\n+\t\t\tpos += RTE_NGBE_DESCS_PER_LOOP,\n+\t\t\trxdp += RTE_NGBE_DESCS_PER_LOOP) {\n+\t\t__m128i descs[RTE_NGBE_DESCS_PER_LOOP];\n+\t\t__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;\n+\t\t__m128i zero, staterr, sterr_tmp1, sterr_tmp2;\n+\t\t/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. 
*/\n+\t\t__m128i mbp1;\n+#if defined(RTE_ARCH_X86_64)\n+\t\t__m128i mbp2;\n+#endif\n+\n+\t\t/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */\n+\t\tmbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);\n+\n+\t\t/* Read desc statuses backwards to avoid race condition */\n+\t\t/* A.1 load desc[3] */\n+\t\tdescs[3] = _mm_loadu_si128((__m128i *)(uintptr_t)(rxdp + 3));\n+\t\trte_compiler_barrier();\n+\n+\t\t/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */\n+\t\t_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);\n+\n+#if defined(RTE_ARCH_X86_64)\n+\t\t/* B.1 load 2 64 bit mbuf points */\n+\t\tmbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);\n+#endif\n+\n+\t\t/* A.1 load desc[2-0] */\n+\t\tdescs[2] = _mm_loadu_si128((__m128i *)(uintptr_t)(rxdp + 2));\n+\t\trte_compiler_barrier();\n+\t\tdescs[1] = _mm_loadu_si128((__m128i *)(uintptr_t)(rxdp + 1));\n+\t\trte_compiler_barrier();\n+\t\tdescs[0] = _mm_loadu_si128((__m128i *)(uintptr_t)(rxdp));\n+\n+#if defined(RTE_ARCH_X86_64)\n+\t\t/* B.2 copy 2 mbuf point into rx_pkts  */\n+\t\t_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);\n+#endif\n+\n+\t\tif (split_packet) {\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos]);\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos + 1]);\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos + 2]);\n+\t\t\trte_mbuf_prefetch_part2(rx_pkts[pos + 3]);\n+\t\t}\n+\n+\t\t/* avoid compiler reorder optimization */\n+\t\trte_compiler_barrier();\n+\n+\t\t/* D.1 pkt 3,4 convert format from desc to pktmbuf */\n+\t\tpkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);\n+\t\tpkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);\n+\n+\t\t/* D.1 pkt 1,2 convert format from desc to pktmbuf */\n+\t\tpkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);\n+\t\tpkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);\n+\n+\t\t/* C.1 4=>2 filter staterr info only */\n+\t\tsterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);\n+\t\t/* C.1 4=>2 filter staterr info only */\n+\t\tsterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);\n+\n+\t\t/* set ol_flags with vlan packet type */\n+\t\tdesc_to_olflags_v(descs, mbuf_init, vlan_flags, &rx_pkts[pos]);\n+\n+\t\t/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */\n+\t\tpkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);\n+\t\tpkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);\n+\n+\t\t/* C.2 get 4 pkts staterr value  */\n+\t\tzero = _mm_xor_si128(dd_check, dd_check);\n+\t\tstaterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);\n+\n+\t\t/* D.3 copy final 3,4 data to rx_pkts */\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,\n+\t\t\t\tpkt_mb4);\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,\n+\t\t\t\tpkt_mb3);\n+\n+\t\t/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */\n+\t\tpkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);\n+\t\tpkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);\n+\n+\t\t/* C* extract and record EOP bit */\n+\t\tif (split_packet) {\n+\t\t\t__m128i eop_shuf_mask =\n+\t\t\t\t_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFF,\n+\t\t\t\t\t     0xFF, 0xFF, 0xFF, 0xFF,\n+\t\t\t\t\t     0xFF, 0xFF, 0xFF, 0xFF,\n+\t\t\t\t\t     0x04, 0x0C, 0x00, 0x08);\n+\n+\t\t\t/* and with mask to extract bits, flipping 1-0 */\n+\t\t\t__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);\n+\t\t\t/* the staterr values are not in order, as the count\n+\t\t\t * of dd bits doesn't care. However, for end of\n+\t\t\t * packet tracking, we do care, so shuffle. 
This also\n+\t\t\t * compresses the 32-bit values to 8-bit\n+\t\t\t */\n+\t\t\teop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);\n+\t\t\t/* store the resulting 32-bit value */\n+\t\t\t*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);\n+\t\t\tsplit_packet += RTE_NGBE_DESCS_PER_LOOP;\n+\t\t}\n+\n+\t\t/* C.3 calc available number of desc */\n+\t\tstaterr = _mm_and_si128(staterr, dd_check);\n+\t\tstaterr = _mm_packs_epi32(staterr, zero);\n+\n+\t\t/* D.3 copy final 1,2 data to rx_pkts */\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,\n+\t\t\t\tpkt_mb2);\n+\t\t_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,\n+\t\t\t\tpkt_mb1);\n+\n+\t\tdesc_to_ptype_v(descs, NGBE_PTID_MASK, &rx_pkts[pos]);\n+\n+\t\t/* C.4 calc available number of desc */\n+\t\tvar = rte_popcount64(_mm_cvtsi128_si64(staterr));\n+\t\tnb_pkts_recd += var;\n+\t\tif (likely(var != RTE_NGBE_DESCS_PER_LOOP))\n+\t\t\tbreak;\n+\t}\n+\n+\t/* Update our internal tail pointer */\n+\trxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);\n+\trxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));\n+\trxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);\n+\n+\treturn nb_pkts_recd;\n+}\n+\n+/*\n+ * vPMD receive routine, only accept(nb_pkts >= RTE_NGBE_DESCS_PER_LOOP)\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_NGBE_DESCS_PER_LOOP, just return no packet\n+ * - floor align nb_pkts to a RTE_NGBE_DESC_PER_LOOP power-of-two\n+ */\n+uint16_t\n+ngbe_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\tuint16_t nb_pkts)\n+{\n+\treturn _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);\n+}\n+\n+/**\n+ * vPMD receive routine that reassembles scattered packets\n+ *\n+ * Notice:\n+ * - nb_pkts < RTE_NGBE_DESCS_PER_LOOP, just return no packet\n+ * - floor align nb_pkts to a RTE_NGBE_DESC_PER_LOOP power-of-two\n+ */\n+static uint16_t\n+ngbe_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t       uint16_t nb_pkts)\n+{\n+\tstruct ngbe_rx_queue *rxq = rx_queue;\n+\tuint8_t split_flags[RTE_NGBE_MAX_RX_BURST] = {0};\n+\n+\t/* get some new buffers */\n+\tuint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,\n+\t\t\tsplit_flags);\n+\tif (nb_bufs == 0)\n+\t\treturn 0;\n+\n+\t/* happy day case, full burst + no packets to be joined */\n+\tconst uint64_t *split_fl64 = (uint64_t *)split_flags;\n+\tif (rxq->pkt_first_seg == NULL &&\n+\t\t\tsplit_fl64[0] == 0 && split_fl64[1] == 0 &&\n+\t\t\tsplit_fl64[2] == 0 && split_fl64[3] == 0)\n+\t\treturn nb_bufs;\n+\n+\t/* reassemble any packets that need reassembly*/\n+\tunsigned int i = 0;\n+\tif (rxq->pkt_first_seg == NULL) {\n+\t\t/* find the first split flag, and only reassemble then*/\n+\t\twhile (i < nb_bufs && !split_flags[i])\n+\t\t\ti++;\n+\t\tif (i == nb_bufs)\n+\t\t\treturn nb_bufs;\n+\t\trxq->pkt_first_seg = rx_pkts[i];\n+\t}\n+\treturn i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,\n+\t\t&split_flags[i]);\n+}\n+\n+/**\n+ * vPMD receive routine that reassembles scattered packets.\n+ */\n+uint16_t\n+ngbe_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t     uint16_t nb_pkts)\n+{\n+\tuint16_t retval = 0;\n+\n+\twhile (nb_pkts > RTE_NGBE_MAX_RX_BURST) {\n+\t\tuint16_t burst;\n+\n+\t\tburst = ngbe_recv_scattered_burst_vec(rx_queue,\n+\t\t\t\t\t\t      rx_pkts + retval,\n+\t\t\t\t\t\t      RTE_NGBE_MAX_RX_BURST);\n+\t\tretval += burst;\n+\t\tnb_pkts -= burst;\n+\t\tif (burst < RTE_NGBE_MAX_RX_BURST)\n+\t\t\treturn retval;\n+\t}\n+\n+\treturn retval + 
ngbe_recv_scattered_burst_vec(rx_queue,\n+\t\t\t\t\t\t      rx_pkts + retval,\n+\t\t\t\t\t\t      nb_pkts);\n+}\n+\n+static inline void\n+vtx1(volatile struct ngbe_tx_desc *txdp,\n+\t\tstruct rte_mbuf *pkt, uint64_t flags)\n+{\n+\t__m128i descriptor = _mm_set_epi64x((uint64_t)pkt->pkt_len << 45 |\n+\t\t\tflags | pkt->data_len,\n+\t\t\tpkt->buf_iova + pkt->data_off);\n+\t_mm_store_si128((__m128i *)(uintptr_t)txdp, descriptor);\n+}\n+\n+static inline void\n+vtx(volatile struct ngbe_tx_desc *txdp,\n+\t\tstruct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)\n+{\n+\tint i;\n+\n+\tfor (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)\n+\t\tvtx1(txdp, *pkt, flags);\n+}\n+\n+uint16_t\n+ngbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,\n+\t\t\t  uint16_t nb_pkts)\n+{\n+\tstruct ngbe_tx_queue *txq = (struct ngbe_tx_queue *)tx_queue;\n+\tvolatile struct ngbe_tx_desc *txdp;\n+\tstruct ngbe_tx_entry_v *txep;\n+\tuint16_t n, nb_commit, tx_id;\n+\tuint64_t flags = NGBE_TXD_FLAGS;\n+\tuint64_t rs = NGBE_TXD_FLAGS;\n+\tint i;\n+\n+\t/* cross rx_thresh boundary is not allowed */\n+\tnb_pkts = RTE_MIN(nb_pkts, txq->tx_free_thresh);\n+\n+\tif (txq->nb_tx_free < txq->tx_free_thresh)\n+\t\tngbe_tx_free_bufs(txq);\n+\n+\tnb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);\n+\tif (unlikely(nb_pkts == 0))\n+\t\treturn 0;\n+\n+\ttx_id = txq->tx_tail;\n+\ttxdp = &txq->tx_ring[tx_id];\n+\ttxep = &txq->sw_ring_v[tx_id];\n+\n+\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);\n+\n+\tn = (uint16_t)(txq->nb_tx_desc - tx_id);\n+\tnb_commit = nb_pkts;\n+\tif (nb_commit >= n) {\n+\t\ttx_backlog_entry(txep, tx_pkts, n);\n+\n+\t\tfor (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)\n+\t\t\tvtx1(txdp, *tx_pkts, flags);\n+\n+\t\tvtx1(txdp, *tx_pkts++, rs);\n+\n+\t\tnb_commit = (uint16_t)(nb_commit - n);\n+\n+\t\ttx_id = 0;\n+\n+\t\t/* avoid reach the end of ring */\n+\t\ttxdp = &txq->tx_ring[tx_id];\n+\t\ttxep = &txq->sw_ring_v[tx_id];\n+\t}\n+\n+\ttx_backlog_entry(txep, tx_pkts, nb_commit);\n+\n+\tvtx(txdp, tx_pkts, nb_commit, flags);\n+\n+\ttx_id = (uint16_t)(tx_id + nb_commit);\n+\n+\ttxq->tx_tail = tx_id;\n+\n+\tngbe_set32(txq->tdt_reg_addr, txq->tx_tail);\n+\n+\treturn nb_pkts;\n+}\n+\n+static void __rte_cold\n+ngbe_tx_queue_release_mbufs_vec(struct ngbe_tx_queue *txq)\n+{\n+\t_ngbe_tx_queue_release_mbufs_vec(txq);\n+}\n+\n+void __rte_cold\n+ngbe_rx_queue_release_mbufs_vec(struct ngbe_rx_queue *rxq)\n+{\n+\t_ngbe_rx_queue_release_mbufs_vec(rxq);\n+}\n+\n+static void __rte_cold\n+ngbe_tx_free_swring(struct ngbe_tx_queue *txq)\n+{\n+\t_ngbe_tx_free_swring_vec(txq);\n+}\n+\n+static void __rte_cold\n+ngbe_reset_tx_queue(struct ngbe_tx_queue *txq)\n+{\n+\t_ngbe_reset_tx_queue_vec(txq);\n+}\n+\n+static const struct ngbe_txq_ops vec_txq_ops = {\n+\t.release_mbufs = ngbe_tx_queue_release_mbufs_vec,\n+\t.free_swring = ngbe_tx_free_swring,\n+\t.reset = ngbe_reset_tx_queue,\n+};\n+\n+int __rte_cold\n+ngbe_rxq_vec_setup(struct ngbe_rx_queue *rxq)\n+{\n+\treturn ngbe_rxq_vec_setup_default(rxq);\n+}\n+\n+int __rte_cold\n+ngbe_txq_vec_setup(struct ngbe_tx_queue *txq)\n+{\n+\treturn ngbe_txq_vec_setup_default(txq, &vec_txq_ops);\n+}\n+\n+int __rte_cold\n+ngbe_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)\n+{\n+\treturn ngbe_rx_vec_dev_conf_condition_check_default(dev);\n+}\n",
    "prefixes": [
        "v2",
        "2/2"
    ]
}
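
--- Editor's illustrative notes (not part of the archived patch above) ---

A minimal stand-alone sketch of the DD-bit accounting step used by _recv_raw_pkts_vec() in the patch: the four 32-bit status words are masked to their DD bits, packed down to 16-bit lanes, and the popcount of the low 64 bits gives the number of completed descriptors. This assumes plain SSE2 and __builtin_popcountll in place of DPDK's rte_popcount64; the helper name count_dd_bits() and the sample lane values are illustrative only (x86-64).

/* dd_count_sketch.c -- compile with: cc -msse2 dd_count_sketch.c */
#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 */

static inline unsigned int
count_dd_bits(__m128i staterr)
{
	/* bit 0 (DD) of each 32-bit status word, as in the patch */
	const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
						0x0000000100000001LL);
	const __m128i zero = _mm_setzero_si128();

	/* keep only the DD bit of each 32-bit lane */
	staterr = _mm_and_si128(staterr, dd_check);
	/* compress 4 x 32-bit words into 4 x 16-bit words in the low 64 bits */
	staterr = _mm_packs_epi32(staterr, zero);
	/* each completed descriptor now contributes exactly one set bit */
	return (unsigned int)__builtin_popcountll(
			(unsigned long long)_mm_cvtsi128_si64(staterr));
}

int main(void)
{
	/* descriptors 0 and 1 written back (DD set), 2 and 3 not yet */
	__m128i staterr = _mm_set_epi32(0, 0, 0x3, 0x3);

	/* prints 2: the Rx loop would add 2 to nb_pkts_recd and break out,
	 * because fewer than 4 (RTE_NGBE_DESCS_PER_LOOP) were ready */
	printf("%u\n", count_dd_bits(staterr));
	return 0;
}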
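The vtx1() helper in the patch publishes each Tx descriptor with a single 16-byte store: buffer DMA address in the low quadword, "pkt_len << 45 | flags | data_len" in the high quadword. Below is a rough stand-alone equivalent under an assumed two-quadword layout; demo_tx_desc and demo_vtx1 are hypothetical stand-ins for the real ngbe definitions, and the bit positions are taken from the patch text, not from hardware documentation.

#include <stdint.h>
#include <emmintrin.h>

struct demo_tx_desc {           /* stands in for struct ngbe_tx_desc */
	uint64_t qw0;           /* buffer DMA address */
	uint64_t qw1;           /* length / command flags */
};

static inline void
demo_vtx1(volatile struct demo_tx_desc *txdp,
	  uint64_t buf_iova, uint16_t data_off,
	  uint32_t pkt_len, uint16_t data_len, uint64_t flags)
{
	__m128i desc = _mm_set_epi64x(
			(int64_t)(((uint64_t)pkt_len << 45) | flags | data_len),
			(int64_t)(buf_iova + data_off));

	/* one aligned 16-byte store writes the whole descriptor at once */
	_mm_store_si128((__m128i *)(uintptr_t)txdp, desc);
}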
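desc_to_olflags_v() in the patch depends on ol_flags sitting exactly 8 bytes after rearm_data and on rearm_data being 16-byte aligned, which is why one _mm_store_si128() can update both fields per mbuf. A small sketch of that layout trick, using a hypothetical demo_mbuf in place of struct rte_mbuf and C11 static_assert where the patch uses RTE_BUILD_BUG_ON():

#include <stddef.h>
#include <stdint.h>
#include <assert.h>
#include <emmintrin.h>

struct demo_mbuf {
	_Alignas(16) uint64_t rearm_data;   /* refcnt/nb_segs/port/... */
	uint64_t ol_flags;                  /* offload flags */
};

static inline void
demo_write_rearm(struct demo_mbuf *m, uint64_t mbuf_initializer,
		 uint64_t ol_flags)
{
	/* the same compile-time layout guarantees the patch asserts */
	static_assert(offsetof(struct demo_mbuf, ol_flags) ==
		      offsetof(struct demo_mbuf, rearm_data) + 8, "layout");

	/* low qword = rearm data, high qword = ol_flags; one 16-byte store */
	__m128i rearm = _mm_set_epi64x((int64_t)ol_flags,
				       (int64_t)mbuf_initializer);
	_mm_store_si128((__m128i *)&m->rearm_data, rearm);
}

In the patch itself the high quadword is produced by blending the shifted VLAN/checksum flag vector into mbuf_init, but the store pattern is the same as sketched here.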