get:
Show a patch.

patch:
Update a patch (partial update).

put:
Update a patch (full update).

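As a minimal sketch of the read path, the GET endpoint above can be exercised with Python's requests library (an assumption here; any HTTP client works). Read access needs no authentication, while PUT/PATCH would normally require an API token. The fields printed correspond to the JSON body reproduced below.

import requests

# Patch detail endpoint shown in the sample response below.
URL = "http://patchwork.dpdk.org/api/patches/74415/"

resp = requests.get(URL)      # "get: Show a patch."
resp.raise_for_status()
patch = resp.json()

print(patch["name"])          # "net/mlx5: implement vectorized MPRQ burst"
print(patch["state"])         # "superseded"
print(patch["mbox"])          # mbox URL, suitable for feeding to `git am`
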
GET /api/patches/74415/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 74415,
    "url": "http://patchwork.dpdk.org/api/patches/74415/?format=api",
    "web_url": "http://patchwork.dpdk.org/project/dpdk/patch/20200719041142.14485-1-akozyrev@mellanox.com/",
    "project": {
        "id": 1,
        "url": "http://patchwork.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20200719041142.14485-1-akozyrev@mellanox.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20200719041142.14485-1-akozyrev@mellanox.com",
    "date": "2020-07-19T04:11:42",
    "name": "net/mlx5: implement vectorized MPRQ burst",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "a82df6a000c1ac135eee644b55f882e631e5e5cc",
    "submitter": {
        "id": 1573,
        "url": "http://patchwork.dpdk.org/api/people/1573/?format=api",
        "name": "Alexander Kozyrev",
        "email": "akozyrev@mellanox.com"
    },
    "delegate": {
        "id": 3268,
        "url": "http://patchwork.dpdk.org/api/users/3268/?format=api",
        "username": "rasland",
        "first_name": "Raslan",
        "last_name": "Darawsheh",
        "email": "rasland@nvidia.com"
    },
    "mbox": "http://patchwork.dpdk.org/project/dpdk/patch/20200719041142.14485-1-akozyrev@mellanox.com/mbox/",
    "series": [
        {
            "id": 11143,
            "url": "http://patchwork.dpdk.org/api/series/11143/?format=api",
            "web_url": "http://patchwork.dpdk.org/project/dpdk/list/?series=11143",
            "date": "2020-07-19T04:11:42",
            "name": "net/mlx5: implement vectorized MPRQ burst",
            "version": 1,
            "mbox": "http://patchwork.dpdk.org/series/11143/mbox/"
        }
    ],
    "comments": "http://patchwork.dpdk.org/api/patches/74415/comments/",
    "check": "fail",
    "checks": "http://patchwork.dpdk.org/api/patches/74415/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 97410A052A;\n\tSun, 19 Jul 2020 06:11:48 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id CBA3F1C001;\n\tSun, 19 Jul 2020 06:11:47 +0200 (CEST)",
            "from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129])\n by dpdk.org (Postfix) with ESMTP id 0A0021BF5E\n for <dev@dpdk.org>; Sun, 19 Jul 2020 06:11:45 +0200 (CEST)",
            "from Internal Mail-Server by MTLPINE1 (envelope-from\n akozyrev@mellanox.com) with SMTP; 19 Jul 2020 07:11:44 +0300",
            "from pegasus02.mtr.labs.mlnx. (pegasus02.mtr.labs.mlnx\n [10.210.16.122])\n by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id 06J4BisG003593;\n Sun, 19 Jul 2020 07:11:44 +0300"
        ],
        "From": "Alexander Kozyrev <akozyrev@mellanox.com>",
        "To": "dev@dpdk.org",
        "Cc": "rasland@mellanox.com, viacheslavo@mellanox.com",
        "Date": "Sun, 19 Jul 2020 04:11:42 +0000",
        "Message-Id": "<20200719041142.14485-1-akozyrev@mellanox.com>",
        "X-Mailer": "git-send-email 2.24.1",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "Subject": "[dpdk-dev] [PATCH] net/mlx5: implement vectorized MPRQ burst",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "MPRQ (Multi-Packet Rx Queue) processes one packet at the time\nusing simple scalar instructions. MPRQ works by posting a single\nlarge buffer (consisted of multiple fixed-size strides) in order to\nreceive multiple packets at once on this buffer. A Rx packet is then\ncopied to a user-provided mbuf or PMD attaches the Rx packet to\nthe mbuf by the pointer to an external buffer.\n\nThere is an opportunity to speed up the packet receiving by processing\n4 packets simultaneously using SIMD (single instruction, multiple data)\nextensions. Allocate mbufs in batches for every MPRQ buffer and process\nthe packets in the groups of 4 until all the strides are exhausted. Then\nswitch to another MPRQ buffer and repeat the process over again.\n\nThe vectorized MPRQ burst routine is engaged automatically in case\nthe mprq_en=1 devarg is specified and the vectorization is not disabled\nexplicitly by providing rx_vec_en=0 devarg.  There are two limitations:\n- LRO is not supported and scalar MPRQ is selected if it is on.\n- CQE compression is disabled in case vectorized MPRQ is engaged.\n\nSigned-off-by: Alexander Kozyrev <akozyrev@mellanox.com>\n---\n drivers/net/mlx5/linux/mlx5_os.c         |   4 +\n drivers/net/mlx5/mlx5_ethdev.c           |  12 +-\n drivers/net/mlx5/mlx5_rxq.c              |  80 +--\n drivers/net/mlx5/mlx5_rxtx.c             |  30 +-\n drivers/net/mlx5/mlx5_rxtx.h             |   9 +-\n drivers/net/mlx5/mlx5_rxtx_vec.c         |  38 +-\n drivers/net/mlx5/mlx5_rxtx_vec.h         |  21 +\n drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 724 +++++++++++++++++++++++\n drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 577 ++++++++++++++++++\n drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 520 ++++++++++++++++\n 10 files changed, 1968 insertions(+), 47 deletions(-)",
    "diff": "diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c\nindex 742e2fba49..927fa07270 100644\n--- a/drivers/net/mlx5/linux/mlx5_os.c\n+++ b/drivers/net/mlx5/linux/mlx5_os.c\n@@ -568,6 +568,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,\n \t\tcqe_comp = 0;\n \telse\n \t\tcqe_comp = 1;\n+\tif (config.mprq.enabled)\n+\t\tcqe_comp = 0;\n \tconfig.cqe_comp = cqe_comp;\n #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD\n \t/* Whether device supports 128B Rx CQE padding. */\n@@ -973,6 +975,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,\n \t\t\t\t\" setting default value (%u)\",\n \t\t\t\t1 << config.mprq.stride_num_n);\n \t\t}\n+\t\tif (config.mprq.stride_size_n)\n+\t\t\tconfig.rx_vec_en = false;\n \t\tif (config.mprq.stride_size_n &&\n \t\t    (config.mprq.stride_size_n > mprq_max_stride_size_n ||\n \t\t     config.mprq.stride_size_n < mprq_min_stride_size_n)) {\ndiff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c\nindex cefb45064e..f48e8ea293 100644\n--- a/drivers/net/mlx5/mlx5_ethdev.c\n+++ b/drivers/net/mlx5/mlx5_ethdev.c\n@@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)\n \n \tif (dev->rx_pkt_burst == mlx5_rx_burst ||\n \t    dev->rx_pkt_burst == mlx5_rx_burst_mprq ||\n-\t    dev->rx_pkt_burst == mlx5_rx_burst_vec)\n+\t    dev->rx_pkt_burst == mlx5_rx_burst_vec ||\n+\t    dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)\n \t\treturn ptypes;\n \treturn NULL;\n }\n@@ -479,12 +480,19 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)\n \teth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;\n \n \tMLX5_ASSERT(dev != NULL);\n-\tif (mlx5_check_vec_rx_support(dev) > 0) {\n+\tif (mlx5_check_vec_rx_support(dev) > 0 &&\n+\t\tmlx5_mprq_enabled(dev)) {\n+\t\trx_pkt_burst = mlx5_rx_burst_mprq_vec;\n+\t\tDRV_LOG(DEBUG, \"port %u selected Multi-Packet Rx vectorized function\",\n+\t\t\tdev->data->port_id);\n+\t} else if (mlx5_check_vec_rx_support(dev) > 0) {\n \t\trx_pkt_burst = mlx5_rx_burst_vec;\n \t\tDRV_LOG(DEBUG, \"port %u selected Rx vectorized function\",\n \t\t\tdev->data->port_id);\n \t} else if (mlx5_mprq_enabled(dev)) {\n \t\trx_pkt_burst = mlx5_rx_burst_mprq;\n+\t\tDRV_LOG(DEBUG, \"port %u selected Multi-Packet Rx function\",\n+\t\t\tdev->data->port_id);\n \t}\n \treturn rx_pkt_burst;\n }\ndiff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c\nindex 67d996cabf..06e7650be9 100644\n--- a/drivers/net/mlx5/mlx5_rxq.c\n+++ b/drivers/net/mlx5/mlx5_rxq.c\n@@ -164,7 +164,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\trxq->mprq_repl = buf;\n \t}\n \tDRV_LOG(DEBUG,\n-\t\t\"port %u Rx queue %u allocated and configured %u segments\",\n+\t\t\"port %u Multi-Packet Rx queue %u allocated and configured %u segments\",\n \t\trxq->port_id, rxq->idx, wqe_n);\n \treturn 0;\n error:\n@@ -176,7 +176,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\t\t\t(*rxq->mprq_bufs)[i]);\n \t\t(*rxq->mprq_bufs)[i] = NULL;\n \t}\n-\tDRV_LOG(DEBUG, \"port %u Rx queue %u failed, freed everything\",\n+\tDRV_LOG(DEBUG, \"port %u Multi-Packet Rx queue %u failed, freed everything\",\n \t\trxq->port_id, rxq->idx);\n \trte_errno = err; /* Restore rte_errno. 
*/\n \treturn -rte_errno;\n@@ -194,11 +194,14 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n static int\n rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n+\tstruct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;\n \tconst unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;\n \tunsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;\n \tunsigned int i;\n \tint err;\n \n+\tif (mlx5_rxq_mprq_enabled(rxq))\n+\t\telts_n *= (1U << rxq_ctrl->rxq.strd_num_n);\n \t/* Iterate on segments. */\n \tfor (i = 0; (i != elts_n); ++i) {\n \t\tstruct rte_mbuf *buf;\n@@ -284,8 +287,10 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n int\n rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n-\treturn mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?\n-\t       rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);\n+\tint ret = 0;\n+\tif (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))\n+\t\tret = rxq_alloc_elts_mprq(rxq_ctrl);\n+\treturn (ret || rxq_alloc_elts_sprq(rxq_ctrl));\n }\n \n /**\n@@ -304,7 +309,6 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\trxq->port_id, rxq->idx);\n \tif (rxq->mprq_bufs == NULL)\n \t\treturn;\n-\tMLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);\n \tfor (i = 0; (i != (1u << rxq->elts_n)); ++i) {\n \t\tif ((*rxq->mprq_bufs)[i] != NULL)\n \t\t\tmlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);\n@@ -326,15 +330,19 @@ static void\n rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n \tstruct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;\n-\tconst uint16_t q_n = (1 << rxq->elts_n);\n-\tconst uint16_t q_mask = q_n - 1;\n-\tuint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);\n+\tunsigned int q_n = (1 << rxq->elts_n);\n+\tuint16_t q_mask;\n+\tuint16_t used;\n \tuint16_t i;\n \n \tDRV_LOG(DEBUG, \"port %u Rx queue %u freeing WRs\",\n \t\tPORT_ID(rxq_ctrl->priv), rxq->idx);\n \tif (rxq->elts == NULL)\n \t\treturn;\n+\tif (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))\n+\t\tq_n *= (1U << rxq_ctrl->rxq.strd_num_n);\n+\tq_mask = q_n - 1;\n+\tused = q_n - (rxq->rq_ci - rxq->rq_pi);\n \t/**\n \t * Some mbuf in the Ring belongs to the application.  
They cannot be\n \t * freed.\n@@ -344,7 +352,7 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)\n \t\t\t(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;\n \t\trxq->rq_pi = rxq->rq_ci;\n \t}\n-\tfor (i = 0; (i != (1u << rxq->elts_n)); ++i) {\n+\tfor (i = 0; (i != q_n); ++i) {\n \t\tif ((*rxq->elts)[i] != NULL)\n \t\t\trte_pktmbuf_free_seg((*rxq->elts)[i]);\n \t\t(*rxq->elts)[i] = NULL;\n@@ -362,8 +370,7 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)\n {\n \tif (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))\n \t\trxq_free_elts_mprq(rxq_ctrl);\n-\telse\n-\t\trxq_free_elts_sprq(rxq_ctrl);\n+\trxq_free_elts_sprq(rxq_ctrl);\n }\n \n /**\n@@ -1793,20 +1800,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \tstruct mlx5_priv *priv = dev->data->dev_private;\n \tstruct mlx5_rxq_ctrl *tmpl;\n \tunsigned int mb_len = rte_pktmbuf_data_room_size(mp);\n-\tunsigned int mprq_stride_nums;\n-\tunsigned int mprq_stride_size;\n-\tunsigned int mprq_stride_cap;\n \tstruct mlx5_dev_config *config = &priv->config;\n-\t/*\n-\t * Always allocate extra slots, even if eventually\n-\t * the vector Rx will not be used.\n-\t */\n-\tuint16_t desc_n =\n-\t\tdesc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;\n \tuint64_t offloads = conf->offloads |\n \t\t\t   dev->data->dev_conf.rxmode.offloads;\n \tunsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);\n-\tconst int mprq_en = mlx5_check_mprq_support(dev) > 0;\n \tunsigned int max_rx_pkt_len = lro_on_queue ?\n \t\t\tdev->data->dev_conf.rxmode.max_lro_pkt_size :\n \t\t\tdev->data->dev_conf.rxmode.max_rx_pkt_len;\n@@ -1814,6 +1811,23 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \t\t\t\t\t\t\tRTE_PKTMBUF_HEADROOM;\n \tunsigned int max_lro_size = 0;\n \tunsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;\n+\tconst int mprq_en = mlx5_check_mprq_support(dev) > 0;\n+\tunsigned int mprq_stride_nums = config->mprq.stride_num_n ?\n+\t\tconfig->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;\n+\tunsigned int mprq_stride_size = non_scatter_min_mbuf_size <=\n+\t\t(1U << config->mprq.max_stride_size_n) ?\n+\t\tlog2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;\n+\tunsigned int mprq_stride_cap = (config->mprq.stride_num_n ?\n+\t\t(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *\n+\t\t\t(config->mprq.stride_size_n ?\n+\t\t(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));\n+\t/*\n+\t * Always allocate extra slots, even if eventually\n+\t * the vector Rx will not be used.\n+\t */\n+\tuint16_t desc_n = desc +\n+\t\tconfig->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP *\n+\t\t(desc >> mprq_stride_nums);\n \n \tif (non_scatter_min_mbuf_size > mb_len && !(offloads &\n \t\t\t\t\t\t    DEV_RX_OFFLOAD_SCATTER)) {\n@@ -1825,8 +1839,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \t\trte_errno = ENOSPC;\n \t\treturn NULL;\n \t}\n-\ttmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +\n-\t\t\t   desc_n * sizeof(struct rte_mbuf *), 0, socket);\n+\ttmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,\n+\t\t\t\t sizeof(*tmpl) +\n+\t\t\t\t desc_n * sizeof(struct rte_mbuf *) +\n+\t\t\t\t (desc >> mprq_stride_nums) *\n+\t\t\t\t sizeof(struct mlx5_mprq_buf *),\n+\t\t\t\t 0, socket);\n \tif (!tmpl) {\n \t\trte_errno = ENOMEM;\n \t\treturn NULL;\n@@ -1840,15 +1858,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \ttmpl->socket = socket;\n \tif (dev->data->dev_conf.intr_conf.rxq)\n \t\ttmpl->irq = 1;\n-\tmprq_stride_nums = 
config->mprq.stride_num_n ?\n-\t\tconfig->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;\n-\tmprq_stride_size = non_scatter_min_mbuf_size <=\n-\t\t(1U << config->mprq.max_stride_size_n) ?\n-\t\tlog2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;\n-\tmprq_stride_cap = (config->mprq.stride_num_n ?\n-\t\t(1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *\n-\t\t\t(config->mprq.stride_size_n ?\n-\t\t(1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));\n \t/*\n \t * This Rx queue can be configured as a Multi-Packet RQ if all of the\n \t * following conditions are met:\n@@ -1996,7 +2005,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,\n \ttmpl->rxq.rq_repl_thresh =\n \t\tMLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);\n \ttmpl->rxq.elts =\n-\t\t(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);\n+\t\t(struct rte_mbuf *(*)[desc_n])(tmpl + 1);\n+\tif (mlx5_rxq_mprq_enabled(&tmpl->rxq)) {\n+\t\ttmpl->rxq.rq_repl_thresh = 1;\n+\t\ttmpl->rxq.mprq_bufs =\n+\t\t\t(struct mlx5_mprq_buf *(*)[desc])(tmpl + desc_n + 1);\n+\t}\n #ifndef RTE_ARCH_64\n \ttmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;\n #endif\ndiff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c\nindex 65239f9ffe..768a242518 100644\n--- a/drivers/net/mlx5/mlx5_rxtx.c\n+++ b/drivers/net/mlx5/mlx5_rxtx.c\n@@ -614,6 +614,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,\n \t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"Vector AltiVec\");\n #else\n \t\treturn -EINVAL;\n+#endif\n+\t} else if (pkt_burst == mlx5_rx_burst_mprq_vec) {\n+#if defined RTE_ARCH_X86_64\n+\t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"Multi-Packet RQ Vector SSE\");\n+#elif defined RTE_ARCH_ARM64\n+\t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"Multi-Packet RQ Vector Neon\");\n+#elif defined RTE_ARCH_PPC_64\n+\t\tsnprintf(mode->info, sizeof(mode->info), \"%s\", \"Multi-Packet RQ Vector AltiVec\");\n+#else\n+\t\treturn -EINVAL;\n #endif\n \t} else {\n \t\treturn -EINVAL;\n@@ -1075,7 +1085,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)\n {\n \tconst uint16_t cqe_n = 1 << rxq->cqe_n;\n \tconst uint16_t cqe_mask = cqe_n - 1;\n-\tconst unsigned int wqe_n = 1 << rxq->elts_n;\n+\tunsigned int wqe_n = 1 << rxq->elts_n;\n \tstruct mlx5_rxq_ctrl *rxq_ctrl =\n \t\t\tcontainer_of(rxq, struct mlx5_rxq_ctrl, rxq);\n \tunion {\n@@ -1139,11 +1149,17 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)\n \t\t\t\t\t\t    &sm))\n \t\t\t\treturn -1;\n \t\t\tif (vec) {\n-\t\t\t\tconst uint16_t q_mask = wqe_n - 1;\n+\t\t\t\tuint16_t q_mask;\n \t\t\t\tuint16_t elt_idx;\n \t\t\t\tstruct rte_mbuf **elt;\n \t\t\t\tint i;\n-\t\t\t\tunsigned int n = wqe_n - (rxq->rq_ci -\n+\t\t\t\tunsigned int n;\n+\n+\t\t\t\tif (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))\n+\t\t\t\t\twqe_n *= (1U <<\n+\t\t\t\t\t\t  rxq_ctrl->rxq.strd_num_n);\n+\t\t\t\tq_mask = wqe_n - 1;\n+\t\t\t\tn = wqe_n - (rxq->rq_ci -\n \t\t\t\t\t\t\t  rxq->rq_pi);\n \n \t\t\t\tfor (i = 0; i < (int)n; ++i) {\n@@ -1982,6 +1998,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,\n \treturn 0;\n }\n \n+__rte_weak uint16_t\n+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,\n+\t\t  struct rte_mbuf **pkts __rte_unused,\n+\t\t  uint16_t pkts_n __rte_unused)\n+{\n+\treturn 0;\n+}\n+\n __rte_weak int\n mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)\n {\ndiff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h\nindex 5116a15c33..3c44794d68 100644\n--- 
a/drivers/net/mlx5/mlx5_rxtx.h\n+++ b/drivers/net/mlx5/mlx5_rxtx.h\n@@ -141,11 +141,8 @@ struct mlx5_rxq_data {\n \tuint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */\n \tvolatile void *wqes;\n \tvolatile struct mlx5_cqe(*cqes)[];\n-\tRTE_STD_C11\n-\tunion  {\n-\t\tstruct rte_mbuf *(*elts)[];\n-\t\tstruct mlx5_mprq_buf *(*mprq_bufs)[];\n-\t};\n+\tstruct rte_mbuf *(*elts)[];\n+\tstruct mlx5_mprq_buf *(*mprq_bufs)[];\n \tstruct rte_mempool *mp;\n \tstruct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */\n \tstruct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */\n@@ -518,6 +515,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);\n int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);\n uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,\n \t\t\t   uint16_t pkts_n);\n+uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,\n+\t\t\t   uint16_t pkts_n);\n \n /* mlx5_mr.c */\n \ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c\nindex 7fae2010f9..53dd229271 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec.c\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c\n@@ -119,6 +119,40 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n \treturn tn;\n }\n \n+/**\n+ * DPDK callback for MPRQ vectorized RX.\n+ *\n+ * @param dpdk_rxq\n+ *   Generic pointer to RX queue structure.\n+ * @param[out] pkts\n+ *   Array to store received packets.\n+ * @param pkts_n\n+ *   Maximum number of packets in array.\n+ *\n+ * @return\n+ *   Number of packets successfully received (<= pkts_n).\n+ */\n+uint16_t\n+mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)\n+{\n+\tstruct mlx5_rxq_data *rxq = dpdk_rxq;\n+\tuint16_t nb_rx = 0;\n+\tuint16_t tn = 0;\n+\tuint64_t err = 0;\n+\tbool no_cq = false;\n+\n+\tdo {\n+\t\tnb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,\n+\t\t\t\t\t &err, &no_cq);\n+\t\tif (unlikely(err | rxq->err_state))\n+\t\t\tnb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);\n+\t\ttn += nb_rx;\n+\t\tif (unlikely(no_cq))\n+\t\t\tbreak;\n+\t} while (tn != pkts_n);\n+\treturn tn;\n+}\n+\n /**\n  * Check a RX queue can support vectorized RX.\n  *\n@@ -134,8 +168,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)\n \tstruct mlx5_rxq_ctrl *ctrl =\n \t\tcontainer_of(rxq, struct mlx5_rxq_ctrl, rxq);\n \n-\tif (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))\n-\t\treturn -ENOTSUP;\n \tif (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)\n \t\treturn -ENOTSUP;\n \tif (rxq->lro)\n@@ -160,8 +192,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)\n \n \tif (!priv->config.rx_vec_en)\n \t\treturn -ENOTSUP;\n-\tif (mlx5_mprq_enabled(dev))\n-\t\treturn -ENOTSUP;\n \t/* All the configured queues should support. 
*/\n \tfor (i = 0; i < priv->rxqs_n; ++i) {\n \t\tstruct mlx5_rxq_data *rxq = (*priv->rxqs)[i];\ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h\nindex 6ddcbfb0ad..305c5a596a 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec.h\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h\n@@ -122,4 +122,25 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)\n \t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n }\n \n+static inline void\n+mlx5_rx_replenish_bulk_mprq_mbuf(struct mlx5_rxq_data *rxq,\n+\t\t\t\t uint16_t n, uint32_t rq_idx)\n+{\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tuint16_t elts_idx = rq_idx * strd_n +\n+\t\trq_idx * MLX5_VPMD_DESCS_PER_LOOP;\n+\tstruct rte_mbuf **elts = &(*rxq->elts)[elts_idx];\n+\tunsigned int i;\n+\n+\tn = RTE_MIN(n, strd_n - rxq->consumed_strd);\n+\tif (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {\n+\t\trxq->stats.rx_nombuf += n;\n+\t\treturn;\n+\t}\n+\trxq->rq_repl_thresh = 0;\n+\t/* Prevent overflowing into the next MPRQ mbufs. */\n+\tfor (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)\n+\t\t(*rxq->elts)[elts_idx + strd_n + i] = &rxq->fake_mbuf;\n+}\n+\n #endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */\ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h\nindex f5414eebad..8fc3e1fd66 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h\n@@ -59,6 +59,97 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)\n \t\tpkts[pos] = elts[pos];\n }\n \n+/**\n+ * Store free buffers to RX SW ring.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param pkts\n+ *   Pointer to array of packets to be stored.\n+ * @param pkts_n\n+ *   Number of packets to be stored.\n+ * @param buf\n+ *   MPRQ buffer to get packets from.\n+ * @param buf rq_ci\n+ *   WQE index.\n+ * @param strd_idx\n+ *   Stride number.\n+ * @param comp\n+ *   Whether CQE is compressed or not.\n+ */\n+static inline void\n+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t     uint16_t n, struct mlx5_mprq_buf *buf,\n+\t\t     uint16_t rq_ci, uint16_t strd_idx, bool comp)\n+{\n+\tconst unsigned int strd_sz = 1 << rxq->strd_sz_n;\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tconst unsigned int strd_shift =\n+\t\tMLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;\n+\tuint32_t offset;\n+\tvoid *addr;\n+\tint i = 0;\n+\n+\tif (comp) {\n+\t\tconst uint16_t q_mask = (1 << rxq->cqe_n) - 1;\n+\t\tstruct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];\n+\t\tunsigned int pos;\n+\t\tuint16_t p = n & -2;\n+\n+\t\tfor (pos = 0; pos < p; pos += 2) {\n+\t\t\tvector unsigned char mbp;\n+\n+\t\t\tmbp = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t\t(signed int const *)&elts[pos +\n+\t\t\t\t\t\t\t  rxq->consumed_strd]);\n+\t\t\t*(vector unsigned char *)&pkts[pos] = mbp;\n+\t\t}\n+\t\tif (n & 1)\n+\t\t\tpkts[pos] = elts[pos];\n+\t}\n+\n+\tfor (i = 0; i < n; ++i) {\n+\t\toffset = (strd_idx + i) * strd_sz + strd_shift;\n+\t\taddr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);\n+\t\tif (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||\n+\t\t    rxq->mprq_repl == NULL) {\n+\t\t\trte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),\n+\t\t\t\t   addr, pkts[i]->pkt_len);\n+\t\t} else {\n+\t\t\trte_iova_t buf_iova;\n+\t\t\tstruct rte_mbuf_ext_shared_info *shinfo;\n+\t\t\tuint16_t buf_len = strd_sz;\n+\t\t\tvoid *buf_addr;\n+\t\t\t/* Increment the refcnt of the whole chunk. 
*/\n+\t\t\trte_atomic16_add_return(&buf->refcnt, 1);\n+\t\t\tMLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=\n+\t\t\t\t\tstrd_n + 1);\n+\t\t\tbuf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);\n+\t\t\t/*\n+\t\t\t * MLX5 device doesn't use iova but it is necessary in a\n+\t\t\t * case where the Rx packet is transmitted via a\n+\t\t\t * different PMD.\n+\t\t\t */\n+\t\t\tbuf_iova = rte_mempool_virt2iova(buf) +\n+\t\t\t\tRTE_PTR_DIFF(buf_addr, buf);\n+\t\t\tshinfo = &buf->shinfos[strd_idx];\n+\t\t\trte_mbuf_ext_refcnt_set(shinfo, 1);\n+\t\t\t/*\n+\t\t\t * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when\n+\t\t\t * attaching the stride to mbuf and more offload flags\n+\t\t\t * will be added below by calling rxq_cq_to_mbuf().\n+\t\t\t * Other fields will be overwritten.\n+\t\t\t */\n+\t\t\trte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,\n+\t\t\t\t\t\tbuf_len, shinfo);\n+\t\t\t/* Set mbuf head-room. */\n+\t\t\tSET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);\n+\t\t\tDATA_LEN(pkts[i]) = pkts[i]->pkt_len;\n+\t\t}\n+\t}\n+}\n+\n+\n /**\n  * Decompress a compressed completion and fill in mbufs in RX SW ring with data\n  * extracted from the title completion descriptor.\n@@ -1136,4 +1227,637 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,\n \treturn rcvd_pkt;\n }\n \n+static inline void\n+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,\n+\t\t const unsigned int strd_n)\n+{\n+\tstruct mlx5_mprq_buf *rep = rxq->mprq_repl;\n+\tvolatile struct mlx5_wqe_data_seg *wqe =\n+\t\t&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;\n+\tvoid *addr;\n+\n+\tMLX5_ASSERT(rep != NULL);\n+\t/* Replace MPRQ buf. */\n+\t(*rxq->mprq_bufs)[rq_idx] = rep;\n+\t/* Replace WQE. */\n+\taddr = mlx5_mprq_buf_addr(rep, strd_n);\n+\twqe->addr = rte_cpu_to_be_64((uintptr_t)addr);\n+\t/* If there's only one MR, no need to replace LKey in WQE. */\n+\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))\n+\t\twqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);\n+\t/* Stash a mbuf for next replacement. */\n+\tif (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))\n+\t\trxq->mprq_repl = rep;\n+\telse\n+\t\trxq->mprq_repl = NULL;\n+}\n+\n+/**\n+ * Receive burst of packets. An errored completion also consumes a mbuf, but the\n+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed\n+ * before returning to application.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param[out] pkts\n+ *   Array to store received packets.\n+ * @param pkts_n\n+ *   Maximum number of packets in array.\n+ * @param[out] err\n+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error\n+ *   packet to handle.\n+ * @param[out] no_cq\n+ *  Pointer to a boolean. 
Set true if no new CQE seen.\n+ *\n+ * @return\n+ *   Number of packets received including errors (<= pkts_n).\n+ */\n+static inline uint16_t\n+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t uint16_t pkts_n, uint64_t *err, bool *no_cq)\n+{\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tconst uint16_t q_n = 1 << rxq->cqe_n;\n+\tconst uint16_t q_mask = q_n - 1;\n+\tconst uint16_t e_n = 1 << rxq->elts_n;\n+\tconst uint16_t e_mask = e_n - 1;\n+\tvolatile struct mlx5_cqe *cq;\n+\tstruct rte_mbuf **elts;\n+\tunsigned int pos;\n+\tuint64_t n;\n+\tuint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;\n+\tuint16_t nocmp_n = 0;\n+\tuint16_t rcvd_pkt = 0;\n+\tunsigned int cq_ci = rxq->cq_ci;\n+\tunsigned int cq_idx = cq_ci & q_mask;\n+\tunsigned int rq_ci = rxq->rq_ci;\n+\tunsigned int rq_idx = rq_ci & e_mask;\n+\tstruct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];\n+\tunsigned int elts_idx;\n+\tunsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));\n+\tconst vector unsigned char zero = (vector unsigned char){0};\n+\tconst vector unsigned char ones = vec_splat_u8(-1);\n+\tconst vector unsigned char owner_check =\n+\t\t(vector unsigned char)(vector unsigned long){\n+\t\t0x0100000001000000LL, 0x0100000001000000LL};\n+\tconst vector unsigned char opcode_check =\n+\t\t(vector unsigned char)(vector unsigned long){\n+\t\t0xf0000000f0000000LL, 0xf0000000f0000000LL};\n+\tconst vector unsigned char format_check =\n+\t\t(vector unsigned char)(vector unsigned long){\n+\t\t0x0c0000000c000000LL, 0x0c0000000c000000LL};\n+\tconst vector unsigned char resp_err_check =\n+\t\t(vector unsigned char)(vector unsigned long){\n+\t\t0xe0000000e0000000LL, 0xe0000000e0000000LL};\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\tuint32_t rcvd_byte = 0;\n+\t/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */\n+\tconst vector unsigned char len_shuf_mask = (vector unsigned char){\n+\t\t 1,  0,  5,  4,\n+\t\t 9,  8, 13, 12,\n+\t\t-1, -1, -1, -1,\n+\t\t-1, -1, -1, -1};\n+#endif\n+\t/* Mask to shuffle from extracted CQE to mbuf. */\n+\tconst vector unsigned char shuf_mask = (vector unsigned char){\n+\t\t 5,  4,           /* bswap16, pkt_len */\n+\t\t-1, -1,           /* zero out 2nd half of pkt_len */\n+\t\t 5,  4,           /* bswap16, data_len */\n+\t\t11, 10,           /* bswap16, vlan+tci */\n+\t\t15, 14, 13, 12,   /* bswap32, rss */\n+\t\t 1,  2,  3, -1};  /* fdir.hi */\n+\t/* Mask to blend from the last Qword to the first DQword. */\n+\t/* Mask to blend from the last Qword to the first DQword. */\n+\tconst vector unsigned char blend_mask = (vector unsigned char){\n+\t\t-1,  0,  0,  0,\n+\t\t 0,  0,  0,  0,\n+\t\t-1, -1, -1, -1,\n+\t\t-1, -1, -1, -1};\n+\tconst vector unsigned char crc_adj =\n+\t\t(vector unsigned char)(vector unsigned short){\n+\t\trxq->crc_present * RTE_ETHER_CRC_LEN, 0,\n+\t\trxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0};\n+\tconst vector unsigned char flow_mark_adj =\n+\t\t(vector unsigned char)(vector unsigned int){\n+\t\t0, 0, 0, rxq->mark * (-1)};\n+\tconst vector unsigned short cqe_sel_mask1 =\n+\t\t(vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};\n+\tconst vector unsigned short cqe_sel_mask2 =\n+\t\t(vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};\n+\n+\tMLX5_ASSERT(rxq->sges_n == 0);\n+\tMLX5_ASSERT(rxq->cqe_n == rxq->elts_n);\n+\tif (rxq->consumed_strd == strd_n) {\n+\t\t/* Replace WQE only if the buffer is still in use. 
*/\n+\t\tif (rte_atomic16_read(&buf->refcnt) > 1) {\n+\t\t\tmprq_buf_replace(rxq, rq_ci & e_mask, strd_n);\n+\t\t\t/* Release the old buffer. */\n+\t\t\tmlx5_mprq_buf_free(buf);\n+\t\t} else if (unlikely(rxq->mprq_repl == NULL)) {\n+\t\t\tstruct mlx5_mprq_buf *rep;\n+\n+\t\t\t/*\n+\t\t\t * Currently, the MPRQ mempool is out of buffer\n+\t\t\t * and doing memcpy regardless of the size of Rx\n+\t\t\t * packet. Retry allocation to get back to\n+\t\t\t * normal.\n+\t\t\t */\n+\t\t\tif (!rte_mempool_get(rxq->mprq_mp,\n+\t\t\t\t\t     (void **)&rep))\n+\t\t\t\trxq->mprq_repl = rep;\n+\t\t}\n+\t\t/* Advance to the next WQE. */\n+\t\trxq->consumed_strd = 0;\n+\t\t++rq_ci;\n+\t\tbuf = (*rxq->mprq_bufs)[rq_ci & e_mask];\n+\t\trxq->rq_repl_thresh = 1;\n+\t}\n+\tif (rxq->rq_repl_thresh)\n+\t\tmlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);\n+\n+\tcq = &(*rxq->cqes)[cq_idx];\n+\trte_prefetch0(cq);\n+\trte_prefetch0(cq + 1);\n+\trte_prefetch0(cq + 2);\n+\trte_prefetch0(cq + 3);\n+\telts_idx = (rq_ci & e_mask) * strd_n +\n+\t\t(rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;\n+\telts = &(*rxq->elts)[elts_idx];\n+\tpkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);\n+\t/* See if there're unreturned mbufs from compressed CQE. */\n+\trcvd_pkt = rxq->decompressed;\n+\tif (rcvd_pkt > 0) {\n+\t\trcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);\n+\t\trxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,\n+\t\t\t\t     rq_ci, rxq->consumed_strd, true);\n+\t\trxq->consumed_strd += rcvd_pkt;\n+\t\trxq->rq_pi += rcvd_pkt;\n+\t\trxq->decompressed -= rcvd_pkt;\n+\t\tpkts += rcvd_pkt;\n+\t}\n+\t/* Not to cross queue end. */\n+\tpkts_n = RTE_MIN(pkts_n, q_n - cq_idx);\n+\tpkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);\n+\tif (!pkts_n) {\n+\t\t*no_cq = !rcvd_pkt;\n+\t\treturn rcvd_pkt;\n+\t}\n+\t/* At this point, there shouldn't be any remaining packets. */\n+\tMLX5_ASSERT(rxq->decompressed == 0);\n+\n+\t/*\n+\t * A. load first Qword (8bytes) in one loop.\n+\t * B. copy 4 mbuf pointers from elts ring to returing pkts.\n+\t * C. load remaining CQE data and extract necessary fields.\n+\t *    Final 16bytes cqes[] extracted from original 64bytes CQE has the\n+\t *    following structure:\n+\t *        struct {\n+\t *          uint8_t  pkt_info;\n+\t *          uint8_t  flow_tag[3];\n+\t *          uint16_t byte_cnt;\n+\t *          uint8_t  rsvd4;\n+\t *          uint8_t  op_own;\n+\t *          uint16_t hdr_type_etc;\n+\t *          uint16_t vlan_info;\n+\t *          uint32_t rx_has_res;\n+\t *        } c;\n+\t * D. fill in mbuf.\n+\t * E. get valid CQEs.\n+\t * F. 
find compressed CQE.\n+\t */\n+\tfor (pos = 0;\n+\t     pos < pkts_n;\n+\t     pos += MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\tvector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP];\n+\t\tvector unsigned char cqe_tmp1, cqe_tmp2;\n+\t\tvector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;\n+\t\tvector unsigned char op_own, op_own_tmp1, op_own_tmp2;\n+\t\tvector unsigned char opcode, owner_mask, invalid_mask;\n+\t\tvector unsigned char comp_mask;\n+\t\tvector unsigned char mask;\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\tconst vector unsigned char lower_half = {\n+\t\t\t0, 1, 4, 5, 8, 9, 12, 13,\n+\t\t\t16, 17, 20, 21, 24, 25, 28, 29};\n+\t\tconst vector unsigned char upper_half = {\n+\t\t\t2, 3, 6, 7, 10, 11, 14, 15,\n+\t\t\t18, 19, 22, 23, 26, 27, 30, 31};\n+\t\tconst vector unsigned long shmax = {64, 64};\n+\t\tvector unsigned char byte_cnt;\n+\t\tvector unsigned short left, right;\n+\t\tvector unsigned long lshift;\n+\t\tvector __attribute__((altivec(bool__)))\n+\t\t\tunsigned long shmask;\n+#endif\n+\t\tvector unsigned char mbp1, mbp2;\n+\t\tvector unsigned char p =\n+\t\t\t(vector unsigned char)(vector unsigned short){\n+\t\t\t\t0, 1, 2, 3, 0, 0, 0, 0};\n+\t\tunsigned int p1, p2, p3;\n+\n+\t\t/* Prefetch next 4 CQEs. */\n+\t\tif (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);\n+\t\t}\n+\n+\t\t/* A.0 do not cross the end of CQ. */\n+\t\tmask = (vector unsigned char)(vector unsigned long){\n+\t\t\t(pkts_n - pos) * sizeof(uint16_t) * 8, 0};\n+\n+\t\t{\n+\t\t\tvector unsigned long lshift;\n+\t\t\tvector __attribute__((altivec(bool__)))\n+\t\t\t\tunsigned long shmask;\n+\t\t\tconst vector unsigned long shmax = {64, 64};\n+\n+\t\t\tlshift = vec_splat((vector unsigned long)mask, 0);\n+\t\t\tshmask = vec_cmpgt(shmax, lshift);\n+\t\t\tmask = (vector unsigned char)\n+\t\t\t\tvec_sl((vector unsigned long)ones, lshift);\n+\t\t\tmask = (vector unsigned char)\n+\t\t\t\tvec_sel((vector unsigned long)shmask,\n+\t\t\t\t(vector unsigned long)mask, shmask);\n+\t\t}\n+\n+\t\tp = (vector unsigned char)\n+\t\t\tvec_andc((vector unsigned long)p,\n+\t\t\t(vector unsigned long)mask);\n+\n+\t\t/* A.1 load cqes. */\n+\t\tp3 = (unsigned int)((vector unsigned short)p)[3];\n+\t\tcqes[3] = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos + p3].sop_drop_qpn, 0LL};\n+\t\trte_compiler_barrier();\n+\n+\t\tp2 = (unsigned int)((vector unsigned short)p)[2];\n+\t\tcqes[2] = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos + p2].sop_drop_qpn, 0LL};\n+\t\trte_compiler_barrier();\n+\n+\t\t/* B.1 load mbuf pointers. */\n+\t\tmbp1 = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t(signed int const *)&elts[pos + rxq->consumed_strd]);\n+\t\tmbp2 = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t(signed int const *)&elts[pos +\n+\t\t\t\t\t\t  rxq->consumed_strd + 2]);\n+\n+\t\t/* A.1 load a block having op_own. 
*/\n+\t\tp1 = (unsigned int)((vector unsigned short)p)[1];\n+\t\tcqes[1] = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos + p1].sop_drop_qpn, 0LL};\n+\t\trte_compiler_barrier();\n+\n+\t\tcqes[0] = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos].sop_drop_qpn, 0LL};\n+\t\trte_compiler_barrier();\n+\n+\t\t/* B.2 copy mbuf pointers. */\n+\t\t*(vector unsigned char *)&pkts[pos] = mbp1;\n+\t\t*(vector unsigned char *)&pkts[pos + 2] = mbp2;\n+\t\trte_cio_rmb();\n+\n+\t\t/* C.1 load remaining CQE data and extract necessary fields. */\n+\t\tcqe_tmp2 = *(vector unsigned char *)\n+\t\t\t&cq[pos + p3].pkt_info;\n+\t\tcqe_tmp1 = *(vector unsigned char *)\n+\t\t\t&cq[pos + p2].pkt_info;\n+\t\tcqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask);\n+\t\tcqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask);\n+\t\tcqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t(signed int const *)&cq[pos + p3].csum);\n+\t\tcqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t(signed int const *)&cq[pos + p2].csum);\n+\t\tcqes[3] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[3],\n+\t\t\t(vector unsigned short)cqe_tmp2, cqe_sel_mask1);\n+\t\tcqes[2] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[2],\n+\t\t\t(vector unsigned short)cqe_tmp1, cqe_sel_mask1);\n+\t\tcqe_tmp2 = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos + p3].rsvd3[9], 0LL};\n+\t\tcqe_tmp1 = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos + p2].rsvd3[9], 0LL};\n+\t\tcqes[3] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[3],\n+\t\t\t(vector unsigned short)cqe_tmp2,\n+\t\t\t(vector unsigned short)cqe_sel_mask2);\n+\t\tcqes[2] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[2],\n+\t\t\t(vector unsigned short)cqe_tmp1,\n+\t\t\t(vector unsigned short)cqe_sel_mask2);\n+\n+\t\t/* C.2 generate final structure for mbuf with swapping bytes. */\n+\t\tpkt_mb3 = vec_perm(cqes[3], zero, shuf_mask);\n+\t\tpkt_mb2 = vec_perm(cqes[2], zero, shuf_mask);\n+\n+\t\t/* C.3 adjust CRC length. */\n+\t\tpkt_mb3 = (vector unsigned char)\n+\t\t\t((vector unsigned short)pkt_mb3 -\n+\t\t\t(vector unsigned short)crc_adj);\n+\t\tpkt_mb2 = (vector unsigned char)\n+\t\t\t((vector unsigned short)pkt_mb2 -\n+\t\t\t(vector unsigned short)crc_adj);\n+\n+\t\t/* C.4 adjust flow mark. */\n+\t\tpkt_mb3 = (vector unsigned char)\n+\t\t\t((vector unsigned int)pkt_mb3 +\n+\t\t\t(vector unsigned int)flow_mark_adj);\n+\t\tpkt_mb2 = (vector unsigned char)\n+\t\t\t((vector unsigned int)pkt_mb2 +\n+\t\t\t(vector unsigned int)flow_mark_adj);\n+\n+\t\t/* D.1 fill in mbuf - rx_descriptor_fields1. */\n+\t\t*(vector unsigned char *)\n+\t\t\t&pkts[pos + 3]->pkt_len = pkt_mb3;\n+\t\t*(vector unsigned char *)\n+\t\t\t&pkts[pos + 2]->pkt_len = pkt_mb2;\n+\n+\t\t/* E.1 extract op_own field. */\n+\t\top_own_tmp2 = (vector unsigned char)\n+\t\t\tvec_mergeh((vector unsigned int)cqes[2],\n+\t\t\t(vector unsigned int)cqes[3]);\n+\n+\t\t/* C.1 load remaining CQE data and extract necessary fields. 
*/\n+\t\tcqe_tmp2 = *(vector unsigned char *)\n+\t\t\t&cq[pos + p1].pkt_info;\n+\t\tcqe_tmp1 = *(vector unsigned char *)\n+\t\t\t&cq[pos].pkt_info;\n+\t\tcqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask);\n+\t\tcqes[0] = vec_sel(cqes[0], cqe_tmp2, blend_mask);\n+\t\tcqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t(signed int const *)&cq[pos + p1].csum);\n+\t\tcqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,\n+\t\t\t(signed int const *)&cq[pos].csum);\n+\t\tcqes[1] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[1],\n+\t\t\t(vector unsigned short)cqe_tmp2, cqe_sel_mask1);\n+\t\tcqes[0] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[0],\n+\t\t\t(vector unsigned short)cqe_tmp1, cqe_sel_mask1);\n+\t\tcqe_tmp2 = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos + p1].rsvd3[9], 0LL};\n+\t\tcqe_tmp1 = (vector unsigned char)(vector unsigned long){\n+\t\t\t*(__rte_aligned(8) unsigned long *)\n+\t\t\t&cq[pos].rsvd3[9], 0LL};\n+\t\tcqes[1] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[1],\n+\t\t\t(vector unsigned short)cqe_tmp2, cqe_sel_mask2);\n+\t\tcqes[0] = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned short)cqes[0],\n+\t\t\t(vector unsigned short)cqe_tmp1, cqe_sel_mask2);\n+\n+\t\t/* C.2 generate final structure for mbuf with swapping bytes. */\n+\t\tpkt_mb1 = vec_perm(cqes[1], zero, shuf_mask);\n+\t\tpkt_mb0 = vec_perm(cqes[0], zero, shuf_mask);\n+\n+\t\t/* C.3 adjust CRC length. */\n+\t\tpkt_mb1 = (vector unsigned char)\n+\t\t\t((vector unsigned short)pkt_mb1 -\n+\t\t\t(vector unsigned short)crc_adj);\n+\t\tpkt_mb0 = (vector unsigned char)\n+\t\t\t((vector unsigned short)pkt_mb0 -\n+\t\t\t(vector unsigned short)crc_adj);\n+\n+\t\t/* C.4 adjust flow mark. */\n+\t\tpkt_mb1 = (vector unsigned char)\n+\t\t\t((vector unsigned int)pkt_mb1 +\n+\t\t\t(vector unsigned int)flow_mark_adj);\n+\t\tpkt_mb0 = (vector unsigned char)\n+\t\t\t((vector unsigned int)pkt_mb0 +\n+\t\t\t(vector unsigned int)flow_mark_adj);\n+\n+\t\t/* E.1 extract op_own byte. */\n+\t\top_own_tmp1 = (vector unsigned char)\n+\t\t\tvec_mergeh((vector unsigned int)cqes[0],\n+\t\t\t(vector unsigned int)cqes[1]);\n+\t\top_own = (vector unsigned char)\n+\t\t\tvec_mergel((vector unsigned long)op_own_tmp1,\n+\t\t\t(vector unsigned long)op_own_tmp2);\n+\n+\t\t/* D.1 fill in mbuf - rx_descriptor_fields1. */\n+\t\t*(vector unsigned char *)\n+\t\t\t&pkts[pos + 1]->pkt_len = pkt_mb1;\n+\t\t*(vector unsigned char *)\n+\t\t\t&pkts[pos]->pkt_len = pkt_mb0;\n+\n+\t\t/* E.2 flip owner bit to mark CQEs from last round. */\n+\t\towner_mask = (vector unsigned char)\n+\t\t\tvec_and((vector unsigned long)op_own,\n+\t\t\t(vector unsigned long)owner_check);\n+\t\tif (ownership)\n+\t\t\towner_mask = (vector unsigned char)\n+\t\t\t\tvec_xor((vector unsigned long)owner_mask,\n+\t\t\t\t(vector unsigned long)owner_check);\n+\t\towner_mask = (vector unsigned char)\n+\t\t\tvec_cmpeq((vector unsigned int)owner_mask,\n+\t\t\t(vector unsigned int)owner_check);\n+\t\towner_mask = (vector unsigned char)\n+\t\t\tvec_packs((vector unsigned int)owner_mask,\n+\t\t\t(vector unsigned int)zero);\n+\n+\t\t/* E.3 get mask for invalidated CQEs. 
*/\n+\t\topcode = (vector unsigned char)\n+\t\t\tvec_and((vector unsigned long)op_own,\n+\t\t\t(vector unsigned long)opcode_check);\n+\t\tinvalid_mask = (vector unsigned char)\n+\t\t\tvec_cmpeq((vector unsigned int)opcode_check,\n+\t\t\t(vector unsigned int)opcode);\n+\t\tinvalid_mask = (vector unsigned char)\n+\t\t\tvec_packs((vector unsigned int)invalid_mask,\n+\t\t\t(vector unsigned int)zero);\n+\n+\t\t/* E.4 mask out beyond boundary. */\n+\t\tinvalid_mask = (vector unsigned char)\n+\t\t\tvec_or((vector unsigned long)invalid_mask,\n+\t\t\t(vector unsigned long)mask);\n+\n+\t\t/* E.5 merge invalid_mask with invalid owner. */\n+\t\tinvalid_mask = (vector unsigned char)\n+\t\t\tvec_or((vector unsigned long)invalid_mask,\n+\t\t\t(vector unsigned long)owner_mask);\n+\n+\t\t/* F.1 find compressed CQE format. */\n+\t\tcomp_mask = (vector unsigned char)\n+\t\t\tvec_and((vector unsigned long)op_own,\n+\t\t\t(vector unsigned long)format_check);\n+\t\tcomp_mask = (vector unsigned char)\n+\t\t\tvec_cmpeq((vector unsigned int)comp_mask,\n+\t\t\t(vector unsigned int)format_check);\n+\t\tcomp_mask = (vector unsigned char)\n+\t\t\tvec_packs((vector unsigned int)comp_mask,\n+\t\t\t(vector unsigned int)zero);\n+\n+\t\t/* F.2 mask out invalid entries. */\n+\t\tcomp_mask = (vector unsigned char)\n+\t\t\tvec_andc((vector unsigned long)comp_mask,\n+\t\t\t(vector unsigned long)invalid_mask);\n+\t\tcomp_idx = ((vector unsigned long)comp_mask)[0];\n+\n+\t\t/* F.3 get the first compressed CQE. */\n+\t\tcomp_idx = comp_idx ? __builtin_ctzll(comp_idx) /\n+\t\t\t(sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP;\n+\n+\t\t/* E.6 mask out entries after the compressed CQE. */\n+\t\tmask = (vector unsigned char)(vector unsigned long){\n+\t\t\t(comp_idx * sizeof(uint16_t) * 8), 0};\n+\t\tlshift = vec_splat((vector unsigned long)mask, 0);\n+\t\tshmask = vec_cmpgt(shmax, lshift);\n+\t\tmask = (vector unsigned char)\n+\t\t\tvec_sl((vector unsigned long)ones, lshift);\n+\t\tmask = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned long)shmask,\n+\t\t\t(vector unsigned long)mask, shmask);\n+\t\tinvalid_mask = (vector unsigned char)\n+\t\t\tvec_or((vector unsigned long)invalid_mask,\n+\t\t\t(vector unsigned long)mask);\n+\n+\t\t/* E.7 count non-compressed valid CQEs. */\n+\t\tn = ((vector unsigned long)invalid_mask)[0];\n+\t\tn = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :\n+\t\t\tMLX5_VPMD_DESCS_PER_LOOP;\n+\t\tnocmp_n += n;\n+\n+\t\t/* D.2 get the final invalid mask. */\n+\t\tmask = (vector unsigned char)(vector unsigned long){\n+\t\t\t(n * sizeof(uint16_t) * 8), 0};\n+\t\tlshift = vec_splat((vector unsigned long)mask, 0);\n+\t\tshmask = vec_cmpgt(shmax, lshift);\n+\t\tmask = (vector unsigned char)\n+\t\t\tvec_sl((vector unsigned long)ones, lshift);\n+\t\tmask = (vector unsigned char)\n+\t\t\tvec_sel((vector unsigned long)shmask,\n+\t\t\t(vector unsigned long)mask, shmask);\n+\t\tinvalid_mask = (vector unsigned char)\n+\t\t\tvec_or((vector unsigned long)invalid_mask,\n+\t\t\t(vector unsigned long)mask);\n+\n+\t\t/* D.3 check error in opcode. 
*/\n+\t\topcode = (vector unsigned char)\n+\t\t\tvec_cmpeq((vector unsigned int)resp_err_check,\n+\t\t\t(vector unsigned int)opcode);\n+\t\topcode = (vector unsigned char)\n+\t\t\tvec_packs((vector unsigned int)opcode,\n+\t\t\t(vector unsigned int)zero);\n+\t\topcode = (vector unsigned char)\n+\t\t\tvec_andc((vector unsigned long)opcode,\n+\t\t\t(vector unsigned long)invalid_mask);\n+\n+\t\t/* D.4 mark if any error is set */\n+\t\t*err |= ((vector unsigned long)opcode)[0];\n+\n+\t\t/* D.5 fill in mbuf - rearm_data and packet_type. */\n+\t\trxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);\n+\t\tif (rxq->hw_timestamp) {\n+\t\t\tpkts[pos]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos].timestamp);\n+\t\t\tpkts[pos + 1]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos + p1].timestamp);\n+\t\t\tpkts[pos + 2]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos + p2].timestamp);\n+\t\t\tpkts[pos + 3]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos + p3].timestamp);\n+\t\t}\n+\t\tif (rxq->dynf_meta) {\n+\t\t\tuint64_t flag = rxq->flow_meta_mask;\n+\t\t\tint32_t offs = rxq->flow_meta_offset;\n+\t\t\tuint32_t metadata;\n+\n+\t\t\t/* This code is subject for futher optimization. */\n+\t\t\tmetadata = cq[pos].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =\n+\t\t\t\t\t\t\t\tmetadata;\n+\t\t\tpkts[pos]->ol_flags |= metadata ? flag : 0ULL;\n+\t\t\tmetadata = cq[pos + 1].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =\n+\t\t\t\t\t\t\t\tmetadata;\n+\t\t\tpkts[pos + 1]->ol_flags |= metadata ? flag : 0ULL;\n+\t\t\tmetadata = cq[pos + 2].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =\n+\t\t\t\t\t\t\t\tmetadata;\n+\t\t\tpkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL;\n+\t\t\tmetadata = cq[pos + 3].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =\n+\t\t\t\t\t\t\t\tmetadata;\n+\t\t\tpkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL;\n+\t\t}\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t/* Add up received bytes count. */\n+\t\tbyte_cnt = vec_perm(op_own, zero, len_shuf_mask);\n+\t\tbyte_cnt = (vector unsigned char)\n+\t\t\tvec_andc((vector unsigned long)byte_cnt,\n+\t\t\t(vector unsigned long)invalid_mask);\n+\t\tleft = vec_perm((vector unsigned short)byte_cnt,\n+\t\t\t(vector unsigned short)zero, lower_half);\n+\t\tright = vec_perm((vector unsigned short)byte_cnt,\n+\t\t\t(vector unsigned short)zero, upper_half);\n+\t\tbyte_cnt = (vector unsigned char)vec_add(left, right);\n+\t\tleft = vec_perm((vector unsigned short)byte_cnt,\n+\t\t\t(vector unsigned short)zero, lower_half);\n+\t\tright = vec_perm((vector unsigned short)byte_cnt,\n+\t\t\t(vector unsigned short)zero, upper_half);\n+\t\tbyte_cnt = (vector unsigned char)vec_add(left, right);\n+\t\trcvd_byte += ((vector unsigned long)byte_cnt)[0];\n+#endif\n+\n+\t\t/*\n+\t\t * Break the loop unless more valid CQE is expected, or if\n+\t\t * there's a compressed CQE.\n+\t\t */\n+\t\tif (n != MLX5_VPMD_DESCS_PER_LOOP)\n+\t\t\tbreak;\n+\t}\n+\t/* If no new CQE seen, return without updating cq_db. */\n+\tif (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {\n+\t\t*no_cq = true;\n+\t\treturn rcvd_pkt;\n+\t}\n+\t/* Update the consumer indexes for non-compressed CQEs. 
*/\n+\tMLX5_ASSERT(nocmp_n <= pkts_n);\n+\trxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,\n+\t\t\t     rq_ci, rxq->consumed_strd, false);\n+\trxq->cq_ci += nocmp_n;\n+\trxq->consumed_strd += nocmp_n;\n+\trcvd_pkt += nocmp_n;\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\trxq->stats.ipackets += nocmp_n;\n+\trxq->stats.ibytes += rcvd_byte;\n+#endif\n+\t/* Decompress the last CQE if compressed. */\n+\tif (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {\n+\t\tMLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));\n+\t\trxq->decompressed =\n+\t\t\trxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);\n+\t\t/* Return more packets if needed. */\n+\t\tif (nocmp_n < pkts_n) {\n+\t\t\tuint16_t n = rxq->decompressed;\n+\n+\t\t\tn = RTE_MIN(n, pkts_n - nocmp_n);\n+\t\t\trxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,\n+\t\t\t\t\t     rq_ci, rxq->consumed_strd, true);\n+\t\t\trxq->consumed_strd += n;\n+\t\t\trcvd_pkt += n;\n+\t\t\trxq->decompressed -= n;\n+\t\t}\n+\t}\n+\trte_compiler_barrier();\n+\t*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);\n+\tif (rq_ci != rxq->rq_ci) {\n+\t\trxq->rq_ci = rq_ci;\n+\t\trte_cio_wmb();\n+\t\t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n+\t}\n+\t*no_cq = !rcvd_pkt;\n+\treturn rcvd_pkt;\n+}\n+\n #endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */\ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h\nindex 555c342626..53c8ed8a9b 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h\n@@ -54,6 +54,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)\n \t\tpkts[pos] = elts[pos];\n }\n \n+/**\n+ * Store free buffers to RX SW ring.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param pkts\n+ *   Pointer to array of packets to be stored.\n+ * @param pkts_n\n+ *   Number of packets to be stored.\n+ * @param buf\n+ *   MPRQ buffer to get packets from.\n+ * @param buf rq_ci\n+ *   WQE index.\n+ * @param strd_idx\n+ *   Stride number.\n+ * @param comp\n+ *   Whether CQE is compressed or not.\n+ */\n+static inline void\n+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t     uint16_t n, struct mlx5_mprq_buf *buf,\n+\t\t     uint16_t rq_ci, uint16_t strd_idx, bool comp)\n+{\n+\tconst unsigned int strd_sz = 1 << rxq->strd_sz_n;\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tconst unsigned int strd_shift =\n+\t\tMLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;\n+\tuint32_t offset;\n+\tvoid *addr;\n+\tint i = 0;\n+\n+\tif (comp) {\n+\t\tconst uint16_t q_mask = (1 << rxq->cqe_n) - 1;\n+\t\tstruct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];\n+\t\tunsigned int pos;\n+\t\tuint16_t p = n & -2;\n+\n+\t\tfor (pos = 0; pos < p; pos += 2) {\n+\t\t\tuint64x2_t mbp;\n+\n+\t\t\tmbp = vld1q_u64((void *)&elts[pos +\n+\t\t\t\t\t\t      rxq->consumed_strd]);\n+\t\t\tvst1q_u64((void *)&pkts[pos], mbp);\n+\t\t}\n+\t\tif (n & 1)\n+\t\t\tpkts[pos] = elts[pos];\n+\t}\n+\n+\tfor (i = 0; i < n; ++i) {\n+\t\toffset = (strd_idx + i) * strd_sz + strd_shift;\n+\t\taddr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);\n+\t\tif (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||\n+\t\t    rxq->mprq_repl == NULL) {\n+\t\t\trte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),\n+\t\t\t\t   addr, pkts[i]->pkt_len);\n+\t\t} else {\n+\t\t\trte_iova_t buf_iova;\n+\t\t\tstruct rte_mbuf_ext_shared_info *shinfo;\n+\t\t\tuint16_t buf_len = strd_sz;\n+\t\t\tvoid *buf_addr;\n+\t\t\t/* Increment the refcnt of the whole chunk. 
*/\n+\t\t\trte_atomic16_add_return(&buf->refcnt, 1);\n+\t\t\tMLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=\n+\t\t\t\t    strd_n + 1);\n+\t\t\tbuf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);\n+\t\t\t/*\n+\t\t\t * MLX5 device doesn't use iova but it is necessary in a\n+\t\t\t * case where the Rx packet is transmitted via a\n+\t\t\t * different PMD.\n+\t\t\t */\n+\t\t\tbuf_iova = rte_mempool_virt2iova(buf) +\n+\t\t\t\tRTE_PTR_DIFF(buf_addr, buf);\n+\t\t\tshinfo = &buf->shinfos[strd_idx];\n+\t\t\trte_mbuf_ext_refcnt_set(shinfo, 1);\n+\t\t\t/*\n+\t\t\t * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when\n+\t\t\t * attaching the stride to mbuf and more offload flags\n+\t\t\t * will be added below by calling rxq_cq_to_mbuf().\n+\t\t\t * Other fields will be overwritten.\n+\t\t\t */\n+\t\t\trte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,\n+\t\t\t\t\t\t  buf_len, shinfo);\n+\t\t\t/* Set mbuf head-room. */\n+\t\t\tSET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);\n+\t\t\tDATA_LEN(pkts[i]) = pkts[i]->pkt_len;\n+\t\t}\n+\t}\n+}\n+\n /**\n  * Decompress a compressed completion and fill in mbufs in RX SW ring with data\n  * extracted from the title completion descriptor.\n@@ -806,4 +895,492 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,\n \treturn rcvd_pkt;\n }\n \n+static inline void\n+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,\n+\t\t const unsigned int strd_n)\n+{\n+\tstruct mlx5_mprq_buf *rep = rxq->mprq_repl;\n+\tvolatile struct mlx5_wqe_data_seg *wqe =\n+\t\t&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;\n+\tvoid *addr;\n+\n+\tMLX5_ASSERT(rep != NULL);\n+\t/* Replace MPRQ buf. */\n+\t(*rxq->mprq_bufs)[rq_idx] = rep;\n+\t/* Replace WQE. */\n+\taddr = mlx5_mprq_buf_addr(rep, strd_n);\n+\twqe->addr = rte_cpu_to_be_64((uintptr_t)addr);\n+\t/* If there's only one MR, no need to replace LKey in WQE. */\n+\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))\n+\t\twqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);\n+\t/* Stash a mbuf for next replacement. */\n+\tif (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))\n+\t\trxq->mprq_repl = rep;\n+\telse\n+\t\trxq->mprq_repl = NULL;\n+}\n+\n+/**\n+ * Receive burst of packets. An errored completion also consumes a mbuf, but the\n+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed\n+ * before returning to application.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param[out] pkts\n+ *   Array to store received packets.\n+ * @param pkts_n\n+ *   Maximum number of packets in array.\n+ * @param[out] err\n+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error\n+ *   packet to handle.\n+ * @param[out] no_cq\n+ *   Pointer to a boolean. 
Set true if no new CQE seen.\n+ *\n+ * @return\n+ *   Number of packets received including errors (<= pkts_n).\n+ */\n+static inline uint16_t\n+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t uint16_t pkts_n, uint64_t *err, bool *no_cq)\n+{\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tconst uint16_t q_n = 1 << rxq->cqe_n;\n+\tconst uint16_t q_mask = q_n - 1;\n+\tconst uint16_t e_n = 1 << rxq->elts_n;\n+\tconst uint16_t e_mask = e_n - 1;\n+\tvolatile struct mlx5_cqe *cq;\n+\tstruct rte_mbuf **elts;\n+\tunsigned int pos;\n+\tuint64_t n;\n+\tuint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;\n+\tuint16_t nocmp_n = 0;\n+\tuint16_t rcvd_pkt = 0;\n+\tunsigned int cq_ci = rxq->cq_ci;\n+\tunsigned int cq_idx = cq_ci & q_mask;\n+\tunsigned int rq_ci = rxq->rq_ci;\n+\tunsigned int rq_idx = rq_ci & e_mask;\n+\tstruct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];\n+\tunsigned int elts_idx;\n+\tconst uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));\n+\tconst uint16x4_t owner_check = vcreate_u16(0x0001000100010001);\n+\tconst uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);\n+\tconst uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);\n+\tconst uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0);\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\tuint32_t rcvd_byte = 0;\n+#endif\n+\t/* Mask to generate 16B length vector. */\n+\tconst uint8x8_t len_shuf_m = {\n+\t\t52, 53,         /* 4th CQE */\n+\t\t36, 37,         /* 3rd CQE */\n+\t\t20, 21,         /* 2nd CQE */\n+\t\t 4,  5          /* 1st CQE */\n+\t};\n+\t/* Mask to extract 16B data from a 64B CQE. */\n+\tconst uint8x16_t cqe_shuf_m = {\n+\t\t28, 29,         /* hdr_type_etc */\n+\t\t 0,             /* pkt_info */\n+\t\t-1,             /* null */\n+\t\t47, 46,         /* byte_cnt, bswap16 */\n+\t\t31, 30,         /* vlan_info, bswap16 */\n+\t\t15, 14, 13, 12, /* rx_hash_res, bswap32 */\n+\t\t57, 58, 59,     /* flow_tag */\n+\t\t63              /* op_own */\n+\t};\n+\t/* Mask to generate 16B data for mbuf. */\n+\tconst uint8x16_t mb_shuf_m = {\n+\t\t 4,  5, -1, -1, /* pkt_len */\n+\t\t 4,  5,         /* data_len */\n+\t\t 6,  7,         /* vlan_tci */\n+\t\t 8,  9, 10, 11, /* hash.rss */\n+\t\t12, 13, 14, -1  /* hash.fdir.hi */\n+\t};\n+\t/* Mask to generate 16B owner vector. */\n+\tconst uint8x8_t owner_shuf_m = {\n+\t\t63, -1,         /* 4th CQE */\n+\t\t47, -1,         /* 3rd CQE */\n+\t\t31, -1,         /* 2nd CQE */\n+\t\t15, -1          /* 1st CQE */\n+\t};\n+\t/* Mask to generate a vector having packet_type/ol_flags. */\n+\tconst uint8x16_t ptype_shuf_m = {\n+\t\t48, 49, 50, -1, /* 4th CQE */\n+\t\t32, 33, 34, -1, /* 3rd CQE */\n+\t\t16, 17, 18, -1, /* 2nd CQE */\n+\t\t 0,  1,  2, -1  /* 1st CQE */\n+\t};\n+\t/* Mask to generate a vector having flow tags. */\n+\tconst uint8x16_t ftag_shuf_m = {\n+\t\t60, 61, 62, -1, /* 4th CQE */\n+\t\t44, 45, 46, -1, /* 3rd CQE */\n+\t\t28, 29, 30, -1, /* 2nd CQE */\n+\t\t12, 13, 14, -1  /* 1st CQE */\n+\t};\n+\tconst uint16x8_t crc_adj = {\n+\t\t0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0\n+\t};\n+\tconst uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };\n+\n+\tMLX5_ASSERT(rxq->sges_n == 0);\n+\tMLX5_ASSERT(rxq->cqe_n == rxq->elts_n);\n+\tif (rxq->consumed_strd == strd_n) {\n+\t\t/* Replace WQE only if the buffer is still in use. */\n+\t\tif (rte_atomic16_read(&buf->refcnt) > 1) {\n+\t\t\tmprq_buf_replace(rxq, rq_idx, strd_n);\n+\t\t\t/* Release the old buffer. 
*/\n+\t\t\tmlx5_mprq_buf_free(buf);\n+\t\t} else if (unlikely(rxq->mprq_repl == NULL)) {\n+\t\t\tstruct mlx5_mprq_buf *rep;\n+\n+\t\t\t/*\n+\t\t\t * Currently, the MPRQ mempool is out of buffer\n+\t\t\t * and doing memcpy regardless of the size of Rx\n+\t\t\t * packet. Retry allocation to get back to\n+\t\t\t * normal.\n+\t\t\t */\n+\t\t\tif (!rte_mempool_get(rxq->mprq_mp,\n+\t\t\t\t\t     (void **)&rep))\n+\t\t\t\trxq->mprq_repl = rep;\n+\t\t\t}\n+\t\t/* Advance to the next WQE. */\n+\t\trxq->consumed_strd = 0;\n+\t\t++rq_ci;\n+\t\trq_idx = rq_ci & e_mask;\n+\t\tbuf = (*rxq->mprq_bufs)[rq_idx];\n+\t\trxq->rq_repl_thresh = 1;\n+\t}\n+\tif (rxq->rq_repl_thresh)\n+\t\tmlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_idx);\n+\n+\tcq = &(*rxq->cqes)[cq_idx];\n+\trte_prefetch_non_temporal(cq);\n+\trte_prefetch_non_temporal(cq + 1);\n+\trte_prefetch_non_temporal(cq + 2);\n+\trte_prefetch_non_temporal(cq + 3);\n+\telts_idx = (rq_ci & e_mask) * strd_n +\n+\t\t(rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;\n+\telts = &(*rxq->elts)[elts_idx];\n+\tpkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);\n+\t/* See if there're unreturned mbufs from compressed CQE. */\n+\trcvd_pkt = rxq->decompressed;\n+\tif (rcvd_pkt > 0) {\n+\t\trcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);\n+\t\trxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,\n+\t\t\t\t     rq_ci, rxq->consumed_strd, true);\n+\t\trxq->consumed_strd += rcvd_pkt;\n+\t\tpkts += rcvd_pkt;\n+\t\trxq->decompressed -= rcvd_pkt;\n+\t}\n+\t/* Not to cross queue end. */\n+\tpkts_n = RTE_MIN(pkts_n, q_n - cq_idx);\n+\tpkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);\n+\tif (!pkts_n) {\n+\t\t*no_cq = !rcvd_pkt;\n+\t\treturn rcvd_pkt;\n+\t}\n+\t/* At this point, there shouldn't be any remained packets. */\n+\tMLX5_ASSERT(rxq->decompressed == 0);\n+\t/*\n+\t * Note that vectors have reverse order - {v3, v2, v1, v0}, because\n+\t * there's no instruction to count trailing zeros. __builtin_clzl() is\n+\t * used instead.\n+\t *\n+\t * A. copy 4 mbuf pointers from elts ring to returing pkts.\n+\t * B. load 64B CQE and extract necessary fields\n+\t *    Final 16bytes cqes[] extracted from original 64bytes CQE has the\n+\t *    following structure:\n+\t *        struct {\n+\t *          uint16_t hdr_type_etc;\n+\t *          uint8_t  pkt_info;\n+\t *          uint8_t  rsvd;\n+\t *          uint16_t byte_cnt;\n+\t *          uint16_t vlan_info;\n+\t *          uint32_t rx_has_res;\n+\t *          uint8_t  flow_tag[3];\n+\t *          uint8_t  op_own;\n+\t *        } c;\n+\t * C. fill in mbuf.\n+\t * D. get valid CQEs.\n+\t * E. find compressed CQE.\n+\t */\n+\tfor (pos = 0;\n+\t     pos < pkts_n;\n+\t     pos += MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\tuint16x4_t op_own;\n+\t\tuint16x4_t opcode, owner_mask, invalid_mask;\n+\t\tuint16x4_t comp_mask;\n+\t\tuint16x4_t mask;\n+\t\tuint16x4_t byte_cnt;\n+\t\tuint32x4_t ptype_info, flow_tag;\n+\t\tregister uint64x2_t c0, c1, c2, c3;\n+\t\tuint8_t *p0, *p1, *p2, *p3;\n+\t\tuint8_t *e0 = (void *)&elts[pos + rxq->consumed_strd]->pkt_len;\n+\t\tuint8_t *e1 = (void *)&elts[pos +\n+\t\t\t\t\t    rxq->consumed_strd + 1]->pkt_len;\n+\t\tuint8_t *e2 = (void *)&elts[pos +\n+\t\t\t\t\t    rxq->consumed_strd + 2]->pkt_len;\n+\t\tuint8_t *e3 = (void *)&elts[pos +\n+\t\t\t\t\t    rxq->consumed_strd + 3]->pkt_len;\n+\t\tvoid *elts_p = (void *)&elts[pos + rxq->consumed_strd];\n+\t\tvoid *pkts_p = (void *)&pkts[pos];\n+\n+\t\t/* A.0 do not cross the end of CQ. 
*/\n+\t\tmask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?\n+\t\t\t\t   -1UL >> ((pkts_n - pos) *\n+\t\t\t\t\t    sizeof(uint16_t) * 8) : 0);\n+\t\tp0 = (void *)&cq[pos].pkt_info;\n+\t\tp1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);\n+\t\tp2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);\n+\t\tp3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);\n+\t\t/* B.0 (CQE 3) load a block having op_own. */\n+\t\tc3 = vld1q_u64((uint64_t *)(p3 + 48));\n+\t\t/* B.0 (CQE 2) load a block having op_own. */\n+\t\tc2 = vld1q_u64((uint64_t *)(p2 + 48));\n+\t\t/* B.0 (CQE 1) load a block having op_own. */\n+\t\tc1 = vld1q_u64((uint64_t *)(p1 + 48));\n+\t\t/* B.0 (CQE 0) load a block having op_own. */\n+\t\tc0 = vld1q_u64((uint64_t *)(p0 + 48));\n+\t\t/* Synchronize for loading the rest of blocks. */\n+\t\trte_cio_rmb();\n+\t\t/* Prefetch next 4 CQEs. */\n+\t\tif (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\t\tunsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;\n+\t\t\trte_prefetch_non_temporal(&cq[next]);\n+\t\t\trte_prefetch_non_temporal(&cq[next + 1]);\n+\t\t\trte_prefetch_non_temporal(&cq[next + 2]);\n+\t\t\trte_prefetch_non_temporal(&cq[next + 3]);\n+\t\t}\n+\t\t__asm__ volatile (\n+\t\t/* B.1 (CQE 3) load the rest of blocks. */\n+\t\t\"ld1 {v16.16b - v18.16b}, [%[p3]] \\n\\t\"\n+\t\t/* B.2 (CQE 3) move the block having op_own. */\n+\t\t\"mov v19.16b, %[c3].16b \\n\\t\"\n+\t\t/* B.3 (CQE 3) extract 16B fields. */\n+\t\t\"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \\n\\t\"\n+\t\t/* B.1 (CQE 2) load the rest of blocks. */\n+\t\t\"ld1 {v16.16b - v18.16b}, [%[p2]] \\n\\t\"\n+\t\t/* B.4 (CQE 3) adjust CRC length. */\n+\t\t\"sub v23.8h, v23.8h, %[crc_adj].8h \\n\\t\"\n+\t\t/* C.1 (CQE 3) generate final structure for mbuf. */\n+\t\t\"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \\n\\t\"\n+\t\t/* B.2 (CQE 2) move the block having op_own. */\n+\t\t\"mov v19.16b, %[c2].16b \\n\\t\"\n+\t\t/* B.3 (CQE 2) extract 16B fields. */\n+\t\t\"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \\n\\t\"\n+\t\t/* B.1 (CQE 1) load the rest of blocks. */\n+\t\t\"ld1 {v16.16b - v18.16b}, [%[p1]] \\n\\t\"\n+\t\t/* B.4 (CQE 2) adjust CRC length. */\n+\t\t\"sub v22.8h, v22.8h, %[crc_adj].8h \\n\\t\"\n+\t\t/* C.1 (CQE 2) generate final structure for mbuf. */\n+\t\t\"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \\n\\t\"\n+\t\t/* B.2 (CQE 1) move the block having op_own. */\n+\t\t\"mov v19.16b, %[c1].16b \\n\\t\"\n+\t\t/* B.3 (CQE 1) extract 16B fields. */\n+\t\t\"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \\n\\t\"\n+\t\t/* B.1 (CQE 0) load the rest of blocks. */\n+\t\t\"ld1 {v16.16b - v18.16b}, [%[p0]] \\n\\t\"\n+\t\t/* B.4 (CQE 1) adjust CRC length. */\n+\t\t\"sub v21.8h, v21.8h, %[crc_adj].8h \\n\\t\"\n+\t\t/* C.1 (CQE 1) generate final structure for mbuf. */\n+\t\t\"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \\n\\t\"\n+\t\t/* B.2 (CQE 0) move the block having op_own. */\n+\t\t\"mov v19.16b, %[c0].16b \\n\\t\"\n+\t\t/* A.1 load mbuf pointers. */\n+\t\t\"ld1 {v24.2d - v25.2d}, [%[elts_p]] \\n\\t\"\n+\t\t/* B.3 (CQE 0) extract 16B fields. */\n+\t\t\"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \\n\\t\"\n+\t\t/* B.4 (CQE 0) adjust CRC length. */\n+\t\t\"sub v20.8h, v20.8h, %[crc_adj].8h \\n\\t\"\n+\t\t/* D.1 extract op_own byte. */\n+\t\t\"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \\n\\t\"\n+\t\t/* C.2 (CQE 3) adjust flow mark. */\n+\t\t\"add v15.4s, v15.4s, %[flow_mark_adj].4s \\n\\t\"\n+\t\t/* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. 
*/\n+\t\t\"st1 {v15.2d}, [%[e3]] \\n\\t\"\n+\t\t/* C.2 (CQE 2) adjust flow mark. */\n+\t\t\"add v14.4s, v14.4s, %[flow_mark_adj].4s \\n\\t\"\n+\t\t/* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */\n+\t\t\"st1 {v14.2d}, [%[e2]] \\n\\t\"\n+\t\t/* C.1 (CQE 0) generate final structure for mbuf. */\n+\t\t\"tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \\n\\t\"\n+\t\t/* C.2 (CQE 1) adjust flow mark. */\n+\t\t\"add v13.4s, v13.4s, %[flow_mark_adj].4s \\n\\t\"\n+\t\t/* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */\n+\t\t\"st1 {v13.2d}, [%[e1]] \\n\\t\"\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t/* Extract byte_cnt. */\n+\t\t\"tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \\n\\t\"\n+#endif\n+\t\t/* Extract ptype_info. */\n+\t\t\"tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \\n\\t\"\n+\t\t/* Extract flow_tag. */\n+\t\t\"tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \\n\\t\"\n+\t\t/* A.2 copy mbuf pointers. */\n+\t\t\"st1 {v24.2d - v25.2d}, [%[pkts_p]] \\n\\t\"\n+\t\t/* C.2 (CQE 0) adjust flow mark. */\n+\t\t\"add v12.4s, v12.4s, %[flow_mark_adj].4s \\n\\t\"\n+\t\t/* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */\n+\t\t\"st1 {v12.2d}, [%[e0]] \\n\\t\"\n+\t\t:[op_own]\"=&w\"(op_own),\n+\t\t [byte_cnt]\"=&w\"(byte_cnt),\n+\t\t [ptype_info]\"=&w\"(ptype_info),\n+\t\t [flow_tag]\"=&w\"(flow_tag)\n+\t\t:[p3]\"r\"(p3), [p2]\"r\"(p2), [p1]\"r\"(p1), [p0]\"r\"(p0),\n+\t\t [e3]\"r\"(e3), [e2]\"r\"(e2), [e1]\"r\"(e1), [e0]\"r\"(e0),\n+\t\t [c3]\"w\"(c3), [c2]\"w\"(c2), [c1]\"w\"(c1), [c0]\"w\"(c0),\n+\t\t [elts_p]\"r\"(elts_p),\n+\t\t [pkts_p]\"r\"(pkts_p),\n+\t\t [cqe_shuf_m]\"w\"(cqe_shuf_m),\n+\t\t [mb_shuf_m]\"w\"(mb_shuf_m),\n+\t\t [owner_shuf_m]\"w\"(owner_shuf_m),\n+\t\t [len_shuf_m]\"w\"(len_shuf_m),\n+\t\t [ptype_shuf_m]\"w\"(ptype_shuf_m),\n+\t\t [ftag_shuf_m]\"w\"(ftag_shuf_m),\n+\t\t [crc_adj]\"w\"(crc_adj),\n+\t\t [flow_mark_adj]\"w\"(flow_mark_adj)\n+\t\t:\"memory\",\n+\t\t \"v12\", \"v13\", \"v14\", \"v15\",\n+\t\t \"v16\", \"v17\", \"v18\", \"v19\",\n+\t\t \"v20\", \"v21\", \"v22\", \"v23\",\n+\t\t \"v24\", \"v25\");\n+\t\t/* D.2 flip owner bit to mark CQEs from last round. */\n+\t\towner_mask = vand_u16(op_own, owner_check);\n+\t\towner_mask = vceq_u16(owner_mask, ownership);\n+\t\t/* D.3 get mask for invalidated CQEs. */\n+\t\topcode = vand_u16(op_own, opcode_check);\n+\t\tinvalid_mask = vceq_u16(opcode_check, opcode);\n+\t\t/* E.1 find compressed CQE format. */\n+\t\tcomp_mask = vand_u16(op_own, format_check);\n+\t\tcomp_mask = vceq_u16(comp_mask, format_check);\n+\t\t/* D.4 mask out beyond boundary. */\n+\t\tinvalid_mask = vorr_u16(invalid_mask, mask);\n+\t\t/* D.5 merge invalid_mask with invalid owner. */\n+\t\tinvalid_mask = vorr_u16(invalid_mask, owner_mask);\n+\t\t/* E.2 mask out invalid entries. */\n+\t\tcomp_mask = vbic_u16(comp_mask, invalid_mask);\n+\t\t/* E.3 get the first compressed CQE. */\n+\t\tcomp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(\n+\t\t\t\t\t  comp_mask), 0)) /\n+\t\t\t\t\t  (sizeof(uint16_t) * 8);\n+\t\t/* D.6 mask out entries after the compressed CQE. */\n+\t\tmask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?\n+\t\t\t\t   -1UL >> (comp_idx * sizeof(uint16_t) * 8) :\n+\t\t\t\t   0);\n+\t\tinvalid_mask = vorr_u16(invalid_mask, mask);\n+\t\t/* D.7 count non-compressed valid CQEs. */\n+\t\tn = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(\n+\t\t\t\t   invalid_mask), 0)) / (sizeof(uint16_t) * 8);\n+\t\tnocmp_n += n;\n+\t\t/* D.2 get the final invalid mask. 
*/\n+\t\tmask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?\n+\t\t\t\t   -1UL >> (n * sizeof(uint16_t) * 8) : 0);\n+\t\tinvalid_mask = vorr_u16(invalid_mask, mask);\n+\t\t/* D.3 check error in opcode. */\n+\t\topcode = vceq_u16(resp_err_check, opcode);\n+\t\topcode = vbic_u16(opcode, invalid_mask);\n+\t\t/* D.4 mark if any error is set */\n+\t\t*err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);\n+\t\t/* C.4 fill in mbuf - rearm_data and packet_type. */\n+\t\trxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,\n+\t\t\t\t\t opcode, &elts[pos]);\n+\t\tif (rxq->hw_timestamp) {\n+\t\t\telts[pos]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(\n+\t\t\t\t\tcontainer_of(p0, struct mlx5_cqe,\n+\t\t\t\t\t\t     pkt_info)->timestamp);\n+\t\t\telts[pos + 1]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(\n+\t\t\t\t\tcontainer_of(p1, struct mlx5_cqe,\n+\t\t\t\t\t\t     pkt_info)->timestamp);\n+\t\t\telts[pos + 2]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(\n+\t\t\t\t\tcontainer_of(p2, struct mlx5_cqe,\n+\t\t\t\t\t\t     pkt_info)->timestamp);\n+\t\t\telts[pos + 3]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(\n+\t\t\t\t\tcontainer_of(p3, struct mlx5_cqe,\n+\t\t\t\t\t\t     pkt_info)->timestamp);\n+\t\t}\n+\t\tif (!!rxq->flow_meta_mask) {\n+\t\t\t/* This code is subject for futher optimization. */\n+\t\t\tint32_t offs = rxq->flow_meta_offset;\n+\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =\n+\t\t\t\tcontainer_of(p0, struct mlx5_cqe,\n+\t\t\t\t\t     pkt_info)->flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =\n+\t\t\t\tcontainer_of(p1, struct mlx5_cqe,\n+\t\t\t\t\t     pkt_info)->flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =\n+\t\t\t\tcontainer_of(p2, struct mlx5_cqe,\n+\t\t\t\t\t     pkt_info)->flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =\n+\t\t\t\tcontainer_of(p3, struct mlx5_cqe,\n+\t\t\t\t\t     pkt_info)->flow_table_metadata;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))\n+\t\t\t\telts[pos]->ol_flags |= rxq->flow_meta_mask;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))\n+\t\t\t\telts[pos + 1]->ol_flags |= rxq->flow_meta_mask;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))\n+\t\t\t\telts[pos + 2]->ol_flags |= rxq->flow_meta_mask;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))\n+\t\t\t\telts[pos + 3]->ol_flags |= rxq->flow_meta_mask;\n+\t\t}\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t/* Add up received bytes count. */\n+\t\tbyte_cnt = vbic_u16(byte_cnt, invalid_mask);\n+\t\trcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);\n+#endif\n+\t\t/*\n+\t\t * Break the loop unless more valid CQE is expected, or if\n+\t\t * there's a compressed CQE.\n+\t\t */\n+\t\tif (n != MLX5_VPMD_DESCS_PER_LOOP)\n+\t\t\tbreak;\n+\t}\n+\t/* If no new CQE seen, return without updating cq_db. */\n+\tif (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {\n+\t\t*no_cq = true;\n+\t\treturn rcvd_pkt;\n+\t}\n+\t/* Update the consumer indexes for non-compressed CQEs. */\n+\tMLX5_ASSERT(nocmp_n <= pkts_n);\n+\trxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,\n+\t\t\t     rq_ci, rxq->consumed_strd, false);\n+\trxq->cq_ci += nocmp_n;\n+\trxq->consumed_strd += nocmp_n;\n+\trcvd_pkt += nocmp_n;\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\trxq->stats.ipackets += nocmp_n;\n+\trxq->stats.ibytes += rcvd_byte;\n+#endif\n+\t/* Decompress the last CQE if compressed. 
*/\n+\tif (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {\n+\t\tMLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));\n+\t\trxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],\n+\t\t\t\t\t\t\t&elts[nocmp_n]);\n+\t\t/* Return more packets if needed. */\n+\t\tif (nocmp_n < pkts_n) {\n+\t\t\tuint16_t n = rxq->decompressed;\n+\n+\t\t\tn = RTE_MIN(n, pkts_n - nocmp_n);\n+\t\t\trxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,\n+\t\t\t\t\t     rq_ci, rxq->consumed_strd, true);\n+\t\t\trxq->consumed_strd += n;\n+\t\t\trcvd_pkt += n;\n+\t\t\trxq->decompressed -= n;\n+\t\t}\n+\t}\n+\trte_cio_wmb();\n+\t*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);\n+\tif (rq_ci != rxq->rq_ci) {\n+\t\trxq->rq_ci = rq_ci;\n+\t\trte_cio_wmb();\n+\t\t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n+\t}\n+\t*no_cq = !rcvd_pkt;\n+\treturn rcvd_pkt;\n+}\n+\n #endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */\ndiff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h\nindex 34e3397115..4054614674 100644\n--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h\n+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h\n@@ -56,6 +56,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)\n \t\tpkts[pos] = elts[pos];\n }\n \n+/**\n+ * Copy or attach MPRQ buffers to RX SW ring.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param pkts\n+ *   Pointer to array of packets to be stored.\n+ * @param pkts_n\n+ *   Number of packets to be stored.\n+ * @param buf\n+ *   MPRQ buffer to get packets from.\n+ * @param buf rq_ci\n+ *   WQE index.\n+ * @param strd_idx\n+ *   Stride number.\n+ * @param comp\n+ *   Whether CQE is compressed or not.\n+ */\n+static inline void\n+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t     uint16_t n, struct mlx5_mprq_buf *buf,\n+\t\t     uint16_t rq_ci, uint16_t strd_idx, bool comp)\n+{\n+\tconst unsigned int strd_sz = 1 << rxq->strd_sz_n;\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tconst unsigned int strd_shift =\n+\t\tMLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;\n+\tuint32_t offset;\n+\tvoid *addr;\n+\tint i = 0;\n+\n+\tif (comp) {\n+\t\tconst uint16_t q_mask = (1 << rxq->cqe_n) - 1;\n+\t\tstruct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];\n+\t\tunsigned int pos;\n+\t\tuint16_t p = n & -2;\n+\n+\t\tfor (pos = 0; pos < p; pos += 2) {\n+\t\t\t__m128i mbp;\n+\n+\t\t\tmbp = _mm_loadu_si128((__m128i *)&elts[pos +\n+\t\t\t\t\t\t\trxq->consumed_strd]);\n+\t\t\t_mm_storeu_si128((__m128i *)&pkts[pos], mbp);\n+\t\t}\n+\t\tif (n & 1)\n+\t\t\tpkts[pos] = elts[pos];\n+\t}\n+\n+\tfor (i = 0; i < n; ++i) {\n+\t\toffset = (strd_idx + i) * strd_sz + strd_shift;\n+\t\taddr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);\n+\t\tif (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||\n+\t\t    rxq->mprq_repl == NULL) {\n+\t\t\trte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),\n+\t\t\t\t   addr, pkts[i]->pkt_len);\n+\t\t} else {\n+\t\t\trte_iova_t buf_iova;\n+\t\t\tstruct rte_mbuf_ext_shared_info *shinfo;\n+\t\t\tuint16_t buf_len = strd_sz;\n+\t\t\tvoid *buf_addr;\n+\t\t\t/* Increment the refcnt of the whole chunk. 
*/\n+\t\t\trte_atomic16_add_return(&buf->refcnt, 1);\n+\t\t\tMLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=\n+\t\t\t\t    strd_n + 1);\n+\t\t\tbuf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);\n+\t\t\t/*\n+\t\t\t * MLX5 device doesn't use iova but it is necessary in a\n+\t\t\t * case where the Rx packet is transmitted via a\n+\t\t\t * different PMD.\n+\t\t\t */\n+\t\t\tbuf_iova = rte_mempool_virt2iova(buf) +\n+\t\t\t\tRTE_PTR_DIFF(buf_addr, buf);\n+\t\t\tshinfo = &buf->shinfos[strd_idx];\n+\t\t\trte_mbuf_ext_refcnt_set(shinfo, 1);\n+\t\t\t/*\n+\t\t\t * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when\n+\t\t\t * attaching the stride to mbuf and more offload flags\n+\t\t\t * will be added below by calling rxq_cq_to_mbuf().\n+\t\t\t * Other fields will be overwritten.\n+\t\t\t */\n+\t\t\trte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,\n+\t\t\t\t\t\t  buf_len, shinfo);\n+\t\t\t/* Set mbuf head-room. */\n+\t\t\tSET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);\n+\t\t\tDATA_LEN(pkts[i]) = pkts[i]->pkt_len;\n+\t\t}\n+\t}\n+}\n+\n /**\n  * Decompress a compressed completion and fill in mbufs in RX SW ring with data\n  * extracted from the title completion descriptor.\n@@ -753,4 +842,435 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,\n \treturn rcvd_pkt;\n }\n \n+static inline void\n+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,\n+\t\t const unsigned int strd_n)\n+{\n+\tstruct mlx5_mprq_buf *rep = rxq->mprq_repl;\n+\tvolatile struct mlx5_wqe_data_seg *wqe =\n+\t\t&((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;\n+\tvoid *addr;\n+\n+\tMLX5_ASSERT(rep != NULL);\n+\t/* Replace MPRQ buf. */\n+\t(*rxq->mprq_bufs)[rq_idx] = rep;\n+\t/* Replace WQE. */\n+\taddr = mlx5_mprq_buf_addr(rep, strd_n);\n+\twqe->addr = rte_cpu_to_be_64((uintptr_t)addr);\n+\t/* If there's only one MR, no need to replace LKey in WQE. */\n+\tif (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))\n+\t\twqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);\n+\t/* Stash a mbuf for next replacement. */\n+\tif (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))\n+\t\trxq->mprq_repl = rep;\n+\telse\n+\t\trxq->mprq_repl = NULL;\n+}\n+\n+/**\n+ * Receive burst of packets. An errored completion also consumes a mbuf, but the\n+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed\n+ * before returning to application.\n+ *\n+ * @param rxq\n+ *   Pointer to RX queue structure.\n+ * @param[out] pkts\n+ *   Array to store received packets.\n+ * @param pkts_n\n+ *   Maximum number of packets in array.\n+ * @param[out] err\n+ *   Pointer to a flag. Set non-zero value if pkts array has at least one error\n+ *   packet to handle.\n+ * @param[out] no_cq\n+ *   Pointer to a boolean. 
Set true if no new CQE seen.\n+ * @return\n+ *   Number of packets received including errors (<= pkts_n).\n+ */\n+static inline uint16_t\n+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,\n+\t\t uint16_t pkts_n, uint64_t *err, bool *no_cq)\n+{\n+\tconst unsigned int strd_n = 1 << rxq->strd_num_n;\n+\tconst uint16_t q_n = 1 << rxq->cqe_n;\n+\tconst uint16_t q_mask = q_n - 1;\n+\tconst uint16_t e_n = 1 << rxq->elts_n;\n+\tconst uint16_t e_mask = e_n - 1;\n+\tvolatile struct mlx5_cqe *cq;\n+\tstruct rte_mbuf **elts;\n+\tunsigned int pos;\n+\tuint64_t n;\n+\tuint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;\n+\tuint16_t nocmp_n = 0;\n+\tuint16_t rcvd_pkt = 0;\n+\tunsigned int cq_ci = rxq->cq_ci;\n+\tunsigned int cq_idx = cq_ci & q_mask;\n+\tunsigned int rq_ci = rxq->rq_ci;\n+\tunsigned int rq_idx = rq_ci & e_mask;\n+\tstruct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];\n+\tunsigned int elts_idx;\n+\tunsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));\n+\tconst __m128i owner_check =\n+\t\t_mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);\n+\tconst __m128i opcode_check =\n+\t\t_mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);\n+\tconst __m128i format_check =\n+\t\t_mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);\n+\tconst __m128i resp_err_check =\n+\t\t_mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\tuint32_t rcvd_byte = 0;\n+\t/* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */\n+\tconst __m128i len_shuf_mask =\n+\t\t_mm_set_epi8(-1, -1, -1, -1,\n+\t\t\t     -1, -1, -1, -1,\n+\t\t\t     12, 13,  8,  9,\n+\t\t\t      4,  5,  0,  1);\n+#endif\n+\t/* Mask to shuffle from extracted CQE to mbuf. */\n+\tconst __m128i shuf_mask =\n+\t\t_mm_set_epi8(-1,  3,  2,  1, /* fdir.hi */\n+\t\t\t     12, 13, 14, 15, /* rss, bswap32 */\n+\t\t\t     10, 11,         /* vlan_tci, bswap16 */\n+\t\t\t      4,  5,         /* data_len, bswap16 */\n+\t\t\t     -1, -1,         /* zero out 2nd half of pkt_len */\n+\t\t\t      4,  5          /* pkt_len, bswap16 */);\n+\t/* Mask to blend from the last Qword to the first DQword. */\n+\tconst __m128i blend_mask =\n+\t\t_mm_set_epi8(-1, -1, -1, -1,\n+\t\t\t     -1, -1, -1, -1,\n+\t\t\t      0,  0,  0,  0,\n+\t\t\t      0,  0,  0, -1);\n+\tconst __m128i zero = _mm_setzero_si128();\n+\tconst __m128i ones = _mm_cmpeq_epi32(zero, zero);\n+\tconst __m128i crc_adj =\n+\t\t_mm_set_epi16(0, 0, 0, 0, 0,\n+\t\t\t      rxq->crc_present * RTE_ETHER_CRC_LEN,\n+\t\t\t      0,\n+\t\t\t      rxq->crc_present * RTE_ETHER_CRC_LEN);\n+\tconst __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);\n+\n+\tMLX5_ASSERT(rxq->sges_n == 0);\n+\tMLX5_ASSERT(rxq->cqe_n == rxq->elts_n);\n+\n+\tif (rxq->consumed_strd == strd_n) {\n+\t\t/* Replace WQE only if the buffer is still in use. */\n+\t\tif (rte_atomic16_read(&buf->refcnt) > 1) {\n+\t\t\tmprq_buf_replace(rxq, rq_ci & e_mask, strd_n);\n+\t\t\t/* Release the old buffer. */\n+\t\t\tmlx5_mprq_buf_free(buf);\n+\t\t} else if (unlikely(rxq->mprq_repl == NULL)) {\n+\t\t\tstruct mlx5_mprq_buf *rep;\n+\n+\t\t\t/*\n+\t\t\t * Currently, the MPRQ mempool is out of buffer\n+\t\t\t * and doing memcpy regardless of the size of Rx\n+\t\t\t * packet. Retry allocation to get back to\n+\t\t\t * normal.\n+\t\t\t */\n+\t\t\tif (!rte_mempool_get(rxq->mprq_mp,\n+\t\t\t\t\t     (void **)&rep))\n+\t\t\t\trxq->mprq_repl = rep;\n+\t\t}\n+\t\t/* Advance to the next WQE. 
*/\n+\t\trxq->consumed_strd = 0;\n+\t\t++rq_ci;\n+\t\tbuf = (*rxq->mprq_bufs)[rq_ci & e_mask];\n+\t\trxq->rq_repl_thresh = 1;\n+\t}\n+\tif (rxq->rq_repl_thresh)\n+\t\tmlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);\n+\n+\tcq = &(*rxq->cqes)[cq_idx];\n+\trte_prefetch0(cq);\n+\trte_prefetch0(cq + 1);\n+\trte_prefetch0(cq + 2);\n+\trte_prefetch0(cq + 3);\n+\telts_idx = (rq_ci & e_mask) * strd_n +\n+\t\t(rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;\n+\telts = &(*rxq->elts)[elts_idx];\n+\tpkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);\n+\t/* See if there're unreturned mbufs from compressed CQE. */\n+\trcvd_pkt = rxq->decompressed;\n+\tif (rcvd_pkt > 0) {\n+\t\trcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);\n+\t\trxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,\n+\t\t\t\t     rq_ci, rxq->consumed_strd, true);\n+\t\trxq->consumed_strd += rcvd_pkt;\n+\t\trxq->decompressed -= rcvd_pkt;\n+\t\tpkts += rcvd_pkt;\n+\t}\n+\t/* Not to cross queue end. */\n+\tpkts_n = RTE_MIN(pkts_n, q_n - cq_idx);\n+\tpkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);\n+\tif (!pkts_n) {\n+\t\t*no_cq = !rcvd_pkt;\n+\t\treturn rcvd_pkt;\n+\t}\n+\t/* At this point, there shouldn't be any remained packets. */\n+\tMLX5_ASSERT(rxq->decompressed == 0);\n+\t/*\n+\t * A. load first Qword (8bytes) in one loop.\n+\t * B. copy 4 mbuf pointers from elts ring to returing pkts.\n+\t * C. load remained CQE data and extract necessary fields.\n+\t *    Final 16bytes cqes[] extracted from original 64bytes CQE has the\n+\t *    following structure:\n+\t *        struct {\n+\t *          uint8_t  pkt_info;\n+\t *          uint8_t  flow_tag[3];\n+\t *          uint16_t byte_cnt;\n+\t *          uint8_t  rsvd4;\n+\t *          uint8_t  op_own;\n+\t *          uint16_t hdr_type_etc;\n+\t *          uint16_t vlan_info;\n+\t *          uint32_t rx_has_res;\n+\t *        } c;\n+\t * D. fill in mbuf.\n+\t * E. get valid CQEs.\n+\t * F. find compressed CQE.\n+\t */\n+\tfor (pos = 0;\n+\t     pos < pkts_n;\n+\t     pos += MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\t__m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];\n+\t\t__m128i cqe_tmp1, cqe_tmp2;\n+\t\t__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;\n+\t\t__m128i op_own, op_own_tmp1, op_own_tmp2;\n+\t\t__m128i opcode, owner_mask, invalid_mask;\n+\t\t__m128i comp_mask;\n+\t\t__m128i mask;\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t__m128i byte_cnt;\n+#endif\n+\t\t__m128i mbp1, mbp2;\n+\t\t__m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);\n+\t\tunsigned int p1, p2, p3;\n+\n+\t\t/* Prefetch next 4 CQEs. */\n+\t\tif (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);\n+\t\t\trte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);\n+\t\t}\n+\t\t/* A.0 do not cross the end of CQ. */\n+\t\tmask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);\n+\t\tmask = _mm_sll_epi64(ones, mask);\n+\t\tp = _mm_andnot_si128(mask, p);\n+\t\t/* A.1 load cqes. */\n+\t\tp3 = _mm_extract_epi16(p, 3);\n+\t\tcqes[3] = _mm_loadl_epi64((__m128i *)\n+\t\t\t\t\t   &cq[pos + p3].sop_drop_qpn);\n+\t\trte_compiler_barrier();\n+\t\tp2 = _mm_extract_epi16(p, 2);\n+\t\tcqes[2] = _mm_loadl_epi64((__m128i *)\n+\t\t\t\t\t   &cq[pos + p2].sop_drop_qpn);\n+\t\trte_compiler_barrier();\n+\t\t/* B.1 load mbuf pointers. 
*/\n+\t\tmbp1 = _mm_loadu_si128((__m128i *)&elts[pos +\n+\t\t\t\t\t\trxq->consumed_strd]);\n+\t\tmbp2 = _mm_loadu_si128((__m128i *)&elts[pos +\n+\t\t\t\t\t\trxq->consumed_strd + 2]);\n+\t\t/* A.1 load a block having op_own. */\n+\t\tp1 = _mm_extract_epi16(p, 1);\n+\t\tcqes[1] = _mm_loadl_epi64((__m128i *)\n+\t\t\t\t\t   &cq[pos + p1].sop_drop_qpn);\n+\t\trte_compiler_barrier();\n+\t\tcqes[0] = _mm_loadl_epi64((__m128i *)\n+\t\t\t\t\t   &cq[pos].sop_drop_qpn);\n+\t\t/* B.2 copy mbuf pointers. */\n+\t\t_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);\n+\t\t_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);\n+\t\trte_cio_rmb();\n+\t\t/* C.1 load remained CQE data and extract necessary fields. */\n+\t\tcqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);\n+\t\tcqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);\n+\t\tcqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);\n+\t\tcqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);\n+\t\tcqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);\n+\t\tcqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);\n+\t\tcqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);\n+\t\tcqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);\n+\t\tcqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);\n+\t\tcqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);\n+\t\tcqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);\n+\t\tcqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);\n+\t\t/* C.2 generate final structure for mbuf with swapping bytes. */\n+\t\tpkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);\n+\t\tpkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);\n+\t\t/* C.3 adjust CRC length. */\n+\t\tpkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);\n+\t\tpkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);\n+\t\t/* C.4 adjust flow mark. */\n+\t\tpkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);\n+\t\tpkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);\n+\t\t/* D.1 fill in mbuf - rx_descriptor_fields1. */\n+\t\t_mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);\n+\t\t_mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);\n+\t\t/* E.1 extract op_own field. */\n+\t\top_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);\n+\t\t/* C.1 load remained CQE data and extract necessary fields. */\n+\t\tcqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);\n+\t\tcqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);\n+\t\tcqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);\n+\t\tcqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);\n+\t\tcqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);\n+\t\tcqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);\n+\t\tcqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);\n+\t\tcqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);\n+\t\tcqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);\n+\t\tcqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);\n+\t\tcqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);\n+\t\tcqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);\n+\t\t/* C.2 generate final structure for mbuf with swapping bytes. */\n+\t\tpkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);\n+\t\tpkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);\n+\t\t/* C.3 adjust CRC length. */\n+\t\tpkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);\n+\t\tpkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);\n+\t\t/* C.4 adjust flow mark. */\n+\t\tpkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);\n+\t\tpkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);\n+\t\t/* E.1 extract op_own byte. 
*/\n+\t\top_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);\n+\t\top_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);\n+\t\t/* D.1 fill in mbuf - rx_descriptor_fields1. */\n+\t\t_mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);\n+\t\t_mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);\n+\t\t/* E.2 flip owner bit to mark CQEs from last round. */\n+\t\towner_mask = _mm_and_si128(op_own, owner_check);\n+\t\tif (ownership)\n+\t\t\towner_mask = _mm_xor_si128(owner_mask, owner_check);\n+\t\towner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);\n+\t\towner_mask = _mm_packs_epi32(owner_mask, zero);\n+\t\t/* E.3 get mask for invalidated CQEs. */\n+\t\topcode = _mm_and_si128(op_own, opcode_check);\n+\t\tinvalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);\n+\t\tinvalid_mask = _mm_packs_epi32(invalid_mask, zero);\n+\t\t/* E.4 mask out beyond boundary. */\n+\t\tinvalid_mask = _mm_or_si128(invalid_mask, mask);\n+\t\t/* E.5 merge invalid_mask with invalid owner. */\n+\t\tinvalid_mask = _mm_or_si128(invalid_mask, owner_mask);\n+\t\t/* F.1 find compressed CQE format. */\n+\t\tcomp_mask = _mm_and_si128(op_own, format_check);\n+\t\tcomp_mask = _mm_cmpeq_epi32(comp_mask, format_check);\n+\t\tcomp_mask = _mm_packs_epi32(comp_mask, zero);\n+\t\t/* F.2 mask out invalid entries. */\n+\t\tcomp_mask = _mm_andnot_si128(invalid_mask, comp_mask);\n+\t\tcomp_idx = _mm_cvtsi128_si64(comp_mask);\n+\t\t/* F.3 get the first compressed CQE. */\n+\t\tcomp_idx = comp_idx ?\n+\t\t\t\t__builtin_ctzll(comp_idx) /\n+\t\t\t\t\t(sizeof(uint16_t) * 8) :\n+\t\t\t\tMLX5_VPMD_DESCS_PER_LOOP;\n+\t\t/* E.6 mask out entries after the compressed CQE. */\n+\t\tmask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);\n+\t\tmask = _mm_sll_epi64(ones, mask);\n+\t\tinvalid_mask = _mm_or_si128(invalid_mask, mask);\n+\t\t/* E.7 count non-compressed valid CQEs. */\n+\t\tn = _mm_cvtsi128_si64(invalid_mask);\n+\t\tn = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :\n+\t\t\tMLX5_VPMD_DESCS_PER_LOOP;\n+\t\tnocmp_n += n;\n+\t\t/* D.2 get the final invalid mask. */\n+\t\tmask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);\n+\t\tmask = _mm_sll_epi64(ones, mask);\n+\t\tinvalid_mask = _mm_or_si128(invalid_mask, mask);\n+\t\t/* D.3 check error in opcode. */\n+\t\topcode = _mm_cmpeq_epi32(resp_err_check, opcode);\n+\t\topcode = _mm_packs_epi32(opcode, zero);\n+\t\topcode = _mm_andnot_si128(invalid_mask, opcode);\n+\t\t/* D.4 mark if any error is set */\n+\t\t*err |= _mm_cvtsi128_si64(opcode);\n+\t\t/* D.5 fill in mbuf - rearm_data and packet_type. */\n+\t\trxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);\n+\t\tif (rxq->hw_timestamp) {\n+\t\t\tpkts[pos]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos].timestamp);\n+\t\t\tpkts[pos + 1]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos + p1].timestamp);\n+\t\t\tpkts[pos + 2]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos + p2].timestamp);\n+\t\t\tpkts[pos + 3]->timestamp =\n+\t\t\t\trte_be_to_cpu_64(cq[pos + p3].timestamp);\n+\t\t}\n+\t\tif (rxq->dynf_meta) {\n+\t\t\t/* This code is subject for futher optimization. 
*/\n+\t\t\tint32_t offs = rxq->flow_meta_offset;\n+\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =\n+\t\t\t\tcq[pos].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =\n+\t\t\t\tcq[pos + p1].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =\n+\t\t\t\tcq[pos + p2].flow_table_metadata;\n+\t\t\t*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =\n+\t\t\t\tcq[pos + p3].flow_table_metadata;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))\n+\t\t\t\tpkts[pos]->ol_flags |= rxq->flow_meta_mask;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))\n+\t\t\t\tpkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))\n+\t\t\t\tpkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;\n+\t\t\tif (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))\n+\t\t\t\tpkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;\n+\t\t}\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\t\t/* Add up received bytes count. */\n+\t\tbyte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);\n+\t\tbyte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);\n+\t\tbyte_cnt = _mm_hadd_epi16(byte_cnt, zero);\n+\t\trcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));\n+#endif\n+\t\t/*\n+\t\t * Break the loop unless more valid CQE is expected, or if\n+\t\t * there's a compressed CQE.\n+\t\t */\n+\t\tif (n != MLX5_VPMD_DESCS_PER_LOOP)\n+\t\t\tbreak;\n+\t}\n+\t/* If no new CQE seen, return without updating cq_db. */\n+\tif (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {\n+\t\t*no_cq = true;\n+\t\treturn rcvd_pkt;\n+\t}\n+\t/* Update the consumer indexes for non-compressed CQEs. */\n+\tMLX5_ASSERT(nocmp_n <= pkts_n);\n+\trxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,\n+\t\t\t     rq_ci, rxq->consumed_strd, false);\n+\trxq->cq_ci += nocmp_n;\n+\trxq->consumed_strd += nocmp_n;\n+\trcvd_pkt += nocmp_n;\n+#ifdef MLX5_PMD_SOFT_COUNTERS\n+\trxq->stats.ipackets += nocmp_n;\n+\trxq->stats.ibytes += rcvd_byte;\n+#endif\n+\t/* Decompress the last CQE if compressed. */\n+\tif (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {\n+\t\tMLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));\n+\t\trxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],\n+\t\t\t\t\t&elts[nocmp_n + rxq->consumed_strd]);\n+\t\t/* Return more packets if needed. */\n+\t\tif (nocmp_n < pkts_n) {\n+\t\t\tuint16_t n = rxq->decompressed;\n+\n+\t\t\tn = RTE_MIN(n, pkts_n - nocmp_n);\n+\t\t\trxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,\n+\t\t\t\t\t     rq_ci, rxq->consumed_strd, true);\n+\t\t\trxq->consumed_strd += n;\n+\t\t\trcvd_pkt += n;\n+\t\t\trxq->decompressed -= n;\n+\t\t}\n+\t}\n+\n+\trte_compiler_barrier();\n+\t*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);\n+\tif (rq_ci != rxq->rq_ci) {\n+\t\trxq->rq_ci = rq_ci;\n+\t\trte_cio_wmb();\n+\t\t*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);\n+\t}\n+\t*no_cq = !rcvd_pkt;\n+\treturn rcvd_pkt;\n+}\n+\n #endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */\n",
    "prefixes": []
}
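
Editor's note: for readers skimming the embedded diff, the core new mechanism of this patch is the copy-or-attach policy in rxq_copy_mprq_mbuf_v(): a received stride whose packet is no longer than mprq_max_memcpy_len (or when no replacement MPRQ buffer is available) is copied into the mbuf's own data room, while a larger packet is attached in place as an external buffer and the per-chunk reference count is bumped so the MPRQ chunk is freed only after every attached mbuf has been released. The following is a minimal, standalone sketch of that policy only; all names and sizes in it (STRD_SZ, MAX_MEMCPY_LEN, struct mprq_chunk, struct pkt, deliver_stride) are hypothetical simplifications, whereas the driver itself uses rte_memcpy(), rte_pktmbuf_attach_extbuf() and rte_atomic16_add_return() on mlx5_mprq_buf::refcnt as shown in the diff.

    /* Simplified model of the MPRQ copy-or-attach decision (not DPDK code). */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define STRD_SZ        2048u  /* stride size, i.e. 1 << strd_sz_n        */
    #define STRD_SHIFT     0u     /* optional stride shift when enabled       */
    #define MAX_MEMCPY_LEN 128u   /* stands in for rxq->mprq_max_memcpy_len   */

    struct mprq_chunk {           /* stands in for struct mlx5_mprq_buf       */
            uint16_t refcnt;      /* 1 while the Rx queue still owns the chunk */
            uint8_t  data[16 * STRD_SZ];
    };

    struct pkt {                  /* stands in for struct rte_mbuf            */
            uint8_t        room[MAX_MEMCPY_LEN];
            const uint8_t *ext_addr;  /* non-NULL when a stride is attached   */
            uint32_t       len;
    };

    /* Deliver one received stride either by copying it or by attaching it. */
    static void
    deliver_stride(struct mprq_chunk *chunk, uint16_t strd_idx,
                   uint32_t pkt_len, struct pkt *out)
    {
            uint32_t off = strd_idx * STRD_SZ + STRD_SHIFT;

            out->len = pkt_len;
            if (pkt_len <= MAX_MEMCPY_LEN) {
                    /* Small packet: copy into the mbuf's own data room. */
                    memcpy(out->room, &chunk->data[off], pkt_len);
                    out->ext_addr = NULL;
            } else {
                    /*
                     * Large packet: reference the stride in place and bump
                     * the chunk refcount so the chunk is released only after
                     * every attached packet has been freed.
                     */
                    chunk->refcnt++;
                    out->ext_addr = &chunk->data[off];
            }
    }

    int
    main(void)
    {
            static struct mprq_chunk chunk = { .refcnt = 1 };
            struct pkt small, big;

            deliver_stride(&chunk, 0, 64, &small);   /* copied              */
            deliver_stride(&chunk, 1, 1500, &big);   /* attached, refcnt++  */
            printf("chunk refcnt after burst: %u\n", (unsigned)chunk.refcnt);
            return 0;
    }

In the real driver the same chunk-level refcount is what makes the buffer replacement in mprq_buf_replace() safe: the WQE is repopulated with a fresh chunk once all strides of the old one have been consumed, and the old chunk is returned to the mempool only when its refcount drops back to zero.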