@@ -568,6 +568,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
cqe_comp = 0;
else
cqe_comp = 1;
+ if (config.mprq.enabled)
+ cqe_comp = 0;
config.cqe_comp = cqe_comp;
#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
/* Whether device supports 128B Rx CQE padding. */
@@ -973,6 +975,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
" setting default value (%u)",
1 << config.mprq.stride_num_n);
}
+ if (config.mprq.stride_size_n)
+ config.rx_vec_en = false;
if (config.mprq.stride_size_n &&
(config.mprq.stride_size_n > mprq_max_stride_size_n ||
config.mprq.stride_size_n < mprq_min_stride_size_n)) {
@@ -421,7 +421,8 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev)
if (dev->rx_pkt_burst == mlx5_rx_burst ||
dev->rx_pkt_burst == mlx5_rx_burst_mprq ||
- dev->rx_pkt_burst == mlx5_rx_burst_vec)
+ dev->rx_pkt_burst == mlx5_rx_burst_vec ||
+ dev->rx_pkt_burst == mlx5_rx_burst_mprq_vec)
return ptypes;
return NULL;
}
@@ -479,12 +480,19 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
MLX5_ASSERT(dev != NULL);
- if (mlx5_check_vec_rx_support(dev) > 0) {
+ if (mlx5_check_vec_rx_support(dev) > 0 &&
+ mlx5_mprq_enabled(dev)) {
+ rx_pkt_burst = mlx5_rx_burst_mprq_vec;
+ DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx vectorized function",
+ dev->data->port_id);
+ } else if (mlx5_check_vec_rx_support(dev) > 0) {
rx_pkt_burst = mlx5_rx_burst_vec;
DRV_LOG(DEBUG, "port %u selected Rx vectorized function",
dev->data->port_id);
} else if (mlx5_mprq_enabled(dev)) {
rx_pkt_burst = mlx5_rx_burst_mprq;
+ DRV_LOG(DEBUG, "port %u selected Multi-Packet Rx function",
+ dev->data->port_id);
}
return rx_pkt_burst;
}
@@ -164,7 +164,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
rxq->mprq_repl = buf;
}
DRV_LOG(DEBUG,
- "port %u Rx queue %u allocated and configured %u segments",
+ "port %u Multi-Packet Rx queue %u allocated and configured %u segments",
rxq->port_id, rxq->idx, wqe_n);
return 0;
error:
@@ -176,7 +176,7 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
(*rxq->mprq_bufs)[i]);
(*rxq->mprq_bufs)[i] = NULL;
}
- DRV_LOG(DEBUG, "port %u Rx queue %u failed, freed everything",
+ DRV_LOG(DEBUG, "port %u Multi-Packet Rx queue %u failed, freed everything",
rxq->port_id, rxq->idx);
rte_errno = err; /* Restore rte_errno. */
return -rte_errno;
@@ -194,11 +194,14 @@ rxq_alloc_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
static int
rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
{
+ struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
unsigned int i;
int err;
+ if (mlx5_rxq_mprq_enabled(rxq))
+ elts_n *= (1U << rxq_ctrl->rxq.strd_num_n);
/* Iterate on segments. */
for (i = 0; (i != elts_n); ++i) {
struct rte_mbuf *buf;
@@ -284,8 +287,10 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
int
rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
{
- return mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq) ?
- rxq_alloc_elts_mprq(rxq_ctrl) : rxq_alloc_elts_sprq(rxq_ctrl);
+ int ret = 0;
+ if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+ ret = rxq_alloc_elts_mprq(rxq_ctrl);
+ return (ret || rxq_alloc_elts_sprq(rxq_ctrl));
}
/**
@@ -304,7 +309,6 @@ rxq_free_elts_mprq(struct mlx5_rxq_ctrl *rxq_ctrl)
rxq->port_id, rxq->idx);
if (rxq->mprq_bufs == NULL)
return;
- MLX5_ASSERT(mlx5_rxq_check_vec_support(rxq) < 0);
for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
if ((*rxq->mprq_bufs)[i] != NULL)
mlx5_mprq_buf_free((*rxq->mprq_bufs)[i]);
@@ -326,15 +330,19 @@ static void
rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
{
struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
- const uint16_t q_n = (1 << rxq->elts_n);
- const uint16_t q_mask = q_n - 1;
- uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
+ unsigned int q_n = (1 << rxq->elts_n);
+ uint16_t q_mask;
+ uint16_t used;
uint16_t i;
DRV_LOG(DEBUG, "port %u Rx queue %u freeing WRs",
PORT_ID(rxq_ctrl->priv), rxq->idx);
if (rxq->elts == NULL)
return;
+ if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+ q_n *= (1U << rxq_ctrl->rxq.strd_num_n);
+ q_mask = q_n - 1;
+ used = q_n - (rxq->rq_ci - rxq->rq_pi);
/**
* Some mbuf in the Ring belongs to the application. They cannot be
* freed.
@@ -344,7 +352,7 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
(*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
rxq->rq_pi = rxq->rq_ci;
}
- for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
+ for (i = 0; (i != q_n); ++i) {
if ((*rxq->elts)[i] != NULL)
rte_pktmbuf_free_seg((*rxq->elts)[i]);
(*rxq->elts)[i] = NULL;
@@ -362,8 +370,7 @@ rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
{
if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
rxq_free_elts_mprq(rxq_ctrl);
- else
- rxq_free_elts_sprq(rxq_ctrl);
+ rxq_free_elts_sprq(rxq_ctrl);
}
/**
@@ -1793,20 +1800,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_rxq_ctrl *tmpl;
unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
- unsigned int mprq_stride_nums;
- unsigned int mprq_stride_size;
- unsigned int mprq_stride_cap;
struct mlx5_dev_config *config = &priv->config;
- /*
- * Always allocate extra slots, even if eventually
- * the vector Rx will not be used.
- */
- uint16_t desc_n =
- desc + config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
uint64_t offloads = conf->offloads |
dev->data->dev_conf.rxmode.offloads;
unsigned int lro_on_queue = !!(offloads & DEV_RX_OFFLOAD_TCP_LRO);
- const int mprq_en = mlx5_check_mprq_support(dev) > 0;
unsigned int max_rx_pkt_len = lro_on_queue ?
dev->data->dev_conf.rxmode.max_lro_pkt_size :
dev->data->dev_conf.rxmode.max_rx_pkt_len;
@@ -1814,6 +1811,23 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
RTE_PKTMBUF_HEADROOM;
unsigned int max_lro_size = 0;
unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+ const int mprq_en = mlx5_check_mprq_support(dev) > 0;
+ unsigned int mprq_stride_nums = config->mprq.stride_num_n ?
+ config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
+ unsigned int mprq_stride_size = non_scatter_min_mbuf_size <=
+ (1U << config->mprq.max_stride_size_n) ?
+ log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
+ unsigned int mprq_stride_cap = (config->mprq.stride_num_n ?
+ (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
+ (config->mprq.stride_size_n ?
+ (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
+ /*
+ * Always allocate extra slots, even if eventually
+ * the vector Rx will not be used.
+ */
+ uint16_t desc_n = desc +
+ config->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP *
+ (desc >> mprq_stride_nums);
if (non_scatter_min_mbuf_size > mb_len && !(offloads &
DEV_RX_OFFLOAD_SCATTER)) {
@@ -1825,8 +1839,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
rte_errno = ENOSPC;
return NULL;
}
- tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, sizeof(*tmpl) +
- desc_n * sizeof(struct rte_mbuf *), 0, socket);
+ tmpl = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
+ sizeof(*tmpl) +
+ desc_n * sizeof(struct rte_mbuf *) +
+ (desc >> mprq_stride_nums) *
+ sizeof(struct mlx5_mprq_buf *),
+ 0, socket);
if (!tmpl) {
rte_errno = ENOMEM;
return NULL;
@@ -1840,15 +1858,6 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->socket = socket;
if (dev->data->dev_conf.intr_conf.rxq)
tmpl->irq = 1;
- mprq_stride_nums = config->mprq.stride_num_n ?
- config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
- mprq_stride_size = non_scatter_min_mbuf_size <=
- (1U << config->mprq.max_stride_size_n) ?
- log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
- mprq_stride_cap = (config->mprq.stride_num_n ?
- (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
- (config->mprq.stride_size_n ?
- (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
/*
* This Rx queue can be configured as a Multi-Packet RQ if all of the
* following conditions are met:
@@ -1996,7 +2005,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
tmpl->rxq.rq_repl_thresh =
MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
tmpl->rxq.elts =
- (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+ (struct rte_mbuf *(*)[desc_n])(tmpl + 1);
+ if (mlx5_rxq_mprq_enabled(&tmpl->rxq)) {
+ tmpl->rxq.rq_repl_thresh = 1;
+ tmpl->rxq.mprq_bufs =
+ (struct mlx5_mprq_buf *(*)[desc])(tmpl + desc_n + 1);
+ }
#ifndef RTE_ARCH_64
tmpl->rxq.uar_lock_cq = &priv->sh->uar_lock_cq;
#endif
@@ -614,6 +614,16 @@ mlx5_rx_burst_mode_get(struct rte_eth_dev *dev,
snprintf(mode->info, sizeof(mode->info), "%s", "Vector AltiVec");
#else
return -EINVAL;
+#endif
+ } else if (pkt_burst == mlx5_rx_burst_mprq_vec) {
+#if defined RTE_ARCH_X86_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector SSE");
+#elif defined RTE_ARCH_ARM64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector Neon");
+#elif defined RTE_ARCH_PPC_64
+ snprintf(mode->info, sizeof(mode->info), "%s", "Multi-Packet RQ Vector AltiVec");
+#else
+ return -EINVAL;
#endif
} else {
return -EINVAL;
@@ -1075,7 +1085,7 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
{
const uint16_t cqe_n = 1 << rxq->cqe_n;
const uint16_t cqe_mask = cqe_n - 1;
- const unsigned int wqe_n = 1 << rxq->elts_n;
+ unsigned int wqe_n = 1 << rxq->elts_n;
struct mlx5_rxq_ctrl *rxq_ctrl =
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
union {
@@ -1139,11 +1149,17 @@ mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec)
&sm))
return -1;
if (vec) {
- const uint16_t q_mask = wqe_n - 1;
+ uint16_t q_mask;
uint16_t elt_idx;
struct rte_mbuf **elt;
int i;
- unsigned int n = wqe_n - (rxq->rq_ci -
+ unsigned int n;
+
+ if (mlx5_rxq_mprq_enabled(&rxq_ctrl->rxq))
+ wqe_n *= (1U <<
+ rxq_ctrl->rxq.strd_num_n);
+ q_mask = wqe_n - 1;
+ n = wqe_n - (rxq->rq_ci -
rxq->rq_pi);
for (i = 0; i < (int)n; ++i) {
@@ -1982,6 +1998,14 @@ mlx5_rx_burst_vec(void *dpdk_txq __rte_unused,
return 0;
}
+__rte_weak uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_txq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ return 0;
+}
+
__rte_weak int
mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
{
@@ -141,11 +141,8 @@ struct mlx5_rxq_data {
uint16_t mprq_max_memcpy_len; /* Maximum size of packet to memcpy. */
volatile void *wqes;
volatile struct mlx5_cqe(*cqes)[];
- RTE_STD_C11
- union {
- struct rte_mbuf *(*elts)[];
- struct mlx5_mprq_buf *(*mprq_bufs)[];
- };
+ struct rte_mbuf *(*elts)[];
+ struct mlx5_mprq_buf *(*mprq_bufs)[];
struct rte_mempool *mp;
struct rte_mempool *mprq_mp; /* Mempool for Multi-Packet RQ. */
struct mlx5_mprq_buf *mprq_repl; /* Stashed mbuf for replenish. */
@@ -518,6 +515,8 @@ int mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq_data);
int mlx5_check_vec_rx_support(struct rte_eth_dev *dev);
uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
uint16_t pkts_n);
+uint16_t mlx5_rx_burst_mprq_vec(void *dpdk_txq, struct rte_mbuf **pkts,
+ uint16_t pkts_n);
/* mlx5_mr.c */
@@ -119,6 +119,40 @@ mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
return tn;
}
+/**
+ * DPDK callback for MPRQ vectorized RX.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_mprq_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct mlx5_rxq_data *rxq = dpdk_rxq;
+ uint16_t nb_rx = 0;
+ uint16_t tn = 0;
+ uint64_t err = 0;
+ bool no_cq = false;
+
+ do {
+ nb_rx = rxq_burst_mprq_v(rxq, pkts + tn, pkts_n - tn,
+ &err, &no_cq);
+ if (unlikely(err | rxq->err_state))
+ nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
+ tn += nb_rx;
+ if (unlikely(no_cq))
+ break;
+ } while (tn != pkts_n);
+ return tn;
+}
+
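+/*
+ * Illustrative usage sketch only (not part of the driver): applications do
+ * not call this burst function directly. Once mlx5_select_rx_function()
+ * picks it, it is reached through the generic ethdev API, and the selected
+ * mode can be checked via rte_eth_rx_burst_mode_get(). The port_id,
+ * queue_id and burst size below are example values.
+ *
+ *   struct rte_mbuf *bufs[32];
+ *   struct rte_eth_burst_mode mode;
+ *   uint16_t nb = rte_eth_rx_burst(port_id, queue_id, bufs, 32);
+ *
+ *   if (rte_eth_rx_burst_mode_get(port_id, queue_id, &mode) == 0)
+ *       printf("Rx burst mode: %s\n", mode.info);
+ */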
/**
* Check a RX queue can support vectorized RX.
*
@@ -134,8 +168,6 @@ mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq)
struct mlx5_rxq_ctrl *ctrl =
container_of(rxq, struct mlx5_rxq_ctrl, rxq);
- if (mlx5_mprq_enabled(ETH_DEV(ctrl->priv)))
- return -ENOTSUP;
if (!ctrl->priv->config.rx_vec_en || rxq->sges_n != 0)
return -ENOTSUP;
if (rxq->lro)
@@ -160,8 +192,6 @@ mlx5_check_vec_rx_support(struct rte_eth_dev *dev)
if (!priv->config.rx_vec_en)
return -ENOTSUP;
- if (mlx5_mprq_enabled(dev))
- return -ENOTSUP;
/* All the configured queues should support. */
for (i = 0; i < priv->rxqs_n; ++i) {
struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
@@ -122,4 +122,25 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
}
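+/**
+ * Replenish buffers for the MPRQ part of the RX SW ring.
+ *
+ * Each WQE owns (1 << strd_num_n) elts slots followed by
+ * MLX5_VPMD_DESCS_PER_LOOP pad slots filled with &rxq->fake_mbuf so that
+ * the vectorized loop cannot overrun into the next WQE range. Clears
+ * rq_repl_thresh once the slots are replenished.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param n
+ *   Maximum number of buffers to be replenished.
+ * @param rq_idx
+ *   WQE index whose elts range is being replenished.
+ */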
+static inline void
+mlx5_rx_replenish_bulk_mprq_mbuf(struct mlx5_rxq_data *rxq,
+ uint16_t n, uint32_t rq_idx)
+{
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ uint16_t elts_idx = rq_idx * strd_n +
+ rq_idx * MLX5_VPMD_DESCS_PER_LOOP;
+ struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
+ unsigned int i;
+
+ n = RTE_MIN(n, strd_n - rxq->consumed_strd);
+ if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
+ rxq->stats.rx_nombuf += n;
+ return;
+ }
+ rxq->rq_repl_thresh = 0;
+ /* Prevent overflowing into the next MPRQ mbufs. */
+ for (i = 0; i < MLX5_VPMD_DESCS_PER_LOOP; ++i)
+ (*rxq->elts)[elts_idx + strd_n + i] = &rxq->fake_mbuf;
+}
+
#endif /* RTE_PMD_MLX5_RXTX_VEC_H_ */
@@ -59,6 +59,97 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
pkts[pos] = elts[pos];
}
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param n
+ * Number of packets to be stored.
+ * @param buf
+ * MPRQ buffer to get packets from.
+ * @param rq_ci
+ * WQE index.
+ * @param strd_idx
+ * Stride number.
+ * @param comp
+ * Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t n, struct mlx5_mprq_buf *buf,
+ uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+ const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const unsigned int strd_shift =
+ MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+ uint32_t offset;
+ void *addr;
+ int i = 0;
+
+ if (comp) {
+ const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ vector unsigned char mbp;
+
+ mbp = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&elts[pos +
+ rxq->consumed_strd]);
+ *(vector unsigned char *)&pkts[pos] = mbp;
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos];
+ }
+
+ for (i = 0; i < n; ++i) {
+ offset = (strd_idx + i) * strd_sz + strd_shift;
+ addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+ if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+ rxq->mprq_repl == NULL) {
+ rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+ addr, pkts[i]->pkt_len);
+ } else {
+ rte_iova_t buf_iova;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ uint16_t buf_len = strd_sz;
+ void *buf_addr;
+ /* Increment the refcnt of the whole chunk. */
+ rte_atomic16_add_return(&buf->refcnt, 1);
+ MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+ strd_n + 1);
+ buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+ /*
+ * MLX5 device doesn't use iova but it is necessary in a
+ * case where the Rx packet is transmitted via a
+ * different PMD.
+ */
+ buf_iova = rte_mempool_virt2iova(buf) +
+ RTE_PTR_DIFF(buf_addr, buf);
+ shinfo = &buf->shinfos[strd_idx];
+ rte_mbuf_ext_refcnt_set(shinfo, 1);
+ /*
+ * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+ * attaching the stride to mbuf and more offload flags
+ * will be added below by calling rxq_cq_to_mbuf().
+ * Other fields will be overwritten.
+ */
+ rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+ buf_len, shinfo);
+ /* Set mbuf head-room. */
+ SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+ DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+ }
+ }
+}
+
+
/**
* Decompress a compressed completion and fill in mbufs in RX SW ring with data
* extracted from the title completion descriptor.
@@ -1136,4 +1227,637 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
return rcvd_pkt;
}
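+/**
+ * Replace the MPRQ buffer of a WQE with the stashed replacement buffer
+ * and refill the stash from the MPRQ mempool.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param rq_idx
+ *   WQE index of the buffer to be replaced.
+ * @param strd_n
+ *   Number of strides per MPRQ buffer.
+ */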
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+ const unsigned int strd_n)
+{
+ struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+ void *addr;
+
+ MLX5_ASSERT(rep != NULL);
+ /* Replace MPRQ buf. */
+ (*rxq->mprq_bufs)[rq_idx] = rep;
+ /* Replace WQE. */
+ addr = mlx5_mprq_buf_addr(rep, strd_n);
+ wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+ /* Stash a mbuf for next replacement. */
+ if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+ rxq->mprq_repl = rep;
+ else
+ rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive a burst of packets. An errored completion also consumes an mbuf, but
+ * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to the application.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ * @param[out] err
+ * Pointer to a flag. Set non-zero value if pkts array has at least one error
+ * packet to handle.
+ * @param[out] no_cq
+ * Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ * Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const uint16_t q_n = 1 << rxq->cqe_n;
+ const uint16_t q_mask = q_n - 1;
+ const uint16_t e_n = 1 << rxq->elts_n;
+ const uint16_t e_mask = e_n - 1;
+ volatile struct mlx5_cqe *cq;
+ struct rte_mbuf **elts;
+ unsigned int pos;
+ uint64_t n;
+ uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+ uint16_t nocmp_n = 0;
+ uint16_t rcvd_pkt = 0;
+ unsigned int cq_ci = rxq->cq_ci;
+ unsigned int cq_idx = cq_ci & q_mask;
+ unsigned int rq_ci = rxq->rq_ci;
+ unsigned int rq_idx = rq_ci & e_mask;
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+ unsigned int elts_idx;
+ unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+ const vector unsigned char zero = (vector unsigned char){0};
+ const vector unsigned char ones = vec_splat_u8(-1);
+ const vector unsigned char owner_check =
+ (vector unsigned char)(vector unsigned long){
+ 0x0100000001000000LL, 0x0100000001000000LL};
+ const vector unsigned char opcode_check =
+ (vector unsigned char)(vector unsigned long){
+ 0xf0000000f0000000LL, 0xf0000000f0000000LL};
+ const vector unsigned char format_check =
+ (vector unsigned char)(vector unsigned long){
+ 0x0c0000000c000000LL, 0x0c0000000c000000LL};
+ const vector unsigned char resp_err_check =
+ (vector unsigned char)(vector unsigned long){
+ 0xe0000000e0000000LL, 0xe0000000e0000000LL};
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const vector unsigned char len_shuf_mask = (vector unsigned char){
+ 1, 0, 5, 4,
+ 9, 8, 13, 12,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1};
+#endif
+ /* Mask to shuffle from extracted CQE to mbuf. */
+ const vector unsigned char shuf_mask = (vector unsigned char){
+ 5, 4, /* bswap16, pkt_len */
+ -1, -1, /* zero out 2nd half of pkt_len */
+ 5, 4, /* bswap16, data_len */
+ 11, 10, /* bswap16, vlan+tci */
+ 15, 14, 13, 12, /* bswap32, rss */
+ 1, 2, 3, -1}; /* fdir.hi */
+ /* Mask to blend from the last Qword to the first DQword. */
+ const vector unsigned char blend_mask = (vector unsigned char){
+ -1, 0, 0, 0,
+ 0, 0, 0, 0,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1};
+ const vector unsigned char crc_adj =
+ (vector unsigned char)(vector unsigned short){
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0};
+ const vector unsigned char flow_mark_adj =
+ (vector unsigned char)(vector unsigned int){
+ 0, 0, 0, rxq->mark * (-1)};
+ const vector unsigned short cqe_sel_mask1 =
+ (vector unsigned short){0, 0, 0, 0, 0xffff, 0xffff, 0, 0};
+ const vector unsigned short cqe_sel_mask2 =
+ (vector unsigned short){0, 0, 0xffff, 0, 0, 0, 0, 0};
+
+ MLX5_ASSERT(rxq->sges_n == 0);
+ MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
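+ /*
+ * All strides of the current MPRQ buffer are consumed,
+ * advance to the next WQE and its buffer.
+ */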
+ if (rxq->consumed_strd == strd_n) {
+ /* Replace WQE only if the buffer is still in use. */
+ if (rte_atomic16_read(&buf->refcnt) > 1) {
+ mprq_buf_replace(rxq, rq_ci & e_mask, strd_n);
+ /* Release the old buffer. */
+ mlx5_mprq_buf_free(buf);
+ } else if (unlikely(rxq->mprq_repl == NULL)) {
+ struct mlx5_mprq_buf *rep;
+
+ /*
+ * Currently, the MPRQ mempool is out of buffer
+ * and doing memcpy regardless of the size of Rx
+ * packet. Retry allocation to get back to
+ * normal.
+ */
+ if (!rte_mempool_get(rxq->mprq_mp,
+ (void **)&rep))
+ rxq->mprq_repl = rep;
+ }
+ /* Advance to the next WQE. */
+ rxq->consumed_strd = 0;
+ ++rq_ci;
+ buf = (*rxq->mprq_bufs)[rq_ci & e_mask];
+ rxq->rq_repl_thresh = 1;
+ }
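+ /* Replenish SW ring mbufs for the new stride range if requested. */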
+ if (rxq->rq_repl_thresh)
+ mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);
+
+ cq = &(*rxq->cqes)[cq_idx];
+ rte_prefetch0(cq);
+ rte_prefetch0(cq + 1);
+ rte_prefetch0(cq + 2);
+ rte_prefetch0(cq + 3);
+ elts_idx = (rq_ci & e_mask) * strd_n +
+ (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+ elts = &(*rxq->elts)[elts_idx];
+ pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+ /* See if there're unreturned mbufs from compressed CQE. */
+ rcvd_pkt = rxq->decompressed;
+ if (rcvd_pkt > 0) {
+ rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += rcvd_pkt;
+ rxq->rq_pi += rcvd_pkt;
+ rxq->decompressed -= rcvd_pkt;
+ pkts += rcvd_pkt;
+ }
+ /* Not to cross queue end. */
+ pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+ pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+ if (!pkts_n) {
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+ }
+ /* At this point, there shouldn't be any remaining packets. */
+ MLX5_ASSERT(rxq->decompressed == 0);
+
+ /*
+ * A. load first Qword (8bytes) in one loop.
+ * B. copy 4 mbuf pointers from elts ring to returning pkts.
+ * C. load remaining CQE data and extract necessary fields.
+ * Final 16bytes cqes[] extracted from original 64bytes CQE has the
+ * following structure:
+ * struct {
+ * uint8_t pkt_info;
+ * uint8_t flow_tag[3];
+ * uint16_t byte_cnt;
+ * uint8_t rsvd4;
+ * uint8_t op_own;
+ * uint16_t hdr_type_etc;
+ * uint16_t vlan_info;
+ * uint32_t rx_hash_res;
+ * } c;
+ * D. fill in mbuf.
+ * E. get valid CQEs.
+ * F. find compressed CQE.
+ */
+ for (pos = 0;
+ pos < pkts_n;
+ pos += MLX5_VPMD_DESCS_PER_LOOP) {
+ vector unsigned char cqes[MLX5_VPMD_DESCS_PER_LOOP];
+ vector unsigned char cqe_tmp1, cqe_tmp2;
+ vector unsigned char pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+ vector unsigned char op_own, op_own_tmp1, op_own_tmp2;
+ vector unsigned char opcode, owner_mask, invalid_mask;
+ vector unsigned char comp_mask;
+ vector unsigned char mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ const vector unsigned char lower_half = {
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 16, 17, 20, 21, 24, 25, 28, 29};
+ const vector unsigned char upper_half = {
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 18, 19, 22, 23, 26, 27, 30, 31};
+ const vector unsigned long shmax = {64, 64};
+ vector unsigned char byte_cnt;
+ vector unsigned short left, right;
+ vector unsigned long lshift;
+ vector __attribute__((altivec(bool__)))
+ unsigned long shmask;
+#endif
+ vector unsigned char mbp1, mbp2;
+ vector unsigned char p =
+ (vector unsigned char)(vector unsigned short){
+ 0, 1, 2, 3, 0, 0, 0, 0};
+ unsigned int p1, p2, p3;
+
+ /* Prefetch next 4 CQEs. */
+ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+ }
+
+ /* A.0 do not cross the end of CQ. */
+ mask = (vector unsigned char)(vector unsigned long){
+ (pkts_n - pos) * sizeof(uint16_t) * 8, 0};
+
+ {
+ vector unsigned long lshift;
+ vector __attribute__((altivec(bool__)))
+ unsigned long shmask;
+ const vector unsigned long shmax = {64, 64};
+
+ lshift = vec_splat((vector unsigned long)mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)mask, shmask);
+ }
+
+ p = (vector unsigned char)
+ vec_andc((vector unsigned long)p,
+ (vector unsigned long)mask);
+
+ /* A.1 load cqes. */
+ p3 = (unsigned int)((vector unsigned short)p)[3];
+ cqes[3] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p3].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ p2 = (unsigned int)((vector unsigned short)p)[2];
+ cqes[2] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p2].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ /* B.1 load mbuf pointers. */
+ mbp1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&elts[pos + rxq->consumed_strd]);
+ mbp2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&elts[pos +
+ rxq->consumed_strd + 2]);
+
+ /* A.1 load a block having op_own. */
+ p1 = (unsigned int)((vector unsigned short)p)[1];
+ cqes[1] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p1].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ cqes[0] = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos].sop_drop_qpn, 0LL};
+ rte_compiler_barrier();
+
+ /* B.2 copy mbuf pointers. */
+ *(vector unsigned char *)&pkts[pos] = mbp1;
+ *(vector unsigned char *)&pkts[pos + 2] = mbp2;
+ rte_cio_rmb();
+
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = *(vector unsigned char *)
+ &cq[pos + p3].pkt_info;
+ cqe_tmp1 = *(vector unsigned char *)
+ &cq[pos + p2].pkt_info;
+ cqes[3] = vec_sel(cqes[3], cqe_tmp2, blend_mask);
+ cqes[2] = vec_sel(cqes[2], cqe_tmp1, blend_mask);
+ cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos + p3].csum);
+ cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos + p2].csum);
+ cqes[3] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[3],
+ (vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+ cqes[2] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[2],
+ (vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+ cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p3].rsvd3[9], 0LL};
+ cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p2].rsvd3[9], 0LL};
+ cqes[3] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[3],
+ (vector unsigned short)cqe_tmp2,
+ (vector unsigned short)cqe_sel_mask2);
+ cqes[2] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[2],
+ (vector unsigned short)cqe_tmp1,
+ (vector unsigned short)cqe_sel_mask2);
+
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb3 = vec_perm(cqes[3], zero, shuf_mask);
+ pkt_mb2 = vec_perm(cqes[2], zero, shuf_mask);
+
+ /* C.3 adjust CRC length. */
+ pkt_mb3 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb3 -
+ (vector unsigned short)crc_adj);
+ pkt_mb2 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb2 -
+ (vector unsigned short)crc_adj);
+
+ /* C.4 adjust flow mark. */
+ pkt_mb3 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb3 +
+ (vector unsigned int)flow_mark_adj);
+ pkt_mb2 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb2 +
+ (vector unsigned int)flow_mark_adj);
+
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ *(vector unsigned char *)
+ &pkts[pos + 3]->pkt_len = pkt_mb3;
+ *(vector unsigned char *)
+ &pkts[pos + 2]->pkt_len = pkt_mb2;
+
+ /* E.1 extract op_own field. */
+ op_own_tmp2 = (vector unsigned char)
+ vec_mergeh((vector unsigned int)cqes[2],
+ (vector unsigned int)cqes[3]);
+
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = *(vector unsigned char *)
+ &cq[pos + p1].pkt_info;
+ cqe_tmp1 = *(vector unsigned char *)
+ &cq[pos].pkt_info;
+ cqes[1] = vec_sel(cqes[1], cqe_tmp2, blend_mask);
+ cqes[0] = vec_sel(cqes[0], cqe_tmp1, blend_mask);
+ cqe_tmp2 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos + p1].csum);
+ cqe_tmp1 = (vector unsigned char)vec_vsx_ld(0,
+ (signed int const *)&cq[pos].csum);
+ cqes[1] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[1],
+ (vector unsigned short)cqe_tmp2, cqe_sel_mask1);
+ cqes[0] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[0],
+ (vector unsigned short)cqe_tmp1, cqe_sel_mask1);
+ cqe_tmp2 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos + p1].rsvd3[9], 0LL};
+ cqe_tmp1 = (vector unsigned char)(vector unsigned long){
+ *(__rte_aligned(8) unsigned long *)
+ &cq[pos].rsvd3[9], 0LL};
+ cqes[1] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[1],
+ (vector unsigned short)cqe_tmp2, cqe_sel_mask2);
+ cqes[0] = (vector unsigned char)
+ vec_sel((vector unsigned short)cqes[0],
+ (vector unsigned short)cqe_tmp1, cqe_sel_mask2);
+
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb1 = vec_perm(cqes[1], zero, shuf_mask);
+ pkt_mb0 = vec_perm(cqes[0], zero, shuf_mask);
+
+ /* C.3 adjust CRC length. */
+ pkt_mb1 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb1 -
+ (vector unsigned short)crc_adj);
+ pkt_mb0 = (vector unsigned char)
+ ((vector unsigned short)pkt_mb0 -
+ (vector unsigned short)crc_adj);
+
+ /* C.4 adjust flow mark. */
+ pkt_mb1 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb1 +
+ (vector unsigned int)flow_mark_adj);
+ pkt_mb0 = (vector unsigned char)
+ ((vector unsigned int)pkt_mb0 +
+ (vector unsigned int)flow_mark_adj);
+
+ /* E.1 extract op_own byte. */
+ op_own_tmp1 = (vector unsigned char)
+ vec_mergeh((vector unsigned int)cqes[0],
+ (vector unsigned int)cqes[1]);
+ op_own = (vector unsigned char)
+ vec_mergel((vector unsigned long)op_own_tmp1,
+ (vector unsigned long)op_own_tmp2);
+
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ *(vector unsigned char *)
+ &pkts[pos + 1]->pkt_len = pkt_mb1;
+ *(vector unsigned char *)
+ &pkts[pos]->pkt_len = pkt_mb0;
+
+ /* E.2 flip owner bit to mark CQEs from last round. */
+ owner_mask = (vector unsigned char)
+ vec_and((vector unsigned long)op_own,
+ (vector unsigned long)owner_check);
+ if (ownership)
+ owner_mask = (vector unsigned char)
+ vec_xor((vector unsigned long)owner_mask,
+ (vector unsigned long)owner_check);
+ owner_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)owner_mask,
+ (vector unsigned int)owner_check);
+ owner_mask = (vector unsigned char)
+ vec_packs((vector unsigned int)owner_mask,
+ (vector unsigned int)zero);
+
+ /* E.3 get mask for invalidated CQEs. */
+ opcode = (vector unsigned char)
+ vec_and((vector unsigned long)op_own,
+ (vector unsigned long)opcode_check);
+ invalid_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)opcode_check,
+ (vector unsigned int)opcode);
+ invalid_mask = (vector unsigned char)
+ vec_packs((vector unsigned int)invalid_mask,
+ (vector unsigned int)zero);
+
+ /* E.4 mask out beyond boundary. */
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)mask);
+
+ /* E.5 merge invalid_mask with invalid owner. */
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)owner_mask);
+
+ /* F.1 find compressed CQE format. */
+ comp_mask = (vector unsigned char)
+ vec_and((vector unsigned long)op_own,
+ (vector unsigned long)format_check);
+ comp_mask = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)comp_mask,
+ (vector unsigned int)format_check);
+ comp_mask = (vector unsigned char)
+ vec_packs((vector unsigned int)comp_mask,
+ (vector unsigned int)zero);
+
+ /* F.2 mask out invalid entries. */
+ comp_mask = (vector unsigned char)
+ vec_andc((vector unsigned long)comp_mask,
+ (vector unsigned long)invalid_mask);
+ comp_idx = ((vector unsigned long)comp_mask)[0];
+
+ /* F.3 get the first compressed CQE. */
+ comp_idx = comp_idx ? __builtin_ctzll(comp_idx) /
+ (sizeof(uint16_t) * 8) : MLX5_VPMD_DESCS_PER_LOOP;
+
+ /* E.6 mask out entries after the compressed CQE. */
+ mask = (vector unsigned char)(vector unsigned long){
+ (comp_idx * sizeof(uint16_t) * 8), 0};
+ lshift = vec_splat((vector unsigned long)mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)mask, shmask);
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)mask);
+
+ /* E.7 count non-compressed valid CQEs. */
+ n = ((vector unsigned long)invalid_mask)[0];
+ n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ nocmp_n += n;
+
+ /* D.2 get the final invalid mask. */
+ mask = (vector unsigned char)(vector unsigned long){
+ (n * sizeof(uint16_t) * 8), 0};
+ lshift = vec_splat((vector unsigned long)mask, 0);
+ shmask = vec_cmpgt(shmax, lshift);
+ mask = (vector unsigned char)
+ vec_sl((vector unsigned long)ones, lshift);
+ mask = (vector unsigned char)
+ vec_sel((vector unsigned long)shmask,
+ (vector unsigned long)mask, shmask);
+ invalid_mask = (vector unsigned char)
+ vec_or((vector unsigned long)invalid_mask,
+ (vector unsigned long)mask);
+
+ /* D.3 check error in opcode. */
+ opcode = (vector unsigned char)
+ vec_cmpeq((vector unsigned int)resp_err_check,
+ (vector unsigned int)opcode);
+ opcode = (vector unsigned char)
+ vec_packs((vector unsigned int)opcode,
+ (vector unsigned int)zero);
+ opcode = (vector unsigned char)
+ vec_andc((vector unsigned long)opcode,
+ (vector unsigned long)invalid_mask);
+
+ /* D.4 mark if any error is set */
+ *err |= ((vector unsigned long)opcode)[0];
+
+ /* D.5 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+ if (rxq->hw_timestamp) {
+ pkts[pos]->timestamp =
+ rte_be_to_cpu_64(cq[pos].timestamp);
+ pkts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p1].timestamp);
+ pkts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p2].timestamp);
+ pkts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p3].timestamp);
+ }
+ if (rxq->dynf_meta) {
+ uint64_t flag = rxq->flow_meta_mask;
+ int32_t offs = rxq->flow_meta_offset;
+ uint32_t metadata;
+
+ /* This code is subject to further optimization. */
+ metadata = cq[pos].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ metadata;
+ pkts[pos]->ol_flags |= metadata ? flag : 0ULL;
+ metadata = cq[pos + 1].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ metadata;
+ pkts[pos + 1]->ol_flags |= metadata ? flag : 0ULL;
+ metadata = cq[pos + 2].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ metadata;
+ pkts[pos + 2]->ol_flags |= metadata ? flag : 0ULL;
+ metadata = cq[pos + 3].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ metadata;
+ pkts[pos + 3]->ol_flags |= metadata ? flag : 0ULL;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = vec_perm(op_own, zero, len_shuf_mask);
+ byte_cnt = (vector unsigned char)
+ vec_andc((vector unsigned long)byte_cnt,
+ (vector unsigned long)invalid_mask);
+ left = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, lower_half);
+ right = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, upper_half);
+ byte_cnt = (vector unsigned char)vec_add(left, right);
+ left = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, lower_half);
+ right = vec_perm((vector unsigned short)byte_cnt,
+ (vector unsigned short)zero, upper_half);
+ byte_cnt = (vector unsigned char)vec_add(left, right);
+ rcvd_byte += ((vector unsigned long)byte_cnt)[0];
+#endif
+
+ /*
+ * Break the loop unless more valid CQE is expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+ *no_cq = true;
+ return rcvd_pkt;
+ }
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+ rq_ci, rxq->consumed_strd, false);
+ rxq->cq_ci += nocmp_n;
+ rxq->consumed_strd += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed =
+ rxq_cq_decompress_v(rxq, &cq[nocmp_n], &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
+ rte_compiler_barrier();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ if (rq_ci != rxq->rq_ci) {
+ rxq->rq_ci = rq_ci;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ }
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+}
+
#endif /* RTE_PMD_MLX5_RXTX_VEC_ALTIVEC_H_ */
@@ -54,6 +54,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
pkts[pos] = elts[pos];
}
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param n
+ * Number of packets to be stored.
+ * @param buf
+ * MPRQ buffer to get packets from.
+ * @param rq_ci
+ * WQE index.
+ * @param strd_idx
+ * Stride number.
+ * @param comp
+ * Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t n, struct mlx5_mprq_buf *buf,
+ uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+ const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const unsigned int strd_shift =
+ MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+ uint32_t offset;
+ void *addr;
+ int i = 0;
+
+ if (comp) {
+ const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ uint64x2_t mbp;
+
+ mbp = vld1q_u64((void *)&elts[pos +
+ rxq->consumed_strd]);
+ vst1q_u64((void *)&pkts[pos], mbp);
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos];
+ }
+
+ for (i = 0; i < n; ++i) {
+ offset = (strd_idx + i) * strd_sz + strd_shift;
+ addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+ if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+ rxq->mprq_repl == NULL) {
+ rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+ addr, pkts[i]->pkt_len);
+ } else {
+ rte_iova_t buf_iova;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ uint16_t buf_len = strd_sz;
+ void *buf_addr;
+ /* Increment the refcnt of the whole chunk. */
+ rte_atomic16_add_return(&buf->refcnt, 1);
+ MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+ strd_n + 1);
+ buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+ /*
+ * MLX5 device doesn't use iova but it is necessary in a
+ * case where the Rx packet is transmitted via a
+ * different PMD.
+ */
+ buf_iova = rte_mempool_virt2iova(buf) +
+ RTE_PTR_DIFF(buf_addr, buf);
+ shinfo = &buf->shinfos[strd_idx];
+ rte_mbuf_ext_refcnt_set(shinfo, 1);
+ /*
+ * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+ * attaching the stride to mbuf and more offload flags
+ * will be added below by calling rxq_cq_to_mbuf().
+ * Other fields will be overwritten.
+ */
+ rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+ buf_len, shinfo);
+ /* Set mbuf head-room. */
+ SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+ DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+ }
+ }
+}
+
/**
* Decompress a compressed completion and fill in mbufs in RX SW ring with data
* extracted from the title completion descriptor.
@@ -806,4 +895,492 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
return rcvd_pkt;
}
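+/**
+ * Replace the MPRQ buffer of a WQE with the stashed replacement buffer
+ * and refill the stash from the MPRQ mempool.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param rq_idx
+ *   WQE index of the buffer to be replaced.
+ * @param strd_n
+ *   Number of strides per MPRQ buffer.
+ */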
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+ const unsigned int strd_n)
+{
+ struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+ void *addr;
+
+ MLX5_ASSERT(rep != NULL);
+ /* Replace MPRQ buf. */
+ (*rxq->mprq_bufs)[rq_idx] = rep;
+ /* Replace WQE. */
+ addr = mlx5_mprq_buf_addr(rep, strd_n);
+ wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+ /* Stash a mbuf for next replacement. */
+ if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+ rxq->mprq_repl = rep;
+ else
+ rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive a burst of packets. An errored completion also consumes an mbuf, but
+ * the packet_type is set to RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to the application.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ * @param[out] err
+ * Pointer to a flag. Set non-zero value if pkts array has at least one error
+ * packet to handle.
+ * @param[out] no_cq
+ * Pointer to a boolean. Set true if no new CQE seen.
+ *
+ * @return
+ * Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const uint16_t q_n = 1 << rxq->cqe_n;
+ const uint16_t q_mask = q_n - 1;
+ const uint16_t e_n = 1 << rxq->elts_n;
+ const uint16_t e_mask = e_n - 1;
+ volatile struct mlx5_cqe *cq;
+ struct rte_mbuf **elts;
+ unsigned int pos;
+ uint64_t n;
+ uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+ uint16_t nocmp_n = 0;
+ uint16_t rcvd_pkt = 0;
+ unsigned int cq_ci = rxq->cq_ci;
+ unsigned int cq_idx = cq_ci & q_mask;
+ unsigned int rq_ci = rxq->rq_ci;
+ unsigned int rq_idx = rq_ci & e_mask;
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+ unsigned int elts_idx;
+ const uint16x4_t ownership = vdup_n_u16(!(rxq->cq_ci & (q_mask + 1)));
+ const uint16x4_t owner_check = vcreate_u16(0x0001000100010001);
+ const uint16x4_t opcode_check = vcreate_u16(0x00f000f000f000f0);
+ const uint16x4_t format_check = vcreate_u16(0x000c000c000c000c);
+ const uint16x4_t resp_err_check = vcreate_u16(0x00e000e000e000e0);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+#endif
+ /* Mask to generate 16B length vector. */
+ const uint8x8_t len_shuf_m = {
+ 52, 53, /* 4th CQE */
+ 36, 37, /* 3rd CQE */
+ 20, 21, /* 2nd CQE */
+ 4, 5 /* 1st CQE */
+ };
+ /* Mask to extract 16B data from a 64B CQE. */
+ const uint8x16_t cqe_shuf_m = {
+ 28, 29, /* hdr_type_etc */
+ 0, /* pkt_info */
+ -1, /* null */
+ 47, 46, /* byte_cnt, bswap16 */
+ 31, 30, /* vlan_info, bswap16 */
+ 15, 14, 13, 12, /* rx_hash_res, bswap32 */
+ 57, 58, 59, /* flow_tag */
+ 63 /* op_own */
+ };
+ /* Mask to generate 16B data for mbuf. */
+ const uint8x16_t mb_shuf_m = {
+ 4, 5, -1, -1, /* pkt_len */
+ 4, 5, /* data_len */
+ 6, 7, /* vlan_tci */
+ 8, 9, 10, 11, /* hash.rss */
+ 12, 13, 14, -1 /* hash.fdir.hi */
+ };
+ /* Mask to generate 16B owner vector. */
+ const uint8x8_t owner_shuf_m = {
+ 63, -1, /* 4th CQE */
+ 47, -1, /* 3rd CQE */
+ 31, -1, /* 2nd CQE */
+ 15, -1 /* 1st CQE */
+ };
+ /* Mask to generate a vector having packet_type/ol_flags. */
+ const uint8x16_t ptype_shuf_m = {
+ 48, 49, 50, -1, /* 4th CQE */
+ 32, 33, 34, -1, /* 3rd CQE */
+ 16, 17, 18, -1, /* 2nd CQE */
+ 0, 1, 2, -1 /* 1st CQE */
+ };
+ /* Mask to generate a vector having flow tags. */
+ const uint8x16_t ftag_shuf_m = {
+ 60, 61, 62, -1, /* 4th CQE */
+ 44, 45, 46, -1, /* 3rd CQE */
+ 28, 29, 30, -1, /* 2nd CQE */
+ 12, 13, 14, -1 /* 1st CQE */
+ };
+ const uint16x8_t crc_adj = {
+ 0, 0, rxq->crc_present * RTE_ETHER_CRC_LEN, 0, 0, 0, 0, 0
+ };
+ const uint32x4_t flow_mark_adj = { 0, 0, 0, rxq->mark * (-1) };
+
+ MLX5_ASSERT(rxq->sges_n == 0);
+ MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
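+ /*
+ * All strides of the current MPRQ buffer are consumed,
+ * advance to the next WQE and its buffer.
+ */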
+ if (rxq->consumed_strd == strd_n) {
+ /* Replace WQE only if the buffer is still in use. */
+ if (rte_atomic16_read(&buf->refcnt) > 1) {
+ mprq_buf_replace(rxq, rq_idx, strd_n);
+ /* Release the old buffer. */
+ mlx5_mprq_buf_free(buf);
+ } else if (unlikely(rxq->mprq_repl == NULL)) {
+ struct mlx5_mprq_buf *rep;
+
+ /*
+ * Currently, the MPRQ mempool is out of buffer
+ * and doing memcpy regardless of the size of Rx
+ * packet. Retry allocation to get back to
+ * normal.
+ */
+ if (!rte_mempool_get(rxq->mprq_mp,
+ (void **)&rep))
+ rxq->mprq_repl = rep;
+ }
+ /* Advance to the next WQE. */
+ rxq->consumed_strd = 0;
+ ++rq_ci;
+ rq_idx = rq_ci & e_mask;
+ buf = (*rxq->mprq_bufs)[rq_idx];
+ rxq->rq_repl_thresh = 1;
+ }
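+ /* Replenish SW ring mbufs for the new stride range if requested. */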
+ if (rxq->rq_repl_thresh)
+ mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_idx);
+
+ cq = &(*rxq->cqes)[cq_idx];
+ rte_prefetch_non_temporal(cq);
+ rte_prefetch_non_temporal(cq + 1);
+ rte_prefetch_non_temporal(cq + 2);
+ rte_prefetch_non_temporal(cq + 3);
+ elts_idx = (rq_ci & e_mask) * strd_n +
+ (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+ elts = &(*rxq->elts)[elts_idx];
+ pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+ /* See if there're unreturned mbufs from compressed CQE. */
+ rcvd_pkt = rxq->decompressed;
+ if (rcvd_pkt > 0) {
+ rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += rcvd_pkt;
+ pkts += rcvd_pkt;
+ rxq->decompressed -= rcvd_pkt;
+ }
+ /* Not to cross queue end. */
+ pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+ pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+ if (!pkts_n) {
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+ }
+ /* At this point, there shouldn't be any remaining packets. */
+ MLX5_ASSERT(rxq->decompressed == 0);
+ /*
+ * Note that vectors have reverse order - {v3, v2, v1, v0}, because
+ * there's no instruction to count trailing zeros. __builtin_clzl() is
+ * used instead.
+ *
+ * A. copy 4 mbuf pointers from elts ring to returning pkts.
+ * B. load 64B CQE and extract necessary fields
+ * Final 16bytes cqes[] extracted from original 64bytes CQE has the
+ * following structure:
+ * struct {
+ * uint16_t hdr_type_etc;
+ * uint8_t pkt_info;
+ * uint8_t rsvd;
+ * uint16_t byte_cnt;
+ * uint16_t vlan_info;
+ * uint32_t rx_hash_res;
+ * uint8_t flow_tag[3];
+ * uint8_t op_own;
+ * } c;
+ * C. fill in mbuf.
+ * D. get valid CQEs.
+ * E. find compressed CQE.
+ */
+ for (pos = 0;
+ pos < pkts_n;
+ pos += MLX5_VPMD_DESCS_PER_LOOP) {
+ uint16x4_t op_own;
+ uint16x4_t opcode, owner_mask, invalid_mask;
+ uint16x4_t comp_mask;
+ uint16x4_t mask;
+ uint16x4_t byte_cnt;
+ uint32x4_t ptype_info, flow_tag;
+ register uint64x2_t c0, c1, c2, c3;
+ uint8_t *p0, *p1, *p2, *p3;
+ uint8_t *e0 = (void *)&elts[pos + rxq->consumed_strd]->pkt_len;
+ uint8_t *e1 = (void *)&elts[pos +
+ rxq->consumed_strd + 1]->pkt_len;
+ uint8_t *e2 = (void *)&elts[pos +
+ rxq->consumed_strd + 2]->pkt_len;
+ uint8_t *e3 = (void *)&elts[pos +
+ rxq->consumed_strd + 3]->pkt_len;
+ void *elts_p = (void *)&elts[pos + rxq->consumed_strd];
+ void *pkts_p = (void *)&pkts[pos];
+
+ /* A.0 do not cross the end of CQ. */
+ mask = vcreate_u16(pkts_n - pos < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL >> ((pkts_n - pos) *
+ sizeof(uint16_t) * 8) : 0);
+ p0 = (void *)&cq[pos].pkt_info;
+ p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
+ p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
+ p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
+ /* B.0 (CQE 3) load a block having op_own. */
+ c3 = vld1q_u64((uint64_t *)(p3 + 48));
+ /* B.0 (CQE 2) load a block having op_own. */
+ c2 = vld1q_u64((uint64_t *)(p2 + 48));
+ /* B.0 (CQE 1) load a block having op_own. */
+ c1 = vld1q_u64((uint64_t *)(p1 + 48));
+ /* B.0 (CQE 0) load a block having op_own. */
+ c0 = vld1q_u64((uint64_t *)(p0 + 48));
+ /* Synchronize for loading the rest of blocks. */
+ rte_cio_rmb();
+ /* Prefetch next 4 CQEs. */
+ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+ unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
+ rte_prefetch_non_temporal(&cq[next]);
+ rte_prefetch_non_temporal(&cq[next + 1]);
+ rte_prefetch_non_temporal(&cq[next + 2]);
+ rte_prefetch_non_temporal(&cq[next + 3]);
+ }
+ __asm__ volatile (
+ /* B.1 (CQE 3) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
+ /* B.2 (CQE 3) move the block having op_own. */
+ "mov v19.16b, %[c3].16b \n\t"
+ /* B.3 (CQE 3) extract 16B fields. */
+ "tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 2) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
+ /* B.4 (CQE 3) adjust CRC length. */
+ "sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
+ /* C.1 (CQE 3) generate final structure for mbuf. */
+ "tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
+ /* B.2 (CQE 2) move the block having op_own. */
+ "mov v19.16b, %[c2].16b \n\t"
+ /* B.3 (CQE 2) extract 16B fields. */
+ "tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 1) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
+ /* B.4 (CQE 2) adjust CRC length. */
+ "sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
+ /* C.1 (CQE 2) generate final structure for mbuf. */
+ "tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
+ /* B.2 (CQE 1) move the block having op_own. */
+ "mov v19.16b, %[c1].16b \n\t"
+ /* B.3 (CQE 1) extract 16B fields. */
+ "tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.1 (CQE 0) load the rest of blocks. */
+ "ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
+ /* B.4 (CQE 1) adjust CRC length. */
+ "sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
+ /* C.1 (CQE 1) generate final structure for mbuf. */
+ "tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
+ /* B.2 (CQE 0) move the block having op_own. */
+ "mov v19.16b, %[c0].16b \n\t"
+ /* A.1 load mbuf pointers. */
+ "ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
+ /* B.3 (CQE 0) extract 16B fields. */
+ "tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
+ /* B.4 (CQE 0) adjust CRC length. */
+ "sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
+ /* D.1 extract op_own byte. */
+ "tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
+ /* C.2 (CQE 3) adjust flow mark. */
+ "add v15.4s, v15.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 3) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v15.2d}, [%[e3]] \n\t"
+ /* C.2 (CQE 2) adjust flow mark. */
+ "add v14.4s, v14.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 2) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v14.2d}, [%[e2]] \n\t"
+ /* C.1 (CQE 0) generate final structure for mbuf. */
+ "tbl v12.16b, {v20.16b}, %[mb_shuf_m].16b \n\t"
+ /* C.2 (CQE 1) adjust flow mark. */
+ "add v13.4s, v13.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 1) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v13.2d}, [%[e1]] \n\t"
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Extract byte_cnt. */
+ "tbl %[byte_cnt].8b, {v20.16b - v23.16b}, %[len_shuf_m].8b \n\t"
+#endif
+ /* Extract ptype_info. */
+ "tbl %[ptype_info].16b, {v20.16b - v23.16b}, %[ptype_shuf_m].16b \n\t"
+ /* Extract flow_tag. */
+ "tbl %[flow_tag].16b, {v20.16b - v23.16b}, %[ftag_shuf_m].16b \n\t"
+ /* A.2 copy mbuf pointers. */
+ "st1 {v24.2d - v25.2d}, [%[pkts_p]] \n\t"
+ /* C.2 (CQE 0) adjust flow mark. */
+ "add v12.4s, v12.4s, %[flow_mark_adj].4s \n\t"
+ /* C.3 (CQE 0) fill in mbuf - rx_descriptor_fields1. */
+ "st1 {v12.2d}, [%[e0]] \n\t"
+ :[op_own]"=&w"(op_own),
+ [byte_cnt]"=&w"(byte_cnt),
+ [ptype_info]"=&w"(ptype_info),
+ [flow_tag]"=&w"(flow_tag)
+ :[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
+ [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
+ [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
+ [elts_p]"r"(elts_p),
+ [pkts_p]"r"(pkts_p),
+ [cqe_shuf_m]"w"(cqe_shuf_m),
+ [mb_shuf_m]"w"(mb_shuf_m),
+ [owner_shuf_m]"w"(owner_shuf_m),
+ [len_shuf_m]"w"(len_shuf_m),
+ [ptype_shuf_m]"w"(ptype_shuf_m),
+ [ftag_shuf_m]"w"(ftag_shuf_m),
+ [crc_adj]"w"(crc_adj),
+ [flow_mark_adj]"w"(flow_mark_adj)
+ :"memory",
+ "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23",
+ "v24", "v25");
+ /* D.2 flip owner bit to mark CQEs from last round. */
+ owner_mask = vand_u16(op_own, owner_check);
+ owner_mask = vceq_u16(owner_mask, ownership);
+ /* D.3 get mask for invalidated CQEs. */
+ opcode = vand_u16(op_own, opcode_check);
+ invalid_mask = vceq_u16(opcode_check, opcode);
+ /* E.1 find compressed CQE format. */
+ comp_mask = vand_u16(op_own, format_check);
+ comp_mask = vceq_u16(comp_mask, format_check);
+ /* D.4 mask out beyond boundary. */
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.5 merge invalid_mask with invalid owner. */
+ invalid_mask = vorr_u16(invalid_mask, owner_mask);
+ /* E.2 mask out invalid entries. */
+ comp_mask = vbic_u16(comp_mask, invalid_mask);
+ /* E.3 get the first compressed CQE. */
+ comp_idx = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+ comp_mask), 0)) /
+ (sizeof(uint16_t) * 8);
+ /* D.6 mask out entries after the compressed CQE. */
+ mask = vcreate_u16(comp_idx < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL >> (comp_idx * sizeof(uint16_t) * 8) :
+ 0);
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.7 count non-compressed valid CQEs. */
+ n = __builtin_clzl(vget_lane_u64(vreinterpret_u64_u16(
+ invalid_mask), 0)) / (sizeof(uint16_t) * 8);
+ nocmp_n += n;
+ /* D.2 get the final invalid mask. */
+ mask = vcreate_u16(n < MLX5_VPMD_DESCS_PER_LOOP ?
+ -1UL >> (n * sizeof(uint16_t) * 8) : 0);
+ invalid_mask = vorr_u16(invalid_mask, mask);
+ /* D.3 check error in opcode. */
+ opcode = vceq_u16(resp_err_check, opcode);
+ opcode = vbic_u16(opcode, invalid_mask);
+ /* D.4 mark if any error is set */
+ *err |= vget_lane_u64(vreinterpret_u64_u16(opcode), 0);
+ /* C.4 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, ptype_info, flow_tag,
+ opcode, &elts[pos]);
+ if (rxq->hw_timestamp) {
+ elts[pos]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p0, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p1, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p2, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ elts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(
+ container_of(p3, struct mlx5_cqe,
+ pkt_info)->timestamp);
+ }
+ if (!!rxq->flow_meta_mask) {
+ /* This code is subject to further optimization. */
+ int32_t offs = rxq->flow_meta_offset;
+
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ container_of(p0, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ container_of(p1, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ container_of(p2, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ container_of(p3, struct mlx5_cqe,
+ pkt_info)->flow_table_metadata;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+ elts[pos]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+ elts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+ elts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+ elts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = vbic_u16(byte_cnt, invalid_mask);
+ rcvd_byte += vget_lane_u64(vpaddl_u32(vpaddl_u16(byte_cnt)), 0);
+#endif
+ /*
+ * Break the loop unless more valid CQEs are expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+ *no_cq = true;
+ return rcvd_pkt;
+ }
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+ rq_ci, rxq->consumed_strd, false);
+ rxq->cq_ci += nocmp_n;
+ rxq->consumed_strd += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+ &elts[nocmp_n]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
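+ /* Update the CQ doorbell record and, if the WQE advanced, the RQ one. */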
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ if (rq_ci != rxq->rq_ci) {
+ rxq->rq_ci = rq_ci;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ }
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+}
+
#endif /* RTE_PMD_MLX5_RXTX_VEC_NEON_H_ */
@@ -56,6 +56,95 @@ rxq_copy_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t n)
pkts[pos] = elts[pos];
}
+/**
+ * Copy or attach MPRQ buffers to RX SW ring.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param n
+ * Number of packets to be stored.
+ * @param buf
+ * MPRQ buffer to get packets from.
+ * @param rq_ci
+ * WQE index.
+ * @param strd_idx
+ * Stride number.
+ * @param comp
+ * Whether CQE is compressed or not.
+ */
+static inline void
+rxq_copy_mprq_mbuf_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t n, struct mlx5_mprq_buf *buf,
+ uint16_t rq_ci, uint16_t strd_idx, bool comp)
+{
+ const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const unsigned int strd_shift =
+ MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+ uint32_t offset;
+ void *addr;
+ int i = 0;
+
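+ /*
+ * For packets taken from a compressed CQE the mbuf pointers have not
+ * been copied from the SW ring yet, so copy them here first.
+ */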
+ if (comp) {
+ const uint16_t q_mask = (1 << rxq->cqe_n) - 1;
+ struct rte_mbuf **elts = &(*rxq->elts)[rq_ci * strd_n & q_mask];
+ unsigned int pos;
+ uint16_t p = n & -2;
+
+ for (pos = 0; pos < p; pos += 2) {
+ __m128i mbp;
+
+ mbp = _mm_loadu_si128((__m128i *)&elts[pos +
+ rxq->consumed_strd]);
+ _mm_storeu_si128((__m128i *)&pkts[pos], mbp);
+ }
+ if (n & 1)
+ pkts[pos] = elts[pos];
+ }
+
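+ /* Either copy the stride data into the mbuf or attach it as extbuf. */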
+ for (i = 0; i < n; ++i) {
+ offset = (strd_idx + i) * strd_sz + strd_shift;
+ addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+ if (pkts[i]->pkt_len <= rxq->mprq_max_memcpy_len ||
+ rxq->mprq_repl == NULL) {
+ rte_memcpy(rte_pktmbuf_mtod(pkts[i], void *),
+ addr, pkts[i]->pkt_len);
+ } else {
+ rte_iova_t buf_iova;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ uint16_t buf_len = strd_sz;
+ void *buf_addr;
+ /* Increment the refcnt of the whole chunk. */
+ rte_atomic16_add_return(&buf->refcnt, 1);
+ MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+ strd_n + 1);
+ buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
+ /*
+ * The MLX5 device does not use iova itself, but it is
+ * required in case the Rx packet is later transmitted
+ * via a different PMD.
+ */
+ buf_iova = rte_mempool_virt2iova(buf) +
+ RTE_PTR_DIFF(buf_addr, buf);
+ shinfo = &buf->shinfos[strd_idx];
+ rte_mbuf_ext_refcnt_set(shinfo, 1);
+ /*
+ * EXT_ATTACHED_MBUF will be set in pkt->ol_flags when
+ * attaching the stride to the mbuf; more offload flags
+ * are added when the CQE fields are converted to the mbuf.
+ * Other fields will be overwritten.
+ */
+ rte_pktmbuf_attach_extbuf(pkts[i], buf_addr, buf_iova,
+ buf_len, shinfo);
+ /* Set mbuf head-room. */
+ SET_DATA_OFF(pkts[i], RTE_PKTMBUF_HEADROOM);
+ DATA_LEN(pkts[i]) = pkts[i]->pkt_len;
+ }
+ }
+}
+
/**
* Decompress a compressed completion and fill in mbufs in RX SW ring with data
* extracted from the title completion descriptor.
@@ -753,4 +842,435 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
return rcvd_pkt;
}
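+/**
+ * Replace the MPRQ buffer of a WQE and stash a new replacement buffer.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param rq_idx
+ * Index of the WQE whose buffer is being replaced.
+ * @param strd_n
+ * Number of strides per MPRQ buffer.
+ */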
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+ const unsigned int strd_n)
+{
+ struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+ void *addr;
+
+ MLX5_ASSERT(rep != NULL);
+ /* Replace MPRQ buf. */
+ (*rxq->mprq_bufs)[rq_idx] = rep;
+ /* Replace WQE. */
+ addr = mlx5_mprq_buf_addr(rep, strd_n);
+ wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+ /* Stash a mbuf for next replacement. */
+ if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+ rxq->mprq_repl = rep;
+ else
+ rxq->mprq_repl = NULL;
+}
+
+/**
+ * Receive burst of packets. An errored completion also consumes a mbuf, but the
+ * packet_type is set to be RTE_PTYPE_ALL_MASK. Marked mbufs should be freed
+ * before returning to application.
+ *
+ * @param rxq
+ * Pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ * @param[out] err
+ * Pointer to a flag. Set non-zero value if pkts array has at least one error
+ * packet to handle.
+ * @param[out] no_cq
+ * Pointer to a boolean. Set true if no new CQE seen.
+ * @return
+ * Number of packets received including errors (<= pkts_n).
+ */
+static inline uint16_t
+rxq_burst_mprq_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts,
+ uint16_t pkts_n, uint64_t *err, bool *no_cq)
+{
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const uint16_t q_n = 1 << rxq->cqe_n;
+ const uint16_t q_mask = q_n - 1;
+ const uint16_t e_n = 1 << rxq->elts_n;
+ const uint16_t e_mask = e_n - 1;
+ volatile struct mlx5_cqe *cq;
+ struct rte_mbuf **elts;
+ unsigned int pos;
+ uint64_t n;
+ uint64_t comp_idx = MLX5_VPMD_DESCS_PER_LOOP;
+ uint16_t nocmp_n = 0;
+ uint16_t rcvd_pkt = 0;
+ unsigned int cq_ci = rxq->cq_ci;
+ unsigned int cq_idx = cq_ci & q_mask;
+ unsigned int rq_ci = rxq->rq_ci;
+ unsigned int rq_idx = rq_ci & e_mask;
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_idx];
+ unsigned int elts_idx;
+ unsigned int ownership = !!(rxq->cq_ci & (q_mask + 1));
+ const __m128i owner_check =
+ _mm_set_epi64x(0x0100000001000000LL, 0x0100000001000000LL);
+ const __m128i opcode_check =
+ _mm_set_epi64x(0xf0000000f0000000LL, 0xf0000000f0000000LL);
+ const __m128i format_check =
+ _mm_set_epi64x(0x0c0000000c000000LL, 0x0c0000000c000000LL);
+ const __m128i resp_err_check =
+ _mm_set_epi64x(0xe0000000e0000000LL, 0xe0000000e0000000LL);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ uint32_t rcvd_byte = 0;
+ /* Mask to shuffle byte_cnt to add up stats. Do bswap16 for all. */
+ const __m128i len_shuf_mask =
+ _mm_set_epi8(-1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 12, 13, 8, 9,
+ 4, 5, 0, 1);
+#endif
+ /* Mask to shuffle from extracted CQE to mbuf. */
+ const __m128i shuf_mask =
+ _mm_set_epi8(-1, 3, 2, 1, /* fdir.hi */
+ 12, 13, 14, 15, /* rss, bswap32 */
+ 10, 11, /* vlan_tci, bswap16 */
+ 4, 5, /* data_len, bswap16 */
+ -1, -1, /* zero out 2nd half of pkt_len */
+ 4, 5 /* pkt_len, bswap16 */);
+ /* Mask to blend from the last Qword to the first DQword. */
+ const __m128i blend_mask =
+ _mm_set_epi8(-1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, -1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ones = _mm_cmpeq_epi32(zero, zero);
+ const __m128i crc_adj =
+ _mm_set_epi16(0, 0, 0, 0, 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN,
+ 0,
+ rxq->crc_present * RTE_ETHER_CRC_LEN);
+ const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
+
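+ /* Scattered Rx (sges_n != 0) is not supported by the vectorized path. */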
+ MLX5_ASSERT(rxq->sges_n == 0);
+ MLX5_ASSERT(rxq->cqe_n == rxq->elts_n);
+
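+ /* All strides of the current MPRQ buffer are consumed, move to next WQE. */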
+ if (rxq->consumed_strd == strd_n) {
+ /* Replace WQE only if the buffer is still in use. */
+ if (rte_atomic16_read(&buf->refcnt) > 1) {
+ mprq_buf_replace(rxq, rq_ci & e_mask, strd_n);
+ /* Release the old buffer. */
+ mlx5_mprq_buf_free(buf);
+ } else if (unlikely(rxq->mprq_repl == NULL)) {
+ struct mlx5_mprq_buf *rep;
+
+ /*
+ * Currently, the MPRQ mempool is out of buffers,
+ * so memcpy is done regardless of the Rx packet
+ * size. Retry the allocation to get back to
+ * normal.
+ */
+ if (!rte_mempool_get(rxq->mprq_mp,
+ (void **)&rep))
+ rxq->mprq_repl = rep;
+ }
+ /* Advance to the next WQE. */
+ rxq->consumed_strd = 0;
+ ++rq_ci;
+ buf = (*rxq->mprq_bufs)[rq_ci & e_mask];
+ rxq->rq_repl_thresh = 1;
+ }
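+ /* Bulk-replenish the SW ring mbufs for the current WQE if needed. */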
+ if (rxq->rq_repl_thresh)
+ mlx5_rx_replenish_bulk_mprq_mbuf(rxq, strd_n, rq_ci & e_mask);
+
+ cq = &(*rxq->cqes)[cq_idx];
+ rte_prefetch0(cq);
+ rte_prefetch0(cq + 1);
+ rte_prefetch0(cq + 2);
+ rte_prefetch0(cq + 3);
+ elts_idx = (rq_ci & e_mask) * strd_n +
+ (rq_ci & e_mask) * MLX5_VPMD_DESCS_PER_LOOP;
+ elts = &(*rxq->elts)[elts_idx];
+ pkts_n = RTE_MIN(pkts_n, MLX5_VPMD_RX_MAX_BURST);
+ /* See if there're unreturned mbufs from compressed CQE. */
+ rcvd_pkt = rxq->decompressed;
+ if (rcvd_pkt > 0) {
+ rcvd_pkt = RTE_MIN(rcvd_pkt, pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, rcvd_pkt, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += rcvd_pkt;
+ rxq->decompressed -= rcvd_pkt;
+ pkts += rcvd_pkt;
+ }
+ /* Not to cross queue end. */
+ pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
+ pkts_n = RTE_MIN(pkts_n, strd_n - rxq->consumed_strd);
+ if (!pkts_n) {
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+ }
+ /* At this point, there shouldn't be any remaining packets. */
+ MLX5_ASSERT(rxq->decompressed == 0);
+ /*
+ * A. load first Qword (8bytes) in one loop.
+ * B. copy 4 mbuf pointers from elts ring to returning pkts.
+ * C. load remaining CQE data and extract necessary fields.
+ * The final 16 bytes of cqes[], extracted from the original 64-byte
+ * CQE, have the following structure:
+ * struct {
+ * uint8_t pkt_info;
+ * uint8_t flow_tag[3];
+ * uint16_t byte_cnt;
+ * uint8_t rsvd4;
+ * uint8_t op_own;
+ * uint16_t hdr_type_etc;
+ * uint16_t vlan_info;
+ * uint32_t rx_has_res;
+ * } c;
+ * D. fill in mbuf.
+ * E. get valid CQEs.
+ * F. find compressed CQE.
+ */
+ for (pos = 0;
+ pos < pkts_n;
+ pos += MLX5_VPMD_DESCS_PER_LOOP) {
+ __m128i cqes[MLX5_VPMD_DESCS_PER_LOOP];
+ __m128i cqe_tmp1, cqe_tmp2;
+ __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
+ __m128i op_own, op_own_tmp1, op_own_tmp2;
+ __m128i opcode, owner_mask, invalid_mask;
+ __m128i comp_mask;
+ __m128i mask;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ __m128i byte_cnt;
+#endif
+ __m128i mbp1, mbp2;
+ __m128i p = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0);
+ unsigned int p1, p2, p3;
+
+ /* Prefetch next 4 CQEs. */
+ if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 1]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 2]);
+ rte_prefetch0(&cq[pos + MLX5_VPMD_DESCS_PER_LOOP + 3]);
+ }
+ /* A.0 do not cross the end of CQ. */
+ mask = _mm_set_epi64x(0, (pkts_n - pos) * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ p = _mm_andnot_si128(mask, p);
+ /* A.1 load cqes. */
+ p3 = _mm_extract_epi16(p, 3);
+ cqes[3] = _mm_loadl_epi64((__m128i *)
+ &cq[pos + p3].sop_drop_qpn);
+ rte_compiler_barrier();
+ p2 = _mm_extract_epi16(p, 2);
+ cqes[2] = _mm_loadl_epi64((__m128i *)
+ &cq[pos + p2].sop_drop_qpn);
+ rte_compiler_barrier();
+ /* B.1 load mbuf pointers. */
+ mbp1 = _mm_loadu_si128((__m128i *)&elts[pos +
+ rxq->consumed_strd]);
+ mbp2 = _mm_loadu_si128((__m128i *)&elts[pos +
+ rxq->consumed_strd + 2]);
+ /* A.1 load a block having op_own. */
+ p1 = _mm_extract_epi16(p, 1);
+ cqes[1] = _mm_loadl_epi64((__m128i *)
+ &cq[pos + p1].sop_drop_qpn);
+ rte_compiler_barrier();
+ cqes[0] = _mm_loadl_epi64((__m128i *)
+ &cq[pos].sop_drop_qpn);
+ /* B.2 copy mbuf pointers. */
+ _mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
+ _mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
+ rte_cio_rmb();
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
+ cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
+ cqes[3] = _mm_blendv_epi8(cqes[3], cqe_tmp2, blend_mask);
+ cqes[2] = _mm_blendv_epi8(cqes[2], cqe_tmp1, blend_mask);
+ cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p3].csum);
+ cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos + p2].csum);
+ cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x30);
+ cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x30);
+ cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p3].rsvd4[2]);
+ cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos + p2].rsvd4[2]);
+ cqes[3] = _mm_blend_epi16(cqes[3], cqe_tmp2, 0x04);
+ cqes[2] = _mm_blend_epi16(cqes[2], cqe_tmp1, 0x04);
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb3 = _mm_shuffle_epi8(cqes[3], shuf_mask);
+ pkt_mb2 = _mm_shuffle_epi8(cqes[2], shuf_mask);
+ /* C.3 adjust CRC length. */
+ pkt_mb3 = _mm_sub_epi16(pkt_mb3, crc_adj);
+ pkt_mb2 = _mm_sub_epi16(pkt_mb2, crc_adj);
+ /* C.4 adjust flow mark. */
+ pkt_mb3 = _mm_add_epi32(pkt_mb3, flow_mark_adj);
+ pkt_mb2 = _mm_add_epi32(pkt_mb2, flow_mark_adj);
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ _mm_storeu_si128((void *)&pkts[pos + 3]->pkt_len, pkt_mb3);
+ _mm_storeu_si128((void *)&pkts[pos + 2]->pkt_len, pkt_mb2);
+ /* E.1 extract op_own field. */
+ op_own_tmp2 = _mm_unpacklo_epi32(cqes[2], cqes[3]);
+ /* C.1 load remaining CQE data and extract necessary fields. */
+ cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p1]);
+ cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos]);
+ cqes[1] = _mm_blendv_epi8(cqes[1], cqe_tmp2, blend_mask);
+ cqes[0] = _mm_blendv_epi8(cqes[0], cqe_tmp1, blend_mask);
+ cqe_tmp2 = _mm_loadu_si128((__m128i *)&cq[pos + p1].csum);
+ cqe_tmp1 = _mm_loadu_si128((__m128i *)&cq[pos].csum);
+ cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x30);
+ cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x30);
+ cqe_tmp2 = _mm_loadl_epi64((__m128i *)&cq[pos + p1].rsvd4[2]);
+ cqe_tmp1 = _mm_loadl_epi64((__m128i *)&cq[pos].rsvd4[2]);
+ cqes[1] = _mm_blend_epi16(cqes[1], cqe_tmp2, 0x04);
+ cqes[0] = _mm_blend_epi16(cqes[0], cqe_tmp1, 0x04);
+ /* C.2 generate final structure for mbuf with swapping bytes. */
+ pkt_mb1 = _mm_shuffle_epi8(cqes[1], shuf_mask);
+ pkt_mb0 = _mm_shuffle_epi8(cqes[0], shuf_mask);
+ /* C.3 adjust CRC length. */
+ pkt_mb1 = _mm_sub_epi16(pkt_mb1, crc_adj);
+ pkt_mb0 = _mm_sub_epi16(pkt_mb0, crc_adj);
+ /* C.4 adjust flow mark. */
+ pkt_mb1 = _mm_add_epi32(pkt_mb1, flow_mark_adj);
+ pkt_mb0 = _mm_add_epi32(pkt_mb0, flow_mark_adj);
+ /* E.1 extract op_own byte. */
+ op_own_tmp1 = _mm_unpacklo_epi32(cqes[0], cqes[1]);
+ op_own = _mm_unpackhi_epi64(op_own_tmp1, op_own_tmp2);
+ /* D.1 fill in mbuf - rx_descriptor_fields1. */
+ _mm_storeu_si128((void *)&pkts[pos + 1]->pkt_len, pkt_mb1);
+ _mm_storeu_si128((void *)&pkts[pos]->pkt_len, pkt_mb0);
+ /* E.2 flip owner bit to mark CQEs from last round. */
+ owner_mask = _mm_and_si128(op_own, owner_check);
+ if (ownership)
+ owner_mask = _mm_xor_si128(owner_mask, owner_check);
+ owner_mask = _mm_cmpeq_epi32(owner_mask, owner_check);
+ owner_mask = _mm_packs_epi32(owner_mask, zero);
+ /* E.3 get mask for invalidated CQEs. */
+ opcode = _mm_and_si128(op_own, opcode_check);
+ invalid_mask = _mm_cmpeq_epi32(opcode_check, opcode);
+ invalid_mask = _mm_packs_epi32(invalid_mask, zero);
+ /* E.4 mask out beyond boundary. */
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* E.5 merge invalid_mask with invalid owner. */
+ invalid_mask = _mm_or_si128(invalid_mask, owner_mask);
+ /* F.1 find compressed CQE format. */
+ comp_mask = _mm_and_si128(op_own, format_check);
+ comp_mask = _mm_cmpeq_epi32(comp_mask, format_check);
+ comp_mask = _mm_packs_epi32(comp_mask, zero);
+ /* F.2 mask out invalid entries. */
+ comp_mask = _mm_andnot_si128(invalid_mask, comp_mask);
+ comp_idx = _mm_cvtsi128_si64(comp_mask);
+ /* F.3 get the first compressed CQE. */
+ comp_idx = comp_idx ?
+ __builtin_ctzll(comp_idx) /
+ (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ /* E.6 mask out entries after the compressed CQE. */
+ mask = _mm_set_epi64x(0, comp_idx * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* E.7 count non-compressed valid CQEs. */
+ n = _mm_cvtsi128_si64(invalid_mask);
+ n = n ? __builtin_ctzll(n) / (sizeof(uint16_t) * 8) :
+ MLX5_VPMD_DESCS_PER_LOOP;
+ nocmp_n += n;
+ /* D.2 get the final invalid mask. */
+ mask = _mm_set_epi64x(0, n * sizeof(uint16_t) * 8);
+ mask = _mm_sll_epi64(ones, mask);
+ invalid_mask = _mm_or_si128(invalid_mask, mask);
+ /* D.3 check error in opcode. */
+ opcode = _mm_cmpeq_epi32(resp_err_check, opcode);
+ opcode = _mm_packs_epi32(opcode, zero);
+ opcode = _mm_andnot_si128(invalid_mask, opcode);
+ /* D.4 mark if any error is set */
+ *err |= _mm_cvtsi128_si64(opcode);
+ /* D.5 fill in mbuf - rearm_data and packet_type. */
+ rxq_cq_to_ptype_oflags_v(rxq, cqes, opcode, &pkts[pos]);
+ if (rxq->hw_timestamp) {
+ pkts[pos]->timestamp =
+ rte_be_to_cpu_64(cq[pos].timestamp);
+ pkts[pos + 1]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p1].timestamp);
+ pkts[pos + 2]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p2].timestamp);
+ pkts[pos + 3]->timestamp =
+ rte_be_to_cpu_64(cq[pos + p3].timestamp);
+ }
+ if (rxq->dynf_meta) {
+ /* This code is subject to further optimization. */
+ int32_t offs = rxq->flow_meta_offset;
+
+ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) =
+ cq[pos].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) =
+ cq[pos + p1].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) =
+ cq[pos + p2].flow_table_metadata;
+ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) =
+ cq[pos + p3].flow_table_metadata;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *))
+ pkts[pos]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *))
+ pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *))
+ pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask;
+ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *))
+ pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask;
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Add up received bytes count. */
+ byte_cnt = _mm_shuffle_epi8(op_own, len_shuf_mask);
+ byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
+ byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
+ rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
+#endif
+ /*
+ * Break the loop unless more valid CQEs are expected, or if
+ * there's a compressed CQE.
+ */
+ if (n != MLX5_VPMD_DESCS_PER_LOOP)
+ break;
+ }
+ /* If no new CQE seen, return without updating cq_db. */
+ if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+ *no_cq = true;
+ return rcvd_pkt;
+ }
+ /* Update the consumer indexes for non-compressed CQEs. */
+ MLX5_ASSERT(nocmp_n <= pkts_n);
+ rxq_copy_mprq_mbuf_v(rxq, pkts, nocmp_n, buf,
+ rq_ci, rxq->consumed_strd, false);
+ rxq->cq_ci += nocmp_n;
+ rxq->consumed_strd += nocmp_n;
+ rcvd_pkt += nocmp_n;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ rxq->stats.ipackets += nocmp_n;
+ rxq->stats.ibytes += rcvd_byte;
+#endif
+ /* Decompress the last CQE if compressed. */
+ if (comp_idx < MLX5_VPMD_DESCS_PER_LOOP && comp_idx == n) {
+ MLX5_ASSERT(comp_idx == (nocmp_n % MLX5_VPMD_DESCS_PER_LOOP));
+ rxq->decompressed = rxq_cq_decompress_v(rxq, &cq[nocmp_n],
+ &elts[nocmp_n + rxq->consumed_strd]);
+ /* Return more packets if needed. */
+ if (nocmp_n < pkts_n) {
+ uint16_t n = rxq->decompressed;
+
+ n = RTE_MIN(n, pkts_n - nocmp_n);
+ rxq_copy_mprq_mbuf_v(rxq, &pkts[nocmp_n], n, buf,
+ rq_ci, rxq->consumed_strd, true);
+ rxq->consumed_strd += n;
+ rcvd_pkt += n;
+ rxq->decompressed -= n;
+ }
+ }
+
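+ /* Update the CQ doorbell record and, if the WQE advanced, the RQ one. */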
+ rte_compiler_barrier();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ if (rq_ci != rxq->rq_ci) {
+ rxq->rq_ci = rq_ci;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ }
+ *no_cq = !rcvd_pkt;
+ return rcvd_pkt;
+}
+
#endif /* RTE_PMD_MLX5_RXTX_VEC_SSE_H_ */