From patchwork Thu Aug 3 08:49:14 2017
X-Patchwork-Submitter: Vasily Philipov
X-Patchwork-Id: 27399
X-Patchwork-Delegate: ferruh.yigit@amd.com
From: Vasily Philipov
To: dev@dpdk.org
Cc: Vasily Philipov, Adrien Mazarguil, Nelio Laranjeiro
Date: Thu, 3 Aug 2017 11:49:14 +0300
X-Mailer: git-send-email 1.8.3.1
Subject: [dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality

Access the hardware directly on the Rx fast path instead of going
through Verbs calls. The number of scatter entries is now computed on
the fly, according to the maximum expected packet size.

Signed-off-by: Vasily Philipov
---
The series depends on:

http://dpdk.org/dev/patchwork/patch/27313/

A standalone sketch of the scatter-entry computation follows the diff
below.
---
 drivers/net/mlx4/mlx4.h       |   3 +
 drivers/net/mlx4/mlx4_prm.h   | 405 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_rxq.c   | 205 ++++++++++-----------
 drivers/net/mlx4/mlx4_rxtx.c  | 266 ++++++++++++++++-----------
 drivers/net/mlx4/mlx4_rxtx.h  |  18 +-
 drivers/net/mlx4/mlx4_utils.h |  20 +++
 6 files changed, 688 insertions(+), 229 deletions(-)
 create mode 100644 drivers/net/mlx4/mlx4_prm.h

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 1cd4db3..4b7f98b 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -42,6 +42,7 @@
 #pragma GCC diagnostic ignored "-Wpedantic"
 #endif
 #include
+#include "mlx4_prm.h"
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
@@ -57,6 +58,8 @@
 /* Maximum size for inline data. */
 #define MLX4_PMD_MAX_INLINE 0
 
+#include
+
 /*
  * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
  * from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 0000000..03c1192
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,405 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of 6WIND S.A. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef RTE_PMD_MLX4_PRM_H_ +#define RTE_PMD_MLX4_PRM_H_ + +#include + +#include +#include +#include + +#define MLX4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) +#if MLX4_GCC_VERSION >= 403 +# define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64))) +# define __MLX4_ALGN_DATA__ __attribute__((aligned(64))) +#else +# define __MLX4_ALGN_FUNC__ +# define __MLX4_ALGN_DATA__ +#endif + +/* Maximum number of physical ports. */ +#define MLX4_PMD_MAX_PHYS_PORTS 2 + +/* Generic macro to convert MLX4 to IBV flags. */ +#define MLX4_TRANSPOSE(val, from, to) \ + (((from) >= (to)) ? 
\ + (((val) & (from)) / ((from) / (to))) : \ + (((val) & (from)) * ((to) / (from)))) + +struct list_head { + struct list_head *next, *prev; +}; + +enum { + MLX4_INVALID_LKEY = 0x100, +}; + +enum { + MLX4_MAX_BFS_IN_PAGE = 8, + MLX4_BFS_STRIDE = 512, +}; + +enum { + MLX4_CQE_L2_TUNNEL_IPV4 = 1U << 25, + MLX4_CQE_L2_TUNNEL_L4_CSUM = 1U << 26, + MLX4_CQE_L2_TUNNEL = 1U << 27, + MLX4_CQE_VLAN_PRESENT_MASK = 1U << 29, + MLX4_CQE_L2_TUNNEL_IPOK = 1U << 31, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { + MLX4_QP_TABLE_BITS = 8, + MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS, + MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 +}; + +enum { + MLX4_XSRQ_TABLE_BITS = 8, + MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS, + MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1 +}; + +struct mlx4_wqe_data_seg { + uint32_t byte_count; + uint32_t lkey; + uint64_t addr; +}; + +struct mlx4_xsrq_table { + struct { + struct mlx4_srq **table; + int refcnt; + } xsrq_table[MLX4_XSRQ_TABLE_SIZE]; + pthread_mutex_t mutex; + int num_xsrq; + int shift; + int mask; +}; + +enum qp_cap_cache { + MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1, + MLX4_RX_VXLAN = 1 << 2 +}; + +enum mlx4_db_type { + MLX4_DB_TYPE_CQ, + MLX4_DB_TYPE_RQ, + MLX4_NUM_DB_TYPE, +}; + +enum mlx4_lock_type { + MLX4_SPIN_LOCK = 0, + MLX4_MUTEX = 1, +}; + +enum mlx4_lock_state { + MLX4_USE_LOCK, + MLX4_LOCKED, + MLX4_UNLOCKED +}; + +struct mlx4_spinlock { + pthread_spinlock_t lock; + enum mlx4_lock_state state; +}; + +struct mlx4_lock { + pthread_mutex_t mutex; + pthread_spinlock_t slock; + enum mlx4_lock_state state; + enum mlx4_lock_type type; +}; + +/* struct for BF dedicated for one QP */ +struct mlx4_dedic_bf { + void *address; +}; + +/* struct for the common BF which may be shared by many QPs */ +struct mlx4_cmn_bf { + void *address; + /* + * Protect usage of BF address field including data written + * to the BF and the BF buffer toggling. 
+ */ + struct mlx4_lock lock; +}; + +union mlx4_bf { + struct mlx4_dedic_bf dedic; + struct mlx4_cmn_bf cmn; +}; + +struct mlx4_bfs_data { + struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1]; + struct mlx4_cmn_bf cmn_bf; + uint8_t dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1]; + uint8_t dedic_bf_free; + /* + * protect dedicated BFs managing + * including dedic_bf_used and + * dedic_bf_free fields + */ + struct mlx4_spinlock dedic_bf_lock; + void *page; + uint16_t buf_size; + uint8_t num_dedic_bfs; +}; + +struct mlx4_db_page; + +struct mlx4_context { + union { + struct ibv_context ibv_ctx; + }; + /* protects send_db_list and send_db_num_uars */ + struct mlx4_spinlock send_db_lock; + struct list_head send_db_list; + unsigned int send_db_num_uars; + void *uar; + struct mlx4_spinlock uar_lock; + struct mlx4_bfs_data bfs; + int bf_regs_per_page; + int max_ctx_res_domain; + struct { + struct mlx4_qp **table; + int refcnt; + } qp_table[MLX4_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + int num_qps; + int qp_table_shift; + int qp_table_mask; + int max_qp_wr; + int max_sge; + int max_cqe; + uint64_t exp_device_cap_flags; + struct { + int offset; + int mult; + int shift; + uint64_t mask; + } core_clk; + void *hca_core_clock; + struct mlx4_xsrq_table xsrq_table; + struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; + pthread_mutex_t db_list_mutex; + int cqe_size; + int prefer_bf; + struct mlx4_spinlock hugetlb_lock; + struct list_head hugetlb_list; + int stall_enable; + pthread_mutex_t task_mutex; + struct { + uint8_t valid; + uint8_t link_layer; + enum ibv_port_cap_flags caps; + } port_query_cache[MLX4_PMD_MAX_PHYS_PORTS]; + pthread_mutex_t env_mtx; + int env_initialized; +}; + +struct mlx4_buf { + void *buf; + void *hmem; + size_t length; + int base; +}; + +struct mlx4_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; +}; + +struct mlx4_cq { + struct ibv_cq ibv_cq __MLX4_ALGN_DATA__; + uint32_t pattern; + struct mlx4_buf buf; + struct mlx4_buf resize_buf; + struct mlx4_lock lock; + uint32_t cqn; + uint32_t cons_index; + uint32_t wait_index; + uint32_t wait_count; + uint32_t *set_ci_db; + uint32_t *arm_db; + int arm_sn; + int stall_next_poll; + int stall_enable; + int cqe_size; + int creation_flags; + struct mlx4_qp *last_qp; + uint32_t model_flags; /* use mlx4_cq_model_flags */ +}; + +struct mlx4_wq { + uint64_t *wrid; + struct mlx4_lock lock; + int wqe_cnt; + unsigned max_post; + char *buf; + unsigned head; + unsigned tail; + int max_gs; + int wqe_shift; + unsigned head_en_index; + unsigned head_en_count; +}; + +struct mlx4_inlr_rbuff { + void *rbuff; + int rlen; +}; + +struct mlx4_inlr_sg_list { + struct mlx4_inlr_rbuff *sg_list; + int list_len; +}; + +struct mlx4_inlr_buff { + struct mlx4_inlr_sg_list *buff; + int len; +}; + +struct mlx4_qp { + struct verbs_qp verbs_qp; + uint32_t pattern; + int buf_size; + uint32_t model_flags; /* use mlx4_qp_model_flags */ + /* hot post send data */ + struct mlx4_wq sq __MLX4_ALGN_DATA__; + int (*post_send_one)(struct ibv_send_wr *wr, + struct mlx4_qp *qp, + void *wqe, int *total_size, + int *inl, unsigned int ind); + union mlx4_bf *bf; + uint32_t *sdb; /* send DB */ + struct mlx4_buf buf; + unsigned last_db_head; + uint32_t doorbell_qpn; + uint32_t create_flags; + uint16_t max_inline_data; + uint16_t bf_buf_size; + uint16_t sq_spare_wqes; + uint8_t srcrb_flags_tbl[16]; + uint8_t db_method; + uint8_t qp_type; + /* RAW_PACKET hot data */ + uint8_t link_layer; + uint8_t is_masked_atomic; + /* post receive hot data */ + struct mlx4_wq rq __MLX4_ALGN_DATA__; + 
uint32_t *db; + uint32_t max_inlr_sg; + int32_t cached_rx_csum_flags; + int32_t transposed_rx_csum_flags; + struct mlx4_inlr_buff inlr_buff; + uint8_t qp_cap_cache; +}; + +struct mlx4_cqe { + uint32_t vlan_my_qpn; + uint32_t immed_rss_invalid; + uint32_t g_mlpath_rqpn; + union { + struct { + union { + struct { + uint16_t sl_vid; + uint16_t rlid; + }; + uint32_t timestamp_16_47; + }; + uint16_t status; + uint8_t reserved2; + uint8_t badfcs_enc; + }; + struct { + uint16_t reserved4; + uint8_t smac[6]; + }; + }; + uint32_t byte_cnt; + uint16_t wqe_index; + uint16_t checksum; + uint8_t reserved5[1]; + uint16_t timestamp_0_15; + uint8_t owner_sr_opcode; +} __attribute__((packed)); + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_INL_SCATTER_MASK = 0x20, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +enum { + MLX4_CQE_STATUS_L4_CSUM = 1 << 2, + MLX4_CQE_STATUS_IPV4 = 1 << 6, + MLX4_CQE_STATUS_IPV4F = 1 << 7, + MLX4_CQE_STATUS_IPV6 = 1 << 8, + MLX4_CQE_STATUS_IPV4OPT = 1 << 9, + MLX4_CQE_STATUS_TCP = 1 << 10, + MLX4_CQE_STATUS_UDP = 1 << 11, + MLX4_CQE_STATUS_IPOK = 1 << 12 +}; + +#define to_mxxx(xxx, type) \ + ((struct mlx4_##type *) \ + ((uint8_t *)ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx))) + +static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) +{ + return to_mxxx(ctx, context); +} + +static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) +{ + return to_mxxx(cq, cq); +} + +static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) +{ + return container_of(container_of(ibqp, struct verbs_qp, qp), + struct mlx4_qp, verbs_qp); +} + +#endif /* RTE_PMD_MLX4_PRM_H_ */ diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c index 1456b5f..bbe9c89 100644 --- a/drivers/net/mlx4/mlx4_rxq.c +++ b/drivers/net/mlx4/mlx4_rxq.c @@ -78,103 +78,73 @@ */ static int mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, - struct rte_mbuf **pool) + struct rte_mbuf *(*pool)[]) { - unsigned int i; - struct rxq_elt (*elts)[elts_n] = - rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, - rxq->socket); + unsigned int i = 0; + const unsigned int sge_n = 1 << rxq->sge_n; + struct rte_mbuf *(*elts)[elts_n] = + rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, rxq->socket); if (elts == NULL) { rte_errno = ENOMEM; ERROR("%p: can't allocate packets array", (void *)rxq); goto error; } - /* For each WR (packet). */ - for (i = 0; (i != elts_n); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct ibv_recv_wr *wr = &elt->wr; - struct ibv_sge *sge = &(*elts)[i].sge; + rxq->elts = elts; + for (; i != elts_n; ++i) { struct rte_mbuf *buf; + volatile struct mlx4_wqe_data_seg *scat = + &(*rxq->hw.wqes)[i]; if (pool != NULL) { - buf = *(pool++); + buf = (*pool)[i]; assert(buf != NULL); rte_pktmbuf_reset(buf); - } else { + rte_pktmbuf_refcnt_update(buf, 1); + } else buf = rte_pktmbuf_alloc(rxq->mp); - } if (buf == NULL) { rte_errno = ENOMEM; assert(pool == NULL); ERROR("%p: empty mbuf pool", (void *)rxq); goto error; } - /* - * Configure WR. Work request ID contains its own index in - * the elts array and the offset between SGE buffer header and - * its data. - */ - WR_ID(wr->wr_id).id = i; - WR_ID(wr->wr_id).offset = - (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) - - (uintptr_t)buf); - wr->next = &(*elts)[(i + 1)].wr; - wr->sg_list = sge; - wr->num_sge = 1; /* Headroom is reserved by rte_pktmbuf_alloc(). 
*/ assert(buf->data_off == RTE_PKTMBUF_HEADROOM); /* Buffer is supposed to be empty. */ assert(rte_pktmbuf_data_len(buf) == 0); assert(rte_pktmbuf_pkt_len(buf) == 0); - /* sge->addr must be able to store a pointer. */ - assert(sizeof(sge->addr) >= sizeof(uintptr_t)); - /* SGE keeps its headroom. */ - sge->addr = (uintptr_t) - ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); - sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); - sge->lkey = rxq->mr->lkey; - /* Redundant check for tailroom. */ - assert(sge->length == rte_pktmbuf_tailroom(buf)); - /* - * Make sure elts index and SGE mbuf pointer can be deduced - * from WR ID. - */ - if ((WR_ID(wr->wr_id).id != i) || - ((void *)((uintptr_t)sge->addr - - WR_ID(wr->wr_id).offset) != buf)) { - rte_errno = EOVERFLOW; - ERROR("%p: cannot store index and offset in WR ID", - (void *)rxq); - sge->addr = 0; - rte_pktmbuf_free(buf); - goto error; - } + assert(!buf->next); + /* Only the first segment keeps headroom. */ + if (i % sge_n) + buf->data_off = 0; + buf->port = rxq->port_id; + buf->data_len = rte_pktmbuf_tailroom(buf); + buf->pkt_len = rte_pktmbuf_tailroom(buf); + buf->nb_segs = 1; + /* scat->addr must be able to store a pointer. */ + assert(sizeof(scat->addr) >= sizeof(uintptr_t)); + *scat = (struct mlx4_wqe_data_seg){ + .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)), + .byte_count = htonl(buf->data_len), + .lkey = htonl(rxq->mr->lkey), + }; + (*rxq->elts)[i] = buf; } - /* The last WR pointer must be NULL. */ - (*elts)[(i - 1)].wr.next = NULL; - DEBUG("%p: allocated and configured %u single-segment WRs", - (void *)rxq, elts_n); - rxq->elts_n = elts_n; - rxq->elts_head = 0; - rxq->elts = elts; + DEBUG("%p: allocated and configured %u segments (max %u packets)", + (void *)rxq, elts_n, elts_n >> rxq->sge_n); + rxq->elts_n = log2above(elts_n); return 0; error: - if (elts != NULL) { - assert(pool == NULL); - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf; - - if (elt->sge.addr == 0) - continue; - assert(WR_ID(elt->wr.wr_id).id == i); - buf = (void *)((uintptr_t)elt->sge.addr - - WR_ID(elt->wr.wr_id).offset); - rte_pktmbuf_free_seg(buf); - } - rte_free(elts); + assert(pool == NULL); + elts_n = i; + for (i = 0; i != elts_n; ++i) { + if ((*rxq->elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq->elts)[i]); + (*rxq->elts)[i] = NULL; } + rte_free(rxq->elts); + rxq->elts = NULL; DEBUG("%p: failed, freed everything", (void *)rxq); assert(rte_errno > 0); return -rte_errno; @@ -190,26 +160,17 @@ mlx4_rxq_free_elts(struct rxq *rxq) { unsigned int i; - unsigned int elts_n = rxq->elts_n; - struct rxq_elt (*elts)[elts_n] = rxq->elts; DEBUG("%p: freeing WRs", (void *)rxq); - rxq->elts_n = 0; - rxq->elts = NULL; - if (elts == NULL) + if (rxq->elts == NULL) return; - for (i = 0; (i != RTE_DIM(*elts)); ++i) { - struct rxq_elt *elt = &(*elts)[i]; - struct rte_mbuf *buf; - if (elt->sge.addr == 0) - continue; - assert(WR_ID(elt->wr.wr_id).id == i); - buf = (void *)((uintptr_t)elt->sge.addr - - WR_ID(elt->wr.wr_id).offset); - rte_pktmbuf_free_seg(buf); + for (i = 0; i != (1u << rxq->elts_n); ++i) { + if ((*rxq->elts)[i] != NULL) + rte_pktmbuf_free_seg((*rxq->elts)[i]); + (*rxq->elts)[i] = NULL; } - rte_free(elts); + rte_free(rxq->elts); } /** @@ -251,7 +212,8 @@ * QP pointer or NULL in case of error and rte_errno is set. 
*/ static struct ibv_qp * -mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc) +mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, + uint16_t desc, unsigned int sge_n) { struct ibv_qp *qp; struct ibv_qp_init_attr attr = { @@ -265,7 +227,7 @@ priv->device_attr.max_qp_wr : desc), /* Max number of scatter/gather elements in a WR. */ - .max_recv_sge = 1, + .max_recv_sge = sge_n, }, .qp_type = IBV_QPT_RAW_PACKET, }; @@ -307,26 +269,34 @@ .socket = socket }; struct ibv_qp_attr mod; - struct ibv_recv_wr *bad_wr; unsigned int mb_len; int ret; (void)conf; /* Thresholds configuration (ignored). */ mb_len = rte_pktmbuf_data_room_size(mp); - if (desc == 0) { - rte_errno = EINVAL; - ERROR("%p: invalid number of RX descriptors", (void *)dev); - goto error; - } /* Enable scattered packets support for this queue if necessary. */ assert(mb_len >= RTE_PKTMBUF_HEADROOM); if (dev->data->dev_conf.rxmode.max_rx_pkt_len <= (mb_len - RTE_PKTMBUF_HEADROOM)) { - ; + tmpl.sge_n = 0; } else if (dev->data->dev_conf.rxmode.enable_scatter) { - WARN("%p: scattered mode has been requested but is" - " not supported, this may lead to packet loss", - (void *)dev); + unsigned int sges_n; + unsigned int rx_pkt_len = + dev->data->dev_conf.rxmode.jumbo_frame ? + dev->data->dev_conf.rxmode.max_rx_pkt_len : + ETHER_MTU; + + if (rx_pkt_len < ETHER_MTU) + rx_pkt_len = ETHER_MTU; + /* Only the first mbuf has a headroom */ + rx_pkt_len = rx_pkt_len - mb_len + RTE_PKTMBUF_HEADROOM; + /* + * Determine the number of SGEs needed for a full packet + * and round it to the next power of two. + */ + sges_n = (rx_pkt_len / mb_len) + !!(rx_pkt_len / mb_len) + 1; + tmpl.sge_n = log2above(sges_n); + desc >>= tmpl.sge_n; } else { WARN("%p: the requested maximum Rx packet size (%u) is" " larger than a single mbuf (%u) and scattered" @@ -335,6 +305,8 @@ dev->data->dev_conf.rxmode.max_rx_pkt_len, mb_len - RTE_PKTMBUF_HEADROOM); } + DEBUG("%p: number of sges %u (%u WRs)", + (void *)dev, 1 << tmpl.sge_n, desc); /* Use the entire RX mempool as the memory region. */ tmpl.mr = mlx4_mp2mr(priv->pd, mp); if (tmpl.mr == NULL) { @@ -370,7 +342,7 @@ priv->device_attr.max_qp_wr); DEBUG("priv->device_attr.max_sge is %d", priv->device_attr.max_sge); - tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc); + tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc, 1 << tmpl.sge_n); if (tmpl.qp == NULL) { ERROR("%p: QP creation failure: %s", (void *)dev, strerror(rte_errno)); @@ -389,21 +361,6 @@ (void *)dev, strerror(rte_errno)); goto error; } - ret = mlx4_rxq_alloc_elts(&tmpl, desc, NULL); - if (ret) { - ERROR("%p: RXQ allocation failed: %s", - (void *)dev, strerror(rte_errno)); - goto error; - } - ret = ibv_post_recv(tmpl.qp, &(*tmpl.elts)[0].wr, &bad_wr); - if (ret) { - rte_errno = ret; - ERROR("%p: ibv_post_recv() failed for WR %p: %s", - (void *)dev, - (void *)bad_wr, - strerror(rte_errno)); - goto error; - } mod = (struct ibv_qp_attr){ .qp_state = IBV_QPS_RTR }; @@ -414,14 +371,32 @@ (void *)dev, strerror(rte_errno)); goto error; } + /* Init HW depended fields */ + tmpl.hw.wqes = + (volatile struct mlx4_wqe_data_seg (*)[]) + (uintptr_t)to_mqp(tmpl.qp)->rq.buf; + tmpl.hw.rq_db = + (volatile uint32_t *) + (uintptr_t)to_mqp(tmpl.qp)->db; + tmpl.hw.rq_ci = 0; /* Save port ID. 
*/ tmpl.port_id = dev->data->port_id; DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id); + ret = mlx4_rxq_alloc_elts(&tmpl, desc << tmpl.sge_n, NULL); + if (ret) { + ERROR("%p: RXQ allocation failed: %s", + (void *)dev, strerror(rte_errno)); + goto error; + } /* Clean up rxq in case we're reinitializing it. */ DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq); mlx4_rxq_cleanup(rxq); *rxq = tmpl; DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl); + /* Update doorbell counter. */ + rxq->hw.rq_ci = desc; + rte_wmb(); + *rxq->hw.rq_db = htonl(rxq->hw.rq_ci); return 0; error: ret = rte_errno; @@ -459,6 +434,12 @@ struct rxq *rxq = (*priv->rxqs)[idx]; int ret; + if (!rte_is_power_of_2(desc)) { + desc = 1 << log2above(desc); + WARN("%p: increased number of descriptors in RX queue %u" + " to the next power of two (%d)", + (void *)dev, idx, desc); + } DEBUG("%p: configuring queue %u for %u descriptors", (void *)dev, idx, desc); if (idx >= priv->rxqs_n) { diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c index 944cf48..f11c84c 100644 --- a/drivers/net/mlx4/mlx4_rxtx.c +++ b/drivers/net/mlx4/mlx4_rxtx.c @@ -348,9 +348,73 @@ } /** - * DPDK callback for Rx. + * Get next cqe from HW. * - * The following function doesn't manage scattered packets. + * @param cq + * Pointer to CQ structure. + * + * @return + * Pointer to the CQ element or NULL in case there is no one. + */ +static inline struct mlx4_cqe * +mlx4_cq_get_next_cqe(struct mlx4_cq *cq) +{ + int cqe_off; + struct mlx4_cqe *cqe; + const int cqe_size = cq->cqe_size; + + /* CQE offset is 32 bytes in case if cqe_size is 64 */ + cqe_off = (cqe_size & 64) >> 1; + cqe = (struct mlx4_cqe *) + ((uint8_t *)cq->buf.buf + + (cq->cons_index & cq->ibv_cq.cqe) * cqe_size + + cqe_off); + /* Return NULL if HW hasn't produced cqe */ + if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(cq->cons_index & (cq->ibv_cq.cqe + 1))) + return NULL; + return cqe; +} + +/** + * Poll one CQE from CQ. + * + * @param cq + * Pointer to ibv CQ structure. + * @param[out] out + * Just polled cqe. + * + * @return + * byte_cnt of the cqe, 0 in case there is no completion, + * negative on failure. + */ +static int +mlx4_cq_poll_one(struct rxq *rxq, + struct mlx4_cqe **out) +{ + int ret = 0; + struct mlx4_cqe *cqe; + struct mlx4_cq *cq = to_mcq(rxq->cq); + + cqe = mlx4_cq_get_next_cqe(cq); + if (cqe) { + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rte_rmb(); + assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)); + assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != + MLX4_CQE_OPCODE_ERROR); + ret = ntohl(cqe->byte_cnt); + ++cq->cons_index; + } + *out = cqe; + return ret; +} + +/** + * DPDK callback for RX with scattered packets support. * * @param dpdk_rxq * Generic pointer to Rx queue structure. 
@@ -365,121 +429,109 @@ uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { - struct rxq *rxq = (struct rxq *)dpdk_rxq; - struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts; - const unsigned int elts_n = rxq->elts_n; - unsigned int elts_head = rxq->elts_head; - struct ibv_wc wcs[pkts_n]; - struct ibv_recv_wr *wr_head = NULL; - struct ibv_recv_wr **wr_next = &wr_head; - struct ibv_recv_wr *wr_bad = NULL; - unsigned int i; - unsigned int pkts_ret = 0; - int ret; + struct rxq *rxq = dpdk_rxq; + const unsigned int wr_cnt = (1 << rxq->elts_n) - 1; + const unsigned int sge_n = rxq->sge_n; + struct rte_mbuf *pkt = NULL; + struct rte_mbuf *seg = NULL; + unsigned int i = 0; + unsigned int rq_ci = (rxq->hw.rq_ci << sge_n); + int len = 0; - ret = ibv_poll_cq(rxq->cq, pkts_n, wcs); - if (unlikely(ret == 0)) - return 0; - if (unlikely(ret < 0)) { - DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)", - (void *)rxq, ret); - return 0; - } - assert(ret <= (int)pkts_n); - /* For each work completion. */ - for (i = 0; i != (unsigned int)ret; ++i) { - struct ibv_wc *wc = &wcs[i]; - struct rxq_elt *elt = &(*elts)[elts_head]; - struct ibv_recv_wr *wr = &elt->wr; - uint64_t wr_id = wr->wr_id; - uint32_t len = wc->byte_len; - struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr - - WR_ID(wr_id).offset); - struct rte_mbuf *rep; + while (pkts_n) { + struct mlx4_cqe *cqe; + unsigned int idx = rq_ci & wr_cnt; + struct rte_mbuf *rep = (*rxq->elts)[idx]; + volatile struct mlx4_wqe_data_seg *scat = + &(*rxq->hw.wqes)[idx]; - /* Sanity checks. */ - assert(WR_ID(wr_id).id < rxq->elts_n); - assert(wr_id == wc->wr_id); - assert(wr->sg_list == &elt->sge); - assert(wr->num_sge == 1); - assert(elts_head < rxq->elts_n); - assert(rxq->elts_head < rxq->elts_n); - /* - * Fetch initial bytes of packet descriptor into a - * cacheline while allocating rep. - */ - rte_mbuf_prefetch_part1(seg); - rte_mbuf_prefetch_part2(seg); - /* Link completed WRs together for repost. */ - *wr_next = wr; - wr_next = &wr->next; - if (unlikely(wc->status != IBV_WC_SUCCESS)) { - /* Whatever, just repost the offending WR. */ - DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work completion" - " status (%d): %s", - (void *)rxq, wr_id, wc->status, - ibv_wc_status_str(wc->status)); - /* Increment dropped packets counter. */ - ++rxq->stats.idropped; - goto repost; - } + /* Update the 'next' pointer of the previous segment */ + if (pkt) + seg->next = rep; + seg = rep; + rte_prefetch0(seg); + rte_prefetch0(scat); rep = rte_mbuf_raw_alloc(rxq->mp); if (unlikely(rep == NULL)) { - /* - * Unable to allocate a replacement mbuf, - * repost WR. - */ - DEBUG("rxq=%p, wr_id=%" PRIu32 ":" - " can't allocate a new mbuf", - (void *)rxq, WR_ID(wr_id).id); - /* Increase out of memory counters. */ ++rxq->stats.rx_nombuf; - ++rxq->priv->dev->data->rx_mbuf_alloc_failed; - goto repost; + if (!pkt) { + /* + * no buffers before we even started, + * bail out silently. + */ + break; + } + while (pkt != seg) { + assert(pkt != (*rxq->elts)[idx]); + rep = pkt->next; + pkt->next = NULL; + pkt->nb_segs = 1; + rte_mbuf_raw_free(pkt); + pkt = rep; + } + break; } - /* Reconfigure sge to use rep instead of seg. */ - elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; - assert(elt->sge.lkey == rxq->mr->lkey); - WR_ID(wr->wr_id).offset = - (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) - - (uintptr_t)rep); - assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id); - /* Update seg information. 
*/ - seg->data_off = RTE_PKTMBUF_HEADROOM; - seg->nb_segs = 1; - seg->port = rxq->port_id; - seg->next = NULL; - seg->pkt_len = len; + if (!pkt) { + /* Looking for the new packet */ + len = mlx4_cq_poll_one(rxq, &cqe); + if (!len) { + rte_mbuf_raw_free(rep); + break; + } + if (unlikely(len < 0)) { + /* RX error, packet is likely too large. */ + rte_mbuf_raw_free(rep); + ++rxq->stats.idropped; + goto skip; + } + pkt = seg; + pkt->packet_type = 0; + pkt->ol_flags = 0; + pkt->pkt_len = len; + } + rep->nb_segs = 1; + rep->port = rxq->port_id; + rep->data_len = seg->data_len; + rep->data_off = seg->data_off; + (*rxq->elts)[idx] = rep; + /* + * Fill NIC descriptor with the new buffer. The lkey and size + * of the buffers are already known, only the buffer address + * changes. + */ + scat->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t)); + if (len > seg->data_len) { + len -= seg->data_len; + ++pkt->nb_segs; + ++rq_ci; + continue; + } + /* The last segment. */ seg->data_len = len; - seg->packet_type = 0; - seg->ol_flags = 0; + /* Increment bytes counter. */ + rxq->stats.ibytes += pkt->pkt_len; /* Return packet. */ - *(pkts++) = seg; - ++pkts_ret; - /* Increase bytes counter. */ - rxq->stats.ibytes += len; -repost: - if (++elts_head >= elts_n) - elts_head = 0; - continue; + *(pkts++) = pkt; + pkt = NULL; + --pkts_n; + ++i; +skip: + /* Align consumer index to the next stride. */ + rq_ci >>= sge_n; + ++rq_ci; + rq_ci <<= sge_n; } - if (unlikely(i == 0)) + if (unlikely((i == 0) && ((rq_ci >> sge_n) == rxq->hw.rq_ci))) return 0; - /* Repost WRs. */ - *wr_next = NULL; - assert(wr_head); - ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad); - if (unlikely(ret)) { - /* Inability to repost WRs is fatal. */ - DEBUG("%p: recv_burst(): failed (ret=%d)", - (void *)rxq->priv, - ret); - abort(); - } - rxq->elts_head = elts_head; - /* Increase packets counter. */ - rxq->stats.ipackets += pkts_ret; - return pkts_ret; + /* Update the consumer index. */ + rxq->hw.rq_ci = rq_ci >> sge_n; + rte_wmb(); + *rxq->hw.rq_db = htonl(rxq->hw.rq_ci); + *to_mcq(rxq->cq)->set_ci_db = + htonl(to_mcq(rxq->cq)->cons_index & 0xffffff); + /* Increment packets counter. */ + rxq->stats.ipackets += i; + return i; } /** diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index a3d972b..077fdd8 100644 --- a/drivers/net/mlx4/mlx4_rxtx.h +++ b/drivers/net/mlx4/mlx4_rxtx.h @@ -70,13 +70,6 @@ struct mlx4_rxq_stats { uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */ }; -/** Rx element. */ -struct rxq_elt { - struct ibv_recv_wr wr; /**< Work request. */ - struct ibv_sge sge; /**< Scatter/gather element. */ - /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */ -}; - /** Rx queue descriptor. */ struct rxq { struct priv *priv; /**< Back pointer to private data. */ @@ -86,9 +79,14 @@ struct rxq { struct ibv_qp *qp; /**< Queue pair. */ struct ibv_comp_channel *channel; /**< Rx completion channel. */ unsigned int port_id; /**< Port ID for incoming packets. */ - unsigned int elts_n; /**< (*elts)[] length. */ - unsigned int elts_head; /**< Current index in (*elts)[]. */ - struct rxq_elt (*elts)[]; /**< Rx elements. */ + unsigned int elts_n; /**< Log 2 of Mbufs. */ + struct rte_mbuf *(*elts)[]; /**< Rx elements. */ + struct { + volatile struct mlx4_wqe_data_seg(*wqes)[]; + volatile uint32_t *rq_db; + uint16_t rq_ci; + } hw; + unsigned int sge_n; /**< Log 2 of SGEs number. */ struct mlx4_rxq_stats stats; /**< Rx queue counters. */ unsigned int socket; /**< CPU socket ID for allocations. 
 */
};
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index e74b61b..a37a3e5 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -102,4 +102,24 @@
 
 int mlx4_fd_set_non_blocking(int fd);
 
+/**
+ * Return nearest power of two above input value.
+ *
+ * @param v
+ *   Input value.
+ *
+ * @return
+ *   Nearest power of two above input value.
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+	unsigned int l;
+	unsigned int r;
+
+	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+		r |= (v & 1);
+	return l + r;
+}
+
 #endif /* MLX4_UTILS_H_ */
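
As mentioned in the notes above, here is a minimal standalone sketch
(not part of the patch) of how the per-packet scatter-entry count can
be derived from the maximum expected packet size and rounded to a power
of two with the log2above() helper added in mlx4_utils.h. The headroom,
mbuf room and packet length values are illustrative assumptions, and
rounding up on a remainder is this sketch's reading of the intent of
mlx4_rxq_setup(), not a verbatim copy of it.

#include <stdio.h>

/* Same helper as the one added to mlx4_utils.h above. */
static inline unsigned int
log2above(unsigned int v)
{
	unsigned int l;
	unsigned int r;

	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
		r |= (v & 1);
	return l + r;
}

int
main(void)
{
	const unsigned int headroom = 128;  /* stand-in for RTE_PKTMBUF_HEADROOM */
	const unsigned int mb_len = 2048;   /* example mbuf data room size */
	unsigned int rx_pkt_len = 9000;     /* example max Rx packet length */
	unsigned int sges_n;
	unsigned int sge_n;

	/* Only the first segment keeps its headroom. */
	rx_pkt_len = rx_pkt_len - mb_len + headroom;
	/*
	 * Segments needed for the remaining bytes, plus one for the
	 * headroom-bearing first segment, rounding up on a remainder.
	 */
	sges_n = rx_pkt_len / mb_len + !!(rx_pkt_len % mb_len) + 1;
	sge_n = log2above(sges_n);
	printf("%u SGEs needed, rounded up to %u (log2 = %u)\n",
	       sges_n, 1u << sge_n, sge_n);
	return 0;
}

With the example values this prints "5 SGEs needed, rounded up to 8
(log2 = 3)"; the Rx queue then stores only the log2 value (sge_n) so the
fast path can mask and shift instead of dividing.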