From patchwork Tue Nov 17 10:06:32 2020
X-Patchwork-Submitter: Joyce Kong
X-Patchwork-Id: 84259
X-Patchwork-Delegate: maxime.coquelin@redhat.com
From: Joyce Kong
To: maxime.coquelin@redhat.com, chenbo.xia@intel.com, jerinj@marvell.com, ruifeng.wang@arm.com, honnappa.nagarahalli@arm.com
Cc: dev@dpdk.org, nd@arm.com
Date: Tue, 17 Nov 2020 18:06:32 +0800
Message-Id: <20201117100635.27690-2-joyce.kong@arm.com>
In-Reply-To: <20201117100635.27690-1-joyce.kong@arm.com>
References: <20200911120906.45995-1-joyce.kong@arm.com> <20201117100635.27690-1-joyce.kong@arm.com>
Subject: [dpdk-dev] [PATCH v1 1/4] net/virtio: move AVX based Rx and Tx code to separate file

Split out AVX instruction based virtio packed ring Rx and Tx implementation to a separate file.
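
For context, the virtio_rxtx.c hunk in this patch replaces the "#ifndef CC_AVX512_SUPPORT" stubs with __rte_weak definitions, so the scalar fallbacks are overridden at link time whenever a vectorized object file (AVX512 today, NEON in the later patches) is linked in. The standalone sketch below illustrates that weak-symbol pattern with a plain GCC attribute; the function name and types are invented for the example and are not the driver's API.

/* Minimal sketch of the weak-symbol fallback pattern: the generic file
 * provides a weak stub, and a vectorized object, when built, supplies the
 * strong definition that overrides it at link time.
 */
#include <stdint.h>
#include <stdio.h>

#define WEAK __attribute__((weak))   /* stand-in for __rte_weak */

/* weak fallback: receives 0 packets when no vectorized object is linked */
WEAK uint16_t
recv_pkts_packed_vec(void *rxq, void **pkts, uint16_t nb)
{
	(void)rxq; (void)pkts; (void)nb;
	return 0;
}

int main(void)
{
	/* with only the weak stub linked in, the burst call is a no-op */
	printf("%u\n", (unsigned)recv_pkts_packed_vec(NULL, NULL, 32));
	return 0;
}
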
Signed-off-by: Joyce Kong Reviewed-by: Ruifeng Wang Reviewed-by: Maxime Coquelin --- drivers/net/virtio/meson.build | 4 +- drivers/net/virtio/virtio_rxtx.c | 6 +- drivers/net/virtio/virtio_rxtx_packed.c | 137 +++++ drivers/net/virtio/virtio_rxtx_packed.h | 298 ++++++++++ drivers/net/virtio/virtio_rxtx_packed_avx.c | 626 -------------------- drivers/net/virtio/virtio_rxtx_packed_avx.h | 239 ++++++++ 6 files changed, 678 insertions(+), 632 deletions(-) create mode 100644 drivers/net/virtio/virtio_rxtx_packed.c create mode 100644 drivers/net/virtio/virtio_rxtx_packed.h delete mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.c create mode 100644 drivers/net/virtio/virtio_rxtx_packed_avx.h diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build index eaed46373..01b8de6d4 100644 --- a/drivers/net/virtio/meson.build +++ b/drivers/net/virtio/meson.build @@ -13,12 +13,12 @@ if arch_subdir == 'x86' if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw') cflags += ['-DCC_AVX512_SUPPORT'] virtio_avx512_lib = static_library('virtio_avx512_lib', - 'virtio_rxtx_packed_avx.c', + 'virtio_rxtx_packed.c', dependencies: [static_rte_ethdev, static_rte_kvargs, static_rte_bus_pci], include_directories: includes, c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl']) - objs += virtio_avx512_lib.extract_objects('virtio_rxtx_packed_avx.c') + objs += virtio_avx512_lib.extract_objects('virtio_rxtx_packed.c') if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) cflags += '-DVHOST_GCC_UNROLL_PRAGMA' elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c index 77934e8c5..622d4bf20 100644 --- a/drivers/net/virtio/virtio_rxtx.c +++ b/drivers/net/virtio/virtio_rxtx.c @@ -2025,8 +2025,7 @@ virtio_xmit_pkts_inorder(void *tx_queue, return nb_tx; } -#ifndef CC_AVX512_SUPPORT -uint16_t +__rte_weak uint16_t virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused, struct rte_mbuf **rx_pkts __rte_unused, uint16_t nb_pkts __rte_unused) @@ -2034,11 +2033,10 @@ virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused, return 0; } -uint16_t +__rte_weak uint16_t virtio_xmit_pkts_packed_vec(void *tx_queue __rte_unused, struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts __rte_unused) { return 0; } -#endif /* ifndef CC_AVX512_SUPPORT */ diff --git a/drivers/net/virtio/virtio_rxtx_packed.c b/drivers/net/virtio/virtio_rxtx_packed.c new file mode 100644 index 000000000..99d9a5a99 --- /dev/null +++ b/drivers/net/virtio/virtio_rxtx_packed.c @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2020 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include + +#include "virtio_logs.h" +#include "virtio_ethdev.h" +#include "virtio_pci.h" +#include "virtio_rxtx_packed.h" +#include "virtqueue.h" + +#ifdef CC_AVX512_SUPPORT +#include "virtio_rxtx_packed_avx.h" +#endif + +uint16_t +virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + struct virtnet_tx *txvq = tx_queue; + struct virtqueue *vq = txvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t nb_tx = 0; + uint16_t remained; + + if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts)) + return nb_tx; + + if (unlikely(nb_pkts < 1)) + return nb_pkts; + + PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); + + if (vq->vq_free_cnt <= vq->vq_nentries - vq->vq_free_thresh) + virtio_xmit_cleanup_inorder_packed(vq, 
vq->vq_free_thresh); + + remained = RTE_MIN(nb_pkts, vq->vq_free_cnt); + + while (remained) { + if (remained >= PACKED_BATCH_SIZE) { + if (!virtqueue_enqueue_batch_packed_vec(txvq, + &tx_pkts[nb_tx])) { + nb_tx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } + } + if (!virtqueue_enqueue_single_packed_vec(txvq, + tx_pkts[nb_tx])) { + nb_tx++; + remained--; + continue; + } + break; + }; + + txvq->stats.packets += nb_tx; + + if (likely(nb_tx)) { + if (unlikely(virtqueue_kick_prepare_packed(vq))) { + virtqueue_notify(vq); + PMD_TX_LOG(DEBUG, "Notified backend after xmit"); + } + } + + return nb_tx; +} + +uint16_t +virtio_recv_pkts_packed_vec(void *rx_queue, + struct rte_mbuf **rx_pkts, + uint16_t nb_pkts) +{ + struct virtnet_rx *rxvq = rx_queue; + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t num, nb_rx = 0; + uint32_t nb_enqueued = 0; + uint16_t free_cnt = vq->vq_free_thresh; + + if (unlikely(hw->started == 0)) + return nb_rx; + + num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts); + if (likely(num > PACKED_BATCH_SIZE)) + num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE); + + while (num) { + if (!virtqueue_dequeue_batch_packed_vec(rxvq, + &rx_pkts[nb_rx])) { + nb_rx += PACKED_BATCH_SIZE; + num -= PACKED_BATCH_SIZE; + continue; + } + if (!virtqueue_dequeue_single_packed_vec(rxvq, + &rx_pkts[nb_rx])) { + nb_rx++; + num--; + continue; + } + break; + }; + + PMD_RX_LOG(DEBUG, "dequeue:%d", num); + + rxvq->stats.packets += nb_rx; + + if (likely(vq->vq_free_cnt >= free_cnt)) { + struct rte_mbuf *new_pkts[free_cnt]; + if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts, + free_cnt) == 0)) { + virtio_recv_refill_packed_vec(rxvq, new_pkts, + free_cnt); + nb_enqueued += free_cnt; + } else { + struct rte_eth_dev *dev = + &rte_eth_devices[rxvq->port_id]; + dev->data->rx_mbuf_alloc_failed += free_cnt; + } + } + + if (likely(nb_enqueued)) { + if (unlikely(virtqueue_kick_prepare_packed(vq))) { + virtqueue_notify(vq); + PMD_RX_LOG(DEBUG, "Notified"); + } + } + + return nb_rx; +} diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h new file mode 100644 index 000000000..b0b1d63ec --- /dev/null +++ b/drivers/net/virtio/virtio_rxtx_packed.h @@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2020 Intel Corporation + */ + +#ifndef _VIRTIO_RXTX_PACKED_H_ +#define _VIRTIO_RXTX_PACKED_H_ + +#include +#include +#include +#include +#include + +#include + +#include "virtio_logs.h" +#include "virtio_ethdev.h" +#include "virtio_pci.h" +#include "virtqueue.h" + +#define BYTE_SIZE 8 +/* flag bits offset in packed ring desc higher 64bits */ +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \ + offsetof(struct vring_packed_desc, len)) * BYTE_SIZE) + +#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \ + FLAGS_BITS_OFFSET) + +/* reference count offset in mbuf rearm data */ +#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \ + offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE) +/* segment number offset in mbuf rearm data */ +#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \ + offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE) + +/* default rearm data */ +#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \ + 1ULL << REFCNT_BITS_OFFSET) + +/* id bits offset in packed ring desc higher 64bits */ +#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \ + offsetof(struct vring_packed_desc, len)) * 
BYTE_SIZE) + +/* net hdr short size mask */ +#define NET_HDR_MASK 0x3F + +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ + sizeof(struct vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + +#ifdef VIRTIO_GCC_UNROLL_PRAGMA +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VIRTIO_CLANG_UNROLL_PRAGMA +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VIRTIO_ICC_UNROLL_PRAGMA +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifndef virtio_for_each_try_unroll +#define virtio_for_each_try_unroll(iter, val, size) \ + for (iter = val; iter < size; iter++) +#endif + +static inline void +virtio_update_batch_stats(struct virtnet_stats *stats, + uint16_t pkt_len1, + uint16_t pkt_len2, + uint16_t pkt_len3, + uint16_t pkt_len4) +{ + stats->bytes += pkt_len1; + stats->bytes += pkt_len2; + stats->bytes += pkt_len3; + stats->bytes += pkt_len4; +} + +static inline int +virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq, + struct rte_mbuf *txm) +{ + struct virtqueue *vq = txvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t hdr_size = hw->vtnet_hdr_size; + uint16_t slots, can_push = 0, use_indirect = 0; + int16_t need; + + /* optimize ring usage */ + if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) || + vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) && + rte_mbuf_refcnt_read(txm) == 1 && RTE_MBUF_DIRECT(txm) && + txm->nb_segs == 1 && rte_pktmbuf_headroom(txm) >= hdr_size) + can_push = 1; + else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) && + txm->nb_segs < VIRTIO_MAX_TX_INDIRECT) + use_indirect = 1; + + /* How many main ring entries are needed to this Tx? + * indirect => 1 + * any_layout => number of segments + * default => number of segments + 1 + */ + slots = use_indirect ? 
1 : (txm->nb_segs + !can_push); + can_push = rte_mbuf_refcnt_read(txm) == 1 && + RTE_MBUF_DIRECT(txm) && + txm->nb_segs == 1 && + rte_pktmbuf_headroom(txm) >= hdr_size; + + slots = txm->nb_segs + !can_push; + need = slots - vq->vq_free_cnt; + + /* Positive value indicates it need free vring descriptors */ + if (unlikely(need > 0)) { + virtio_xmit_cleanup_inorder_packed(vq, need); + need = slots - vq->vq_free_cnt; + if (unlikely(need > 0)) { + PMD_TX_LOG(ERR, + "No free tx descriptors to transmit"); + return -1; + } + } + + /* Enqueue Packet buffers */ + virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect, + can_push, 1); + + txvq->stats.bytes += txm->pkt_len; + return 0; +} + +/* Optionally fill offload information in structure */ +static inline int +virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr) +{ + struct rte_net_hdr_lens hdr_lens; + uint32_t hdrlen, ptype; + int l4_supported = 0; + + /* nothing to do */ + if (hdr->flags == 0) + return 0; + + /* GSO not support in vec path, skip check */ + m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN; + + ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); + m->packet_type = ptype; + if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP || + (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP || + (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) + l4_supported = 1; + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len; + if (hdr->csum_start <= hdrlen && l4_supported) { + m->ol_flags |= PKT_RX_L4_CKSUM_NONE; + } else { + /* Unknown proto or tunnel, do sw cksum. We can assume + * the cksum field is in the first segment since the + * buffers we provided to the host are large enough. + * In case of SCTP, this will be wrong since it's a CRC + * but there's nothing we can do. 
+ */ + uint16_t csum = 0, off; + + if (rte_raw_cksum_mbuf(m, hdr->csum_start, + rte_pktmbuf_pkt_len(m) - hdr->csum_start, + &csum) < 0) + return -1; + if (likely(csum != 0xffff)) + csum = ~csum; + off = hdr->csum_offset + hdr->csum_start; + if (rte_pktmbuf_data_len(m) >= off + 1) + *rte_pktmbuf_mtod_offset(m, uint16_t *, + off) = csum; + } + } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) { + m->ol_flags |= PKT_RX_L4_CKSUM_GOOD; + } + + return 0; +} + +static inline uint16_t +virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq, + struct rte_mbuf **rx_pkts) +{ + uint16_t used_idx, id; + uint32_t len; + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + uint32_t hdr_size = hw->vtnet_hdr_size; + struct virtio_net_hdr *hdr; + struct vring_packed_desc *desc; + struct rte_mbuf *cookie; + + desc = vq->vq_packed.ring.desc; + used_idx = vq->vq_used_cons_idx; + if (!desc_is_used(&desc[used_idx], vq)) + return -1; + + len = desc[used_idx].len; + id = desc[used_idx].id; + cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie; + if (unlikely(cookie == NULL)) { + PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u", + vq->vq_used_cons_idx); + return -1; + } + rte_prefetch0(cookie); + rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *)); + + cookie->data_off = RTE_PKTMBUF_HEADROOM; + cookie->ol_flags = 0; + cookie->pkt_len = (uint32_t)(len - hdr_size); + cookie->data_len = (uint32_t)(len - hdr_size); + + hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr + + RTE_PKTMBUF_HEADROOM - hdr_size); + if (hw->has_rx_offload) + virtio_vec_rx_offload(cookie, hdr); + + *rx_pkts = cookie; + + rxvq->stats.bytes += cookie->pkt_len; + + vq->vq_free_cnt++; + vq->vq_used_cons_idx++; + if (vq->vq_used_cons_idx >= vq->vq_nentries) { + vq->vq_used_cons_idx -= vq->vq_nentries; + vq->vq_packed.used_wrap_counter ^= 1; + } + + return 0; +} + +static inline void +virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq, + struct rte_mbuf **cookie, + uint16_t num) +{ + struct virtqueue *vq = rxvq->vq; + struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc; + uint16_t flags = vq->vq_packed.cached_flags; + struct virtio_hw *hw = vq->hw; + struct vq_desc_extra *dxp; + uint16_t idx, i; + uint16_t batch_num, total_num = 0; + uint16_t head_idx = vq->vq_avail_idx; + uint16_t head_flag = vq->vq_packed.cached_flags; + uint64_t addr; + + do { + idx = vq->vq_avail_idx; + + batch_num = PACKED_BATCH_SIZE; + if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries)) + batch_num = vq->vq_nentries - idx; + if (unlikely((total_num + batch_num) > num)) + batch_num = num - total_num; + + virtio_for_each_try_unroll(i, 0, batch_num) { + dxp = &vq->vq_descx[idx + i]; + dxp->cookie = (void *)cookie[total_num + i]; + + addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) + + RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size; + start_dp[idx + i].addr = addr; + start_dp[idx + i].len = cookie[total_num + i]->buf_len + - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size; + if (total_num || i) { + virtqueue_store_flags_packed(&start_dp[idx + i], + flags, hw->weak_barriers); + } + } + + vq->vq_avail_idx += batch_num; + if (vq->vq_avail_idx >= vq->vq_nentries) { + vq->vq_avail_idx -= vq->vq_nentries; + vq->vq_packed.cached_flags ^= + VRING_PACKED_DESC_F_AVAIL_USED; + flags = vq->vq_packed.cached_flags; + } + total_num += batch_num; + } while (total_num < num); + + virtqueue_store_flags_packed(&start_dp[head_idx], head_flag, + hw->weak_barriers); + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num); +} + +#endif 
/* _VIRTIO_RXTX_PACKED_H_ */ diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c deleted file mode 100644 index 9bc62719e..000000000 --- a/drivers/net/virtio/virtio_rxtx_packed_avx.c +++ /dev/null @@ -1,626 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2020 Intel Corporation - */ - -#include -#include -#include -#include -#include - -#include - -#include "virtio_logs.h" -#include "virtio_ethdev.h" -#include "virtio_pci.h" -#include "virtqueue.h" - -#define BYTE_SIZE 8 -/* flag bits offset in packed ring desc higher 64bits */ -#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \ - offsetof(struct vring_packed_desc, len)) * BYTE_SIZE) - -#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \ - FLAGS_BITS_OFFSET) - -/* reference count offset in mbuf rearm data */ -#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \ - offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE) -/* segment number offset in mbuf rearm data */ -#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \ - offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE) - -/* default rearm data */ -#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \ - 1ULL << REFCNT_BITS_OFFSET) - -/* id bits offset in packed ring desc higher 64bits */ -#define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \ - offsetof(struct vring_packed_desc, len)) * BYTE_SIZE) - -/* net hdr short size mask */ -#define NET_HDR_MASK 0x3F - -#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ - sizeof(struct vring_packed_desc)) -#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) - -#ifdef VIRTIO_GCC_UNROLL_PRAGMA -#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ - for (iter = val; iter < size; iter++) -#endif - -#ifdef VIRTIO_CLANG_UNROLL_PRAGMA -#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ - for (iter = val; iter < size; iter++) -#endif - -#ifdef VIRTIO_ICC_UNROLL_PRAGMA -#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ - for (iter = val; iter < size; iter++) -#endif - -#ifndef virtio_for_each_try_unroll -#define virtio_for_each_try_unroll(iter, val, num) \ - for (iter = val; iter < num; iter++) -#endif - -static inline void -virtio_update_batch_stats(struct virtnet_stats *stats, - uint16_t pkt_len1, - uint16_t pkt_len2, - uint16_t pkt_len3, - uint16_t pkt_len4) -{ - stats->bytes += pkt_len1; - stats->bytes += pkt_len2; - stats->bytes += pkt_len3; - stats->bytes += pkt_len4; -} - -static inline int -virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq, - struct rte_mbuf **tx_pkts) -{ - struct virtqueue *vq = txvq->vq; - uint16_t head_size = vq->hw->vtnet_hdr_size; - uint16_t idx = vq->vq_avail_idx; - struct virtio_net_hdr *hdr; - struct vq_desc_extra *dxp; - uint16_t i, cmp; - - if (vq->vq_avail_idx & PACKED_BATCH_MASK) - return -1; - - if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries)) - return -1; - - /* Load four mbufs rearm data */ - RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64); - RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64); - __m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data, - *tx_pkts[2]->rearm_data, - *tx_pkts[1]->rearm_data, - *tx_pkts[0]->rearm_data); - - /* refcnt=1 and nb_segs=1 */ - __m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA); - __m256i head_rooms = _mm256_set1_epi16(head_size); - - /* Check refcnt and nb_segs */ - const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12; - cmp = 
_mm256_mask_cmpneq_epu16_mask(mask, mbufs, mbuf_ref); - if (unlikely(cmp)) - return -1; - - /* Check headroom is enough */ - const __mmask16 data_mask = 0x1 | 0x1 << 4 | 0x1 << 8 | 0x1 << 12; - RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) != - offsetof(struct rte_mbuf, rearm_data)); - cmp = _mm256_mask_cmplt_epu16_mask(data_mask, mbufs, head_rooms); - if (unlikely(cmp)) - return -1; - - virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { - dxp = &vq->vq_descx[idx + i]; - dxp->ndescs = 1; - dxp->cookie = tx_pkts[i]; - } - - virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { - tx_pkts[i]->data_off -= head_size; - tx_pkts[i]->data_len += head_size; - } - - __m512i descs_base = _mm512_set_epi64(tx_pkts[3]->data_len, - VIRTIO_MBUF_ADDR(tx_pkts[3], vq), - tx_pkts[2]->data_len, - VIRTIO_MBUF_ADDR(tx_pkts[2], vq), - tx_pkts[1]->data_len, - VIRTIO_MBUF_ADDR(tx_pkts[1], vq), - tx_pkts[0]->data_len, - VIRTIO_MBUF_ADDR(tx_pkts[0], vq)); - - /* id offset and data offset */ - __m512i data_offsets = _mm512_set_epi64((uint64_t)3 << ID_BITS_OFFSET, - tx_pkts[3]->data_off, - (uint64_t)2 << ID_BITS_OFFSET, - tx_pkts[2]->data_off, - (uint64_t)1 << ID_BITS_OFFSET, - tx_pkts[1]->data_off, - 0, tx_pkts[0]->data_off); - - __m512i new_descs = _mm512_add_epi64(descs_base, data_offsets); - - uint64_t flags_temp = (uint64_t)idx << ID_BITS_OFFSET | - (uint64_t)vq->vq_packed.cached_flags << FLAGS_BITS_OFFSET; - - /* flags offset and guest virtual address offset */ - __m128i flag_offset = _mm_set_epi64x(flags_temp, 0); - __m512i v_offset = _mm512_broadcast_i32x4(flag_offset); - __m512i v_desc = _mm512_add_epi64(new_descs, v_offset); - - if (!vq->hw->has_tx_offload) { - __m128i all_mask = _mm_set1_epi16(0xFFFF); - virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { - hdr = rte_pktmbuf_mtod_offset(tx_pkts[i], - struct virtio_net_hdr *, -head_size); - __m128i v_hdr = _mm_loadu_si128((void *)hdr); - if (unlikely(_mm_mask_test_epi16_mask(NET_HDR_MASK, - v_hdr, all_mask))) { - __m128i all_zero = _mm_setzero_si128(); - _mm_mask_storeu_epi16((void *)hdr, - NET_HDR_MASK, all_zero); - } - } - } else { - virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { - hdr = rte_pktmbuf_mtod_offset(tx_pkts[i], - struct virtio_net_hdr *, -head_size); - virtqueue_xmit_offload(hdr, tx_pkts[i], true); - } - } - - /* Enqueue Packet buffers */ - _mm512_storeu_si512((void *)&vq->vq_packed.ring.desc[idx], v_desc); - - virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len, - tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len, - tx_pkts[3]->pkt_len); - - vq->vq_avail_idx += PACKED_BATCH_SIZE; - vq->vq_free_cnt -= PACKED_BATCH_SIZE; - - if (vq->vq_avail_idx >= vq->vq_nentries) { - vq->vq_avail_idx -= vq->vq_nentries; - vq->vq_packed.cached_flags ^= - VRING_PACKED_DESC_F_AVAIL_USED; - } - - return 0; -} - -static inline int -virtqueue_enqueue_single_packed_vec(struct virtnet_tx *txvq, - struct rte_mbuf *txm) -{ - struct virtqueue *vq = txvq->vq; - struct virtio_hw *hw = vq->hw; - uint16_t hdr_size = hw->vtnet_hdr_size; - uint16_t slots, can_push = 0, use_indirect = 0; - int16_t need; - - /* optimize ring usage */ - if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) || - vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) && - rte_mbuf_refcnt_read(txm) == 1 && - RTE_MBUF_DIRECT(txm) && - txm->nb_segs == 1 && - rte_pktmbuf_headroom(txm) >= hdr_size) - can_push = 1; - else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) && - txm->nb_segs < VIRTIO_MAX_TX_INDIRECT) - use_indirect = 1; - /* How many main ring entries are needed to this Tx? 
- * indirect => 1 - * any_layout => number of segments - * default => number of segments + 1 - */ - slots = use_indirect ? 1 : (txm->nb_segs + !can_push); - need = slots - vq->vq_free_cnt; - - /* Positive value indicates it need free vring descriptors */ - if (unlikely(need > 0)) { - virtio_xmit_cleanup_inorder_packed(vq, need); - need = slots - vq->vq_free_cnt; - if (unlikely(need > 0)) { - PMD_TX_LOG(ERR, - "No free tx descriptors to transmit"); - return -1; - } - } - - /* Enqueue Packet buffers */ - virtqueue_enqueue_xmit_packed(txvq, txm, slots, use_indirect, - can_push, 1); - - txvq->stats.bytes += txm->pkt_len; - return 0; -} - -uint16_t -virtio_xmit_pkts_packed_vec(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) -{ - struct virtnet_tx *txvq = tx_queue; - struct virtqueue *vq = txvq->vq; - struct virtio_hw *hw = vq->hw; - uint16_t nb_tx = 0; - uint16_t remained; - - if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts)) - return nb_tx; - - if (unlikely(nb_pkts < 1)) - return nb_pkts; - - PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts); - - if (vq->vq_free_cnt <= vq->vq_nentries - vq->vq_free_thresh) - virtio_xmit_cleanup_inorder_packed(vq, vq->vq_free_thresh); - - remained = RTE_MIN(nb_pkts, vq->vq_free_cnt); - - while (remained) { - if (remained >= PACKED_BATCH_SIZE) { - if (!virtqueue_enqueue_batch_packed_vec(txvq, - &tx_pkts[nb_tx])) { - nb_tx += PACKED_BATCH_SIZE; - remained -= PACKED_BATCH_SIZE; - continue; - } - } - if (!virtqueue_enqueue_single_packed_vec(txvq, - tx_pkts[nb_tx])) { - nb_tx++; - remained--; - continue; - } - break; - }; - - txvq->stats.packets += nb_tx; - - if (likely(nb_tx)) { - if (unlikely(virtqueue_kick_prepare_packed(vq))) { - virtqueue_notify(vq); - PMD_TX_LOG(DEBUG, "Notified backend after xmit"); - } - } - - return nb_tx; -} - -/* Optionally fill offload information in structure */ -static inline int -virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr) -{ - struct rte_net_hdr_lens hdr_lens; - uint32_t hdrlen, ptype; - int l4_supported = 0; - - /* nothing to do */ - if (hdr->flags == 0) - return 0; - - /* GSO not support in vec path, skip check */ - m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN; - - ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK); - m->packet_type = ptype; - if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP || - (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP || - (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) - l4_supported = 1; - - if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len; - if (hdr->csum_start <= hdrlen && l4_supported) { - m->ol_flags |= PKT_RX_L4_CKSUM_NONE; - } else { - /* Unknown proto or tunnel, do sw cksum. We can assume - * the cksum field is in the first segment since the - * buffers we provided to the host are large enough. - * In case of SCTP, this will be wrong since it's a CRC - * but there's nothing we can do. 
- */ - uint16_t csum = 0, off; - - if (rte_raw_cksum_mbuf(m, hdr->csum_start, - rte_pktmbuf_pkt_len(m) - hdr->csum_start, - &csum) < 0) - return -1; - if (likely(csum != 0xffff)) - csum = ~csum; - off = hdr->csum_offset + hdr->csum_start; - if (rte_pktmbuf_data_len(m) >= off + 1) - *rte_pktmbuf_mtod_offset(m, uint16_t *, - off) = csum; - } - } else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) { - m->ol_flags |= PKT_RX_L4_CKSUM_GOOD; - } - - return 0; -} - -static inline uint16_t -virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq, - struct rte_mbuf **rx_pkts) -{ - struct virtqueue *vq = rxvq->vq; - struct virtio_hw *hw = vq->hw; - uint16_t hdr_size = hw->vtnet_hdr_size; - uint64_t addrs[PACKED_BATCH_SIZE]; - uint16_t id = vq->vq_used_cons_idx; - uint8_t desc_stats; - uint16_t i; - void *desc_addr; - - if (id & PACKED_BATCH_MASK) - return -1; - - if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries)) - return -1; - - /* only care avail/used bits */ -#if defined(RTE_ARCH_I686) - __m512i v_mask = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0, - PACKED_FLAGS_MASK, 0x0); -#else - __m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK); -#endif - desc_addr = &vq->vq_packed.ring.desc[id]; - - __m512i v_desc = _mm512_loadu_si512(desc_addr); - __m512i v_flag = _mm512_and_epi64(v_desc, v_mask); - - __m512i v_used_flag = _mm512_setzero_si512(); - if (vq->vq_packed.used_wrap_counter) -#if defined(RTE_ARCH_I686) - v_used_flag = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0, - PACKED_FLAGS_MASK, 0x0); -#else - v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK); -#endif - - /* Check all descs are used */ - desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag); - if (desc_stats) - return -1; - - virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { - rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie; - rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *)); - - addrs[i] = (uintptr_t)rx_pkts[i]->rx_descriptor_fields1; - } - - /* - * load len from desc, store into mbuf pkt_len and data_len - * len limiated by l6bit buf_len, pkt_len[16:31] can be ignored - */ - const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12; - __m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA); - - /* reduce hdr_len from pkt_len and data_len */ - __m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask, - (uint32_t)-hdr_size); - - __m512i v_value = _mm512_add_epi32(values, mbuf_len_offset); - - /* assert offset of data_len */ - RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != - offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); - - __m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3], - addrs[2] + 8, addrs[2], - addrs[1] + 8, addrs[1], - addrs[0] + 8, addrs[0]); - /* batch store into mbufs */ - _mm512_i64scatter_epi64(0, v_index, v_value, 1); - - if (hw->has_rx_offload) { - virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { - char *addr = (char *)rx_pkts[i]->buf_addr + - RTE_PKTMBUF_HEADROOM - hdr_size; - virtio_vec_rx_offload(rx_pkts[i], - (struct virtio_net_hdr *)addr); - } - } - - virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len, - rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len, - rx_pkts[3]->pkt_len); - - vq->vq_free_cnt += PACKED_BATCH_SIZE; - - vq->vq_used_cons_idx += PACKED_BATCH_SIZE; - if (vq->vq_used_cons_idx >= vq->vq_nentries) { - vq->vq_used_cons_idx -= vq->vq_nentries; - vq->vq_packed.used_wrap_counter ^= 1; - } - - return 0; -} - -static uint16_t -virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq, - struct 
rte_mbuf **rx_pkts) -{ - uint16_t used_idx, id; - uint32_t len; - struct virtqueue *vq = rxvq->vq; - struct virtio_hw *hw = vq->hw; - uint32_t hdr_size = hw->vtnet_hdr_size; - struct virtio_net_hdr *hdr; - struct vring_packed_desc *desc; - struct rte_mbuf *cookie; - - desc = vq->vq_packed.ring.desc; - used_idx = vq->vq_used_cons_idx; - if (!desc_is_used(&desc[used_idx], vq)) - return -1; - - len = desc[used_idx].len; - id = desc[used_idx].id; - cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie; - if (unlikely(cookie == NULL)) { - PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u", - vq->vq_used_cons_idx); - return -1; - } - rte_prefetch0(cookie); - rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *)); - - cookie->data_off = RTE_PKTMBUF_HEADROOM; - cookie->ol_flags = 0; - cookie->pkt_len = (uint32_t)(len - hdr_size); - cookie->data_len = (uint32_t)(len - hdr_size); - - hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr + - RTE_PKTMBUF_HEADROOM - hdr_size); - if (hw->has_rx_offload) - virtio_vec_rx_offload(cookie, hdr); - - *rx_pkts = cookie; - - rxvq->stats.bytes += cookie->pkt_len; - - vq->vq_free_cnt++; - vq->vq_used_cons_idx++; - if (vq->vq_used_cons_idx >= vq->vq_nentries) { - vq->vq_used_cons_idx -= vq->vq_nentries; - vq->vq_packed.used_wrap_counter ^= 1; - } - - return 0; -} - -static inline void -virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq, - struct rte_mbuf **cookie, - uint16_t num) -{ - struct virtqueue *vq = rxvq->vq; - struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc; - uint16_t flags = vq->vq_packed.cached_flags; - struct virtio_hw *hw = vq->hw; - struct vq_desc_extra *dxp; - uint16_t idx, i; - uint16_t batch_num, total_num = 0; - uint16_t head_idx = vq->vq_avail_idx; - uint16_t head_flag = vq->vq_packed.cached_flags; - uint64_t addr; - - do { - idx = vq->vq_avail_idx; - - batch_num = PACKED_BATCH_SIZE; - if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries)) - batch_num = vq->vq_nentries - idx; - if (unlikely((total_num + batch_num) > num)) - batch_num = num - total_num; - - virtio_for_each_try_unroll(i, 0, batch_num) { - dxp = &vq->vq_descx[idx + i]; - dxp->cookie = (void *)cookie[total_num + i]; - - addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) + - RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size; - start_dp[idx + i].addr = addr; - start_dp[idx + i].len = cookie[total_num + i]->buf_len - - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size; - if (total_num || i) { - virtqueue_store_flags_packed(&start_dp[idx + i], - flags, hw->weak_barriers); - } - } - - vq->vq_avail_idx += batch_num; - if (vq->vq_avail_idx >= vq->vq_nentries) { - vq->vq_avail_idx -= vq->vq_nentries; - vq->vq_packed.cached_flags ^= - VRING_PACKED_DESC_F_AVAIL_USED; - flags = vq->vq_packed.cached_flags; - } - total_num += batch_num; - } while (total_num < num); - - virtqueue_store_flags_packed(&start_dp[head_idx], head_flag, - hw->weak_barriers); - vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num); -} - -uint16_t -virtio_recv_pkts_packed_vec(void *rx_queue, - struct rte_mbuf **rx_pkts, - uint16_t nb_pkts) -{ - struct virtnet_rx *rxvq = rx_queue; - struct virtqueue *vq = rxvq->vq; - struct virtio_hw *hw = vq->hw; - uint16_t num, nb_rx = 0; - uint32_t nb_enqueued = 0; - uint16_t free_cnt = vq->vq_free_thresh; - - if (unlikely(hw->started == 0)) - return nb_rx; - - num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts); - if (likely(num > PACKED_BATCH_SIZE)) - num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE); - - while (num) { - if 
(!virtqueue_dequeue_batch_packed_vec(rxvq, - &rx_pkts[nb_rx])) { - nb_rx += PACKED_BATCH_SIZE; - num -= PACKED_BATCH_SIZE; - continue; - } - if (!virtqueue_dequeue_single_packed_vec(rxvq, - &rx_pkts[nb_rx])) { - nb_rx++; - num--; - continue; - } - break; - }; - - PMD_RX_LOG(DEBUG, "dequeue:%d", num); - - rxvq->stats.packets += nb_rx; - - if (likely(vq->vq_free_cnt >= free_cnt)) { - struct rte_mbuf *new_pkts[free_cnt]; - if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts, - free_cnt) == 0)) { - virtio_recv_refill_packed_vec(rxvq, new_pkts, - free_cnt); - nb_enqueued += free_cnt; - } else { - struct rte_eth_dev *dev = - &rte_eth_devices[rxvq->port_id]; - dev->data->rx_mbuf_alloc_failed += free_cnt; - } - } - - if (likely(nb_enqueued)) { - if (unlikely(virtqueue_kick_prepare_packed(vq))) { - virtqueue_notify(vq); - PMD_RX_LOG(DEBUG, "Notified"); - } - } - - return nb_rx; -} diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.h b/drivers/net/virtio/virtio_rxtx_packed_avx.h new file mode 100644 index 000000000..f83182884 --- /dev/null +++ b/drivers/net/virtio/virtio_rxtx_packed_avx.h @@ -0,0 +1,239 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2020 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include + +#include "virtio_logs.h" +#include "virtio_ethdev.h" +#include "virtio_pci.h" +#include "virtio_rxtx_packed.h" +#include "virtqueue.h" + +static inline int +virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq, + struct rte_mbuf **tx_pkts) +{ + struct virtqueue *vq = txvq->vq; + uint16_t head_size = vq->hw->vtnet_hdr_size; + uint16_t idx = vq->vq_avail_idx; + struct virtio_net_hdr *hdr; + struct vq_desc_extra *dxp; + uint16_t i, cmp; + + if (vq->vq_avail_idx & PACKED_BATCH_MASK) + return -1; + + if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries)) + return -1; + + /* Load four mbufs rearm data */ + RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64); + RTE_BUILD_BUG_ON(SEG_NUM_BITS_OFFSET >= 64); + __m256i mbufs = _mm256_set_epi64x(*tx_pkts[3]->rearm_data, + *tx_pkts[2]->rearm_data, + *tx_pkts[1]->rearm_data, + *tx_pkts[0]->rearm_data); + + /* refcnt=1 and nb_segs=1 */ + __m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA); + __m256i head_rooms = _mm256_set1_epi16(head_size); + + /* Check refcnt and nb_segs */ + const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12; + cmp = _mm256_mask_cmpneq_epu16_mask(mask, mbufs, mbuf_ref); + if (unlikely(cmp)) + return -1; + + /* Check headroom is enough */ + const __mmask16 data_mask = 0x1 | 0x1 << 4 | 0x1 << 8 | 0x1 << 12; + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) != + offsetof(struct rte_mbuf, rearm_data)); + cmp = _mm256_mask_cmplt_epu16_mask(data_mask, mbufs, head_rooms); + if (unlikely(cmp)) + return -1; + + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + dxp = &vq->vq_descx[idx + i]; + dxp->ndescs = 1; + dxp->cookie = tx_pkts[i]; + } + + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + tx_pkts[i]->data_off -= head_size; + tx_pkts[i]->data_len += head_size; + } + + __m512i descs_base = _mm512_set_epi64(tx_pkts[3]->data_len, + VIRTIO_MBUF_ADDR(tx_pkts[3], vq), + tx_pkts[2]->data_len, + VIRTIO_MBUF_ADDR(tx_pkts[2], vq), + tx_pkts[1]->data_len, + VIRTIO_MBUF_ADDR(tx_pkts[1], vq), + tx_pkts[0]->data_len, + VIRTIO_MBUF_ADDR(tx_pkts[0], vq)); + + /* id offset and data offset */ + __m512i data_offsets = _mm512_set_epi64((uint64_t)3 << ID_BITS_OFFSET, + tx_pkts[3]->data_off, + (uint64_t)2 << ID_BITS_OFFSET, + tx_pkts[2]->data_off, + (uint64_t)1 << 
ID_BITS_OFFSET, + tx_pkts[1]->data_off, + 0, tx_pkts[0]->data_off); + + __m512i new_descs = _mm512_add_epi64(descs_base, data_offsets); + + uint64_t flags_temp = (uint64_t)idx << ID_BITS_OFFSET | + (uint64_t)vq->vq_packed.cached_flags << FLAGS_BITS_OFFSET; + + /* flags offset and guest virtual address offset */ + __m128i flag_offset = _mm_set_epi64x(flags_temp, 0); + __m512i v_offset = _mm512_broadcast_i32x4(flag_offset); + __m512i v_desc = _mm512_add_epi64(new_descs, v_offset); + + if (!vq->hw->has_tx_offload) { + __m128i all_mask = _mm_set1_epi16(0xFFFF); + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = rte_pktmbuf_mtod_offset(tx_pkts[i], + struct virtio_net_hdr *, -head_size); + __m128i v_hdr = _mm_loadu_si128((void *)hdr); + if (unlikely(_mm_mask_test_epi16_mask(NET_HDR_MASK, + v_hdr, all_mask))) { + __m128i all_zero = _mm_setzero_si128(); + _mm_mask_storeu_epi16((void *)hdr, + NET_HDR_MASK, all_zero); + } + } + } else { + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = rte_pktmbuf_mtod_offset(tx_pkts[i], + struct virtio_net_hdr *, -head_size); + virtqueue_xmit_offload(hdr, tx_pkts[i], true); + } + } + + /* Enqueue Packet buffers */ + _mm512_storeu_si512((void *)&vq->vq_packed.ring.desc[idx], v_desc); + + virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len, + tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len, + tx_pkts[3]->pkt_len); + + vq->vq_avail_idx += PACKED_BATCH_SIZE; + vq->vq_free_cnt -= PACKED_BATCH_SIZE; + + if (vq->vq_avail_idx >= vq->vq_nentries) { + vq->vq_avail_idx -= vq->vq_nentries; + vq->vq_packed.cached_flags ^= + VRING_PACKED_DESC_F_AVAIL_USED; + } + + return 0; +} + +static inline uint16_t +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq, + struct rte_mbuf **rx_pkts) +{ + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t hdr_size = hw->vtnet_hdr_size; + uint64_t addrs[PACKED_BATCH_SIZE]; + uint16_t id = vq->vq_used_cons_idx; + uint8_t desc_stats; + uint16_t i; + void *desc_addr; + + if (id & PACKED_BATCH_MASK) + return -1; + + if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries)) + return -1; + + /* only care avail/used bits */ +#if defined(RTE_ARCH_I686) + __m512i v_mask = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0, + PACKED_FLAGS_MASK, 0x0); +#else + __m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK); +#endif + desc_addr = &vq->vq_packed.ring.desc[id]; + + __m512i v_desc = _mm512_loadu_si512(desc_addr); + __m512i v_flag = _mm512_and_epi64(v_desc, v_mask); + + __m512i v_used_flag = _mm512_setzero_si512(); + if (vq->vq_packed.used_wrap_counter) +#if defined(RTE_ARCH_I686) + v_used_flag = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0, + PACKED_FLAGS_MASK, 0x0); +#else + v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK); +#endif + + /* Check all descs are used */ + desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag); + if (desc_stats) + return -1; + + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie; + rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *)); + + addrs[i] = (uintptr_t)rx_pkts[i]->rx_descriptor_fields1; + } + + /* + * load len from desc, store into mbuf pkt_len and data_len + * len limiated by l6bit buf_len, pkt_len[16:31] can be ignored + */ + const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12; + __m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA); + + /* reduce hdr_len from pkt_len and data_len */ + __m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask, + 
(uint32_t)-hdr_size); + + __m512i v_value = _mm512_add_epi32(values, mbuf_len_offset); + + /* assert offset of data_len */ + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != + offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); + + __m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3], + addrs[2] + 8, addrs[2], + addrs[1] + 8, addrs[1], + addrs[0] + 8, addrs[0]); + /* batch store into mbufs */ + _mm512_i64scatter_epi64(0, v_index, v_value, 1); + + if (hw->has_rx_offload) { + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + char *addr = (char *)rx_pkts[i]->buf_addr + + RTE_PKTMBUF_HEADROOM - hdr_size; + virtio_vec_rx_offload(rx_pkts[i], + (struct virtio_net_hdr *)addr); + } + } + + virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len, + rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len, + rx_pkts[3]->pkt_len); + + vq->vq_free_cnt += PACKED_BATCH_SIZE; + + vq->vq_used_cons_idx += PACKED_BATCH_SIZE; + if (vq->vq_used_cons_idx >= vq->vq_nentries) { + vq->vq_used_cons_idx -= vq->vq_nentries; + vq->vq_packed.used_wrap_counter ^= 1; + } + + return 0; +} From patchwork Tue Nov 17 10:06:33 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Joyce Kong X-Patchwork-Id: 84260 X-Patchwork-Delegate: maxime.coquelin@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id B1166A04DB; Tue, 17 Nov 2020 11:07:51 +0100 (CET) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 722A9C8E6; Tue, 17 Nov 2020 11:07:22 +0100 (CET) Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by dpdk.org (Postfix) with ESMTP id 9F9CBC8E6 for ; Tue, 17 Nov 2020 11:07:20 +0100 (CET) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 29B371474; Tue, 17 Nov 2020 02:07:19 -0800 (PST) Received: from net-arm-thunderx2-03.shanghai.arm.com (net-arm-thunderx2-03.shanghai.arm.com [10.169.208.206]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id A61103F718; Tue, 17 Nov 2020 02:07:16 -0800 (PST) From: Joyce Kong To: maxime.coquelin@redhat.com, chenbo.xia@intel.com, jerinj@marvell.com, ruifeng.wang@arm.com, honnappa.nagarahalli@arm.com Cc: dev@dpdk.org, nd@arm.com Date: Tue, 17 Nov 2020 18:06:33 +0800 Message-Id: <20201117100635.27690-3-joyce.kong@arm.com> X-Mailer: git-send-email 2.28.0 In-Reply-To: <20201117100635.27690-1-joyce.kong@arm.com> References: <20200911120906.45995-1-joyce.kong@arm.com> <20201117100635.27690-1-joyce.kong@arm.com> MIME-Version: 1.0 Subject: [dpdk-dev] [PATCH v1 2/4] net/virtio: add vectorized packed ring Rx NEON path X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Optimize packed ring Rx batch path with NEON instructions. 
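
Before the diff, here is a self-contained illustration (aarch64 only, not the driver code itself) of the core NEON trick the Rx batch path below relies on: vqtbl1q_u8 gathers the 32-bit len field of a packed descriptor into the pkt_len/data_len positions of the mbuf Rx field layout, and a single vsubq_u16 then strips the virtio-net header length from both at once. The 12-byte header size and the literal values are assumptions for illustration only.

/* aarch64-only sketch of the shuffle-and-subtract length adjustment. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* high 64 bits of two fake packed descriptors; only desc0.len = 1526
	 * is used here, mirroring what shuf_msk1 extracts for mbuf 0 */
	uint64_t desc_hi[2] = { 1526, 60 };
	uint8x16_t d = vreinterpretq_u8_u64(vld1q_u64(desc_hi));

	/* table-lookup mask: 0xFF lanes become zero; bytes 0-1 of desc0.len are
	 * copied into the low 16 bits of pkt_len (bytes 4-5) and into data_len
	 * (bytes 8-9) of an rx_descriptor_fields1-like layout */
	uint8x16_t shuf = { 0xFF, 0xFF, 0xFF, 0xFF, 0, 1, 0xFF, 0xFF,
			    0, 1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
	uint16x8_t fields = vreinterpretq_u16_u8(vqtbl1q_u8(d, shuf));

	/* subtract an assumed 12-byte virtio-net header from pkt_len/data_len */
	uint16x8_t adjust = { 0, 0, 12, 0, 12, 0, 0, 0 };
	fields = vsubq_u16(fields, adjust);

	printf("pkt_len=%u data_len=%u\n",
	       vgetq_lane_u16(fields, 2), vgetq_lane_u16(fields, 4));
	return 0;
}
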
Signed-off-by: Joyce Kong Reviewed-by: Ruifeng Wang --- drivers/net/virtio/virtio_rxtx_packed.h | 15 ++ drivers/net/virtio/virtio_rxtx_packed_neon.h | 150 +++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 drivers/net/virtio/virtio_rxtx_packed_neon.h diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h index b0b1d63ec..8f5198ad7 100644 --- a/drivers/net/virtio/virtio_rxtx_packed.h +++ b/drivers/net/virtio/virtio_rxtx_packed.h @@ -19,9 +19,16 @@ #include "virtqueue.h" #define BYTE_SIZE 8 + +#ifdef CC_AVX512_SUPPORT /* flag bits offset in packed ring desc higher 64bits */ #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \ offsetof(struct vring_packed_desc, len)) * BYTE_SIZE) +#elif defined(RTE_ARCH_ARM) +/* flag bits offset in packed ring desc from ID */ +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \ + offsetof(struct vring_packed_desc, id)) * BYTE_SIZE) +#endif #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \ FLAGS_BITS_OFFSET) @@ -44,8 +51,16 @@ /* net hdr short size mask */ #define NET_HDR_MASK 0x3F +#ifdef RTE_ARCH_ARM +/* The cache line size on different Arm platforms are different, so + * put a four batch size here to match with the minimum cache line + * size and accommodate NEON register size. + */ +#define PACKED_BATCH_SIZE 4 +#else #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) +#endif #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) #ifdef VIRTIO_GCC_UNROLL_PRAGMA diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h new file mode 100644 index 000000000..fb1e49909 --- /dev/null +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2020 Arm Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "virtio_ethdev.h" +#include "virtio_pci.h" +#include "virtio_rxtx_packed.h" +#include "virtqueue.h" + +static inline uint16_t +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq, + struct rte_mbuf **rx_pkts) +{ + struct virtqueue *vq = rxvq->vq; + struct virtio_hw *hw = vq->hw; + uint16_t head_size = hw->vtnet_hdr_size; + uint16_t id = vq->vq_used_cons_idx; + struct vring_packed_desc *p_desc; + uint16_t i; + + if (id & PACKED_BATCH_MASK) + return -1; + + if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries)) + return -1; + + /* Map packed descriptor to mbuf fields. */ + uint8x16_t shuf_msk1 = { + 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */ + 0, 1, /* octet 1~0, low 16 bits pkt_len */ + 0xFF, 0xFF, /* skip high 16 bits of pkt_len, zero out */ + 0, 1, /* octet 1~0, 16 bits data_len */ + 0xFF, 0xFF, /* vlan tci set as unknown */ + 0xFF, 0xFF, 0xFF, 0xFF + }; + + uint8x16_t shuf_msk2 = { + 0xFF, 0xFF, 0xFF, 0xFF, /* pkt_type set as unknown */ + 8, 9, /* octet 9~8, low 16 bits pkt_len */ + 0xFF, 0xFF, /* skip high 16 bits of pkt_len, zero out */ + 8, 9, /* octet 9~8, 16 bits data_len */ + 0xFF, 0xFF, /* vlan tci set as unknown */ + 0xFF, 0xFF, 0xFF, 0xFF + }; + + /* Subtract the header length. 
*/ + uint16x8_t len_adjust = { + 0, 0, /* ignore pkt_type field */ + head_size, /* sub head_size on pkt_len */ + 0, /* ignore high 16 bits of pkt_len */ + head_size, /* sub head_size on data_len */ + 0, 0, 0 /* ignore non-length fields */ + }; + + uint64x2_t desc[PACKED_BATCH_SIZE / 2]; + uint64x2x2_t mbp[PACKED_BATCH_SIZE / 2]; + uint64x2_t pkt_mb[PACKED_BATCH_SIZE]; + + p_desc = &vq->vq_packed.ring.desc[id]; + /* Load high 64 bits of packed descriptor 0,1. */ + desc[0] = vld2q_u64((uint64_t *)(p_desc)).val[1]; + /* Load high 64 bits of packed descriptor 2,3. */ + desc[1] = vld2q_u64((uint64_t *)(p_desc + 2)).val[1]; + + /* Only care avail/used bits. */ + uint32x4_t v_mask = vdupq_n_u32(PACKED_FLAGS_MASK); + /* Extract high 32 bits of packed descriptor (id, flags). */ + uint32x4_t v_desc = vuzp2q_u32(vreinterpretq_u32_u64(desc[0]), + vreinterpretq_u32_u64(desc[1])); + uint32x4_t v_flag = vandq_u32(v_desc, v_mask); + + uint32x4_t v_used_flag = vdupq_n_u32(0); + if (vq->vq_packed.used_wrap_counter) + v_used_flag = vdupq_n_u32(PACKED_FLAGS_MASK); + + poly128_t desc_stats = vreinterpretq_p128_u32(~vceqq_u32(v_flag, v_used_flag)); + + /* Check all descs are used. */ + if (desc_stats) + return -1; + + /* Load 2 mbuf pointers per time. */ + mbp[0] = vld2q_u64((uint64_t *)&vq->vq_descx[id]); + vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0].val[0]); + + mbp[1] = vld2q_u64((uint64_t *)&vq->vq_descx[id + 2]); + vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1].val[0]); + + /** + * Update data length and packet length for descriptor. + * structure of pkt_mb: + * -------------------------------------------------------------------- + * |32 bits pkt_type|32 bits pkt_len|16 bits data_len|16 bits vlan_tci| + * -------------------------------------------------------------------- + */ + pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[0]), shuf_msk1)); + pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[0]), shuf_msk2)); + pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[1]), shuf_msk1))' + pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(desc[1]), shuf_msk2)); + + pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[0]), len_adjust)); + pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[1]), len_adjust)); + pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[2]), len_adjust)); + pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16( + vreinterpretq_u16_u64(pkt_mb[3]), len_adjust)); + + vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1, pkt_mb[0]); + vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1, pkt_mb[1]); + vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1, pkt_mb[2]); + vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1, pkt_mb[3]); + + if (hw->has_rx_offload) { + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + char *addr = (char *)rx_pkts[i]->buf_addr + + RTE_PKTMBUF_HEADROOM - head_size; + virtio_vec_rx_offload(rx_pkts[i], + (struct virtio_net_hdr *)addr); + } + } + + virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len, + rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len, + rx_pkts[3]->pkt_len); + + vq->vq_free_cnt += PACKED_BATCH_SIZE; + + vq->vq_used_cons_idx += PACKED_BATCH_SIZE; + if (vq->vq_used_cons_idx >= vq->vq_nentries) { + vq->vq_used_cons_idx -= vq->vq_nentries; + vq->vq_packed.used_wrap_counter ^= 1; + } + + return 0; +} From patchwork Tue Nov 17 10:06:34 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 
X-Patchwork-Submitter: Joyce Kong
X-Patchwork-Id: 84261
X-Patchwork-Delegate: maxime.coquelin@redhat.com
From: Joyce Kong
To: maxime.coquelin@redhat.com, chenbo.xia@intel.com, jerinj@marvell.com, ruifeng.wang@arm.com, honnappa.nagarahalli@arm.com
Cc: dev@dpdk.org, nd@arm.com
Date: Tue, 17 Nov 2020 18:06:34 +0800
Message-Id: <20201117100635.27690-4-joyce.kong@arm.com>
In-Reply-To: <20201117100635.27690-1-joyce.kong@arm.com>
References: <20200911120906.45995-1-joyce.kong@arm.com> <20201117100635.27690-1-joyce.kong@arm.com>
Subject: [dpdk-dev] [PATCH v1 3/4] net/virtio: add vectorized packed ring Tx NEON path

Optimize packed ring Tx batch path with NEON instructions.
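
As background for the Tx diff below, the following sketch (aarch64 only) shows how a pair of vst2q_u64 de-interleaving stores can materialize four 16-byte packed descriptors at once: val[0] carries the four buffer addresses and val[1] carries len, id and flags packed into the high 64 bits, mirroring the FLAGS_LEN_BITS_OFFSET/ID_BITS_OFFSET arithmetic. The struct and values here are local to the example, not the real vring_packed_desc handling.

#include <arm_neon.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* same field offsets as vring_packed_desc: addr @0, len @8, id @12, flags @14 */
struct fake_desc { uint64_t addr; uint32_t len; uint16_t id; uint16_t flags; };

int main(void)
{
	struct fake_desc ring[4];
	uint64_t addr[4] = { 0x1000, 0x2000, 0x3000, 0x4000 };
	uint32_t len[4]  = { 64, 128, 256, 512 };
	uint16_t flags = 0x80;	/* pretend avail/used flags */
	uint16_t idx = 0;	/* first free slot doubles as desc id */

	uint64x2x2_t d0, d1;
	d0.val[0] = vld1q_u64(&addr[0]);
	d1.val[0] = vld1q_u64(&addr[2]);

	/* high qword: flags in bits 48..63, id in bits 32..47, len in bits 0..31 */
	uint64x2_t hi0 = { (uint64_t)flags << 48 | (uint64_t)(idx + 0) << 32 | len[0],
			   (uint64_t)flags << 48 | (uint64_t)(idx + 1) << 32 | len[1] };
	uint64x2_t hi1 = { (uint64_t)flags << 48 | (uint64_t)(idx + 2) << 32 | len[2],
			   (uint64_t)flags << 48 | (uint64_t)(idx + 3) << 32 | len[3] };
	d0.val[1] = hi0;
	d1.val[1] = hi1;

	/* interleaved stores lay out {addr, len/id/flags} pairs back to back */
	vst2q_u64((uint64_t *)&ring[0], d0);	/* descriptors 0 and 1 */
	vst2q_u64((uint64_t *)&ring[2], d1);	/* descriptors 2 and 3 */

	for (int i = 0; i < 4; i++)
		printf("desc[%d]: addr=0x%" PRIx64 " len=%u id=%u flags=0x%x\n",
		       i, ring[i].addr, ring[i].len, ring[i].id, ring[i].flags);
	return 0;
}
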
Signed-off-by: Joyce Kong Reviewed-by: Ruifeng Wang Reviewed-by: Maxime Coquelin --- drivers/net/virtio/virtio_rxtx_packed.h | 6 +- drivers/net/virtio/virtio_rxtx_packed_neon.h | 143 +++++++++++++++++++ 2 files changed, 148 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio/virtio_rxtx_packed.h b/drivers/net/virtio/virtio_rxtx_packed.h index 8f5198ad7..016b6fb24 100644 --- a/drivers/net/virtio/virtio_rxtx_packed.h +++ b/drivers/net/virtio/virtio_rxtx_packed.h @@ -28,6 +28,8 @@ /* flag bits offset in packed ring desc from ID */ #define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \ offsetof(struct vring_packed_desc, id)) * BYTE_SIZE) +#define FLAGS_LEN_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \ + offsetof(struct vring_packed_desc, len)) * BYTE_SIZE) #endif #define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \ @@ -36,13 +38,15 @@ /* reference count offset in mbuf rearm data */ #define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \ offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE) + +#ifdef CC_AVX512_SUPPORT /* segment number offset in mbuf rearm data */ #define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \ offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE) - /* default rearm data */ #define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \ 1ULL << REFCNT_BITS_OFFSET) +#endif /* id bits offset in packed ring desc higher 64bits */ #define ID_BITS_OFFSET ((offsetof(struct vring_packed_desc, id) - \ diff --git a/drivers/net/virtio/virtio_rxtx_packed_neon.h b/drivers/net/virtio/virtio_rxtx_packed_neon.h index fb1e49909..041f771ea 100644 --- a/drivers/net/virtio/virtio_rxtx_packed_neon.h +++ b/drivers/net/virtio/virtio_rxtx_packed_neon.h @@ -16,6 +16,149 @@ #include "virtio_rxtx_packed.h" #include "virtqueue.h" +static inline int +virtqueue_enqueue_batch_packed_vec(struct virtnet_tx *txvq, + struct rte_mbuf **tx_pkts) +{ + struct virtqueue *vq = txvq->vq; + uint16_t head_size = vq->hw->vtnet_hdr_size; + uint16_t idx = vq->vq_avail_idx; + struct virtio_net_hdr *hdr; + struct vq_desc_extra *dxp; + struct vring_packed_desc *p_desc; + uint16_t i; + + if (idx & PACKED_BATCH_MASK) + return -1; + + if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries)) + return -1; + + /* Map four refcnt and nb_segs from mbufs to one NEON register. */ + uint8x16_t ref_seg_msk = { + 2, 3, 4, 5, + 10, 11, 12, 13, + 18, 19, 20, 21, + 26, 27, 28, 29 + }; + + /* Map four data_off from mbufs to one NEON register. */ + uint8x8_t data_msk = { + 0, 1, + 8, 9, + 16, 17, + 24, 25 + }; + + uint16x8_t net_hdr_msk = { + 0xFFFF, 0xFFFF, + 0, 0, 0, 0 + }; + + uint16x4_t pkts[PACKED_BATCH_SIZE]; + uint8x16x2_t mbuf; + /* Load four mbufs rearm data. */ + RTE_BUILD_BUG_ON(REFCNT_BITS_OFFSET >= 64); + pkts[0] = vld1_u16((uint16_t *)&tx_pkts[0]->rearm_data); + pkts[1] = vld1_u16((uint16_t *)&tx_pkts[1]->rearm_data); + pkts[2] = vld1_u16((uint16_t *)&tx_pkts[2]->rearm_data); + pkts[3] = vld1_u16((uint16_t *)&tx_pkts[3]->rearm_data); + + mbuf.val[0] = vreinterpretq_u8_u16(vcombine_u16(pkts[0], pkts[1])); + mbuf.val[1] = vreinterpretq_u8_u16(vcombine_u16(pkts[2], pkts[3])); + + /* refcnt = 1 and nb_segs = 1 */ + uint32x4_t def_ref_seg = vdupq_n_u32(0x10001); + /* Check refcnt and nb_segs. */ + uint32x4_t ref_seg = vreinterpretq_u32_u8(vqtbl2q_u8(mbuf, ref_seg_msk)); + poly128_t cmp1 = vreinterpretq_p128_u32(~vceqq_u32(ref_seg, def_ref_seg)); + if (unlikely(cmp1)) + return -1; + + /* Check headroom is enough. 
*/ + uint16x4_t head_rooms = vdup_n_u16(head_size); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) != + offsetof(struct rte_mbuf, rearm_data)); + uint16x4_t data_offset = vreinterpret_u16_u8(vqtbl2_u8(mbuf, data_msk)); + uint64x1_t cmp2 = vreinterpret_u64_u16(vclt_u16(data_offset, head_rooms)); + if (unlikely(vget_lane_u64(cmp2, 0))) + return -1; + + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + dxp = &vq->vq_descx[idx + i]; + dxp->ndescs = 1; + dxp->cookie = tx_pkts[i]; + } + + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + tx_pkts[i]->data_off -= head_size; + tx_pkts[i]->data_len += head_size; + } + + uint64x2x2_t desc[PACKED_BATCH_SIZE / 2]; + uint64x2_t base_addr0 = { + VIRTIO_MBUF_ADDR(tx_pkts[0], vq) + tx_pkts[0]->data_off, + VIRTIO_MBUF_ADDR(tx_pkts[1], vq) + tx_pkts[1]->data_off + }; + uint64x2_t base_addr1 = { + VIRTIO_MBUF_ADDR(tx_pkts[2], vq) + tx_pkts[2]->data_off, + VIRTIO_MBUF_ADDR(tx_pkts[3], vq) + tx_pkts[3]->data_off + }; + + desc[0].val[0] = base_addr0; + desc[1].val[0] = base_addr1; + + uint64_t flags = (uint64_t)vq->vq_packed.cached_flags << FLAGS_LEN_BITS_OFFSET; + uint64x2_t tx_desc0 = { + flags | (uint64_t)idx << ID_BITS_OFFSET | tx_pkts[0]->data_len, + flags | (uint64_t)(idx + 1) << ID_BITS_OFFSET | tx_pkts[1]->data_len + }; + + uint64x2_t tx_desc1 = { + flags | (uint64_t)(idx + 2) << ID_BITS_OFFSET | tx_pkts[2]->data_len, + flags | (uint64_t)(idx + 3) << ID_BITS_OFFSET | tx_pkts[3]->data_len + }; + + desc[0].val[1] = tx_desc0; + desc[1].val[1] = tx_desc1; + + if (!vq->hw->has_tx_offload) { + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = rte_pktmbuf_mtod_offset(tx_pkts[i], + struct virtio_net_hdr *, -head_size); + /* Clear net hdr. */ + uint16x8_t v_hdr = vld1q_u16((void *)hdr); + vst1q_u16((void *)hdr, vandq_u16(v_hdr, net_hdr_msk)); + } + } else { + virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = rte_pktmbuf_mtod_offset(tx_pkts[i], + struct virtio_net_hdr *, -head_size); + virtqueue_xmit_offload(hdr, tx_pkts[i], true); + } + } + + /* Enqueue packet buffers. 
*/ + p_desc = &vq->vq_packed.ring.desc[idx]; + vst2q_u64((uint64_t *)p_desc, desc[0]); + vst2q_u64((uint64_t *)(p_desc + 2), desc[1]); + + virtio_update_batch_stats(&txvq->stats, tx_pkts[0]->pkt_len, + tx_pkts[1]->pkt_len, tx_pkts[2]->pkt_len, + tx_pkts[3]->pkt_len); + + vq->vq_avail_idx += PACKED_BATCH_SIZE; + vq->vq_free_cnt -= PACKED_BATCH_SIZE; + + if (vq->vq_avail_idx >= vq->vq_nentries) { + vq->vq_avail_idx -= vq->vq_nentries; + vq->vq_packed.cached_flags ^= + VRING_PACKED_DESC_F_AVAIL_USED; + } + + return 0; +} + static inline uint16_t virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq, struct rte_mbuf **rx_pkts) From patchwork Tue Nov 17 10:06:35 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Joyce Kong X-Patchwork-Id: 84262 X-Patchwork-Delegate: maxime.coquelin@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id 67AFCA04DB; Tue, 17 Nov 2020 11:08:34 +0100 (CET) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id C03FEC900; Tue, 17 Nov 2020 11:07:31 +0100 (CET) Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by dpdk.org (Postfix) with ESMTP id 4D1DDC8DC for ; Tue, 17 Nov 2020 11:07:30 +0100 (CET) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id C57A81478; Tue, 17 Nov 2020 02:07:28 -0800 (PST) Received: from net-arm-thunderx2-03.shanghai.arm.com (net-arm-thunderx2-03.shanghai.arm.com [10.169.208.206]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPA id 4E8F83F718; Tue, 17 Nov 2020 02:07:26 -0800 (PST) From: Joyce Kong To: maxime.coquelin@redhat.com, chenbo.xia@intel.com, jerinj@marvell.com, ruifeng.wang@arm.com, honnappa.nagarahalli@arm.com Cc: dev@dpdk.org, nd@arm.com Date: Tue, 17 Nov 2020 18:06:35 +0800 Message-Id: <20201117100635.27690-5-joyce.kong@arm.com> X-Mailer: git-send-email 2.28.0 In-Reply-To: <20201117100635.27690-1-joyce.kong@arm.com> References: <20200911120906.45995-1-joyce.kong@arm.com> <20201117100635.27690-1-joyce.kong@arm.com> MIME-Version: 1.0 Subject: [dpdk-dev] [PATCH v1 4/4] net/virtio: add election for packed vector NEON path X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Add NEON vectorized path selection logic. The default setting comes from the vectorized devarg, and then each criterion is checked. The packed ring vectorized NEON path requires that: NEON is supported by both the compiler and the host; the VERSION_1 and IN_ORDER features are negotiated; the mergeable feature is not negotiated; and LRO offloading is disabled. Signed-off-by: Joyce Kong Reviewed-by: Ruifeng Wang Reviewed-by: Maxime Coquelin --- doc/guides/nics/virtio.rst | 6 +++--- drivers/net/virtio/meson.build | 1 + drivers/net/virtio/virtio_ethdev.c | 19 +++++++++++++++---- drivers/net/virtio/virtio_rxtx_packed.c | 2 ++ drivers/net/virtio/virtio_user_ethdev.c | 2 +- 5 files changed, 22 insertions(+), 8 deletions(-) diff --git a/doc/guides/nics/virtio.rst b/doc/guides/nics/virtio.rst index c03c2d0fe..b7be3aca1 100644 --- a/doc/guides/nics/virtio.rst +++ b/doc/guides/nics/virtio.rst @@ -483,11 +483,11 @@ according to below configuration: #.
Packed virtqueue in-order non-mergeable path: If in-order feature is negotiated and Rx mergeable is not negotiated, this path will be selected. #. Packed virtqueue vectorized Rx path: If building and running environment support - AVX512 && in-order feature is negotiated && Rx mergeable is not negotiated && - TCP_LRO Rx offloading is disabled && vectorized option enabled, + (AVX512 || NEON) && in-order feature is negotiated && Rx mergeable + is not negotiated && TCP_LRO Rx offloading is disabled && vectorized option enabled, this path will be selected. #. Packed virtqueue vectorized Tx path: If building and running environment support - AVX512 && in-order feature is negotiated && vectorized option enabled, + (AVX512 || NEON) && in-order feature is negotiated && vectorized option enabled, this path will be selected. Rx/Tx callbacks of each Virtio path diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build index 01b8de6d4..738d66746 100644 --- a/drivers/net/virtio/meson.build +++ b/drivers/net/virtio/meson.build @@ -32,6 +32,7 @@ if arch_subdir == 'x86' elif arch_subdir == 'ppc' sources += files('virtio_rxtx_simple_altivec.c') elif arch_subdir == 'arm' and host_machine.cpu_family().startswith('aarch64') + sources += files('virtio_rxtx_packed.c') sources += files('virtio_rxtx_simple_neon.c') endif diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c index 6c233b75b..54a6d6ca9 100644 --- a/drivers/net/virtio/virtio_ethdev.c +++ b/drivers/net/virtio/virtio_ethdev.c @@ -1967,12 +1967,12 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev) if (!vtpci_packed_queue(hw)) { hw->use_vec_rx = 1; } else { -#if !defined(CC_AVX512_SUPPORT) - PMD_DRV_LOG(INFO, - "building environment do not support packed ring vectorized"); -#else +#if defined(CC_AVX512_SUPPORT) || defined(RTE_ARCH_ARM) hw->use_vec_rx = 1; hw->use_vec_tx = 1; +#else + PMD_DRV_LOG(INFO, + "building environment do not support packed ring vectorized"); #endif } } @@ -2320,6 +2320,17 @@ virtio_dev_configure(struct rte_eth_dev *dev) hw->use_vec_rx = 0; hw->use_vec_tx = 0; } +#elif defined(RTE_ARCH_ARM) + if ((hw->use_vec_rx || hw->use_vec_tx) && + (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) || + !vtpci_with_feature(hw, VIRTIO_F_IN_ORDER) || + !vtpci_with_feature(hw, VIRTIO_F_VERSION_1) || + rte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_128)) { + PMD_DRV_LOG(INFO, + "disabled packed ring vectorized path for requirements not met"); + hw->use_vec_rx = 0; + hw->use_vec_tx = 0; + } #else hw->use_vec_rx = 0; hw->use_vec_tx = 0; diff --git a/drivers/net/virtio/virtio_rxtx_packed.c b/drivers/net/virtio/virtio_rxtx_packed.c index 99d9a5a99..882dca36e 100644 --- a/drivers/net/virtio/virtio_rxtx_packed.c +++ b/drivers/net/virtio/virtio_rxtx_packed.c @@ -18,6 +18,8 @@ #ifdef CC_AVX512_SUPPORT #include "virtio_rxtx_packed_avx.h" +#elif defined(RTE_ARCH_ARM) +#include "virtio_rxtx_packed_neon.h" #endif uint16_t diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c index 40345193e..241808cd8 100644 --- a/drivers/net/virtio/virtio_user_ethdev.c +++ b/drivers/net/virtio/virtio_user_ethdev.c @@ -856,7 +856,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev) if (vectorized) { if (packed_vq) { -#if defined(CC_AVX512_SUPPORT) +#if defined(CC_AVX512_SUPPORT) || defined(RTE_ARCH_ARM) hw->use_vec_rx = 1; hw->use_vec_tx = 1; #else
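Taken together, the devarg handling in virtio_user_pmd_probe() and the checks added to virtio_dev_configure() amount to a single predicate for the packed ring vectorized NEON path. The helper below is only an illustrative condensation of those conditions; it does not exist in the driver, its name is invented here, and the mergeable/LRO checks mentioned in the commit message are handled by pre-existing code that this patch does not touch.

#include <stdbool.h>
#include <rte_cpuflags.h>
#include <rte_vect.h>
#include "virtio_pci.h"

/* Illustrative only: condenses the conditions this patch checks before
 * keeping the packed ring vectorized NEON path enabled. The real checks
 * live in virtio_dev_configure(); this helper is not part of the driver.
 */
static bool
virtio_packed_vec_neon_usable(struct virtio_hw *hw)
{
	/* Vector paths must have been requested (e.g. via the
	 * "vectorized" devarg for virtio_user). */
	if (!hw->use_vec_rx && !hw->use_vec_tx)
		return false;

	/* NEON must be present on the host and allowed by the
	 * --force-max-simd-bitwidth EAL setting. */
	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) ||
	    rte_vect_get_max_simd_bitwidth() < RTE_VECT_SIMD_128)
		return false;

	/* VERSION_1 and IN_ORDER must both be negotiated. */
	if (!vtpci_with_feature(hw, VIRTIO_F_VERSION_1) ||
	    !vtpci_with_feature(hw, VIRTIO_F_IN_ORDER))
		return false;

	/* Mergeable Rx buffers and TCP LRO rule the vector Rx path out;
	 * those checks are done by existing code outside this patch. */
	return true;
}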
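The NEON intrinsics in virtqueue_enqueue_batch_packed_vec(), added earlier in this series, are easier to follow next to a scalar equivalent. The sketch below is illustrative only: it mirrors the same per-batch checks and descriptor packing without the vector loads and stores, and it omits the virtio-net header clearing/offload, the statistics update, and any store-ordering concerns that the real function handles.

/* Illustrative scalar counterpart of the NEON batched Tx enqueue.
 * Not part of the patch series; assumes the same helpers and macros
 * (PACKED_BATCH_SIZE, VIRTIO_MBUF_ADDR, ...) are in scope. */
static inline int
virtqueue_enqueue_batch_packed_scalar(struct virtnet_tx *txvq,
		struct rte_mbuf **tx_pkts)
{
	struct virtqueue *vq = txvq->vq;
	uint16_t head_size = vq->hw->vtnet_hdr_size;
	uint16_t idx = vq->vq_avail_idx;
	uint16_t flags = vq->vq_packed.cached_flags;
	uint16_t i;

	/* Batch must start on a 4-descriptor boundary and fit in the ring. */
	if ((idx & PACKED_BATCH_MASK) ||
			unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
		return -1;

	/* The NEON path folds these per-mbuf checks into two vector
	 * compares: refcnt == 1, nb_segs == 1, and enough headroom. */
	for (i = 0; i < PACKED_BATCH_SIZE; i++) {
		if (tx_pkts[i]->refcnt != 1 || tx_pkts[i]->nb_segs != 1 ||
				tx_pkts[i]->data_off < head_size)
			return -1;
	}

	for (i = 0; i < PACKED_BATCH_SIZE; i++) {
		struct vring_packed_desc *p_desc =
				&vq->vq_packed.ring.desc[idx + i];

		vq->vq_descx[idx + i].ndescs = 1;
		vq->vq_descx[idx + i].cookie = tx_pkts[i];

		/* Prepend room for the virtio-net header. */
		tx_pkts[i]->data_off -= head_size;
		tx_pkts[i]->data_len += head_size;

		/* The vector path packs addr/len/id/flags via vst2q_u64. */
		p_desc->addr = VIRTIO_MBUF_ADDR(tx_pkts[i], vq) +
				tx_pkts[i]->data_off;
		p_desc->len = tx_pkts[i]->data_len;
		p_desc->id = idx + i;
		p_desc->flags = flags;
	}

	vq->vq_avail_idx += PACKED_BATCH_SIZE;
	vq->vq_free_cnt -= PACKED_BATCH_SIZE;
	if (vq->vq_avail_idx >= vq->vq_nentries) {
		vq->vq_avail_idx -= vq->vq_nentries;
		vq->vq_packed.cached_flags ^= VRING_PACKED_DESC_F_AVAIL_USED;
	}

	return 0;
}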