[v10,6/9] net/virtio: add vectorized packed ring Rx path

Message ID 20200426021943.43158-7-yong.liu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series [v10,1/9] net/virtio: add Rx free threshold setting

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues

Commit Message

Marvin Liu April 26, 2020, 2:19 a.m. UTC
  Optimize the packed ring Rx path with SIMD instructions. The approach is
similar to the vhost optimization: the path is split into batch and single
functions, and the batch function is further optimized with AVX512
instructions. Also pad the desc extra structure to a 16-byte alignment, so
that four elements can be handled in one batch.

Signed-off-by: Marvin Liu <yong.liu@intel.com>
  
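For context, a minimal standalone sketch (not part of the patch) of the arithmetic
behind the four-element batch, assuming the standard 16-byte packed ring descriptor
layout and 64-byte cache lines:

/* Sketch only: shows why PACKED_BATCH_SIZE in the patch evaluates to 4. */
#include <stdint.h>
#include <stdio.h>

struct vring_packed_desc {	/* standard packed ring descriptor: 16 bytes */
	uint64_t addr;
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

#define CACHE_LINE_SIZE 64	/* assumption: RTE_CACHE_LINE_SIZE is 64 on x86 */

int main(void)
{
	/* One cache line holds 64 / 16 = 4 descriptors, hence batches of four. */
	printf("batch size = %zu\n",
	       CACHE_LINE_SIZE / sizeof(struct vring_packed_desc));
	return 0;
}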

Comments

Maxime Coquelin April 27, 2020, 11:20 a.m. UTC | #1
On 4/26/20 4:19 AM, Marvin Liu wrote:
> Optimize the packed ring Rx path with SIMD instructions. The approach is
> similar to the vhost optimization: the path is split into batch and single
> functions, and the batch function is further optimized with AVX512
> instructions. Also pad the desc extra structure to a 16-byte alignment, so
> that four elements can be handled in one batch.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> index c9edb84ee..102b1deab 100644
> --- a/drivers/net/virtio/Makefile
> +++ b/drivers/net/virtio/Makefile
> @@ -36,6 +36,41 @@ else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
>  endif
>  
> +ifneq ($(FORCE_DISABLE_AVX512), y)
> +	CC_AVX512_SUPPORT=\
> +	$(shell $(CC) -march=native -dM -E - </dev/null 2>&1 | \
> +	sed '/./{H;$$!d} ; x ; /AVX512F/!d; /AVX512BW/!d; /AVX512VL/!d' | \
> +	grep -q AVX512 && echo 1)
> +endif
> +
> +ifeq ($(CC_AVX512_SUPPORT), 1)
> +CFLAGS += -DCC_AVX512_SUPPORT
> +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_packed_avx.c
> +
> +ifeq ($(RTE_TOOLCHAIN), gcc)
> +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> +CFLAGS += -DVIRTIO_GCC_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), clang)
> +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1)
> +CFLAGS += -DVIRTIO_CLANG_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), icc)
> +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
> +CFLAGS += -DVIRTIO_ICC_UNROLL_PRAGMA
> +endif
> +endif
> +
> +CFLAGS_virtio_rxtx_packed_avx.o += -mavx512f -mavx512bw -mavx512vl
> +ifeq ($(shell test $(GCC_VERSION) -ge 100 && echo 1), 1)
> +CFLAGS_virtio_rxtx_packed_avx.o += -Wno-zero-length-bounds
> +endif
> +endif
> +
>  ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
> diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
> index 15150eea1..8e68c3039 100644
> --- a/drivers/net/virtio/meson.build
> +++ b/drivers/net/virtio/meson.build
> @@ -9,6 +9,20 @@ sources += files('virtio_ethdev.c',
>  deps += ['kvargs', 'bus_pci']
>  
>  if arch_subdir == 'x86'
> +	if '-mno-avx512f' not in machine_args
> +		if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
> +			cflags += ['-mavx512f', '-mavx512bw', '-mavx512vl']
> +			cflags += ['-DCC_AVX512_SUPPORT']
> +			if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
> +				cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
> +			elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> +				cflags += '-DVHOST_CLANG_UNROLL_PRAGMA'
> +			elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0'))
> +				cflags += '-DVHOST_ICC_UNROLL_PRAGMA'
> +			endif
> +			sources += files('virtio_rxtx_packed_avx.c')
> +		endif
> +	endif
>  	sources += files('virtio_rxtx_simple_sse.c')
>  elif arch_subdir == 'ppc'
>  	sources += files('virtio_rxtx_simple_altivec.c')
> diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> index febaf17a8..5c112cac7 100644
> --- a/drivers/net/virtio/virtio_ethdev.h
> +++ b/drivers/net/virtio/virtio_ethdev.h
> @@ -105,6 +105,9 @@ uint16_t virtio_xmit_pkts_inorder(void *tx_queue, struct rte_mbuf **tx_pkts,
>  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		uint16_t nb_pkts);
>  
> +uint16_t virtio_recv_pkts_packed_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +		uint16_t nb_pkts);
> +
>  int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
>  
>  void virtio_interrupt_handler(void *param);
> diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> index a549991aa..534562cca 100644
> --- a/drivers/net/virtio/virtio_rxtx.c
> +++ b/drivers/net/virtio/virtio_rxtx.c
> @@ -2030,3 +2030,11 @@ virtio_xmit_pkts_inorder(void *tx_queue,
>  
>  	return nb_tx;
>  }
> +
> +__rte_weak uint16_t
> +virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
> +			    struct rte_mbuf **rx_pkts __rte_unused,
> +			    uint16_t nb_pkts __rte_unused)
> +{
> +	return 0;
> +}
> diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> new file mode 100644
> index 000000000..8a7b459eb
> --- /dev/null
> +++ b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> @@ -0,0 +1,374 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <rte_net.h>
> +
> +#include "virtio_logs.h"
> +#include "virtio_ethdev.h"
> +#include "virtio_pci.h"
> +#include "virtqueue.h"
> +
> +#define BYTE_SIZE 8
> +/* flag bits offset in packed ring desc higher 64bits */
> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> +
> +#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
> +	FLAGS_BITS_OFFSET)
> +
> +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
> +	sizeof(struct vring_packed_desc))
> +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> +
> +#ifdef VIRTIO_GCC_UNROLL_PRAGMA
> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
> +	for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
> +	for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifdef VIRTIO_ICC_UNROLL_PRAGMA
> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
> +	for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifndef virtio_for_each_try_unroll
> +#define virtio_for_each_try_unroll(iter, val, num) \
> +	for (iter = val; iter < num; iter++)
> +#endif
> +
> +static inline void
> +virtio_update_batch_stats(struct virtnet_stats *stats,
> +			  uint16_t pkt_len1,
> +			  uint16_t pkt_len2,
> +			  uint16_t pkt_len3,
> +			  uint16_t pkt_len4)
> +{
> +	stats->bytes += pkt_len1;
> +	stats->bytes += pkt_len2;
> +	stats->bytes += pkt_len3;
> +	stats->bytes += pkt_len4;
> +}
> +
> +/* Optionally fill offload information in structure */
> +static inline int
> +virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
> +{
> +	struct rte_net_hdr_lens hdr_lens;
> +	uint32_t hdrlen, ptype;
> +	int l4_supported = 0;
> +
> +	/* nothing to do */
> +	if (hdr->flags == 0)
> +		return 0;
> +
> +	/* GSO not supported in vec path, skip check */
> +	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
> +
> +	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
> +	m->packet_type = ptype;
> +	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
> +	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
> +	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
> +		l4_supported = 1;
> +
> +	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
> +		if (hdr->csum_start <= hdrlen && l4_supported) {
> +			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
> +		} else {
> +			/* Unknown proto or tunnel, do sw cksum. We can assume
> +			 * the cksum field is in the first segment since the
> +			 * buffers we provided to the host are large enough.
> +			 * In case of SCTP, this will be wrong since it's a CRC
> +			 * but there's nothing we can do.
> +			 */
> +			uint16_t csum = 0, off;
> +
> +			rte_raw_cksum_mbuf(m, hdr->csum_start,
> +				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
> +				&csum);
> +			if (likely(csum != 0xffff))
> +				csum = ~csum;
> +			off = hdr->csum_offset + hdr->csum_start;
> +			if (rte_pktmbuf_data_len(m) >= off + 1)
> +				*rte_pktmbuf_mtod_offset(m, uint16_t *,
> +					off) = csum;
> +		}
> +	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
> +		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
> +	}
> +
> +	return 0;
> +}
> +
> +static inline uint16_t
> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
> +				   struct rte_mbuf **rx_pkts)
> +{
> +	struct virtqueue *vq = rxvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t hdr_size = hw->vtnet_hdr_size;
> +	uint64_t addrs[PACKED_BATCH_SIZE];
> +	uint16_t id = vq->vq_used_cons_idx;
> +	uint8_t desc_stats;
> +	uint16_t i;
> +	void *desc_addr;
> +
> +	if (id & PACKED_BATCH_MASK)
> +		return -1;
> +
> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
> +		return -1;
> +
> +	/* only care avail/used bits */
> +	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
> +	desc_addr = &vq->vq_packed.ring.desc[id];
> +
> +	__m512i v_desc = _mm512_loadu_si512(desc_addr);
> +	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
> +
> +	__m512i v_used_flag = _mm512_setzero_si512();
> +	if (vq->vq_packed.used_wrap_counter)
> +		v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
> +
> +	/* Check all descs are used */
> +	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
> +	if (desc_stats)
> +		return -1;
> +
> +	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
> +		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
> +
> +		addrs[i] = (uint64_t)rx_pkts[i]->rx_descriptor_fields1;
> +	}
> +
> +	/*
> +	 * load len from desc, store into mbuf pkt_len and data_len
> +	 * len is limited by 16-bit buf_len, pkt_len[16:31] can be ignored
> +	 */
> +	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
> +	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA);
> +
> +	/* reduce hdr_len from pkt_len and data_len */
> +	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
> +			(uint32_t)-hdr_size);
> +
> +	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
> +
> +	/* assert offset of data_len */
> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
> +		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
> +
> +	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
> +					   addrs[2] + 8, addrs[2],
> +					   addrs[1] + 8, addrs[1],
> +					   addrs[0] + 8, addrs[0]);
> +	/* batch store into mbufs */
> +	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
> +
> +	if (hw->has_rx_offload) {
> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			char *addr = (char *)rx_pkts[i]->buf_addr +
> +				RTE_PKTMBUF_HEADROOM - hdr_size;
> +			virtio_vec_rx_offload(rx_pkts[i],
> +					(struct virtio_net_hdr *)addr);
> +		}
> +	}
> +
> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
> +			rx_pkts[3]->pkt_len);
> +
> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
> +
> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
> +		vq->vq_used_cons_idx -= vq->vq_nentries;
> +		vq->vq_packed.used_wrap_counter ^= 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static uint16_t
> +virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
> +				    struct rte_mbuf **rx_pkts)
> +{
> +	uint16_t used_idx, id;
> +	uint32_t len;
> +	struct virtqueue *vq = rxvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint32_t hdr_size = hw->vtnet_hdr_size;
> +	struct virtio_net_hdr *hdr;
> +	struct vring_packed_desc *desc;
> +	struct rte_mbuf *cookie;
> +
> +	desc = vq->vq_packed.ring.desc;
> +	used_idx = vq->vq_used_cons_idx;
> +	if (!desc_is_used(&desc[used_idx], vq))
> +		return -1;
> +
> +	len = desc[used_idx].len;
> +	id = desc[used_idx].id;
> +	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
> +	if (unlikely(cookie == NULL)) {
> +		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
> +				vq->vq_used_cons_idx);
> +		return -1;
> +	}
> +	rte_prefetch0(cookie);
> +	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
> +
> +	cookie->data_off = RTE_PKTMBUF_HEADROOM;
> +	cookie->ol_flags = 0;
> +	cookie->pkt_len = (uint32_t)(len - hdr_size);
> +	cookie->data_len = (uint32_t)(len - hdr_size);
> +
> +	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
> +					RTE_PKTMBUF_HEADROOM - hdr_size);
> +	if (hw->has_rx_offload)
> +		virtio_vec_rx_offload(cookie, hdr);
> +
> +	*rx_pkts = cookie;
> +
> +	rxvq->stats.bytes += cookie->pkt_len;
> +
> +	vq->vq_free_cnt++;
> +	vq->vq_used_cons_idx++;
> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
> +		vq->vq_used_cons_idx -= vq->vq_nentries;
> +		vq->vq_packed.used_wrap_counter ^= 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static inline void
> +virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
> +			      struct rte_mbuf **cookie,
> +			      uint16_t num)
> +{
> +	struct virtqueue *vq = rxvq->vq;
> +	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
> +	uint16_t flags = vq->vq_packed.cached_flags;
> +	struct virtio_hw *hw = vq->hw;
> +	struct vq_desc_extra *dxp;
> +	uint16_t idx, i;
> +	uint16_t batch_num, total_num = 0;
> +	uint16_t head_idx = vq->vq_avail_idx;
> +	uint16_t head_flag = vq->vq_packed.cached_flags;
> +	uint64_t addr;
> +
> +	do {
> +		idx = vq->vq_avail_idx;
> +
> +		batch_num = PACKED_BATCH_SIZE;
> +		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
> +			batch_num = vq->vq_nentries - idx;
> +		if (unlikely((total_num + batch_num) > num))
> +			batch_num = num - total_num;
> +
> +		virtio_for_each_try_unroll(i, 0, batch_num) {
> +			dxp = &vq->vq_descx[idx + i];
> +			dxp->cookie = (void *)cookie[total_num + i];
> +
> +			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
> +				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
> +			start_dp[idx + i].addr = addr;
> +			start_dp[idx + i].len = cookie[total_num + i]->buf_len
> +				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
> +			if (total_num || i) {
> +				virtqueue_store_flags_packed(&start_dp[idx + i],
> +						flags, hw->weak_barriers);
> +			}
> +		}
> +
> +		vq->vq_avail_idx += batch_num;
> +		if (vq->vq_avail_idx >= vq->vq_nentries) {
> +			vq->vq_avail_idx -= vq->vq_nentries;
> +			vq->vq_packed.cached_flags ^=
> +				VRING_PACKED_DESC_F_AVAIL_USED;
> +			flags = vq->vq_packed.cached_flags;
> +		}
> +		total_num += batch_num;
> +	} while (total_num < num);
> +
> +	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
> +				hw->weak_barriers);
> +	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
> +}
> +
> +uint16_t
> +virtio_recv_pkts_packed_vec(void *rx_queue,
> +			    struct rte_mbuf **rx_pkts,
> +			    uint16_t nb_pkts)
> +{
> +	struct virtnet_rx *rxvq = rx_queue;
> +	struct virtqueue *vq = rxvq->vq;
> +	struct virtio_hw *hw = vq->hw;
> +	uint16_t num, nb_rx = 0;
> +	uint32_t nb_enqueued = 0;
> +	uint16_t free_cnt = vq->vq_free_thresh;
> +
> +	if (unlikely(hw->started == 0))
> +		return nb_rx;
> +
> +	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
> +	if (likely(num > PACKED_BATCH_SIZE))
> +		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
> +
> +	while (num) {
> +		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
> +					&rx_pkts[nb_rx])) {
> +			nb_rx += PACKED_BATCH_SIZE;
> +			num -= PACKED_BATCH_SIZE;
> +			continue;
> +		}
> +		if (!virtqueue_dequeue_single_packed_vec(rxvq,
> +					&rx_pkts[nb_rx])) {
> +			nb_rx++;
> +			num--;
> +			continue;
> +		}
> +		break;
> +	};
> +
> +	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
> +
> +	rxvq->stats.packets += nb_rx;
> +
> +	if (likely(vq->vq_free_cnt >= free_cnt)) {
> +		struct rte_mbuf *new_pkts[free_cnt];
> +		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
> +						free_cnt) == 0)) {
> +			virtio_recv_refill_packed_vec(rxvq, new_pkts,
> +					free_cnt);
> +			nb_enqueued += free_cnt;
> +		} else {
> +			struct rte_eth_dev *dev =
> +				&rte_eth_devices[rxvq->port_id];
> +			dev->data->rx_mbuf_alloc_failed += free_cnt;
> +		}
> +	}
> +
> +	if (likely(nb_enqueued)) {
> +		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
> +			virtqueue_notify(vq);
> +			PMD_RX_LOG(DEBUG, "Notified");
> +		}
> +	}
> +
> +	return nb_rx;
> +}
> diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
> index 40ad786cc..c54698ad1 100644
> --- a/drivers/net/virtio/virtio_user_ethdev.c
> +++ b/drivers/net/virtio/virtio_user_ethdev.c
> @@ -528,6 +528,7 @@ virtio_user_eth_dev_alloc(struct rte_vdev_device *vdev)
>  	hw->use_msix = 1;
>  	hw->modern   = 0;
>  	hw->use_vec_rx = 0;
> +	hw->use_vec_tx = 0;
>  	hw->use_inorder_rx = 0;
>  	hw->use_inorder_tx = 0;
>  	hw->virtio_user_dev = dev;
> @@ -739,8 +740,19 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev)
>  		goto end;
>  	}
>  
> -	if (vectorized)
> -		hw->use_vec_rx = 1;
> +	if (vectorized) {
> +		if (packed_vq) {
> +#if defined(CC_AVX512_SUPPORT)
> +			hw->use_vec_rx = 1;
> +			hw->use_vec_tx = 1;
> +#else
> +			PMD_INIT_LOG(INFO,
> +				"building environment do not support packed ring vectorized");
> +#endif
> +		} else {
> +			hw->use_vec_rx = 1;
> +		}
> +	}
>  
>  	rte_eth_dev_probing_finish(eth_dev);
>  	ret = 0;
> diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
> index ca1c10499..ce0340743 100644
> --- a/drivers/net/virtio/virtqueue.h
> +++ b/drivers/net/virtio/virtqueue.h
> @@ -239,7 +239,8 @@ struct vq_desc_extra {
>  	void *cookie;
>  	uint16_t ndescs;
>  	uint16_t next;
> -};
> +	uint8_t padding[4];
> +} __rte_packed __rte_aligned(16);

Can't this introduce a performance impact for the non-vectorized
case? I think of worse cache line utilization.

For example with a burst of 32 descriptors with 32B cachelines, before
it would take 14 cachelines, after 16. So for each burst, one could face
2 extra cache misses.

If you could run non-vectorized benchmarks with and without that patch,
I would be grateful.
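
For reference, a small standalone sketch (not from the patch; the figures depend
on the target ABI) that prints the size and alignment of the structure before and
after the change:

/* Sketch: compare vq_desc_extra layout with and without the added padding/alignment. */
#include <stdint.h>
#include <stdio.h>
#include <stdalign.h>

struct vq_desc_extra_before {
	void *cookie;
	uint16_t ndescs;
	uint16_t next;
};

struct vq_desc_extra_after {
	void *cookie;
	uint16_t ndescs;
	uint16_t next;
	uint8_t padding[4];
} __attribute__((packed, aligned(16)));	/* stands in for __rte_packed __rte_aligned(16) */

int main(void)
{
	printf("before: size=%zu align=%zu\n", sizeof(struct vq_desc_extra_before),
	       alignof(struct vq_desc_extra_before));
	printf("after:  size=%zu align=%zu\n", sizeof(struct vq_desc_extra_after),
	       alignof(struct vq_desc_extra_after));
	return 0;
}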

Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Thanks,
Maxime
  
Marvin Liu April 28, 2020, 1:14 a.m. UTC | #2
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, April 27, 2020 7:21 PM
> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> On 4/26/20 4:19 AM, Marvin Liu wrote:
> > [snip]
> > diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
> > index ca1c10499..ce0340743 100644
> > --- a/drivers/net/virtio/virtqueue.h
> > +++ b/drivers/net/virtio/virtqueue.h
> > @@ -239,7 +239,8 @@ struct vq_desc_extra {
> >  	void *cookie;
> >  	uint16_t ndescs;
> >  	uint16_t next;
> > -};
> > +	uint8_t padding[4];
> > +} __rte_packed __rte_aligned(16);
> 
> Can't this introduce a performance impact for the non-vectorized
> case? I think of worse cache line utilization.
> 
> For example with a burst of 32 descriptors with 32B cachelines, before
> it would take 14 cachelines, after 16. So for each burst, one could face
> 2 extra cache misses.
> 
> If you could run non-vectorized benchmarks with and without that patch,
> I would be grateful.
> 

Maxime,
Thanks for pointing it out, it will add an extra cache miss in the datapath.
And its impact on performance is around 1% in the loopback case,
while the benefit of the vectorized path will be more than that number.

Thanks,
Marvin

> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> 
> Thanks,
> Maxime
  
Maxime Coquelin April 28, 2020, 8:44 a.m. UTC | #3
On 4/28/20 3:14 AM, Liu, Yong wrote:
> 
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Monday, April 27, 2020 7:21 PM
>> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
>> Wang, Zhihong <zhihong.wang@intel.com>
>> Cc: dev@dpdk.org
>> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
>>
>>
>>
>> On 4/26/20 4:19 AM, Marvin Liu wrote:
>>> [snip]
>>> diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
>>> index ca1c10499..ce0340743 100644
>>> --- a/drivers/net/virtio/virtqueue.h
>>> +++ b/drivers/net/virtio/virtqueue.h
>>> @@ -239,7 +239,8 @@ struct vq_desc_extra {
>>>  	void *cookie;
>>>  	uint16_t ndescs;
>>>  	uint16_t next;
>>> -};
>>> +	uint8_t padding[4];
>>> +} __rte_packed __rte_aligned(16);
>>
>> Can't this introduce a performance impact for the non-vectorized
>> case? I think of worse cache line utilization.
>>
>> For example with a burst of 32 descriptors with 32B cachelines, before
>> it would take 14 cachelines, after 16. So for each burst, one could face
>> 2 extra cache misses.
>>
>> If you could run non-vectorized benchmarks with and without that patch,
>> I would be grateful.
>>
> 
> Maxime,
> Thanks for pointing it out, it will add an extra cache miss in the datapath. 
> Its impact on performance is around 1% in the loopback case. 

Ok, thanks for doing the test. I'll try to run some PVP benchmarks
on my side because when doing IO loopback, the cache pressure is
much less important.

> While the benefit of the vectorized path will be more than that number.

Ok, but I disagree for two reasons:
 1. You have to keep in mind that non-vectorized is the default and
encouraged mode to use. Indeed, the vectorized path takes a lot of shortcuts,
like not checking the header length (so no error stats), etc...

 2. It's like saying it's OK that it degrades by 5% on $CPU_VENDOR_A because
the gain is 20% on $CPU_VENDOR_B.

In case we see more degradation in a real-world scenario, you might
want to consider using ifdefs to avoid adding the padding in the non-
vectorized case, like you did to differentiate the Virtio PMD from the
Virtio-user PMD in patch 7.
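
Something like this is what I have in mind (just a sketch, keyed on the
CC_AVX512_SUPPORT define this series already introduces, and untested):

#ifdef CC_AVX512_SUPPORT
/* 16B-aligned layout, only when the AVX512 vectorized path is built */
struct vq_desc_extra {
	void *cookie;
	uint16_t ndescs;
	uint16_t next;
	uint8_t padding[4];
} __rte_packed __rte_aligned(16);
#else
/* default layout, unchanged for the non-vectorized case */
struct vq_desc_extra {
	void *cookie;
	uint16_t ndescs;
	uint16_t next;
};
#endif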

Thanks,
Maxime

> Thanks,
> Marvin
> 
>> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>
>> Thanks,
>> Maxime
>
  
Marvin Liu April 28, 2020, 1:01 p.m. UTC | #4
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 28, 2020 4:44 PM
> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> On 4/28/20 3:14 AM, Liu, Yong wrote:
> >
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Monday, April 27, 2020 7:21 PM
> >> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong
> <xiaolong.ye@intel.com>;
> >> Wang, Zhihong <zhihong.wang@intel.com>
> >> Cc: dev@dpdk.org
> >> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
> path
> >>
> >>
> >>
> >> On 4/26/20 4:19 AM, Marvin Liu wrote:
> >>> Optimize packed ring Rx path with SIMD instructions. Solution of
> >>> optimization is pretty like vhost, is that split path into batch and
> >>> single functions. Batch function is further optimized by AVX512
> >>> instructions. Also pad desc extra structure to 16 bytes aligned, thus
> >>> four elements will be saved in one batch.
> >>>
> >>> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >>>
> >>> [...]
> >>> diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
> >>> index ca1c10499..ce0340743 100644
> >>> --- a/drivers/net/virtio/virtqueue.h
> >>> +++ b/drivers/net/virtio/virtqueue.h
> >>> @@ -239,7 +239,8 @@ struct vq_desc_extra {
> >>>  	void *cookie;
> >>>  	uint16_t ndescs;
> >>>  	uint16_t next;
> >>> -};
> >>> +	uint8_t padding[4];
> >>> +} __rte_packed __rte_aligned(16);
> >>
> >> Can't this introduce a performance impact for the non-vectorized
> >> case? I think of worse cache liens utilization.
> >>
> >> For example with a burst of 32 descriptors with 32B cachelines, before
> >> it would take 14 cachelines, after 16. So for each burst, one could face
> >> 2 extra cache misses.
> >>
> >> If you could run non-vectorized benchamrks with and without that patch,
> >> I would be grateful.
> >>
> >
> > Maxime,
> > Thanks for point it out, it will add extra cache miss in datapath.
> > And its impact on performance is around 1% in loopback case.
> 
> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
> on my side because when doing IO loopback, the cache pressure is
> much less important.
> 
> > While benefit of vectorized path will be more than that number.
> 
> Ok, but I disagree for two reasons:
>  1. You have to keep in mind than non-vectorized is the default and
> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
> checking header length (so no error stats), etc...
> 
Ok, I will keep the non-vectorized path the same as before. 

>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A because
> the gain is 20% on $CPU_VENDOR_B.
> 
> In the case we see more degradation in real-world scenario, you might
> want to consider using ifdefs to avoid adding padding in the non-
> vectorized case, like you did to differentiate Virtio PMD to Virtio-user
> PMD in patch 7.
> 

Maxime,
The performance difference is so slight that I ignored it; it looks like a sampling error. 
It may not be suitable to add a new configuration option for a setting that is only used inside the driver.
The virtio driver can check at initialization whether the virtqueue is using the vectorized path, and use the padded structure if it is.
I have added and tested some code, and performance has now come back. Since the code has changed in the initialization process, it needs some time for a regression check.

Regards,
Marvin

> Thanks,
> Maxime
> 
> > Thanks,
> > Marvin
> >
> >> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>
> >> Thanks,
> >> Maxime
> >
  
Maxime Coquelin April 28, 2020, 1:46 p.m. UTC | #5
On 4/28/20 3:01 PM, Liu, Yong wrote:
>>> Maxime,
>>> Thanks for point it out, it will add extra cache miss in datapath.
>>> And its impact on performance is around 1% in loopback case.
>> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
>> on my side because when doing IO loopback, the cache pressure is
>> much less important.
>>
>>> While benefit of vectorized path will be more than that number.
>> Ok, but I disagree for two reasons:
>>  1. You have to keep in mind than non-vectorized is the default and
>> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
>> checking header length (so no error stats), etc...
>>
> Ok, I will keep non-vectorized same as before.
> 
>>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A because
>> the gain is 20% on $CPU_VENDOR_B.
>>
>> In the case we see more degradation in real-world scenario, you might
>> want to consider using ifdefs to avoid adding padding in the non-
>> vectorized case, like you did to differentiate Virtio PMD to Virtio-user
>> PMD in patch 7.
>>
> Maxime,
> The performance difference is so slight, so I ignored for it look like a sampling error. 

Agree for IO loopback, but it adds one more cache line access per burst,
which might be seen in some real-life use cases.

> It maybe not suitable to add new configuration for such setting which only used inside driver.

Wait, the Virtio-user #ifdef is based on the defconfig options? How can
it work since both Virtio PMD and Virtio-user PMD can be selected at the
same time?

I thought it was a define set before the headers inclusion and unset
afterwards, but I didn't check carefully.

> Virtio driver can check whether virtqueue is using vectorized path when initialization, will use padded structure if it is.
> I have added some tested code and now performance came back.  Since code has changed in initialization process,  it need some time for regression check.

Ok, works for me.

I am investigating a linkage issue with your series, which does not
happen systematically (see below, it happens also with clang). David
pointed me to some Intel patches removing the usage of __rte_weak,
could it be related?


gcc  -o app/test/dpdk-test
'app/test/3062f5d@@dpdk-test@exe/commands.c.o'
'app/test/3062f5d@@dpdk-test@exe/packet_burst_generator.c.o'
'app/test/3062f5d@@dpdk-test@exe/test.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_acl.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_alarm.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_atomic.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_barrier.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_bpf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_byteorder.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_cirbuf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_etheraddr.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_ipaddr.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_lib.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_num.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_portlist.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cmdline_string.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_common.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cpuflags.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_crc.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cryptodev.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cryptodev_asym.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cryptodev_blockcipher.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cryptodev_security_pdcp.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_cycles.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_debug.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_distributor.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_distributor_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_eal_flags.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_eal_fs.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_efd.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_efd_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_errno.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_event_crypto_adapter.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_event_eth_rx_adapter.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_event_ring.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_event_timer_adapter.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_eventdev.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_external_mem.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_fbarray.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_fib.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_fib_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_fib6.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_fib6_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_func_reentrancy.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_flow_classify.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_hash.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_hash_functions.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_hash_multiwriter.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_hash_readwrite.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_hash_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_hash_readwrite_lf_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_interrupts.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ipfrag.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ipsec.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ipsec_sad.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_kni.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_kvargs.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_link_bonding.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_link_bonding_rssconf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_logs.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_lpm.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_lpm6.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_lpm6_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_lpm_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_malloc.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_mbuf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_member.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_member_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_memcpy.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_memcpy_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_memory.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_mempool.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_mempool_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_memzone.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_meter.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_metrics.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_mcslock.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_mp_secondary.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_per_lcore.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_pmd_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_power.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_power_cpufreq.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_power_kvm_vm.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_prefetch.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rand_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rawdev.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rcu_qsbr.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rcu_qsbr_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_reciprocal_division.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_reciprocal_division_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_red.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_reorder.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rib.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rib6.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring_mpmc_stress.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring_hts_stress.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring_peek_stress.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring_rts_stress.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ring_stress.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_rwlock.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_sched.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_security.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_service_cores.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_spinlock.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_stack.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_stack_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_string_fns.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_table.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_table_acl.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_table_combined.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_table_pipeline.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_table_ports.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_table_tables.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_tailq.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_thash.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_timer.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_timer_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_timer_racecond.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_timer_secondary.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_ticketlock.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_trace.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_trace_register.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_trace_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_version.c.o'
'app/test/3062f5d@@dpdk-test@exe/virtual_pmd.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_pmd_ring_perf.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_pmd_ring.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_event_eth_tx_adapter.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_bitratestats.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_latencystats.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_link_bonding_mode4.c.o'
'app/test/3062f5d@@dpdk-test@exe/sample_packet_forward.c.o'
'app/test/3062f5d@@dpdk-test@exe/test_pdump.c.o' -Wl,--no-undefined
-Wl,--as-needed -Wl,-O1 -Wl,--whole-archive -Wl,--start-group
drivers/librte_common_cpt.a drivers/librte_common_dpaax.a
drivers/librte_common_iavf.a drivers/librte_common_octeontx.a
drivers/librte_common_octeontx2.a drivers/librte_bus_dpaa.a
drivers/librte_bus_fslmc.a drivers/librte_bus_ifpga.a
drivers/librte_bus_pci.a drivers/librte_bus_vdev.a
drivers/librte_bus_vmbus.a drivers/librte_mempool_bucket.a
drivers/librte_mempool_dpaa.a drivers/librte_mempool_dpaa2.a
drivers/librte_mempool_octeontx.a drivers/librte_mempool_octeontx2.a
drivers/librte_mempool_ring.a drivers/librte_mempool_stack.a
drivers/librte_pmd_af_packet.a drivers/librte_pmd_ark.a
drivers/librte_pmd_atlantic.a drivers/librte_pmd_avp.a
drivers/librte_pmd_axgbe.a drivers/librte_pmd_bond.a
drivers/librte_pmd_bnxt.a drivers/librte_pmd_cxgbe.a
drivers/librte_pmd_dpaa.a drivers/librte_pmd_dpaa2.a
drivers/librte_pmd_e1000.a drivers/librte_pmd_ena.a
drivers/librte_pmd_enetc.a drivers/librte_pmd_enic.a
drivers/librte_pmd_failsafe.a drivers/librte_pmd_fm10k.a
drivers/librte_pmd_i40e.a drivers/librte_pmd_hinic.a
drivers/librte_pmd_hns3.a drivers/librte_pmd_iavf.a
drivers/librte_pmd_ice.a drivers/librte_pmd_igc.a
drivers/librte_pmd_ixgbe.a drivers/librte_pmd_kni.a
drivers/librte_pmd_liquidio.a drivers/librte_pmd_memif.a
drivers/librte_pmd_netvsc.a drivers/librte_pmd_nfp.a
drivers/librte_pmd_null.a drivers/librte_pmd_octeontx.a
drivers/librte_pmd_octeontx2.a drivers/librte_pmd_pfe.a
drivers/librte_pmd_qede.a drivers/librte_pmd_ring.a
drivers/librte_pmd_sfc.a drivers/librte_pmd_softnic.a
drivers/librte_pmd_tap.a drivers/librte_pmd_thunderx.a
drivers/librte_pmd_vdev_netvsc.a drivers/librte_pmd_vhost.a
drivers/librte_pmd_virtio.a drivers/librte_pmd_vmxnet3.a
drivers/librte_rawdev_dpaa2_cmdif.a drivers/librte_rawdev_dpaa2_qdma.a
drivers/librte_rawdev_ioat.a drivers/librte_rawdev_ntb.a
drivers/librte_rawdev_octeontx2_dma.a
drivers/librte_rawdev_octeontx2_ep.a drivers/librte_rawdev_skeleton.a
drivers/librte_pmd_caam_jr.a drivers/librte_pmd_dpaa_sec.a
drivers/librte_pmd_dpaa2_sec.a drivers/librte_pmd_nitrox.a
drivers/librte_pmd_null_crypto.a drivers/librte_pmd_octeontx_crypto.a
drivers/librte_pmd_octeontx2_crypto.a
drivers/librte_pmd_crypto_scheduler.a drivers/librte_pmd_virtio_crypto.a
drivers/librte_pmd_octeontx_compress.a drivers/librte_pmd_qat.a
drivers/librte_pmd_ifc.a drivers/librte_pmd_dpaa_event.a
drivers/librte_pmd_dpaa2_event.a drivers/librte_pmd_octeontx2_event.a
drivers/librte_pmd_opdl_event.a drivers/librte_pmd_skeleton_event.a
drivers/librte_pmd_sw_event.a drivers/librte_pmd_dsw_event.a
drivers/librte_pmd_octeontx_event.a drivers/librte_pmd_bbdev_null.a
drivers/librte_pmd_bbdev_turbo_sw.a
drivers/librte_pmd_bbdev_fpga_lte_fec.a
drivers/librte_pmd_bbdev_fpga_5gnr_fec.a -Wl,--no-whole-archive
-Wl,--no-as-needed -pthread -lm -ldl -lnuma lib/librte_acl.a
lib/librte_eal.a lib/librte_kvargs.a lib/librte_bitratestats.a
lib/librte_ethdev.a lib/librte_net.a lib/librte_mbuf.a
lib/librte_mempool.a lib/librte_ring.a lib/librte_meter.a
lib/librte_metrics.a lib/librte_bpf.a lib/librte_cfgfile.a
lib/librte_cmdline.a lib/librte_cryptodev.a lib/librte_distributor.a
lib/librte_efd.a lib/librte_hash.a lib/librte_eventdev.a
lib/librte_timer.a lib/librte_fib.a lib/librte_rib.a
lib/librte_flow_classify.a lib/librte_table.a lib/librte_port.a
lib/librte_sched.a lib/librte_ip_frag.a lib/librte_kni.a
lib/librte_pci.a lib/librte_lpm.a lib/librte_ipsec.a
lib/librte_security.a lib/librte_latencystats.a lib/librte_member.a
lib/librte_pipeline.a lib/librte_rawdev.a lib/librte_rcu.a
lib/librte_reorder.a lib/librte_stack.a lib/librte_power.a
lib/librte_pdump.a lib/librte_gso.a lib/librte_vhost.a
lib/librte_compressdev.a lib/librte_bbdev.a -Wl,--end-group
'-Wl,-rpath,$ORIGIN/../../lib:$ORIGIN/../../drivers'
-Wl,-rpath-link,/tmp/dpdk_build/meson_buildir_gcc/lib:/tmp/dpdk_build/meson_buildir_gcc/drivers
drivers/librte_pmd_virtio.a(net_virtio_virtio_ethdev.c.o): In function
`set_rxtx_funcs':
virtio_ethdev.c:(.text.unlikely+0x6f): undefined reference to
`virtio_xmit_pkts_packed_vec'
collect2: error: ld returned 1 exit status
ninja: build stopped: subcommand failed.

> Regards,
> Marvin
>
  
Marvin Liu April 28, 2020, 2:43 p.m. UTC | #6
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 28, 2020 9:46 PM
> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> On 4/28/20 3:01 PM, Liu, Yong wrote:
> >>> Maxime,
> >>> Thanks for point it out, it will add extra cache miss in datapath.
> >>> And its impact on performance is around 1% in loopback case.
> >> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
> >> on my side because when doing IO loopback, the cache pressure is
> >> much less important.
> >>
> >>> While benefit of vectorized path will be more than that number.
> >> Ok, but I disagree for two reasons:
> >>  1. You have to keep in mind than non-vectorized is the default and
> >> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
> >> checking header length (so no error stats), etc...
> >>
> > Ok, I will keep non-vectorized same as before.
> >
> >>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A because
> >> the gain is 20% on $CPU_VENDOR_B.
> >>
> >> In the case we see more degradation in real-world scenario, you might
> >> want to consider using ifdefs to avoid adding padding in the non-
> >> vectorized case, like you did to differentiate Virtio PMD to Virtio-user
> >> PMD in patch 7.
> >>
> > Maxime,
> > The performance difference is so slight, so I ignored for it look like a
> sampling error.
> 
> Agree for IO loopback, but it adds one more cache line access per burst,
> which might be see in some real-life use cases.
> 
> > It maybe not suitable to add new configuration for such setting which
> only used inside driver.
> 
> Wait, the Virtio-user #ifdef is based on the defconfig options? How can
> it work since both Virtio PMD and Virtio-user PMD can be selected at the
> same time?
> 
> I thought it was a define set before the headers inclusion and unset
> afterwards, but I didn't checked carefully.
> 

Maxime,
The difference in how the virtio PMD and the virtio-user PMD obtain buffer addresses is handled by vq->offset. 

When virtio PMD is running, offset will be set to buf_iova.
vq->offset = offsetof(struct rte_mbuf, buf_iova);

When virtio_user PMD is running, offset will be set to buf_addr.
vq->offset = offsetof(struct rte_mbuf, buf_addr);
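
So a single build can serve both PMDs: the datapath reads the buffer address
through that per-virtqueue offset at runtime. A minimal sketch of the
mechanism (illustrative only, 64-bit build assumed, not the exact driver code):

static inline uint64_t
virtio_mbuf_addr_sketch(struct rte_mbuf *mb, struct virtqueue *vq)
{
	/* vq->offset selects buf_iova (virtio PMD) or buf_addr (virtio-user PMD) */
	return *(uint64_t *)((uintptr_t)mb + vq->offset);
}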

> > Virtio driver can check whether virtqueue is using vectorized path when
> initialization, will use padded structure if it is.
> > I have added some tested code and now performance came back.  Since
> code has changed in initialization process,  it need some time for regression
> check.
> 
> Ok, works for me.
> 
> I am investigating a linkage issue with your series, which does not
> happen systematically (see below, it happens also with clang). David
> pointed me to some Intel patches removing the usage if __rte_weak,
> could it be related?
> 

I checked David's patch; it only changed the i40e driver. Meanwhile, the __rte_weak attribute is still used in virtio_rxtx.c. 
I will follow David's approach and eliminate the usage of the weak attribute. 
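
The direction would be something like the rough sketch below (illustrative
only, not the final patch): drop the __rte_weak stubs and only reference the
vectorized handlers in set_rxtx_funcs() when they are actually compiled in.

#ifdef CC_AVX512_SUPPORT
	/* inside set_rxtx_funcs(), following its existing selection logic */
	if (hw->use_vec_rx)
		eth_dev->rx_pkt_burst = virtio_recv_pkts_packed_vec;
	if (hw->use_vec_tx)
		eth_dev->tx_pkt_burst = virtio_xmit_pkts_packed_vec;
#endif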

> 
> gcc  -o app/test/dpdk-test
> [...]
> drivers/librte_pmd_virtio.a(net_virtio_virtio_ethdev.c.o): In function
> `set_rxtx_funcs':
> virtio_ethdev.c:(.text.unlikely+0x6f): undefined reference to
> `virtio_xmit_pkts_packed_vec'
> collect2: error: ld returned 1 exit status
> ninja: build stopped: subcommand failed.
> 
> > Regards,
> > Marvin
> >
  
Maxime Coquelin April 28, 2020, 2:50 p.m. UTC | #7
On 4/28/20 4:43 PM, Liu, Yong wrote:
> 
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Tuesday, April 28, 2020 9:46 PM
>> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
>> Wang, Zhihong <zhihong.wang@intel.com>
>> Cc: dev@dpdk.org; Honnappa Nagarahalli
>> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
>> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
>>
>>
>>
>> On 4/28/20 3:01 PM, Liu, Yong wrote:
>>>>> Maxime,
>>>>> Thanks for point it out, it will add extra cache miss in datapath.
>>>>> And its impact on performance is around 1% in loopback case.
>>>> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
>>>> on my side because when doing IO loopback, the cache pressure is
>>>> much less important.
>>>>
>>>>> While benefit of vectorized path will be more than that number.
>>>> Ok, but I disagree for two reasons:
>>>>  1. You have to keep in mind than non-vectorized is the default and
>>>> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
>>>> checking header length (so no error stats), etc...
>>>>
>>> Ok, I will keep non-vectorized same as before.
>>>
>>>>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A because
>>>> the gain is 20% on $CPU_VENDOR_B.
>>>>
>>>> In the case we see more degradation in real-world scenario, you might
>>>> want to consider using ifdefs to avoid adding padding in the non-
>>>> vectorized case, like you did to differentiate Virtio PMD to Virtio-user
>>>> PMD in patch 7.
>>>>
>>> Maxime,
>>> The performance difference is so slight, so I ignored for it look like a
>> sampling error.
>>
>> Agree for IO loopback, but it adds one more cache line access per burst,
>> which might be see in some real-life use cases.
>>
>>> It maybe not suitable to add new configuration for such setting which
>> only used inside driver.
>>
>> Wait, the Virtio-user #ifdef is based on the defconfig options? How can
>> it work since both Virtio PMD and Virtio-user PMD can be selected at the
>> same time?
>>
>> I thought it was a define set before the headers inclusion and unset
>> afterwards, but I didn't checked carefully.
>>
> 
> Maxime,
> The difference between virtio PMD and Virtio-user PMD addresses is handled by vq->offset. 
> 
> When virtio PMD is running, offset will be set to buf_iova.
> vq->offset = offsetof(struct rte_mbuf, buf_iova);
> 
> When virtio_user PMD is running, offset will be set to buf_addr.
> vq->offset = offsetof(struct rte_mbuf, buf_addr);

Ok, but below is a build-time check:

+#ifdef RTE_VIRTIO_USER
+	__m128i flag_offset = _mm_set_epi64x(flags_temp, (uint64_t)vq->offset);
+#else
+	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
+#endif

So how can it work for a single build for both Virtio and Virtio-user?

>>> Virtio driver can check whether virtqueue is using vectorized path when
>> initialization, will use padded structure if it is.
>>> I have added some tested code and now performance came back.  Since
>> code has changed in initialization process,  it need some time for regression
>> check.
>>
>> Ok, works for me.
>>
>> I am investigating a linkage issue with your series, which does not
>> happen systematically (see below, it happens also with clang). David
>> pointed me to some Intel patches removing the usage if __rte_weak,
>> could it be related?
>>
> 
> I checked David's patch, it only changed i40e driver. Meanwhile attribute __rte_weak should still be in virtio_rxtx.c. 
> I will follow David's patch, eliminate the usage of weak attribute. 

Yeah, I meant the issue below could be linked to __rte_weak, not that the i40e
patch was the cause of this problem.
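
For readers following the linkage question, below is a minimal sketch of the
weak-symbol arrangement the series relies on (condensed from the patch at the
bottom of this page; only the Rx stub is shown and the strong body is elided).
When virtio_rxtx_packed_avx.c is not compiled, the weak stub satisfies the
reference from virtio_ethdev.c; when it is compiled, its strong definition
replaces the stub at link time.

#include <rte_common.h>	/* __rte_weak, __rte_unused */
#include <rte_mbuf.h>

/* virtio_rxtx.c -- always built: weak fallback that hands back no packets. */
__rte_weak uint16_t
virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
			    struct rte_mbuf **rx_pkts __rte_unused,
			    uint16_t nb_pkts __rte_unused)
{
	return 0;
}

/*
 * virtio_rxtx_packed_avx.c -- built only when CC_AVX512_SUPPORT is set:
 * its non-weak definition of virtio_recv_pkts_packed_vec() (the AVX512
 * batch/single dequeue shown in the patch below) overrides the stub above
 * at link time.
 */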
  
Marvin Liu April 28, 2020, 3:35 p.m. UTC | #8
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 28, 2020 10:50 PM
> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> On 4/28/20 4:43 PM, Liu, Yong wrote:
> >
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Tuesday, April 28, 2020 9:46 PM
> >> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong
> <xiaolong.ye@intel.com>;
> >> Wang, Zhihong <zhihong.wang@intel.com>
> >> Cc: dev@dpdk.org; Honnappa Nagarahalli
> >> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
> >> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
> path
> >>
> >>
> >>
> >> On 4/28/20 3:01 PM, Liu, Yong wrote:
> >>>>> Maxime,
> >>>>> Thanks for point it out, it will add extra cache miss in datapath.
> >>>>> And its impact on performance is around 1% in loopback case.
> >>>> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
> >>>> on my side because when doing IO loopback, the cache pressure is
> >>>> much less important.
> >>>>
> >>>>> While benefit of vectorized path will be more than that number.
> >>>> Ok, but I disagree for two reasons:
> >>>>  1. You have to keep in mind than non-vectorized is the default and
> >>>> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
> >>>> checking header length (so no error stats), etc...
> >>>>
> >>> Ok, I will keep non-vectorized same as before.
> >>>
> >>>>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A
> because
> >>>> the gain is 20% on $CPU_VENDOR_B.
> >>>>
> >>>> In the case we see more degradation in real-world scenario, you might
> >>>> want to consider using ifdefs to avoid adding padding in the non-
> >>>> vectorized case, like you did to differentiate Virtio PMD to Virtio-user
> >>>> PMD in patch 7.
> >>>>
> >>> Maxime,
> >>> The performance difference is so slight, so I ignored for it look like a
> >> sampling error.
> >>
> >> Agree for IO loopback, but it adds one more cache line access per burst,
> >> which might be see in some real-life use cases.
> >>
> >>> It maybe not suitable to add new configuration for such setting which
> >> only used inside driver.
> >>
> >> Wait, the Virtio-user #ifdef is based on the defconfig options? How can
> >> it work since both Virtio PMD and Virtio-user PMD can be selected at the
> >> same time?
> >>
> >> I thought it was a define set before the headers inclusion and unset
> >> afterwards, but I didn't checked carefully.
> >>
> >
> > Maxime,
> > The difference between virtio PMD and Virtio-user PMD addresses is
> handled by vq->offset.
> >
> > When virtio PMD is running, offset will be set to buf_iova.
> > vq->offset = offsetof(struct rte_mbuf, buf_iova);
> >
> > When virtio_user PMD is running, offset will be set to buf_addr.
> > vq->offset = offsetof(struct rte_mbuf, buf_addr);
> 
> Ok, but below is a build time check:
> 
> +#ifdef RTE_VIRTIO_USER
> +	__m128i flag_offset = _mm_set_epi64x(flags_temp, (uint64_t)vq-
> >offset);
> +#else
> +	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
> +#endif
> 
> So how can it work for a single build for both Virtio and Virtio-user?
> 

Sorry, this is an implementation error. vq->offset should be used in descs_base to get the iova address.
It will then work the same way as the VIRTIO_MBUF_ADDR macro.
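
A minimal sketch of that runtime lookup (illustrative only; the helper name
virtio_mbuf_addr_sketch is not from the series, and the driver does this
through the VIRTIO_MBUF_ADDR macro in virtqueue.h): the queue remembers which
rte_mbuf field to read, so a single binary serves both the PCI virtio PMD and
the virtio-user PMD without any compile-time #ifdef.

#include <stdint.h>
#include <rte_mbuf.h>
#include "virtqueue.h"	/* struct virtqueue, vq->offset */

static inline uint64_t
virtio_mbuf_addr_sketch(struct rte_mbuf *mb, struct virtqueue *vq)
{
	/* vq->offset is offsetof(struct rte_mbuf, buf_iova) for the PCI
	 * virtio PMD and offsetof(struct rte_mbuf, buf_addr) for the
	 * virtio-user PMD, as set at queue setup time.
	 */
	return (uint64_t)(*(uintptr_t *)((uintptr_t)mb + vq->offset));
}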

> >>> Virtio driver can check whether virtqueue is using vectorized path when
> >> initialization, will use padded structure if it is.
> >>> I have added some tested code and now performance came back.  Since
> >> code has changed in initialization process,  it need some time for
> regression
> >> check.
> >>
> >> Ok, works for me.
> >>
> >> I am investigating a linkage issue with your series, which does not
> >> happen systematically (see below, it happens also with clang). David
> >> pointed me to some Intel patches removing the usage if __rte_weak,
> >> could it be related?
> >>
> >
> > I checked David's patch, it only changed i40e driver. Meanwhile attribute
> __rte_weak should still be in virtio_rxtx.c.
> > I will follow David's patch, eliminate the usage of weak attribute.
> 
> Yeah, I meant below issue could be linked to __rte_weak, not that i40e
> patch was the cause of this problem.
> 

Maxime,
I haven't seen any build issue related to __rte_weak with either gcc or clang.

Thanks,
Marvin
  
Maxime Coquelin April 28, 2020, 3:40 p.m. UTC | #9
On 4/28/20 5:35 PM, Liu, Yong wrote:
> 
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Tuesday, April 28, 2020 10:50 PM
>> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
>> Wang, Zhihong <zhihong.wang@intel.com>
>> Cc: dev@dpdk.org; Honnappa Nagarahalli
>> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
>> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
>>
>>
>>
>> On 4/28/20 4:43 PM, Liu, Yong wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Tuesday, April 28, 2020 9:46 PM
>>>> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong
>> <xiaolong.ye@intel.com>;
>>>> Wang, Zhihong <zhihong.wang@intel.com>
>>>> Cc: dev@dpdk.org; Honnappa Nagarahalli
>>>> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
>>>> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
>> path
>>>>
>>>>
>>>>
>>>> On 4/28/20 3:01 PM, Liu, Yong wrote:
>>>>>>> Maxime,
>>>>>>> Thanks for point it out, it will add extra cache miss in datapath.
>>>>>>> And its impact on performance is around 1% in loopback case.
>>>>>> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
>>>>>> on my side because when doing IO loopback, the cache pressure is
>>>>>> much less important.
>>>>>>
>>>>>>> While benefit of vectorized path will be more than that number.
>>>>>> Ok, but I disagree for two reasons:
>>>>>>  1. You have to keep in mind than non-vectorized is the default and
>>>>>> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
>>>>>> checking header length (so no error stats), etc...
>>>>>>
>>>>> Ok, I will keep non-vectorized same as before.
>>>>>
>>>>>>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A
>> because
>>>>>> the gain is 20% on $CPU_VENDOR_B.
>>>>>>
>>>>>> In the case we see more degradation in real-world scenario, you might
>>>>>> want to consider using ifdefs to avoid adding padding in the non-
>>>>>> vectorized case, like you did to differentiate Virtio PMD to Virtio-user
>>>>>> PMD in patch 7.
>>>>>>
>>>>> Maxime,
>>>>> The performance difference is so slight, so I ignored for it look like a
>>>> sampling error.
>>>>
>>>> Agree for IO loopback, but it adds one more cache line access per burst,
>>>> which might be see in some real-life use cases.
>>>>
>>>>> It maybe not suitable to add new configuration for such setting which
>>>> only used inside driver.
>>>>
>>>> Wait, the Virtio-user #ifdef is based on the defconfig options? How can
>>>> it work since both Virtio PMD and Virtio-user PMD can be selected at the
>>>> same time?
>>>>
>>>> I thought it was a define set before the headers inclusion and unset
>>>> afterwards, but I didn't checked carefully.
>>>>
>>>
>>> Maxime,
>>> The difference between virtio PMD and Virtio-user PMD addresses is
>> handled by vq->offset.
>>>
>>> When virtio PMD is running, offset will be set to buf_iova.
>>> vq->offset = offsetof(struct rte_mbuf, buf_iova);
>>>
>>> When virtio_user PMD is running, offset will be set to buf_addr.
>>> vq->offset = offsetof(struct rte_mbuf, buf_addr);
>>
>> Ok, but below is a build time check:
>>
>> +#ifdef RTE_VIRTIO_USER
>> +	__m128i flag_offset = _mm_set_epi64x(flags_temp, (uint64_t)vq-
>>> offset);
>> +#else
>> +	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
>> +#endif
>>
>> So how can it work for a single build for both Virtio and Virtio-user?
>>
> 
> Sorry, here is an implementation error. vq->offset should be used in descs_base for getting the iova address. 
> It will work the same as VIRTIO_MBUF_ADDR macro.
> 
>>>>> Virtio driver can check whether virtqueue is using vectorized path when
>>>> initialization, will use padded structure if it is.
>>>>> I have added some tested code and now performance came back.  Since
>>>> code has changed in initialization process,  it need some time for
>> regression
>>>> check.
>>>>
>>>> Ok, works for me.
>>>>
>>>> I am investigating a linkage issue with your series, which does not
>>>> happen systematically (see below, it happens also with clang). David
>>>> pointed me to some Intel patches removing the usage if __rte_weak,
>>>> could it be related?
>>>>
>>>
>>> I checked David's patch, it only changed i40e driver. Meanwhile attribute
>> __rte_weak should still be in virtio_rxtx.c.
>>> I will follow David's patch, eliminate the usage of weak attribute.
>>
>> Yeah, I meant below issue could be linked to __rte_weak, not that i40e
>> patch was the cause of this problem.
>>
> 
> Maxime,
> I haven't seen any build issue related to __rte_weak both with gcc and clang.   

Note that this build (which does not fail systematically) is done with
binutils 2.30, which causes AVX512 support to be disabled.

> Thanks,
> Marvin
>
  
Marvin Liu April 28, 2020, 3:55 p.m. UTC | #10
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, April 28, 2020 11:40 PM
> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> On 4/28/20 5:35 PM, Liu, Yong wrote:
> >
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Tuesday, April 28, 2020 10:50 PM
> >> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong
> <xiaolong.ye@intel.com>;
> >> Wang, Zhihong <zhihong.wang@intel.com>
> >> Cc: dev@dpdk.org; Honnappa Nagarahalli
> >> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
> >> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
> path
> >>
> >>
> >>
> >> On 4/28/20 4:43 PM, Liu, Yong wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>> Sent: Tuesday, April 28, 2020 9:46 PM
> >>>> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong
> >> <xiaolong.ye@intel.com>;
> >>>> Wang, Zhihong <zhihong.wang@intel.com>
> >>>> Cc: dev@dpdk.org; Honnappa Nagarahalli
> >>>> <Honnappa.Nagarahalli@arm.com>; jerinj@marvell.com
> >>>> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
> >> path
> >>>>
> >>>>
> >>>>
> >>>> On 4/28/20 3:01 PM, Liu, Yong wrote:
> >>>>>>> Maxime,
> >>>>>>> Thanks for point it out, it will add extra cache miss in datapath.
> >>>>>>> And its impact on performance is around 1% in loopback case.
> >>>>>> Ok, thanks for doing the test. I'll try to run some PVP benchmarks
> >>>>>> on my side because when doing IO loopback, the cache pressure is
> >>>>>> much less important.
> >>>>>>
> >>>>>>> While benefit of vectorized path will be more than that number.
> >>>>>> Ok, but I disagree for two reasons:
> >>>>>>  1. You have to keep in mind than non-vectorized is the default and
> >>>>>> encouraged mode to use. Indeed, it takes a lot of shortcuts like not
> >>>>>> checking header length (so no error stats), etc...
> >>>>>>
> >>>>> Ok, I will keep non-vectorized same as before.
> >>>>>
> >>>>>>  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A
> >> because
> >>>>>> the gain is 20% on $CPU_VENDOR_B.
> >>>>>>
> >>>>>> In the case we see more degradation in real-world scenario, you
> might
> >>>>>> want to consider using ifdefs to avoid adding padding in the non-
> >>>>>> vectorized case, like you did to differentiate Virtio PMD to Virtio-
> user
> >>>>>> PMD in patch 7.
> >>>>>>
> >>>>> Maxime,
> >>>>> The performance difference is so slight, so I ignored for it look like a
> >>>> sampling error.
> >>>>
> >>>> Agree for IO loopback, but it adds one more cache line access per
> burst,
> >>>> which might be see in some real-life use cases.
> >>>>
> >>>>> It maybe not suitable to add new configuration for such setting
> which
> >>>> only used inside driver.
> >>>>
> >>>> Wait, the Virtio-user #ifdef is based on the defconfig options? How
> can
> >>>> it work since both Virtio PMD and Virtio-user PMD can be selected at
> the
> >>>> same time?
> >>>>
> >>>> I thought it was a define set before the headers inclusion and unset
> >>>> afterwards, but I didn't checked carefully.
> >>>>
> >>>
> >>> Maxime,
> >>> The difference between virtio PMD and Virtio-user PMD addresses is
> >> handled by vq->offset.
> >>>
> >>> When virtio PMD is running, offset will be set to buf_iova.
> >>> vq->offset = offsetof(struct rte_mbuf, buf_iova);
> >>>
> >>> When virtio_user PMD is running, offset will be set to buf_addr.
> >>> vq->offset = offsetof(struct rte_mbuf, buf_addr);
> >>
> >> Ok, but below is a build time check:
> >>
> >> +#ifdef RTE_VIRTIO_USER
> >> +	__m128i flag_offset = _mm_set_epi64x(flags_temp, (uint64_t)vq-
> >>> offset);
> >> +#else
> >> +	__m128i flag_offset = _mm_set_epi64x(flags_temp, 0);
> >> +#endif
> >>
> >> So how can it work for a single build for both Virtio and Virtio-user?
> >>
> >
> > Sorry, here is an implementation error. vq->offset should be used in
> descs_base for getting the iova address.
> > It will work the same as VIRTIO_MBUF_ADDR macro.
> >
> >>>>> Virtio driver can check whether virtqueue is using vectorized path
> when
> >>>> initialization, will use padded structure if it is.
> >>>>> I have added some tested code and now performance came back.
> Since
> >>>> code has changed in initialization process,  it need some time for
> >> regression
> >>>> check.
> >>>>
> >>>> Ok, works for me.
> >>>>
> >>>> I am investigating a linkage issue with your series, which does not
> >>>> happen systematically (see below, it happens also with clang). David
> >>>> pointed me to some Intel patches removing the usage if __rte_weak,
> >>>> could it be related?
> >>>>
> >>>
> >>> I checked David's patch, it only changed i40e driver. Meanwhile
> attribute
> >> __rte_weak should still be in virtio_rxtx.c.
> >>> I will follow David's patch, eliminate the usage of weak attribute.
> >>
> >> Yeah, I meant below issue could be linked to __rte_weak, not that i40e
> >> patch was the cause of this problem.
> >>
> >
> > Maxime,
> > I haven't seen any build issue related to __rte_weak both with gcc and
> clang.
> 
> Note that this build (which does not fail systematically) is when using
> binutils 2.30, which cause AVX512 support to be disabled.
> 

After switching to binutils 2.30, the AVX512 code is skipped as expected in the meson build.
Could you please supply more information? I will try to reproduce it.

> > Thanks,
> > Marvin
> >
  
Marvin Liu April 28, 2020, 5:01 p.m. UTC | #11
> -----Original Message-----
> From: Liu, Yong
> Sent: Tuesday, April 28, 2020 9:01 PM
> To: 'Maxime Coquelin' <maxime.coquelin@redhat.com>; Ye, Xiaolong
> <xiaolong.ye@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx path
> 
> 
> 
> > -----Original Message-----
> > From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > Sent: Tuesday, April 28, 2020 4:44 PM
> > To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>;
> > Wang, Zhihong <zhihong.wang@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
> path
> >
> >
> >
> > On 4/28/20 3:14 AM, Liu, Yong wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > >> Sent: Monday, April 27, 2020 7:21 PM
> > >> To: Liu, Yong <yong.liu@intel.com>; Ye, Xiaolong
> > <xiaolong.ye@intel.com>;
> > >> Wang, Zhihong <zhihong.wang@intel.com>
> > >> Cc: dev@dpdk.org
> > >> Subject: Re: [PATCH v10 6/9] net/virtio: add vectorized packed ring Rx
> > path
> > >>
> > >>
> > >>
> > >> On 4/26/20 4:19 AM, Marvin Liu wrote:
> > >>> Optimize packed ring Rx path with SIMD instructions. Solution of
> > >>> optimization is pretty like vhost, is that split path into batch and
> > >>> single functions. Batch function is further optimized by AVX512
> > >>> instructions. Also pad desc extra structure to 16 bytes aligned, thus
> > >>> four elements will be saved in one batch.
> > >>>
> > >>> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> > >>>
> > >>> diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> > >>> index c9edb84ee..102b1deab 100644
> > >>> --- a/drivers/net/virtio/Makefile
> > >>> +++ b/drivers/net/virtio/Makefile
> > >>> @@ -36,6 +36,41 @@ else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM)
> > >> $(CONFIG_RTE_ARCH_ARM64)),)
> > >>>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) +=
> > virtio_rxtx_simple_neon.c
> > >>>  endif
> > >>>
> > >>> +ifneq ($(FORCE_DISABLE_AVX512), y)
> > >>> +	CC_AVX512_SUPPORT=\
> > >>> +	$(shell $(CC) -march=native -dM -E - </dev/null 2>&1 | \
> > >>> +	sed '/./{H;$$!d} ; x ; /AVX512F/!d; /AVX512BW/!d; /AVX512VL/!d' | \
> > >>> +	grep -q AVX512 && echo 1)
> > >>> +endif
> > >>> +
> > >>> +ifeq ($(CC_AVX512_SUPPORT), 1)
> > >>> +CFLAGS += -DCC_AVX512_SUPPORT
> > >>> +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) +=
> virtio_rxtx_packed_avx.c
> > >>> +
> > >>> +ifeq ($(RTE_TOOLCHAIN), gcc)
> > >>> +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> > >>> +CFLAGS += -DVIRTIO_GCC_UNROLL_PRAGMA
> > >>> +endif
> > >>> +endif
> > >>> +
> > >>> +ifeq ($(RTE_TOOLCHAIN), clang)
> > >>> +ifeq ($(shell test
> > $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -
> > >> ge 37 && echo 1), 1)
> > >>> +CFLAGS += -DVIRTIO_CLANG_UNROLL_PRAGMA
> > >>> +endif
> > >>> +endif
> > >>> +
> > >>> +ifeq ($(RTE_TOOLCHAIN), icc)
> > >>> +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
> > >>> +CFLAGS += -DVIRTIO_ICC_UNROLL_PRAGMA
> > >>> +endif
> > >>> +endif
> > >>> +
> > >>> +CFLAGS_virtio_rxtx_packed_avx.o += -mavx512f -mavx512bw -
> > mavx512vl
> > >>> +ifeq ($(shell test $(GCC_VERSION) -ge 100 && echo 1), 1)
> > >>> +CFLAGS_virtio_rxtx_packed_avx.o += -Wno-zero-length-bounds
> > >>> +endif
> > >>> +endif
> > >>> +
> > >>>  ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
> > >>>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) +=
> virtio_user/vhost_user.c
> > >>>  SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) +=
> > virtio_user/vhost_kernel.c
> > >>> diff --git a/drivers/net/virtio/meson.build
> > b/drivers/net/virtio/meson.build
> > >>> index 15150eea1..8e68c3039 100644
> > >>> --- a/drivers/net/virtio/meson.build
> > >>> +++ b/drivers/net/virtio/meson.build
> > >>> @@ -9,6 +9,20 @@ sources += files('virtio_ethdev.c',
> > >>>  deps += ['kvargs', 'bus_pci']
> > >>>
> > >>>  if arch_subdir == 'x86'
> > >>> +	if '-mno-avx512f' not in machine_args
> > >>> +		if cc.has_argument('-mavx512f') and cc.has_argument('-
> > >> mavx512vl') and cc.has_argument('-mavx512bw')
> > >>> +			cflags += ['-mavx512f', '-mavx512bw', '-mavx512vl']
> > >>> +			cflags += ['-DCC_AVX512_SUPPORT']
> > >>> +			if (toolchain == 'gcc' and
> > >> cc.version().version_compare('>=8.3.0'))
> > >>> +				cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
> > >>> +			elif (toolchain == 'clang' and
> > >> cc.version().version_compare('>=3.7.0'))
> > >>> +				cflags += '-
> > >> DVHOST_CLANG_UNROLL_PRAGMA'
> > >>> +			elif (toolchain == 'icc' and
> > >> cc.version().version_compare('>=16.0.0'))
> > >>> +				cflags += '-DVHOST_ICC_UNROLL_PRAGMA'
> > >>> +			endif
> > >>> +			sources += files('virtio_rxtx_packed_avx.c')
> > >>> +		endif
> > >>> +	endif
> > >>>  	sources += files('virtio_rxtx_simple_sse.c')
> > >>>  elif arch_subdir == 'ppc'
> > >>>  	sources += files('virtio_rxtx_simple_altivec.c')
> > >>> diff --git a/drivers/net/virtio/virtio_ethdev.h
> > >> b/drivers/net/virtio/virtio_ethdev.h
> > >>> index febaf17a8..5c112cac7 100644
> > >>> --- a/drivers/net/virtio/virtio_ethdev.h
> > >>> +++ b/drivers/net/virtio/virtio_ethdev.h
> > >>> @@ -105,6 +105,9 @@ uint16_t virtio_xmit_pkts_inorder(void
> > *tx_queue,
> > >> struct rte_mbuf **tx_pkts,
> > >>>  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf
> > **rx_pkts,
> > >>>  		uint16_t nb_pkts);
> > >>>
> > >>> +uint16_t virtio_recv_pkts_packed_vec(void *rx_queue, struct
> rte_mbuf
> > >> **rx_pkts,
> > >>> +		uint16_t nb_pkts);
> > >>> +
> > >>>  int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
> > >>>
> > >>>  void virtio_interrupt_handler(void *param);
> > >>> diff --git a/drivers/net/virtio/virtio_rxtx.c
> > b/drivers/net/virtio/virtio_rxtx.c
> > >>> index a549991aa..534562cca 100644
> > >>> --- a/drivers/net/virtio/virtio_rxtx.c
> > >>> +++ b/drivers/net/virtio/virtio_rxtx.c
> > >>> @@ -2030,3 +2030,11 @@ virtio_xmit_pkts_inorder(void *tx_queue,
> > >>>
> > >>>  	return nb_tx;
> > >>>  }
> > >>> +
> > >>> +__rte_weak uint16_t
> > >>> +virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
> > >>> +			    struct rte_mbuf **rx_pkts __rte_unused,
> > >>> +			    uint16_t nb_pkts __rte_unused)
> > >>> +{
> > >>> +	return 0;
> > >>> +}
> > >>> diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c
> > >> b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> > >>> new file mode 100644
> > >>> index 000000000..8a7b459eb
> > >>> --- /dev/null
> > >>> +++ b/drivers/net/virtio/virtio_rxtx_packed_avx.c
> > >>> @@ -0,0 +1,374 @@
> > >>> +/* SPDX-License-Identifier: BSD-3-Clause
> > >>> + * Copyright(c) 2010-2020 Intel Corporation
> > >>> + */
> > >>> +
> > >>> +#include <stdint.h>
> > >>> +#include <stdio.h>
> > >>> +#include <stdlib.h>
> > >>> +#include <string.h>
> > >>> +#include <errno.h>
> > >>> +
> > >>> +#include <rte_net.h>
> > >>> +
> > >>> +#include "virtio_logs.h"
> > >>> +#include "virtio_ethdev.h"
> > >>> +#include "virtio_pci.h"
> > >>> +#include "virtqueue.h"
> > >>> +
> > >>> +#define BYTE_SIZE 8
> > >>> +/* flag bits offset in packed ring desc higher 64bits */
> > >>> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc,
> flags)
> > - \
> > >>> +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> > >>> +
> > >>> +#define PACKED_FLAGS_MASK ((0ULL |
> > >> VRING_PACKED_DESC_F_AVAIL_USED) << \
> > >>> +	FLAGS_BITS_OFFSET)
> > >>> +
> > >>> +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
> > >>> +	sizeof(struct vring_packed_desc))
> > >>> +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> > >>> +
> > >>> +#ifdef VIRTIO_GCC_UNROLL_PRAGMA
> > >>> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC
> unroll
> > 4")
> > >> \
> > >>> +	for (iter = val; iter < size; iter++)
> > >>> +#endif
> > >>> +
> > >>> +#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
> > >>> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4")
> \
> > >>> +	for (iter = val; iter < size; iter++)
> > >>> +#endif
> > >>> +
> > >>> +#ifdef VIRTIO_ICC_UNROLL_PRAGMA
> > >>> +#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll
> (4)")
> > \
> > >>> +	for (iter = val; iter < size; iter++)
> > >>> +#endif
> > >>> +
> > >>> +#ifndef virtio_for_each_try_unroll
> > >>> +#define virtio_for_each_try_unroll(iter, val, num) \
> > >>> +	for (iter = val; iter < num; iter++)
> > >>> +#endif
> > >>> +
> > >>> +static inline void
> > >>> +virtio_update_batch_stats(struct virtnet_stats *stats,
> > >>> +			  uint16_t pkt_len1,
> > >>> +			  uint16_t pkt_len2,
> > >>> +			  uint16_t pkt_len3,
> > >>> +			  uint16_t pkt_len4)
> > >>> +{
> > >>> +	stats->bytes += pkt_len1;
> > >>> +	stats->bytes += pkt_len2;
> > >>> +	stats->bytes += pkt_len3;
> > >>> +	stats->bytes += pkt_len4;
> > >>> +}
> > >>> +
> > >>> +/* Optionally fill offload information in structure */
> > >>> +static inline int
> > >>> +virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
> > >>> +{
> > >>> +	struct rte_net_hdr_lens hdr_lens;
> > >>> +	uint32_t hdrlen, ptype;
> > >>> +	int l4_supported = 0;
> > >>> +
> > >>> +	/* nothing to do */
> > >>> +	if (hdr->flags == 0)
> > >>> +		return 0;
> > >>> +
> > >>> +	/* GSO not support in vec path, skip check */
> > >>> +	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
> > >>> +
> > >>> +	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
> > >>> +	m->packet_type = ptype;
> > >>> +	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
> > >>> +	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
> > >>> +	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
> > >>> +		l4_supported = 1;
> > >>> +
> > >>> +	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> > >>> +		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
> > >>> +		if (hdr->csum_start <= hdrlen && l4_supported) {
> > >>> +			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
> > >>> +		} else {
> > >>> +			/* Unknown proto or tunnel, do sw cksum. We can
> > >> assume
> > >>> +			 * the cksum field is in the first segment since the
> > >>> +			 * buffers we provided to the host are large enough.
> > >>> +			 * In case of SCTP, this will be wrong since it's a CRC
> > >>> +			 * but there's nothing we can do.
> > >>> +			 */
> > >>> +			uint16_t csum = 0, off;
> > >>> +
> > >>> +			rte_raw_cksum_mbuf(m, hdr->csum_start,
> > >>> +				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
> > >>> +				&csum);
> > >>> +			if (likely(csum != 0xffff))
> > >>> +				csum = ~csum;
> > >>> +			off = hdr->csum_offset + hdr->csum_start;
> > >>> +			if (rte_pktmbuf_data_len(m) >= off + 1)
> > >>> +				*rte_pktmbuf_mtod_offset(m, uint16_t *,
> > >>> +					off) = csum;
> > >>> +		}
> > >>> +	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID &&
> > >> l4_supported) {
> > >>> +		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
> > >>> +	}
> > >>> +
> > >>> +	return 0;
> > >>> +}
> > >>> +
> > >>> +static inline uint16_t
> > >>> +virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
> > >>> +				   struct rte_mbuf **rx_pkts)
> > >>> +{
> > >>> +	struct virtqueue *vq = rxvq->vq;
> > >>> +	struct virtio_hw *hw = vq->hw;
> > >>> +	uint16_t hdr_size = hw->vtnet_hdr_size;
> > >>> +	uint64_t addrs[PACKED_BATCH_SIZE];
> > >>> +	uint16_t id = vq->vq_used_cons_idx;
> > >>> +	uint8_t desc_stats;
> > >>> +	uint16_t i;
> > >>> +	void *desc_addr;
> > >>> +
> > >>> +	if (id & PACKED_BATCH_MASK)
> > >>> +		return -1;
> > >>> +
> > >>> +	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
> > >>> +		return -1;
> > >>> +
> > >>> +	/* only care avail/used bits */
> > >>> +	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa,
> > >> PACKED_FLAGS_MASK);
> > >>> +	desc_addr = &vq->vq_packed.ring.desc[id];
> > >>> +
> > >>> +	__m512i v_desc = _mm512_loadu_si512(desc_addr);
> > >>> +	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
> > >>> +
> > >>> +	__m512i v_used_flag = _mm512_setzero_si512();
> > >>> +	if (vq->vq_packed.used_wrap_counter)
> > >>> +		v_used_flag = _mm512_maskz_set1_epi64(0xaa,
> > >> PACKED_FLAGS_MASK);
> > >>> +
> > >>> +	/* Check all descs are used */
> > >>> +	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
> > >>> +	if (desc_stats)
> > >>> +		return -1;
> > >>> +
> > >>> +	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > >>> +		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
> > >>> +		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
> > >>> +
> > >>> +		addrs[i] = (uint64_t)rx_pkts[i]->rx_descriptor_fields1;
> > >>> +	}
> > >>> +
> > >>> +	/*
> > >>> +	 * load len from desc, store into mbuf pkt_len and data_len
> > >>> +	 * len limiated by l6bit buf_len, pkt_len[16:31] can be ignored
> > >>> +	 */
> > >>> +	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
> > >>> +	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc,
> > >> 0xAA);
> > >>> +
> > >>> +	/* reduce hdr_len from pkt_len and data_len */
> > >>> +	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
> > >>> +			(uint32_t)-hdr_size);
> > >>> +
> > >>> +	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
> > >>> +
> > >>> +	/* assert offset of data_len */
> > >>> +	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
> > >>> +		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
> > >>> +
> > >>> +	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
> > >>> +					   addrs[2] + 8, addrs[2],
> > >>> +					   addrs[1] + 8, addrs[1],
> > >>> +					   addrs[0] + 8, addrs[0]);
> > >>> +	/* batch store into mbufs */
> > >>> +	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
> > >>> +
> > >>> +	if (hw->has_rx_offload) {
> > >>> +		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > >>> +			char *addr = (char *)rx_pkts[i]->buf_addr +
> > >>> +				RTE_PKTMBUF_HEADROOM - hdr_size;
> > >>> +			virtio_vec_rx_offload(rx_pkts[i],
> > >>> +					(struct virtio_net_hdr *)addr);
> > >>> +		}
> > >>> +	}
> > >>> +
> > >>> +	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
> > >>> +			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
> > >>> +			rx_pkts[3]->pkt_len);
> > >>> +
> > >>> +	vq->vq_free_cnt += PACKED_BATCH_SIZE;
> > >>> +
> > >>> +	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
> > >>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
> > >>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
> > >>> +		vq->vq_packed.used_wrap_counter ^= 1;
> > >>> +	}
> > >>> +
> > >>> +	return 0;
> > >>> +}
> > >>> +
> > >>> +static uint16_t
> > >>> +virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
> > >>> +				    struct rte_mbuf **rx_pkts)
> > >>> +{
> > >>> +	uint16_t used_idx, id;
> > >>> +	uint32_t len;
> > >>> +	struct virtqueue *vq = rxvq->vq;
> > >>> +	struct virtio_hw *hw = vq->hw;
> > >>> +	uint32_t hdr_size = hw->vtnet_hdr_size;
> > >>> +	struct virtio_net_hdr *hdr;
> > >>> +	struct vring_packed_desc *desc;
> > >>> +	struct rte_mbuf *cookie;
> > >>> +
> > >>> +	desc = vq->vq_packed.ring.desc;
> > >>> +	used_idx = vq->vq_used_cons_idx;
> > >>> +	if (!desc_is_used(&desc[used_idx], vq))
> > >>> +		return -1;
> > >>> +
> > >>> +	len = desc[used_idx].len;
> > >>> +	id = desc[used_idx].id;
> > >>> +	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
> > >>> +	if (unlikely(cookie == NULL)) {
> > >>> +		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie
> > >> at %u",
> > >>> +				vq->vq_used_cons_idx);
> > >>> +		return -1;
> > >>> +	}
> > >>> +	rte_prefetch0(cookie);
> > >>> +	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
> > >>> +
> > >>> +	cookie->data_off = RTE_PKTMBUF_HEADROOM;
> > >>> +	cookie->ol_flags = 0;
> > >>> +	cookie->pkt_len = (uint32_t)(len - hdr_size);
> > >>> +	cookie->data_len = (uint32_t)(len - hdr_size);
> > >>> +
> > >>> +	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
> > >>> +					RTE_PKTMBUF_HEADROOM -
> > >> hdr_size);
> > >>> +	if (hw->has_rx_offload)
> > >>> +		virtio_vec_rx_offload(cookie, hdr);
> > >>> +
> > >>> +	*rx_pkts = cookie;
> > >>> +
> > >>> +	rxvq->stats.bytes += cookie->pkt_len;
> > >>> +
> > >>> +	vq->vq_free_cnt++;
> > >>> +	vq->vq_used_cons_idx++;
> > >>> +	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
> > >>> +		vq->vq_used_cons_idx -= vq->vq_nentries;
> > >>> +		vq->vq_packed.used_wrap_counter ^= 1;
> > >>> +	}
> > >>> +
> > >>> +	return 0;
> > >>> +}
> > >>> +
> > >>> +static inline void
> > >>> +virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
> > >>> +			      struct rte_mbuf **cookie,
> > >>> +			      uint16_t num)
> > >>> +{
> > >>> +	struct virtqueue *vq = rxvq->vq;
> > >>> +	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
> > >>> +	uint16_t flags = vq->vq_packed.cached_flags;
> > >>> +	struct virtio_hw *hw = vq->hw;
> > >>> +	struct vq_desc_extra *dxp;
> > >>> +	uint16_t idx, i;
> > >>> +	uint16_t batch_num, total_num = 0;
> > >>> +	uint16_t head_idx = vq->vq_avail_idx;
> > >>> +	uint16_t head_flag = vq->vq_packed.cached_flags;
> > >>> +	uint64_t addr;
> > >>> +
> > >>> +	do {
> > >>> +		idx = vq->vq_avail_idx;
> > >>> +
> > >>> +		batch_num = PACKED_BATCH_SIZE;
> > >>> +		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
> > >>> +			batch_num = vq->vq_nentries - idx;
> > >>> +		if (unlikely((total_num + batch_num) > num))
> > >>> +			batch_num = num - total_num;
> > >>> +
> > >>> +		virtio_for_each_try_unroll(i, 0, batch_num) {
> > >>> +			dxp = &vq->vq_descx[idx + i];
> > >>> +			dxp->cookie = (void *)cookie[total_num + i];
> > >>> +
> > >>> +			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i],
> > >> vq) +
> > >>> +				RTE_PKTMBUF_HEADROOM - hw-
> > >>> vtnet_hdr_size;
> > >>> +			start_dp[idx + i].addr = addr;
> > >>> +			start_dp[idx + i].len = cookie[total_num + i]-
> > >buf_len
> > >>> +				- RTE_PKTMBUF_HEADROOM + hw-
> > >>> vtnet_hdr_size;
> > >>> +			if (total_num || i) {
> > >>> +				virtqueue_store_flags_packed(&start_dp[idx
> > >> + i],
> > >>> +						flags, hw->weak_barriers);
> > >>> +			}
> > >>> +		}
> > >>> +
> > >>> +		vq->vq_avail_idx += batch_num;
> > >>> +		if (vq->vq_avail_idx >= vq->vq_nentries) {
> > >>> +			vq->vq_avail_idx -= vq->vq_nentries;
> > >>> +			vq->vq_packed.cached_flags ^=
> > >>> +				VRING_PACKED_DESC_F_AVAIL_USED;
> > >>> +			flags = vq->vq_packed.cached_flags;
> > >>> +		}
> > >>> +		total_num += batch_num;
> > >>> +	} while (total_num < num);
> > >>> +
> > >>> +	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
> > >>> +				hw->weak_barriers);
> > >>> +	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
> > >>> +}
> > >>> +
> > >>> +uint16_t
> > >>> +virtio_recv_pkts_packed_vec(void *rx_queue,
> > >>> +			    struct rte_mbuf **rx_pkts,
> > >>> +			    uint16_t nb_pkts)
> > >>> +{
> > >>> +	struct virtnet_rx *rxvq = rx_queue;
> > >>> +	struct virtqueue *vq = rxvq->vq;
> > >>> +	struct virtio_hw *hw = vq->hw;
> > >>> +	uint16_t num, nb_rx = 0;
> > >>> +	uint32_t nb_enqueued = 0;
> > >>> +	uint16_t free_cnt = vq->vq_free_thresh;
> > >>> +
> > >>> +	if (unlikely(hw->started == 0))
> > >>> +		return nb_rx;
> > >>> +
> > >>> +	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
> > >>> +	if (likely(num > PACKED_BATCH_SIZE))
> > >>> +		num = num - ((vq->vq_used_cons_idx + num) %
> > >> PACKED_BATCH_SIZE);
> > >>> +
> > >>> +	while (num) {
> > >>> +		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
> > >>> +					&rx_pkts[nb_rx])) {
> > >>> +			nb_rx += PACKED_BATCH_SIZE;
> > >>> +			num -= PACKED_BATCH_SIZE;
> > >>> +			continue;
> > >>> +		}
> > >>> +		if (!virtqueue_dequeue_single_packed_vec(rxvq,
> > >>> +					&rx_pkts[nb_rx])) {
> > >>> +			nb_rx++;
> > >>> +			num--;
> > >>> +			continue;
> > >>> +		}
> > >>> +		break;
> > >>> +	};
> > >>> +
> > >>> +	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
> > >>> +
> > >>> +	rxvq->stats.packets += nb_rx;
> > >>> +
> > >>> +	if (likely(vq->vq_free_cnt >= free_cnt)) {
> > >>> +		struct rte_mbuf *new_pkts[free_cnt];
> > >>> +		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
> > >>> +						free_cnt) == 0)) {
> > >>> +			virtio_recv_refill_packed_vec(rxvq, new_pkts,
> > >>> +					free_cnt);
> > >>> +			nb_enqueued += free_cnt;
> > >>> +		} else {
> > >>> +			struct rte_eth_dev *dev =
> > >>> +				&rte_eth_devices[rxvq->port_id];
> > >>> +			dev->data->rx_mbuf_alloc_failed += free_cnt;
> > >>> +		}
> > >>> +	}
> > >>> +
> > >>> +	if (likely(nb_enqueued)) {
> > >>> +		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
> > >>> +			virtqueue_notify(vq);
> > >>> +			PMD_RX_LOG(DEBUG, "Notified");
> > >>> +		}
> > >>> +	}
> > >>> +
> > >>> +	return nb_rx;
> > >>> +}
> > >>> diff --git a/drivers/net/virtio/virtio_user_ethdev.c
> > >> b/drivers/net/virtio/virtio_user_ethdev.c
> > >>> index 40ad786cc..c54698ad1 100644
> > >>> --- a/drivers/net/virtio/virtio_user_ethdev.c
> > >>> +++ b/drivers/net/virtio/virtio_user_ethdev.c
> > >>> @@ -528,6 +528,7 @@ virtio_user_eth_dev_alloc(struct
> > rte_vdev_device
> > >> *vdev)
> > >>>  	hw->use_msix = 1;
> > >>>  	hw->modern   = 0;
> > >>>  	hw->use_vec_rx = 0;
> > >>> +	hw->use_vec_tx = 0;
> > >>>  	hw->use_inorder_rx = 0;
> > >>>  	hw->use_inorder_tx = 0;
> > >>>  	hw->virtio_user_dev = dev;
> > >>> @@ -739,8 +740,19 @@ virtio_user_pmd_probe(struct
> rte_vdev_device
> > >> *dev)
> > >>>  		goto end;
> > >>>  	}
> > >>>
> > >>> -	if (vectorized)
> > >>> -		hw->use_vec_rx = 1;
> > >>> +	if (vectorized) {
> > >>> +		if (packed_vq) {
> > >>> +#if defined(CC_AVX512_SUPPORT)
> > >>> +			hw->use_vec_rx = 1;
> > >>> +			hw->use_vec_tx = 1;
> > >>> +#else
> > >>> +			PMD_INIT_LOG(INFO,
> > >>> +				"building environment do not support
> > packed
> > >> ring vectorized");
> > >>> +#endif
> > >>> +		} else {
> > >>> +			hw->use_vec_rx = 1;
> > >>> +		}
> > >>> +	}
> > >>>
> > >>>  	rte_eth_dev_probing_finish(eth_dev);
> > >>>  	ret = 0;
> > >>> diff --git a/drivers/net/virtio/virtqueue.h
> > b/drivers/net/virtio/virtqueue.h
> > >>> index ca1c10499..ce0340743 100644
> > >>> --- a/drivers/net/virtio/virtqueue.h
> > >>> +++ b/drivers/net/virtio/virtqueue.h
> > >>> @@ -239,7 +239,8 @@ struct vq_desc_extra {
> > >>>  	void *cookie;
> > >>>  	uint16_t ndescs;
> > >>>  	uint16_t next;
> > >>> -};
> > >>> +	uint8_t padding[4];
> > >>> +} __rte_packed __rte_aligned(16);
> > >>
> > >> Can't this introduce a performance impact for the non-vectorized
> > >> case? I think of worse cache liens utilization.
> > >>
> > >> For example with a burst of 32 descriptors with 32B cachelines, before
> > >> it would take 14 cachelines, after 16. So for each burst, one could face
> > >> 2 extra cache misses.
> > >>
> > >> If you could run non-vectorized benchamrks with and without that
> patch,
> > >> I would be grateful.
> > >>
> > >
> > > Maxime,
> > > Thanks for point it out, it will add extra cache miss in datapath.
> > > And its impact on performance is around 1% in loopback case.
> >
> > Ok, thanks for doing the test. I'll try to run some PVP benchmarks
> > on my side because when doing IO loopback, the cache pressure is
> > much less important.
> >
> > > While benefit of vectorized path will be more than that number.
> >
> > Ok, but I disagree for two reasons:
> >  1. You have to keep in mind than non-vectorized is the default and
> > encouraged mode to use. Indeed, it takes a lot of shortcuts like not
> > checking header length (so no error stats), etc...
> >
> Ok, I will keep non-vectorized same as before.
> 
> >  2. It's like saying it's OK it degrades by 5% on $CPU_VENDOR_A because
> > the gain is 20% on $CPU_VENDOR_B.
> >
> > In the case we see more degradation in real-world scenario, you might
> > want to consider using ifdefs to avoid adding padding in the non-
> > vectorized case, like you did to differentiate Virtio PMD to Virtio-user
> > PMD in patch 7.
> >
> 
> Maxime,
> The performance difference is so slight, so I ignored for it look like a
> sampling error.
> It maybe not suitable to add new configuration for such setting which only
> used inside driver.
> Virtio driver can check whether virtqueue is using vectorized path when
> initialization, will use padded structure if it is.
> I have added some tested code and now performance came back.  Since
> code has changed in initialization process,  it need some time for regression
> check.
> 

+ One more update:
based on the latest code, the batch store with the padded structure shows no benefit.
It may be that the additional load/store cost cannot be hidden by the saved CPU cycles.
I will remove the padded structure and keep things as clear as before.
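
For reference, a small sketch of the size/alignment relationship the padding
was meant to establish (illustrative only; the name vq_desc_extra_padded is
not from the series, which patches struct vq_desc_extra in place): with the
extra 4 bytes and 16-byte alignment, four entries span exactly one 64-byte
cache line, which is what let the batch path touch them in one shot.

#include <rte_common.h>	/* __rte_packed, __rte_aligned, RTE_BUILD_BUG_ON */

struct vq_desc_extra_padded {
	void *cookie;
	uint16_t ndescs;
	uint16_t next;
	uint8_t padding[4];	/* 8 + 2 + 2 + 4 = 16 bytes */
} __rte_packed __rte_aligned(16);

static inline void
vq_desc_extra_padded_check(void)
{
	/* 16 bytes per entry, so four entries fill one 64-byte cache line
	 * on x86, matching PACKED_BATCH_SIZE = 4 in the patch below.
	 */
	RTE_BUILD_BUG_ON(sizeof(struct vq_desc_extra_padded) != 16);
}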

> Regards,
> Marvin
> 
> > Thanks,
> > Maxime
> >
> > > Thanks,
> > > Marvin
> > >
> > >> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > >>
> > >> Thanks,
> > >> Maxime
> > >
  

Patch

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index c9edb84ee..102b1deab 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -36,6 +36,41 @@  else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
 endif
 
+ifneq ($(FORCE_DISABLE_AVX512), y)
+	CC_AVX512_SUPPORT=\
+	$(shell $(CC) -march=native -dM -E - </dev/null 2>&1 | \
+	sed '/./{H;$$!d} ; x ; /AVX512F/!d; /AVX512BW/!d; /AVX512VL/!d' | \
+	grep -q AVX512 && echo 1)
+endif
+
+ifeq ($(CC_AVX512_SUPPORT), 1)
+CFLAGS += -DCC_AVX512_SUPPORT
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_packed_avx.c
+
+ifeq ($(RTE_TOOLCHAIN), gcc)
+ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
+CFLAGS += -DVIRTIO_GCC_UNROLL_PRAGMA
+endif
+endif
+
+ifeq ($(RTE_TOOLCHAIN), clang)
+ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1)
+CFLAGS += -DVIRTIO_CLANG_UNROLL_PRAGMA
+endif
+endif
+
+ifeq ($(RTE_TOOLCHAIN), icc)
+ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
+CFLAGS += -DVIRTIO_ICC_UNROLL_PRAGMA
+endif
+endif
+
+CFLAGS_virtio_rxtx_packed_avx.o += -mavx512f -mavx512bw -mavx512vl
+ifeq ($(shell test $(GCC_VERSION) -ge 100 && echo 1), 1)
+CFLAGS_virtio_rxtx_packed_avx.o += -Wno-zero-length-bounds
+endif
+endif
+
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
diff --git a/drivers/net/virtio/meson.build b/drivers/net/virtio/meson.build
index 15150eea1..8e68c3039 100644
--- a/drivers/net/virtio/meson.build
+++ b/drivers/net/virtio/meson.build
@@ -9,6 +9,20 @@  sources += files('virtio_ethdev.c',
 deps += ['kvargs', 'bus_pci']
 
 if arch_subdir == 'x86'
+	if '-mno-avx512f' not in machine_args
+		if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
+			cflags += ['-mavx512f', '-mavx512bw', '-mavx512vl']
+			cflags += ['-DCC_AVX512_SUPPORT']
+			if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
+				cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
+			elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
+				cflags += '-DVHOST_CLANG_UNROLL_PRAGMA'
+			elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0'))
+				cflags += '-DVHOST_ICC_UNROLL_PRAGMA'
+			endif
+			sources += files('virtio_rxtx_packed_avx.c')
+		endif
+	endif
 	sources += files('virtio_rxtx_simple_sse.c')
 elif arch_subdir == 'ppc'
 	sources += files('virtio_rxtx_simple_altivec.c')
diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
index febaf17a8..5c112cac7 100644
--- a/drivers/net/virtio/virtio_ethdev.h
+++ b/drivers/net/virtio/virtio_ethdev.h
@@ -105,6 +105,9 @@  uint16_t virtio_xmit_pkts_inorder(void *tx_queue, struct rte_mbuf **tx_pkts,
 uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts);
 
+uint16_t virtio_recv_pkts_packed_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		uint16_t nb_pkts);
+
 int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
 
 void virtio_interrupt_handler(void *param);
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a549991aa..534562cca 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -2030,3 +2030,11 @@  virtio_xmit_pkts_inorder(void *tx_queue,
 
 	return nb_tx;
 }
+
+__rte_weak uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue __rte_unused,
+			    struct rte_mbuf **rx_pkts __rte_unused,
+			    uint16_t nb_pkts __rte_unused)
+{
+	return 0;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_packed_avx.c b/drivers/net/virtio/virtio_rxtx_packed_avx.c
new file mode 100644
index 000000000..8a7b459eb
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_packed_avx.c
@@ -0,0 +1,374 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_net.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+#define BYTE_SIZE 8
+/* flag bits offset in packed ring desc higher 64bits */
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+#define PACKED_FLAGS_MASK ((0ULL | VRING_PACKED_DESC_F_AVAIL_USED) << \
+	FLAGS_BITS_OFFSET)
+
+#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
+	sizeof(struct vring_packed_desc))
+#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
+
+#ifdef VIRTIO_GCC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_CLANG_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifdef VIRTIO_ICC_UNROLL_PRAGMA
+#define virtio_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
+	for (iter = val; iter < size; iter++)
+#endif
+
+#ifndef virtio_for_each_try_unroll
+#define virtio_for_each_try_unroll(iter, val, num) \
+	for (iter = val; iter < num; iter++)
+#endif
+
+static inline void
+virtio_update_batch_stats(struct virtnet_stats *stats,
+			  uint16_t pkt_len1,
+			  uint16_t pkt_len2,
+			  uint16_t pkt_len3,
+			  uint16_t pkt_len4)
+{
+	stats->bytes += pkt_len1;
+	stats->bytes += pkt_len2;
+	stats->bytes += pkt_len3;
+	stats->bytes += pkt_len4;
+}
+
+/* Optionally fill offload information in structure */
+static inline int
+virtio_vec_rx_offload(struct rte_mbuf *m, struct virtio_net_hdr *hdr)
+{
+	struct rte_net_hdr_lens hdr_lens;
+	uint32_t hdrlen, ptype;
+	int l4_supported = 0;
+
+	/* nothing to do */
+	if (hdr->flags == 0)
+		return 0;
+
+	/* GSO is not supported in the vec path, skip the check */
+	m->ol_flags |= PKT_RX_IP_CKSUM_UNKNOWN;
+
+	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
+	m->packet_type = ptype;
+	if ((ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP ||
+	    (ptype & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP)
+		l4_supported = 1;
+
+	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		hdrlen = hdr_lens.l2_len + hdr_lens.l3_len + hdr_lens.l4_len;
+		if (hdr->csum_start <= hdrlen && l4_supported) {
+			m->ol_flags |= PKT_RX_L4_CKSUM_NONE;
+		} else {
+			/* Unknown proto or tunnel, do sw cksum. We can assume
+			 * the cksum field is in the first segment since the
+			 * buffers we provided to the host are large enough.
+			 * In case of SCTP, this will be wrong since it's a CRC
+			 * but there's nothing we can do.
+			 */
+			uint16_t csum = 0, off;
+
+			rte_raw_cksum_mbuf(m, hdr->csum_start,
+				rte_pktmbuf_pkt_len(m) - hdr->csum_start,
+				&csum);
+			if (likely(csum != 0xffff))
+				csum = ~csum;
+			off = hdr->csum_offset + hdr->csum_start;
+			if (rte_pktmbuf_data_len(m) >= off + 1)
+				*rte_pktmbuf_mtod_offset(m, uint16_t *,
+					off) = csum;
+		}
+	} else if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID && l4_supported) {
+		m->ol_flags |= PKT_RX_L4_CKSUM_GOOD;
+	}
+
+	return 0;
+}
+
+static inline uint16_t
+virtqueue_dequeue_batch_packed_vec(struct virtnet_rx *rxvq,
+				   struct rte_mbuf **rx_pkts)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t hdr_size = hw->vtnet_hdr_size;
+	uint64_t addrs[PACKED_BATCH_SIZE];
+	uint16_t id = vq->vq_used_cons_idx;
+	uint8_t desc_stats;
+	uint16_t i;
+	void *desc_addr;
+
+	if (id & PACKED_BATCH_MASK)
+		return -1;
+
+	if (unlikely((id + PACKED_BATCH_SIZE) > vq->vq_nentries))
+		return -1;
+
+	/* only care avail/used bits */
+	__m512i v_mask = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
+	desc_addr = &vq->vq_packed.ring.desc[id];
+
+	__m512i v_desc = _mm512_loadu_si512(desc_addr);
+	__m512i v_flag = _mm512_and_epi64(v_desc, v_mask);
+
+	__m512i v_used_flag = _mm512_setzero_si512();
+	if (vq->vq_packed.used_wrap_counter)
+		v_used_flag = _mm512_maskz_set1_epi64(0xaa, PACKED_FLAGS_MASK);
+
+	/* Check all descs are used */
+	desc_stats = _mm512_cmpneq_epu64_mask(v_flag, v_used_flag);
+	if (desc_stats)
+		return -1;
+
+	virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		rx_pkts[i] = (struct rte_mbuf *)vq->vq_descx[id + i].cookie;
+		rte_packet_prefetch(rte_pktmbuf_mtod(rx_pkts[i], void *));
+
+		addrs[i] = (uint64_t)rx_pkts[i]->rx_descriptor_fields1;
+	}
+
+	/*
+	 * load len from desc, store into mbuf pkt_len and data_len
+	 * len is limited by the 16-bit buf_len, so pkt_len[16:31] can be ignored
+	 */
+	const __mmask16 mask = 0x6 | 0x6 << 4 | 0x6 << 8 | 0x6 << 12;
+	__m512i values = _mm512_maskz_shuffle_epi32(mask, v_desc, 0xAA);
+
+	/* subtract hdr_size from pkt_len and data_len */
+	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(mask,
+			(uint32_t)-hdr_size);
+
+	__m512i v_value = _mm512_add_epi32(values, mbuf_len_offset);
+
+	/* assert offset of data_len */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+		offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+
+	__m512i v_index = _mm512_set_epi64(addrs[3] + 8, addrs[3],
+					   addrs[2] + 8, addrs[2],
+					   addrs[1] + 8, addrs[1],
+					   addrs[0] + 8, addrs[0]);
+	/* batch store into mbufs */
+	_mm512_i64scatter_epi64(0, v_index, v_value, 1);
+
+	if (hw->has_rx_offload) {
+		virtio_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			char *addr = (char *)rx_pkts[i]->buf_addr +
+				RTE_PKTMBUF_HEADROOM - hdr_size;
+			virtio_vec_rx_offload(rx_pkts[i],
+					(struct virtio_net_hdr *)addr);
+		}
+	}
+
+	virtio_update_batch_stats(&rxvq->stats, rx_pkts[0]->pkt_len,
+			rx_pkts[1]->pkt_len, rx_pkts[2]->pkt_len,
+			rx_pkts[3]->pkt_len);
+
+	vq->vq_free_cnt += PACKED_BATCH_SIZE;
+
+	vq->vq_used_cons_idx += PACKED_BATCH_SIZE;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+static uint16_t
+virtqueue_dequeue_single_packed_vec(struct virtnet_rx *rxvq,
+				    struct rte_mbuf **rx_pkts)
+{
+	uint16_t used_idx, id;
+	uint32_t len;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint32_t hdr_size = hw->vtnet_hdr_size;
+	struct virtio_net_hdr *hdr;
+	struct vring_packed_desc *desc;
+	struct rte_mbuf *cookie;
+
+	desc = vq->vq_packed.ring.desc;
+	used_idx = vq->vq_used_cons_idx;
+	if (!desc_is_used(&desc[used_idx], vq))
+		return -1;
+
+	len = desc[used_idx].len;
+	id = desc[used_idx].id;
+	cookie = (struct rte_mbuf *)vq->vq_descx[id].cookie;
+	if (unlikely(cookie == NULL)) {
+		PMD_DRV_LOG(ERR, "vring descriptor with no mbuf cookie at %u",
+				vq->vq_used_cons_idx);
+		return -1;
+	}
+	rte_prefetch0(cookie);
+	rte_packet_prefetch(rte_pktmbuf_mtod(cookie, void *));
+
+	cookie->data_off = RTE_PKTMBUF_HEADROOM;
+	cookie->ol_flags = 0;
+	cookie->pkt_len = (uint32_t)(len - hdr_size);
+	cookie->data_len = (uint32_t)(len - hdr_size);
+
+	hdr = (struct virtio_net_hdr *)((char *)cookie->buf_addr +
+					RTE_PKTMBUF_HEADROOM - hdr_size);
+	if (hw->has_rx_offload)
+		virtio_vec_rx_offload(cookie, hdr);
+
+	*rx_pkts = cookie;
+
+	rxvq->stats.bytes += cookie->pkt_len;
+
+	vq->vq_free_cnt++;
+	vq->vq_used_cons_idx++;
+	if (vq->vq_used_cons_idx >= vq->vq_nentries) {
+		vq->vq_used_cons_idx -= vq->vq_nentries;
+		vq->vq_packed.used_wrap_counter ^= 1;
+	}
+
+	return 0;
+}
+
+static inline void
+virtio_recv_refill_packed_vec(struct virtnet_rx *rxvq,
+			      struct rte_mbuf **cookie,
+			      uint16_t num)
+{
+	struct virtqueue *vq = rxvq->vq;
+	struct vring_packed_desc *start_dp = vq->vq_packed.ring.desc;
+	uint16_t flags = vq->vq_packed.cached_flags;
+	struct virtio_hw *hw = vq->hw;
+	struct vq_desc_extra *dxp;
+	uint16_t idx, i;
+	uint16_t batch_num, total_num = 0;
+	uint16_t head_idx = vq->vq_avail_idx;
+	uint16_t head_flag = vq->vq_packed.cached_flags;
+	uint64_t addr;
+
+	do {
+		idx = vq->vq_avail_idx;
+
+		batch_num = PACKED_BATCH_SIZE;
+		if (unlikely((idx + PACKED_BATCH_SIZE) > vq->vq_nentries))
+			batch_num = vq->vq_nentries - idx;
+		if (unlikely((total_num + batch_num) > num))
+			batch_num = num - total_num;
+
+		virtio_for_each_try_unroll(i, 0, batch_num) {
+			dxp = &vq->vq_descx[idx + i];
+			dxp->cookie = (void *)cookie[total_num + i];
+
+			addr = VIRTIO_MBUF_ADDR(cookie[total_num + i], vq) +
+				RTE_PKTMBUF_HEADROOM - hw->vtnet_hdr_size;
+			start_dp[idx + i].addr = addr;
+			start_dp[idx + i].len = cookie[total_num + i]->buf_len
+				- RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
+			if (total_num || i) {
+				virtqueue_store_flags_packed(&start_dp[idx + i],
+						flags, hw->weak_barriers);
+			}
+		}
+
+		vq->vq_avail_idx += batch_num;
+		if (vq->vq_avail_idx >= vq->vq_nentries) {
+			vq->vq_avail_idx -= vq->vq_nentries;
+			vq->vq_packed.cached_flags ^=
+				VRING_PACKED_DESC_F_AVAIL_USED;
+			flags = vq->vq_packed.cached_flags;
+		}
+		total_num += batch_num;
+	} while (total_num < num);
+
+	virtqueue_store_flags_packed(&start_dp[head_idx], head_flag,
+				hw->weak_barriers);
+	vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - num);
+}
+
+uint16_t
+virtio_recv_pkts_packed_vec(void *rx_queue,
+			    struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	struct virtio_hw *hw = vq->hw;
+	uint16_t num, nb_rx = 0;
+	uint32_t nb_enqueued = 0;
+	uint16_t free_cnt = vq->vq_free_thresh;
+
+	if (unlikely(hw->started == 0))
+		return nb_rx;
+
+	num = RTE_MIN(VIRTIO_MBUF_BURST_SZ, nb_pkts);
+	if (likely(num > PACKED_BATCH_SIZE))
+		num = num - ((vq->vq_used_cons_idx + num) % PACKED_BATCH_SIZE);
+
+	while (num) {
+		if (!virtqueue_dequeue_batch_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx += PACKED_BATCH_SIZE;
+			num -= PACKED_BATCH_SIZE;
+			continue;
+		}
+		if (!virtqueue_dequeue_single_packed_vec(rxvq,
+					&rx_pkts[nb_rx])) {
+			nb_rx++;
+			num--;
+			continue;
+		}
+		break;
+	};
+
+	PMD_RX_LOG(DEBUG, "dequeue:%d", num);
+
+	rxvq->stats.packets += nb_rx;
+
+	if (likely(vq->vq_free_cnt >= free_cnt)) {
+		struct rte_mbuf *new_pkts[free_cnt];
+		if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,
+						free_cnt) == 0)) {
+			virtio_recv_refill_packed_vec(rxvq, new_pkts,
+					free_cnt);
+			nb_enqueued += free_cnt;
+		} else {
+			struct rte_eth_dev *dev =
+				&rte_eth_devices[rxvq->port_id];
+			dev->data->rx_mbuf_alloc_failed += free_cnt;
+		}
+	}
+
+	if (likely(nb_enqueued)) {
+		if (unlikely(virtqueue_kick_prepare_packed(vq))) {
+			virtqueue_notify(vq);
+			PMD_RX_LOG(DEBUG, "Notified");
+		}
+	}
+
+	return nb_rx;
+}
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 40ad786cc..c54698ad1 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -528,6 +528,7 @@  virtio_user_eth_dev_alloc(struct rte_vdev_device *vdev)
 	hw->use_msix = 1;
 	hw->modern   = 0;
 	hw->use_vec_rx = 0;
+	hw->use_vec_tx = 0;
 	hw->use_inorder_rx = 0;
 	hw->use_inorder_tx = 0;
 	hw->virtio_user_dev = dev;
@@ -739,8 +740,19 @@  virtio_user_pmd_probe(struct rte_vdev_device *dev)
 		goto end;
 	}
 
-	if (vectorized)
-		hw->use_vec_rx = 1;
+	if (vectorized) {
+		if (packed_vq) {
+#if defined(CC_AVX512_SUPPORT)
+			hw->use_vec_rx = 1;
+			hw->use_vec_tx = 1;
+#else
+			PMD_INIT_LOG(INFO,
+				"building environment does not support packed ring vectorized");
+#endif
+		} else {
+			hw->use_vec_rx = 1;
+		}
+	}
 
 	rte_eth_dev_probing_finish(eth_dev);
 	ret = 0;
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index ca1c10499..ce0340743 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -239,7 +239,8 @@  struct vq_desc_extra {
 	void *cookie;
 	uint16_t ndescs;
 	uint16_t next;
-};
+	uint8_t padding[4];
+} __rte_packed __rte_aligned(16);
 
 struct virtqueue {
 	struct virtio_hw  *hw; /**< virtio_hw structure pointer. */