[v2,5/5] vhost: add packed ring vectorized enqueue

Message ID 20200921064837.15957-6-yong.liu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series: vhost add vectorized data path

Checks

Context                      Check    Description
ci/checkpatch                success  coding style OK
ci/Intel-compilation         success  Compilation OK
ci/iol-broadcom-Functional   success  Functional Testing PASS
ci/iol-intel-Functional      success  Functional Testing PASS
ci/iol-broadcom-Performance  success  Performance Testing PASS
ci/iol-testing               success  Testing PASS
ci/iol-intel-Performance     success  Performance Testing PASS
ci/travis-robot              success  Travis build: passed

Commit Message

Marvin Liu Sept. 21, 2020, 6:48 a.m. UTC
  Optimize the vhost packed ring enqueue path with SIMD instructions. The
status and length of four descriptors are handled in a batch with AVX512
instructions. Address translation operations are also accelerated by
AVX512 instructions.

Signed-off-by: Marvin Liu <yong.liu@intel.com>
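
The address translation the commit message refers to is the per-descriptor
guest-physical to host-virtual lookup. A scalar sketch of that lookup is shown
below for orientation (illustrative only; the helper name is invented here, and
the AVX512 path performs the same containment test against all regions at once
with masked 64-bit compares):

#include <stdint.h>
#include <rte_vhost.h>

/*
 * Return the host virtual address backing the guest physical buffer
 * [addr, addr + len), or 0 if no single memory region contains it.
 */
static inline uintptr_t
gpa_range_to_hva(struct rte_vhost_memory *mem, uint64_t addr, uint64_t len)
{
	uint32_t i;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg = &mem->regions[i];

		if (addr >= reg->guest_phys_addr &&
		    addr + len < reg->guest_phys_addr + reg->size)
			return (uintptr_t)(addr - reg->guest_phys_addr +
					   reg->host_user_addr);
	}

	return 0;
}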
  

Comments

Maxime Coquelin Oct. 6, 2020, 3 p.m. UTC | #1
On 9/21/20 8:48 AM, Marvin Liu wrote:
> Optimize vhost packed ring enqueue path with SIMD instructions. Four
> descriptors status and length are batched handled with AVX512
> instructions. Address translation operations are also accelerated
> by AVX512 instructions.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index fc7daf2145..b78b2c5c1b 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -1132,4 +1132,10 @@ vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
>  				 uint16_t avail_idx,
>  				 uintptr_t *desc_addrs,
>  				 uint16_t *ids);
> +
> +int
> +virtio_dev_rx_batch_packed_avx(struct virtio_net *dev,
> +			       struct vhost_virtqueue *vq,
> +			       struct rte_mbuf **pkts);
> +
>  #endif /* _VHOST_NET_CDEV_H_ */
> diff --git a/lib/librte_vhost/vhost_vec_avx.c b/lib/librte_vhost/vhost_vec_avx.c
> index dc5322d002..7d2250ed86 100644
> --- a/lib/librte_vhost/vhost_vec_avx.c
> +++ b/lib/librte_vhost/vhost_vec_avx.c
> @@ -35,9 +35,15 @@
>  #define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) << FLAGS_BITS_OFFSET)
>  #define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
>  	FLAGS_BITS_OFFSET)
> +#define PACKED_WRITE_AVAIL_FLAG (PACKED_AVAIL_FLAG | \
> +	((0ULL | VRING_DESC_F_WRITE) << FLAGS_BITS_OFFSET))
> +#define PACKED_WRITE_AVAIL_FLAG_WRAP (PACKED_AVAIL_FLAG_WRAP | \
> +	((0ULL | VRING_DESC_F_WRITE) << FLAGS_BITS_OFFSET))
>  
>  #define DESC_FLAGS_POS 0xaa
>  #define MBUF_LENS_POS 0x6666
> +#define DESC_LENS_POS 0x4444
> +#define DESC_LENS_FLAGS_POS 0xB0B0B0B0
>  
>  int
>  vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> @@ -179,3 +185,154 @@ vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
>  
>  	return -1;
>  }
> +
> +int
> +virtio_dev_rx_batch_packed_avx(struct virtio_net *dev,
> +			       struct vhost_virtqueue *vq,
> +			       struct rte_mbuf **pkts)
> +{
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint64_t desc_addrs[PACKED_BATCH_SIZE];
> +	uint32_t buf_offset = dev->vhost_hlen;
> +	uint32_t desc_status;
> +	uint64_t lens[PACKED_BATCH_SIZE];
> +	uint16_t i;
> +	void *desc_addr;
> +	uint8_t cmp_low, cmp_high, cmp_result;
> +
> +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> +		return -1;

Same comment as for patch 4. Packed ring size may not be a pow2.
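
A guard that does not assume a power-of-two ring size could also bound-check
the batch against the ring size, for example (sketch, mirroring the check done
in the scalar batch helper):

	if (unlikely(avail_idx & PACKED_BATCH_MASK))
		return -1;

	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
		return -1;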

> +	/* check refcnt and nb_segs */
> +	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
> +
> +	/* load four mbufs rearm data */
> +	__m256i mbufs = _mm256_set_epi64x(
> +				*pkts[3]->rearm_data,
> +				*pkts[2]->rearm_data,
> +				*pkts[1]->rearm_data,
> +				*pkts[0]->rearm_data);
> +
> +	uint16_t cmp = _mm256_cmpneq_epu16_mask(mbufs, mbuf_ref);
> +	if (cmp & MBUF_LENS_POS)
> +		return -1;
> +
> +	/* check desc status */
> +	desc_addr = &vq->desc_packed[avail_idx];
> +	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
> +
> +	__m512i avail_flag_vec;
> +	__m512i used_flag_vec;
> +	if (vq->avail_wrap_counter) {
> +#if defined(RTE_ARCH_I686)

Is supporting AVX512 on i686 really useful/necessary?

> +		avail_flag_vec = _mm512_set4_epi64(PACKED_WRITE_AVAIL_FLAG,
> +					0x0, PACKED_WRITE_AVAIL_FLAG, 0x0);
> +		used_flag_vec = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
> +					PACKED_FLAGS_MASK, 0x0);
> +#else
> +		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_WRITE_AVAIL_FLAG);
> +		used_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_FLAGS_MASK);
> +#endif
> +	} else {
> +#if defined(RTE_ARCH_I686)
> +		avail_flag_vec = _mm512_set4_epi64(
> +					PACKED_WRITE_AVAIL_FLAG_WRAP, 0x0,
> +					PACKED_WRITE_AVAIL_FLAG, 0x0);
> +		used_flag_vec = _mm512_set4_epi64(0x0, 0x0, 0x0, 0x0);
> +#else
> +		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_WRITE_AVAIL_FLAG_WRAP);
> +		used_flag_vec = _mm512_setzero_epi32();
> +#endif
> +	}
> +
> +	desc_status = _mm512_mask_cmp_epu16_mask(BATCH_FLAGS_MASK, desc_vec,
> +				avail_flag_vec, _MM_CMPINT_NE);
> +	if (desc_status)
> +		return -1;
> +
> +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			uint64_t size = (uint64_t)descs[avail_idx + i].len;
> +			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
> +				descs[avail_idx + i].addr, &size,
> +				VHOST_ACCESS_RW);
> +
> +			if (!desc_addrs[i])
> +				return -1;
> +
> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
> +					0));
> +		}
> +	} else {
> +		/* check buffer fit into one region & translate address */
> +		__m512i regions_low_addrs =
> +			_mm512_loadu_si512((void *)&dev->regions_low_addrs);
> +		__m512i regions_high_addrs =
> +			_mm512_loadu_si512((void *)&dev->regions_high_addrs);
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			uint64_t addr_low = descs[avail_idx + i].addr;
> +			uint64_t addr_high = addr_low +
> +						descs[avail_idx + i].len;
> +			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
> +			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
> +
> +			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
> +					regions_low_addrs, _MM_CMPINT_NLT);
> +			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
> +					regions_high_addrs, _MM_CMPINT_LT);
> +			cmp_result = cmp_low & cmp_high;
> +			int index = __builtin_ctz(cmp_result);
> +			if (unlikely((uint32_t)index >= dev->mem->nregions))
> +				return -1;
> +
> +			desc_addrs[i] = addr_low +
> +				dev->mem->regions[index].host_user_addr -
> +				dev->mem->regions[index].guest_phys_addr;
> +			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
> +					0));
> +		}
> +	}
> +
> +	/* check length is enough */
> +	__m512i pkt_lens = _mm512_set_epi32(
> +			0, pkts[3]->pkt_len, 0, 0,
> +			0, pkts[2]->pkt_len, 0, 0,
> +			0, pkts[1]->pkt_len, 0, 0,
> +			0, pkts[0]->pkt_len, 0, 0);
> +
> +	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(DESC_LENS_POS,
> +					dev->vhost_hlen);
> +	__m512i buf_len_vec = _mm512_add_epi32(pkt_lens, mbuf_len_offset);
> +	uint16_t lens_cmp = _mm512_mask_cmp_epu32_mask(DESC_LENS_POS,
> +				desc_vec, buf_len_vec, _MM_CMPINT_LT);
> +	if (lens_cmp)
> +		return -1;
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
> +			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
> +			   pkts[i]->pkt_len);
> +	}
> +
> +	if (unlikely((dev->features & (1ULL << VHOST_F_LOG_ALL)))) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			lens[i] = descs[avail_idx + i].len;
> +			vhost_log_cache_write_iova(dev, vq,
> +				descs[avail_idx + i].addr, lens[i]);
> +		}
> +	}
> +
> +	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
> +	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
> +	/* save len and flags, skip addr and id */
> +	__m512i desc_updated = _mm512_mask_add_epi16(desc_vec,
> +					DESC_LENS_FLAGS_POS, buf_len_vec,
> +					used_flag_vec);
> +	_mm512_storeu_si512(desc_addr, desc_updated);
> +
> +	return 0;
> +}
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index e4d2e2e7d6..5c56a8d6ff 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1354,6 +1354,21 @@ virtio_dev_rx_single_packed(struct virtio_net *dev,
>  	return 0;
>  }
>  
> +static __rte_always_inline int
> +virtio_dev_rx_handle_batch_packed(struct virtio_net *dev,
> +			   struct vhost_virtqueue *vq,
> +			   struct rte_mbuf **pkts)
> +
> +{
> +	if (unlikely(dev->vectorized))
> +#ifdef CC_AVX512_SUPPORT
> +		return virtio_dev_rx_batch_packed_avx(dev, vq, pkts);
> +#else
> +		return virtio_dev_rx_batch_packed(dev, vq, pkts);
> +#endif
> +	return virtio_dev_rx_batch_packed(dev, vq, pkts);

It should be as below to not have any performance impact when
CC_AVX512_SUPPORT is not set:

#ifdef CC_AVX512_SUPPORT
	if (unlikely(dev->vectorized))
		return virtio_dev_rx_batch_packed_avx(dev, vq, pkts);
#else
	return virtio_dev_rx_batch_packed(dev, vq, pkts);
#endif
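
For completeness, a full wrapper along those lines keeps the scalar fallback
outside the conditional, so a device without the vectorized flag is still
handled on AVX512-enabled builds (sketch):

static __rte_always_inline int
virtio_dev_rx_handle_batch_packed(struct virtio_net *dev,
			   struct vhost_virtqueue *vq,
			   struct rte_mbuf **pkts)
{
#ifdef CC_AVX512_SUPPORT
	if (unlikely(dev->vectorized))
		return virtio_dev_rx_batch_packed_avx(dev, vq, pkts);
#endif
	return virtio_dev_rx_batch_packed(dev, vq, pkts);
}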

> +}
> +
>  static __rte_noinline uint32_t
>  virtio_dev_rx_packed(struct virtio_net *dev,
>  		     struct vhost_virtqueue *__rte_restrict vq,
> @@ -1367,8 +1382,8 @@ virtio_dev_rx_packed(struct virtio_net *dev,
>  		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
>  
>  		if (remained >= PACKED_BATCH_SIZE) {
> -			if (!virtio_dev_rx_batch_packed(dev, vq,
> -							&pkts[pkt_idx])) {
> +			if (!virtio_dev_rx_handle_batch_packed(dev, vq,
> +				&pkts[pkt_idx])) {
>  				pkt_idx += PACKED_BATCH_SIZE;
>  				remained -= PACKED_BATCH_SIZE;
>  				continue;
>
  
Marvin Liu Oct. 8, 2020, 7:09 a.m. UTC | #2
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, October 6, 2020 11:00 PM
> To: Liu, Yong <yong.liu@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v2 5/5] vhost: add packed ring vectorized enqueue
> 
> On 9/21/20 8:48 AM, Marvin Liu wrote:
> [snip]
> > +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> > +		return -1;
> 
> Same comment as for patch 4. Packed ring size may not be a pow2.
> 
Thanks, will fix in next version.

> [snip]
> 
> > +	if (vq->avail_wrap_counter) {
> > +#if defined(RTE_ARCH_I686)
> 
> Is supporting AVX512 on i686 really useful/necessary?
> 
It is not useful from a functional point of view. The i686 branch is only there so the file still compiles when an i686 build is enabled.
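
For context, the two constructions produce the same 512-bit pattern; the i686
branch only avoids the masked 64-bit broadcast intrinsic, which 32-bit targets
may not provide (illustrative sketch, not part of the patch):

#include <stdint.h>
#include <immintrin.h>

/*
 * With DESC_FLAGS_POS == 0xaa the masked broadcast puts the flag value in
 * 64-bit lanes 1, 3, 5 and 7 and zeroes the rest, and
 * _mm512_set4_epi64(flag, 0, flag, 0) repeats the same {0, flag, 0, flag}
 * pattern, so both branches yield an identical vector.
 */
static inline __m512i
build_flag_vec(uint64_t flag)
{
#if defined(RTE_ARCH_I686)
	return _mm512_set4_epi64(flag, 0x0, flag, 0x0);
#else
	return _mm512_maskz_set1_epi64(0xaa, flag);
#endif
}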

> [snip]
> 
> > +static __rte_always_inline int
> > +virtio_dev_rx_handle_batch_packed(struct virtio_net *dev,
> > +			   struct vhost_virtqueue *vq,
> > +			   struct rte_mbuf **pkts)
> > +
> > +{
> > +	if (unlikely(dev->vectorized))
> > +#ifdef CC_AVX512_SUPPORT
> > +		return virtio_dev_rx_batch_packed_avx(dev, vq, pkts);
> > +#else
> > +		return virtio_dev_rx_batch_packed(dev, vq, pkts);
> > +#endif
> > +	return virtio_dev_rx_batch_packed(dev, vq, pkts);
> 
> It should be as below to not have any performance impact when
> CC_AVX512_SUPPORT is not set:
> 
> #ifdef CC_AVX512_SUPPORT
> 	if (unlikely(dev->vectorized))
> 		return virtio_dev_rx_batch_packed_avx(dev, vq, pkts);
> #else
> 	return virtio_dev_rx_batch_packed(dev, vq, pkts);
> #endif
> 
Got it, will fix in next version.

  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index fc7daf2145..b78b2c5c1b 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -1132,4 +1132,10 @@  vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
 				 uint16_t avail_idx,
 				 uintptr_t *desc_addrs,
 				 uint16_t *ids);
+
+int
+virtio_dev_rx_batch_packed_avx(struct virtio_net *dev,
+			       struct vhost_virtqueue *vq,
+			       struct rte_mbuf **pkts);
+
 #endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost_vec_avx.c b/lib/librte_vhost/vhost_vec_avx.c
index dc5322d002..7d2250ed86 100644
--- a/lib/librte_vhost/vhost_vec_avx.c
+++ b/lib/librte_vhost/vhost_vec_avx.c
@@ -35,9 +35,15 @@ 
 #define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) << FLAGS_BITS_OFFSET)
 #define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
 	FLAGS_BITS_OFFSET)
+#define PACKED_WRITE_AVAIL_FLAG (PACKED_AVAIL_FLAG | \
+	((0ULL | VRING_DESC_F_WRITE) << FLAGS_BITS_OFFSET))
+#define PACKED_WRITE_AVAIL_FLAG_WRAP (PACKED_AVAIL_FLAG_WRAP | \
+	((0ULL | VRING_DESC_F_WRITE) << FLAGS_BITS_OFFSET))
 
 #define DESC_FLAGS_POS 0xaa
 #define MBUF_LENS_POS 0x6666
+#define DESC_LENS_POS 0x4444
+#define DESC_LENS_FLAGS_POS 0xB0B0B0B0
 
 int
 vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
@@ -179,3 +185,154 @@  vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
 
 	return -1;
 }
+
+int
+virtio_dev_rx_batch_packed_avx(struct virtio_net *dev,
+			       struct vhost_virtqueue *vq,
+			       struct rte_mbuf **pkts)
+{
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint64_t desc_addrs[PACKED_BATCH_SIZE];
+	uint32_t buf_offset = dev->vhost_hlen;
+	uint32_t desc_status;
+	uint64_t lens[PACKED_BATCH_SIZE];
+	uint16_t i;
+	void *desc_addr;
+	uint8_t cmp_low, cmp_high, cmp_result;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+
+	/* check refcnt and nb_segs */
+	__m256i mbuf_ref = _mm256_set1_epi64x(DEFAULT_REARM_DATA);
+
+	/* load four mbufs rearm data */
+	__m256i mbufs = _mm256_set_epi64x(
+				*pkts[3]->rearm_data,
+				*pkts[2]->rearm_data,
+				*pkts[1]->rearm_data,
+				*pkts[0]->rearm_data);
+
+	uint16_t cmp = _mm256_cmpneq_epu16_mask(mbufs, mbuf_ref);
+	if (cmp & MBUF_LENS_POS)
+		return -1;
+
+	/* check desc status */
+	desc_addr = &vq->desc_packed[avail_idx];
+	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
+
+	__m512i avail_flag_vec;
+	__m512i used_flag_vec;
+	if (vq->avail_wrap_counter) {
+#if defined(RTE_ARCH_I686)
+		avail_flag_vec = _mm512_set4_epi64(PACKED_WRITE_AVAIL_FLAG,
+					0x0, PACKED_WRITE_AVAIL_FLAG, 0x0);
+		used_flag_vec = _mm512_set4_epi64(PACKED_FLAGS_MASK, 0x0,
+					PACKED_FLAGS_MASK, 0x0);
+#else
+		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_WRITE_AVAIL_FLAG);
+		used_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_FLAGS_MASK);
+#endif
+	} else {
+#if defined(RTE_ARCH_I686)
+		avail_flag_vec = _mm512_set4_epi64(
+					PACKED_WRITE_AVAIL_FLAG_WRAP, 0x0,
+					PACKED_WRITE_AVAIL_FLAG, 0x0);
+		used_flag_vec = _mm512_set4_epi64(0x0, 0x0, 0x0, 0x0);
+#else
+		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_WRITE_AVAIL_FLAG_WRAP);
+		used_flag_vec = _mm512_setzero_epi32();
+#endif
+	}
+
+	desc_status = _mm512_mask_cmp_epu16_mask(BATCH_FLAGS_MASK, desc_vec,
+				avail_flag_vec, _MM_CMPINT_NE);
+	if (desc_status)
+		return -1;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			uint64_t size = (uint64_t)descs[avail_idx + i].len;
+			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
+				descs[avail_idx + i].addr, &size,
+				VHOST_ACCESS_RW);
+
+			if (!desc_addrs[i])
+				return -1;
+
+			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
+					0));
+		}
+	} else {
+		/* check buffer fit into one region & translate address */
+		__m512i regions_low_addrs =
+			_mm512_loadu_si512((void *)&dev->regions_low_addrs);
+		__m512i regions_high_addrs =
+			_mm512_loadu_si512((void *)&dev->regions_high_addrs);
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			uint64_t addr_low = descs[avail_idx + i].addr;
+			uint64_t addr_high = addr_low +
+						descs[avail_idx + i].len;
+			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
+			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
+
+			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
+					regions_low_addrs, _MM_CMPINT_NLT);
+			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
+					regions_high_addrs, _MM_CMPINT_LT);
+			cmp_result = cmp_low & cmp_high;
+			int index = __builtin_ctz(cmp_result);
+			if (unlikely((uint32_t)index >= dev->mem->nregions))
+				return -1;
+
+			desc_addrs[i] = addr_low +
+				dev->mem->regions[index].host_user_addr -
+				dev->mem->regions[index].guest_phys_addr;
+			rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *,
+					0));
+		}
+	}
+
+	/* check length is enough */
+	__m512i pkt_lens = _mm512_set_epi32(
+			0, pkts[3]->pkt_len, 0, 0,
+			0, pkts[2]->pkt_len, 0, 0,
+			0, pkts[1]->pkt_len, 0, 0,
+			0, pkts[0]->pkt_len, 0, 0);
+
+	__m512i mbuf_len_offset = _mm512_maskz_set1_epi32(DESC_LENS_POS,
+					dev->vhost_hlen);
+	__m512i buf_len_vec = _mm512_add_epi32(pkt_lens, mbuf_len_offset);
+	uint16_t lens_cmp = _mm512_mask_cmp_epu32_mask(DESC_LENS_POS,
+				desc_vec, buf_len_vec, _MM_CMPINT_LT);
+	if (lens_cmp)
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset),
+			   rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
+			   pkts[i]->pkt_len);
+	}
+
+	if (unlikely((dev->features & (1ULL << VHOST_F_LOG_ALL)))) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			lens[i] = descs[avail_idx + i].len;
+			vhost_log_cache_write_iova(dev, vq,
+				descs[avail_idx + i].addr, lens[i]);
+		}
+	}
+
+	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
+	vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE);
+	/* save len and flags, skip addr and id */
+	__m512i desc_updated = _mm512_mask_add_epi16(desc_vec,
+					DESC_LENS_FLAGS_POS, buf_len_vec,
+					used_flag_vec);
+	_mm512_storeu_si512(desc_addr, desc_updated);
+
+	return 0;
+}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index e4d2e2e7d6..5c56a8d6ff 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1354,6 +1354,21 @@  virtio_dev_rx_single_packed(struct virtio_net *dev,
 	return 0;
 }
 
+static __rte_always_inline int
+virtio_dev_rx_handle_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mbuf **pkts)
+
+{
+	if (unlikely(dev->vectorized))
+#ifdef CC_AVX512_SUPPORT
+		return virtio_dev_rx_batch_packed_avx(dev, vq, pkts);
+#else
+		return virtio_dev_rx_batch_packed(dev, vq, pkts);
+#endif
+	return virtio_dev_rx_batch_packed(dev, vq, pkts);
+}
+
 static __rte_noinline uint32_t
 virtio_dev_rx_packed(struct virtio_net *dev,
 		     struct vhost_virtqueue *__rte_restrict vq,
@@ -1367,8 +1382,8 @@  virtio_dev_rx_packed(struct virtio_net *dev,
 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
 
 		if (remained >= PACKED_BATCH_SIZE) {
-			if (!virtio_dev_rx_batch_packed(dev, vq,
-							&pkts[pkt_idx])) {
+			if (!virtio_dev_rx_handle_batch_packed(dev, vq,
+				&pkts[pkt_idx])) {
 				pkt_idx += PACKED_BATCH_SIZE;
 				remained -= PACKED_BATCH_SIZE;
 				continue;