[v3,1/5] build: add meson option to configure IOVA mode as VA

Message ID 4fbe435f0d86ef1bc7930bdb5847f41e2042f693.1663767715.git.sthotton@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series mbuf dynamic field expansion |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Shijith Thotton Sept. 21, 2022, 1:56 p.m. UTC
  IOVA mode in DPDK is either PA or VA. The new build option iova_as_va
configures the mode to VA at compile time and prevents setting it to PA
at runtime. For now, all drivers which are not always enabled are
disabled with this option. A supported driver can set the flag
pmd_iova_as_va in its build file to enable the build.

The mbuf structure holds the physical (PA) and virtual address (VA) of a
buffer. If IOVA mode is set to VA, PA is redundant as it is the same as
VA. So the PA field need not be updated and is marked invalid if the
build is configured to use only VA.

Signed-off-by: Shijith Thotton <sthotton@marvell.com>
---
 app/test-bbdev/test_bbdev_perf.c         |  2 +-
 app/test-crypto-perf/cperf_test_common.c |  5 +--
 app/test/test_bpf.c                      |  2 +-
 app/test/test_dmadev.c                   | 33 ++++++---------
 app/test/test_mbuf.c                     | 12 +++---
 app/test/test_pcapng.c                   |  2 +-
 config/meson.build                       |  1 +
 drivers/meson.build                      |  6 +++
 lib/eal/linux/eal.c                      |  7 +++
 lib/mbuf/rte_mbuf.c                      |  8 ++--
 lib/mbuf/rte_mbuf.h                      | 17 +++++---
 lib/mbuf/rte_mbuf_core.h                 | 10 +++++
 lib/vhost/vhost.h                        |  2 +-
 lib/vhost/vhost_crypto.c                 | 54 ++++++++++++++++++------
 meson_options.txt                        |  2 +
 15 files changed, 109 insertions(+), 54 deletions(-)
  

Comments

Olivier Matz Sept. 28, 2022, 12:52 p.m. UTC | #1
On Wed, Sep 21, 2022 at 07:26:17PM +0530, Shijith Thotton wrote:
> IOVA mode in DPDK is either PA or VA. The new build option iova_as_va
> configures the mode to VA at compile time and prevents setting it to PA
> at runtime. For now, all drivers which are not always enabled are
> disabled with this option. Supported driver can set the flag
> pmd_iova_as_va in its build file to enable build.
> 
> mbuf structure holds the physical (PA) and virtual address (VA) of a
> buffer. if IOVA mode is set to VA, PA is redundant as it is the same as
> VA. So PA field need not be updated and marked invalid if the build is
> configured to use only VA.
> 
> Signed-off-by: Shijith Thotton <sthotton@marvell.com>
> ---
>  app/test-bbdev/test_bbdev_perf.c         |  2 +-
>  app/test-crypto-perf/cperf_test_common.c |  5 +--
>  app/test/test_bpf.c                      |  2 +-
>  app/test/test_dmadev.c                   | 33 ++++++---------
>  app/test/test_mbuf.c                     | 12 +++---
>  app/test/test_pcapng.c                   |  2 +-
>  config/meson.build                       |  1 +
>  drivers/meson.build                      |  6 +++
>  lib/eal/linux/eal.c                      |  7 +++
>  lib/mbuf/rte_mbuf.c                      |  8 ++--
>  lib/mbuf/rte_mbuf.h                      | 17 +++++---
>  lib/mbuf/rte_mbuf_core.h                 | 10 +++++
>  lib/vhost/vhost.h                        |  2 +-
>  lib/vhost/vhost_crypto.c                 | 54 ++++++++++++++++++------
>  meson_options.txt                        |  2 +
>  15 files changed, 109 insertions(+), 54 deletions(-)
> 
> diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
> index 8fab52d821..f6aa25b67d 100644
> --- a/app/test-bbdev/test_bbdev_perf.c
> +++ b/app/test-bbdev/test_bbdev_perf.c
> @@ -1001,7 +1001,7 @@ init_op_data_objs(struct rte_bbdev_op_data *bufs,
>  					seg->length);
>  				memcpy(data, seg->addr, seg->length);
>  				m_head->buf_addr = data;
> -				m_head->buf_iova = rte_malloc_virt2iova(data);
> +				rte_mbuf_iova_set(m_head, rte_malloc_virt2iova(data));

Wouldn't it be better to have a preliminary patch that replaces direct
accesses to m->buf_iova by rte_mbuf_iova_*() functions in app and libs?
This would make this commit smaller to read.

If I understand properly, the drivers/ part has to be done at the same time
as setting "pmd_iova_as_va" in the meson config.

>  				m_head->data_off = 0;
>  				m_head->data_len = seg->length;
>  			} else {
> diff --git a/app/test-crypto-perf/cperf_test_common.c b/app/test-crypto-perf/cperf_test_common.c
> index 00aadc9a47..27646cd619 100644
> --- a/app/test-crypto-perf/cperf_test_common.c
> +++ b/app/test-crypto-perf/cperf_test_common.c
> @@ -26,8 +26,7 @@ fill_single_seg_mbuf(struct rte_mbuf *m, struct rte_mempool *mp,
>  	/* start of buffer is after mbuf structure and priv data */
>  	m->priv_size = 0;
>  	m->buf_addr = (char *)m + mbuf_hdr_size;
> -	m->buf_iova = rte_mempool_virt2iova(obj) +
> -		mbuf_offset + mbuf_hdr_size;
> +	rte_mbuf_iova_set(m, rte_mempool_virt2iova(obj) + mbuf_offset + mbuf_hdr_size);
>  	m->buf_len = segment_sz;
>  	m->data_len = data_len;
>  	m->pkt_len = data_len;
> @@ -58,7 +57,7 @@ fill_multi_seg_mbuf(struct rte_mbuf *m, struct rte_mempool *mp,
>  		/* start of buffer is after mbuf structure and priv data */
>  		m->priv_size = 0;
>  		m->buf_addr = (char *)m + mbuf_hdr_size;
> -		m->buf_iova = next_seg_phys_addr;
> +		rte_mbuf_iova_set(m, next_seg_phys_addr);
>  		next_seg_phys_addr += mbuf_hdr_size + segment_sz;
>  		m->buf_len = segment_sz;
>  		m->data_len = data_len;
> diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
> index 97f500809e..f5af5e8a3f 100644
> --- a/app/test/test_bpf.c
> +++ b/app/test/test_bpf.c
> @@ -2600,7 +2600,7 @@ dummy_mbuf_prep(struct rte_mbuf *mb, uint8_t buf[], uint32_t buf_len,
>  	uint8_t *db;
>  
>  	mb->buf_addr = buf;
> -	mb->buf_iova = (uintptr_t)buf;
> +	rte_mbuf_iova_set(mb, (uintptr_t)buf);
>  	mb->buf_len = buf_len;
>  	rte_mbuf_refcnt_set(mb, 1);
>  
> diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
> index 9e8e101f40..8306947eda 100644
> --- a/app/test/test_dmadev.c
> +++ b/app/test/test_dmadev.c
> @@ -110,8 +110,8 @@ do_multi_copies(int16_t dev_id, uint16_t vchan,
>  		for (j = 0; j < COPY_LEN/sizeof(uint64_t); j++)
>  			src_data[j] = rte_rand();
>  
> -		if (rte_dma_copy(dev_id, vchan, srcs[i]->buf_iova + srcs[i]->data_off,
> -				dsts[i]->buf_iova + dsts[i]->data_off, COPY_LEN, 0) != id_count++)
> +		if (rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova_offset(srcs[i], 0),
> +				 rte_pktmbuf_iova_offset(dsts[i], 0), COPY_LEN, 0) != id_count++)
>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", i);
>  	}
>  	rte_dma_submit(dev_id, vchan);
> @@ -317,9 +317,8 @@ test_failure_in_full_burst(int16_t dev_id, uint16_t vchan, bool fence,
>  	rte_dma_stats_get(dev_id, vchan, &baseline); /* get a baseline set of stats */
>  	for (i = 0; i < COMP_BURST_SZ; i++) {
>  		int id = rte_dma_copy(dev_id, vchan,
> -				(i == fail_idx ? 0 : (srcs[i]->buf_iova + srcs[i]->data_off)),
> -				dsts[i]->buf_iova + dsts[i]->data_off,
> -				COPY_LEN, OPT_FENCE(i));
> +				      (i == fail_idx ? 0 : rte_pktmbuf_iova_offset(srcs[i], 0)),
> +				      rte_pktmbuf_iova_offset(dsts[i], 0), COPY_LEN, OPT_FENCE(i));
>  		if (id < 0)
>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", i);
>  		if (i == fail_idx)
> @@ -407,9 +406,8 @@ test_individual_status_query_with_failure(int16_t dev_id, uint16_t vchan, bool f
>  
>  	for (j = 0; j < COMP_BURST_SZ; j++) {
>  		int id = rte_dma_copy(dev_id, vchan,
> -				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
> -				dsts[j]->buf_iova + dsts[j]->data_off,
> -				COPY_LEN, OPT_FENCE(j));
> +				      (j == fail_idx ? 0 : rte_pktmbuf_iova_offset(srcs[j], 0)),
> +				      rte_pktmbuf_iova_offset(dsts[j], 0), COPY_LEN, OPT_FENCE(j));
>  		if (id < 0)
>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
>  		if (j == fail_idx)
> @@ -470,9 +468,8 @@ test_single_item_status_query_with_failure(int16_t dev_id, uint16_t vchan,
>  
>  	for (j = 0; j < COMP_BURST_SZ; j++) {
>  		int id = rte_dma_copy(dev_id, vchan,
> -				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
> -				dsts[j]->buf_iova + dsts[j]->data_off,
> -				COPY_LEN, 0);
> +				      (j == fail_idx ? 0 : rte_pktmbuf_iova_offset(srcs[j], 0)),
> +				      rte_pktmbuf_iova_offset(dsts[j], 0), COPY_LEN, 0);
>  		if (id < 0)
>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
>  		if (j == fail_idx)
> @@ -529,15 +526,14 @@ test_multi_failure(int16_t dev_id, uint16_t vchan, struct rte_mbuf **srcs, struc
>  
>  	/* enqueue and gather completions in one go */
>  	for (j = 0; j < COMP_BURST_SZ; j++) {
> -		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
> +		uintptr_t src = rte_pktmbuf_iova_offset(srcs[j], 0);
>  		/* set up for failure if the current index is anywhere is the fails array */
>  		for (i = 0; i < num_fail; i++)
>  			if (j == fail[i])
>  				src = 0;
>  
> -		int id = rte_dma_copy(dev_id, vchan,
> -				src, dsts[j]->buf_iova + dsts[j]->data_off,
> -				COPY_LEN, 0);
> +		int id = rte_dma_copy(dev_id, vchan, src, rte_pktmbuf_iova_offset(dsts[j], 0),
> +				      COPY_LEN, 0);
>  		if (id < 0)
>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
>  	}
> @@ -565,15 +561,14 @@ test_multi_failure(int16_t dev_id, uint16_t vchan, struct rte_mbuf **srcs, struc
>  
>  	/* enqueue and gather completions in bursts, but getting errors one at a time */
>  	for (j = 0; j < COMP_BURST_SZ; j++) {
> -		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
> +		uintptr_t src = rte_pktmbuf_iova_offset(srcs[j], 0);
>  		/* set up for failure if the current index is anywhere is the fails array */
>  		for (i = 0; i < num_fail; i++)
>  			if (j == fail[i])
>  				src = 0;
>  
> -		int id = rte_dma_copy(dev_id, vchan,
> -				src, dsts[j]->buf_iova + dsts[j]->data_off,
> -				COPY_LEN, 0);
> +		int id = rte_dma_copy(dev_id, vchan, src, rte_pktmbuf_iova_offset(dsts[j], 0),
> +				      COPY_LEN, 0);
>  		if (id < 0)
>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
>  	}
> diff --git a/app/test/test_mbuf.c b/app/test/test_mbuf.c
> index e09b2549ca..45431f2c9c 100644
> --- a/app/test/test_mbuf.c
> +++ b/app/test/test_mbuf.c
> @@ -1232,11 +1232,13 @@ test_failing_mbuf_sanity_check(struct rte_mempool *pktmbuf_pool)
>  		return -1;
>  	}
>  
> -	badbuf = *buf;
> -	badbuf.buf_iova = 0;
> -	if (verify_mbuf_check_panics(&badbuf)) {
> -		printf("Error with bad-physaddr mbuf test\n");
> -		return -1;
> +	if (!RTE_IOVA_AS_VA) {
> +		badbuf = *buf;
> +		rte_mbuf_iova_set(&badbuf, 0);
> +		if (verify_mbuf_check_panics(&badbuf)) {
> +			printf("Error with bad-physaddr mbuf test\n");
> +			return -1;
> +		}
>  	}
>  
>  	badbuf = *buf;
> diff --git a/app/test/test_pcapng.c b/app/test/test_pcapng.c
> index 320dacea34..abbf00f6da 100644
> --- a/app/test/test_pcapng.c
> +++ b/app/test/test_pcapng.c
> @@ -40,7 +40,7 @@ dummy_mbuf_prep(struct rte_mbuf *mb, uint8_t buf[], uint32_t buf_len,
>  	uint8_t *db;
>  
>  	mb->buf_addr = buf;
> -	mb->buf_iova = (uintptr_t)buf;
> +	rte_mbuf_iova_set(mb, (uintptr_t)buf);
>  	mb->buf_len = buf_len;
>  	rte_mbuf_refcnt_set(mb, 1);
>  
> diff --git a/config/meson.build b/config/meson.build
> index 7f7b6c92fd..6b6c3e7eb6 100644
> --- a/config/meson.build
> +++ b/config/meson.build
> @@ -309,6 +309,7 @@ endif
>  if get_option('mbuf_refcnt_atomic')
>      dpdk_conf.set('RTE_MBUF_REFCNT_ATOMIC', true)
>  endif
> +dpdk_conf.set10('RTE_IOVA_AS_VA', get_option('iova_as_va'))
>  
>  compile_time_cpuflags = []
>  subdir(arch_subdir)
> diff --git a/drivers/meson.build b/drivers/meson.build
> index 376a64f4da..989770cffd 100644
> --- a/drivers/meson.build
> +++ b/drivers/meson.build
> @@ -105,6 +105,7 @@ foreach subpath:subdirs
>          ext_deps = []
>          pkgconfig_extra_libs = []
>          testpmd_sources = []
> +        pmd_iova_as_va = false

This option should be documented, however I don't know where is the proper
place. A comment here would be a good start I think.

I'm trying to find a more explicit name, but it's not easy.
What do you think about pmd_supports_disable_iova_as_pa?

Being explicit is always better: it could prevent someone adding a new
driver from blindly copying the flag from a template driver.

>  
>          if not enable_drivers.contains(drv_path)
>              build = false
> @@ -122,6 +123,11 @@ foreach subpath:subdirs
>              # pull in driver directory which should update all the local variables
>              subdir(drv_path)
>  
> +            if dpdk_conf.get('RTE_IOVA_AS_VA') == 1 and not pmd_iova_as_va and not always_enable.contains(drv_path)
> +                build = false
> +                reason = 'driver does not support IOVA as VA mode'
> +            endif
> +
>              # get dependency objs from strings
>              shared_deps = ext_deps
>              static_deps = ext_deps
> diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
> index 37d29643a5..b70c4dcc5f 100644
> --- a/lib/eal/linux/eal.c
> +++ b/lib/eal/linux/eal.c
> @@ -1127,6 +1127,13 @@ rte_eal_init(int argc, char **argv)
>  		return -1;
>  	}
>  
> +	if (rte_eal_iova_mode() == RTE_IOVA_PA && RTE_IOVA_AS_VA) {
> +		rte_eal_init_alert(
> +			"Cannot use IOVA as 'PA' since build is configured to use only 'VA'");
> +		rte_errno = EINVAL;
> +		return -1;
> +	}
> +
>  	RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
>  		rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
>  
> diff --git a/lib/mbuf/rte_mbuf.c b/lib/mbuf/rte_mbuf.c
> index a2307cebe6..5af290c53a 100644
> --- a/lib/mbuf/rte_mbuf.c
> +++ b/lib/mbuf/rte_mbuf.c
> @@ -89,7 +89,7 @@ rte_pktmbuf_init(struct rte_mempool *mp,
>  	/* start of buffer is after mbuf structure and priv data */
>  	m->priv_size = priv_size;
>  	m->buf_addr = (char *)m + mbuf_size;
> -	m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
> +	rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
>  	m->buf_len = (uint16_t)buf_len;
>  
>  	/* keep some headroom between start of buffer and data */
> @@ -187,8 +187,8 @@ __rte_pktmbuf_init_extmem(struct rte_mempool *mp,
>  	RTE_ASSERT(ctx->off + ext_mem->elt_size <= ext_mem->buf_len);
>  
>  	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
> -	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
> -		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
> +	rte_mbuf_iova_set(m, ext_mem->buf_iova == RTE_BAD_IOVA ? RTE_BAD_IOVA :
> +								 (ext_mem->buf_iova + ctx->off));
>  
>  	ctx->off += ext_mem->elt_size;
>  	if (ctx->off + ext_mem->elt_size > ext_mem->buf_len) {
> @@ -388,7 +388,7 @@ int rte_mbuf_check(const struct rte_mbuf *m, int is_header,
>  		*reason = "bad mbuf pool";
>  		return -1;
>  	}
> -	if (m->buf_iova == 0) {
> +	if (m->buf_iova == 0 && !RTE_IOVA_AS_VA) {
>  		*reason = "bad IO addr";
>  		return -1;
>  	}
> diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
> index 9811e8c760..05be146bc2 100644
> --- a/lib/mbuf/rte_mbuf.h
> +++ b/lib/mbuf/rte_mbuf.h
> @@ -146,7 +146,7 @@ static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
>  static inline rte_iova_t
>  rte_mbuf_data_iova(const struct rte_mbuf *mb)
>  {
> -	return mb->buf_iova + mb->data_off;
> +	return (RTE_IOVA_AS_VA ? (uint64_t)mb->buf_addr : mb->buf_iova) + mb->data_off;

nit: cast should be rte_iova_t instead of uint64_t

>  }
>  
>  /**
> @@ -164,7 +164,7 @@ rte_mbuf_data_iova(const struct rte_mbuf *mb)
>  static inline rte_iova_t
>  rte_mbuf_data_iova_default(const struct rte_mbuf *mb)
>  {
> -	return mb->buf_iova + RTE_PKTMBUF_HEADROOM;
> +	return (RTE_IOVA_AS_VA ? (uint64_t)mb->buf_addr : mb->buf_iova) + RTE_PKTMBUF_HEADROOM;
>  }

same here

>  
>  /**
> @@ -469,6 +469,13 @@ rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
>  				 __ATOMIC_ACQ_REL);
>  }
>  
> +static inline void
> +rte_mbuf_iova_set(struct rte_mbuf *m, rte_iova_t iova)
> +{
> +	if (!RTE_IOVA_AS_VA)
> +		m->buf_iova = iova;
> +}
> +
>  /** Mbuf prefetch */
>  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
>  	if ((m) != NULL)                        \
> @@ -1056,7 +1063,7 @@ rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
>  	RTE_ASSERT(shinfo->free_cb != NULL);
>  
>  	m->buf_addr = buf_addr;
> -	m->buf_iova = buf_iova;
> +	rte_mbuf_iova_set(m, buf_iova);
>  	m->buf_len = buf_len;
>  
>  	m->data_len = 0;
> @@ -1143,7 +1150,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  
>  	mi->data_off = m->data_off;
>  	mi->data_len = m->data_len;
> -	mi->buf_iova = m->buf_iova;
> +	rte_mbuf_iova_set(mi, m->buf_iova);
>  	mi->buf_addr = m->buf_addr;
>  	mi->buf_len = m->buf_len;
>  
> @@ -1245,7 +1252,7 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>  
>  	m->priv_size = priv_size;
>  	m->buf_addr = (char *)m + mbuf_size;
> -	m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
> +	rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
>  	m->buf_len = (uint16_t)buf_len;
>  	rte_pktmbuf_reset_headroom(m);
>  	m->data_len = 0;
> diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> index 3d6ddd6773..c6292e7252 100644
> --- a/lib/mbuf/rte_mbuf_core.h
> +++ b/lib/mbuf/rte_mbuf_core.h
> @@ -581,6 +581,8 @@ struct rte_mbuf {
>  	void *buf_addr;           /**< Virtual address of segment buffer. */
>  	/**
>  	 * Physical address of segment buffer.
> +	 * This field is invalid if the build is configured to use only
> +	 * virtual address as IOVA (i.e. RTE_IOVA_AS_VA is 1).
>  	 * Force alignment to 8-bytes, so as to ensure we have the exact
>  	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
>  	 * working on vector drivers easier.

If the field is invalid, can't we add an #if condition ? I mean:

#if !RTE_IOVA_AS_VA
        rte_iova_t buf_iova;
#else
        uint64_t dummy;
#endif

I think it is preferable, because it would ensure that we never use
buf_iova when RTE_IOVA_AS_VA is set (especially useful when compiling
out-of-tree drivers).

This would certainly require to change some of the static inline
functions to use #if instead of if(), but I think it's worth the effort.


> @@ -848,8 +850,12 @@ struct rte_mbuf_ext_shared_info {
>   * @param o
>   *   The offset into the data to calculate address from.
>   */
> +#if RTE_IOVA_AS_VA
> +#define rte_pktmbuf_iova_offset(m, o) rte_pktmbuf_mtod_offset(m, rte_iova_t, o)
> +#else
>  #define rte_pktmbuf_iova_offset(m, o) \
>  	(rte_iova_t)((m)->buf_iova + (m)->data_off + (o))
> +#endif
>  
>  /**
>   * A macro that returns the IO address that points to the start of the
> @@ -858,7 +864,11 @@ struct rte_mbuf_ext_shared_info {
>   * @param m
>   *   The packet mbuf.
>   */
> +#if RTE_IOVA_AS_VA
> +#define rte_pktmbuf_iova(m) rte_pktmbuf_mtod(m, rte_iova_t)
> +#else
>  #define rte_pktmbuf_iova(m) rte_pktmbuf_iova_offset(m, 0)
> +#endif
>  
>  #ifdef __cplusplus
>  }
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
> index 782d916ae0..05cde6e118 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -967,7 +967,7 @@ restore_mbuf(struct rte_mbuf *m)
>  		/* start of buffer is after mbuf structure and priv data */
>  
>  		m->buf_addr = (char *)m + mbuf_size;
> -		m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
> +		rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
>  		m = m->next;
>  	}
>  }
> diff --git a/lib/vhost/vhost_crypto.c b/lib/vhost/vhost_crypto.c
> index 54946f46d9..7b50735796 100644
> --- a/lib/vhost/vhost_crypto.c
> +++ b/lib/vhost/vhost_crypto.c
> @@ -823,11 +823,17 @@ prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
>  	switch (vcrypto->option) {
>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
>  		m_src->data_len = cipher->para.src_data_len;
> -		m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
> -				cipher->para.src_data_len);
> +		if (!RTE_IOVA_AS_VA) {
> +			m_src->buf_iova =
> +				gpa_to_hpa(vcrypto->dev, desc->addr, cipher->para.src_data_len);
> +			if (unlikely(m_src->buf_iova == 0)) {
> +				VC_LOG_ERR("zero_copy may fail due to cross page data");
> +				ret = VIRTIO_CRYPTO_ERR;
> +				goto error_exit;
> +			}
> +		}
>  		m_src->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
> -		if (unlikely(m_src->buf_iova == 0 ||
> -				m_src->buf_addr == NULL)) {
> +		if (unlikely(m_src->buf_addr == NULL)) {
>  			VC_LOG_ERR("zero_copy may fail due to cross page data");
>  			ret = VIRTIO_CRYPTO_ERR;
>  			goto error_exit;
> @@ -867,10 +873,17 @@ prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
>  
>  	switch (vcrypto->option) {
>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
> -		m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
> -				desc->addr, cipher->para.dst_data_len);
> +		if (!RTE_IOVA_AS_VA) {
> +			m_dst->buf_iova =
> +				gpa_to_hpa(vcrypto->dev, desc->addr, cipher->para.dst_data_len);
> +			if (unlikely(m_dst->buf_iova == 0)) {
> +				VC_LOG_ERR("zero_copy may fail due to cross page data");
> +				ret = VIRTIO_CRYPTO_ERR;
> +				goto error_exit;
> +			}
> +		}
>  		m_dst->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RW);
> -		if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
> +		if (unlikely(m_dst->buf_addr == NULL)) {
>  			VC_LOG_ERR("zero_copy may fail due to cross page data");
>  			ret = VIRTIO_CRYPTO_ERR;
>  			goto error_exit;
> @@ -980,11 +993,17 @@ prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
>  		m_src->data_len = chain->para.src_data_len;
>  		m_dst->data_len = chain->para.dst_data_len;
> -
> -		m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
> -				chain->para.src_data_len);
> +		if (!RTE_IOVA_AS_VA) {
> +			m_src->buf_iova =
> +				gpa_to_hpa(vcrypto->dev, desc->addr, chain->para.src_data_len);
> +			if (unlikely(m_src->buf_iova == 0)) {
> +				VC_LOG_ERR("zero_copy may fail due to cross page data");
> +				ret = VIRTIO_CRYPTO_ERR;
> +				goto error_exit;
> +			}
> +		}
>  		m_src->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
> -		if (unlikely(m_src->buf_iova == 0 || m_src->buf_addr == NULL)) {
> +		if (unlikely(m_src->buf_addr == NULL)) {
>  			VC_LOG_ERR("zero_copy may fail due to cross page data");
>  			ret = VIRTIO_CRYPTO_ERR;
>  			goto error_exit;
> @@ -1024,10 +1043,17 @@ prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
>  
>  	switch (vcrypto->option) {
>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
> -		m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
> -				desc->addr, chain->para.dst_data_len);
> +		if (!RTE_IOVA_AS_VA) {
> +			m_dst->buf_iova =
> +				gpa_to_hpa(vcrypto->dev, desc->addr, chain->para.dst_data_len);
> +			if (unlikely(m_dst->buf_iova == 0)) {
> +				VC_LOG_ERR("zero_copy may fail due to cross page data");
> +				ret = VIRTIO_CRYPTO_ERR;
> +				goto error_exit;
> +			}
> +		}
>  		m_dst->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RW);
> -		if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
> +		if (unlikely(m_dst->buf_addr == NULL)) {
>  			VC_LOG_ERR("zero_copy may fail due to cross page data");
>  			ret = VIRTIO_CRYPTO_ERR;
>  			goto error_exit;
> diff --git a/meson_options.txt b/meson_options.txt
> index 7c220ad68d..f0fa6cf04c 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -44,6 +44,8 @@ option('platform', type: 'string', value: 'native', description:
>         'Platform to build, either "native", "generic" or a SoC. Please refer to the Linux build guide for more information.')
>  option('enable_trace_fp', type: 'boolean', value: false, description:
>         'enable fast path trace points.')
> +option('iova_as_va', type: 'boolean', value: false, description:
> +       'Build which only supports IOVA as VA mode. Unsupported drivers are disabled.')

I wonder if we can find a better name for the option. Currently, it is a bit
confusing to me, because iova_as_va=false does not mean that iova_as_va is
disabled.

What about iova_as_pa=true|false, or enable_iova_as_pa=true|false, or
disable_iova_as_pa=true|false?

The help string is maybe easier to find, something like
"Enable or disable support for IOVA as PA mode."

We can also explain that enabling this option removes the buf_iova field from
the mbuf.

>  option('tests', type: 'boolean', value: true, description:
>         'build unit tests')
>  option('use_hpet', type: 'boolean', value: false, description:
> -- 
> 2.25.1
>
  
Shijith Thotton Sept. 29, 2022, 5:48 a.m. UTC | #2
>> IOVA mode in DPDK is either PA or VA. The new build option iova_as_va
>> configures the mode to VA at compile time and prevents setting it to PA
>> at runtime. For now, all drivers which are not always enabled are
>> disabled with this option. Supported driver can set the flag
>> pmd_iova_as_va in its build file to enable build.
>>
>> mbuf structure holds the physical (PA) and virtual address (VA) of a
>> buffer. if IOVA mode is set to VA, PA is redundant as it is the same as
>> VA. So PA field need not be updated and marked invalid if the build is
>> configured to use only VA.
>>
>> Signed-off-by: Shijith Thotton <sthotton@marvell.com>
>> ---
>>  app/test-bbdev/test_bbdev_perf.c         |  2 +-
>>  app/test-crypto-perf/cperf_test_common.c |  5 +--
>>  app/test/test_bpf.c                      |  2 +-
>>  app/test/test_dmadev.c                   | 33 ++++++---------
>>  app/test/test_mbuf.c                     | 12 +++---
>>  app/test/test_pcapng.c                   |  2 +-
>>  config/meson.build                       |  1 +
>>  drivers/meson.build                      |  6 +++
>>  lib/eal/linux/eal.c                      |  7 +++
>>  lib/mbuf/rte_mbuf.c                      |  8 ++--
>>  lib/mbuf/rte_mbuf.h                      | 17 +++++---
>>  lib/mbuf/rte_mbuf_core.h                 | 10 +++++
>>  lib/vhost/vhost.h                        |  2 +-
>>  lib/vhost/vhost_crypto.c                 | 54 ++++++++++++++++++------
>>  meson_options.txt                        |  2 +
>>  15 files changed, 109 insertions(+), 54 deletions(-)
>>
>> diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-
>bbdev/test_bbdev_perf.c
>> index 8fab52d821..f6aa25b67d 100644
>> --- a/app/test-bbdev/test_bbdev_perf.c
>> +++ b/app/test-bbdev/test_bbdev_perf.c
>> @@ -1001,7 +1001,7 @@ init_op_data_objs(struct rte_bbdev_op_data *bufs,
>>  					seg->length);
>>  				memcpy(data, seg->addr, seg->length);
>>  				m_head->buf_addr = data;
>> -				m_head->buf_iova = rte_malloc_virt2iova(data);
>> +				rte_mbuf_iova_set(m_head,
>rte_malloc_virt2iova(data));
>
>Wouldn't it be better to have a preliminary patch that replaces direct
>accesses to m->buf_iova by rte_mbuf_iova_*() functions in app and libs?
>This would make this commit smaller to read.
 
Yes. I will add this change in v4.

>
>If I understand properly, the drivers/ part has to be done at the same time
>than setting "pmd_iova_as_va" in the meson config.
>

This approach was taken as per previous discussions. Also, removing buf_iova
from a PMD would require proper testing and performance checks. The current
approach would give ample time for this.

>>  				m_head->data_off = 0;
>>  				m_head->data_len = seg->length;
>>  			} else {
>> diff --git a/app/test-crypto-perf/cperf_test_common.c b/app/test-crypto-
>perf/cperf_test_common.c
>> index 00aadc9a47..27646cd619 100644
>> --- a/app/test-crypto-perf/cperf_test_common.c
>> +++ b/app/test-crypto-perf/cperf_test_common.c
>> @@ -26,8 +26,7 @@ fill_single_seg_mbuf(struct rte_mbuf *m, struct
>rte_mempool *mp,
>>  	/* start of buffer is after mbuf structure and priv data */
>>  	m->priv_size = 0;
>>  	m->buf_addr = (char *)m + mbuf_hdr_size;
>> -	m->buf_iova = rte_mempool_virt2iova(obj) +
>> -		mbuf_offset + mbuf_hdr_size;
>> +	rte_mbuf_iova_set(m, rte_mempool_virt2iova(obj) + mbuf_offset +
>mbuf_hdr_size);
>>  	m->buf_len = segment_sz;
>>  	m->data_len = data_len;
>>  	m->pkt_len = data_len;
>> @@ -58,7 +57,7 @@ fill_multi_seg_mbuf(struct rte_mbuf *m, struct
>rte_mempool *mp,
>>  		/* start of buffer is after mbuf structure and priv data */
>>  		m->priv_size = 0;
>>  		m->buf_addr = (char *)m + mbuf_hdr_size;
>> -		m->buf_iova = next_seg_phys_addr;
>> +		rte_mbuf_iova_set(m, next_seg_phys_addr);
>>  		next_seg_phys_addr += mbuf_hdr_size + segment_sz;
>>  		m->buf_len = segment_sz;
>>  		m->data_len = data_len;
>> diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
>> index 97f500809e..f5af5e8a3f 100644
>> --- a/app/test/test_bpf.c
>> +++ b/app/test/test_bpf.c
>> @@ -2600,7 +2600,7 @@ dummy_mbuf_prep(struct rte_mbuf *mb, uint8_t
>buf[], uint32_t buf_len,
>>  	uint8_t *db;
>>
>>  	mb->buf_addr = buf;
>> -	mb->buf_iova = (uintptr_t)buf;
>> +	rte_mbuf_iova_set(mb, (uintptr_t)buf);
>>  	mb->buf_len = buf_len;
>>  	rte_mbuf_refcnt_set(mb, 1);
>>
>> diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
>> index 9e8e101f40..8306947eda 100644
>> --- a/app/test/test_dmadev.c
>> +++ b/app/test/test_dmadev.c
>> @@ -110,8 +110,8 @@ do_multi_copies(int16_t dev_id, uint16_t vchan,
>>  		for (j = 0; j < COPY_LEN/sizeof(uint64_t); j++)
>>  			src_data[j] = rte_rand();
>>
>> -		if (rte_dma_copy(dev_id, vchan, srcs[i]->buf_iova + srcs[i]-
>>data_off,
>> -				dsts[i]->buf_iova + dsts[i]->data_off, COPY_LEN,
>0) != id_count++)
>> +		if (rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova_offset(srcs[i],
>0),
>> +				 rte_pktmbuf_iova_offset(dsts[i], 0), COPY_LEN,
>0) != id_count++)
>>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n",
>i);
>>  	}
>>  	rte_dma_submit(dev_id, vchan);
>> @@ -317,9 +317,8 @@ test_failure_in_full_burst(int16_t dev_id, uint16_t
>vchan, bool fence,
>>  	rte_dma_stats_get(dev_id, vchan, &baseline); /* get a baseline set of
>stats */
>>  	for (i = 0; i < COMP_BURST_SZ; i++) {
>>  		int id = rte_dma_copy(dev_id, vchan,
>> -				(i == fail_idx ? 0 : (srcs[i]->buf_iova + srcs[i]-
>>data_off)),
>> -				dsts[i]->buf_iova + dsts[i]->data_off,
>> -				COPY_LEN, OPT_FENCE(i));
>> +				      (i == fail_idx ? 0 :
>rte_pktmbuf_iova_offset(srcs[i], 0)),
>> +				      rte_pktmbuf_iova_offset(dsts[i], 0),
>COPY_LEN, OPT_FENCE(i));
>>  		if (id < 0)
>>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n",
>i);
>>  		if (i == fail_idx)
>> @@ -407,9 +406,8 @@ test_individual_status_query_with_failure(int16_t
>dev_id, uint16_t vchan, bool f
>>
>>  	for (j = 0; j < COMP_BURST_SZ; j++) {
>>  		int id = rte_dma_copy(dev_id, vchan,
>> -				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]-
>>data_off)),
>> -				dsts[j]->buf_iova + dsts[j]->data_off,
>> -				COPY_LEN, OPT_FENCE(j));
>> +				      (j == fail_idx ? 0 :
>rte_pktmbuf_iova_offset(srcs[j], 0)),
>> +				      rte_pktmbuf_iova_offset(dsts[j], 0),
>COPY_LEN, OPT_FENCE(j));
>>  		if (id < 0)
>>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n",
>j);
>>  		if (j == fail_idx)
>> @@ -470,9 +468,8 @@ test_single_item_status_query_with_failure(int16_t
>dev_id, uint16_t vchan,
>>
>>  	for (j = 0; j < COMP_BURST_SZ; j++) {
>>  		int id = rte_dma_copy(dev_id, vchan,
>> -				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]-
>>data_off)),
>> -				dsts[j]->buf_iova + dsts[j]->data_off,
>> -				COPY_LEN, 0);
>> +				      (j == fail_idx ? 0 :
>rte_pktmbuf_iova_offset(srcs[j], 0)),
>> +				      rte_pktmbuf_iova_offset(dsts[j], 0),
>COPY_LEN, 0);
>>  		if (id < 0)
>>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n",
>j);
>>  		if (j == fail_idx)
>> @@ -529,15 +526,14 @@ test_multi_failure(int16_t dev_id, uint16_t vchan,
>struct rte_mbuf **srcs, struc
>>
>>  	/* enqueue and gather completions in one go */
>>  	for (j = 0; j < COMP_BURST_SZ; j++) {
>> -		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
>> +		uintptr_t src = rte_pktmbuf_iova_offset(srcs[j], 0);
>>  		/* set up for failure if the current index is anywhere is the fails
>array */
>>  		for (i = 0; i < num_fail; i++)
>>  			if (j == fail[i])
>>  				src = 0;
>>
>> -		int id = rte_dma_copy(dev_id, vchan,
>> -				src, dsts[j]->buf_iova + dsts[j]->data_off,
>> -				COPY_LEN, 0);
>> +		int id = rte_dma_copy(dev_id, vchan, src,
>rte_pktmbuf_iova_offset(dsts[j], 0),
>> +				      COPY_LEN, 0);
>>  		if (id < 0)
>>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n",
>j);
>>  	}
>> @@ -565,15 +561,14 @@ test_multi_failure(int16_t dev_id, uint16_t vchan,
>struct rte_mbuf **srcs, struc
>>
>>  	/* enqueue and gather completions in bursts, but getting errors one at a
>time */
>>  	for (j = 0; j < COMP_BURST_SZ; j++) {
>> -		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
>> +		uintptr_t src = rte_pktmbuf_iova_offset(srcs[j], 0);
>>  		/* set up for failure if the current index is anywhere is the fails
>array */
>>  		for (i = 0; i < num_fail; i++)
>>  			if (j == fail[i])
>>  				src = 0;
>>
>> -		int id = rte_dma_copy(dev_id, vchan,
>> -				src, dsts[j]->buf_iova + dsts[j]->data_off,
>> -				COPY_LEN, 0);
>> +		int id = rte_dma_copy(dev_id, vchan, src,
>rte_pktmbuf_iova_offset(dsts[j], 0),
>> +				      COPY_LEN, 0);
>>  		if (id < 0)
>>  			ERR_RETURN("Error with rte_dma_copy for buffer %u\n",
>j);
>>  	}
>> diff --git a/app/test/test_mbuf.c b/app/test/test_mbuf.c
>> index e09b2549ca..45431f2c9c 100644
>> --- a/app/test/test_mbuf.c
>> +++ b/app/test/test_mbuf.c
>> @@ -1232,11 +1232,13 @@ test_failing_mbuf_sanity_check(struct
>rte_mempool *pktmbuf_pool)
>>  		return -1;
>>  	}
>>
>> -	badbuf = *buf;
>> -	badbuf.buf_iova = 0;
>> -	if (verify_mbuf_check_panics(&badbuf)) {
>> -		printf("Error with bad-physaddr mbuf test\n");
>> -		return -1;
>> +	if (!RTE_IOVA_AS_VA) {
>> +		badbuf = *buf;
>> +		rte_mbuf_iova_set(&badbuf, 0);
>> +		if (verify_mbuf_check_panics(&badbuf)) {
>> +			printf("Error with bad-physaddr mbuf test\n");
>> +			return -1;
>> +		}
>>  	}
>>
>>  	badbuf = *buf;
>> diff --git a/app/test/test_pcapng.c b/app/test/test_pcapng.c
>> index 320dacea34..abbf00f6da 100644
>> --- a/app/test/test_pcapng.c
>> +++ b/app/test/test_pcapng.c
>> @@ -40,7 +40,7 @@ dummy_mbuf_prep(struct rte_mbuf *mb, uint8_t buf[],
>uint32_t buf_len,
>>  	uint8_t *db;
>>
>>  	mb->buf_addr = buf;
>> -	mb->buf_iova = (uintptr_t)buf;
>> +	rte_mbuf_iova_set(mb, (uintptr_t)buf);
>>  	mb->buf_len = buf_len;
>>  	rte_mbuf_refcnt_set(mb, 1);
>>
>> diff --git a/config/meson.build b/config/meson.build
>> index 7f7b6c92fd..6b6c3e7eb6 100644
>> --- a/config/meson.build
>> +++ b/config/meson.build
>> @@ -309,6 +309,7 @@ endif
>>  if get_option('mbuf_refcnt_atomic')
>>      dpdk_conf.set('RTE_MBUF_REFCNT_ATOMIC', true)
>>  endif
>> +dpdk_conf.set10('RTE_IOVA_AS_VA', get_option('iova_as_va'))
>>
>>  compile_time_cpuflags = []
>>  subdir(arch_subdir)
>> diff --git a/drivers/meson.build b/drivers/meson.build
>> index 376a64f4da..989770cffd 100644
>> --- a/drivers/meson.build
>> +++ b/drivers/meson.build
>> @@ -105,6 +105,7 @@ foreach subpath:subdirs
>>          ext_deps = []
>>          pkgconfig_extra_libs = []
>>          testpmd_sources = []
>> +        pmd_iova_as_va = false
>
>This option should be documented, however I don't know where is the proper
>place. A comment here would be a good start I think.
>
 
Will add.

>I'm trying to find a more explicit name, but it's not easy.
>What do you think about pmd_supports_disable_iova_as_pa?
>

Makes sense. I will change to pmd_supports_disable_iova_as_pa.

>Explicit is always better, it could avoid someone adding a new driver to
>blindly copy the flag from a template driver.
>
>>
>>          if not enable_drivers.contains(drv_path)
>>              build = false
>> @@ -122,6 +123,11 @@ foreach subpath:subdirs
>>              # pull in driver directory which should update all the local variables
>>              subdir(drv_path)
>>
>> +            if dpdk_conf.get('RTE_IOVA_AS_VA') == 1 and not pmd_iova_as_va and
>not always_enable.contains(drv_path)
>> +                build = false
>> +                reason = 'driver does not support IOVA as VA mode'
>> +            endif
>> +
>>              # get dependency objs from strings
>>              shared_deps = ext_deps
>>              static_deps = ext_deps
>> diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
>> index 37d29643a5..b70c4dcc5f 100644
>> --- a/lib/eal/linux/eal.c
>> +++ b/lib/eal/linux/eal.c
>> @@ -1127,6 +1127,13 @@ rte_eal_init(int argc, char **argv)
>>  		return -1;
>>  	}
>>
>> +	if (rte_eal_iova_mode() == RTE_IOVA_PA && RTE_IOVA_AS_VA) {
>> +		rte_eal_init_alert(
>> +			"Cannot use IOVA as 'PA' since build is configured to use
>only 'VA'");
>> +		rte_errno = EINVAL;
>> +		return -1;
>> +	}
>> +
>>  	RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
>>  		rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
>>
>> diff --git a/lib/mbuf/rte_mbuf.c b/lib/mbuf/rte_mbuf.c
>> index a2307cebe6..5af290c53a 100644
>> --- a/lib/mbuf/rte_mbuf.c
>> +++ b/lib/mbuf/rte_mbuf.c
>> @@ -89,7 +89,7 @@ rte_pktmbuf_init(struct rte_mempool *mp,
>>  	/* start of buffer is after mbuf structure and priv data */
>>  	m->priv_size = priv_size;
>>  	m->buf_addr = (char *)m + mbuf_size;
>> -	m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
>> +	rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
>>  	m->buf_len = (uint16_t)buf_len;
>>
>>  	/* keep some headroom between start of buffer and data */
>> @@ -187,8 +187,8 @@ __rte_pktmbuf_init_extmem(struct rte_mempool *mp,
>>  	RTE_ASSERT(ctx->off + ext_mem->elt_size <= ext_mem->buf_len);
>>
>>  	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
>> -	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
>> -		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
>> +	rte_mbuf_iova_set(m, ext_mem->buf_iova == RTE_BAD_IOVA ?
>RTE_BAD_IOVA :
>> +								 (ext_mem-
>>buf_iova + ctx->off));
>>
>>  	ctx->off += ext_mem->elt_size;
>>  	if (ctx->off + ext_mem->elt_size > ext_mem->buf_len) {
>> @@ -388,7 +388,7 @@ int rte_mbuf_check(const struct rte_mbuf *m, int
>is_header,
>>  		*reason = "bad mbuf pool";
>>  		return -1;
>>  	}
>> -	if (m->buf_iova == 0) {
>> +	if (m->buf_iova == 0 && !RTE_IOVA_AS_VA) {
>>  		*reason = "bad IO addr";
>>  		return -1;
>>  	}
>> diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
>> index 9811e8c760..05be146bc2 100644
>> --- a/lib/mbuf/rte_mbuf.h
>> +++ b/lib/mbuf/rte_mbuf.h
>> @@ -146,7 +146,7 @@ static inline uint16_t rte_pktmbuf_priv_size(struct
>rte_mempool *mp);
>>  static inline rte_iova_t
>>  rte_mbuf_data_iova(const struct rte_mbuf *mb)
>>  {
>> -	return mb->buf_iova + mb->data_off;
>> +	return (RTE_IOVA_AS_VA ? (uint64_t)mb->buf_addr : mb->buf_iova) +
>mb->data_off;
>
>nit: cast should be rte_iova_t instead of uint64_t
 
Will change.

>
>>  }
>>
>>  /**
>> @@ -164,7 +164,7 @@ rte_mbuf_data_iova(const struct rte_mbuf *mb)
>>  static inline rte_iova_t
>>  rte_mbuf_data_iova_default(const struct rte_mbuf *mb)
>>  {
>> -	return mb->buf_iova + RTE_PKTMBUF_HEADROOM;
>> +	return (RTE_IOVA_AS_VA ? (uint64_t)mb->buf_addr : mb->buf_iova) +
>RTE_PKTMBUF_HEADROOM;
>>  }
>
>same here
>

Will change.

>>
>>  /**
>> @@ -469,6 +469,13 @@ rte_mbuf_ext_refcnt_update(struct
>rte_mbuf_ext_shared_info *shinfo,
>>  				 __ATOMIC_ACQ_REL);
>>  }
>>
>> +static inline void
>> +rte_mbuf_iova_set(struct rte_mbuf *m, rte_iova_t iova)
>> +{
>> +	if (!RTE_IOVA_AS_VA)
>> +		m->buf_iova = iova;
>> +}
>> +
>>  /** Mbuf prefetch */
>>  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
>>  	if ((m) != NULL)                        \
>> @@ -1056,7 +1063,7 @@ rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void
>*buf_addr,
>>  	RTE_ASSERT(shinfo->free_cb != NULL);
>>
>>  	m->buf_addr = buf_addr;
>> -	m->buf_iova = buf_iova;
>> +	rte_mbuf_iova_set(m, buf_iova);
>>  	m->buf_len = buf_len;
>>
>>  	m->data_len = 0;
>> @@ -1143,7 +1150,7 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf
>*mi, struct rte_mbuf *m)
>>
>>  	mi->data_off = m->data_off;
>>  	mi->data_len = m->data_len;
>> -	mi->buf_iova = m->buf_iova;
>> +	rte_mbuf_iova_set(mi, m->buf_iova);
>>  	mi->buf_addr = m->buf_addr;
>>  	mi->buf_len = m->buf_len;
>>
>> @@ -1245,7 +1252,7 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf
>*m)
>>
>>  	m->priv_size = priv_size;
>>  	m->buf_addr = (char *)m + mbuf_size;
>> -	m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
>> +	rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
>>  	m->buf_len = (uint16_t)buf_len;
>>  	rte_pktmbuf_reset_headroom(m);
>>  	m->data_len = 0;
>> diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
>> index 3d6ddd6773..c6292e7252 100644
>> --- a/lib/mbuf/rte_mbuf_core.h
>> +++ b/lib/mbuf/rte_mbuf_core.h
>> @@ -581,6 +581,8 @@ struct rte_mbuf {
>>  	void *buf_addr;           /**< Virtual address of segment buffer. */
>>  	/**
>>  	 * Physical address of segment buffer.
>> +	 * This field is invalid if the build is configured to use only
>> +	 * virtual address as IOVA (i.e. RTE_IOVA_AS_VA is 1).
>>  	 * Force alignment to 8-bytes, so as to ensure we have the exact
>>  	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
>>  	 * working on vector drivers easier.
>
>If the field is invalid, can't we add an #if condition ? I mean:
>
>#if !RTE_IOVA_AS_VA
>        rte_iova_t buf_iova;
>#else
>        uint64_t dummy;
>#endif
>
>I think it is preferable, because it would ensure that we never use
>buf_iova when RTE_IOVA_AS_VA is set (especially useful when compiling
>out-of-tree drivers).
>
>This would certainly require to change some of the static inline
>functions to use #if instead of if(), but I think it's worth the effort.
>
>
 
Agree. I will change in v4.

>> @@ -848,8 +850,12 @@ struct rte_mbuf_ext_shared_info {
>>   * @param o
>>   *   The offset into the data to calculate address from.
>>   */
>> +#if RTE_IOVA_AS_VA
>> +#define rte_pktmbuf_iova_offset(m, o) rte_pktmbuf_mtod_offset(m,
>rte_iova_t, o)
>> +#else
>>  #define rte_pktmbuf_iova_offset(m, o) \
>>  	(rte_iova_t)((m)->buf_iova + (m)->data_off + (o))
>> +#endif
>>
>>  /**
>>   * A macro that returns the IO address that points to the start of the
>> @@ -858,7 +864,11 @@ struct rte_mbuf_ext_shared_info {
>>   * @param m
>>   *   The packet mbuf.
>>   */
>> +#if RTE_IOVA_AS_VA
>> +#define rte_pktmbuf_iova(m) rte_pktmbuf_mtod(m, rte_iova_t)
>> +#else
>>  #define rte_pktmbuf_iova(m) rte_pktmbuf_iova_offset(m, 0)
>> +#endif
>>
>>  #ifdef __cplusplus
>>  }
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
>> index 782d916ae0..05cde6e118 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -967,7 +967,7 @@ restore_mbuf(struct rte_mbuf *m)
>>  		/* start of buffer is after mbuf structure and priv data */
>>
>>  		m->buf_addr = (char *)m + mbuf_size;
>> -		m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
>> +		rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
>>  		m = m->next;
>>  	}
>>  }
>> diff --git a/lib/vhost/vhost_crypto.c b/lib/vhost/vhost_crypto.c
>> index 54946f46d9..7b50735796 100644
>> --- a/lib/vhost/vhost_crypto.c
>> +++ b/lib/vhost/vhost_crypto.c
>> @@ -823,11 +823,17 @@ prepare_sym_cipher_op(struct vhost_crypto
>*vcrypto, struct rte_crypto_op *op,
>>  	switch (vcrypto->option) {
>>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
>>  		m_src->data_len = cipher->para.src_data_len;
>> -		m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
>> -				cipher->para.src_data_len);
>> +		if (!RTE_IOVA_AS_VA) {
>> +			m_src->buf_iova =
>> +				gpa_to_hpa(vcrypto->dev, desc->addr, cipher-
>>para.src_data_len);
>> +			if (unlikely(m_src->buf_iova == 0)) {
>> +				VC_LOG_ERR("zero_copy may fail due to cross
>page data");
>> +				ret = VIRTIO_CRYPTO_ERR;
>> +				goto error_exit;
>> +			}
>> +		}
>>  		m_src->buf_addr = get_data_ptr(vc_req, desc,
>VHOST_ACCESS_RO);
>> -		if (unlikely(m_src->buf_iova == 0 ||
>> -				m_src->buf_addr == NULL)) {
>> +		if (unlikely(m_src->buf_addr == NULL)) {
>>  			VC_LOG_ERR("zero_copy may fail due to cross page
>data");
>>  			ret = VIRTIO_CRYPTO_ERR;
>>  			goto error_exit;
>> @@ -867,10 +873,17 @@ prepare_sym_cipher_op(struct vhost_crypto
>*vcrypto, struct rte_crypto_op *op,
>>
>>  	switch (vcrypto->option) {
>>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
>> -		m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
>> -				desc->addr, cipher->para.dst_data_len);
>> +		if (!RTE_IOVA_AS_VA) {
>> +			m_dst->buf_iova =
>> +				gpa_to_hpa(vcrypto->dev, desc->addr, cipher-
>>para.dst_data_len);
>> +			if (unlikely(m_dst->buf_iova == 0)) {
>> +				VC_LOG_ERR("zero_copy may fail due to cross
>page data");
>> +				ret = VIRTIO_CRYPTO_ERR;
>> +				goto error_exit;
>> +			}
>> +		}
>>  		m_dst->buf_addr = get_data_ptr(vc_req, desc,
>VHOST_ACCESS_RW);
>> -		if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
>> +		if (unlikely(m_dst->buf_addr == NULL)) {
>>  			VC_LOG_ERR("zero_copy may fail due to cross page
>data");
>>  			ret = VIRTIO_CRYPTO_ERR;
>>  			goto error_exit;
>> @@ -980,11 +993,17 @@ prepare_sym_chain_op(struct vhost_crypto *vcrypto,
>struct rte_crypto_op *op,
>>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
>>  		m_src->data_len = chain->para.src_data_len;
>>  		m_dst->data_len = chain->para.dst_data_len;
>> -
>> -		m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
>> -				chain->para.src_data_len);
>> +		if (!RTE_IOVA_AS_VA) {
>> +			m_src->buf_iova =
>> +				gpa_to_hpa(vcrypto->dev, desc->addr, chain-
>>para.src_data_len);
>> +			if (unlikely(m_src->buf_iova == 0)) {
>> +				VC_LOG_ERR("zero_copy may fail due to cross
>page data");
>> +				ret = VIRTIO_CRYPTO_ERR;
>> +				goto error_exit;
>> +			}
>> +		}
>>  		m_src->buf_addr = get_data_ptr(vc_req, desc,
>VHOST_ACCESS_RO);
>> -		if (unlikely(m_src->buf_iova == 0 || m_src->buf_addr == NULL)) {
>> +		if (unlikely(m_src->buf_addr == NULL)) {
>>  			VC_LOG_ERR("zero_copy may fail due to cross page
>data");
>>  			ret = VIRTIO_CRYPTO_ERR;
>>  			goto error_exit;
>> @@ -1024,10 +1043,17 @@ prepare_sym_chain_op(struct vhost_crypto
>*vcrypto, struct rte_crypto_op *op,
>>
>>  	switch (vcrypto->option) {
>>  	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
>> -		m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
>> -				desc->addr, chain->para.dst_data_len);
>> +		if (!RTE_IOVA_AS_VA) {
>> +			m_dst->buf_iova =
>> +				gpa_to_hpa(vcrypto->dev, desc->addr, chain-
>>para.dst_data_len);
>> +			if (unlikely(m_dst->buf_iova == 0)) {
>> +				VC_LOG_ERR("zero_copy may fail due to cross
>page data");
>> +				ret = VIRTIO_CRYPTO_ERR;
>> +				goto error_exit;
>> +			}
>> +		}
>>  		m_dst->buf_addr = get_data_ptr(vc_req, desc,
>VHOST_ACCESS_RW);
>> -		if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
>> +		if (unlikely(m_dst->buf_addr == NULL)) {
>>  			VC_LOG_ERR("zero_copy may fail due to cross page
>data");
>>  			ret = VIRTIO_CRYPTO_ERR;
>>  			goto error_exit;
>> diff --git a/meson_options.txt b/meson_options.txt
>> index 7c220ad68d..f0fa6cf04c 100644
>> --- a/meson_options.txt
>> +++ b/meson_options.txt
>> @@ -44,6 +44,8 @@ option('platform', type: 'string', value: 'native', description:
>>         'Platform to build, either "native", "generic" or a SoC. Please refer to the
>Linux build guide for more information.')
>>  option('enable_trace_fp', type: 'boolean', value: false, description:
>>         'enable fast path trace points.')
>> +option('iova_as_va', type: 'boolean', value: false, description:
>> +       'Build which only supports IOVA as VA mode. Unsupported drivers are
>disabled.')
>
>I wonder if we can find a better name for the option. Currently, it is a bit
>confusing to me, because iova_as_va=false does not mean that iova_as_va is
>disabled.
>
>What about iova_as_pa=true|false, or enable_iova_as_pa=true|false, or
>disable_iova_as_pa=true|false?
>
 
Agree. Will go with the option enable_iova_as_pa and will rename macro to RTE_IOVA_AS_PA.

>The help string is maybe easier to find, something like
>"Enable or disable support for IOVA as PA mode."
>
>We can also explain that enabling this option removes the buf_iova field from
>the mbuf.
>
>>  option('tests', type: 'boolean', value: true, description:
>>         'build unit tests')
>>  option('use_hpet', type: 'boolean', value: false, description:
>> --
>> 2.25.1
>>
  

Patch

diff --git a/app/test-bbdev/test_bbdev_perf.c b/app/test-bbdev/test_bbdev_perf.c
index 8fab52d821..f6aa25b67d 100644
--- a/app/test-bbdev/test_bbdev_perf.c
+++ b/app/test-bbdev/test_bbdev_perf.c
@@ -1001,7 +1001,7 @@  init_op_data_objs(struct rte_bbdev_op_data *bufs,
 					seg->length);
 				memcpy(data, seg->addr, seg->length);
 				m_head->buf_addr = data;
-				m_head->buf_iova = rte_malloc_virt2iova(data);
+				rte_mbuf_iova_set(m_head, rte_malloc_virt2iova(data));
 				m_head->data_off = 0;
 				m_head->data_len = seg->length;
 			} else {
diff --git a/app/test-crypto-perf/cperf_test_common.c b/app/test-crypto-perf/cperf_test_common.c
index 00aadc9a47..27646cd619 100644
--- a/app/test-crypto-perf/cperf_test_common.c
+++ b/app/test-crypto-perf/cperf_test_common.c
@@ -26,8 +26,7 @@  fill_single_seg_mbuf(struct rte_mbuf *m, struct rte_mempool *mp,
 	/* start of buffer is after mbuf structure and priv data */
 	m->priv_size = 0;
 	m->buf_addr = (char *)m + mbuf_hdr_size;
-	m->buf_iova = rte_mempool_virt2iova(obj) +
-		mbuf_offset + mbuf_hdr_size;
+	rte_mbuf_iova_set(m, rte_mempool_virt2iova(obj) + mbuf_offset + mbuf_hdr_size);
 	m->buf_len = segment_sz;
 	m->data_len = data_len;
 	m->pkt_len = data_len;
@@ -58,7 +57,7 @@  fill_multi_seg_mbuf(struct rte_mbuf *m, struct rte_mempool *mp,
 		/* start of buffer is after mbuf structure and priv data */
 		m->priv_size = 0;
 		m->buf_addr = (char *)m + mbuf_hdr_size;
-		m->buf_iova = next_seg_phys_addr;
+		rte_mbuf_iova_set(m, next_seg_phys_addr);
 		next_seg_phys_addr += mbuf_hdr_size + segment_sz;
 		m->buf_len = segment_sz;
 		m->data_len = data_len;
diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index 97f500809e..f5af5e8a3f 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -2600,7 +2600,7 @@  dummy_mbuf_prep(struct rte_mbuf *mb, uint8_t buf[], uint32_t buf_len,
 	uint8_t *db;
 
 	mb->buf_addr = buf;
-	mb->buf_iova = (uintptr_t)buf;
+	rte_mbuf_iova_set(mb, (uintptr_t)buf);
 	mb->buf_len = buf_len;
 	rte_mbuf_refcnt_set(mb, 1);
 
diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
index 9e8e101f40..8306947eda 100644
--- a/app/test/test_dmadev.c
+++ b/app/test/test_dmadev.c
@@ -110,8 +110,8 @@  do_multi_copies(int16_t dev_id, uint16_t vchan,
 		for (j = 0; j < COPY_LEN/sizeof(uint64_t); j++)
 			src_data[j] = rte_rand();
 
-		if (rte_dma_copy(dev_id, vchan, srcs[i]->buf_iova + srcs[i]->data_off,
-				dsts[i]->buf_iova + dsts[i]->data_off, COPY_LEN, 0) != id_count++)
+		if (rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova_offset(srcs[i], 0),
+				 rte_pktmbuf_iova_offset(dsts[i], 0), COPY_LEN, 0) != id_count++)
 			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", i);
 	}
 	rte_dma_submit(dev_id, vchan);
@@ -317,9 +317,8 @@  test_failure_in_full_burst(int16_t dev_id, uint16_t vchan, bool fence,
 	rte_dma_stats_get(dev_id, vchan, &baseline); /* get a baseline set of stats */
 	for (i = 0; i < COMP_BURST_SZ; i++) {
 		int id = rte_dma_copy(dev_id, vchan,
-				(i == fail_idx ? 0 : (srcs[i]->buf_iova + srcs[i]->data_off)),
-				dsts[i]->buf_iova + dsts[i]->data_off,
-				COPY_LEN, OPT_FENCE(i));
+				      (i == fail_idx ? 0 : rte_pktmbuf_iova_offset(srcs[i], 0)),
+				      rte_pktmbuf_iova_offset(dsts[i], 0), COPY_LEN, OPT_FENCE(i));
 		if (id < 0)
 			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", i);
 		if (i == fail_idx)
@@ -407,9 +406,8 @@  test_individual_status_query_with_failure(int16_t dev_id, uint16_t vchan, bool f
 
 	for (j = 0; j < COMP_BURST_SZ; j++) {
 		int id = rte_dma_copy(dev_id, vchan,
-				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
-				dsts[j]->buf_iova + dsts[j]->data_off,
-				COPY_LEN, OPT_FENCE(j));
+				      (j == fail_idx ? 0 : rte_pktmbuf_iova_offset(srcs[j], 0)),
+				      rte_pktmbuf_iova_offset(dsts[j], 0), COPY_LEN, OPT_FENCE(j));
 		if (id < 0)
 			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
 		if (j == fail_idx)
@@ -470,9 +468,8 @@  test_single_item_status_query_with_failure(int16_t dev_id, uint16_t vchan,
 
 	for (j = 0; j < COMP_BURST_SZ; j++) {
 		int id = rte_dma_copy(dev_id, vchan,
-				(j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
-				dsts[j]->buf_iova + dsts[j]->data_off,
-				COPY_LEN, 0);
+				      (j == fail_idx ? 0 : rte_pktmbuf_iova_offset(srcs[j], 0)),
+				      rte_pktmbuf_iova_offset(dsts[j], 0), COPY_LEN, 0);
 		if (id < 0)
 			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
 		if (j == fail_idx)
@@ -529,15 +526,14 @@  test_multi_failure(int16_t dev_id, uint16_t vchan, struct rte_mbuf **srcs, struc
 
 	/* enqueue and gather completions in one go */
 	for (j = 0; j < COMP_BURST_SZ; j++) {
-		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
+		uintptr_t src = rte_pktmbuf_iova_offset(srcs[j], 0);
 		/* set up for failure if the current index is anywhere is the fails array */
 		for (i = 0; i < num_fail; i++)
 			if (j == fail[i])
 				src = 0;
 
-		int id = rte_dma_copy(dev_id, vchan,
-				src, dsts[j]->buf_iova + dsts[j]->data_off,
-				COPY_LEN, 0);
+		int id = rte_dma_copy(dev_id, vchan, src, rte_pktmbuf_iova_offset(dsts[j], 0),
+				      COPY_LEN, 0);
 		if (id < 0)
 			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
 	}
@@ -565,15 +561,14 @@  test_multi_failure(int16_t dev_id, uint16_t vchan, struct rte_mbuf **srcs, struc
 
 	/* enqueue and gather completions in bursts, but getting errors one at a time */
 	for (j = 0; j < COMP_BURST_SZ; j++) {
-		uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
+		uintptr_t src = rte_pktmbuf_iova_offset(srcs[j], 0);
 		/* set up for failure if the current index is anywhere is the fails array */
 		for (i = 0; i < num_fail; i++)
 			if (j == fail[i])
 				src = 0;
 
-		int id = rte_dma_copy(dev_id, vchan,
-				src, dsts[j]->buf_iova + dsts[j]->data_off,
-				COPY_LEN, 0);
+		int id = rte_dma_copy(dev_id, vchan, src, rte_pktmbuf_iova_offset(dsts[j], 0),
+				      COPY_LEN, 0);
 		if (id < 0)
 			ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
 	}
diff --git a/app/test/test_mbuf.c b/app/test/test_mbuf.c
index e09b2549ca..45431f2c9c 100644
--- a/app/test/test_mbuf.c
+++ b/app/test/test_mbuf.c
@@ -1232,11 +1232,13 @@  test_failing_mbuf_sanity_check(struct rte_mempool *pktmbuf_pool)
 		return -1;
 	}
 
-	badbuf = *buf;
-	badbuf.buf_iova = 0;
-	if (verify_mbuf_check_panics(&badbuf)) {
-		printf("Error with bad-physaddr mbuf test\n");
-		return -1;
+	if (!RTE_IOVA_AS_VA) {
+		badbuf = *buf;
+		rte_mbuf_iova_set(&badbuf, 0);
+		if (verify_mbuf_check_panics(&badbuf)) {
+			printf("Error with bad-physaddr mbuf test\n");
+			return -1;
+		}
 	}
 
 	badbuf = *buf;
diff --git a/app/test/test_pcapng.c b/app/test/test_pcapng.c
index 320dacea34..abbf00f6da 100644
--- a/app/test/test_pcapng.c
+++ b/app/test/test_pcapng.c
@@ -40,7 +40,7 @@  dummy_mbuf_prep(struct rte_mbuf *mb, uint8_t buf[], uint32_t buf_len,
 	uint8_t *db;
 
 	mb->buf_addr = buf;
-	mb->buf_iova = (uintptr_t)buf;
+	rte_mbuf_iova_set(mb, (uintptr_t)buf);
 	mb->buf_len = buf_len;
 	rte_mbuf_refcnt_set(mb, 1);
 
diff --git a/config/meson.build b/config/meson.build
index 7f7b6c92fd..6b6c3e7eb6 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -309,6 +309,7 @@  endif
 if get_option('mbuf_refcnt_atomic')
     dpdk_conf.set('RTE_MBUF_REFCNT_ATOMIC', true)
 endif
+dpdk_conf.set10('RTE_IOVA_AS_VA', get_option('iova_as_va'))
 
 compile_time_cpuflags = []
 subdir(arch_subdir)
diff --git a/drivers/meson.build b/drivers/meson.build
index 376a64f4da..989770cffd 100644
--- a/drivers/meson.build
+++ b/drivers/meson.build
@@ -105,6 +105,7 @@  foreach subpath:subdirs
         ext_deps = []
         pkgconfig_extra_libs = []
         testpmd_sources = []
+        pmd_iova_as_va = false
 
         if not enable_drivers.contains(drv_path)
             build = false
@@ -122,6 +123,11 @@  foreach subpath:subdirs
             # pull in driver directory which should update all the local variables
             subdir(drv_path)
 
+            if dpdk_conf.get('RTE_IOVA_AS_VA') == 1 and not pmd_iova_as_va and not always_enable.contains(drv_path)
+                build = false
+                reason = 'driver does not support IOVA as VA mode'
+            endif
+
             # get dependency objs from strings
             shared_deps = ext_deps
             static_deps = ext_deps
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 37d29643a5..b70c4dcc5f 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -1127,6 +1127,13 @@  rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	if (rte_eal_iova_mode() == RTE_IOVA_PA && RTE_IOVA_AS_VA) {
+		rte_eal_init_alert(
+			"Cannot use IOVA as 'PA' since build is configured to use only 'VA'");
+		rte_errno = EINVAL;
+		return -1;
+	}
+
 	RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
 		rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
 
diff --git a/lib/mbuf/rte_mbuf.c b/lib/mbuf/rte_mbuf.c
index a2307cebe6..5af290c53a 100644
--- a/lib/mbuf/rte_mbuf.c
+++ b/lib/mbuf/rte_mbuf.c
@@ -89,7 +89,7 @@  rte_pktmbuf_init(struct rte_mempool *mp,
 	/* start of buffer is after mbuf structure and priv data */
 	m->priv_size = priv_size;
 	m->buf_addr = (char *)m + mbuf_size;
-	m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
+	rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
 	m->buf_len = (uint16_t)buf_len;
 
 	/* keep some headroom between start of buffer and data */
@@ -187,8 +187,8 @@  __rte_pktmbuf_init_extmem(struct rte_mempool *mp,
 	RTE_ASSERT(ctx->off + ext_mem->elt_size <= ext_mem->buf_len);
 
 	m->buf_addr = RTE_PTR_ADD(ext_mem->buf_ptr, ctx->off);
-	m->buf_iova = ext_mem->buf_iova == RTE_BAD_IOVA ?
-		      RTE_BAD_IOVA : (ext_mem->buf_iova + ctx->off);
+	rte_mbuf_iova_set(m, ext_mem->buf_iova == RTE_BAD_IOVA ? RTE_BAD_IOVA :
+								 (ext_mem->buf_iova + ctx->off));
 
 	ctx->off += ext_mem->elt_size;
 	if (ctx->off + ext_mem->elt_size > ext_mem->buf_len) {
@@ -388,7 +388,7 @@  int rte_mbuf_check(const struct rte_mbuf *m, int is_header,
 		*reason = "bad mbuf pool";
 		return -1;
 	}
-	if (m->buf_iova == 0) {
+	if (m->buf_iova == 0 && !RTE_IOVA_AS_VA) {
 		*reason = "bad IO addr";
 		return -1;
 	}
diff --git a/lib/mbuf/rte_mbuf.h b/lib/mbuf/rte_mbuf.h
index 9811e8c760..05be146bc2 100644
--- a/lib/mbuf/rte_mbuf.h
+++ b/lib/mbuf/rte_mbuf.h
@@ -146,7 +146,7 @@  static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
 static inline rte_iova_t
 rte_mbuf_data_iova(const struct rte_mbuf *mb)
 {
-	return mb->buf_iova + mb->data_off;
+	return (RTE_IOVA_AS_VA ? (uint64_t)mb->buf_addr : mb->buf_iova) + mb->data_off;
 }
 
 /**
@@ -164,7 +164,7 @@  rte_mbuf_data_iova(const struct rte_mbuf *mb)
 static inline rte_iova_t
 rte_mbuf_data_iova_default(const struct rte_mbuf *mb)
 {
-	return mb->buf_iova + RTE_PKTMBUF_HEADROOM;
+	return (RTE_IOVA_AS_VA ? (uint64_t)mb->buf_addr : mb->buf_iova) + RTE_PKTMBUF_HEADROOM;
 }
 
 /**
@@ -469,6 +469,13 @@  rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
 				 __ATOMIC_ACQ_REL);
 }
 
+static inline void
+rte_mbuf_iova_set(struct rte_mbuf *m, rte_iova_t iova)
+{
+	if (!RTE_IOVA_AS_VA)
+		m->buf_iova = iova;
+}
+
 /** Mbuf prefetch */
 #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
 	if ((m) != NULL)                        \
@@ -1056,7 +1063,7 @@  rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
 	RTE_ASSERT(shinfo->free_cb != NULL);
 
 	m->buf_addr = buf_addr;
-	m->buf_iova = buf_iova;
+	rte_mbuf_iova_set(m, buf_iova);
 	m->buf_len = buf_len;
 
 	m->data_len = 0;
@@ -1143,7 +1150,7 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 
 	mi->data_off = m->data_off;
 	mi->data_len = m->data_len;
-	mi->buf_iova = m->buf_iova;
+	rte_mbuf_iova_set(mi, m->buf_iova);
 	mi->buf_addr = m->buf_addr;
 	mi->buf_len = m->buf_len;
 
@@ -1245,7 +1252,7 @@  static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
 
 	m->priv_size = priv_size;
 	m->buf_addr = (char *)m + mbuf_size;
-	m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
+	rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
 	m->buf_len = (uint16_t)buf_len;
 	rte_pktmbuf_reset_headroom(m);
 	m->data_len = 0;
diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
index 3d6ddd6773..c6292e7252 100644
--- a/lib/mbuf/rte_mbuf_core.h
+++ b/lib/mbuf/rte_mbuf_core.h
@@ -581,6 +581,8 @@  struct rte_mbuf {
 	void *buf_addr;           /**< Virtual address of segment buffer. */
 	/**
 	 * Physical address of segment buffer.
+	 * This field is invalid if the build is configured to use only
+	 * virtual address as IOVA (i.e. RTE_IOVA_AS_VA is 1).
 	 * Force alignment to 8-bytes, so as to ensure we have the exact
 	 * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
 	 * working on vector drivers easier.
@@ -848,8 +850,12 @@  struct rte_mbuf_ext_shared_info {
  * @param o
  *   The offset into the data to calculate address from.
  */
+#if RTE_IOVA_AS_VA
+#define rte_pktmbuf_iova_offset(m, o) rte_pktmbuf_mtod_offset(m, rte_iova_t, o)
+#else
 #define rte_pktmbuf_iova_offset(m, o) \
 	(rte_iova_t)((m)->buf_iova + (m)->data_off + (o))
+#endif
 
 /**
  * A macro that returns the IO address that points to the start of the
@@ -858,7 +864,11 @@  struct rte_mbuf_ext_shared_info {
  * @param m
  *   The packet mbuf.
  */
+#if RTE_IOVA_AS_VA
+#define rte_pktmbuf_iova(m) rte_pktmbuf_mtod(m, rte_iova_t)
+#else
 #define rte_pktmbuf_iova(m) rte_pktmbuf_iova_offset(m, 0)
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 782d916ae0..05cde6e118 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -967,7 +967,7 @@  restore_mbuf(struct rte_mbuf *m)
 		/* start of buffer is after mbuf structure and priv data */
 
 		m->buf_addr = (char *)m + mbuf_size;
-		m->buf_iova = rte_mempool_virt2iova(m) + mbuf_size;
+		rte_mbuf_iova_set(m, rte_mempool_virt2iova(m) + mbuf_size);
 		m = m->next;
 	}
 }
diff --git a/lib/vhost/vhost_crypto.c b/lib/vhost/vhost_crypto.c
index 54946f46d9..7b50735796 100644
--- a/lib/vhost/vhost_crypto.c
+++ b/lib/vhost/vhost_crypto.c
@@ -823,11 +823,17 @@  prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
 	switch (vcrypto->option) {
 	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
 		m_src->data_len = cipher->para.src_data_len;
-		m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
-				cipher->para.src_data_len);
+		if (!RTE_IOVA_AS_VA) {
+			m_src->buf_iova =
+				gpa_to_hpa(vcrypto->dev, desc->addr, cipher->para.src_data_len);
+			if (unlikely(m_src->buf_iova == 0)) {
+				VC_LOG_ERR("zero_copy may fail due to cross page data");
+				ret = VIRTIO_CRYPTO_ERR;
+				goto error_exit;
+			}
+		}
 		m_src->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
-		if (unlikely(m_src->buf_iova == 0 ||
-				m_src->buf_addr == NULL)) {
+		if (unlikely(m_src->buf_addr == NULL)) {
 			VC_LOG_ERR("zero_copy may fail due to cross page data");
 			ret = VIRTIO_CRYPTO_ERR;
 			goto error_exit;
@@ -867,10 +873,17 @@  prepare_sym_cipher_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
 
 	switch (vcrypto->option) {
 	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
-		m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
-				desc->addr, cipher->para.dst_data_len);
+		if (!RTE_IOVA_AS_VA) {
+			m_dst->buf_iova =
+				gpa_to_hpa(vcrypto->dev, desc->addr, cipher->para.dst_data_len);
+			if (unlikely(m_dst->buf_iova == 0)) {
+				VC_LOG_ERR("zero_copy may fail due to cross page data");
+				ret = VIRTIO_CRYPTO_ERR;
+				goto error_exit;
+			}
+		}
 		m_dst->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RW);
-		if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
+		if (unlikely(m_dst->buf_addr == NULL)) {
 			VC_LOG_ERR("zero_copy may fail due to cross page data");
 			ret = VIRTIO_CRYPTO_ERR;
 			goto error_exit;
@@ -980,11 +993,17 @@  prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
 	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
 		m_src->data_len = chain->para.src_data_len;
 		m_dst->data_len = chain->para.dst_data_len;
-
-		m_src->buf_iova = gpa_to_hpa(vcrypto->dev, desc->addr,
-				chain->para.src_data_len);
+		if (!RTE_IOVA_AS_VA) {
+			m_src->buf_iova =
+				gpa_to_hpa(vcrypto->dev, desc->addr, chain->para.src_data_len);
+			if (unlikely(m_src->buf_iova == 0)) {
+				VC_LOG_ERR("zero_copy may fail due to cross page data");
+				ret = VIRTIO_CRYPTO_ERR;
+				goto error_exit;
+			}
+		}
 		m_src->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RO);
-		if (unlikely(m_src->buf_iova == 0 || m_src->buf_addr == NULL)) {
+		if (unlikely(m_src->buf_addr == NULL)) {
 			VC_LOG_ERR("zero_copy may fail due to cross page data");
 			ret = VIRTIO_CRYPTO_ERR;
 			goto error_exit;
@@ -1024,10 +1043,17 @@  prepare_sym_chain_op(struct vhost_crypto *vcrypto, struct rte_crypto_op *op,
 
 	switch (vcrypto->option) {
 	case RTE_VHOST_CRYPTO_ZERO_COPY_ENABLE:
-		m_dst->buf_iova = gpa_to_hpa(vcrypto->dev,
-				desc->addr, chain->para.dst_data_len);
+		if (!RTE_IOVA_AS_VA) {
+			m_dst->buf_iova =
+				gpa_to_hpa(vcrypto->dev, desc->addr, chain->para.dst_data_len);
+			if (unlikely(m_dst->buf_iova == 0)) {
+				VC_LOG_ERR("zero_copy may fail due to cross page data");
+				ret = VIRTIO_CRYPTO_ERR;
+				goto error_exit;
+			}
+		}
 		m_dst->buf_addr = get_data_ptr(vc_req, desc, VHOST_ACCESS_RW);
-		if (unlikely(m_dst->buf_iova == 0 || m_dst->buf_addr == NULL)) {
+		if (unlikely(m_dst->buf_addr == NULL)) {
 			VC_LOG_ERR("zero_copy may fail due to cross page data");
 			ret = VIRTIO_CRYPTO_ERR;
 			goto error_exit;
diff --git a/meson_options.txt b/meson_options.txt
index 7c220ad68d..f0fa6cf04c 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -44,6 +44,8 @@  option('platform', type: 'string', value: 'native', description:
        'Platform to build, either "native", "generic" or a SoC. Please refer to the Linux build guide for more information.')
 option('enable_trace_fp', type: 'boolean', value: false, description:
        'enable fast path trace points.')
+option('iova_as_va', type: 'boolean', value: false, description:
+       'Build only with support for IOVA as VA mode. Drivers which do not support this mode are disabled.')
 option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description: