[02/10] vdpa/sfc: add support for device initialization

Message ID 20210706164418.32615-3-vsrivast@xilinx.com (mailing list archive)
State Changes Requested, archived
Delegated to: Maxime Coquelin
Series vdpa/sfc: introduce Xilinx vDPA driver

Checks

Context         Check     Description
ci/checkpatch   warning   coding style issues

Commit Message

Vijay Srivastava July 6, 2021, 4:44 p.m. UTC
  From: Vijay Kumar Srivastava <vsrivast@xilinx.com>

Add HW initialization and vDPA device registration support.

Signed-off-by: Vijay Kumar Srivastava <vsrivast@xilinx.com>
---
 doc/guides/vdpadevs/sfc.rst       |   6 +
 drivers/vdpa/sfc/meson.build      |   3 +
 drivers/vdpa/sfc/sfc_vdpa.c       |  23 +++
 drivers/vdpa/sfc/sfc_vdpa.h       |  49 +++++-
 drivers/vdpa/sfc/sfc_vdpa_debug.h |  21 +++
 drivers/vdpa/sfc/sfc_vdpa_hw.c    | 322 ++++++++++++++++++++++++++++++++++++++
 drivers/vdpa/sfc/sfc_vdpa_log.h   |   3 +
 drivers/vdpa/sfc/sfc_vdpa_mcdi.c  |  74 +++++++++
 drivers/vdpa/sfc/sfc_vdpa_ops.c   | 129 +++++++++++++++
 drivers/vdpa/sfc/sfc_vdpa_ops.h   |  36 +++++
 10 files changed, 665 insertions(+), 1 deletion(-)
 create mode 100644 drivers/vdpa/sfc/sfc_vdpa_debug.h
 create mode 100644 drivers/vdpa/sfc/sfc_vdpa_hw.c
 create mode 100644 drivers/vdpa/sfc/sfc_vdpa_mcdi.c
 create mode 100644 drivers/vdpa/sfc/sfc_vdpa_ops.c
 create mode 100644 drivers/vdpa/sfc/sfc_vdpa_ops.h
  

Comments

Maxime Coquelin Aug. 30, 2021, 9:16 a.m. UTC | #1
On 7/6/21 6:44 PM, Vijay Srivastava wrote:
> From: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> 
> Add HW initialization and vDPA device registration support.
> 
> Signed-off-by: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> ---
>  doc/guides/vdpadevs/sfc.rst       |   6 +
>  drivers/vdpa/sfc/meson.build      |   3 +
>  drivers/vdpa/sfc/sfc_vdpa.c       |  23 +++
>  drivers/vdpa/sfc/sfc_vdpa.h       |  49 +++++-
>  drivers/vdpa/sfc/sfc_vdpa_debug.h |  21 +++
>  drivers/vdpa/sfc/sfc_vdpa_hw.c    | 322 ++++++++++++++++++++++++++++++++++++++
>  drivers/vdpa/sfc/sfc_vdpa_log.h   |   3 +
>  drivers/vdpa/sfc/sfc_vdpa_mcdi.c  |  74 +++++++++
>  drivers/vdpa/sfc/sfc_vdpa_ops.c   | 129 +++++++++++++++
>  drivers/vdpa/sfc/sfc_vdpa_ops.h   |  36 +++++
>  10 files changed, 665 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_debug.h
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_hw.c
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_mcdi.c
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_ops.c
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_ops.h
> 

...

> diff --git a/drivers/vdpa/sfc/sfc_vdpa_hw.c b/drivers/vdpa/sfc/sfc_vdpa_hw.c
> new file mode 100644
> index 0000000..83f3696
> --- /dev/null
> +++ b/drivers/vdpa/sfc/sfc_vdpa_hw.c
> @@ -0,0 +1,322 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + *
> + * Copyright(c) 2020-2021 Xilinx, Inc.
> + */
> +
> +#include <unistd.h>
> +
> +#include <rte_common.h>
> +#include <rte_errno.h>
> +#include <rte_vfio.h>
> +
> +#include "efx.h"
> +#include "sfc_vdpa.h"
> +#include "sfc_vdpa_ops.h"
> +
> +extern uint32_t sfc_logtype_driver;
> +
> +#ifndef PAGE_SIZE
> +#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
> +#endif
> +
> +int
> +sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
> +		   size_t len, efsys_mem_t *esmp)
> +{
> +	void *mcdi_buf;
> +	uint64_t mcdi_iova;
> +	size_t mcdi_buff_size;
> +	int ret;
> +
> +	mcdi_buff_size = RTE_ALIGN_CEIL(len, PAGE_SIZE);
> +
> +	sfc_vdpa_log_init(sva, "name=%s, len=%zu", name, len);
> +
> +	mcdi_buf = rte_zmalloc(name, mcdi_buff_size, PAGE_SIZE);

You might want to allocate on the same NUMA node the device is on?
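E.g. something like this (a sketch only; it assumes the device's NUMA node
is reachable as sva->pdev->device.numa_node):

	mcdi_buf = rte_zmalloc_socket(name, mcdi_buff_size, PAGE_SIZE,
				      sva->pdev->device.numa_node);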

> +	if (mcdi_buf == NULL) {
> +		sfc_vdpa_err(sva, "cannot reserve memory for %s: len=%#x: %s",
> +			     name, (unsigned int)len, rte_strerror(rte_errno));
> +		return -ENOMEM;
> +	}
> +
> +	/* IOVA address for MCDI would be re-calculated if mapping
> +	 * using default IOVA would fail.
> +	 * TODO: Earlier there was no way to get valid IOVA range.
> +	 * Recently a patch has been submitted to get the IOVA range
> +	 * using the ioctl VFIO_IOMMU_GET_INFO. This patch is available
> +	 * in the kernel version >= 5.4. Support to get the default
> +	 * IOVA address for MCDI buffer using available IOVA range
> +	 * would be added later. Meanwhile default IOVA for MCDI buffer
> +	 * is kept at high mem at 2TB. In case of overlap new available
> +	 * addresses would be searched and same would be used.
> +	 */
> +	mcdi_iova = SFC_VDPA_DEFAULT_MCDI_IOVA;
> +
> +	do {
> +		ret = rte_vfio_container_dma_map(sva->vfio_container_fd,
> +						 (uint64_t)mcdi_buf, mcdi_iova,
> +						 mcdi_buff_size);
> +		if (ret == 0)
> +			break;
> +
> +		mcdi_iova = mcdi_iova >> 1;
> +		if (mcdi_iova < mcdi_buff_size)	{
> +			sfc_vdpa_err(sva,
> +				     "DMA mapping failed for MCDI : %s",
> +				     rte_strerror(rte_errno));
> +			return ret;

You leak mcdi_buf here if DMA map fails.
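A sketch of one possible fix, releasing the buffer on this error path:

		if (mcdi_iova < mcdi_buff_size) {
			sfc_vdpa_err(sva,
				     "DMA mapping failed for MCDI : %s",
				     rte_strerror(rte_errno));
			/* Free the MCDI buffer to avoid the leak */
			rte_free(mcdi_buf);
			return ret;
		}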

> +		}
> +
> +	} while (ret < 0);
> +
> +	esmp->esm_addr = mcdi_iova;
> +	esmp->esm_base = mcdi_buf;
> +	sva->mcdi_buff_size = mcdi_buff_size;
> +
> +	sfc_vdpa_info(sva,
> +		      "DMA name=%s len=%zu => virt=%p iova=%" PRIx64,
> +		      name, len, esmp->esm_base, esmp->esm_addr);
> +
> +	return 0;
> +}
> +

Thanks,
Maxime
  
Chenbo Xia Aug. 30, 2021, 10:52 a.m. UTC | #2
Hi Vijay,

> -----Original Message-----
> From: Vijay Srivastava <vijay.srivastava@xilinx.com>
> Sent: Wednesday, July 7, 2021 12:44 AM
> To: dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; Xia, Chenbo <chenbo.xia@intel.com>;
> andrew.rybchenko@oktetlabs.ru; Vijay Kumar Srivastava <vsrivast@xilinx.com>
> Subject: [PATCH 02/10] vdpa/sfc: add support for device initialization
> 
> From: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> 
> Add HW initialization and vDPA device registration support.
> 
> Signed-off-by: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> ---
>  doc/guides/vdpadevs/sfc.rst       |   6 +
>  drivers/vdpa/sfc/meson.build      |   3 +
>  drivers/vdpa/sfc/sfc_vdpa.c       |  23 +++
>  drivers/vdpa/sfc/sfc_vdpa.h       |  49 +++++-
>  drivers/vdpa/sfc/sfc_vdpa_debug.h |  21 +++
>  drivers/vdpa/sfc/sfc_vdpa_hw.c    | 322
> ++++++++++++++++++++++++++++++++++++++
>  drivers/vdpa/sfc/sfc_vdpa_log.h   |   3 +
>  drivers/vdpa/sfc/sfc_vdpa_mcdi.c  |  74 +++++++++
>  drivers/vdpa/sfc/sfc_vdpa_ops.c   | 129 +++++++++++++++
>  drivers/vdpa/sfc/sfc_vdpa_ops.h   |  36 +++++
>  10 files changed, 665 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_debug.h
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_hw.c
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_mcdi.c
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_ops.c
>  create mode 100644 drivers/vdpa/sfc/sfc_vdpa_ops.h
> 
> diff --git a/doc/guides/vdpadevs/sfc.rst b/doc/guides/vdpadevs/sfc.rst
> index 59f990b..abb5900 100644
> --- a/doc/guides/vdpadevs/sfc.rst
> +++ b/doc/guides/vdpadevs/sfc.rst
> @@ -95,3 +95,9 @@ SFC vDPA PMD provides the following log types available for
> control:
>    Matches a subset of per-port log types registered during runtime.
>    A full name for a particular type may be obtained by appending a
>    dot and a PCI device identifier (``XXXX:XX:XX.X``) to the prefix.
> +
> +- ``pmd.vdpa.sfc.mcdi`` (default level is **notice**)
> +
> +  Extra logging of the communication with the NIC's management CPU.
> +  The format of the log is consumed by the netlogdecode cross-platform
> +  tool. May be managed per-port, as explained above.
> diff --git a/drivers/vdpa/sfc/meson.build b/drivers/vdpa/sfc/meson.build
> index d916389..aac7c51 100644
> --- a/drivers/vdpa/sfc/meson.build
> +++ b/drivers/vdpa/sfc/meson.build
> @@ -30,4 +30,7 @@ endforeach
>  deps += ['common_sfc_efx', 'bus_pci']
>  sources = files(
>  	'sfc_vdpa.c',
> +	'sfc_vdpa_hw.c',
> +	'sfc_vdpa_mcdi.c',
> +	'sfc_vdpa_ops.c',
>  )
> diff --git a/drivers/vdpa/sfc/sfc_vdpa.c b/drivers/vdpa/sfc/sfc_vdpa.c
> index d8faaca..12e8d6e 100644
> --- a/drivers/vdpa/sfc/sfc_vdpa.c
> +++ b/drivers/vdpa/sfc/sfc_vdpa.c
> @@ -232,6 +232,19 @@ struct sfc_vdpa_adapter *
>  		goto fail_vfio_setup;
>  	}
> 
> +	sfc_vdpa_log_init(sva, "hw init");
> +	if (sfc_vdpa_hw_init(sva) != 0) {
> +		sfc_vdpa_err(sva, "failed to init HW %s", pci_dev->name);
> +		goto fail_hw_init;
> +	}
> +
> +	sfc_vdpa_log_init(sva, "dev init");
> +	sva->ops_data = sfc_vdpa_device_init(sva, SFC_VDPA_AS_VF);
> +	if (sva->ops_data == NULL) {
> +		sfc_vdpa_err(sva, "failed vDPA dev init %s", pci_dev->name);
> +		goto fail_dev_init;
> +	}
> +
>  	pthread_mutex_lock(&sfc_vdpa_adapter_list_lock);
>  	TAILQ_INSERT_TAIL(&sfc_vdpa_adapter_list, sva, next);
>  	pthread_mutex_unlock(&sfc_vdpa_adapter_list_lock);
> @@ -240,6 +253,12 @@ struct sfc_vdpa_adapter *
> 
>  	return 0;
> 
> +fail_dev_init:
> +	sfc_vdpa_hw_fini(sva);
> +
> +fail_hw_init:
> +	sfc_vdpa_vfio_teardown(sva);
> +
>  fail_vfio_setup:
>  fail_set_log_prefix:
>  	rte_free(sva);
> @@ -266,6 +285,10 @@ struct sfc_vdpa_adapter *
>  	TAILQ_REMOVE(&sfc_vdpa_adapter_list, sva, next);
>  	pthread_mutex_unlock(&sfc_vdpa_adapter_list_lock);
> 
> +	sfc_vdpa_device_fini(sva->ops_data);
> +
> +	sfc_vdpa_hw_fini(sva);
> +
>  	sfc_vdpa_vfio_teardown(sva);
> 
>  	rte_free(sva);
> diff --git a/drivers/vdpa/sfc/sfc_vdpa.h b/drivers/vdpa/sfc/sfc_vdpa.h
> index 3b77900..fb97258 100644
> --- a/drivers/vdpa/sfc/sfc_vdpa.h
> +++ b/drivers/vdpa/sfc/sfc_vdpa.h
> @@ -11,14 +11,38 @@
> 
>  #include <rte_bus_pci.h>
> 
> +#include "sfc_efx.h"
> +#include "sfc_efx_mcdi.h"
> +#include "sfc_vdpa_debug.h"
>  #include "sfc_vdpa_log.h"
> +#include "sfc_vdpa_ops.h"
> +
> +#define SFC_VDPA_DEFAULT_MCDI_IOVA		0x200000000000
> 
>  /* Adapter private data */
>  struct sfc_vdpa_adapter {
>  	TAILQ_ENTRY(sfc_vdpa_adapter)	next;
> +	/*
> +	 * PMD setup and configuration is not thread safe. Since it is not
> +	 * performance sensitive, it is better to guarantee thread-safety
> +	 * and add device level lock. vDPA control operations which
> +	 * change its state should acquire the lock.
> +	 */
> +	rte_spinlock_t			lock;
>  	struct rte_pci_device		*pdev;
>  	struct rte_pci_addr		pci_addr;
> 
> +	efx_family_t			family;
> +	efx_nic_t			*nic;
> +	rte_spinlock_t			nic_lock;
> +
> +	efsys_bar_t			mem_bar;
> +
> +	struct sfc_efx_mcdi		mcdi;
> +	size_t				mcdi_buff_size;
> +
> +	uint32_t			max_queue_count;
> +
>  	char				log_prefix[SFC_VDPA_LOG_PREFIX_MAX];
>  	uint32_t			logtype_main;
> 
> @@ -26,6 +50,7 @@ struct sfc_vdpa_adapter {
>  	int				vfio_dev_fd;
>  	int				vfio_container_fd;
>  	int				iommu_group_num;
> +	struct sfc_vdpa_ops_data	*ops_data;
>  };
> 
>  uint32_t
> @@ -36,5 +61,27 @@ struct sfc_vdpa_adapter {
>  struct sfc_vdpa_adapter *
>  sfc_vdpa_get_adapter_by_dev(struct rte_pci_device *pdev);
> 
> -#endif  /* _SFC_VDPA_H */
> +int
> +sfc_vdpa_hw_init(struct sfc_vdpa_adapter *sva);
> +void
> +sfc_vdpa_hw_fini(struct sfc_vdpa_adapter *sa);

Better to align the name here: sa -> sva
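i.e.:

void
sfc_vdpa_hw_fini(struct sfc_vdpa_adapter *sva);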

> 
> +int
> +sfc_vdpa_mcdi_init(struct sfc_vdpa_adapter *sva);
> +void
> +sfc_vdpa_mcdi_fini(struct sfc_vdpa_adapter *sva);
> +
> +int
> +sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
> +		   size_t len, efsys_mem_t *esmp);
> +
> +void
> +sfc_vdpa_dma_free(struct sfc_vdpa_adapter *sva, efsys_mem_t *esmp);
> +
> +static inline struct sfc_vdpa_adapter *
> +sfc_vdpa_adapter_by_dev_handle(void *dev_handle)
> +{
> +	return (struct sfc_vdpa_adapter *)dev_handle;
> +}
> +
> +#endif  /* _SFC_VDPA_H */
> diff --git a/drivers/vdpa/sfc/sfc_vdpa_debug.h
> b/drivers/vdpa/sfc/sfc_vdpa_debug.h
> new file mode 100644
> index 0000000..cfa8cc5
> --- /dev/null
> +++ b/drivers/vdpa/sfc/sfc_vdpa_debug.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + *
> + * Copyright(c) 2020-2021 Xilinx, Inc.
> + */
> +
> +#ifndef _SFC_VDPA_DEBUG_H_
> +#define _SFC_VDPA_DEBUG_H_
> +
> +#include <rte_debug.h>
> +
> +#ifdef RTE_LIBRTE_SFC_VDPA_DEBUG
> +/* Avoid dependency from RTE_LOG_DP_LEVEL to be able to enable debug check
> + * in the driver only.
> + */
> +#define SFC_VDPA_ASSERT(exp)			RTE_VERIFY(exp)
> +#else
> +/* If the driver debug is not enabled, follow DPDK debug/non-debug */
> +#define SFC_VDPA_ASSERT(exp)			RTE_ASSERT(exp)
> +#endif
> +
> +#endif /* _SFC_VDPA_DEBUG_H_ */
> diff --git a/drivers/vdpa/sfc/sfc_vdpa_hw.c b/drivers/vdpa/sfc/sfc_vdpa_hw.c
> new file mode 100644
> index 0000000..83f3696
> --- /dev/null
> +++ b/drivers/vdpa/sfc/sfc_vdpa_hw.c
> @@ -0,0 +1,322 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + *
> + * Copyright(c) 2020-2021 Xilinx, Inc.
> + */
> +
> +#include <unistd.h>
> +
> +#include <rte_common.h>
> +#include <rte_errno.h>
> +#include <rte_vfio.h>
> +
> +#include "efx.h"
> +#include "sfc_vdpa.h"
> +#include "sfc_vdpa_ops.h"
> +
> +extern uint32_t sfc_logtype_driver;
> +
> +#ifndef PAGE_SIZE
> +#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
> +#endif
> +
> +int
> +sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
> +		   size_t len, efsys_mem_t *esmp)
> +{
> +	void *mcdi_buf;
> +	uint64_t mcdi_iova;
> +	size_t mcdi_buff_size;
> +	int ret;
> +
> +	mcdi_buff_size = RTE_ALIGN_CEIL(len, PAGE_SIZE);
> +
> +	sfc_vdpa_log_init(sva, "name=%s, len=%zu", name, len);
> +
> +	mcdi_buf = rte_zmalloc(name, mcdi_buff_size, PAGE_SIZE);
> +	if (mcdi_buf == NULL) {
> +		sfc_vdpa_err(sva, "cannot reserve memory for %s: len=%#x: %s",
> +			     name, (unsigned int)len, rte_strerror(rte_errno));
> +		return -ENOMEM;
> +	}
> +
> +	/* IOVA address for MCDI would be re-calculated if mapping

What is MCDI?

> +	 * using default IOVA would fail.
> +	 * TODO: Earlier there was no way to get valid IOVA range.
> +	 * Recently a patch has been submitted to get the IOVA range
> +	 * using the ioctl VFIO_IOMMU_GET_INFO. This patch is available
> +	 * in the kernel version >= 5.4. Support to get the default
> +	 * IOVA address for MCDI buffer using available IOVA range
> +	 * would be added later. Meanwhile default IOVA for MCDI buffer
> +	 * is kept at high mem at 2TB. In case of overlap new available
> +	 * addresses would be searched and same would be used.
> +	 */
> +	mcdi_iova = SFC_VDPA_DEFAULT_MCDI_IOVA;
> +
> +	do {
> +		ret = rte_vfio_container_dma_map(sva->vfio_container_fd,
> +						 (uint64_t)mcdi_buf, mcdi_iova,
> +						 mcdi_buff_size);
> +		if (ret == 0)
> +			break;
> +
> +		mcdi_iova = mcdi_iova >> 1;
> +		if (mcdi_iova < mcdi_buff_size)	{
> +			sfc_vdpa_err(sva,
> +				     "DMA mapping failed for MCDI : %s",
> +				     rte_strerror(rte_errno));
> +			return ret;
> +		}
> +
> +	} while (ret < 0);

Is this DMA region for some hardware-specific control msg?

And how do you make sure that the IOVA space you defined in this driver will
not conflict with the IOVA space that the vdpa device consumer (most likely QEMU)
defines (if QEMU, IOVA = guest physical address)?

Thanks,
Chenbo

> +
> +	esmp->esm_addr = mcdi_iova;
> +	esmp->esm_base = mcdi_buf;
> +	sva->mcdi_buff_size = mcdi_buff_size;
> +
> +	sfc_vdpa_info(sva,
> +		      "DMA name=%s len=%zu => virt=%p iova=%" PRIx64,
> +		      name, len, esmp->esm_base, esmp->esm_addr);
> +
> +	return 0;
> +}
> +
  
Vijay Kumar Srivastava Sept. 3, 2021, 1:19 p.m. UTC | #3
Hi Chenbo,

>-----Original Message-----
>From: Xia, Chenbo <chenbo.xia@intel.com>
>Sent: Monday, August 30, 2021 4:22 PM
>To: Vijay Kumar Srivastava <vsrivast@xilinx.com>; dev@dpdk.org
>Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Vijay
>Kumar Srivastava <vsrivast@xilinx.com>
>Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization
>
>Hi Vijay,
>

[snip]

>> +sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
>> +		   size_t len, efsys_mem_t *esmp)
>> +{
>> +	void *mcdi_buf;
>> +	uint64_t mcdi_iova;
>> +	size_t mcdi_buff_size;
>> +	int ret;
>> +
>> +	mcdi_buff_size = RTE_ALIGN_CEIL(len, PAGE_SIZE);
>> +
>> +	sfc_vdpa_log_init(sva, "name=%s, len=%zu", name, len);
>> +
>> +	mcdi_buf = rte_zmalloc(name, mcdi_buff_size, PAGE_SIZE);
>> +	if (mcdi_buf == NULL) {
>> +		sfc_vdpa_err(sva, "cannot reserve memory for %s: len=%#x:
>%s",
>> +			     name, (unsigned int)len, rte_strerror(rte_errno));
>> +		return -ENOMEM;
>> +	}
>> +
>> +	/* IOVA address for MCDI would be re-calculated if mapping
>
>What is MCDI?

MCDI is a control interface between the driver and the firmware.
It is used by host drivers to configure the adapter and retrieve status.

>> +	 * using default IOVA would fail.
>> +	 * TODO: Earlier there was no way to get valid IOVA range.
>> +	 * Recently a patch has been submitted to get the IOVA range
>> +	 * using the ioctl VFIO_IOMMU_GET_INFO. This patch is available
>> +	 * in the kernel version >= 5.4. Support to get the default
>> +	 * IOVA address for MCDI buffer using available IOVA range
>> +	 * would be added later. Meanwhile default IOVA for MCDI buffer
>> +	 * is kept at high mem at 2TB. In case of overlap new available
>> +	 * addresses would be searched and same would be used.
>> +	 */
>> +	mcdi_iova = SFC_VDPA_DEFAULT_MCDI_IOVA;
>> +
>> +	do {
>> +		ret = rte_vfio_container_dma_map(sva->vfio_container_fd,
>> +						 (uint64_t)mcdi_buf,
>mcdi_iova,
>> +						 mcdi_buff_size);
>> +		if (ret == 0)
>> +			break;
>> +
>> +		mcdi_iova = mcdi_iova >> 1;
>> +		if (mcdi_iova < mcdi_buff_size)	{
>> +			sfc_vdpa_err(sva,
>> +				     "DMA mapping failed for MCDI : %s",
>> +				     rte_strerror(rte_errno));
>> +			return ret;
>> +		}
>> +
>> +	} while (ret < 0);
>
>Is this DMA region for some hardware-specific control msg?
>
>And how do you make sure this IOVA space you defined in this driver will not
>conflict with the IOVA space that vdpa device consumer (Most likely QEMU)
>defines (If QEMU, IOVA = guest physical address)

Currently the IOVA for the MCDI buffer is kept at very high memory, at 2TB.

To handle the IOVA overlap scenario, a patch is in progress and will be submitted soon.
In that patch, upon detecting an IOVA overlap, a new available IOVA is calculated and the MCDI buffer is remapped to it.
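
To illustrate, a minimal sketch of that remap path (sfc_vdpa_remap_mcdi() is
a hypothetical helper for this explanation, not part of this series; only
rte_vfio_container_dma_map()/_unmap() are existing DPDK API):

static int
sfc_vdpa_remap_mcdi(struct sfc_vdpa_adapter *sva, efsys_mem_t *esmp,
		    uint64_t new_iova)
{
	int ret;

	/* Drop the old mapping of the MCDI buffer. */
	ret = rte_vfio_container_dma_unmap(sva->vfio_container_fd,
					   (uint64_t)esmp->esm_base,
					   esmp->esm_addr,
					   sva->mcdi_buff_size);
	if (ret < 0)
		return ret;

	/* Re-map the same buffer at the newly calculated IOVA. */
	ret = rte_vfio_container_dma_map(sva->vfio_container_fd,
					 (uint64_t)esmp->esm_base, new_iova,
					 sva->mcdi_buff_size);
	if (ret == 0)
		esmp->esm_addr = new_iova;

	return ret;
}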

[snip]

Thanks,
Vijay
  
Chenbo Xia Sept. 6, 2021, 3:02 a.m. UTC | #4
Hi,

> -----Original Message-----
> From: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> Sent: Friday, September 3, 2021 9:20 PM
> To: Xia, Chenbo <chenbo.xia@intel.com>; dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Harpreet Singh
> Anand <hanand@xilinx.com>; Praveen Kumar Jain <praveenj@xilinx.com>
> Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization
> 
> 
> Hi Chenbo,
> 
> 
> [snip]
> 
> >> +sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
> >> +		   size_t len, efsys_mem_t *esmp)
> >> +{
> >> +	void *mcdi_buf;
> >> +	uint64_t mcdi_iova;
> >> +	size_t mcdi_buff_size;
> >> +	int ret;
> >> +
> >> +	mcdi_buff_size = RTE_ALIGN_CEIL(len, PAGE_SIZE);
> >> +
> >> +	sfc_vdpa_log_init(sva, "name=%s, len=%zu", name, len);
> >> +
> >> +	mcdi_buf = rte_zmalloc(name, mcdi_buff_size, PAGE_SIZE);
> >> +	if (mcdi_buf == NULL) {
> >> +		sfc_vdpa_err(sva, "cannot reserve memory for %s: len=%#x:
> >%s",
> >> +			     name, (unsigned int)len, rte_strerror(rte_errno));
> >> +		return -ENOMEM;
> >> +	}
> >> +
> >> +	/* IOVA address for MCDI would be re-calculated if mapping
> >
> >What is MCDI?
> 
> MCDI is a control interface between driver and firmware.
> It is used by the host drivers to configure the adapter and retrieve status.

Cool, thanks for the explanation.

> 
> >> +	 * using default IOVA would fail.
> >> +	 * TODO: Earlier there was no way to get valid IOVA range.
> >> +	 * Recently a patch has been submitted to get the IOVA range
> >> +	 * using the ioctl VFIO_IOMMU_GET_INFO. This patch is available
> >> +	 * in the kernel version >= 5.4. Support to get the default
> >> +	 * IOVA address for MCDI buffer using available IOVA range
> >> +	 * would be added later. Meanwhile default IOVA for MCDI buffer
> >> +	 * is kept at high mem at 2TB. In case of overlap new available
> >> +	 * addresses would be searched and same would be used.
> >> +	 */
> >> +	mcdi_iova = SFC_VDPA_DEFAULT_MCDI_IOVA;
> >> +
> >> +	do {
> >> +		ret = rte_vfio_container_dma_map(sva->vfio_container_fd,
> >> +						 (uint64_t)mcdi_buf,
> >mcdi_iova,
> >> +						 mcdi_buff_size);
> >> +		if (ret == 0)
> >> +			break;
> >> +
> >> +		mcdi_iova = mcdi_iova >> 1;
> >> +		if (mcdi_iova < mcdi_buff_size)	{
> >> +			sfc_vdpa_err(sva,
> >> +				     "DMA mapping failed for MCDI : %s",
> >> +				     rte_strerror(rte_errno));
> >> +			return ret;
> >> +		}
> >> +
> >> +	} while (ret < 0);
> >
> >Is this DMA region for some hardware-specific control msg?
> >
> >And how do you make sure this IOVA space you defined in this driver will not
> >conflict with the IOVA space that vdpa device consumer (Most likely QEMU)
> >defines (If QEMU, IOVA = guest physical address)
> 
> Currently IOVA for MCDI buffer is kept at very high mem at 2TB.

OK. That sounds like a work-around to me, but we can't assume the consumer will
not use that address range. And there is a security issue here; please see the
comment below.

> 
> To handle IOVA overlap detection scenario a patch is in progress which will be
> submitted soon.
> In that patch, upon IOVA overlap detection new available IOVA would be
> calculated and MCDI buffer would be remapped to new IOVA.

Let's say there is a malicious guest who knows the initial IOVA range that is set
up by your driver (even if it does not know, it can use tests to find out, so using
a static IOVA range in the host is more dangerous). It can use that address in any
DMA-able queue and make the device DMA into the vdpa app. I think it could cause a
security issue, as you let the guest easily write host memory.

For now I don't see a perfect solution except PASID (Process Address Space ID). IIRC,
we could let QEMU have a primary PASID and the vdpa app have a secondary PASID so that
the VM can't perform DMA to the vdpa app. But since it needs HW support, and the related
support in vfio is not mature, I don't think we are able to use that solution now.

Any solution you can think of for your HW?

Thanks,
Chenbo

> 
> [snip]
> 
> Thanks,
> Vijay
  
Vijay Kumar Srivastava Oct. 1, 2021, 5:31 p.m. UTC | #5
Hi Chenbo,

>-----Original Message-----
>From: Xia, Chenbo <chenbo.xia@intel.com>
>Sent: Monday, September 6, 2021 8:32 AM
>To: Vijay Kumar Srivastava <vsrivast@xilinx.com>; dev@dpdk.org
>Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Harpreet
>Singh Anand <hanand@xilinx.com>; Praveen Kumar Jain <praveenj@xilinx.com>
>Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization
>
>Hi,
>

[snip]

>> To handle IOVA overlap detection scenario a patch is in progress 
>> which will be submitted soon.
>> In that patch, upon IOVA overlap detection new available IOVA would 
>> be calculated and MCDI buffer would be remapped to new IOVA.
>Let's say there is a malicious guest who knows your initial IOVA range that is set
>up by your driver (even if it does not know, it can use tests to know. So use static
>IOVA range in host is more dangerous). 
The upcoming patch will handle the IOVA conflict scenario; with that patch a hardcoded IOVA will not be needed.
If a malicious guest tries to use the MCDI IOVA address, the vDPA driver will detect the IOVA overlap and remap the MCDI buffer to another available IOVA address.
This IOVA address is for the MCDI buffer, which is used for the control path.
Merely writing to the MCDI buffer does not imply that a malicious guest can send a control message to the NIC to modify the HW configuration.

>It can use that address in any DMA-able queue and make DMA into the vdpa app. I think it could cause some security issue
>as you let guest easily writing host memory.
Can you please elaborate on this?
In what scenarios can a host physical address be accessed by a malicious guest?

>For now I don't see a perfect solution except PASID(Process Address Space ID).
>IIRC, We could let QEMU have a primary PASID and vdpa app have a secondary
>PASID so that VM can't perform DMA to vdpa app. But since it needs HW support
>and related support in vfio is not mature, I don't think we are able to use that
>solution now.
>Any solution you can think of for your HW?
Yes, it can be used. Our next version of the HW will have PASID support.

Regards,
Vijay
  
Chenbo Xia Oct. 9, 2021, 3:06 a.m. UTC | #6
Hi Vijay,

> -----Original Message-----
> From: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> Sent: Saturday, October 2, 2021 1:32 AM
> To: Xia, Chenbo <chenbo.xia@intel.com>; dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Harpreet
> Singh Anand <hanand@xilinx.com>; Praveen Kumar Jain <praveenj@xilinx.com>
> Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization
> 
> Hi Chenbo,
> 
> 
> [snip]
> 
> >> To handle IOVA overlap detection scenario a patch is in progress
> >> which will be submitted soon.
> >> In that patch, upon IOVA overlap detection new available IOVA would
> >> be calculated and MCDI buffer would be remapped to new IOVA.
> >Let's say there is a malicious guest who knows your initial IOVA range
> >that is set up by your driver (even if it does not know, it can use tests
> >to know. So use static IOVA range in host is more dangerous).
> Upcoming patch will handle IOVA conflict scenario. With that patch
> hardcoded IOVA would not be needed.
> If malicious guest will try to use MCDI IOVA address then vDPA driver
> would detect IOVA overlap and would remap MCDI buffer to another available
> IOVA address.

Yes, I think I understand your solution of having the driver resolve the overlap problem.

> This IOVA address is for MCDI buffer which is used for the control path.
> Just by only writing to MCDI buffer does not imply that malicious guest
> can send any control message to NIC to modify HW configuration.
> 
> >It can use that address in any DMA-able queue and make DMA into the vdpa
> >app. I think it could cause some security issue
> >as you let guest easily writing host memory.
> Can you please elaborate on this ?
> In what scenarios host physical address can be accessed by malicious
> guest ?

As I have not reviewed the full series and am not familiar with your HW,
you can correct my understanding below:

I think your vdpa HW (let's say a VF) has two DMA regions: one in the guest (w/o vIOMMU)
and the other in the vdpa app. Both share the same IOVA address space, and we
don't want them to overlap. Let's say we can make sure no overlap will happen, and take
an example here: the guest DMA region's IOVA (GPA) range is 0x0000 to 0x1000 and the vdpa
app's is 0x1000 to 0x2000. A malicious guest could use a malicious driver to write 0x1500
into its virtio RX ring, so that the HW will DMA to that address when packets come. Then
the malicious guest has performed a DMA to host memory. Although the guest does not
know the IOVA range of the vdpa app, it can guess randomly to mount the attack.

Is there any way your HW/driver can prevent this from happening without PASID? Or do I
miss something here?

Thanks,
Chenbo

> 
> >For now I don't see a perfect solution except PASID(Process Address Space
> >ID).
> >IIRC, We could let QEMU have a primary PASID and vdpa app have a
> >secondary
> >PASID so that VM can't perform DMA to vdpa app. But since it needs HW
> >support
> >and related support in vfio is not mature, I don't think we are able to
> >use that
> >solution now.
> >Any solution you can think of for your HW?
> Yes, It can be used. Our next version of HW will have the PASID support.
> 
> Regards,
> Vijay
>
  
Vijay Kumar Srivastava Oct. 18, 2021, 10:06 a.m. UTC | #7
Hi Chenbo,

>-----Original Message-----
>From: Xia, Chenbo <chenbo.xia@intel.com>
>Sent: Saturday, October 9, 2021 8:36 AM
>To: Vijay Kumar Srivastava <vsrivast@xilinx.com>; dev@dpdk.org
>Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Harpreet
>Singh Anand <hanand@xilinx.com>; Praveen Kumar Jain <praveenj@xilinx.com>
>Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization
>
>Hi Vijay,
>

[Snip]

>I think your vdpa HW (let's say a VF) have two DMA regions: one in guest (w/o
>vIOMMU) and the other in vdpa app. Both share the same IOVA address space,
>and we don't want them overlap. Let's say we can make sure no overlap will
>happen and take an example here: guest DMA region's IOVA (GPA) range is
>0x0000 to 0x1000 and vdpa app's is 0x1000 to 0x2000. A malicious guest could
>use a malicious driver to write 0x1500 in its virtio RX ring, so that HW will DMA
>to that address when packets come. Then the malicious guest performed an
>DMA to host memory. Although the guest does not know IOVA range of vdpa
>app, he can randomly guess to do the attack.
>
>Any solution your HW/driver can prevent this from happening without PASID?
>Or do I miss something here ?

Rx packets will carry headers, making it highly unlikely that any well-formed MCDI data could be written to the IOVA address (of the MCDI buffer) for the FW to act on.
Writing to the buffer does not by itself issue an MCDI message. Even if an MCDI message were sent, the FW is resilient enough to identify the incorrect MCDI and will reject the message.

This would affect only the VF on which the malicious guest is present, as this MCDI buffer is specific to the corresponding VF.
So it won't affect any control path operation on any other VF or on the host.

For SW-assisted live migration, as implemented in the ifcvf vDPA driver, hard-coded IOVA addresses are used for the mediated vring. Could it have a similar issue?
  
Chenbo Xia Oct. 19, 2021, 2:16 a.m. UTC | #8
Hi Vijay,

> -----Original Message-----
> From: Vijay Kumar Srivastava <vsrivast@xilinx.com>
> Sent: Monday, October 18, 2021 6:06 PM
> To: Xia, Chenbo <chenbo.xia@intel.com>; dev@dpdk.org
> Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Harpreet Singh
> Anand <hanand@xilinx.com>; Praveen Kumar Jain <praveenj@xilinx.com>
> Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization
> 
> Hi Chenbo,
> 
> 
> [Snip]
> 
> >I think your vdpa HW (let's say a VF) have two DMA regions: one in guest (w/o
> >vIOMMU) and the other in vdpa app. Both share the same IOVA address space,
> >and we don't want them overlap. Let's say we can make sure no overlap will
> >happen and take an example here: guest DMA region's IOVA (GPA) range is
> >0x0000 to 0x1000 and vdpa app's is 0x1000 to 0x2000. A malicious guest could
> >use a malicious driver to write 0x1500 in its virtio RX ring, so that HW will
> >DMA to that address when packets come. Then the malicious guest performed an
> >DMA to host memory. Although the guest does not know IOVA range of vdpa
> >app, he can randomly guess to do the attack.
> >
> >Any solution your HW/driver can prevent this from happening without PASID?
> >Or do I miss something here ?
> 
> Rx packet will carry headers making highly unlikely any proper MCDI data can
> be written to the IOVA address (for MCDI buffer) to work with by the FW.
> Writing to the buffer does not imply to issue the MCDI message. Even if MCDI
> is sent then FW is resilient enough to identify the incorrect MCDI and will
> reject the message.
> 
> This is going to affect only to VF on which malicious guest is present, as
> this MCDI buffer is specific to the corresponding VF.
> So it won't affect any control path operation on the any other VF or host.

OK. So it's very hard to do the attack with the FW detection. But about 'won't affect
host', I think it depends on how you handle the DMA-ed control messages. Take a bad
example: if one DMA address holds a pointer and the malicious DMA sets that pointer
to NULL, it will segfault the program (I don't think this will happen in your driver;
it's just to help you understand my point). So please check that the control message
handling is robust.

And in the future, I would like to see this problem solved by PASID when your HW has
the support.

> 
> For SW assisted Live migration implemented in the ifcvf vDPA driver it uses
> hard coded IOVA addresses for mediated vring. Could it have similar issue ?

Good point. It will, and I think we may also need to check whether it will affect the host program,
or deprecate the feature later.

Thanks,
Chenbo
  
Vijay Kumar Srivastava Oct. 25, 2021, 6:11 a.m. UTC | #9
Hi Chenbo, 

>-----Original Message-----
>From: Xia, Chenbo <chenbo.xia@intel.com>
>Sent: Tuesday, October 19, 2021 7:47 AM
>To: Vijay Kumar Srivastava <vsrivast@xilinx.com>; dev@dpdk.org
>Cc: maxime.coquelin@redhat.com; andrew.rybchenko@oktetlabs.ru; Harpreet
>Singh Anand <hanand@xilinx.com>; Praveen Kumar Jain <praveenj@xilinx.com>
>Subject: RE: [PATCH 02/10] vdpa/sfc: add support for device initialization

[Snip]

>> Rx packet will carry headers making highly unlikely any proper MCDI
>> data can be written to the IOVA address (for MCDI buffer) to work with by
>> the FW.
>> Writing to the buffer does not imply to issue the MCDI message. Even
>> if MCDI is sent then FW is resilient enough to identify the incorrect
>> MCDI and will reject the message.
>>
>> This is going to affect only to VF on which malicious guest is
>> present, as this MCDI buffer is specific to the corresponding VF.
>> So it won't affect any control path operation on the any other VF or host.
>
>OK. So it's very hard to do attack with the FW detection. But about 'won't affect
>host', I think it depends on how you handle the DMA-ed control messages. Take
>a bad
>example: if one DMA address saves a pointer and the malicious DMA makes the
>pointer be NULL, it will segfaults the program (But I don't think this will happen
>in your driver, just help you understand my point). So please check the control
>messages handling is robust.
Yes, it is highly unlikely that it can affect the host.

>And in the future, I would like to see this problem solved by PASID when your
>HW has the support.
Yes. Sure.

Regards,
Vijay
  

Patch

diff --git a/doc/guides/vdpadevs/sfc.rst b/doc/guides/vdpadevs/sfc.rst
index 59f990b..abb5900 100644
--- a/doc/guides/vdpadevs/sfc.rst
+++ b/doc/guides/vdpadevs/sfc.rst
@@ -95,3 +95,9 @@  SFC vDPA PMD provides the following log types available for control:
   Matches a subset of per-port log types registered during runtime.
   A full name for a particular type may be obtained by appending a
   dot and a PCI device identifier (``XXXX:XX:XX.X``) to the prefix.
+
+- ``pmd.vdpa.sfc.mcdi`` (default level is **notice**)
+
+  Extra logging of the communication with the NIC's management CPU.
+  The format of the log is consumed by the netlogdecode cross-platform
+  tool. May be managed per-port, as explained above.
diff --git a/drivers/vdpa/sfc/meson.build b/drivers/vdpa/sfc/meson.build
index d916389..aac7c51 100644
--- a/drivers/vdpa/sfc/meson.build
+++ b/drivers/vdpa/sfc/meson.build
@@ -30,4 +30,7 @@  endforeach
 deps += ['common_sfc_efx', 'bus_pci']
 sources = files(
 	'sfc_vdpa.c',
+	'sfc_vdpa_hw.c',
+	'sfc_vdpa_mcdi.c',
+	'sfc_vdpa_ops.c',
 )
diff --git a/drivers/vdpa/sfc/sfc_vdpa.c b/drivers/vdpa/sfc/sfc_vdpa.c
index d8faaca..12e8d6e 100644
--- a/drivers/vdpa/sfc/sfc_vdpa.c
+++ b/drivers/vdpa/sfc/sfc_vdpa.c
@@ -232,6 +232,19 @@  struct sfc_vdpa_adapter *
 		goto fail_vfio_setup;
 	}
 
+	sfc_vdpa_log_init(sva, "hw init");
+	if (sfc_vdpa_hw_init(sva) != 0) {
+		sfc_vdpa_err(sva, "failed to init HW %s", pci_dev->name);
+		goto fail_hw_init;
+	}
+
+	sfc_vdpa_log_init(sva, "dev init");
+	sva->ops_data = sfc_vdpa_device_init(sva, SFC_VDPA_AS_VF);
+	if (sva->ops_data == NULL) {
+		sfc_vdpa_err(sva, "failed vDPA dev init %s", pci_dev->name);
+		goto fail_dev_init;
+	}
+
 	pthread_mutex_lock(&sfc_vdpa_adapter_list_lock);
 	TAILQ_INSERT_TAIL(&sfc_vdpa_adapter_list, sva, next);
 	pthread_mutex_unlock(&sfc_vdpa_adapter_list_lock);
@@ -240,6 +253,12 @@  struct sfc_vdpa_adapter *
 
 	return 0;
 
+fail_dev_init:
+	sfc_vdpa_hw_fini(sva);
+
+fail_hw_init:
+	sfc_vdpa_vfio_teardown(sva);
+
 fail_vfio_setup:
 fail_set_log_prefix:
 	rte_free(sva);
@@ -266,6 +285,10 @@  struct sfc_vdpa_adapter *
 	TAILQ_REMOVE(&sfc_vdpa_adapter_list, sva, next);
 	pthread_mutex_unlock(&sfc_vdpa_adapter_list_lock);
 
+	sfc_vdpa_device_fini(sva->ops_data);
+
+	sfc_vdpa_hw_fini(sva);
+
 	sfc_vdpa_vfio_teardown(sva);
 
 	rte_free(sva);
diff --git a/drivers/vdpa/sfc/sfc_vdpa.h b/drivers/vdpa/sfc/sfc_vdpa.h
index 3b77900..fb97258 100644
--- a/drivers/vdpa/sfc/sfc_vdpa.h
+++ b/drivers/vdpa/sfc/sfc_vdpa.h
@@ -11,14 +11,38 @@ 
 
 #include <rte_bus_pci.h>
 
+#include "sfc_efx.h"
+#include "sfc_efx_mcdi.h"
+#include "sfc_vdpa_debug.h"
 #include "sfc_vdpa_log.h"
+#include "sfc_vdpa_ops.h"
+
+#define SFC_VDPA_DEFAULT_MCDI_IOVA		0x200000000000
 
 /* Adapter private data */
 struct sfc_vdpa_adapter {
 	TAILQ_ENTRY(sfc_vdpa_adapter)	next;
+	/*
+	 * PMD setup and configuration is not thread safe. Since it is not
+	 * performance sensitive, it is better to guarantee thread-safety
+	 * and add device level lock. vDPA control operations which
+	 * change its state should acquire the lock.
+	 */
+	rte_spinlock_t			lock;
 	struct rte_pci_device		*pdev;
 	struct rte_pci_addr		pci_addr;
 
+	efx_family_t			family;
+	efx_nic_t			*nic;
+	rte_spinlock_t			nic_lock;
+
+	efsys_bar_t			mem_bar;
+
+	struct sfc_efx_mcdi		mcdi;
+	size_t				mcdi_buff_size;
+
+	uint32_t			max_queue_count;
+
 	char				log_prefix[SFC_VDPA_LOG_PREFIX_MAX];
 	uint32_t			logtype_main;
 
@@ -26,6 +50,7 @@  struct sfc_vdpa_adapter {
 	int				vfio_dev_fd;
 	int				vfio_container_fd;
 	int				iommu_group_num;
+	struct sfc_vdpa_ops_data	*ops_data;
 };
 
 uint32_t
@@ -36,5 +61,27 @@  struct sfc_vdpa_adapter {
 struct sfc_vdpa_adapter *
 sfc_vdpa_get_adapter_by_dev(struct rte_pci_device *pdev);
 
-#endif  /* _SFC_VDPA_H */
+int
+sfc_vdpa_hw_init(struct sfc_vdpa_adapter *sva);
+void
+sfc_vdpa_hw_fini(struct sfc_vdpa_adapter *sa);
 
+int
+sfc_vdpa_mcdi_init(struct sfc_vdpa_adapter *sva);
+void
+sfc_vdpa_mcdi_fini(struct sfc_vdpa_adapter *sva);
+
+int
+sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
+		   size_t len, efsys_mem_t *esmp);
+
+void
+sfc_vdpa_dma_free(struct sfc_vdpa_adapter *sva, efsys_mem_t *esmp);
+
+static inline struct sfc_vdpa_adapter *
+sfc_vdpa_adapter_by_dev_handle(void *dev_handle)
+{
+	return (struct sfc_vdpa_adapter *)dev_handle;
+}
+
+#endif  /* _SFC_VDPA_H */
diff --git a/drivers/vdpa/sfc/sfc_vdpa_debug.h b/drivers/vdpa/sfc/sfc_vdpa_debug.h
new file mode 100644
index 0000000..cfa8cc5
--- /dev/null
+++ b/drivers/vdpa/sfc/sfc_vdpa_debug.h
@@ -0,0 +1,21 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2020-2021 Xilinx, Inc.
+ */
+
+#ifndef _SFC_VDPA_DEBUG_H_
+#define _SFC_VDPA_DEBUG_H_
+
+#include <rte_debug.h>
+
+#ifdef RTE_LIBRTE_SFC_VDPA_DEBUG
+/* Avoid dependency from RTE_LOG_DP_LEVEL to be able to enable debug check
+ * in the driver only.
+ */
+#define SFC_VDPA_ASSERT(exp)			RTE_VERIFY(exp)
+#else
+/* If the driver debug is not enabled, follow DPDK debug/non-debug */
+#define SFC_VDPA_ASSERT(exp)			RTE_ASSERT(exp)
+#endif
+
+#endif /* _SFC_VDPA_DEBUG_H_ */
diff --git a/drivers/vdpa/sfc/sfc_vdpa_hw.c b/drivers/vdpa/sfc/sfc_vdpa_hw.c
new file mode 100644
index 0000000..83f3696
--- /dev/null
+++ b/drivers/vdpa/sfc/sfc_vdpa_hw.c
@@ -0,0 +1,322 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2020-2021 Xilinx, Inc.
+ */
+
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#include "efx.h"
+#include "sfc_vdpa.h"
+#include "sfc_vdpa_ops.h"
+
+extern uint32_t sfc_logtype_driver;
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE   (sysconf(_SC_PAGESIZE))
+#endif
+
+int
+sfc_vdpa_dma_alloc(struct sfc_vdpa_adapter *sva, const char *name,
+		   size_t len, efsys_mem_t *esmp)
+{
+	void *mcdi_buf;
+	uint64_t mcdi_iova;
+	size_t mcdi_buff_size;
+	int ret;
+
+	mcdi_buff_size = RTE_ALIGN_CEIL(len, PAGE_SIZE);
+
+	sfc_vdpa_log_init(sva, "name=%s, len=%zu", name, len);
+
+	mcdi_buf = rte_zmalloc(name, mcdi_buff_size, PAGE_SIZE);
+	if (mcdi_buf == NULL) {
+		sfc_vdpa_err(sva, "cannot reserve memory for %s: len=%#x: %s",
+			     name, (unsigned int)len, rte_strerror(rte_errno));
+		return -ENOMEM;
+	}
+
+	/* IOVA address for MCDI would be re-calculated if mapping
+	 * using default IOVA would fail.
+	 * TODO: Earlier there was no way to get valid IOVA range.
+	 * Recently a patch has been submitted to get the IOVA range
+	 * using the ioctl VFIO_IOMMU_GET_INFO. This patch is available
+	 * in the kernel version >= 5.4. Support to get the default
+	 * IOVA address for MCDI buffer using available IOVA range
+	 * would be added later. Meanwhile default IOVA for MCDI buffer
+	 * is kept at high mem at 2TB. In case of overlap new available
+	 * addresses would be searched and same would be used.
+	 */
+	mcdi_iova = SFC_VDPA_DEFAULT_MCDI_IOVA;
+
+	do {
+		ret = rte_vfio_container_dma_map(sva->vfio_container_fd,
+						 (uint64_t)mcdi_buf, mcdi_iova,
+						 mcdi_buff_size);
+		if (ret == 0)
+			break;
+
+		mcdi_iova = mcdi_iova >> 1;
+		if (mcdi_iova < mcdi_buff_size)	{
+			sfc_vdpa_err(sva,
+				     "DMA mapping failed for MCDI : %s",
+				     rte_strerror(rte_errno));
+			return ret;
+		}
+
+	} while (ret < 0);
+
+	esmp->esm_addr = mcdi_iova;
+	esmp->esm_base = mcdi_buf;
+	sva->mcdi_buff_size = mcdi_buff_size;
+
+	sfc_vdpa_info(sva,
+		      "DMA name=%s len=%zu => virt=%p iova=%" PRIx64,
+		      name, len, esmp->esm_base, esmp->esm_addr);
+
+	return 0;
+}
+
+void
+sfc_vdpa_dma_free(struct sfc_vdpa_adapter *sva, efsys_mem_t *esmp)
+{
+	int ret;
+
+	sfc_vdpa_log_init(sva, "name=%s", esmp->esm_mz->name);
+
+	ret = rte_vfio_container_dma_unmap(sva->vfio_container_fd,
+					   (uint64_t)esmp->esm_base,
+					   esmp->esm_addr, sva->mcdi_buff_size);
+	if (ret < 0)
+		sfc_vdpa_err(sva, "DMA unmap failed for MCDI : %s",
+			     rte_strerror(rte_errno));
+
+	sfc_vdpa_info(sva,
+		      "DMA free name=%s => virt=%p iova=%" PRIx64,
+		      esmp->esm_mz->name, esmp->esm_base, esmp->esm_addr);
+
+	rte_free((void *)(esmp->esm_base));
+
+	sva->mcdi_buff_size = 0;
+	memset(esmp, 0, sizeof(*esmp));
+}
+
+static int
+sfc_vdpa_mem_bar_init(struct sfc_vdpa_adapter *sva,
+		      const efx_bar_region_t *mem_ebrp)
+{
+	struct rte_pci_device *pci_dev = sva->pdev;
+	efsys_bar_t *ebp = &sva->mem_bar;
+	struct rte_mem_resource *res =
+		&pci_dev->mem_resource[mem_ebrp->ebr_index];
+
+	SFC_BAR_LOCK_INIT(ebp, pci_dev->name);
+	ebp->esb_rid = mem_ebrp->ebr_index;
+	ebp->esb_dev = pci_dev;
+	ebp->esb_base = res->addr;
+
+	return 0;
+}
+
+static void
+sfc_vdpa_mem_bar_fini(struct sfc_vdpa_adapter *sva)
+{
+	efsys_bar_t *ebp = &sva->mem_bar;
+
+	SFC_BAR_LOCK_DESTROY(ebp);
+	memset(ebp, 0, sizeof(*ebp));
+}
+
+static int
+sfc_vdpa_nic_probe(struct sfc_vdpa_adapter *sva)
+{
+	efx_nic_t *enp = sva->nic;
+	int rc;
+
+	rc = efx_nic_probe(enp, EFX_FW_VARIANT_DONT_CARE);
+	if (rc != 0)
+		sfc_vdpa_err(sva, "nic probe failed: %s", rte_strerror(rc));
+
+	return rc;
+}
+
+static int
+sfc_vdpa_estimate_resource_limits(struct sfc_vdpa_adapter *sva)
+{
+	efx_drv_limits_t limits;
+	int rc;
+	uint32_t evq_allocated;
+	uint32_t rxq_allocated;
+	uint32_t txq_allocated;
+	uint32_t max_queue_cnt;
+
+	memset(&limits, 0, sizeof(limits));
+
+	/* Request at least one Rx and Tx queue */
+	limits.edl_min_rxq_count = 1;
+	limits.edl_min_txq_count = 1;
+	/* Management event queue plus event queue for Tx/Rx queue */
+	limits.edl_min_evq_count =
+		1 + RTE_MAX(limits.edl_min_rxq_count, limits.edl_min_txq_count);
+
+	limits.edl_max_rxq_count = SFC_VDPA_MAX_QUEUE_PAIRS;
+	limits.edl_max_txq_count = SFC_VDPA_MAX_QUEUE_PAIRS;
+	limits.edl_max_evq_count = 1 + SFC_VDPA_MAX_QUEUE_PAIRS;
+
+	SFC_VDPA_ASSERT(limits.edl_max_evq_count >= limits.edl_min_rxq_count);
+	SFC_VDPA_ASSERT(limits.edl_max_rxq_count >= limits.edl_min_rxq_count);
+	SFC_VDPA_ASSERT(limits.edl_max_txq_count >= limits.edl_min_rxq_count);
+
+	/* Configure the minimum required resources needed for the
+	 * driver to operate, and the maximum desired resources that the
+	 * driver is capable of using.
+	 */
+	sfc_vdpa_log_init(sva, "set drv limit");
+	efx_nic_set_drv_limits(sva->nic, &limits);
+
+	sfc_vdpa_log_init(sva, "init nic");
+	rc = efx_nic_init(sva->nic);
+	if (rc != 0) {
+		sfc_vdpa_err(sva, "nic init failed: %s", rte_strerror(rc));
+		goto fail_nic_init;
+	}
+
+	/* Find resource dimensions assigned by firmware to this function */
+	rc = efx_nic_get_vi_pool(sva->nic, &evq_allocated, &rxq_allocated,
+				 &txq_allocated);
+	if (rc != 0) {
+		sfc_vdpa_err(sva, "vi pool get failed: %s", rte_strerror(rc));
+		goto fail_get_vi_pool;
+	}
+
+	/* It still may allocate more than maximum, ensure limit */
+	evq_allocated = RTE_MIN(evq_allocated, limits.edl_max_evq_count);
+	rxq_allocated = RTE_MIN(rxq_allocated, limits.edl_max_rxq_count);
+	txq_allocated = RTE_MIN(txq_allocated, limits.edl_max_txq_count);
+
+
+	max_queue_cnt = RTE_MIN(rxq_allocated, txq_allocated);
+	/* Subtract management EVQ not used for traffic */
+	max_queue_cnt = RTE_MIN(evq_allocated - 1, max_queue_cnt);
+
+	SFC_VDPA_ASSERT(max_queue_cnt > 0);
+
+	sva->max_queue_count = max_queue_cnt;
+
+	return 0;
+
+fail_get_vi_pool:
+	efx_nic_fini(sva->nic);
+fail_nic_init:
+	sfc_vdpa_log_init(sva, "failed: %s", rte_strerror(rc));
+	return rc;
+}
+
+int
+sfc_vdpa_hw_init(struct sfc_vdpa_adapter *sva)
+{
+	efx_bar_region_t mem_ebr;
+	efx_nic_t *enp;
+	int rc;
+
+	sfc_vdpa_log_init(sva, "entry");
+
+	sfc_vdpa_log_init(sva, "get family");
+	rc = sfc_efx_family(sva->pdev, &mem_ebr, &sva->family);
+	if (rc != 0)
+		goto fail_family;
+	sfc_vdpa_log_init(sva,
+			  "family is %u, membar is %u,"
+			  "function control window offset is %#" PRIx64,
+			  sva->family, mem_ebr.ebr_index, mem_ebr.ebr_offset);
+
+	sfc_vdpa_log_init(sva, "init mem bar");
+	rc = sfc_vdpa_mem_bar_init(sva, &mem_ebr);
+	if (rc != 0)
+		goto fail_mem_bar_init;
+
+	sfc_vdpa_log_init(sva, "create nic");
+	rte_spinlock_init(&sva->nic_lock);
+	rc = efx_nic_create(sva->family, (efsys_identifier_t *)sva,
+			    &sva->mem_bar, mem_ebr.ebr_offset,
+			    &sva->nic_lock, &enp);
+	if (rc != 0) {
+		sfc_vdpa_err(sva, "nic create failed: %s", rte_strerror(rc));
+		goto fail_nic_create;
+	}
+	sva->nic = enp;
+
+	sfc_vdpa_log_init(sva, "init mcdi");
+	rc = sfc_vdpa_mcdi_init(sva);
+	if (rc != 0) {
+		sfc_vdpa_err(sva, "mcdi init failed: %s", rte_strerror(rc));
+		goto fail_mcdi_init;
+	}
+
+	sfc_vdpa_log_init(sva, "probe nic");
+	rc = sfc_vdpa_nic_probe(sva);
+	if (rc != 0)
+		goto fail_nic_probe;
+
+	sfc_vdpa_log_init(sva, "reset nic");
+	rc = efx_nic_reset(enp);
+	if (rc != 0) {
+		sfc_vdpa_err(sva, "nic reset failed: %s", rte_strerror(rc));
+		goto fail_nic_reset;
+	}
+
+	sfc_vdpa_log_init(sva, "estimate resource limits");
+	rc = sfc_vdpa_estimate_resource_limits(sva);
+	if (rc != 0)
+		goto fail_estimate_rsrc_limits;
+
+	sfc_vdpa_log_init(sva, "done");
+
+	return 0;
+
+fail_estimate_rsrc_limits:
+fail_nic_reset:
+	efx_nic_unprobe(enp);
+
+fail_nic_probe:
+	sfc_vdpa_mcdi_fini(sva);
+
+fail_mcdi_init:
+	sfc_vdpa_log_init(sva, "destroy nic");
+	sva->nic = NULL;
+	efx_nic_destroy(enp);
+
+fail_nic_create:
+	sfc_vdpa_mem_bar_fini(sva);
+
+fail_mem_bar_init:
+fail_family:
+	sfc_vdpa_log_init(sva, "failed: %s", rte_strerror(rc));
+	return rc;
+}
+
+void
+sfc_vdpa_hw_fini(struct sfc_vdpa_adapter *sva)
+{
+	efx_nic_t *enp = sva->nic;
+
+	sfc_vdpa_log_init(sva, "entry");
+
+	sfc_vdpa_log_init(sva, "unprobe nic");
+	efx_nic_unprobe(enp);
+
+	sfc_vdpa_log_init(sva, "mcdi fini");
+	sfc_vdpa_mcdi_fini(sva);
+
+	sfc_vdpa_log_init(sva, "nic fini");
+	efx_nic_fini(enp);
+
+	sfc_vdpa_log_init(sva, "destroy nic");
+	sva->nic = NULL;
+	efx_nic_destroy(enp);
+
+	sfc_vdpa_mem_bar_fini(sva);
+}
diff --git a/drivers/vdpa/sfc/sfc_vdpa_log.h b/drivers/vdpa/sfc/sfc_vdpa_log.h
index 0a3d6ad..59af790 100644
--- a/drivers/vdpa/sfc/sfc_vdpa_log.h
+++ b/drivers/vdpa/sfc/sfc_vdpa_log.h
@@ -21,6 +21,9 @@ 
 /** Name prefix for the per-device log type used to report basic information */
 #define SFC_VDPA_LOGTYPE_MAIN_STR	SFC_VDPA_LOGTYPE_PREFIX "main"
 
+/** Device MCDI log type name prefix */
+#define SFC_VDPA_LOGTYPE_MCDI_STR	SFC_VDPA_LOGTYPE_PREFIX "mcdi"
+
 #define SFC_VDPA_LOG_PREFIX_MAX	32
 
 /* Log PMD message, automatically add prefix and \n */
diff --git a/drivers/vdpa/sfc/sfc_vdpa_mcdi.c b/drivers/vdpa/sfc/sfc_vdpa_mcdi.c
new file mode 100644
index 0000000..961d2d3
--- /dev/null
+++ b/drivers/vdpa/sfc/sfc_vdpa_mcdi.c
@@ -0,0 +1,74 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2020-2021 Xilinx, Inc.
+ */
+
+#include "sfc_efx_mcdi.h"
+
+#include "sfc_vdpa.h"
+#include "sfc_vdpa_debug.h"
+#include "sfc_vdpa_log.h"
+
+static sfc_efx_mcdi_dma_alloc_cb sfc_vdpa_mcdi_dma_alloc;
+static int
+sfc_vdpa_mcdi_dma_alloc(void *cookie, const char *name, size_t len,
+			efsys_mem_t *esmp)
+{
+	struct sfc_vdpa_adapter *sva = cookie;
+
+	return sfc_vdpa_dma_alloc(sva, name, len, esmp);
+}
+
+static sfc_efx_mcdi_dma_free_cb sfc_vdpa_mcdi_dma_free;
+static void
+sfc_vdpa_mcdi_dma_free(void *cookie, efsys_mem_t *esmp)
+{
+	struct sfc_vdpa_adapter *sva = cookie;
+
+	sfc_vdpa_dma_free(sva, esmp);
+}
+
+static sfc_efx_mcdi_sched_restart_cb sfc_vdpa_mcdi_sched_restart;
+static void
+sfc_vdpa_mcdi_sched_restart(void *cookie)
+{
+	RTE_SET_USED(cookie);
+}
+
+static sfc_efx_mcdi_mgmt_evq_poll_cb sfc_vdpa_mcdi_mgmt_evq_poll;
+static void
+sfc_vdpa_mcdi_mgmt_evq_poll(void *cookie)
+{
+	RTE_SET_USED(cookie);
+}
+
+static const struct sfc_efx_mcdi_ops sfc_vdpa_mcdi_ops = {
+	.dma_alloc	= sfc_vdpa_mcdi_dma_alloc,
+	.dma_free	= sfc_vdpa_mcdi_dma_free,
+	.sched_restart  = sfc_vdpa_mcdi_sched_restart,
+	.mgmt_evq_poll  = sfc_vdpa_mcdi_mgmt_evq_poll,
+
+};
+
+int
+sfc_vdpa_mcdi_init(struct sfc_vdpa_adapter *sva)
+{
+	uint32_t logtype;
+
+	sfc_vdpa_log_init(sva, "entry");
+
+	logtype = sfc_vdpa_register_logtype(&(sva->pdev->addr),
+					    SFC_VDPA_LOGTYPE_MCDI_STR,
+					    RTE_LOG_NOTICE);
+
+	return sfc_efx_mcdi_init(&sva->mcdi, logtype,
+				 sva->log_prefix, sva->nic,
+				 &sfc_vdpa_mcdi_ops, sva);
+}
+
+void
+sfc_vdpa_mcdi_fini(struct sfc_vdpa_adapter *sva)
+{
+	sfc_vdpa_log_init(sva, "entry");
+	sfc_efx_mcdi_fini(&sva->mcdi);
+}
diff --git a/drivers/vdpa/sfc/sfc_vdpa_ops.c b/drivers/vdpa/sfc/sfc_vdpa_ops.c
new file mode 100644
index 0000000..71696be
--- /dev/null
+++ b/drivers/vdpa/sfc/sfc_vdpa_ops.c
@@ -0,0 +1,129 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2020-2021 Xilinx, Inc.
+ */
+
+#include <rte_malloc.h>
+#include <rte_vdpa.h>
+#include <rte_vdpa_dev.h>
+#include <rte_vhost.h>
+
+#include "sfc_vdpa_ops.h"
+#include "sfc_vdpa.h"
+
+/* Dummy functions for mandatory vDPA ops to pass vDPA device registration.
+ * In subsequent patches these ops would be implemented.
+ */
+static int
+sfc_vdpa_get_queue_num(struct rte_vdpa_device *vdpa_dev, uint32_t *queue_num)
+{
+	RTE_SET_USED(vdpa_dev);
+	RTE_SET_USED(queue_num);
+
+	return -1;
+}
+
+static int
+sfc_vdpa_get_features(struct rte_vdpa_device *vdpa_dev, uint64_t *features)
+{
+	RTE_SET_USED(vdpa_dev);
+	RTE_SET_USED(features);
+
+	return -1;
+}
+
+static int
+sfc_vdpa_get_protocol_features(struct rte_vdpa_device *vdpa_dev,
+			       uint64_t *features)
+{
+	RTE_SET_USED(vdpa_dev);
+	RTE_SET_USED(features);
+
+	return -1;
+}
+
+static int
+sfc_vdpa_dev_config(int vid)
+{
+	RTE_SET_USED(vid);
+
+	return -1;
+}
+
+static int
+sfc_vdpa_dev_close(int vid)
+{
+	RTE_SET_USED(vid);
+
+	return -1;
+}
+
+static int
+sfc_vdpa_set_vring_state(int vid, int vring, int state)
+{
+	RTE_SET_USED(vid);
+	RTE_SET_USED(vring);
+	RTE_SET_USED(state);
+
+	return -1;
+}
+
+static int
+sfc_vdpa_set_features(int vid)
+{
+	RTE_SET_USED(vid);
+
+	return -1;
+}
+
+static struct rte_vdpa_dev_ops sfc_vdpa_ops = {
+	.get_queue_num = sfc_vdpa_get_queue_num,
+	.get_features = sfc_vdpa_get_features,
+	.get_protocol_features = sfc_vdpa_get_protocol_features,
+	.dev_conf = sfc_vdpa_dev_config,
+	.dev_close = sfc_vdpa_dev_close,
+	.set_vring_state = sfc_vdpa_set_vring_state,
+	.set_features = sfc_vdpa_set_features,
+};
+
+struct sfc_vdpa_ops_data *
+sfc_vdpa_device_init(void *dev_handle, enum sfc_vdpa_context context)
+{
+	struct sfc_vdpa_ops_data *ops_data;
+	struct rte_pci_device *pci_dev;
+
+	/* Create vDPA ops context */
+	ops_data = rte_zmalloc("vdpa", sizeof(struct sfc_vdpa_ops_data), 0);
+	if (ops_data == NULL)
+		return NULL;
+
+	ops_data->vdpa_context = context;
+	ops_data->dev_handle = dev_handle;
+
+	pci_dev = sfc_vdpa_adapter_by_dev_handle(dev_handle)->pdev;
+
+	/* Register vDPA Device */
+	sfc_vdpa_log_init(dev_handle, "register vDPA device");
+	ops_data->vdpa_dev =
+		rte_vdpa_register_device(&pci_dev->device, &sfc_vdpa_ops);
+	if (ops_data->vdpa_dev == NULL) {
+		sfc_vdpa_err(dev_handle, "vDPA device registration failed");
+		goto fail_register_device;
+	}
+
+	ops_data->state = SFC_VDPA_STATE_INITIALIZED;
+
+	return ops_data;
+
+fail_register_device:
+	rte_free(ops_data);
+	return NULL;
+}
+
+void
+sfc_vdpa_device_fini(struct sfc_vdpa_ops_data *ops_data)
+{
+	rte_vdpa_unregister_device(ops_data->vdpa_dev);
+
+	rte_free(ops_data);
+}
diff --git a/drivers/vdpa/sfc/sfc_vdpa_ops.h b/drivers/vdpa/sfc/sfc_vdpa_ops.h
new file mode 100644
index 0000000..817b302
--- /dev/null
+++ b/drivers/vdpa/sfc/sfc_vdpa_ops.h
@@ -0,0 +1,36 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2020-2021 Xilinx, Inc.
+ */
+
+#ifndef _SFC_VDPA_OPS_H
+#define _SFC_VDPA_OPS_H
+
+#include <rte_vdpa.h>
+
+#define SFC_VDPA_MAX_QUEUE_PAIRS		1
+
+enum sfc_vdpa_context {
+	SFC_VDPA_AS_PF = 0,
+	SFC_VDPA_AS_VF
+};
+
+enum sfc_vdpa_state {
+	SFC_VDPA_STATE_UNINITIALIZED = 0,
+	SFC_VDPA_STATE_INITIALIZED,
+	SFC_VDPA_STATE_NSTATES
+};
+
+struct sfc_vdpa_ops_data {
+	void				*dev_handle;
+	struct rte_vdpa_device		*vdpa_dev;
+	enum sfc_vdpa_context		vdpa_context;
+	enum sfc_vdpa_state		state;
+};
+
+struct sfc_vdpa_ops_data *
+sfc_vdpa_device_init(void *adapter, enum sfc_vdpa_context context);
+void
+sfc_vdpa_device_fini(struct sfc_vdpa_ops_data *ops_data);
+
+#endif /* _SFC_VDPA_OPS_H */