[2/4] vfio: add VFIO IOMMUFD support

Message ID 20231222194453.3049693-3-beilei.xing@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series add VFIO IOMMUFD/CDEV support |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Xing, Beilei Dec. 22, 2023, 7:44 p.m. UTC
  From: Beilei Xing <beilei.xing@intel.com>

VFIO IOMMUFD is a new component added to work together with IOMMUFD.
IOMMUFD has no impact on the existing VFIO Container/Group
interface, while the latest IOMMU features (e.g. PASID/SSID) may
only be available through the VFIO IOMMUFD/CDEV interface.

This patch exposes setup/release vfio device functions using the VFIO
IOMMUFD/CDEV interface.

Signed-off-by: Beilei Xing <beilei.xing@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
---
 lib/eal/include/rte_vfio.h       |  55 +++++
 lib/eal/linux/eal_vfio.h         |   3 +
 lib/eal/linux/eal_vfio_iommufd.c | 385 +++++++++++++++++++++++++++++++
 lib/eal/linux/meson.build        |   1 +
 lib/eal/version.map              |   2 +
 5 files changed, 446 insertions(+)
 create mode 100644 lib/eal/linux/eal_vfio_iommufd.c
  

Comments

Stephen Hemminger Dec. 22, 2023, 5:17 p.m. UTC | #1
On Fri, 22 Dec 2023 19:44:51 +0000
beilei.xing@intel.com wrote:

> diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
> index 22832afd0f..7a9b26b0f7 100644
> --- a/lib/eal/include/rte_vfio.h
> +++ b/lib/eal/include/rte_vfio.h
> @@ -17,6 +17,8 @@ extern "C" {
>  #include <stdbool.h>
>  #include <stdint.h>
>  
> +#include <rte_compat.h>
> +
>  /*
>   * determine if VFIO is present on the system
>   */
> @@ -28,6 +30,9 @@ extern "C" {
>  #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
>  #define HAVE_VFIO_DEV_REQ_INTERFACE
>  #endif /* kernel version >= 4.0.0 */
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
> +#define VFIO_IOMMUFD_PRESENT
> +#endif /* kernel version >= 6.6.0 */
>  #endif /* RTE_EAL_VFIO */

Depending on the kernel version macro is a mistake because many enterprise
distros backport features without changing the kernel version.
Also, it means the build and target machines have to run the same kernel version.
  
Xing, Beilei Dec. 25, 2023, 6:30 a.m. UTC | #2
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Saturday, December 23, 2023 1:17 AM
> To: Xing, Beilei <beilei.xing@intel.com>
> Cc: Burakov, Anatoly <anatoly.burakov@intel.com>; dev@dpdk.org;
> thomas@monjalon.net; ferruh.yigit@amd.com; Richardson, Bruce
> <bruce.richardson@intel.com>; chenbox@nvidia.com; Cao, Yahui
> <yahui.cao@intel.com>
> Subject: Re: [PATCH 2/4] vfio: add VFIO IOMMUFD support
> 
> On Fri, 22 Dec 2023 19:44:51 +0000
> beilei.xing@intel.com wrote:
> 
> > diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
> > index 22832afd0f..7a9b26b0f7 100644
> > --- a/lib/eal/include/rte_vfio.h
> > +++ b/lib/eal/include/rte_vfio.h
> > @@ -17,6 +17,8 @@ extern "C" {
> >  #include <stdbool.h>
> >  #include <stdint.h>
> >
> > +#include <rte_compat.h>
> > +
> >  /*
> >   * determine if VFIO is present on the system
> >   */
> > @@ -28,6 +30,9 @@ extern "C" {
> >  #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)  #define
> > HAVE_VFIO_DEV_REQ_INTERFACE  #endif /* kernel version >= 4.0.0 */
> > +#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0) #define
> > +VFIO_IOMMUFD_PRESENT #endif /* kernel version >= 6.6.0 */
> >  #endif /* RTE_EAL_VFIO */
> 
> Depending on kernel version macro is a mistake because many enterprise
> distro's backport features and do not change kernel version.

Makes sense. We defined VFIO_IOMMUFD_PRESENT with reference to
VFIO_PRESENT. Do you have a suggestion for this point? Thanks a lot.

> Also, it means the build and target machine have to be same kernel version.
  

Patch

diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 22832afd0f..7a9b26b0f7 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -17,6 +17,8 @@  extern "C" {
 #include <stdbool.h>
 #include <stdint.h>
 
+#include <rte_compat.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -28,6 +30,9 @@  extern "C" {
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
 #define HAVE_VFIO_DEV_REQ_INTERFACE
 #endif /* kernel version >= 4.0.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
 #endif /* RTE_EAL_VFIO */
 
 #ifdef VFIO_PRESENT
@@ -42,6 +47,10 @@  extern "C" {
 #define VFIO_NOIOMMU_MODE      \
 	"/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
 
+#ifdef VFIO_IOMMUFD_PRESENT
+#define VFIO_CDEV_CLASS_DIR "/sys/class/vfio-dev"
+#endif
+
 /* NOIOMMU is defined from kernel version 4.5 onwards */
 #ifdef VFIO_NOIOMMU_IOMMU
 #define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
@@ -137,6 +146,33 @@  struct vfio_device_info;
 int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info);
 
+/**
+ * Setup iommufd_cfg for the device identified by its address.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD.
+ *
+ * @param sysfs_base
+ *   sysfs path prefix.
+ *
+ * @param dev_addr
+ *   device location.
+ *
+ * @param vfio_dev_fd
+ *   VFIO fd.
+ *
+ * @param device_info
+ *   Device information.
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ *   >1 if the device cannot be managed this way.
+ */
+__rte_experimental
+int rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+				  int *vfio_dev_fd, struct vfio_device_info *device_info);
+
 /**
  * Release a device mapped to a VFIO-managed I/O MMU group.
  *
@@ -158,6 +194,25 @@  int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
  */
 int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
 
+/**
+ * Release a device mapped to a VFIO-iommufd-managed I/O MMU group.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD.
+ *
+ * @param dev_addr
+ *   device location.
+ *
+ * @param fd
+ *   VFIO fd.
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ */
+__rte_experimental
+int rte_vfio_iommufd_release_device(const char *dev_addr, int fd);
+
 /**
  * Enable a VFIO-related kmod.
  *
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 23a787ad20..c94409e828 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -17,6 +17,9 @@ 
 #else
 #pragma message("VFIO configured but not supported by this kernel, disabling.")
 #endif /* kernel version >= 3.6.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
 #endif /* RTE_EAL_VFIO */
 
 #ifdef VFIO_PRESENT
diff --git a/lib/eal/linux/eal_vfio_iommufd.c b/lib/eal/linux/eal_vfio_iommufd.c
new file mode 100644
index 0000000000..02996a588a
--- /dev/null
+++ b/lib/eal/linux/eal_vfio_iommufd.c
@@ -0,0 +1,385 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+#ifdef VFIO_IOMMUFD_PRESENT
+#include <linux/iommufd.h>
+#include "eal_iommufd.h"
+
+#define VFIO_IOMMUFD_MEM_EVENT_CLB_NAME "vfio_iommufd_mem_event_clb"
+
+/*
+ * Context handed to the memseg walk callbacks in this file through their
+ * opaque "arg" pointer; both fields are forwarded to iommufd_dma_mem_map().
+ */
+struct ioas_info {
+	/* iommufd file descriptor */
+	int iommufd;
+	/* IOAS id passed along with every mapping request */
+	uint32_t ioas_id;
+};
+
+/*
+ * Bind a VFIO device fd to the default iommufd and attach it to the
+ * default IOAS.
+ *
+ * Returns 0 on success, otherwise the non-zero ioctl() result of the
+ * step (bind or attach) that failed.
+ */
+static int
+vfio_iommufd_add_device(const char *dev_addr, int vfio_dev_fd)
+{
+	struct iommufd_config *cfg = default_iommufd_cfg;
+	struct vfio_device_bind_iommufd bind_req = {
+		.argsz = sizeof(bind_req),
+		.flags = 0,
+		.iommufd = cfg->iommufd,
+	};
+	struct vfio_device_attach_iommufd_pt attach_req = {
+		.argsz = sizeof(attach_req),
+		.flags = 0,
+		.pt_id = cfg->ioas_id,
+	};
+	int rc;
+
+	/* step 1: bind the device to the iommufd */
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind_req);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot bind to iommufd\n", dev_addr);
+		return rc;
+	}
+
+	/* step 2: attach the bound device to the IOAS */
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_req);
+	if (rc != 0) {
+		RTE_LOG(ERR, EAL, "Device %s cannot attach to ioas\n", dev_addr);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * rte_memseg_contig_walk() callback: map one contiguous run of "len"
+ * bytes starting at "ms" through iommufd_dma_mem_map(). Segments that
+ * belong to an external memseg list are skipped (returns 0).
+ */
+static int
+vfio_iommufd_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+			size_t len, void *arg)
+{
+	const struct ioas_info *ctx = arg;
+
+	if (!msl->external)
+		return iommufd_dma_mem_map(ctx->iommufd, ctx->ioas_id,
+					   ms->addr_64, ms->iova, len, 1);
+
+	return 0;
+}
+
+/*
+ * rte_memseg_walk() callback: map a single memseg through
+ * iommufd_dma_mem_map(), unless the segment must be skipped.
+ * Skipped segments report success (0).
+ */
+static int
+vfio_iommufd_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+		 void *arg)
+{
+	const struct ioas_info *ctx = arg;
+	int skip;
+
+	/* external memory that isn't a heap, or a segment without a valid IOVA */
+	skip = (msl->external && !msl->heap) || ms->iova == RTE_BAD_IOVA;
+	/* in IOVA-as-VA mode the internal segments have already been mapped */
+	skip = skip || (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA);
+	if (skip)
+		return 0;
+
+	return iommufd_dma_mem_map(ctx->iommufd, ctx->ioas_id, ms->addr_64,
+				   ms->iova, ms->len, 1);
+}
+
+/*
+ * Map the memsegs known to EAL for the given iommufd/IOAS pair.
+ *
+ * Returns 0 on success or the first non-zero value reported by a walk.
+ */
+static int
+vfio_iommufd_dma_map(int iommufd, uint32_t ioasid)
+{
+	struct ioas_info ctx;
+	int rc;
+
+	ctx.iommufd = iommufd;
+	ctx.ioas_id = ioasid;
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		/* IOVA as VA: contiguous chunks can be mapped in one go
+		 * instead of page-by-page.
+		 */
+		rc = rte_memseg_contig_walk(vfio_iommufd_map_contig, &ctx);
+		if (rc != 0)
+			return rc;
+		/* the contig walk skipped the external segments, so the
+		 * per-segment walk below still has to run.
+		 */
+	}
+
+	return rte_memseg_walk(vfio_iommufd_map, &ctx);
+}
+
+/*
+ * Memory event callback: forwards the affected range to
+ * iommufd_dma_mem_map() on the default iommufd/IOAS. The last argument
+ * of that helper is 1 for RTE_MEM_EVENT_ALLOC and 0 for any other event.
+ * Results of the individual calls are not checked.
+ */
+static void
+vfio_iommufd_mem_event_callback(enum rte_mem_event type, const void *addr,
+				size_t len, void *arg __rte_unused)
+{
+	const int do_map = (type == RTE_MEM_EVENT_ALLOC) ? 1 : 0;
+	struct rte_memseg_list *msl = rte_mem_virt2memseg_list(addr);
+	struct rte_memseg *seg;
+	size_t done;
+
+	/* IOVA as VA on internal memory: IOVA equals VA, go page by page */
+	if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
+		const uint64_t pgsz = msl->page_sz;
+		uint64_t va = (uint64_t)(uintptr_t)addr;
+
+		/* keep DMA map/unmap granularity at memseg (page) size */
+		for (done = 0; done < len; done += pgsz, va += pgsz)
+			iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+					    default_iommufd_cfg->ioas_id,
+					    va, va, pgsz, do_map);
+		return;
+	}
+
+	/* memsegs are contiguous in memory, so step through them in order */
+	seg = rte_mem_virt2memseg(addr, msl);
+	for (done = 0; done < len; done += seg->len, ++seg) {
+		/* some memory segments may have invalid IOVA */
+		if (seg->iova == RTE_BAD_IOVA) {
+			RTE_LOG(DEBUG, EAL,
+				"Memory segment at %p has bad IOVA, skipping\n",
+				seg->addr);
+			continue;
+		}
+		iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+				    default_iommufd_cfg->ioas_id,
+				    seg->addr_64, seg->iova, seg->len, do_map);
+	}
+}
+
+/*
+ * Locate and open the VFIO character device that belongs to dev_addr.
+ *
+ * The first "vfio*" entry under <sysfs_base>/<dev_addr>/vfio-dev names the
+ * cdev; its major:minor is read from the entry's "dev" attribute, then
+ * /dev/vfio/devices/<entry> is opened and checked to be a character device
+ * carrying that same device number.
+ *
+ * Returns the open device fd on success, -1 on any failure. No descriptor
+ * is left open on failure.
+ */
+static int
+vfio_iommufd_get_fd(const char *sysfs_base, const char *dev_addr)
+{
+	char vfio_cdev_path[PATH_MAX];
+	char vfio_path[PATH_MAX];
+	char dirname[PATH_MAX];
+	int vfio_dev_fd;
+	struct dirent *dent;
+	unsigned int major, minor;
+	struct stat st;
+	dev_t cdev;
+	DIR *dir;
+	FILE *f;
+	int ret = -1;
+
+	snprintf(dirname, sizeof(dirname), "%s/%s/vfio-dev",
+		 sysfs_base, dev_addr);
+
+	dir = opendir(dirname);
+	if (dir == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	/* pick the first directory entry whose name starts with "vfio" */
+	while ((dent = readdir(dir)) != NULL) {
+		if (!strncmp(dent->d_name, "vfio", 4))
+			break;
+	}
+
+	/* dent is dereferenced below, so a missing entry must stop here */
+	if (dent == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): no vfio cdev entry found in %s\n",
+			__func__, dirname);
+		goto err_fopen;
+	}
+
+	snprintf(vfio_cdev_path, sizeof(vfio_cdev_path),
+		 "%s/%s/vfio-dev/%s/dev", sysfs_base,
+		 dev_addr, dent->d_name);
+
+	f = fopen(vfio_cdev_path, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+			__func__);
+		goto err_fopen;
+	}
+
+	if (fscanf(f, "%u:%u", &major, &minor) != 2) {
+		RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+			__func__);
+		goto err_fscanf;
+	}
+
+	cdev = makedev(major, minor);
+
+	snprintf(vfio_path, sizeof(vfio_path), "/dev/vfio/devices/%s", dent->d_name);
+	vfio_dev_fd = open(vfio_path, O_RDWR);
+	if (vfio_dev_fd == -1) {
+		RTE_LOG(ERR, EAL, "%s(): can't open %s: %s\n",
+			__func__, vfio_path, strerror(errno));
+		goto err_fscanf;
+	}
+
+	/* the opened node must be the char device advertised in sysfs */
+	if (fstat(vfio_dev_fd, &st) || !S_ISCHR(st.st_mode) ||
+	    (cdev != 0 && st.st_rdev != cdev)) {
+		RTE_LOG(ERR, EAL, "%s(): vfio char device is not matched\n",
+			__func__);
+		/* do not hand out (or leak) a descriptor for the wrong device */
+		close(vfio_dev_fd);
+		goto err_fscanf;
+	}
+
+	ret = vfio_dev_fd;
+
+err_fscanf:
+	fclose(f);
+err_fopen:
+	closedir(dir);
+	return ret;
+}
+
+/*
+ * Open the VFIO cdev of dev_addr, bind/attach it to the default
+ * iommufd/IOAS, perform the one-time DMA mapping and memory event callback
+ * registration (primary process only, while dma_init is still unset) and
+ * fetch the device info.
+ *
+ * On success returns 0 and leaves the open device fd in *vfio_dev_fd.
+ * On failure returns -1; the device fd is closed exactly once and
+ * *vfio_dev_fd is set to -1.
+ */
+int
+rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+			      int *vfio_dev_fd, struct vfio_device_info *device_info)
+{
+	struct iommufd_config *iommufd_cfg;
+	int iommufd;
+	uint32_t ioas_id;
+	int ret = 0;
+	const struct internal_config *internal_conf =
+		eal_get_internal_configuration();
+
+	iommufd_cfg = default_iommufd_cfg;
+	iommufd = iommufd_cfg->iommufd;
+	ioas_id = iommufd_cfg->ioas_id;
+
+	*vfio_dev_fd = vfio_iommufd_get_fd(sysfs_base, dev_addr);
+	if (*vfio_dev_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to get device fd for device %s\n", dev_addr);
+		return -1;
+	}
+
+	if (vfio_iommufd_add_device(dev_addr, *vfio_dev_fd)) {
+		RTE_LOG(ERR, EAL, "Failed to add device %s to iommufd\n", dev_addr);
+		ret = -1;
+		goto err_add_dev;
+	}
+
+	if (!iommufd_cfg->dma_init &&
+	    internal_conf->process_type == RTE_PROC_PRIMARY &&
+	    iommufd != -1) {
+		/* lock memory hotplug before mapping and release it
+		 * after registering callback, to prevent races
+		 */
+		rte_mcfg_mem_read_lock();
+		ret = vfio_iommufd_dma_map(iommufd, ioas_id);
+		if (ret) {
+			RTE_LOG(ERR, EAL,
+				"%s DMA remapping failed, error "
+				"%i (%s)\n",
+				dev_addr, errno, strerror(errno));
+			rte_mcfg_mem_read_unlock();
+			ret = -1;
+			goto err_dma_map;
+		}
+
+		/* register callback for mem events */
+		ret = rte_mem_event_callback_register(
+			VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+			vfio_iommufd_mem_event_callback, NULL);
+
+		/* unlock memory hotplug */
+		rte_mcfg_mem_read_unlock();
+
+		if (ret && rte_errno != ENOTSUP) {
+			RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+			ret = -1;
+			goto err_dma_map;
+		}
+		if (ret)
+			RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+		else
+			RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
+
+		iommufd_cfg->dma_init = true;
+	}
+
+	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "%s cannot get device info, "
+			"error %i (%s)\n", dev_addr, errno,
+			strerror(errno));
+		ret = -1;
+		goto err_dma_map;
+	}
+
+	return 0;
+
+err_dma_map:
+	/* rte_vfio_iommufd_release_device() closes the fd itself once the
+	 * detach succeeds; close it here only when the release failed, so
+	 * the descriptor is never closed twice.
+	 */
+	if (rte_vfio_iommufd_release_device(dev_addr, *vfio_dev_fd) == 0) {
+		*vfio_dev_fd = -1;
+		return ret;
+	}
+err_add_dev:
+	close(*vfio_dev_fd);
+	/* do not leave a stale descriptor number with the caller */
+	*vfio_dev_fd = -1;
+	return ret;
+}
+
+/*
+ * Detach the device from its IOAS and, when that succeeds, close the
+ * device fd and unregister the VFIO iommufd memory event callback.
+ * Runs with the memory hotplug read lock held.
+ *
+ * Returns the result of the detach ioctl(): 0 on success, non-zero on
+ * failure, in which case the fd is left open and the callback untouched.
+ */
+int
+rte_vfio_iommufd_release_device(const char *dev_addr, int vfio_dev_fd)
+{
+	struct vfio_device_detach_iommufd_pt detach_req = {
+		.argsz = sizeof(detach_req),
+		.flags = 0,
+	};
+	int rc;
+
+	rte_mcfg_mem_read_lock();
+
+	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_req);
+	if (rc == 0) {
+		close(vfio_dev_fd);
+		rte_mem_event_callback_unregister(VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+						  NULL);
+	} else {
+		RTE_LOG(ERR, EAL, "Device %s cannot detach from iommufd\n", dev_addr);
+	}
+
+	rte_mcfg_mem_read_unlock();
+	return rc;
+}
+
+#else
+/* Stub used when VFIO_IOMMUFD_PRESENT is not defined: always fails. */
+int
+rte_vfio_iommufd_setup_device(__rte_unused const char *sysfs_base,
+			      __rte_unused const char *dev_addr,
+			      __rte_unused int *vfio_dev_fd,
+			      __rte_unused struct vfio_device_info *device_info)
+{
+	const int unsupported = -1;
+
+	return unsupported;
+}
+
+/* Stub used when VFIO_IOMMUFD_PRESENT is not defined: always fails. */
+int
+rte_vfio_iommufd_release_device(__rte_unused const char *dev_addr,
+				__rte_unused int vfio_dev_fd)
+{
+	const int unsupported = -1;
+
+	return unsupported;
+}
+
+#endif /* VFIO_IOMMUFD_PRESENT */
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index 8081087584..bf246e64c9 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@  sources += files(
         'eal_thread.c',
         'eal_timer.c',
         'eal_vfio.c',
+	'eal_vfio_iommufd.c',
         'eal_iommufd.c',
         'eal_vfio_mp_sync.c',
 )
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 30e66a7267..9c1e70feca 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -396,6 +396,8 @@  EXPERIMENTAL {
 
 	rte_iommufd_enable; # WINDOWS_NO_EXPORT
 	rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
+	rte_vfio_iommufd_release_device; # WINDOWS_NO_EXPORT
+	rte_vfio_iommufd_setup_device; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {