[1/4] iommufd: add IOMMUFD support

Message ID 20231222194453.3049693-2-beilei.xing@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series add VFIO IOMMUFD/CDEV support |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Xing, Beilei Dec. 22, 2023, 7:44 p.m. UTC
  From: Yahui Cao <yahui.cao@intel.com>

IOMMUFD is a new standalone IOMMU subsystem introduced in Linux.

Linux now includes multiple device-passthrough frameworks (e.g. VFIO and
vDPA), and each of those frameworks implements its own logic for managing
I/O page tables, which is hard to scale to modern IOMMU features like
PASID, I/O page faults and IOMMU dirty page tracking. The goal of IOMMUFD
is to let Linux subsystems like VFIO and vDPA consume a unified IOMMU
framework.

This patch exports a basic enable function, a default isolation domain and
a per-IOMMUFD DMA mapping function. An IOMMUFD consumer should use the
default isolation domain and the DMA mapping function when user-initiated
DMA is required.

Signed-off-by: Yahui Cao <yahui.cao@intel.com>
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
---
 config/meson.build            |   3 +
 config/rte_config.h           |   1 +
 lib/eal/include/rte_iommufd.h |  73 ++++++++++++++
 lib/eal/linux/eal.c           |  22 ++++
 lib/eal/linux/eal_iommufd.c   | 183 ++++++++++++++++++++++++++++++++++
 lib/eal/linux/eal_iommufd.h   |  43 ++++++++
 lib/eal/linux/meson.build     |   1 +
 lib/eal/version.map           |   3 +
 8 files changed, 329 insertions(+)
 create mode 100644 lib/eal/include/rte_iommufd.h
 create mode 100644 lib/eal/linux/eal_iommufd.c
 create mode 100644 lib/eal/linux/eal_iommufd.h
  

Patch

diff --git a/config/meson.build b/config/meson.build
index a9ccd56deb..93c63984c8 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -442,6 +442,9 @@  install_headers(['rte_config.h'],
 # enable VFIO only if it is linux OS
 dpdk_conf.set('RTE_EAL_VFIO', is_linux)
 
+# enable IOMMUFD only if it is linux OS
+dpdk_conf.set('RTE_EAL_IOMMUFD', is_linux)
+
 # specify -D_GNU_SOURCE unconditionally
 add_project_arguments('-D_GNU_SOURCE', language: 'c')
 
diff --git a/config/rte_config.h b/config/rte_config.h
index da265d7dd2..25a6dccd8f 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -38,6 +38,7 @@ 
 #define RTE_MAX_TAILQ 32
 #define RTE_LOG_DP_LEVEL RTE_LOG_INFO
 #define RTE_MAX_VFIO_CONTAINERS 64
+#define RTE_MAX_IOMMUFD_FD 1
 
 /* bsd module defines */
 #define RTE_CONTIGMEM_MAX_NUM_BUFS 64
diff --git a/lib/eal/include/rte_iommufd.h b/lib/eal/include/rte_iommufd.h
new file mode 100644
index 0000000000..ac42713018
--- /dev/null
+++ b/lib/eal/include/rte_iommufd.h
@@ -0,0 +1,73 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef _RTE_IOMMUFD_H_
+#define _RTE_IOMMUFD_H_
+
+/**
+ * @file
+ * RTE IOMMUFD. This library provides various IOMMUFD related utility functions.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_compat.h>
+/*
+ * determine if IOMMUFD is present on the system
+ */
+#if !defined(IOMMUFD_PRESENT) && defined(RTE_EAL_IOMMUFD)
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
+#endif /* RTE_EAL_IOMMUFD */
+
+#ifdef IOMMUFD_PRESENT
+
+#define IOMMUFD_PATH "/dev/iommu"
+
+#else /* not IOMMUFD_PRESENT */
+#endif /* IOMMUFD_PRESENT */
+
+/**
+ * Enable an IOMMUFD-related kmod.
+ *
+ * This function is only relevant to Linux and will return
+ * an error on BSD.
+ *
+ * A return value of 0 does not by itself mean that IOMMUFD is usable:
+ * 0 is also returned when the module is not loaded or when the IOMMUFD
+ * device could not be initialized. Use rte_iommufd_is_enabled() to find
+ * out whether IOMMUFD support is actually available.
+ *
+ * @param modname
+ *   kernel module name.
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ */
+__rte_experimental
+int rte_iommufd_enable(const char *modname);
+
+/**
+ * Check whether an IOMMUFD-related kmod is enabled.
+ *
+ * This function is only relevant to Linux.
+ *
+ * IOMMUFD is reported as enabled only when the given module is currently
+ * loaded and a previous rte_iommufd_enable() call initialized the default
+ * IOMMUFD.
+ *
+ * @param modname
+ *   kernel module name.
+ *
+ * @return
+ *   1 if true.
+ *   0 otherwise.
+ */
+__rte_experimental
+int rte_iommufd_is_enabled(const char *modname);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_IOMMUFD_H_ */
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 57da058cec..4c8e0a7b6e 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -41,6 +41,7 @@ 
 #include <rte_version.h>
 #include <malloc_heap.h>
 #include <rte_vfio.h>
+#include <rte_iommufd.h>
 
 #include <telemetry_internal.h>
 #include "eal_private.h"
@@ -52,6 +53,7 @@ 
 #include "eal_trace.h"
 #include "eal_options.h"
 #include "eal_vfio.h"
+#include "eal_iommufd.h"
 #include "hotplug_mp.h"
 #include "log_internal.h"
 
@@ -877,6 +879,16 @@  static int rte_eal_vfio_setup(void)
 }
 #endif
 
+#ifdef IOMMUFD_PRESENT
+/*
+ * Probe and enable IOMMUFD support for the "iommufd" kernel module.
+ * Any non-zero result from rte_iommufd_enable() is reported as -1.
+ */
+static int rte_eal_iommufd_setup(void)
+{
+	return rte_iommufd_enable("iommufd") != 0 ? -1 : 0;
+}
+#endif
+
 static void rte_eal_init_alert(const char *msg)
 {
 	fprintf(stderr, "EAL: FATAL: %s\n", msg);
@@ -1162,6 +1174,16 @@  rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 #endif
+
+#ifdef IOMMUFD_PRESENT
+	if (rte_eal_iommufd_setup() < 0) {
+		rte_eal_init_alert("Cannot init IOMMUFD");
+		rte_errno = EAGAIN;
+		rte_atomic_store_explicit(&run_once, 0, rte_memory_order_relaxed);
+		return -1;
+	}
+#endif
+
 	/* in secondary processes, memory init may allocate additional fbarrays
 	 * not present in primary processes, so to avoid any potential issues,
 	 * initialize memzones first.
diff --git a/lib/eal/linux/eal_iommufd.c b/lib/eal/linux/eal_iommufd.c
new file mode 100644
index 0000000000..8866aa60c1
--- /dev/null
+++ b/lib/eal/linux/eal_iommufd.c
@@ -0,0 +1,183 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <rte_iommufd.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
+
+#include "eal_iommufd.h"
+#include "eal_private.h"
+
+#ifdef IOMMUFD_PRESENT
+#include <linux/iommufd.h>
+
+/* per-process IOMMUFD config */
+static struct iommufd_config iommufd_cfgs[IOMMUFD_MAX_FD];
+struct iommufd_config *default_iommufd_cfg = &iommufd_cfgs[0];
+
+/*
+ * Open the IOMMUFD character device and allocate one IOAS on it.
+ *
+ * On success *iommufd receives the open file descriptor and *ioas_id the
+ * identifier reported by IOMMU_IOAS_ALLOC. On any failure *iommufd is set
+ * to -1, *ioas_id is left untouched and no descriptor is left open.
+ */
+static void
+iommufd_get_ioas(int *iommufd, uint32_t *ioas_id)
+{
+	struct iommu_ioas_alloc alloc_data = { .size = sizeof(alloc_data) };
+	int iommu_fd;
+
+	*iommufd = -1;
+
+	iommu_fd = open(IOMMUFD_PATH, O_RDWR);
+	if (iommu_fd < 0) {
+		RTE_LOG(ERR, EAL, "Failed to open iommufd!\n");
+		return;
+	}
+
+	if (ioctl(iommu_fd, IOMMU_IOAS_ALLOC, &alloc_data) != 0) {
+		RTE_LOG(ERR, EAL, "Failed to alloc ioas!\n");
+		/* the caller only ever sees -1, so the descriptor must be released here */
+		close(iommu_fd);
+		return;
+	}
+
+	*iommufd = iommu_fd;
+	*ioas_id = alloc_data.out_ioas_id;
+}
+
+/*
+ * Reset every per-process IOMMUFD slot, check whether the given kernel
+ * module is loaded and, in the primary process, open the default IOMMUFD
+ * and allocate its IOAS.
+ *
+ * Returns -1 only when rte_eal_check_module() reports a failure; a missing
+ * module or a failed IOMMUFD initialization is logged and returns 0, leaving
+ * the default slot disabled.
+ */
+int
+rte_iommufd_enable(const char *modname)
+{
+	const struct internal_config *cfg = eal_get_internal_configuration();
+	struct iommufd_config *slot;
+	int mod_state;
+
+	/* start from a clean, disabled state in every slot */
+	for (slot = iommufd_cfgs; slot != &iommufd_cfgs[IOMMUFD_MAX_FD]; slot++) {
+		slot->iommufd_enabled = 0;
+		slot->iommufd = -1;
+		slot->ioas_id = 0;
+		slot->dma_init = false;
+	}
+
+	RTE_LOG(DEBUG, EAL, "Probing IOMMUFD support...\n");
+
+	mod_state = rte_eal_check_module(modname);
+	switch (mod_state) {
+	case -1:
+		/* the module check itself failed: the only hard error */
+		RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
+		return -1;
+	case 0:
+		/* module not loaded: not an error, IOMMUFD just stays disabled */
+		RTE_LOG(DEBUG, EAL,
+			"IOMMUFD modules not loaded, skipping IOMMUFD support...\n");
+		return 0;
+	default:
+		break;
+	}
+
+	/* only the primary process opens the device and allocates the IOAS */
+	if (cfg->process_type == RTE_PROC_PRIMARY)
+		iommufd_get_ioas(&default_iommufd_cfg->iommufd,
+				 &default_iommufd_cfg->ioas_id);
+
+	/* a descriptor of -1 means the default IOMMUFD was not set up */
+	if (default_iommufd_cfg->iommufd == -1) {
+		RTE_LOG(NOTICE, EAL, "IOMMUFD support could not be initialized\n");
+		return 0;
+	}
+
+	RTE_LOG(INFO, EAL, "IOMMUFD support initialized\n");
+	default_iommufd_cfg->iommufd_enabled = 1;
+
+	return 0;
+}
+
+/*
+ * Report whether IOMMUFD can be used: the given module must be loaded and
+ * the default IOMMUFD slot must have been marked enabled.
+ * Returns 1 when both hold, 0 otherwise.
+ */
+int
+rte_iommufd_is_enabled(const char *modname)
+{
+	/* the module check always runs first, whatever the enabled flag says */
+	if (rte_eal_check_module(modname) <= 0)
+		return 0;
+
+	return default_iommufd_cfg->iommufd_enabled != 0;
+}
+
+/*
+ * Map or unmap one IOVA range in an IOAS.
+ *
+ * iommufd  file descriptor the ioctl is issued on
+ * ioasid   identifier of the IOAS to operate on
+ * vaddr    user virtual address backing the range (used for mapping only)
+ * iova     start of the IOVA range
+ * len      length of the range in bytes
+ * do_map   non-zero to map (read/write, at the fixed IOVA), zero to unmap
+ *
+ * Returns 0 on success, including the case where the kernel reports EEXIST
+ * for a map request, and -1 on failure. rte_errno is set to EIO when an
+ * unmap succeeds but clears a different length than requested.
+ */
+int
+iommufd_dma_mem_map(int iommufd, uint32_t ioasid, uint64_t vaddr,
+			uint64_t iova, uint64_t len, int do_map)
+{
+	struct iommu_ioas_map dma_map;
+	struct iommu_ioas_unmap dma_unmap;
+
+	if (do_map != 0) {
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.size = sizeof(dma_map);
+		dma_map.ioas_id = ioasid;
+		dma_map.user_va = vaddr;
+		dma_map.length = len;
+		dma_map.iova = iova;
+		dma_map.flags = IOMMU_IOAS_MAP_READABLE |
+				IOMMU_IOAS_MAP_WRITEABLE |
+				IOMMU_IOAS_MAP_FIXED_IOVA;
+
+		if (ioctl(iommufd, IOMMU_IOAS_MAP, &dma_map) == 0)
+			return 0;
+
+		if (errno != EEXIST) {
+			RTE_LOG(ERR, EAL,
+				"Cannot set up DMA remapping, error "
+				"%i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+
+		/* EEXIST: the range is already mapped, which is treated as success */
+		RTE_LOG(DEBUG, EAL,
+			"Memory segment is already mapped, skipping\n");
+		return 0;
+	}
+
+	memset(&dma_unmap, 0, sizeof(dma_unmap));
+	dma_unmap.size = sizeof(dma_unmap);
+	dma_unmap.ioas_id = ioasid;
+	dma_unmap.length = len;
+	dma_unmap.iova = iova;
+
+	if (ioctl(iommufd, IOMMU_IOAS_UNMAP, &dma_unmap) != 0) {
+		RTE_LOG(ERR, EAL, "Cannot clear DMA remapping, error "
+				"%i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * The kernel writes the number of bytes actually unmapped back into
+	 * 'length'; 'size' is only the size of the ioctl structure, so it is
+	 * 'length' that has to be compared and reported.
+	 */
+	if (dma_unmap.length != len) {
+		RTE_LOG(ERR, EAL, "Unexpected size %"PRIu64
+			" of DMA remapping cleared instead of %"PRIu64"\n",
+			(uint64_t)dma_unmap.length, len);
+		rte_errno = EIO;
+		return -1;
+	}
+
+	return 0;
+}
+
+#else /* not IOMMUFD_PRESENT */
+
+/* Stub for builds without IOMMUFD support: enabling always fails. */
+int
+rte_iommufd_enable(__rte_unused const char *modname)
+{
+	return -1;
+}
+
+/*
+ * Stub for builds without IOMMUFD support.
+ *
+ * The documented contract is "1 if true, 0 otherwise", so report 0: a
+ * negative value would read as "enabled" to any caller that tests the
+ * result as a boolean.
+ */
+int
+rte_iommufd_is_enabled(__rte_unused const char *modname)
+{
+	return 0;
+}
+
+/* Stub for builds without IOMMUFD support: no mapping can be performed. */
+int
+iommufd_dma_mem_map(__rte_unused int iommufd, __rte_unused uint32_t ioasid,
+		    __rte_unused uint64_t vaddr, __rte_unused  uint64_t iova,
+		    __rte_unused uint64_t len, __rte_unused int do_map)
+{
+	return -1;
+}
+
+#endif /* IOMMUFD_PRESENT */
diff --git a/lib/eal/linux/eal_iommufd.h b/lib/eal/linux/eal_iommufd.h
new file mode 100644
index 0000000000..d9b67a7fd9
--- /dev/null
+++ b/lib/eal/linux/eal_iommufd.h
@@ -0,0 +1,43 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#ifndef EAL_IOMMUFD_H_
+#define EAL_IOMMUFD_H_
+
+#include <rte_common.h>
+#include <stdbool.h>
+
+/*
+ * determine if IOMMUFD is present on the system
+ */
+#if !defined(IOMMUFD_PRESENT) && defined(RTE_EAL_IOMMUFD)
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define IOMMUFD_PRESENT
+#else
+#pragma message("IOMMUFD configured but not supported by this kernel, disabling.")
+#endif /* kernel version >= 6.6.0 */
+#endif /* RTE_EAL_IOMMUFD */
+
+#ifdef IOMMUFD_PRESENT
+
+#define IOMMUFD_MAX_FD RTE_MAX_IOMMUFD_FD
+
+struct iommufd_config {	/* state kept for one IOMMUFD file descriptor */
+	int iommufd_enabled;	/* 1 once the fd is open and its IOAS allocated, else 0 */
+	int iommufd;		/* fd of the opened IOMMUFD device, -1 when not open */
+	uint32_t ioas_id;	/* IOAS id returned by IOMMU_IOAS_ALLOC on that fd */
+	bool dma_init;		/* NOTE(review): only ever reset to false here - confirm use */
+};
+
+/* per-process IOMMUFD config */
+extern struct iommufd_config *default_iommufd_cfg;
+
+#endif /* IOMMUFD_PRESENT */
+
+int
+iommufd_dma_mem_map(int iommufd, uint32_t ioasid, uint64_t vaddr,
+		    uint64_t iova, uint64_t len, int do_map);
+
+#endif /* EAL_IOMMUFD_H_ */
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index e99ebed256..8081087584 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@  sources += files(
         'eal_thread.c',
         'eal_timer.c',
         'eal_vfio.c',
+        'eal_iommufd.c',
         'eal_vfio_mp_sync.c',
 )
 
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 5e0cd47c82..30e66a7267 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -393,6 +393,9 @@  EXPERIMENTAL {
 	# added in 23.07
 	rte_memzone_max_get;
 	rte_memzone_max_set;
+
+	rte_iommufd_enable; # WINDOWS_NO_EXPORT
+	rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {