[RFC,v3,6/6] bus/pci: add sparse mmap support for mediated PCI devices

Message ID 20210601030644.3318-7-chenbo.xia@intel.com (mailing list archive)
State Changes Requested, archived
Delegated to: David Marchand
Headers
Series Add mdev (Mediated device) support in DPDK |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues
ci/intel-Testing success Testing PASS

Commit Message

Chenbo Xia June 1, 2021, 3:06 a.m. UTC
  This patch adds sparse mmap support to the PCI bus. Sparse mmap is a
capability defined in VFIO which allows multiple mmap areas in one
VFIO region. Mediated PCI devices can use this capability to let the
mdev parent driver keep control over accesses to the non-mmapable
parts of regions.

Signed-off-by: Chenbo Xia <chenbo.xia@intel.com>
---
 drivers/bus/pci/linux/pci_vfio.c | 229 +++++++++++++++++++++++++++----
 drivers/bus/pci/private.h        |   2 +
 drivers/bus/pci/rte_bus_pci.h    |  18 ++-
 3 files changed, 218 insertions(+), 31 deletions(-)
  

Patch

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 00ba5db03a..e68eccb63f 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -654,6 +654,111 @@  pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 	return 0;
 }
 
+/*
+ * Map every sparse mmap area of a single BAR and record the resulting
+ * mappings in vfio_res->maps[bar_index].areas.
+ *
+ * For each entry of vfio_areas an inaccessible anonymous mapping is created
+ * first, then the device region at map->offset + area offset is mapped over
+ * it with pci_map_resource().
+ *
+ * The areas array is allocated on numa_node when map->areas is NULL on entry;
+ * otherwise the existing array is reused.
+ *
+ * Returns 0 on success, including when the BAR is skipped because its size
+ * is 0 or it has no sparse areas. Returns -1 on failure, after unmapping the
+ * areas mapped so far, releasing the areas array and resetting both
+ * map->areas and map->nr_areas.
+ */
+static int
+pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
+		struct vfio_region_sparse_mmap_area *vfio_areas,
+		uint32_t nr_areas, int bar_index, int additional_flags,
+		int numa_node)
+{
+	struct pci_map *map = &vfio_res->maps[bar_index];
+	struct rte_mem_map_area *area;
+	struct vfio_region_sparse_mmap_area *sparse;
+	void *bar_addr;
+	uint32_t i, j;
+
+	if (map->size == 0) {
+		RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
+		/* Nothing gets mapped, so do not advertise any area. */
+		map->nr_areas = 0;
+		return 0;
+	}
+
+	if (nr_areas == 0) {
+		RTE_LOG(DEBUG, EAL, "Skip bar %d with no sparse mmap areas\n",
+			bar_index);
+		map->nr_areas = 0;
+		map->areas = NULL;
+		return 0;
+	}
+
+	if (map->areas == NULL) {
+		map->areas = rte_zmalloc_socket(NULL,
+				sizeof(*map->areas) * nr_areas,
+				RTE_CACHE_LINE_SIZE, numa_node);
+		if (map->areas == NULL) {
+			RTE_LOG(ERR, EAL,
+				"Cannot alloc memory for sparse map areas\n");
+			/* Keep the count in step with the missing array. */
+			map->nr_areas = 0;
+			return -1;
+		}
+	}
+
+	map->nr_areas = nr_areas;
+
+	for (i = 0; i < nr_areas; i++) {
+		area = &map->areas[i];
+		sparse = &vfio_areas[i];
+
+		/*
+		 * NOTE(review): every area passes map->addr as the address
+		 * argument. If additional_flags carries MAP_FIXED, the areas
+		 * of one BAR would be placed on top of each other; confirm
+		 * whether a per-area address is needed.
+		 */
+		bar_addr = mmap(map->addr, sparse->size, 0, MAP_PRIVATE |
+				MAP_ANONYMOUS | additional_flags, -1, 0);
+		if (bar_addr == MAP_FAILED) {
+			RTE_LOG(ERR, EAL, "Failed to create inaccessible mapping for BAR%d\n",
+				bar_index);
+			goto err_map;
+		}
+
+		area->addr = pci_map_resource(bar_addr, vfio_dev_fd,
+			map->offset + sparse->offset, sparse->size,
+			RTE_MAP_FORCE_ADDRESS);
+		if (area->addr == NULL) {
+			munmap(bar_addr, sparse->size);
+			RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
+				bar_index);
+			goto err_map;
+		}
+
+		area->offset = sparse->offset;
+		area->size = sparse->size;
+	}
+
+	return 0;
+
+err_map:
+	for (j = 0; j < i; j++) {
+		pci_unmap_resource(map->areas[j].addr, map->areas[j].size);
+		map->areas[j].offset = 0;
+		map->areas[j].size = 0;
+	}
+	rte_free(map->areas);
+	/* Clear the pointer so later cleanup cannot free it a second time. */
+	map->areas = NULL;
+	map->nr_areas = 0;
+	return -1;
+}
+
 /*
  * region info may contain capability headers, so we need to keep reallocating
  * the memory until we match allocated memory size with argsz.
@@ -770,6 +846,46 @@  pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd,
 	return 0;
 }
 
+/*
+ * Unmap every BAR mapping recorded in vfio_res and release each BAR's
+ * sparse area array.
+ *
+ * A BAR with more than one area is unmapped area by area; any other BAR is
+ * unmapped through map->addr and map->size when map->addr is set.
+ *
+ * NOTE(review): this also runs from find_and_unmap_vfio_resource(); confirm
+ * that releasing the area arrays is right for every caller of that function.
+ */
+static void
+clean_up_pci_resource(struct mapped_pci_resource *vfio_res)
+{
+	struct pci_map *map;
+	uint32_t i, j;
+
+	for (i = 0; i < PCI_MAX_RESOURCE; i++) {
+		map = &vfio_res->maps[i];
+		if (map->nr_areas > 1 && map->areas != NULL) {
+			for (j = 0; j < map->nr_areas; j++)
+				pci_unmap_resource(map->areas[j].addr,
+					map->areas[j].size);
+		} else if (map->nr_areas <= 1 && map->addr) {
+			/*
+			 * We do not need to be aware of MSI-X BAR mappings.
+			 * Using current maps array is enough.
+			 */
+			pci_unmap_resource(map->addr, map->size);
+		}
+
+		/*
+		 * Each BAR owns its own area array, so release it per BAR
+		 * and clear the bookkeeping to rule out a second free.
+		 */
+		rte_free(map->areas);
+		map->areas = NULL;
+		map->nr_areas = 0;
+	}
+}
+
 static int
 pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
@@ -866,6 +967,8 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 
 	for (i = 0; i < vfio_res->nb_maps; i++) {
 		void *bar_addr;
+		struct vfio_info_cap_header *hdr;
+		struct vfio_region_info_cap_sparse_mmap *sparse;
 
 		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
 		if (ret < 0) {
@@ -911,15 +1014,65 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 		maps[i].size = reg->size;
 		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
-		if (ret < 0) {
-			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
-					pci_addr, i, strerror(errno));
-			free(reg);
-			goto err_vfio_res;
-		}
+		hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
+
+		if (dev->is_mdev && hdr != NULL) {
+			/* The mdev region advertises sparse mmap areas. */
+			sparse = container_of(hdr,
+				struct vfio_region_info_cap_sparse_mmap,
+				header);
+
+			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res,
+				sparse->areas, sparse->nr_areas, i, 0,
+				dev->device.numa_node);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				free(reg);
+				goto err_vfio_res;
+			}
 
-		dev->mem_resource[i].addr = maps[i].addr;
+			dev->sparse_mem[i].size = reg->size;
+			dev->sparse_mem[i].nr_maps = vfio_res->maps[i].nr_areas;
+			dev->sparse_mem[i].areas = vfio_res->maps[i].areas;
+		} else {
+			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				free(reg);
+				goto err_vfio_res;
+			}
+
+			/*
+			 * An mdev BAR without the sparse capability is
+			 * described as one area covering the whole BAR.
+			 */
+			if (dev->is_mdev) {
+				struct pci_map *mdev_map = &maps[i];
+				mdev_map->nr_areas = 1;
+				mdev_map->areas = rte_zmalloc_socket(NULL,
+					sizeof(*mdev_map->areas),
+					RTE_CACHE_LINE_SIZE,
+					dev->device.numa_node);
+				if (maps[i].areas == NULL) {
+					RTE_LOG(ERR, EAL,
+						"Cannot allocate memory for sparse map areas\n");
+					free(reg);
+					goto err_vfio_res;
+				}
+				mdev_map->areas[0].addr = maps[i].addr;
+				mdev_map->areas[0].offset = 0;
+				mdev_map->areas[0].size = reg->size;
+				dev->sparse_mem[i].size = reg->size;
+				dev->sparse_mem[i].nr_maps = 1;
+				dev->sparse_mem[i].areas = mdev_map->areas;
+			} else {
+				maps[i].nr_areas = 0;
+				maps[i].areas = NULL;
+				dev->mem_resource[i].addr = maps[i].addr;
+			}
+		}
 
 		free(reg);
 	}
@@ -940,6 +1087,7 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 
 	return 0;
 err_vfio_res:
+	clean_up_pci_resource(vfio_res);
 	rte_free(vfio_res);
 err_vfio_dev_fd:
 	rte_vfio_release_device(rte_pci_get_sysfs_path(),
@@ -960,7 +1108,7 @@  pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 	struct mapped_pci_res_list *vfio_res_list =
 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
 
-	struct pci_map *maps;
+	struct pci_map *maps, *cur;
 
 	dev->intr_handle.fd = -1;
 #ifdef HAVE_VFIO_DEV_REQ_INTERFACE
@@ -1012,14 +1160,51 @@  pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 	maps = vfio_res->maps;
 
 	for (i = 0; i < vfio_res->nb_maps; i++) {
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
-		if (ret < 0) {
-			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
-					pci_addr, i, strerror(errno));
-			goto err_vfio_dev_fd;
+		cur = &maps[i];
+		if (cur->nr_areas > 1) {
+			struct vfio_region_sparse_mmap_area *areas;
+			uint32_t j;
+
+			/* Rebuild the vfio area list from the recorded areas. */
+			areas = malloc(sizeof(*areas) * cur->nr_areas);
+			if (areas == NULL) {
+				RTE_LOG(ERR, EAL, "Failed to alloc vfio areas for %s\n",
+					pci_addr);
+				goto err_vfio_dev_fd;
+			}
+
+			for (j = 0; j < cur->nr_areas; j++) {
+				areas[j].offset = cur->areas[j].offset;
+				areas[j].size = cur->areas[j].size;
+			}
+
+			/* i is the BAR index; j only walks the areas. */
+			ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res,
+				areas, cur->nr_areas, i, MAP_FIXED,
+				dev->device.numa_node);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				free(areas);
+				goto err_vfio_dev_fd;
+			}
+
+			free(areas);
+		} else {
+			ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res,
+				i, MAP_FIXED);
+			if (ret < 0) {
+				RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
+						pci_addr, i, strerror(errno));
+				goto err_vfio_dev_fd;
+			}
+
+			if (!dev->is_mdev)
+				dev->mem_resource[i].addr = cur->addr;
+			else if (cur->areas != NULL)
+				cur->areas[0].addr = cur->addr;
 		}
 
-		dev->mem_resource[i].addr = maps[i].addr;
 	}
 
 	/* we need save vfio_dev_fd, so it can be used during release */
@@ -1054,8 +1237,6 @@  find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 			const char *pci_addr)
 {
 	struct mapped_pci_resource *vfio_res = NULL;
-	struct pci_map *maps;
-	int i;
 
 	/* Get vfio_res */
 	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
@@ -1079,19 +1260,7 @@  find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 	RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n",
 		pci_addr);
 
-	maps = vfio_res->maps;
-	for (i = 0; i < vfio_res->nb_maps; i++) {
-
-		/*
-		 * We do not need to be aware of MSI-X table BAR mappings as
-		 * when mapping. Just using current maps array is enough
-		 */
-		if (maps[i].addr) {
-			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
-				pci_addr, maps[i].addr);
-			pci_unmap_resource(maps[i].addr, maps[i].size);
-		}
-	}
+	clean_up_pci_resource(vfio_res);
 
 	return vfio_res;
 }
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 3515c086aa..8d94d8acf8 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -110,6 +110,8 @@  struct pci_map {
 	uint64_t offset;
 	uint64_t size;
 	uint64_t phaddr;
+	uint32_t nr_areas;
+	struct rte_mem_map_area *areas;
 };
 
 struct pci_msix_table {
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index fb7d934bd0..ddc913f121 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -70,6 +70,24 @@  enum rte_pci_kernel_driver {
 	RTE_PCI_KDRV_NET_UIO,      /* NetUIO for Windows */
 };
 
+/**
+ * One mapped area of a sparsely mapped PCI memory region.
+ */
+struct rte_mem_map_area {
+	void *addr;      /**< Virtual address the area is mapped at. */
+	uint64_t offset; /**< Offset of the area inside the region. */
+	uint64_t size;   /**< Size of the area. */
+};
+
+/**
+ * Sparse memory map of one PCI memory region.
+ */
+struct rte_sparse_mem_map {
+	uint64_t size;    /**< Size of the whole region. */
+	uint32_t nr_maps; /**< Number of entries in areas. */
+	struct rte_mem_map_area *areas; /**< Array of mapped areas. */
+};
+
 /**
  * A structure describing a PCI device.
  */
@@ -82,8 +94,12 @@  struct rte_pci_device {
 	};
 	uint8_t is_mdev;                    /**< True for mediated PCI device */
 	struct rte_pci_id id;               /**< PCI ID. */
-	struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
+	union {
+		struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
 					    /**< PCI Memory Resource */
+		struct rte_sparse_mem_map sparse_mem[PCI_MAX_RESOURCE];
+					    /**< Sparse Memory Map for Mdev */
+	};
 	struct rte_intr_handle intr_handle; /**< Interrupt handle */
 	struct rte_pci_driver *driver;      /**< PCI driver used in probing */
 	uint16_t max_vfs;                   /**< sriov enable if not zero */