[2/2] bus/pci: support region based device mapping

Message ID 20220628135339.2882914-2-skori@marvell.com (mailing list archive)
State Not Applicable, archived
Delegated to: Thomas Monjalon
Headers
Series [1/2] doc: announce region based device mapping support |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/github-robot: build fail github build: failed
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-x86_64-compile-testing fail Testing issues
ci/iol-x86_64-unit-testing fail Testing issues
ci/Intel-compilation fail Compilation issues
ci/intel-Testing success Testing PASS
ci/iol-abi-testing warning Testing issues

Commit Message

Sunil Kumar Kori June 28, 2022, 1:53 p.m. UTC
  From: Sunil Kumar Kori <skori@marvell.com>

This commit allows a driver to define a list of sparse memory
regions to map for a given device instead of mapping the whole BAR.

To do that, a driver must register itself with the following information:

 * rte_pci_driver::drv_flags - RTE_PCI_DRV_NEED_REGION_MAPPING must be set.
 * rte_pci_driver::regions - It contains the list of regions. Region
   information is explained below.
 * rte_pci_driver::valid_bars - It contains information about the BARs for
   which entries are present in rte_pci_driver::regions.

Each entry in the region map specifies a particular area in a given BAR to map
into the virtual space assigned for the given device. Regions may lie within
the same BAR or in different BARs.

This results in a sparse virtual memory reservation in which only the areas
defined by the region table are valid.

Example:
If a user wishes to map a BAR 2 region at offset 0x20000000000 of length
0x2000000 and a BAR 4 region at offset 0x40000000000 of length 0x10000,
then the following information needs to be set in the driver while registering:

static struct rte_pci_region_map xyz_pci_nic_regions[] = {
	{0x20000000000, 0x2000000, 2, false},
	{0x40000000000, 0x10000, 4, false},
	{0x0, 0x0, 0x0, false},
};

static struct rte_pci_driver xyz_pci_nic = {
	.valid_bars = {false, false, true, false, true, false},
	.regions = xyz_pci_nic_regions,
	.drv_flags = RTE_PCI_DRV_NEED_REGION_MAPPING | RTE_PCI_DRV_XYZ,
};

The resulting mappings will be as given below:
* (X + 0x20000000000) to (X + 0x20000000000 + 0x2000000)
* (Y + 0x40000000000) to (Y + 0x40000000000 + 0x10000)

Signed-off-by: Sunil Kumar Kori <skori@marvell.com>
---
 drivers/bus/pci/linux/pci.c      |  30 +++++++-
 drivers/bus/pci/linux/pci_vfio.c | 117 ++++++++++++++++++++++++++-----
 drivers/bus/pci/pci_common.c     |   4 +-
 drivers/bus/pci/private.h        |   5 ++
 drivers/bus/pci/rte_bus_pci.h    |  25 +++++++
 lib/pci/rte_pci.h                |  15 ++++
 6 files changed, 176 insertions(+), 20 deletions(-)
  

Patch

diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
index e521459870..e6eb172e92 100644
--- a/drivers/bus/pci/linux/pci.c
+++ b/drivers/bus/pci/linux/pci.c
@@ -173,7 +173,7 @@  pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev)
 {
 	FILE *f;
 	char buf[BUFSIZ];
-	int i;
+	int i, j;
 	uint64_t phys_addr, end_addr, flags;
 
 	f = fopen(filename, "r");
@@ -198,6 +198,14 @@  pci_parse_sysfs_resource(const char *filename, struct rte_pci_device *dev)
 			dev->mem_resource[i].len = end_addr - phys_addr + 1;
 			/* not mapped for now */
 			dev->mem_resource[i].addr = NULL;
+
+			/* update the same in regions too */
+			for (j = 0; j < PCI_MAX_REGION_PER_RESOURCE; j++) {
+				dev->regions[i][j].phys_addr = phys_addr;
+				dev->regions[i][j].len = end_addr - phys_addr + 1;
+				/* not mapped for now */
+				dev->regions[i][j].addr = NULL;
+			}
 		}
 	}
 	fclose(f);
@@ -640,6 +648,26 @@  pci_device_iova_mode(const struct rte_pci_driver *pdrv,
 	return iova_mode;
 }
 
+/* Hand out the next unmapped drv->regions entry of BAR bar_idx via offset/size
+ * and mark it mapped. Table ends at size 0; NULL table or no entry: false.
+ */
+bool
+pci_device_get_region_info(const struct rte_pci_driver *drv,
+	uint32_t bar_idx, uint64_t *offset, uint64_t *size)
+{
+	struct rte_pci_region_map *region = drv->regions;
+
+	for (; region != NULL && region->size != 0; region++) {
+		if (region->bar_idx != bar_idx || region->mapped)
+			continue;
+		*offset = region->offset;
+		*size = region->size;
+		region->mapped = true;
+		return true;
+	}
+	return false;
+}
+
 /* Read PCI config space. */
 int rte_pci_read_config(const struct rte_pci_device *device,
 		void *buf, size_t len, off_t offset)
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index cd0d0b1670..90cbfbd699 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -509,21 +509,28 @@  pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
 
 static int
 pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
-		int bar_index, int additional_flags)
+		int bar_index, int reg_idx, bool map_reg, int additional_flags)
 {
 	struct memreg {
 		uint64_t offset;
 		size_t   size;
 	} memreg[2] = {};
-	void *bar_addr;
+	void *bar_addr = NULL;
+	struct pci_map *region = &vfio_res->regions[bar_index][reg_idx];
 	struct pci_msix_table *msix_table = &vfio_res->msix_table;
 	struct pci_map *bar = &vfio_res->maps[bar_index];
 
-	if (bar->size == 0) {
+	if (!map_reg && bar->size == 0) {
 		RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
 		return 0;
 	}
 
+	if (map_reg && region->size == 0) {
+		RTE_LOG(DEBUG, EAL, "Region size is 0, skip BAR:REG=(%d:%d)\n",
+			bar_index, reg_idx);
+		return 0;
+	}
+
 	if (msix_table->bar_index == bar_index) {
 		/*
 		 * VFIO will not let us map the MSI-X table,
@@ -571,12 +578,19 @@  pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 			memreg[0].offset, memreg[0].size,
 			memreg[1].offset, memreg[1].size);
 	} else {
-		memreg[0].offset = bar->offset;
-		memreg[0].size = bar->size;
+		if (map_reg) {
+			bar_addr = region->addr;
+			memreg[0].offset = region->offset;
+			memreg[0].size = region->size;
+		} else {
+			bar_addr = bar->addr;
+			memreg[0].offset = bar->offset;
+			memreg[0].size = bar->size;
+		}
 	}
 
 	/* reserve the address using an inaccessible mapping */
-	bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
+	bar_addr = mmap(bar_addr, memreg[0].size, 0, MAP_PRIVATE |
 			MAP_ANONYMOUS | additional_flags, -1, 0);
 	if (bar_addr != MAP_FAILED) {
 		void *map_addr = NULL;
@@ -627,7 +641,11 @@  pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 		return -1;
 	}
 
-	bar->addr = bar_addr;
+	if (map_reg)
+		region->addr = bar_addr;
+	else
+		bar->addr = bar_addr;
+
 	return 0;
 }
 
@@ -727,12 +745,15 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 	char pci_addr[PATH_MAX] = {0};
 	int vfio_dev_fd;
 	struct rte_pci_addr *loc = &dev->addr;
+	struct rte_pci_driver *drv = dev->driver;
 	int i, ret;
 	struct mapped_pci_resource *vfio_res = NULL;
 	struct mapped_pci_res_list *vfio_res_list =
 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
 
+	struct rte_pci_region_map *drv_reg;
 	struct pci_map *maps;
+	bool map_reg;
 
 	if (rte_intr_fd_set(dev->intr_handle, -1))
 		return -1;
@@ -791,9 +812,18 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 		}
 	}
 
+	map_reg = drv->drv_flags & RTE_PCI_DRV_NEED_REGION_MAPPING ? true : false;
+	if (map_reg) {
+		for (drv_reg = drv->regions; drv_reg->size != 0; drv_reg++)
+			drv_reg->mapped = false;
+	}
+
 	for (i = 0; i < vfio_res->nb_maps; i++) {
 		struct vfio_region_info *reg = NULL;
-		void *bar_addr;
+		struct pci_map *region = NULL;
+		uint64_t offset = 0, size = 0;
+		void *bar_addr = NULL;
+		uint32_t reg_idx = 0;
 
 		ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
 		if (ret < 0) {
@@ -821,22 +851,41 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 			continue;
 		}
 
+next_region:
+		/* skip BARs if driver requested for region mapping and
+		 * entry in regions table is not available
+		 */
+		if (map_reg && drv->valid_bars[i] == true &&
+		    (pci_device_get_region_info(drv, i, &offset, &size) == false)) {
+			free(reg);
+			continue;
+		}
+
 		/* try mapping somewhere close to the end of hugepages */
 		if (pci_map_addr == NULL)
 			pci_map_addr = pci_find_max_end_va();
 
 		bar_addr = pci_map_addr;
-		pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
+
+		if (map_reg && drv->valid_bars[i] == true) {
+			region = &vfio_res->regions[i][reg_idx];
+			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) size);
+			region->addr = bar_addr;
+			region->path = NULL; /* vfio doesn't have per-resource paths */
+			region->offset = offset;
+			region->size = size;
+		} else {
+			pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
+			maps[i].addr = bar_addr;
+			maps[i].path = NULL; /* vfio doesn't have per-resource paths */
+			maps[i].offset = reg->offset;
+			maps[i].size = reg->size;
+		}
 
 		pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
 					sysconf(_SC_PAGE_SIZE));
 
-		maps[i].addr = bar_addr;
-		maps[i].offset = reg->offset;
-		maps[i].size = reg->size;
-		maps[i].path = NULL; /* vfio doesn't have per-resource paths */
-
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
+		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, reg_idx, map_reg, 0);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
 					pci_addr, i, strerror(errno));
@@ -844,8 +893,15 @@  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 			goto err_vfio_res;
 		}
 
-		dev->mem_resource[i].addr = maps[i].addr;
+		if (map_reg && (drv->valid_bars[i] == true)) {
+			dev->regions[i][reg_idx].addr = region->addr;
+			dev->regions[i][reg_idx].len = region->size;
+			reg_idx++;
+			goto next_region;
+		}
 
+		dev->mem_resource[i].addr = maps[i].addr;
+		reg_idx = 0;
 		free(reg);
 	}
 
@@ -877,14 +933,19 @@  pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 {
 	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 	char pci_addr[PATH_MAX] = {0};
+	struct rte_pci_driver *drv = dev->driver;
 	int vfio_dev_fd;
 	struct rte_pci_addr *loc = &dev->addr;
-	int i, ret;
+	int i, ret, j = 0;
 	struct mapped_pci_resource *vfio_res = NULL;
 	struct mapped_pci_res_list *vfio_res_list =
 		RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
 
+	struct rte_pci_region_map *drv_reg;
+	uint64_t offset = 0, size = 0;
+	struct pci_map *region;
 	struct pci_map *maps;
+	bool map_reg = false;
 
 	if (rte_intr_fd_set(dev->intr_handle, -1))
 		return -1;
@@ -918,16 +979,36 @@  pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 
 	/* map BARs */
 	maps = vfio_res->maps;
+	for (drv_reg = drv->regions; drv_reg->size != 0; drv_reg++)
+		drv_reg->mapped = false;
 
 	for (i = 0; i < vfio_res->nb_maps; i++) {
-		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
+next_region:
+		if (drv->drv_flags & RTE_PCI_DRV_NEED_REGION_MAPPING &&
+		    drv->valid_bars[i] == true) {
+			map_reg = pci_device_get_region_info(drv, i, &offset, &size);
+			if (map_reg == false)
+				continue;
+			region = &vfio_res->regions[i][j];
+		}
+
+		ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, j, map_reg,
+					MAP_FIXED);
 		if (ret < 0) {
 			RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
 					pci_addr, i, strerror(errno));
 			goto err_vfio_dev_fd;
 		}
 
+		if (map_reg) {
+			dev->regions[i][j].addr = region->addr;
+			j++;
+			map_reg = false;
+			goto next_region;
+		}
+
 		dev->mem_resource[i].addr = maps[i].addr;
+		j = 0;
 	}
 
 	/* we need save vfio_dev_fd, so it can be used during release */
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index 37ab879779..656b35ec30 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -248,7 +248,8 @@  rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 		 * to use driver flags for adjusting configuration.
 		 */
 		dev->driver = dr;
-		if (dev->driver->drv_flags & RTE_PCI_DRV_NEED_MAPPING) {
+		if (dev->driver->drv_flags & RTE_PCI_DRV_NEED_MAPPING ||
+		    dev->driver->drv_flags & RTE_PCI_DRV_NEED_REGION_MAPPING) {
 			ret = rte_pci_map_device(dev);
 			if (ret != 0) {
 				dev->driver = NULL;
@@ -256,6 +257,7 @@  rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 				dev->vfio_req_intr_handle = NULL;
 				rte_intr_instance_free(dev->intr_handle);
 				dev->intr_handle = NULL;
+				dev->driver = NULL;
 				return ret;
 			}
 		}
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 0fbef8e1d8..3cd6b2b90b 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -98,6 +98,7 @@  struct mapped_pci_resource {
 	int nb_maps;
 	struct pci_map maps[PCI_MAX_RESOURCE];
 	struct pci_msix_table msix_table;
+	struct pci_map regions[PCI_MAX_RESOURCE][PCI_MAX_REGION_PER_RESOURCE];
 };
 
 /** mapped pci device list */
@@ -236,6 +237,10 @@  enum rte_iova_mode
 pci_device_iova_mode(const struct rte_pci_driver *pci_drv,
 		     const struct rte_pci_device *pci_dev);
 
+bool
+pci_device_get_region_info(const struct rte_pci_driver *drv, uint32_t bar_idx,
+	uint64_t *offset, uint64_t *size);
+
 /**
  * Get iommu class of PCI devices on the bus.
  * And return their preferred iova mapping mode.
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index 1c6a8fdd7b..a39dc3f026 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -76,6 +76,8 @@  struct rte_pci_device {
 	char name[PCI_PRI_STR_SIZE+1];      /**< PCI location (ASCII) */
 	struct rte_intr_handle *vfio_req_intr_handle;
 				/**< Handler of VFIO request interrupt */
+	struct rte_mem_resource regions[PCI_MAX_RESOURCE][PCI_MAX_REGION_PER_RESOURCE];
+					    /**< PCI Memory regions per resource */
 };
 
 /**
@@ -167,6 +169,8 @@  struct rte_pci_driver {
 	pci_dma_map_t *dma_map;		   /**< device dma map function. */
 	pci_dma_unmap_t *dma_unmap;	   /**< device dma unmap function. */
 	const struct rte_pci_id *id_table; /**< ID table, NULL terminated. */
+	struct rte_pci_region_map *regions; /**< Region table, ends with a size 0 entry. */
+	bool valid_bars[PCI_MAX_RESOURCE]; /**< BARs that have entries in regions. */
 	uint32_t drv_flags;                /**< Flags RTE_PCI_DRV_*. */
 };
 
@@ -193,6 +197,27 @@  struct rte_pci_bus {
 #define RTE_PCI_DRV_KEEP_MAPPED_RES 0x0020
 /** Device driver needs IOVA as VA and cannot work with IOVA as PA */
 #define RTE_PCI_DRV_NEED_IOVA_AS_VA 0x0040
+/** Device needs PCI BAR mapping for the given regions only (done with either
+ * IGB_UIO or VFIO), i.e. if the regions for a given device are defined as:
+
+  .regions = {
+    {
+      .bar_idx = PCI_BAR_0,
+      .offset = 0x1000,
+      .size = 0x100
+    },
+    {
+      .bar_idx = PCI_BAR_0,
+      .offset = 0x5000,
+      .size = 0x1000
+    }
+  },
+
+then the only valid address mappings will be:
+* X + 0x1000 to X + 0x10FF
+* X + 0x5000 to X + 0x5FFF
+*/
+#define RTE_PCI_DRV_NEED_REGION_MAPPING 0x0080
 
 /**
  * Map the PCI device resources in user space virtual memory address
diff --git a/lib/pci/rte_pci.h b/lib/pci/rte_pci.h
index 5088157e74..9d29113f2b 100644
--- a/lib/pci/rte_pci.h
+++ b/lib/pci/rte_pci.h
@@ -74,6 +74,9 @@  extern "C" {
 /** Maximum number of PCI resources. */
 #define PCI_MAX_RESOURCE 6
 
+/** Maximum number of regions per resource. */
+#define PCI_MAX_REGION_PER_RESOURCE 8
+
 /**
  * A structure describing an ID for a PCI driver. Each driver provides a
  * table of these IDs for each device that it supports.
@@ -96,6 +99,18 @@  struct rte_pci_addr {
 	uint8_t function;               /**< Device function. */
 };
 
+/**
+ * A structure describing one region to map inside a BAR. A driver provides a
+ * table of these entries, ended by an entry whose size is 0, when its
+ * drv_flags contain RTE_PCI_DRV_NEED_REGION_MAPPING.
+ */
+struct rte_pci_region_map {
+	uint64_t offset;  /**< Region offset; presumably from BAR start, confirm. */
+	uint64_t size;    /**< Region size in bytes; 0 terminates the table. */
+	uint8_t bar_idx;  /**< Index of the BAR containing the region. */
+	uint8_t mapped;   /**< Set once the region was handed out for mapping. */
+};
+
 /** Any PCI device identifier (vendor, device, ...) */
 #define RTE_PCI_ANY_ID (0xffff)
 /** @deprecated Replaced with RTE_PCI_ANY_ID */