From patchwork Tue Jun 1 03:06:39 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chenbo Xia X-Patchwork-Id: 93678 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id A4B9AA0524; Tue, 1 Jun 2021 05:17:33 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 7F07540689; Tue, 1 Jun 2021 05:17:33 +0200 (CEST) Received: from mga03.intel.com (mga03.intel.com [134.134.136.65]) by mails.dpdk.org (Postfix) with ESMTP id 24F7940040 for ; Tue, 1 Jun 2021 05:17:31 +0200 (CEST) IronPort-SDR: K5ZMD5JE0pxEUQtBk6lfTRrDpqhCzQeYepePKOHasan/vXxrGEndRPRWFw9hL9qtP0x/g7jcHk WfYTH2Xbfhjg== X-IronPort-AV: E=McAfee;i="6200,9189,10001"; a="203496648" X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="203496648" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by orsmga103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 31 May 2021 20:17:30 -0700 IronPort-SDR: Z4m5i+sT8FASWA4ksMnJFKGkOzb/Znx0t5wt2ItGpNdoAeZei2Gp60rkP+sSGAZ0yqQ8l4C9vm d5pJLNTDBZCQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="482315373" Received: from npg-dpdk-virtio-xiachenbo-nw.sh.intel.com ([10.67.118.250]) by fmsmga002.fm.intel.com with ESMTP; 31 May 2021 20:17:23 -0700 From: Chenbo Xia To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com, jingjing.wu@intel.com Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu, nhorman@tuxdriver.com, bruce.richardson@intel.com, david.marchand@redhat.com, stephen@networkplumber.org, konstantin.ananyev@intel.com, Tiwei Bie Date: Tue, 1 Jun 2021 11:06:39 +0800 Message-Id: <20210601030644.3318-2-chenbo.xia@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: 
<20210601030644.3318-1-chenbo.xia@intel.com> References: <20190715075214.16616-6-tiwei.bie@intel.com> <20210601030644.3318-1-chenbo.xia@intel.com> Subject: [dpdk-dev] [RFC v3 1/6] bus/pci: introduce an internal representation of PCI device X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From: Tiwei Bie This patch introduces an internal representation of the PCI device which will be used to store the internal information that don't have to be exposed, e.g. the VFIO region sizes/offsets. In this patch, the internal structure is simply a wrapper of the rte_pci_device structure. More fields will be added in the coming patches. Suggested-by: David Marchand Signed-off-by: Tiwei Bie Signed-off-by: Chenbo Xia --- drivers/bus/pci/bsd/pci.c | 14 +++++++++----- drivers/bus/pci/linux/pci.c | 27 ++++++++++++++++----------- drivers/bus/pci/pci_common.c | 2 +- drivers/bus/pci/private.h | 12 ++++++++++++ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c index 4b8a208781..20ce979f60 100644 --- a/drivers/bus/pci/bsd/pci.c +++ b/drivers/bus/pci/bsd/pci.c @@ -212,16 +212,20 @@ pci_uio_map_resource_by_index(struct rte_pci_device *dev, int res_idx, static int pci_scan_one(int dev_pci_fd, struct pci_conf *conf) { + struct rte_pci_device_internal *pdev; struct rte_pci_device *dev; struct pci_bar_io bar; unsigned i, max; - dev = malloc(sizeof(*dev)); - if (dev == NULL) { + pdev = malloc(sizeof(*pdev)); + if (pdev == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for internal pci device\n"); return -1; } - memset(dev, 0, sizeof(*dev)); + memset(pdev, 0, sizeof(*pdev)); + + dev = &pdev->device; dev->device.bus = &rte_pci_bus.bus; dev->addr.domain = conf->pc_sel.pc_domain; @@ -307,7 +311,7 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf) 
memmove(dev2->mem_resource, dev->mem_resource, sizeof(dev->mem_resource)); - free(dev); + free(pdev); } return 0; } @@ -317,7 +321,7 @@ pci_scan_one(int dev_pci_fd, struct pci_conf *conf) return 0; skipdev: - free(dev); + free(pdev); return 0; } diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 0dc99e9cb2..6dbba10657 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -218,22 +218,27 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) { char filename[PATH_MAX]; unsigned long tmp; + struct rte_pci_device_internal *pdev; struct rte_pci_device *dev; char driver[PATH_MAX]; int ret; - dev = malloc(sizeof(*dev)); - if (dev == NULL) + pdev = malloc(sizeof(*pdev)); + if (pdev == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate memory for internal pci device\n"); return -1; + } + + memset(pdev, 0, sizeof(*pdev)); - memset(dev, 0, sizeof(*dev)); + dev = &pdev->device; dev->device.bus = &rte_pci_bus.bus; dev->addr = *addr; /* get vendor id */ snprintf(filename, sizeof(filename), "%s/vendor", dirname); if (eal_parse_sysfs_value(filename, &tmp) < 0) { - free(dev); + free(pdev); return -1; } dev->id.vendor_id = (uint16_t)tmp; @@ -241,7 +246,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) /* get device id */ snprintf(filename, sizeof(filename), "%s/device", dirname); if (eal_parse_sysfs_value(filename, &tmp) < 0) { - free(dev); + free(pdev); return -1; } dev->id.device_id = (uint16_t)tmp; @@ -250,7 +255,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) snprintf(filename, sizeof(filename), "%s/subsystem_vendor", dirname); if (eal_parse_sysfs_value(filename, &tmp) < 0) { - free(dev); + free(pdev); return -1; } dev->id.subsystem_vendor_id = (uint16_t)tmp; @@ -259,7 +264,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) snprintf(filename, sizeof(filename), "%s/subsystem_device", dirname); if (eal_parse_sysfs_value(filename, &tmp) < 0) { - free(dev); + 
free(pdev); return -1; } dev->id.subsystem_device_id = (uint16_t)tmp; @@ -268,7 +273,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) snprintf(filename, sizeof(filename), "%s/class", dirname); if (eal_parse_sysfs_value(filename, &tmp) < 0) { - free(dev); + free(pdev); return -1; } /* the least 24 bits are valid: class, subclass, program interface */ @@ -308,7 +313,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) snprintf(filename, sizeof(filename), "%s/resource", dirname); if (pci_parse_sysfs_resource(filename, dev) < 0) { RTE_LOG(ERR, EAL, "%s(): cannot parse resource\n", __func__); - free(dev); + free(pdev); return -1; } @@ -317,7 +322,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver)); if (ret < 0) { RTE_LOG(ERR, EAL, "Fail to get kernel driver\n"); - free(dev); + free(pdev); return -1; } @@ -386,7 +391,7 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) pci_name_set(dev2); } } - free(dev); + free(pdev); } return 0; } diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c index ee7f966358..1c368c254c 100644 --- a/drivers/bus/pci/pci_common.c +++ b/drivers/bus/pci/pci_common.c @@ -571,7 +571,7 @@ pci_unplug(struct rte_device *dev) if (ret == 0) { rte_pci_remove_device(pdev); rte_devargs_remove(dev->devargs); - free(pdev); + free(RTE_PCI_DEVICE_INTERNAL(pdev)); } return ret; } diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h index 4cd9d14ec7..49a29d45cf 100644 --- a/drivers/bus/pci/private.h +++ b/drivers/bus/pci/private.h @@ -12,11 +12,23 @@ #include #include +/* + * Convert struct rte_pci_device to struct rte_pci_device_internal + */ +#define RTE_PCI_DEVICE_INTERNAL(ptr) \ + container_of(ptr, struct rte_pci_device_internal, device) +#define RTE_PCI_DEVICE_INTERNAL_CONST(ptr) \ + container_of(ptr, const struct rte_pci_device_internal, device) + extern struct rte_pci_bus 
rte_pci_bus; struct rte_pci_driver; struct rte_pci_device; +struct rte_pci_device_internal { + struct rte_pci_device device; +}; + /** * Scan the content of the PCI bus, and the devices in the devices * list From patchwork Tue Jun 1 03:06:40 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chenbo Xia X-Patchwork-Id: 93679 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 288EFA0524; Tue, 1 Jun 2021 05:17:46 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 1682F40DF7; Tue, 1 Jun 2021 05:17:46 +0200 (CEST) Received: from mga07.intel.com (mga07.intel.com [134.134.136.100]) by mails.dpdk.org (Postfix) with ESMTP id 3970140040 for ; Tue, 1 Jun 2021 05:17:44 +0200 (CEST) IronPort-SDR: iJJi2XcnrGBMTsoW9PAvTU16UY3Af5VkHlm+Lj/dUamAGZifjOt8rHq1gUpLKB+YtOUNW4oPkO n2iaVLrBhslg== X-IronPort-AV: E=McAfee;i="6200,9189,10001"; a="267339139" X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="267339139" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 31 May 2021 20:17:43 -0700 IronPort-SDR: 6Aodv40XfJIh8masBc+PWu4B4keVaSrlV8DoFtzSbLDPjVZHHeRrwGjiMFSqscsiZf7dcer3gH JObLKpw4rc7g== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="482315408" Received: from npg-dpdk-virtio-xiachenbo-nw.sh.intel.com ([10.67.118.250]) by fmsmga002.fm.intel.com with ESMTP; 31 May 2021 20:17:35 -0700 From: Chenbo Xia To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com, jingjing.wu@intel.com Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu, nhorman@tuxdriver.com, bruce.richardson@intel.com, david.marchand@redhat.com, 
stephen@networkplumber.org, konstantin.ananyev@intel.com, Tiwei Bie Date: Tue, 1 Jun 2021 11:06:40 +0800 Message-Id: <20210601030644.3318-3-chenbo.xia@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com> References: <20190715075214.16616-6-tiwei.bie@intel.com> <20210601030644.3318-1-chenbo.xia@intel.com> Subject: [dpdk-dev] [RFC v3 2/6] bus/pci: avoid depending on private value in kernel source X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From: Tiwei Bie The value 40 used in VFIO_GET_REGION_ADDR() is a private value (VFIO_PCI_OFFSET_SHIFT) defined in Linux kernel source [1]. It is not part of VFIO API, and we should not depend on it. [1] https://github.com/torvalds/linux/blob/v5.12/drivers/vfio/pci/vfio_pci_private.h Signed-off-by: Tiwei Bie --- drivers/bus/pci/linux/pci.c | 4 +- drivers/bus/pci/linux/pci_init.h | 4 +- drivers/bus/pci/linux/pci_vfio.c | 176 ++++++++++++++++++++++++------- drivers/bus/pci/private.h | 9 ++ 4 files changed, 153 insertions(+), 40 deletions(-) diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 6dbba10657..8f1fddbf20 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -647,7 +647,7 @@ int rte_pci_read_config(const struct rte_pci_device *device, return pci_uio_read_config(intr_handle, buf, len, offset); #ifdef VFIO_PRESENT case RTE_PCI_KDRV_VFIO: - return pci_vfio_read_config(intr_handle, buf, len, offset); + return pci_vfio_read_config(device, buf, len, offset); #endif default: rte_pci_device_name(&device->addr, devname, @@ -671,7 +671,7 @@ int rte_pci_write_config(const struct rte_pci_device *device, return pci_uio_write_config(intr_handle, buf, len, offset); #ifdef VFIO_PRESENT case RTE_PCI_KDRV_VFIO: - return pci_vfio_write_config(intr_handle, buf, len, 
offset); + return pci_vfio_write_config(device, buf, len, offset); #endif default: rte_pci_device_name(&device->addr, devname, diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h index dcea726186..9f6659ba6e 100644 --- a/drivers/bus/pci/linux/pci_init.h +++ b/drivers/bus/pci/linux/pci_init.h @@ -66,9 +66,9 @@ int pci_uio_ioport_unmap(struct rte_pci_ioport *p); #endif /* access config space */ -int pci_vfio_read_config(const struct rte_intr_handle *intr_handle, +int pci_vfio_read_config(const struct rte_pci_device *dev, void *buf, size_t len, off_t offs); -int pci_vfio_write_config(const struct rte_intr_handle *intr_handle, +int pci_vfio_write_config(const struct rte_pci_device *dev, const void *buf, size_t len, off_t offs); int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index 07706f7338..012e7f72c1 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -43,35 +43,82 @@ static struct rte_tailq_elem rte_vfio_tailq = { }; EAL_REGISTER_TAILQ(rte_vfio_tailq) +static int +pci_vfio_get_region(const struct rte_pci_device *dev, int index, + uint64_t *size, uint64_t *offset) +{ + const struct rte_pci_device_internal *pdev = + RTE_PCI_DEVICE_INTERNAL_CONST(dev); + + if (index >= VFIO_PCI_NUM_REGIONS || index >= RTE_MAX_PCI_REGIONS) + return -1; + + if (pdev->region[index].size == 0 && pdev->region[index].offset == 0) + return -1; + + *size = pdev->region[index].size; + *offset = pdev->region[index].offset; + + return 0; +} + int -pci_vfio_read_config(const struct rte_intr_handle *intr_handle, +pci_vfio_read_config(const struct rte_pci_device *dev, void *buf, size_t len, off_t offs) { - return pread64(intr_handle->vfio_dev_fd, buf, len, - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); + uint64_t size, offset; + int fd; + + fd = dev->intr_handle.vfio_dev_fd; + + if (pci_vfio_get_region(dev, 
VFIO_PCI_CONFIG_REGION_INDEX, + &size, &offset) != 0) + return -1; + + if ((uint64_t)len + offs > size) + return -1; + + return pread64(fd, buf, len, offset + offs); } int -pci_vfio_write_config(const struct rte_intr_handle *intr_handle, +pci_vfio_write_config(const struct rte_pci_device *dev, const void *buf, size_t len, off_t offs) { - return pwrite64(intr_handle->vfio_dev_fd, buf, len, - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs); + uint64_t size, offset; + int fd; + + fd = dev->intr_handle.vfio_dev_fd; + + if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX, + &size, &offset) != 0) + return -1; + + if ((uint64_t)len + offs > size) + return -1; + + return pwrite64(fd, buf, len, offset + offs); } /* get PCI BAR number where MSI-X interrupts are */ static int -pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) +pci_vfio_get_msix_bar(const struct rte_pci_device *dev, int fd, + struct pci_msix_table *msix_table) { int ret; uint32_t reg; uint16_t flags; uint8_t cap_id, cap_offset; + uint64_t size, offset; + + if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX, + &size, &offset) != 0) { + RTE_LOG(ERR, EAL, "Cannot get offset of CONFIG region.\n"); + return -1; + } /* read PCI capability pointer from config space */ - ret = pread64(fd, &reg, sizeof(reg), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - PCI_CAPABILITY_LIST); + ret = pread64(fd, &reg, sizeof(reg), offset + PCI_CAPABILITY_LIST); if (ret != sizeof(reg)) { RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI config space!\n"); @@ -84,9 +131,7 @@ pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) while (cap_offset) { /* read PCI capability ID */ - ret = pread64(fd, &reg, sizeof(reg), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - cap_offset); + ret = pread64(fd, &reg, sizeof(reg), offset + cap_offset); if (ret != sizeof(reg)) { RTE_LOG(ERR, EAL, "Cannot read capability ID from PCI config space!\n"); @@ -99,8 +144,7 @@
pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) /* if we haven't reached MSI-X, check next capability */ if (cap_id != PCI_CAP_ID_MSIX) { ret = pread64(fd, &reg, sizeof(reg), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - cap_offset); + offset + cap_offset); if (ret != sizeof(reg)) { RTE_LOG(ERR, EAL, "Cannot read capability pointer from PCI config space!\n"); @@ -116,8 +160,7 @@ pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) else { /* table offset resides in the next 4 bytes */ ret = pread64(fd, &reg, sizeof(reg), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - cap_offset + 4); + offset + cap_offset + 4); if (ret != sizeof(reg)) { RTE_LOG(ERR, EAL, "Cannot read table offset from PCI config space!\n"); @@ -125,8 +168,7 @@ pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) } ret = pread64(fd, &flags, sizeof(flags), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - cap_offset + 2); + offset + cap_offset + 2); if (ret != sizeof(flags)) { RTE_LOG(ERR, EAL, "Cannot read table flags from PCI config space!\n"); @@ -178,14 +220,19 @@ pci_vfio_enable_bus_memory(int dev_fd) /* set PCI bus mastering */ static int -pci_vfio_set_bus_master(int dev_fd, bool op) +pci_vfio_set_bus_master(const struct rte_pci_device *dev, int dev_fd, bool op) { + uint64_t size, offset; uint16_t reg; int ret; - ret = pread64(dev_fd, &reg, sizeof(reg), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - PCI_COMMAND); + if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX, + &size, &offset) != 0) { + RTE_LOG(ERR, EAL, "Cannot get offset of CONFIG region.\n"); + return -1; + } + + ret = pread64(dev_fd, &reg, sizeof(reg), offset + PCI_COMMAND); if (ret != sizeof(reg)) { RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); return -1; @@ -197,10 +244,7 @@ pci_vfio_set_bus_master(int dev_fd, bool op) else reg &= ~(PCI_COMMAND_MASTER); - ret = pwrite64(dev_fd, &reg, sizeof(reg), -
VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + - PCI_COMMAND); - + ret = pwrite64(dev_fd, &reg, sizeof(reg), offset + PCI_COMMAND); if (ret != sizeof(reg)) { RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); return -1; @@ -429,14 +473,21 @@ pci_vfio_disable_notifier(struct rte_pci_device *dev) #endif static int -pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index) +pci_vfio_is_ioport_bar(const struct rte_pci_device *dev, int vfio_dev_fd, + int bar_index) { + uint64_t size, offset; uint32_t ioport_bar; int ret; + if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX, + &size, &offset) != 0) { + RTE_LOG(ERR, EAL, "Cannot get offset of CONFIG region.\n"); + return -1; + } + ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar), - VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) - + PCI_BASE_ADDRESS_0 + bar_index*4); + offset + PCI_BASE_ADDRESS_0 + bar_index * 4); if (ret != sizeof(ioport_bar)) { RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n", PCI_BASE_ADDRESS_0 + bar_index*4); @@ -460,7 +511,7 @@ pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) } /* set bus mastering for the device */ - if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { + if (pci_vfio_set_bus_master(dev, vfio_dev_fd, true)) { RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); return -1; } @@ -690,11 +741,40 @@ pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) return ret; } +static int +pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd, + struct vfio_device_info *device_info) +{ + struct rte_pci_device_internal *pdev = RTE_PCI_DEVICE_INTERNAL(dev); + struct vfio_region_info *reg = NULL; + int nb_maps, i, ret; + + nb_maps = RTE_MIN((int)device_info->num_regions, + VFIO_PCI_CONFIG_REGION_INDEX + 1); + + for (i = 0; i < nb_maps; i++) { + ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s cannot get device region info error %i (%s)\n", + dev->name, errno,
strerror(errno)); + return -1; + } + + pdev->region[i].size = reg->size; + pdev->region[i].offset = reg->offset; + + free(reg); + } + + return 0; +} static int pci_vfio_map_resource_primary(struct rte_pci_device *dev) { + struct rte_pci_device_internal *pdev = RTE_PCI_DEVICE_INTERNAL(dev); struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + struct vfio_region_info *reg = NULL; char pci_addr[PATH_MAX] = {0}; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; @@ -735,11 +815,22 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) /* map BARs */ maps = vfio_res->maps; + ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, + VFIO_PCI_CONFIG_REGION_INDEX); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s cannot get device region info error %i (%s)\n", + dev->name, errno, strerror(errno)); + goto err_vfio_res; + } + pdev->region[VFIO_PCI_CONFIG_REGION_INDEX].size = reg->size; + pdev->region[VFIO_PCI_CONFIG_REGION_INDEX].offset = reg->offset; + free(reg); + vfio_res->msix_table.bar_index = -1; /* get MSI-X BAR, if any (we have to know where it is because we can't * easily mmap it when using VFIO) */ - ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table); + ret = pci_vfio_get_msix_bar(dev, vfio_dev_fd, &vfio_res->msix_table); if (ret < 0) { RTE_LOG(ERR, EAL, "%s cannot get MSI-X BAR number!\n", pci_addr); @@ -760,7 +851,6 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) } for (i = 0; i < vfio_res->nb_maps; i++) { - struct vfio_region_info *reg = NULL; void *bar_addr; ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i); @@ -771,8 +861,11 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) goto err_vfio_res; } + pdev->region[i].size = reg->size; + pdev->region[i].offset = reg->offset; + /* chk for io port region */ - ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); + ret = pci_vfio_is_ioport_bar(dev, vfio_dev_fd, i); if (ret < 0) { free(reg); goto err_vfio_res; @@ -882,6 +975,10 @@ pci_vfio_map_resource_secondary(struct
rte_pci_device *dev) if (ret) return ret; + ret = pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info); + if (ret) + return ret; + /* map BARs */ maps = vfio_res->maps; @@ -988,7 +1085,7 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) return -1; } - if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) { + if (pci_vfio_set_bus_master(dev, dev->intr_handle.vfio_dev_fd, false)) { RTE_LOG(ERR, EAL, "%s cannot unset bus mastering for PCI device!\n", pci_addr); return -1; @@ -1064,14 +1161,21 @@ int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p) { + uint64_t size, offset; + if (bar < VFIO_PCI_BAR0_REGION_INDEX || bar > VFIO_PCI_BAR5_REGION_INDEX) { RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar); return -1; } + if (pci_vfio_get_region(dev, bar, &size, &offset) != 0) { + RTE_LOG(ERR, EAL, "Cannot get offset of region %d.\n", bar); + return -1; + } + p->dev = dev; - p->base = VFIO_GET_REGION_ADDR(bar); + p->base = offset; return 0; } diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h index 49a29d45cf..8b5fa70641 100644 --- a/drivers/bus/pci/private.h +++ b/drivers/bus/pci/private.h @@ -12,6 +12,8 @@ #include #include +#define RTE_MAX_PCI_REGIONS 9 + /* * Convert struct rte_pci_device to struct rte_pci_device_internal */ @@ -25,8 +27,15 @@ extern struct rte_pci_bus rte_pci_bus; struct rte_pci_driver; struct rte_pci_device; +struct rte_pci_region { + uint64_t size; + uint64_t offset; +}; + struct rte_pci_device_internal { struct rte_pci_device device; + /* PCI regions provided by e.g. VFIO. 
*/ + struct rte_pci_region region[RTE_MAX_PCI_REGIONS]; }; /** From patchwork Tue Jun 1 03:06:41 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chenbo Xia X-Patchwork-Id: 93680 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id CFB62A0524; Tue, 1 Jun 2021 05:17:52 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id B4D2240E78; Tue, 1 Jun 2021 05:17:52 +0200 (CEST) Received: from mga05.intel.com (mga05.intel.com [192.55.52.43]) by mails.dpdk.org (Postfix) with ESMTP id A8FC840E6E for ; Tue, 1 Jun 2021 05:17:51 +0200 (CEST) IronPort-SDR: a91HO7d+UlykVIiecTt19buFTltM+uyjbmSO13zWU+WMU+zAWNOejFUZR6A9NgLZLXvxoDwiFU Bn9xUy1dciEw== X-IronPort-AV: E=McAfee;i="6200,9189,10001"; a="289079291" X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="289079291" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 31 May 2021 20:17:50 -0700 IronPort-SDR: 16VRn3utpkvMzx/dsp2k/SWjPAF2FF7I36QUXOdt+IwzgNbT//P69BOPaC1kRWNHTo3yug58Jg xdXL9FwGwWjA== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="482315443" Received: from npg-dpdk-virtio-xiachenbo-nw.sh.intel.com ([10.67.118.250]) by fmsmga002.fm.intel.com with ESMTP; 31 May 2021 20:17:44 -0700 From: Chenbo Xia To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com, jingjing.wu@intel.com Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu, nhorman@tuxdriver.com, bruce.richardson@intel.com, david.marchand@redhat.com, stephen@networkplumber.org, konstantin.ananyev@intel.com, Tiwei Bie Date: Tue, 1 Jun 2021 11:06:41 +0800 Message-Id: <20210601030644.3318-4-chenbo.xia@intel.com> 
X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com> References: <20190715075214.16616-6-tiwei.bie@intel.com> <20210601030644.3318-1-chenbo.xia@intel.com> Subject: [dpdk-dev] [RFC v3 3/6] bus/pci: introduce helper for MMIO read and write X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From: Tiwei Bie The MMIO regions may not be mmap-able for mediated PCI device. In this case, the application should explicitly do read and write to access these regions. Signed-off-by: Tiwei Bie --- drivers/bus/pci/bsd/pci.c | 22 +++++++++++++++ drivers/bus/pci/linux/pci.c | 46 ++++++++++++++++++++++++++++++ drivers/bus/pci/linux/pci_init.h | 10 +++++++ drivers/bus/pci/linux/pci_uio.c | 22 +++++++++++++++ drivers/bus/pci/linux/pci_vfio.c | 36 ++++++++++++++++++++++++ drivers/bus/pci/rte_bus_pci.h | 48 ++++++++++++++++++++++++++++++++ drivers/bus/pci/version.map | 4 +++ 7 files changed, 188 insertions(+) diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c index 20ce979f60..781f65c637 100644 --- a/drivers/bus/pci/bsd/pci.c +++ b/drivers/bus/pci/bsd/pci.c @@ -494,6 +494,28 @@ int rte_pci_write_config(const struct rte_pci_device *dev, return -1; } +/* Read PCI MMIO space. */ +int rte_pci_mmio_read(const struct rte_pci_device *dev, int bar, + void *buf, size_t len, off_t offset) +{ + if (bar >= PCI_MAX_RESOURCE || dev->mem_resource[bar].addr == NULL || + (uint64_t)offset + len > dev->mem_resource[bar].len) + return -1; + memcpy(buf, (uint8_t *)dev->mem_resource[bar].addr + offset, len); + return len; +} + +/* Write PCI MMIO space. 
*/ +int rte_pci_mmio_write(const struct rte_pci_device *dev, int bar, + const void *buf, size_t len, off_t offset) +{ + if (bar >= PCI_MAX_RESOURCE || dev->mem_resource[bar].addr == NULL || + (uint64_t)offset + len > dev->mem_resource[bar].len) + return -1; + memcpy((uint8_t *)dev->mem_resource[bar].addr + offset, buf, len); + return len; +} + int rte_pci_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p) diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 8f1fddbf20..4805f277c5 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -682,6 +682,52 @@ int rte_pci_write_config(const struct rte_pci_device *device, } } +/* Read PCI MMIO space. */ +int rte_pci_mmio_read(const struct rte_pci_device *device, int bar, + void *buf, size_t len, off_t offset) +{ + char devname[RTE_DEV_NAME_MAX_LEN] = ""; + + switch (device->kdrv) { + case RTE_PCI_KDRV_IGB_UIO: + case RTE_PCI_KDRV_UIO_GENERIC: + return pci_uio_mmio_read(device, bar, buf, len, offset); +#ifdef VFIO_PRESENT + case RTE_PCI_KDRV_VFIO: + return pci_vfio_mmio_read(device, bar, buf, len, offset); +#endif + default: + rte_pci_device_name(&device->addr, devname, + RTE_DEV_NAME_MAX_LEN); + RTE_LOG(ERR, EAL, + "Unknown driver type for %s\n", devname); + return -1; + } +} + +/* Write PCI MMIO space. 
*/ +int rte_pci_mmio_write(const struct rte_pci_device *device, int bar, + const void *buf, size_t len, off_t offset) +{ + char devname[RTE_DEV_NAME_MAX_LEN] = ""; + + switch (device->kdrv) { + case RTE_PCI_KDRV_IGB_UIO: + case RTE_PCI_KDRV_UIO_GENERIC: + return pci_uio_mmio_write(device, bar, buf, len, offset); +#ifdef VFIO_PRESENT + case RTE_PCI_KDRV_VFIO: + return pci_vfio_mmio_write(device, bar, buf, len, offset); +#endif + default: + rte_pci_device_name(&device->addr, devname, + RTE_DEV_NAME_MAX_LEN); + RTE_LOG(ERR, EAL, + "Unknown driver type for %s\n", devname); + return -1; + } +} + int rte_pci_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p) diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h index 9f6659ba6e..6853fa88a3 100644 --- a/drivers/bus/pci/linux/pci_init.h +++ b/drivers/bus/pci/linux/pci_init.h @@ -37,6 +37,11 @@ int pci_uio_read_config(const struct rte_intr_handle *intr_handle, int pci_uio_write_config(const struct rte_intr_handle *intr_handle, const void *buf, size_t len, off_t offs); +int pci_uio_mmio_read(const struct rte_pci_device *dev, int bar, + void *buf, size_t len, off_t offset); +int pci_uio_mmio_write(const struct rte_pci_device *dev, int bar, + const void *buf, size_t len, off_t offset); + int pci_uio_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p); void pci_uio_ioport_read(struct rte_pci_ioport *p, @@ -71,6 +76,11 @@ int pci_vfio_read_config(const struct rte_pci_device *dev, int pci_vfio_write_config(const struct rte_pci_device *dev, const void *buf, size_t len, off_t offs); +int pci_vfio_mmio_read(const struct rte_pci_device *dev, int bar, + void *buf, size_t len, off_t offset); +int pci_vfio_mmio_write(const struct rte_pci_device *dev, int bar, + const void *buf, size_t len, off_t offset); + int pci_vfio_ioport_map(struct rte_pci_device *dev, int bar, struct rte_pci_ioport *p); void pci_vfio_ioport_read(struct rte_pci_ioport *p, diff --git 
a/drivers/bus/pci/linux/pci_uio.c b/drivers/bus/pci/linux/pci_uio.c index 39ebeac2a0..2482635058 100644 --- a/drivers/bus/pci/linux/pci_uio.c +++ b/drivers/bus/pci/linux/pci_uio.c @@ -45,6 +45,28 @@ pci_uio_write_config(const struct rte_intr_handle *intr_handle, return pwrite(intr_handle->uio_cfg_fd, buf, len, offset); } +int +pci_uio_mmio_read(const struct rte_pci_device *dev, int bar, + void *buf, size_t len, off_t offset) +{ + if (bar >= PCI_MAX_RESOURCE || dev->mem_resource[bar].addr == NULL || + (uint64_t)offset + len > dev->mem_resource[bar].len) + return -1; + memcpy(buf, (uint8_t *)dev->mem_resource[bar].addr + offset, len); + return len; +} + +int +pci_uio_mmio_write(const struct rte_pci_device *dev, int bar, + const void *buf, size_t len, off_t offset) +{ + if (bar >= PCI_MAX_RESOURCE || dev->mem_resource[bar].addr == NULL || + (uint64_t)offset + len > dev->mem_resource[bar].len) + return -1; + memcpy((uint8_t *)dev->mem_resource[bar].addr + offset, buf, len); + return len; +} + static int pci_uio_set_bus_master(int dev_fd) { diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index 012e7f72c1..3ecd984215 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -1212,6 +1212,42 @@ pci_vfio_ioport_unmap(struct rte_pci_ioport *p) return -1; } +int +pci_vfio_mmio_read(const struct rte_pci_device *dev, int bar, + void *buf, size_t len, off_t offs) +{ + uint64_t size, offset; + int fd; + + fd = dev->intr_handle.vfio_dev_fd; + + if (pci_vfio_get_region(dev, bar, &size, &offset) != 0) + return -1; + + if ((uint64_t)len + offs > size) + return -1; + + return pread64(fd, buf, len, offset + offs); +} + +int +pci_vfio_mmio_write(const struct rte_pci_device *dev, int bar, + const void *buf, size_t len, off_t offs) +{ + uint64_t size, offset; + int fd; + + fd = dev->intr_handle.vfio_dev_fd; + + if (pci_vfio_get_region(dev, bar, &size, &offset) != 0) + return -1; + + if ((uint64_t)len + offs > size) + return 
-1; + + return pwrite64(fd, buf, len, offset + offs); +} + int pci_vfio_is_enabled(void) { diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h index 64886b4731..dc26811b0a 100644 --- a/drivers/bus/pci/rte_bus_pci.h +++ b/drivers/bus/pci/rte_bus_pci.h @@ -310,6 +310,54 @@ int rte_pci_read_config(const struct rte_pci_device *device, int rte_pci_write_config(const struct rte_pci_device *device, const void *buf, size_t len, off_t offset); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Read from an MMIO pci resource. + * + * @param device + * A pointer to a rte_pci_device structure describing the device + * to use + * @param bar + * Index of the MMIO pci resource (BAR) we want to access. + * @param buf + * A data buffer where the bytes should be read into + * @param len + * The length of the data buffer. + * @param offset + * The offset into MMIO space described by @p bar + * @return + * Number of bytes read on success, negative on error. + */ +__rte_experimental +int rte_pci_mmio_read(const struct rte_pci_device *device, int bar, + void *buf, size_t len, off_t offset); + +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Write to an MMIO pci resource. + * + * @param device + * A pointer to a rte_pci_device structure describing the device + * to use + * @param bar + * Index of the MMIO pci resource (BAR) we want to access. + * @param buf + * A data buffer containing the bytes that should be written + * @param len + * The length of the data buffer. + * @param offset + * The offset into MMIO space described by @p bar + * @return + * Number of bytes written on success, negative on error. + */ +__rte_experimental +int rte_pci_mmio_write(const struct rte_pci_device *device, int bar, + const void *buf, size_t len, off_t offset); + /** * A structure used to access io resources for a pci device.
* rte_pci_ioport is arch, os, driver specific, and should not be used outside diff --git a/drivers/bus/pci/version.map b/drivers/bus/pci/version.map index f33ed0abd1..02e4219aab 100644 --- a/drivers/bus/pci/version.map +++ b/drivers/bus/pci/version.map @@ -21,4 +21,8 @@ EXPERIMENTAL { global: rte_pci_find_ext_capability; + + # added in 21.08 + rte_pci_mmio_read; + rte_pci_mmio_write; }; From patchwork Tue Jun 1 03:06:42 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chenbo Xia X-Patchwork-Id: 93681 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id F27DBA0524; Tue, 1 Jun 2021 05:18:01 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id E02F640E64; Tue, 1 Jun 2021 05:18:01 +0200 (CEST) Received: from mga05.intel.com (mga05.intel.com [192.55.52.43]) by mails.dpdk.org (Postfix) with ESMTP id B7F6040041 for ; Tue, 1 Jun 2021 05:18:00 +0200 (CEST) IronPort-SDR: FWJdBM8wRcovLnTcMC7atDNZk+3TXagjFzUX1ZNfO5wLbFlCcljwaXUfk1pC9G6gqH8QJlvbVN nqadjeur+NUA== X-IronPort-AV: E=McAfee;i="6200,9189,10001"; a="289079337" X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="289079337" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 31 May 2021 20:17:59 -0700 IronPort-SDR: +MJuUDgC2I4+WO0/p93GNXzhreq3LyP0hRGRJ9KI9UMA/kdFTlcb6oNHBKZ6Z896cOITJDZAcP Lb2VsDl0viOA== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="482315466" Received: from npg-dpdk-virtio-xiachenbo-nw.sh.intel.com ([10.67.118.250]) by fmsmga002.fm.intel.com with ESMTP; 31 May 2021 20:17:53 -0700 From: Chenbo Xia To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com, 
jingjing.wu@intel.com Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu, nhorman@tuxdriver.com, bruce.richardson@intel.com, david.marchand@redhat.com, stephen@networkplumber.org, konstantin.ananyev@intel.com, Tiwei Bie Date: Tue, 1 Jun 2021 11:06:42 +0800 Message-Id: <20210601030644.3318-5-chenbo.xia@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com> References: <20190715075214.16616-6-tiwei.bie@intel.com> <20210601030644.3318-1-chenbo.xia@intel.com> Subject: [dpdk-dev] [RFC v3 4/6] eal: add a helper for reading string from sysfs X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From: Tiwei Bie This patch adds a helper for reading a string from sysfs. Signed-off-by: Cunming Liang Signed-off-by: Tiwei Bie --- lib/eal/common/eal_filesystem.h | 10 ++++++++++ lib/eal/freebsd/eal.c | 22 ++++++++++++++++++++++ lib/eal/linux/eal.c | 22 ++++++++++++++++++++++ lib/eal/version.map | 3 +++ 4 files changed, 57 insertions(+) diff --git a/lib/eal/common/eal_filesystem.h b/lib/eal/common/eal_filesystem.h index 5d21f07c20..be4c51ebb2 100644 --- a/lib/eal/common/eal_filesystem.h +++ b/lib/eal/common/eal_filesystem.h @@ -104,4 +104,14 @@ eal_get_hugefile_path(char *buffer, size_t buflen, const char *hugedir, int f_id * Used to read information from files on /sys */ int eal_parse_sysfs_value(const char *filename, unsigned long *val); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Function to read a line from a file on the filesystem.
+ * Used to read information from files on /sys + */ +__rte_experimental +int rte_eal_parse_sysfs_str(const char *filename, char *buf, unsigned long sz); + #endif /* EAL_FILESYSTEM_H */ diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c index f4d1676754..002f07f4da 100644 --- a/lib/eal/freebsd/eal.c +++ b/lib/eal/freebsd/eal.c @@ -169,6 +169,28 @@ eal_parse_sysfs_value(const char *filename, unsigned long *val) return 0; } +int +rte_eal_parse_sysfs_str(const char *filename, char *buf, unsigned long sz) +{ + FILE *f; + + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs file %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sz, f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs file %s\n", + __func__, filename); + fclose(f); + return -1; + } + + fclose(f); + return 0; +} /* create memory configuration in shared/mmap memory. Take out * a write lock on the memsegs, so we can auto-detect primary/secondary. diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c index ba19fc6347..d5917a48ca 100644 --- a/lib/eal/linux/eal.c +++ b/lib/eal/linux/eal.c @@ -260,6 +260,28 @@ eal_parse_sysfs_value(const char *filename, unsigned long *val) return 0; } +int +rte_eal_parse_sysfs_str(const char *filename, char *buf, unsigned long sz) +{ + FILE *f; + + f = fopen(filename, "r"); + if (f == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs file %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sz, f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs file %s\n", + __func__, filename); + fclose(f); + return -1; + } + + fclose(f); + return 0; +} /* create memory configuration in shared/mmap memory. Take out * a write lock on the memsegs, so we can auto-detect primary/secondary. 
diff --git a/lib/eal/version.map b/lib/eal/version.map index fe5c3dac98..3d7fce26a4 100644 --- a/lib/eal/version.map +++ b/lib/eal/version.map @@ -423,6 +423,9 @@ EXPERIMENTAL { rte_version_release; # WINDOWS_NO_EXPORT rte_version_suffix; # WINDOWS_NO_EXPORT rte_version_year; # WINDOWS_NO_EXPORT + + # added in 21.08 + rte_eal_parse_sysfs_str; # WINDOWS_NO_EXPORT }; INTERNAL { From patchwork Tue Jun 1 03:06:43 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Chenbo Xia X-Patchwork-Id: 93682 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 32965A0524; Tue, 1 Jun 2021 05:18:10 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 1FD96410D7; Tue, 1 Jun 2021 05:18:10 +0200 (CEST) Received: from mga04.intel.com (mga04.intel.com [192.55.52.120]) by mails.dpdk.org (Postfix) with ESMTP id 483BA40040 for ; Tue, 1 Jun 2021 05:18:08 +0200 (CEST) IronPort-SDR: nX5bDwfAvsyx/i6HuzeUGBb1aiP9u+6Dy3mZLeHVnOI4JM6fFFy4CCLkmdcCXISOLSpofFUcpK VHXDcpsbzm9g== X-IronPort-AV: E=McAfee;i="6200,9189,10001"; a="201593041" X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="201593041" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga104.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 31 May 2021 20:18:07 -0700 IronPort-SDR: V7JPG4trP7Y4lXIOFGfeApbRK7zTN8Cd9WJDnew6B4sqEoqCMyN5UW+4aqz01GGfZapwJBTUGc x8vEULCpT2vg== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="482315491" Received: from npg-dpdk-virtio-xiachenbo-nw.sh.intel.com ([10.67.118.250]) by fmsmga002.fm.intel.com with ESMTP; 31 May 2021 20:18:01 -0700 From: Chenbo Xia To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com, jingjing.wu@intel.com 
Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu, nhorman@tuxdriver.com, bruce.richardson@intel.com, david.marchand@redhat.com, stephen@networkplumber.org, konstantin.ananyev@intel.com, Tiwei Bie Date: Tue, 1 Jun 2021 11:06:43 +0800 Message-Id: <20210601030644.3318-6-chenbo.xia@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20210601030644.3318-1-chenbo.xia@intel.com> References: <20190715075214.16616-6-tiwei.bie@intel.com> <20210601030644.3318-1-chenbo.xia@intel.com> MIME-Version: 1.0 Subject: [dpdk-dev] [RFC v3 5/6] bus/pci: add mdev support X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From: Tiwei Bie This patch adds mdev (mediated device) support to the PCI bus driver. With this patch, the PCI bus driver will be able to scan and probe the mediated PCI devices (i.e. the mediated devices whose device API is "vfio-pci") in the system.
There are several things different between physical PCI devices and mediated PCI devices: - Mediated PCI devices have to be accessed through VFIO API; - The regions in mediated PCI devices may not be mmap-able, and drivers need to call read/write function to access them in this case; - Mediated PCI devices use UUID as device address; Signed-off-by: Cunming Liang Signed-off-by: Tiwei Bie Signed-off-by: Chenbo Xia --- drivers/bus/pci/linux/pci.c | 30 ++- drivers/bus/pci/linux/pci_init.h | 15 +- drivers/bus/pci/linux/pci_vfio.c | 147 ++++++++++++-- drivers/bus/pci/linux/pci_vfio_mdev.c | 277 ++++++++++++++++++++++++++ drivers/bus/pci/meson.build | 1 + drivers/bus/pci/pci_common.c | 84 +++++--- drivers/bus/pci/pci_params.c | 36 +++- drivers/bus/pci/private.h | 17 ++ drivers/bus/pci/rte_bus_pci.h | 17 +- lib/eal/linux/eal.c | 17 +- 10 files changed, 571 insertions(+), 70 deletions(-) create mode 100644 drivers/bus/pci/linux/pci_vfio_mdev.c diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c index 4805f277c5..29dd9ba26f 100644 --- a/drivers/bus/pci/linux/pci.c +++ b/drivers/bus/pci/linux/pci.c @@ -30,7 +30,7 @@ extern struct rte_pci_bus rte_pci_bus; -static int +int pci_get_kernel_driver_by_path(const char *filename, char *dri_name, size_t len) { @@ -70,7 +70,7 @@ rte_pci_map_device(struct rte_pci_device *dev) switch (dev->kdrv) { case RTE_PCI_KDRV_VFIO: #ifdef VFIO_PRESENT - if (pci_vfio_is_enabled()) + if (pci_vfio_is_enabled(dev)) ret = pci_vfio_map_resource(dev); #endif break; @@ -99,7 +99,7 @@ rte_pci_unmap_device(struct rte_pci_device *dev) switch (dev->kdrv) { case RTE_PCI_KDRV_VFIO: #ifdef VFIO_PRESENT - if (pci_vfio_is_enabled()) + if (pci_vfio_is_enabled(dev)) pci_vfio_unmap_resource(dev); #endif break; @@ -347,6 +347,15 @@ pci_scan_one(const char *dirname, const struct rte_pci_addr *addr) int ret; TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) { + /* + * Insert physical PCI devices before all mediated + * PCI devices. 
+ */ + if (dev2->is_mdev) { + rte_pci_insert_device(dev2, dev); + return 0; + } + ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr); if (ret > 0) continue; @@ -465,8 +474,14 @@ rte_pci_scan(void) return 0; #ifdef VFIO_PRESENT - if (!pci_vfio_is_enabled()) - RTE_LOG(DEBUG, EAL, "VFIO PCI modules not loaded\n"); + if (!rte_vfio_is_enabled("vfio_pci")) + RTE_LOG(DEBUG, EAL, "VFIO PCI module not loaded\n"); + + if (!rte_vfio_is_enabled("vfio_mdev")) + RTE_LOG(DEBUG, EAL, "VFIO MDEV module not loaded\n"); + + if (pci_scan_mdev() != 0) + return -1; #endif dir = opendir(rte_pci_get_sysfs_path()); @@ -737,7 +752,7 @@ rte_pci_ioport_map(struct rte_pci_device *dev, int bar, switch (dev->kdrv) { #ifdef VFIO_PRESENT case RTE_PCI_KDRV_VFIO: - if (pci_vfio_is_enabled()) + if (pci_vfio_is_enabled(dev)) ret = pci_vfio_ioport_map(dev, bar, p); break; #endif @@ -801,8 +816,7 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p) switch (p->dev->kdrv) { #ifdef VFIO_PRESENT case RTE_PCI_KDRV_VFIO: - if (pci_vfio_is_enabled()) - ret = pci_vfio_ioport_unmap(p); + ret = -1; break; #endif case RTE_PCI_KDRV_IGB_UIO: diff --git a/drivers/bus/pci/linux/pci_init.h b/drivers/bus/pci/linux/pci_init.h index 6853fa88a3..0c0191b6d5 100644 --- a/drivers/bus/pci/linux/pci_init.h +++ b/drivers/bus/pci/linux/pci_init.h @@ -19,6 +19,9 @@ extern void *pci_map_addr; void *pci_find_max_end_va(void); +int pci_get_kernel_driver_by_path(const char *filename, char *dri_name, + size_t len); + /* parse one line of the "resource" sysfs file (note that the 'line' * string is modified) */ @@ -93,7 +96,17 @@ int pci_vfio_ioport_unmap(struct rte_pci_ioport *p); int pci_vfio_map_resource(struct rte_pci_device *dev); int pci_vfio_unmap_resource(struct rte_pci_device *dev); -int pci_vfio_is_enabled(void); +int pci_vfio_is_enabled(struct rte_pci_device *dev); + +int pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd, + struct vfio_device_info *device_info); + +int pci_vfio_get_pci_id(struct rte_pci_device *dev, 
int vfio_dev_fd, + struct rte_pci_id *pci_id); + +const char *pci_mdev_get_sysfs_path(void); + +int pci_scan_mdev(void); #endif diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index 3ecd984215..00ba5db03a 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "eal_filesystem.h" @@ -741,7 +742,7 @@ pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) return ret; } -static int +int pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd, struct vfio_device_info *device_info) { @@ -776,6 +777,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; struct vfio_region_info *reg = NULL; char pci_addr[PATH_MAX] = {0}; + const char *sysfs_base; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; int i, ret; @@ -791,11 +793,17 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) #endif /* store PCI address string */ - snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + if (dev->is_mdev) { + sysfs_base = pci_mdev_get_sysfs_path(); + rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr)); + } else { + sysfs_base = rte_pci_get_sysfs_path(); + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); + } - ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, - &vfio_dev_fd, &device_info); + ret = rte_vfio_setup_device(sysfs_base, pci_addr, &vfio_dev_fd, + &device_info); if (ret) return ret; @@ -806,7 +814,13 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) "Cannot store VFIO mmap details\n"); goto err_vfio_dev_fd; } - memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); + + vfio_res->is_mdev = dev->is_mdev; + if (dev->is_mdev) + memcpy(&vfio_res->uuid, &dev->uuid, sizeof(vfio_res->uuid)); + else + memcpy(&vfio_res->pci_addr, &dev->addr, + 
sizeof(vfio_res->pci_addr)); /* get number of registers (up to BAR5) */ vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, @@ -938,6 +952,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) { struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; char pci_addr[PATH_MAX] = {0}; + const char *sysfs_base; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; int i, ret; @@ -953,15 +968,29 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) #endif /* store PCI address string */ - snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + if (dev->is_mdev) { + sysfs_base = pci_mdev_get_sysfs_path(); + rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr)); + } else { + sysfs_base = rte_pci_get_sysfs_path(); + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); + } /* if we're in a secondary process, just find our tailq entry */ TAILQ_FOREACH(vfio_res, vfio_res_list, next) { - if (rte_pci_addr_cmp(&vfio_res->pci_addr, - &dev->addr)) + if (dev->is_mdev != vfio_res->is_mdev) continue; - break; + + if (!dev->is_mdev && !rte_pci_addr_cmp(&vfio_res->pci_addr, + &dev->addr)) + break; + + if (dev->is_mdev && !rte_uuid_compare(vfio_res->uuid, + dev->uuid)) + break; + + continue; } /* if we haven't found our tailq entry, something's wrong */ if (vfio_res == NULL) { @@ -970,8 +999,8 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) return -1; } - ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, - &vfio_dev_fd, &device_info); + ret = rte_vfio_setup_device(sysfs_base, pci_addr, &vfio_dev_fd, + &device_info); if (ret) return ret; @@ -1030,9 +1059,18 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, /* Get vfio_res */ TAILQ_FOREACH(vfio_res, vfio_res_list, next) { - if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr)) + if (dev->is_mdev != vfio_res->is_mdev) continue; - break; + + if (!dev->is_mdev && 
!rte_pci_addr_cmp(&vfio_res->pci_addr, + &dev->addr)) + break; + + if (dev->is_mdev && !rte_uuid_compare(vfio_res->uuid, + dev->uuid)) + break; + + continue; } if (vfio_res == NULL) @@ -1061,6 +1099,7 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, static int pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) { + const char *sysfs_base; char pci_addr[PATH_MAX] = {0}; struct rte_pci_addr *loc = &dev->addr; struct mapped_pci_resource *vfio_res = NULL; @@ -1068,8 +1107,14 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) int ret; /* store PCI address string */ - snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + if (dev->is_mdev) { + sysfs_base = pci_mdev_get_sysfs_path(); + rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr)); + } else { + sysfs_base = rte_pci_get_sysfs_path(); + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); + } #ifdef HAVE_VFIO_DEV_REQ_INTERFACE ret = pci_vfio_disable_notifier(dev); @@ -1091,8 +1136,8 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) return -1; } - ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, - dev->intr_handle.vfio_dev_fd); + ret = rte_vfio_release_device(sysfs_base, pci_addr, + dev->intr_handle.vfio_dev_fd); if (ret < 0) { RTE_LOG(ERR, EAL, "Cannot release VFIO device\n"); return ret; @@ -1117,6 +1162,7 @@ pci_vfio_unmap_resource_primary(struct rte_pci_device *dev) static int pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev) { + const char *sysfs_base; char pci_addr[PATH_MAX] = {0}; struct rte_pci_addr *loc = &dev->addr; struct mapped_pci_resource *vfio_res = NULL; @@ -1124,11 +1170,17 @@ pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev) int ret; /* store PCI address string */ - snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, + if (dev->is_mdev) { + sysfs_base = pci_mdev_get_sysfs_path(); + rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr)); + } else { + 
sysfs_base = rte_pci_get_sysfs_path(); + snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); + } - ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr, - dev->intr_handle.vfio_dev_fd); + ret = rte_vfio_release_device(sysfs_base, pci_addr, + dev->intr_handle.vfio_dev_fd); if (ret < 0) { RTE_LOG(ERR, EAL, "Cannot release VFIO device\n"); return ret; @@ -1249,8 +1301,61 @@ pci_vfio_mmio_write(const struct rte_pci_device *dev, int bar, } int -pci_vfio_is_enabled(void) +pci_vfio_is_enabled(struct rte_pci_device *dev) { - return rte_vfio_is_enabled("vfio_pci"); + return rte_vfio_is_enabled(dev->is_mdev ? "vfio_mdev" : "vfio_pci"); } + +int +pci_vfio_get_pci_id(struct rte_pci_device *dev, int vfio_dev_fd, + struct rte_pci_id *pci_id) +{ + uint64_t size, offset; + int class; + + if (pci_vfio_get_region(dev, VFIO_PCI_CONFIG_REGION_INDEX, + &size, &offset) != 0) { + RTE_LOG(DEBUG, EAL, "Cannot get offset of CONFIG region.\n"); + return -1; + } + + /* vendor_id */ + if (pread64(vfio_dev_fd, &pci_id->vendor_id, sizeof(uint16_t), + offset + PCI_VENDOR_ID) != sizeof(uint16_t)) { + RTE_LOG(DEBUG, EAL, "Cannot read VendorID from PCI config space\n"); + return -1; + } + + /* device_id */ + if (pread64(vfio_dev_fd, &pci_id->device_id, sizeof(uint16_t), + offset + PCI_DEVICE_ID) != sizeof(uint16_t)) { + RTE_LOG(DEBUG, EAL, "Cannot read DeviceID from PCI config space\n"); + return -1; + } + + /* subsystem_vendor_id */ + if (pread64(vfio_dev_fd, &pci_id->subsystem_vendor_id, sizeof(uint16_t), + offset + PCI_SUBSYSTEM_VENDOR_ID) != sizeof(uint16_t)) { + RTE_LOG(DEBUG, EAL, "Cannot read SubVendorID from PCI config space\n"); + return -1; + } + + /* subsystem_device_id */ + if (pread64(vfio_dev_fd, &pci_id->subsystem_device_id, sizeof(uint16_t), + offset + PCI_SUBSYSTEM_ID) != sizeof(uint16_t)) { + RTE_LOG(DEBUG, EAL, "Cannot read SubDeviceID from PCI config space\n"); + return -1; + } + + /* class_id */ + if 
(pread64(vfio_dev_fd, &class, sizeof(uint32_t), + offset + PCI_CLASS_REVISION) != sizeof(uint32_t)) { + RTE_LOG(DEBUG, EAL, "Cannot read ClassID from PCI config space\n"); + return -1; + } + pci_id->class_id = class >> 8; + + return 0; +} + #endif diff --git a/drivers/bus/pci/linux/pci_vfio_mdev.c b/drivers/bus/pci/linux/pci_vfio_mdev.c new file mode 100644 index 0000000000..ef25749a0d --- /dev/null +++ b/drivers/bus/pci/linux/pci_vfio_mdev.c @@ -0,0 +1,277 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2021 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_filesystem.h" + +#include "private.h" +#include "pci_init.h" + +#ifdef VFIO_PRESENT + +extern struct rte_pci_bus rte_pci_bus; + +#define SYSFS_MDEV_DEVICES "/sys/bus/mdev/devices" + +const char *pci_mdev_get_sysfs_path(void) +{ + const char *path = NULL; + + path = getenv("SYSFS_MDEV_DEVICES"); + if (path == NULL) + return SYSFS_MDEV_DEVICES; + + return path; +} + +static int +is_pci_device(const char *dirname) +{ + char device_api[PATH_MAX]; + char filename[PATH_MAX]; + char *ptr; + + /* get device_api */ + snprintf(filename, sizeof(filename), "%s/mdev_type/device_api", + dirname); + + if (rte_eal_parse_sysfs_str(filename, device_api, + sizeof(device_api)) < 0) { + return -1; + } + + ptr = strchr(device_api, '\n'); + if (ptr != NULL) + *ptr = '\0'; + + return strcmp(device_api, "vfio-pci") == 0; +} + +static int +pci_scan_one_mdev(const char *dirname, const rte_uuid_t addr) +{ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + char name[RTE_UUID_STRLEN]; + char filename[PATH_MAX]; + char path[PATH_MAX]; + char driver[PATH_MAX]; + char *ptr; + struct rte_pci_device_internal *pdev; + struct rte_pci_device *dev; + bool need_release = false; + const char *sysfs_base; + unsigned long tmp; + int vfio_dev_fd; + int ret; + + 
sysfs_base = pci_mdev_get_sysfs_path(); + + pdev = malloc(sizeof(*pdev)); + if (pdev == NULL) + return -1; + + memset(pdev, 0, sizeof(*pdev)); + + dev = &pdev->device; + dev->device.bus = &rte_pci_bus.bus; + rte_uuid_unparse(addr, name, sizeof(name)); + + /* parse driver */ + snprintf(filename, sizeof(filename), "%s/driver", dirname); + ret = pci_get_kernel_driver_by_path(filename, driver, sizeof(driver)); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s: failed to get kernel driver\n", name); + goto err; + } + + if (ret != 0 || strcmp(driver, "vfio_mdev") != 0) { + RTE_LOG(DEBUG, EAL, "%s: unsupported mdev driver\n", name); + goto err; + } + + dev->kdrv = RTE_PCI_KDRV_VFIO; + + dev->is_mdev = 1; + rte_uuid_copy(dev->uuid, addr); + + snprintf(filename, sizeof(filename), "%s/%s", sysfs_base, name); + + /* Get the path of the parent device. */ + if (realpath(filename, path) == NULL) { + RTE_LOG(DEBUG, EAL, "%s: failed to get parent device\n", name); + goto err; + } + + ptr = strrchr(path, '/'); + if (ptr == NULL) { + RTE_LOG(DEBUG, EAL, "%s: failed to parse parent device\n", + name); + goto err; + } + *ptr = '\0'; + + /* get numa node, default to 0 if not present */ + snprintf(filename, sizeof(filename), "%s/numa_node", path); + + if (access(filename, F_OK) != -1) { + if (eal_parse_sysfs_value(filename, &tmp) == 0) + dev->device.numa_node = tmp; + else + dev->device.numa_node = -1; + } else { + dev->device.numa_node = 0; + } + + pci_name_set(dev); + + if (rte_vfio_setup_device(sysfs_base, name, &vfio_dev_fd, + &device_info) != 0) { + RTE_LOG(DEBUG, EAL, "%s: failed to setup device\n", name); + goto err; + } + + need_release = true; + + if (pci_vfio_fill_regions(dev, vfio_dev_fd, &device_info) != 0) { + RTE_LOG(DEBUG, EAL, "%s: failed to get regions\n", name); + goto err; + } + + if (pci_vfio_get_pci_id(dev, vfio_dev_fd, &dev->id) != 0) { + RTE_LOG(DEBUG, EAL, "%s: failed to access the device\n", name); + goto err; + } + + /* device is valid, add to the list (sorted) */ + 
if (TAILQ_EMPTY(&rte_pci_bus.device_list)) { + rte_pci_add_device(dev); + } else { + struct rte_pci_device *dev2; + int ret; + + TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) { + /* + * Insert mediated PCI devices after all physical + * PCI devices. + */ + if (!dev2->is_mdev) + continue; + ret = rte_uuid_compare(dev->uuid, dev2->uuid); + if (ret > 0) + continue; + if (ret < 0) + rte_pci_insert_device(dev2, dev); + else {/* already registered */ + if (!rte_dev_is_probed(&dev2->device)) { + dev2->kdrv = dev->kdrv; + dev2->max_vfs = dev->max_vfs; + pci_name_set(dev2); + memmove(dev2->mem_resource, + dev->mem_resource, + sizeof(dev->mem_resource)); + } else { + /** + * If device is plugged and driver is + * probed already, (This happens when + * we call rte_dev_probe which will + * scan all device on the bus) we don't + * need to do anything here unless... + **/ + if (dev2->kdrv != dev->kdrv || + dev2->max_vfs != dev->max_vfs || + memcmp(&dev2->id, &dev->id, + sizeof(dev2->id))) + /* + * This should not happen. + * But it is still possible if + * we unbind a device from + * vfio or uio before hotplug + * remove and rebind it with + * a different configure. + * So we just print out the + * error as an alarm. 
+ */ + RTE_LOG(ERR, EAL, "Unexpected device scan at %s!\n", + filename); + else if (dev2->device.devargs != + dev->device.devargs) { + rte_devargs_remove(dev2->device.devargs); + pci_name_set(dev2); + } + } + free(pdev); + } + return 0; + } + + rte_pci_add_device(dev); + } + + return 0; + +err: + if (need_release) + rte_vfio_release_device(sysfs_base, name, vfio_dev_fd); + free(pdev); + return 1; +} + +int +pci_scan_mdev(void) +{ + struct dirent *e; + DIR *dir; + char dirname[PATH_MAX]; + rte_uuid_t addr; + + dir = opendir(pci_mdev_get_sysfs_path()); + if (dir == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): opendir failed: %s\n", + __func__, strerror(errno)); + return 0; + } + + while ((e = readdir(dir)) != NULL) { + if (e->d_name[0] == '.') + continue; + + if (rte_uuid_parse(e->d_name, addr) != 0) + continue; + + if (rte_mdev_ignore_device(addr)) + continue; + + snprintf(dirname, sizeof(dirname), "%s/%s", + pci_mdev_get_sysfs_path(), e->d_name); + + if (!is_pci_device(dirname)) + continue; + + if (pci_scan_one_mdev(dirname, addr) < 0) + goto error; + } + closedir(dir); + return 0; + +error: + closedir(dir); + return -1; +} + +#endif /* VFIO_PRESENT */ diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build index 81c7e94c00..fb7a9a1fa8 100644 --- a/drivers/bus/pci/meson.build +++ b/drivers/bus/pci/meson.build @@ -11,6 +11,7 @@ if is_linux 'linux/pci.c', 'linux/pci_uio.c', 'linux/pci_vfio.c', + 'linux/pci_vfio_mdev.c', ) includes += include_directories('linux') endif diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c index 1c368c254c..1984dbdba0 100644 --- a/drivers/bus/pci/pci_common.c +++ b/drivers/bus/pci/pci_common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "private.h" @@ -57,15 +58,34 @@ pci_devargs_lookup(const struct rte_pci_addr *pci_addr) return NULL; } +static struct rte_devargs * +mdev_devargs_lookup(const rte_uuid_t mdev_addr) +{ + struct rte_devargs *devargs; + rte_uuid_t id; + + 
RTE_EAL_DEVARGS_FOREACH("pci", devargs) { + devargs->bus->parse(devargs->name, &id); + if (!rte_uuid_compare(mdev_addr, id)) + return devargs; + } + return NULL; +} + void pci_name_set(struct rte_pci_device *dev) { struct rte_devargs *devargs; /* Each device has its internal, canonical name set. */ - rte_pci_device_name(&dev->addr, - dev->name, sizeof(dev->name)); - devargs = pci_devargs_lookup(&dev->addr); + if (dev->is_mdev) { + rte_uuid_unparse(dev->uuid, dev->name, sizeof(dev->name)); + devargs = mdev_devargs_lookup(dev->uuid); + } else { + rte_pci_device_name(&dev->addr, dev->name, sizeof(dev->name)); + devargs = pci_devargs_lookup(&dev->addr); + } + dev->device.devargs = devargs; /* When using a blocklist, only blocked devices will have @@ -166,21 +186,17 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr, { int ret; bool already_probed; - struct rte_pci_addr *loc; if ((dr == NULL) || (dev == NULL)) return -EINVAL; - loc = &dev->addr; - /* The device is not blocked; Check if driver supports it */ if (!rte_pci_match(dr, dev)) /* Match of device and driver failed */ return 1; - RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", - loc->domain, loc->bus, loc->devid, loc->function, - dev->device.numa_node); + RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n", + dev->name, dev->device.numa_node); /* no initialization when marked as blocked, return without error */ if (dev->device.devargs != NULL && @@ -235,10 +251,9 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr, } } - RTE_LOG(INFO, EAL, "Probe PCI driver: %s (%x:%x) device: "PCI_PRI_FMT" (socket %i)\n", + RTE_LOG(INFO, EAL, "Probe PCI driver: %s (%x:%x) device: %s (socket %i)\n", dr->driver.name, dev->id.vendor_id, dev->id.device_id, - loc->domain, loc->bus, loc->devid, loc->function, - dev->device.numa_node); + dev->name, dev->device.numa_node); /* call the driver probe() function */ ret = dr->probe(dr, dev); if (already_probed) @@ -266,7 +281,6 @@ rte_pci_probe_one_driver(struct 
rte_pci_driver *dr, static int rte_pci_detach_dev(struct rte_pci_device *dev) { - struct rte_pci_addr *loc; struct rte_pci_driver *dr; int ret = 0; @@ -274,11 +288,9 @@ rte_pci_detach_dev(struct rte_pci_device *dev) return -EINVAL; dr = dev->driver; - loc = &dev->addr; - RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n", - loc->domain, loc->bus, loc->devid, - loc->function, dev->device.numa_node); + RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n", + dev->name, dev->device.numa_node); RTE_LOG(DEBUG, EAL, " remove driver: %x:%x %s\n", dev->id.vendor_id, dev->id.device_id, dr->driver.name); @@ -345,10 +357,9 @@ pci_probe(void) ret = pci_probe_all_drivers(dev); if (ret < 0) { if (ret != -EEXIST) { - RTE_LOG(ERR, EAL, "Requested device " - PCI_PRI_FMT " cannot be used\n", - dev->addr.domain, dev->addr.bus, - dev->addr.devid, dev->addr.function); + RTE_LOG(ERR, EAL, + "Requested device %s cannot be used\n", + dev->name); rte_errno = errno; failed++; } @@ -395,11 +406,20 @@ pci_parse(const char *name, void *addr) { struct rte_pci_addr *out = addr; struct rte_pci_addr pci_addr; + rte_uuid_t mdev_addr; bool parse; parse = (rte_pci_addr_parse(name, &pci_addr) == 0); if (parse && addr != NULL) *out = pci_addr; + + if (parse) + return 0; + + parse = (rte_uuid_parse(name, mdev_addr) == 0); + if (parse && addr != NULL) + memcpy(addr, &mdev_addr, sizeof(mdev_addr)); + return parse == false; } @@ -622,11 +642,9 @@ pci_dma_unmap(struct rte_device *dev, void *addr, uint64_t iova, size_t len) return -1; } -bool -rte_pci_ignore_device(const struct rte_pci_addr *pci_addr) +static bool +devargs_ignore_device(struct rte_devargs *devargs) { - struct rte_devargs *devargs = pci_devargs_lookup(pci_addr); - switch (rte_pci_bus.bus.conf.scan_mode) { case RTE_BUS_SCAN_ALLOWLIST: if (devargs && devargs->policy == RTE_DEV_ALLOWED) @@ -641,6 +659,22 @@ rte_pci_ignore_device(const struct rte_pci_addr *pci_addr) return true; } +bool +rte_pci_ignore_device(const struct 
rte_pci_addr *pci_addr) +{ + struct rte_devargs *devargs = pci_devargs_lookup(pci_addr); + + return devargs_ignore_device(devargs); +} + +bool +rte_mdev_ignore_device(const rte_uuid_t mdev_addr) +{ + struct rte_devargs *devargs = mdev_devargs_lookup(mdev_addr); + + return devargs_ignore_device(devargs); +} + enum rte_iova_mode rte_pci_get_iommu_class(void) { diff --git a/drivers/bus/pci/pci_params.c b/drivers/bus/pci/pci_params.c index 3192e9c967..231e57213e 100644 --- a/drivers/bus/pci/pci_params.c +++ b/drivers/bus/pci/pci_params.c @@ -2,12 +2,15 @@ * Copyright 2018 Gaëtan Rivet */ +#include + #include #include #include #include #include #include +#include #include "private.h" @@ -35,6 +38,19 @@ pci_addr_kv_cmp(const char *key __rte_unused, return -abs(rte_pci_addr_cmp(addr1, addr2)); } +static int +mdev_addr_kv_cmp(const char *key __rte_unused, + const char *value, + void *_addr2) +{ + rte_uuid_t addr1; + unsigned char *addr2 = _addr2; + + if (rte_uuid_parse(value, addr1)) + return -1; + return -abs(rte_uuid_compare(addr1, addr2)); +} + static int pci_dev_match(const struct rte_device *dev, const void *_kvlist) @@ -47,11 +63,21 @@ pci_dev_match(const struct rte_device *dev, return 0; pdev = RTE_DEV_TO_PCI_CONST(dev); /* if any field does not match. 
*/ - if (rte_kvargs_process(kvlist, pci_params_keys[RTE_PCI_PARAM_ADDR], - &pci_addr_kv_cmp, - (void *)(intptr_t)&pdev->addr)) - return 1; - return 0; + if (!pdev->is_mdev) { + if (rte_kvargs_process(kvlist, + pci_params_keys[RTE_PCI_PARAM_ADDR], &pci_addr_kv_cmp, + (void *)(intptr_t)&pdev->addr)) + return 1; + else + return 0; + } else { + if (rte_kvargs_process(kvlist, + pci_params_keys[RTE_PCI_PARAM_ADDR], &mdev_addr_kv_cmp, + (void *)(intptr_t)&pdev->uuid)) + return 1; + else + return 0; + } } void * diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h index 8b5fa70641..3515c086aa 100644 --- a/drivers/bus/pci/private.h +++ b/drivers/bus/pci/private.h @@ -64,6 +64,18 @@ pci_name_set(struct rte_pci_device *dev); */ bool rte_pci_ignore_device(const struct rte_pci_addr *pci_addr); +/** + * Validate whether a mediated PCI device with given uuid should be + * ignored or not. + * + * @param mdev_addr + * MDEV address of device to be validated + * @return + * true: if device is to be ignored, + * false: if device is to be scanned, + */ +bool rte_mdev_ignore_device(const rte_uuid_t mdev_addr); + /** * Add a PCI device to the PCI Bus (append to PCI Device list). 
This function * also updates the bus references of the PCI Device (and the generic device @@ -114,6 +126,11 @@ struct pci_msix_table { struct mapped_pci_resource { TAILQ_ENTRY(mapped_pci_resource) next; + union { + struct rte_pci_addr addr; + rte_uuid_t uuid; + }; + uint8_t is_mdev; struct rte_pci_addr pci_addr; char path[PATH_MAX]; int nb_maps; diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h index dc26811b0a..fb7d934bd0 100644 --- a/drivers/bus/pci/rte_bus_pci.h +++ b/drivers/bus/pci/rte_bus_pci.h @@ -51,6 +51,15 @@ TAILQ_HEAD(rte_pci_driver_list, rte_pci_driver); struct rte_devargs; +/* + * NOTE: we can't include rte_uuid.h directly due to the conflicts + * introduced by stdbool.h + */ +typedef unsigned char rte_uuid_t[16]; + +/* It's RTE_UUID_STRLEN, which is bigger than PCI_PRI_STR_SIZE. */ +#define RTE_PCI_NAME_LEN (36 + 1) + enum rte_pci_kernel_driver { RTE_PCI_KDRV_UNKNOWN = 0, /* may be misc UIO or bifurcated driver */ RTE_PCI_KDRV_IGB_UIO, /* igb_uio for Linux */ @@ -67,7 +76,11 @@ enum rte_pci_kernel_driver { struct rte_pci_device { TAILQ_ENTRY(rte_pci_device) next; /**< Next probed PCI device. */ struct rte_device device; /**< Inherit core device */ - struct rte_pci_addr addr; /**< PCI location. */ + union { + struct rte_pci_addr addr; /**< PCI location. */ + rte_uuid_t uuid; /**< Mdev location. */ + }; + uint8_t is_mdev; /**< True for mediated PCI device */ struct rte_pci_id id; /**< PCI ID. 
*/ struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE]; /**< PCI Memory Resource */ @@ -75,7 +88,7 @@ struct rte_pci_device { struct rte_pci_driver *driver; /**< PCI driver used in probing */ uint16_t max_vfs; /**< sriov enable if not zero */ enum rte_pci_kernel_driver kdrv; /**< Kernel driver passthrough */ - char name[PCI_PRI_STR_SIZE+1]; /**< PCI location (ASCII) */ + char name[RTE_PCI_NAME_LEN]; /**< PCI/Mdev location (ASCII) */ struct rte_intr_handle vfio_req_intr_handle; /**< Handler of VFIO request interrupt */ }; diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c index d5917a48ca..323f13107e 100644 --- a/lib/eal/linux/eal.c +++ b/lib/eal/linux/eal.c @@ -1089,6 +1089,15 @@ rte_eal_init(int argc, char **argv) return -1; } +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) { + rte_eal_init_alert("Cannot init VFIO"); + rte_errno = EAGAIN; + __atomic_store_n(&run_once, 0, __ATOMIC_RELAXED); + return -1; + } +#endif + if (rte_bus_scan()) { rte_eal_init_alert("Cannot scan the buses for devices"); rte_errno = ENODEV; @@ -1194,14 +1203,6 @@ rte_eal_init(int argc, char **argv) return -1; } -#ifdef VFIO_PRESENT - if (rte_eal_vfio_setup() < 0) { - rte_eal_init_alert("Cannot init VFIO"); - rte_errno = EAGAIN; - __atomic_store_n(&run_once, 0, __ATOMIC_RELAXED); - return -1; - } -#endif /* in secondary processes, memory init may allocate additional fbarrays * not present in primary processes, so to avoid any potential issues, * initialize memzones first. 
From patchwork Tue Jun 1 03:06:44 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Chenbo Xia X-Patchwork-Id: 93683 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 1F64BA0A0A; Tue, 1 Jun 2021 05:18:17 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id A4419410E5; Tue, 1 Jun 2021 05:18:13 +0200 (CEST) Received: from mga04.intel.com (mga04.intel.com [192.55.52.120]) by mails.dpdk.org (Postfix) with ESMTP id A3B8A410EE for ; Tue, 1 Jun 2021 05:18:11 +0200 (CEST) IronPort-SDR: nP9u7NetYl16hQLvN96+M9ximiHxyS+6vVvZFikmYj7PcyDf71xVNu51dFLAcZXTPPSj5wuizs sBimBbvppGcg== X-IronPort-AV: E=McAfee;i="6200,9189,10001"; a="201593048" X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="201593048" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga104.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 31 May 2021 20:18:11 -0700 IronPort-SDR: kNcKM3TR6UPpXjqzmAelTAkLEEO0fNA7llkCYtkn+3Id/gBK5MxaYdkCF48oEQ1XN/0XCHGbL+ GIeaAG3XxpxA== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.83,239,1616482800"; d="scan'208";a="482315504" Received: from npg-dpdk-virtio-xiachenbo-nw.sh.intel.com ([10.67.118.250]) by fmsmga002.fm.intel.com with ESMTP; 31 May 2021 20:18:08 -0700 From: Chenbo Xia To: dev@dpdk.org, thomas@monjalon.net, cunming.liang@intel.com, jingjing.wu@intel.com Cc: anatoly.burakov@intel.com, ferruh.yigit@intel.com, mdr@ashroe.eu, nhorman@tuxdriver.com, bruce.richardson@intel.com, david.marchand@redhat.com, stephen@networkplumber.org, konstantin.ananyev@intel.com Date: Tue, 1 Jun 2021 11:06:44 +0800 Message-Id: <20210601030644.3318-7-chenbo.xia@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: 
<20210601030644.3318-1-chenbo.xia@intel.com> References: <20190715075214.16616-6-tiwei.bie@intel.com> <20210601030644.3318-1-chenbo.xia@intel.com> Subject: [dpdk-dev] [RFC v3 6/6] bus/pci: add sparse mmap support for mediated PCI devices X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" This patch adds sparse mmap support to the PCI bus. Sparse mmap is a capability defined in VFIO which allows multiple mmap areas in one VFIO region. Mediated PCI devices can use this capability to let the mdev parent driver keep control over access to the non-mmappable parts of a region. Signed-off-by: Chenbo Xia --- drivers/bus/pci/linux/pci_vfio.c | 229 +++++++++++++++++++++++++++---- drivers/bus/pci/private.h | 2 + drivers/bus/pci/rte_bus_pci.h | 18 ++- 3 files changed, 218 insertions(+), 31 deletions(-) diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c index 00ba5db03a..e68eccb63f 100644 --- a/drivers/bus/pci/linux/pci_vfio.c +++ b/drivers/bus/pci/linux/pci_vfio.c @@ -654,6 +654,82 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, return 0; } +static int +pci_vfio_sparse_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res, + struct vfio_region_sparse_mmap_area *vfio_areas, + uint32_t nr_areas, int bar_index, int additional_flags, + int numa_node) +{ + struct pci_map *map = &vfio_res->maps[bar_index]; + struct rte_mem_map_area *area; + struct vfio_region_sparse_mmap_area *sparse; + void *bar_addr; + uint32_t i, j; + + map->nr_areas = nr_areas; + + if (map->size == 0) { + RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index); + return 0; + } + + if (!map->nr_areas) { + RTE_LOG(DEBUG, EAL, "Skip bar %d with no sparse mmap areas\n", + bar_index); + map->areas = NULL; + return 0; + } + + if (map->areas == NULL) { + map->areas = 
rte_zmalloc_socket(NULL, + sizeof(*map->areas) * nr_areas, + RTE_CACHE_LINE_SIZE, numa_node); + if (map->areas == NULL) { + RTE_LOG(ERR, EAL, + "Cannot alloc memory for sparse map areas\n"); + return -1; + } + } + + for (i = 0; i < map->nr_areas; i++) { + area = &map->areas[i]; + sparse = &vfio_areas[i]; + + bar_addr = mmap(map->addr, sparse->size, 0, MAP_PRIVATE | + MAP_ANONYMOUS | additional_flags, -1, 0); + if (bar_addr != MAP_FAILED) { + area->addr = pci_map_resource(bar_addr, vfio_dev_fd, + map->offset + sparse->offset, sparse->size, + RTE_MAP_FORCE_ADDRESS); + if (area->addr == NULL) { + munmap(bar_addr, sparse->size); + RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n", + bar_index); + goto err_map; + } + + area->offset = sparse->offset; + area->size = sparse->size; + } else { + RTE_LOG(ERR, EAL, "Failed to create inaccessible mapping for BAR%d\n", + bar_index); + goto err_map; + } + } + + return 0; + +err_map: + for (j = 0; j < i; j++) { + pci_unmap_resource(map->areas[j].addr, map->areas[j].size); + map->areas[j].offset = 0; + map->areas[j].size = 0; + } + rte_free(map->areas); + map->nr_areas = 0; + return -1; +} + /* * region info may contain capability headers, so we need to keep reallocating * the memory until we match allocated memory size with argsz. @@ -770,6 +846,31 @@ pci_vfio_fill_regions(struct rte_pci_device *dev, int vfio_dev_fd, return 0; } +static void +clean_up_pci_resource(struct mapped_pci_resource *vfio_res) +{ + struct pci_map *map; + uint32_t i, j; + + for (i = 0; i < PCI_MAX_RESOURCE; i++) { + map = &vfio_res->maps[i]; + if (map->nr_areas > 1) { + for (j = 0; j < map->nr_areas; j++) + pci_unmap_resource(map->areas[j].addr, + map->areas[j].size); + } else { + /* + * We do not need to be aware of MSI-X BAR mappings. + * Using current maps array is enough. 
+ */ + if (map->addr) + pci_unmap_resource(map->addr, map->size); + } + } + + rte_free(map->areas); +} + static int pci_vfio_map_resource_primary(struct rte_pci_device *dev) { @@ -866,6 +967,8 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) for (i = 0; i < vfio_res->nb_maps; i++) { void *bar_addr; + struct vfio_info_cap_header *hdr; + struct vfio_region_info_cap_sparse_mmap *sparse; ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i); if (ret < 0) { @@ -911,15 +1014,59 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) maps[i].size = reg->size; maps[i].path = NULL; /* vfio doesn't have per-resource paths */ - ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); - if (ret < 0) { - RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", - pci_addr, i, strerror(errno)); - free(reg); - goto err_vfio_res; - } + hdr = pci_vfio_info_cap(reg, VFIO_REGION_INFO_CAP_SPARSE_MMAP); + + if (dev->is_mdev && hdr != NULL) { + sparse = container_of(hdr, + struct vfio_region_info_cap_sparse_mmap, + header); + + ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, + sparse->areas, sparse->nr_areas, i, 0, + dev->device.numa_node); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + free(reg); + goto err_vfio_res; + } - dev->mem_resource[i].addr = maps[i].addr; + dev->sparse_mem[i].size = reg->size; + dev->sparse_mem[i].nr_maps = vfio_res->maps[i].nr_areas; + dev->sparse_mem[i].areas = vfio_res->maps[i].areas; + } else { + ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + free(reg); + goto err_vfio_res; + } + + if (dev->is_mdev) { + struct pci_map *mdev_map = &maps[i]; + mdev_map->nr_areas = 1; + mdev_map->areas = rte_zmalloc_socket(NULL, + sizeof(*mdev_map->areas), + RTE_CACHE_LINE_SIZE, + dev->device.numa_node); + if (maps[i].areas == NULL) { + RTE_LOG(ERR, EAL, + "Cannot allocate memory for sparse 
map areas\n"); + goto err_vfio_res; + } + mdev_map->areas[0].addr = maps[i].addr; + mdev_map->areas[0].offset = 0; + mdev_map->areas[0].size = reg->size; + dev->sparse_mem[i].size = reg->size; + dev->sparse_mem[i].nr_maps = 1; + dev->sparse_mem[i].areas = mdev_map->areas; + } else { + maps[i].nr_areas = 0; + maps[i].areas = NULL; + dev->mem_resource[i].addr = maps[i].addr; + } + } free(reg); } @@ -940,6 +1087,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev) return 0; err_vfio_res: + clean_up_pci_resource(vfio_res); rte_free(vfio_res); err_vfio_dev_fd: rte_vfio_release_device(rte_pci_get_sysfs_path(), @@ -960,7 +1108,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); - struct pci_map *maps; + struct pci_map *maps, *cur; dev->intr_handle.fd = -1; #ifdef HAVE_VFIO_DEV_REQ_INTERFACE @@ -1012,14 +1160,49 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev) maps = vfio_res->maps; for (i = 0; i < vfio_res->nb_maps; i++) { - ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED); - if (ret < 0) { - RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", - pci_addr, i, strerror(errno)); - goto err_vfio_dev_fd; + cur = &maps[i]; + if (cur->nr_areas > 1) { + struct vfio_region_sparse_mmap_area *areas; + uint32_t i; + + areas = malloc(sizeof(*areas) * cur->nr_areas); + if (areas == NULL) { + RTE_LOG(ERR, EAL, "Failed to alloc vfio areas for %s\n", + pci_addr); + goto err_vfio_dev_fd; + } + + for (i = 0; i < cur->nr_areas; i++) { + areas[i].offset = cur->areas[i].offset; + areas[i].size = cur->areas[i].size; + } + + ret = pci_vfio_sparse_mmap_bar(vfio_dev_fd, vfio_res, + areas, cur->nr_areas, i, MAP_FIXED, + dev->device.numa_node); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s sparse mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + free(areas); + goto err_vfio_dev_fd; + } + + free(areas); + } else { + ret = 
pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, + i, MAP_FIXED); + if (ret < 0) { + RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n", + pci_addr, i, strerror(errno)); + goto err_vfio_dev_fd; + } + + if (dev->is_mdev) + cur->areas[0].addr = cur->addr; + else + dev->mem_resource[i].addr = cur->addr; } - dev->mem_resource[i].addr = maps[i].addr; } /* we need save vfio_dev_fd, so it can be used during release */ @@ -1054,8 +1237,6 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, const char *pci_addr) { struct mapped_pci_resource *vfio_res = NULL; - struct pci_map *maps; - int i; /* Get vfio_res */ TAILQ_FOREACH(vfio_res, vfio_res_list, next) { @@ -1079,19 +1260,7 @@ find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list, RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n", pci_addr); - maps = vfio_res->maps; - for (i = 0; i < vfio_res->nb_maps; i++) { - - /* - * We do not need to be aware of MSI-X table BAR mappings as - * when mapping. Just using current maps array is enough - */ - if (maps[i].addr) { - RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n", - pci_addr, maps[i].addr); - pci_unmap_resource(maps[i].addr, maps[i].size); - } - } + clean_up_pci_resource(vfio_res); return vfio_res; } diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h index 3515c086aa..8d94d8acf8 100644 --- a/drivers/bus/pci/private.h +++ b/drivers/bus/pci/private.h @@ -110,6 +110,8 @@ struct pci_map { uint64_t offset; uint64_t size; uint64_t phaddr; + uint32_t nr_areas; + struct rte_mem_map_area *areas; }; struct pci_msix_table { diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h index fb7d934bd0..ddc913f121 100644 --- a/drivers/bus/pci/rte_bus_pci.h +++ b/drivers/bus/pci/rte_bus_pci.h @@ -70,6 +70,18 @@ enum rte_pci_kernel_driver { RTE_PCI_KDRV_NET_UIO, /* NetUIO for Windows */ }; +struct rte_mem_map_area { + void *addr; + uint64_t offset; + uint64_t size; +}; + +struct rte_sparse_mem_map { + 
uint64_t size; + uint32_t nr_maps; + struct rte_mem_map_area *areas; +}; + /** * A structure describing a PCI device. */ @@ -82,8 +94,12 @@ struct rte_pci_device { }; uint8_t is_mdev; /**< True for mediated PCI device */ struct rte_pci_id id; /**< PCI ID. */ - struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE]; + union { + struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE]; /**< PCI Memory Resource */ + struct rte_sparse_mem_map sparse_mem[PCI_MAX_RESOURCE]; + /**< Sparse Memory Map for Mdev */ + }; struct rte_intr_handle intr_handle; /**< Interrupt handle */ struct rte_pci_driver *driver; /**< PCI driver used in probing */ uint16_t max_vfs; /**< sriov enable if not zero */