@@ -29,6 +29,7 @@ Packet type parsing = Y
Basic stats = Y
Stats per queue = Y
FW version = Y
+Multiprocess aware = Y
Other kdrv = Y
Power8 = Y
x86-32 = Y
@@ -145,6 +145,16 @@ below.
Limitations
-----------
+- For secondary process:
+
+ - Forked secondary process not supported.
+ - All mempools must be initialized before rte_eth_dev_start().
+ - External memory unregistered in EAL memseg list cannot be used for DMA
+ unless such memory has been registered by ``mlx4_mr_update_ext_mp()`` in
+ primary process and remapped to the same virtual address in secondary
+ process. If the external memory is registered by primary process but has
+ different virtual address in secondary process, unexpected error may happen.
+
- CRC stripping is supported by default and always reported as "true".
The ability to enable/disable CRC stripping requires OFED version
4.3-1.5.0.0 and above or rdma-core version v18 and above.
@@ -18,6 +18,7 @@ ifneq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y)
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_glue.c
endif
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mp.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
@@ -93,6 +94,11 @@ mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
enum MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS \
$(AUTOCONF_OUTPUT)
$Q sh -- '$<' '$@' \
+ HAVE_IBV_MLX4_UAR_MMAP_OFFSET \
+ infiniband/mlx4dv.h \
+ enum MLX4DV_QP_MASK_UAR_MMAP_OFFSET \
+ $(AUTOCONF_OUTPUT)
+ $Q sh -- '$<' '$@' \
HAVE_IBV_MLX4_WQE_LSO_SEG \
infiniband/mlx4dv.h \
type 'struct mlx4_wqe_lso_seg' \
@@ -33,6 +33,7 @@ if build
'mlx4_ethdev.c',
'mlx4_flow.c',
'mlx4_intr.c',
+ 'mlx4_mp.c',
'mlx4_mr.c',
'mlx4_rxq.c',
'mlx4_rxtx.c',
@@ -76,6 +77,8 @@ if build
has_sym_args = [
[ 'HAVE_IBV_MLX4_BUF_ALLOCATORS', 'infiniband/mlx4dv.h',
'MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS' ],
+ [ 'HAVE_IBV_MLX4_UAR_MMAP_OFFSET', 'infiniband/mlx4dv.h',
+ 'MLX4DV_QP_MASK_UAR_MMAP_OFFSET' ],
]
config = configuration_data()
foreach arg:has_sym_args
@@ -17,6 +17,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/mman.h>
#include <unistd.h>
/* Verbs headers do not support -pedantic. */
@@ -48,10 +49,21 @@
#include "mlx4_rxtx.h"
#include "mlx4_utils.h"
-struct mlx4_dev_list mlx4_mem_event_cb_list =
- LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
+#if defined(HAVE_IBV_MLX4_UAR_MMAP_OFFSET) && \
+ defined(HAVE_IBV_MLX4_BUF_ALLOCATORS)
+#define HAVE_IBV_MLX4_SECONDARY_PROCESS
+#endif
+
+static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data";
+
+/* Shared memory between primary and secondary processes. */
+struct mlx4_shared_data *mlx4_shared_data;
-rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+/* Spinlock for mlx4_shared_data allocation. */
+static rte_spinlock_t mlx4_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Process local data for secondary processes. */
+static struct mlx4_local_data mlx4_local_data;
/** Configuration structure for device arguments. */
struct mlx4_conf {
@@ -69,6 +81,77 @@ const char *pmd_mlx4_init_params[] = {
static void mlx4_dev_stop(struct rte_eth_dev *dev);
+/**
+ * Initialize shared data between primary and secondary process.
+ *
+ * A memzone is reserved by primary process and secondary processes attach to
+ * the memzone.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_init_shared_data(void)
+{
+ const struct rte_memzone *mz;
+ int ret = 0;
+
+ rte_spinlock_lock(&mlx4_shared_data_lock);
+ if (mlx4_shared_data == NULL) {
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* Allocate shared memory. */
+ mz = rte_memzone_reserve(MZ_MLX4_PMD_SHARED_DATA,
+ sizeof(*mlx4_shared_data),
+ SOCKET_ID_ANY, 0);
+ if (mz == NULL) {
+ ERROR("Cannot allocate mlx4 shared data\n");
+ ret = -rte_errno;
+ goto error;
+ }
+ mlx4_shared_data = mz->addr;
+ memset(mlx4_shared_data, 0, sizeof(*mlx4_shared_data));
+ rte_spinlock_init(&mlx4_shared_data->lock);
+ } else {
+ /* Lookup allocated shared memory. */
+ mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
+ if (mz == NULL) {
+ ERROR("Cannot attach mlx4 shared data\n");
+ ret = -rte_errno;
+ goto error;
+ }
+ mlx4_shared_data = mz->addr;
+ memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
+ }
+ }
+error:
+ rte_spinlock_unlock(&mlx4_shared_data_lock);
+ return ret;
+}
+
+/**
+ * Uninitialize shared data between primary and secondary process.
+ *
+ * The pointer of secondary process is dereferenced and primary process frees
+ * the memzone.
+ */
+static void
+mlx4_uninit_shared_data(void)
+{
+ const struct rte_memzone *mz;
+
+ rte_spinlock_lock(&mlx4_shared_data_lock);
+ if (mlx4_shared_data) {
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
+ rte_memzone_free(mz);
+ } else {
+ memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
+ }
+ mlx4_shared_data = NULL;
+ }
+ rte_spinlock_unlock(&mlx4_shared_data_lock);
+}
+
#ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS
/**
* Verbs callback to allocate a memory. This function should allocate the space
@@ -181,6 +264,11 @@ mlx4_dev_start(struct rte_eth_dev *dev)
return 0;
DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
priv->started = 1;
+ ret = mlx4_tx_uar_remap(dev, priv->ctx->cmd_fd);
+ if (ret) {
+ ERROR("%p: cannot remap UAR", (void *)dev);
+ goto err;
+ }
ret = mlx4_rss_init(priv);
if (ret) {
ERROR("%p: cannot initialize RSS resources: %s",
@@ -208,6 +296,8 @@ mlx4_dev_start(struct rte_eth_dev *dev)
rte_wmb();
dev->tx_pkt_burst = mlx4_tx_burst;
dev->rx_pkt_burst = mlx4_rx_burst;
+ /* Enable datapath on secondary process. */
+ mlx4_mp_req_start_rxtx(dev);
return 0;
err:
mlx4_dev_stop(dev);
@@ -226,6 +316,8 @@ static void
mlx4_dev_stop(struct rte_eth_dev *dev)
{
struct mlx4_priv *priv = dev->data->dev_private;
+ const size_t page_size = sysconf(_SC_PAGESIZE);
+ int i;
if (!priv->started)
return;
@@ -234,9 +326,20 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
dev->tx_pkt_burst = mlx4_tx_burst_removed;
dev->rx_pkt_burst = mlx4_rx_burst_removed;
rte_wmb();
+ /* Disable datapath on secondary process. */
+ mlx4_mp_req_stop_rxtx(dev);
mlx4_flow_sync(priv, NULL);
mlx4_rxq_intr_disable(priv);
mlx4_rss_deinit(priv);
+ for (i = 0; i != dev->data->nb_tx_queues; ++i) {
+ struct txq *txq;
+
+ txq = dev->data->tx_queues[i];
+ if (!txq)
+ continue;
+ munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->msq.db,
+ page_size), page_size);
+ }
}
/**
@@ -259,6 +362,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
dev->rx_pkt_burst = mlx4_rx_burst_removed;
dev->tx_pkt_burst = mlx4_tx_burst_removed;
rte_wmb();
+ /* Disable datapath on secondary process. */
+ mlx4_mp_req_stop_rxtx(dev);
mlx4_flow_clean(priv);
mlx4_rss_deinit(priv);
for (i = 0; i != dev->data->nb_rx_queues; ++i)
@@ -310,6 +415,16 @@ static const struct eth_dev_ops mlx4_dev_ops = {
.is_removed = mlx4_is_removed,
};
+#ifdef HAVE_IBV_MLX4_SECONDARY_PROCESS
+/* Available operations from secondary process. */
+static const struct eth_dev_ops mlx4_dev_sec_ops = {
+ .stats_get = mlx4_stats_get,
+ .stats_reset = mlx4_stats_reset,
+ .fw_version_get = mlx4_fw_version_get,
+ .dev_infos_get = mlx4_dev_infos_get,
+};
+#endif
+
/**
* Get PCI information from struct ibv_device.
*
@@ -549,6 +664,200 @@ mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
static struct rte_pci_driver mlx4_driver;
+static int
+find_lower_va_bound(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg)
+{
+ void **addr = arg;
+
+ if (msl->external)
+ return 0;
+ if (*addr == NULL)
+ *addr = ms->addr;
+ else
+ *addr = RTE_MIN(*addr, ms->addr);
+
+ return 0;
+}
+
+/**
+ * Reserve UAR address space for primary process.
+ *
+ * Process local resource is used by both primary and secondary to avoid
+ * duplicate reservation. The space has to be available on both primary and
+ * secondary process, TXQ UAR maps to this area using fixed mmap w/o double
+ * check.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_uar_init_primary(void)
+{
+ struct mlx4_shared_data *sd = mlx4_shared_data;
+ void *addr = (void *)0;
+
+ if (sd->uar_base)
+ return 0;
+ /* find out lower bound of hugepage segments */
+ rte_memseg_walk(find_lower_va_bound, &addr);
+ /* keep distance to hugepages to minimize potential conflicts. */
+ addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX4_UAR_OFFSET + MLX4_UAR_SIZE));
+ /* anonymous mmap, no real memory consumption. */
+ addr = mmap(addr, MLX4_UAR_SIZE,
+ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (addr == MAP_FAILED) {
+ ERROR("failed to reserve UAR address space, please"
+ " adjust MLX4_UAR_SIZE or try --base-virtaddr");
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ /* Accept either same addr or a new addr returned from mmap if target
+ * range occupied.
+ */
+ INFO("reserved UAR address space: %p", addr);
+ sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
+ return 0;
+}
+
+/**
+ * Unmap UAR address space reserved for primary process.
+ */
+static void
+mlx4_uar_uninit_primary(void)
+{
+ struct mlx4_shared_data *sd = mlx4_shared_data;
+
+ if (!sd->uar_base)
+ return;
+ munmap(sd->uar_base, MLX4_UAR_SIZE);
+ sd->uar_base = NULL;
+}
+
+/**
+ * Reserve UAR address space for secondary process, align with primary process.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_uar_init_secondary(void)
+{
+ struct mlx4_shared_data *sd = mlx4_shared_data;
+ struct mlx4_local_data *ld = &mlx4_local_data;
+ void *addr;
+
+ if (ld->uar_base) { /* Already reserved. */
+ assert(sd->uar_base == ld->uar_base);
+ return 0;
+ }
+ assert(sd->uar_base);
+ /* anonymous mmap, no real memory consumption. */
+ addr = mmap(sd->uar_base, MLX4_UAR_SIZE,
+ PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (addr == MAP_FAILED) {
+ ERROR("UAR mmap failed: %p size: %llu",
+ sd->uar_base, MLX4_UAR_SIZE);
+ rte_errno = ENXIO;
+ return -rte_errno;
+ }
+ if (sd->uar_base != addr) {
+ ERROR("UAR address %p size %llu occupied, please"
+ " adjust MLX4_UAR_OFFSET or try EAL parameter"
+ " --base-virtaddr",
+ sd->uar_base, MLX4_UAR_SIZE);
+ rte_errno = ENXIO;
+ return -rte_errno;
+ }
+ ld->uar_base = addr;
+ INFO("reserved UAR address space: %p", addr);
+ return 0;
+}
+
+/**
+ * Unmap UAR address space reserved for secondary process.
+ */
+static void
+mlx4_uar_uninit_secondary(void)
+{
+ struct mlx4_local_data *ld = &mlx4_local_data;
+
+ if (!ld->uar_base)
+ return;
+ munmap(ld->uar_base, MLX4_UAR_SIZE);
+ ld->uar_base = NULL;
+}
+
+/**
+ * PMD global initialization.
+ *
+ * Independent from individual device, this function initializes global
+ * per-PMD data structures distinguishing primary and secondary processes.
+ * Hence, each initialization is called once per a process.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_init_once(void)
+{
+ struct mlx4_shared_data *sd;
+ struct mlx4_local_data *ld = &mlx4_local_data;
+ int ret;
+
+ if (mlx4_init_shared_data())
+ return -rte_errno;
+ sd = mlx4_shared_data;
+ assert(sd);
+ rte_spinlock_lock(&sd->lock);
+ switch (rte_eal_process_type()) {
+ case RTE_PROC_PRIMARY:
+ if (sd->init_done)
+ break;
+ LIST_INIT(&sd->mem_event_cb_list);
+ rte_rwlock_init(&sd->mem_event_rwlock);
+ rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
+ mlx4_mr_mem_event_cb, NULL);
+ mlx4_mp_init_primary();
+ ret = mlx4_uar_init_primary();
+ if (ret)
+ goto error;
+ sd->init_done = true;
+ break;
+ case RTE_PROC_SECONDARY:
+ if (ld->init_done)
+ break;
+ mlx4_mp_init_secondary();
+ ret = mlx4_uar_init_secondary();
+ if (ret)
+ goto error;
+ ++sd->secondary_cnt;
+ ld->init_done = true;
+ break;
+ default:
+ break;
+ }
+ rte_spinlock_unlock(&sd->lock);
+ return 0;
+error:
+ switch (rte_eal_process_type()) {
+ case RTE_PROC_PRIMARY:
+ mlx4_uar_uninit_primary();
+ mlx4_mp_uninit_primary();
+ rte_mem_event_callback_unregister("MLX4_MEM_EVENT_CB", NULL);
+ break;
+ case RTE_PROC_SECONDARY:
+ mlx4_uar_uninit_secondary();
+ mlx4_mp_uninit_secondary();
+ break;
+ default:
+ break;
+ }
+ rte_spinlock_unlock(&sd->lock);
+ mlx4_uninit_shared_data();
+ return -rte_errno;
+}
+
/**
* DPDK callback to register a PCI device.
*
@@ -579,6 +888,12 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
int i;
(void)pci_drv;
+ err = mlx4_init_once();
+ if (err) {
+ ERROR("unable to init PMD global data: %s",
+ strerror(rte_errno));
+ return -rte_errno;
+ }
assert(pci_drv == &mlx4_driver);
list = mlx4_glue->get_device_list(&i);
if (list == NULL) {
@@ -659,6 +974,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
struct mlx4_priv *priv = NULL;
struct rte_eth_dev *eth_dev = NULL;
struct ether_addr mac;
+ char name[RTE_ETH_NAME_MAX_LEN];
/* If port is not enabled, skip. */
if (!(conf.ports.enabled & (1 << i)))
@@ -669,6 +985,44 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
err = ENODEV;
goto port_error;
}
+ snprintf(name, sizeof(name), "%s port %u",
+ mlx4_glue->get_device_name(ibv_dev), port);
+#ifdef HAVE_IBV_MLX4_SECONDARY_PROCESS
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ eth_dev = rte_eth_dev_attach_secondary(name);
+ if (eth_dev == NULL) {
+ ERROR("can not attach rte ethdev");
+ rte_errno = ENOMEM;
+ err = rte_errno;
+ goto error;
+ }
+ eth_dev->device = &pci_dev->device;
+ eth_dev->dev_ops = &mlx4_dev_sec_ops;
+ /* Receive command fd from primary process */
+ err = mlx4_mp_req_verbs_cmd_fd(eth_dev);
+ if (err < 0) {
+ err = rte_errno;
+ goto error;
+ }
+ /* Remap UAR for Tx queues. */
+ err = mlx4_tx_uar_remap(eth_dev, err);
+ if (err) {
+ err = rte_errno;
+ goto error;
+ }
+ /*
+ * Ethdev pointer is still required as input since
+ * the primary device is not accessible from the
+ * secondary process.
+ */
+ eth_dev->tx_pkt_burst = mlx4_tx_burst;
+ eth_dev->rx_pkt_burst = mlx4_rx_burst;
+ claim_zero(mlx4_glue->close_device(ctx));
+ rte_eth_copy_pci_info(eth_dev, pci_dev);
+ rte_eth_dev_probing_finish(eth_dev);
+ continue;
+ }
+#endif
/* Check port status. */
err = mlx4_glue->query_port(ctx, port, &port_attr);
if (err) {
@@ -774,14 +1128,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
/* Get actual MTU if possible. */
mlx4_mtu_get(priv, &priv->mtu);
DEBUG("port %u MTU is %u", priv->port, priv->mtu);
- /* from rte_ethdev.c */
- {
- char name[RTE_ETH_NAME_MAX_LEN];
-
- snprintf(name, sizeof(name), "%s port %u",
- mlx4_glue->get_device_name(ibv_dev), port);
- eth_dev = rte_eth_dev_allocate(name);
- }
+ eth_dev = rte_eth_dev_allocate(name);
if (eth_dev == NULL) {
err = ENOMEM;
ERROR("can not allocate rte ethdev");
@@ -842,9 +1189,10 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
goto port_error;
}
/* Add device to memory callback list. */
- rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
- LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
- rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+ rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
+ LIST_INSERT_HEAD(&mlx4_shared_data->mem_event_cb_list,
+ priv, mem_event_cb);
+ rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
rte_eth_dev_probing_finish(eth_dev);
continue;
port_error:
@@ -1075,8 +1423,6 @@ RTE_INIT(rte_mlx4_pmd_init)
}
mlx4_glue->fork_init();
rte_pci_register(&mlx4_driver);
- rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
- mlx4_mr_mem_event_cb, NULL);
}
RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
@@ -53,6 +53,16 @@
/** Port parameter. */
#define MLX4_PMD_PORT_KVARG "port"
+/* Reserved address space for UAR mapping. */
+#define MLX4_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4))
+
+/* Offset of reserved UAR address space to hugepage memory. Offset is used here
+ * to minimize possibility of address next to hugepage being used by other code
+ * in either primary or secondary process, failing to map TX UAR would make TX
+ * packets invisible to HW.
+ */
+#define MLX4_UAR_OFFSET (2ULL << (sizeof(uintptr_t) * 4))
+
enum {
PCI_VENDOR_ID_MELLANOX = 0x15b3,
};
@@ -63,6 +73,23 @@ enum {
PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
};
+/* Request types for IPC. */
+enum mlx4_mp_req_type {
+ MLX4_MP_REQ_VERBS_CMD_FD = 1,
+ MLX4_MP_REQ_START_RXTX,
+ MLX4_MP_REQ_STOP_RXTX,
+};
+
+/* Pameters for IPC. */
+struct mlx4_mp_param {
+ enum mlx4_mp_req_type type;
+ int port_id;
+ int result;
+};
+
+/** Key string for IPC. */
+#define MLX4_MP_NAME "net_mlx4_mp"
+
/** Driver name reported to lower layers and used in log output. */
#define MLX4_DRIVER_NAME "net_mlx4"
@@ -93,6 +120,27 @@ struct mlx4_verbs_alloc_ctx {
LIST_HEAD(mlx4_dev_list, mlx4_priv);
LIST_HEAD(mlx4_mr_list, mlx4_mr);
+/* Shared data between primary and secondary processes. */
+struct mlx4_shared_data {
+ rte_spinlock_t lock;
+ /* Global spinlock for primary and secondary processes. */
+ int init_done; /* Whether primary has done initialization. */
+ unsigned int secondary_cnt; /* Number of secondary processes init'd. */
+ void *uar_base;
+ /* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+ struct mlx4_dev_list mem_event_cb_list;
+ rte_rwlock_t mem_event_rwlock;
+};
+
+/* Per-process data structure, not visible to other processes. */
+struct mlx4_local_data {
+ int init_done; /* Whether a secondary has done initialization. */
+ void *uar_base;
+ /* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+};
+
+extern struct mlx4_shared_data *mlx4_shared_data;
+
/** Private data structure. */
struct mlx4_priv {
LIST_ENTRY(mlx4_priv) mem_event_cb;
@@ -175,4 +223,13 @@ void mlx4_rxq_intr_disable(struct mlx4_priv *priv);
int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
+/* mlx4_mp.c */
+void mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev);
+void mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev);
+int mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+void mlx4_mp_init_primary(void);
+void mlx4_mp_uninit_primary(void);
+void mlx4_mp_init_secondary(void);
+void mlx4_mp_uninit_secondary(void);
+
#endif /* RTE_PMD_MLX4_H_ */
new file mode 100644
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 6WIND S.A.
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_eal.h>
+#include <rte_ethdev_driver.h>
+#include <rte_string_fns.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Initialize IPC message.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[out] msg
+ * Pointer to message to fill in.
+ * @param[in] type
+ * Message type.
+ */
+static inline void
+mp_init_msg(struct rte_eth_dev *dev, struct rte_mp_msg *msg,
+ enum mlx4_mp_req_type type)
+{
+ struct mlx4_mp_param *param = (struct mlx4_mp_param *)msg->param;
+
+ memset(msg, 0, sizeof(*msg));
+ strlcpy(msg->name, MLX4_MP_NAME, sizeof(msg->name));
+ msg->len_param = sizeof(*param);
+ param->type = type;
+ param->port_id = dev->data->port_id;
+}
+
+/**
+ * Return file descriptor for mmap to the secondary process.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] peer
+ * Pointer to the peer socket path.
+ *
+ * @return
+ * 0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_mp_msg mp_res;
+ struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
+ const struct mlx4_mp_param *param =
+ (const struct mlx4_mp_param *)mp_msg->param;
+ struct rte_eth_dev *dev = &rte_eth_devices[param->port_id];
+ struct mlx4_priv *priv = dev->data->dev_private;
+ int ret = 0;
+
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ switch (param->type) {
+ case MLX4_MP_REQ_VERBS_CMD_FD:
+ mp_init_msg(dev, &mp_res, param->type);
+ mp_res.num_fds = 1;
+ mp_res.fds[0] = priv->ctx->cmd_fd;
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ default:
+ rte_errno = EINVAL;
+ ERROR("port %u invalid mp request type", dev->data->port_id);
+ return -rte_errno;
+ }
+ return ret;
+}
+
+/**
+ * IPC message handler of a secondary process.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] peer
+ * Pointer to the peer socket path.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_mp_msg mp_res;
+ struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
+ const struct mlx4_mp_param *param =
+ (const struct mlx4_mp_param *)mp_msg->param;
+ struct rte_eth_dev *dev = &rte_eth_devices[param->port_id];
+ int ret = 0;
+
+ assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ switch (param->type) {
+ case MLX4_MP_REQ_START_RXTX:
+ INFO("port %u starting datapath", dev->data->port_id);
+ rte_mb();
+ dev->tx_pkt_burst = mlx4_tx_burst;
+ dev->rx_pkt_burst = mlx4_rx_burst;
+ mp_init_msg(dev, &mp_res, param->type);
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX4_MP_REQ_STOP_RXTX:
+ INFO("port %u stopping datapath", dev->data->port_id);
+ dev->tx_pkt_burst = mlx4_tx_burst_removed;
+ dev->rx_pkt_burst = mlx4_rx_burst_removed;
+ rte_mb();
+ mp_init_msg(dev, &mp_res, param->type);
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ default:
+ rte_errno = EINVAL;
+ ERROR("port %u invalid mp request type", dev->data->port_id);
+ return -rte_errno;
+ }
+ return ret;
+}
+
+/**
+ * Broadcast request of stopping/starting data-path to secondary processes.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] type
+ * Request type.
+ */
+static void
+mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx4_mp_req_type type)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_msg *mp_res;
+ struct rte_mp_reply mp_rep;
+ struct mlx4_mp_param *res __rte_unused;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ int ret;
+
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (!mlx4_shared_data->secondary_cnt)
+ return;
+ if (type != MLX4_MP_REQ_START_RXTX && type != MLX4_MP_REQ_STOP_RXTX) {
+ ERROR("port %u unknown request (req_type %d)",
+ dev->data->port_id, type);
+ return;
+ }
+ mp_init_msg(dev, &mp_req, type);
+ ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+ if (ret) {
+ ERROR("port %u failed to request stop/start Rx/Tx (%d)",
+ dev->data->port_id, type);
+ goto exit;
+ }
+ if (mp_rep.nb_sent != mp_rep.nb_received) {
+ ERROR("port %u not all secondaries responded (req_type %d)",
+ dev->data->port_id, type);
+ goto exit;
+ }
+ mp_res = &mp_rep.msgs[0];
+ res = (struct mlx4_mp_param *)mp_res->param;
+ assert(!res->result);
+exit:
+ free(mp_rep.msgs);
+}
+
+/**
+ * Broadcast request of starting data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ */
+void
+mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev)
+{
+ mp_req_on_rxtx(dev, MLX4_MP_REQ_START_RXTX);
+}
+
+/**
+ * Broadcast request of stopping data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ */
+void
+mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev)
+{
+ mp_req_on_rxtx(dev, MLX4_MP_REQ_STOP_RXTX);
+}
+
+/**
+ * IPC message handler of primary process.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ *
+ * @return
+ * fd on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_msg *mp_res;
+ struct rte_mp_reply mp_rep;
+ struct mlx4_mp_param *res __rte_unused;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ int cmd_fd;
+ int ret;
+
+ assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ mp_init_msg(dev, &mp_req, MLX4_MP_REQ_VERBS_CMD_FD);
+ ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+ if (ret) {
+ ERROR("port %u failed to get command FD from primary process",
+ dev->data->port_id);
+ return -rte_errno;
+ }
+ assert(mp_rep.nb_received == 1);
+ mp_res = &mp_rep.msgs[0];
+ res = (struct mlx4_mp_param *)mp_res->param;
+ assert(!res->result);
+ assert(mp_res->num_fds == 1);
+ cmd_fd = mp_res->fds[0];
+ free(mp_rep.msgs);
+ DEBUG("port %u command FD from primary is %d",
+ dev->data->port_id, cmd_fd);
+ return cmd_fd;
+}
+
+/**
+ * Initialize by primary process.
+ */
+void
+mlx4_mp_init_primary(void)
+{
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ rte_mp_action_register(MLX4_MP_NAME, mp_primary_handle);
+}
+
+/**
+ * Un-initialize by primary process.
+ */
+void
+mlx4_mp_uninit_primary(void)
+{
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ rte_mp_action_unregister(MLX4_MP_NAME);
+}
+
+/**
+ * Initialize by secondary process.
+ */
+void
+mlx4_mp_init_secondary(void)
+{
+ assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ rte_mp_action_register(MLX4_MP_NAME, mp_secondary_handle);
+}
+
+/**
+ * Un-initialize by secondary process.
+ */
+void
+mlx4_mp_uninit_secondary(void)
+{
+ assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ rte_mp_action_unregister(MLX4_MP_NAME);
+}
@@ -489,6 +489,8 @@ mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
struct mlx4_mr *mr_next;
struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
+ /* Must be called from the primary process. */
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
/*
* MR can't be freed with holding the lock because rte_free() could call
* memory free callback function. This will be a deadlock situation.
@@ -561,6 +563,14 @@ mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
DEBUG("port %u creating a MR using address (%p)",
dev->data->port_id, (void *)addr);
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ WARN("port %u using address (%p) of unregistered mempool"
+ " in secondary process, please create mempool"
+ " before rte_eth_dev_start()",
+ dev->data->port_id, (void *)addr);
+ rte_errno = EPERM;
+ goto err_nolock;
+ }
/*
* Release detached MRs if any. This can't be called with holding either
* memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
@@ -890,14 +900,17 @@ mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
size_t len, void *arg __rte_unused)
{
struct mlx4_priv *priv;
+ struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list;
+ /* Must be called from the primary process. */
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
switch (event_type) {
case RTE_MEM_EVENT_FREE:
- rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
+ rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock);
/* Iterate all the existing mlx4 devices. */
- LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb)
+ LIST_FOREACH(priv, dev_list, mem_event_cb)
mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
- rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
+ rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock);
break;
case RTE_MEM_EVENT_ALLOC:
default:
@@ -1130,6 +1143,7 @@ mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
struct mlx4_mr_cache entry;
uint32_t lkey;
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
/* If already registered, it should return. */
rte_rwlock_read_lock(&priv->mr.rwlock);
lkey = mr_lookup_dev(dev, &entry, addr);
@@ -1225,6 +1239,14 @@ mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp)
struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
struct mlx4_priv *priv = txq->priv;
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ WARN("port %u using address (%p) from unregistered mempool"
+ " having externally allocated memory"
+ " in secondary process, please create mempool"
+ " prior to rte_eth_dev_start()",
+ PORT_ID(priv), (void *)addr);
+ return UINT32_MAX;
+ }
mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
return mlx4_tx_addr2mr_bh(txq, addr);
}
@@ -1336,9 +1358,9 @@ mlx4_mr_release(struct rte_eth_dev *dev)
struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list);
/* Remove from memory callback device list. */
- rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+ rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
LIST_REMOVE(priv, mem_event_cb);
- rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+ rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
#ifndef NDEBUG
mlx4_mr_dump_dev(dev);
#endif
@@ -77,7 +77,9 @@ struct mlx4_sq {
uint32_t owner_opcode;
/**< Default owner opcode with HW valid owner bit. */
uint32_t stamp; /**< Stamp value with an invalid HW owner bit. */
- volatile uint32_t *db; /**< Pointer to the doorbell. */
+ volatile uint32_t *qp_sdb; /**< Pointer to the doorbell. */
+ volatile uint32_t *db; /**< Pointer to the doorbell remapped. */
+ off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
};
@@ -1365,6 +1365,7 @@ mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
(void)dpdk_txq;
(void)pkts;
(void)pkts_n;
+ rte_mb();
return 0;
}
@@ -1390,5 +1391,6 @@ mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
(void)dpdk_rxq;
(void)pkts;
(void)pkts_n;
+ rte_mb();
return 0;
}
@@ -152,6 +152,7 @@ uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
/* mlx4_txq.c */
+int mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd);
uint64_t mlx4_get_tx_port_offloads(struct mlx4_priv *priv);
int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
uint16_t desc, unsigned int socket,
@@ -13,7 +13,9 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
+#include <sys/mman.h>
#include <inttypes.h>
+#include <unistd.h>
/* Verbs headers do not support -pedantic. */
#ifdef PEDANTIC
@@ -38,6 +40,97 @@
#include "mlx4_utils.h"
/**
+ * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
+ * Both primary and secondary process do mmap to make UAR address
+ * aligned.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param fd
+ * Verbs file descriptor to map UAR pages.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+int
+mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd)
+{
+ unsigned int i, j;
+ const unsigned int txqs_n = dev->data->nb_tx_queues;
+ uintptr_t pages[txqs_n];
+ unsigned int pages_n = 0;
+ uintptr_t uar_va;
+ uintptr_t off;
+ void *addr;
+ void *ret;
+ struct txq *txq;
+ int already_mapped;
+ size_t page_size = sysconf(_SC_PAGESIZE);
+
+ memset(pages, 0, txqs_n * sizeof(uintptr_t));
+ /*
+ * As rdma-core, UARs are mapped in size of OS page size.
+ * Use aligned address to avoid duplicate mmap.
+ * Ref to libmlx4 function: mlx4_init_context()
+ */
+ for (i = 0; i != txqs_n; ++i) {
+ txq = dev->data->tx_queues[i];
+ if (!txq)
+ continue;
+ /* UAR addr form verbs used to find dup and offset in page. */
+ uar_va = (uintptr_t)txq->msq.qp_sdb;
+ off = uar_va & (page_size - 1); /* offset in page. */
+ uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
+ already_mapped = 0;
+ for (j = 0; j != pages_n; ++j) {
+ if (pages[j] == uar_va) {
+ already_mapped = 1;
+ break;
+ }
+ }
+ /* new address in reserved UAR address space. */
+ addr = RTE_PTR_ADD(mlx4_shared_data->uar_base,
+ uar_va & (uintptr_t)(MLX4_UAR_SIZE - 1));
+ if (!already_mapped) {
+ pages[pages_n++] = uar_va;
+ /* fixed mmap to specified address in reserved
+ * address space.
+ */
+ ret = mmap(addr, page_size,
+ PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+ txq->msq.uar_mmap_offset);
+ if (ret != addr) {
+ /* fixed mmap has to return same address. */
+ ERROR("call to mmap failed on UAR for txq %u",
+ i);
+ rte_errno = ENXIO;
+ return -rte_errno;
+ }
+ }
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once. */
+ txq->msq.db = RTE_PTR_ADD((void *)addr, off);
+ else
+ assert(txq->msq.db ==
+ RTE_PTR_ADD((void *)addr, off));
+ }
+ return 0;
+}
+#else
+int
+mlx4_tx_uar_remap(struct rte_eth_dev *dev __rte_unused, int fd __rte_unused)
+{
+ /*
+ * If rdma-core doesn't support UAR remap, secondary process is not
+ * supported, thus secondary cannot call this function but only primary
+ * makes a call. Return success to not interrupt initialization.
+ */
+ assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ return 0;
+}
+#endif
+
+/**
* Free Tx queue elements.
*
* @param txq
@@ -89,7 +182,12 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
(0u << MLX4_SQ_OWNER_BIT));
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+ sq->uar_mmap_offset = dqp->uar_mmap_offset;
+ sq->qp_sdb = dqp->sdb;
+#else
sq->db = dqp->sdb;
+#endif
sq->doorbell_qpn = dqp->doorbell_qpn;
cq->buf = dcq->buf.buf;
cq->cqe_cnt = dcq->cqe_cnt;
@@ -307,6 +405,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
goto error;
}
/* Retrieve device queue information. */
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+ dv_qp = (struct mlx4dv_qp){
+ .comp_mask = MLX4DV_QP_MASK_UAR_MMAP_OFFSET,
+ };
+#endif
mlxdv.cq.in = txq->cq;
mlxdv.cq.out = &dv_cq;
mlxdv.qp.in = txq->qp;
@@ -318,6 +421,13 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
" accessing the device queues", (void *)dev);
goto error;
}
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+ if (!(dv_qp.comp_mask & MLX4DV_QP_MASK_UAR_MMAP_OFFSET)) {
+ rte_errno = EINVAL;
+ ERROR("%p: failed to obtain UAR mmap offset", (void *)dev);
+ goto error;
+ }
+#endif
mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
/* Save first wqe pointer in the first element. */
(&(*txq->elts)[0])->wqe =