[2/2] net/mlx5: set txq affinity in round-robin

Message ID 20211020031938.3190843-3-rongweil@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Raslan Darawsheh
Headers
Series set txq affinity in round-robin |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation warning apply issues

Commit Message

Rongwei Liu Oct. 20, 2021, 3:19 a.m. UTC
  Previously, we set txq affinity to 0 and let firmware
to perform round-robin when bonding. Firmware uses a
global counter to assign txq affinity to different
physical ports accord to remainder after division.

There are three dis-advantages:
1. The global counter is shared between kernel and dpdk.
2. After restarting pmd or port, the previous counter value
is reused, so the new affinity is unpredictable.
3. There is no way to get what affinity is set by firmware.

In this update, we will create several TISs up to the
number of bonding ports and bind each TIS to one PF port.

For each port, it will start to pick up TIS using its port
index. Upper layer application can quickly calculate each txq's
affinity without querying.

At DPDK layer, when creating txq with 2 bonding ports, the
affinity is set like:
port 0: 1-->2-->1-->2
port 1: 2-->1-->2-->1
port 2: 1-->2-->1-->2

Note: Only applicable to DevX api.
This affinity subjects to HW hash.

Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
Acked-by: Matan Azrad <matan@nvidia.com>
---
 doc/guides/nics/mlx5.rst         |  4 ++
 drivers/net/mlx5/linux/mlx5_os.c |  2 +-
 drivers/net/mlx5/mlx5.c          | 81 ++++++++++++++++++++++++++++----
 drivers/net/mlx5/mlx5.h          | 10 +++-
 drivers/net/mlx5/mlx5_devx.c     | 37 ++++++++++++++-
 drivers/net/mlx5/mlx5_txpp.c     |  4 +-
 6 files changed, 124 insertions(+), 14 deletions(-)
  

Patch

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index bae73f42d8..d817caedac 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -464,6 +464,10 @@  Limitations
   - In order to achieve best insertion rate, application should manage the flows per lcore.
   - Better to disable memory reclaim by setting ``reclaim_mem_mode`` to 0 to accelerate the flow object allocation and release with cache.
 
+- HW hashed bonding
+
+  - TXQ affinity subjects to HW hash once enabled.
+
 Statistics
 ----------
 
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index a823d26beb..1d7fa7dc6c 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -928,7 +928,6 @@  mlx5_representor_match(struct mlx5_dev_spawn_data *spawn,
 	return false;
 }
 
-
 /**
  * Spawn an Ethernet device from Verbs information.
  *
@@ -1707,6 +1706,7 @@  mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	 */
 	MLX5_ASSERT(spawn->ifindex);
 	priv->if_index = spawn->ifindex;
+	priv->lag_affinity_idx = sh->refcnt - 1;
 	eth_dev->data->dev_private = priv;
 	priv->dev_data = eth_dev->data;
 	eth_dev->data->mac_addrs = priv->mac;
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index e28cc461b9..e049a367f0 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1118,6 +1118,68 @@  mlx5_alloc_rxtx_uars(struct mlx5_dev_ctx_shared *sh,
 	return err;
 }
 
+/**
+ * Set up multiple TISs with different affinities according to
+ * number of bonding ports
+ *
+ * @param priv
+ * Pointer of shared context.
+ *
+ * @return
+ * Zero on success, -1 otherwise.
+ */
+static int
+mlx5_setup_tis(struct mlx5_dev_ctx_shared *sh)
+{
+	int i;
+	struct mlx5_devx_lag_context lag_ctx = { 0 };
+	struct mlx5_devx_tis_attr tis_attr = { 0 };
+
+	tis_attr.transport_domain = sh->td->id;
+	if (sh->bond.n_port) {
+		if (!mlx5_devx_cmd_query_lag(sh->ctx, &lag_ctx)) {
+			sh->lag.tx_remap_affinity[0] =
+				lag_ctx.tx_remap_affinity_1;
+			sh->lag.tx_remap_affinity[1] =
+				lag_ctx.tx_remap_affinity_2;
+			sh->lag.affinity_mode = lag_ctx.port_select_mode;
+		} else {
+			DRV_LOG(ERR, "Failed to query lag affinity.");
+			return -1;
+		}
+		if (sh->lag.affinity_mode == MLX5_LAG_MODE_TIS) {
+			for (i = 0; i < sh->bond.n_port; i++) {
+				tis_attr.lag_tx_port_affinity =
+					MLX5_IFC_LAG_MAP_TIS_AFFINITY(i,
+							sh->bond.n_port);
+				sh->tis[i] = mlx5_devx_cmd_create_tis(sh->ctx,
+						&tis_attr);
+				if (!sh->tis[i]) {
+					DRV_LOG(ERR, "Failed to TIS %d/%d for bonding device"
+						" %s.", i, sh->bond.n_port,
+						sh->ibdev_name);
+					return -1;
+				}
+			}
+			DRV_LOG(DEBUG, "LAG number of ports : %d, affinity_1 & 2 : pf%d & %d.\n",
+				sh->bond.n_port, lag_ctx.tx_remap_affinity_1,
+				lag_ctx.tx_remap_affinity_2);
+			return 0;
+		}
+		if (sh->lag.affinity_mode == MLX5_LAG_MODE_HASH)
+			DRV_LOG(INFO, "Device %s enabled HW hash based LAG.",
+					sh->ibdev_name);
+	}
+	tis_attr.lag_tx_port_affinity = 0;
+	sh->tis[0] = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
+	if (!sh->tis[0]) {
+		DRV_LOG(ERR, "Failed to TIS 0 for bonding device"
+			" %s.", sh->ibdev_name);
+		return -1;
+	}
+	return 0;
+}
+
 /**
  * Allocate shared device context. If there is multiport device the
  * master and representors will share this context, if there is single
@@ -1145,7 +1207,6 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 	struct mlx5_dev_ctx_shared *sh;
 	int err = 0;
 	uint32_t i;
-	struct mlx5_devx_tis_attr tis_attr = { 0 };
 
 	MLX5_ASSERT(spawn);
 	/* Secondary process should not create the shared context. */
@@ -1216,9 +1277,7 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 			err = ENOMEM;
 			goto error;
 		}
-		tis_attr.transport_domain = sh->td->id;
-		sh->tis = mlx5_devx_cmd_create_tis(sh->ctx, &tis_attr);
-		if (!sh->tis) {
+		if (mlx5_setup_tis(sh)) {
 			DRV_LOG(ERR, "TIS allocation failure");
 			err = ENOMEM;
 			goto error;
@@ -1282,10 +1341,13 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 	MLX5_ASSERT(sh);
 	if (sh->share_cache.cache.table)
 		mlx5_mr_btree_free(&sh->share_cache.cache);
-	if (sh->tis)
-		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
 	if (sh->td)
 		claim_zero(mlx5_devx_cmd_destroy(sh->td));
+	i = 0;
+	do {
+		if (sh->tis[i])
+			claim_zero(mlx5_devx_cmd_destroy(sh->tis[i]));
+	} while (++i < (uint32_t)sh->bond.n_port);
 	if (sh->devx_rx_uar)
 		mlx5_glue->devx_free_uar(sh->devx_rx_uar);
 	if (sh->tx_uar)
@@ -1310,6 +1372,7 @@  mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 void
 mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
 {
+	int i = 0;
 	pthread_mutex_lock(&mlx5_dev_ctx_list_mutex);
 #ifdef RTE_LIBRTE_MLX5_DEBUG
 	/* Check the object presence in the list. */
@@ -1361,8 +1424,10 @@  mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
 	}
 	if (sh->pd)
 		claim_zero(mlx5_os_dealloc_pd(sh->pd));
-	if (sh->tis)
-		claim_zero(mlx5_devx_cmd_destroy(sh->tis));
+	do {
+		if (sh->tis[i])
+			claim_zero(mlx5_devx_cmd_destroy(sh->tis[i]));
+	} while (++i < sh->bond.n_port);
 	if (sh->td)
 		claim_zero(mlx5_devx_cmd_destroy(sh->td));
 	if (sh->devx_rx_uar)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index a15f86616d..7ff5feaf4a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1111,6 +1111,12 @@  struct mlx5_aso_ct_pools_mng {
 	struct mlx5_aso_sq aso_sq; /* ASO queue objects. */
 };
 
+/* LAG attr. */
+struct mlx5_lag {
+	uint8_t tx_remap_affinity[16]; /* The PF port number of affinity */
+	uint8_t affinity_mode; /* TIS or hash based affinity */
+};
+
 /*
  * Shared Infiniband device context for Master/Representors
  * which belong to same IB device with multiple IB ports.
@@ -1178,8 +1184,9 @@  struct mlx5_dev_ctx_shared {
 	struct rte_intr_handle intr_handle; /* Interrupt handler for device. */
 	struct rte_intr_handle intr_handle_devx; /* DEVX interrupt handler. */
 	void *devx_comp; /* DEVX async comp obj. */
-	struct mlx5_devx_obj *tis; /* TIS object. */
+	struct mlx5_devx_obj *tis[16]; /* TIS object. */
 	struct mlx5_devx_obj *td; /* Transport domain. */
+	struct mlx5_lag lag; /* LAG attributes */
 	void *tx_uar; /* Tx/packet pacing shared UAR. */
 	struct mlx5_flex_parser_profiles fp[MLX5_FLEX_PARSER_MAX];
 	/* Flex parser profiles information. */
@@ -1445,6 +1452,7 @@  struct mlx5_priv {
 	uint32_t rss_shared_actions; /* RSS shared actions. */
 	struct mlx5_devx_obj *q_counters; /* DevX queue counter object. */
 	uint32_t counter_set_id; /* Queue counter ID to set in DevX objects. */
+	uint32_t lag_affinity_idx; /* LAG mode queue 0 affinity starting. */
 };
 
 #define PORT_ID(priv) ((priv)->dev_data->port_id)
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index a1db53577a..bff81c7df2 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -888,6 +888,37 @@  mlx5_devx_drop_action_destroy(struct rte_eth_dev *dev)
 	rte_errno = ENOTSUP;
 }
 
+/**
+ * Select TXQ TIS number.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param queue_idx
+ *   Queue index in DPDK Tx queue array.
+ *
+ * @return
+ *   > 0 on success, a negative errno value otherwise.
+ */
+static uint32_t
+mlx5_get_txq_tis_num(struct rte_eth_dev *dev, uint16_t queue_idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	int tis_idx;
+
+	if (priv->sh->bond.n_port && priv->sh->lag.affinity_mode ==
+			MLX5_LAG_MODE_TIS) {
+		tis_idx = (priv->lag_affinity_idx + queue_idx) %
+			priv->sh->bond.n_port;
+		DRV_LOG(INFO, "port %d txq %d gets affinity %d and maps to PF %d.",
+			dev->data->port_id, queue_idx, tis_idx + 1,
+			priv->sh->lag.tx_remap_affinity[tis_idx]);
+	} else {
+		tis_idx = 0;
+	}
+	MLX5_ASSERT(priv->sh->tis[tis_idx]);
+	return priv->sh->tis[tis_idx]->id;
+}
+
 /**
  * Create the Tx hairpin queue object.
  *
@@ -935,7 +966,8 @@  mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx)
 	attr.wq_attr.log_hairpin_num_packets =
 			attr.wq_attr.log_hairpin_data_sz -
 			MLX5_HAIRPIN_QUEUE_STRIDE;
-	attr.tis_num = priv->sh->tis->id;
+
+	attr.tis_num = mlx5_get_txq_tis_num(dev, idx);
 	tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr);
 	if (!tmpl->sq) {
 		DRV_LOG(ERR,
@@ -992,14 +1024,15 @@  mlx5_txq_create_devx_sq_resources(struct rte_eth_dev *dev, uint16_t idx,
 		.allow_swp = !!priv->config.swp,
 		.cqn = txq_obj->cq_obj.cq->id,
 		.tis_lst_sz = 1,
-		.tis_num = priv->sh->tis->id,
 		.wq_attr = (struct mlx5_devx_wq_attr){
 			.pd = priv->sh->pdn,
 			.uar_page =
 				 mlx5_os_get_devx_uar_page_id(priv->sh->tx_uar),
 		},
 		.ts_format = mlx5_ts_format_conv(priv->sh->sq_ts_format),
+		.tis_num = mlx5_get_txq_tis_num(dev, idx),
 	};
+
 	/* Create Send Queue object with DevX. */
 	return mlx5_devx_sq_create(priv->sh->ctx, &txq_obj->sq_obj, log_desc_n,
 				   &sq_attr, priv->sh->numa_node);
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 2be7e71f89..6e874fa090 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -230,7 +230,7 @@  mlx5_txpp_create_rearm_queue(struct mlx5_dev_ctx_shared *sh)
 		.cd_master = 1,
 		.state = MLX5_SQC_STATE_RST,
 		.tis_lst_sz = 1,
-		.tis_num = sh->tis->id,
+		.tis_num = sh->tis[0]->id,
 		.wq_attr = (struct mlx5_devx_wq_attr){
 			.pd = sh->pdn,
 			.uar_page = mlx5_os_get_devx_uar_page_id(sh->tx_uar),
@@ -433,7 +433,7 @@  mlx5_txpp_create_clock_queue(struct mlx5_dev_ctx_shared *sh)
 	/* Create send queue object for Clock Queue. */
 	if (sh->txpp.test) {
 		sq_attr.tis_lst_sz = 1;
-		sq_attr.tis_num = sh->tis->id;
+		sq_attr.tis_num = sh->tis[0]->id;
 		sq_attr.non_wire = 0;
 		sq_attr.static_sq_wq = 1;
 	} else {