[v2] net/mlx5: support live migration

Message ID 20230215115937.3928360-1-rongweil@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Raslan Darawsheh
Series [v2] net/mlx5: support live migration

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS

Commit Message

Rongwei Liu Feb. 15, 2023, 11:59 a.m. UTC
  When a DPDK application must be upgraded,
the traffic downtime should be shortened as much as possible.
During the migration time, the old application may stay alive
while the new application is starting and being configured.

In order to optimize the switch to the new application,
the old application may need to be aware of the presence
of the new application being prepared.
This is achieved with a new API allowing the user to set the
new application state to standby first, and to active later.

The added function tries to apply the new mode to all probed
mlx5 ports. To keep this API simple and easy to use,
the same flags must be accepted by all devices.

This is the scenario of operations in the old and new applications:
.       device: already configured by the old application
.       new:    start as active
.       new:    probe the same device
.       new:    set as standby
.       new:    configure the device
.       device: has configurations from old and new applications
.       old:    clear its device configuration
.       device: has only 1 configuration from new application
.       new:    set as active
.       device: downtime for connecting all to the new application
.       old:    shutdown

In active mode, network handling configurations are programmed
to the HW immediately and the behavior is unchanged; this is the default state.
In standby mode, configurations are queued in the HW;
once no application is in active mode,
the queued configurations take effect immediately.
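
As an illustration, the new application's side of this sequence could
look like the minimal sketch below. setup_flow_rules() and the
synchronization with the old application are hypothetical placeholders;
only rte_pmd_mlx5_flow_engine_set_mode() is added by this patch.

    #include <rte_errno.h>
    #include "rte_pmd_mlx5.h"

    /* Hypothetical application helper: programs its group 0 flow rules. */
    static void setup_flow_rules(void) { /* rte_flow_create() calls */ }

    static int
    migrate_in(void)
    {
            int ret;

            /* Queue this process's rules behind the old application's. */
            ret = rte_pmd_mlx5_flow_engine_set_mode(
                            MLX5_FLOW_ENGINE_MODE_STANDBY,
                            MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS);
            if (ret < 0)
                    return ret; /* -EINVAL: rolled back; -EPERM: no recovery. */
            setup_flow_rules(); /* Queued by HW while the old engine is active. */
            /* ... wait here until the old application flushes its rules ... */
            return rte_pmd_mlx5_flow_engine_set_mode(
                            MLX5_FLOW_ENGINE_MODE_ACTIVE, 0);
    }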

Signed-off-by: Rongwei Liu <rongweil@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
 doc/guides/nics/mlx5.rst        |   8 +
 drivers/net/mlx5/mlx5.h         |  24 +++
 drivers/net/mlx5/mlx5_flow.c    | 313 +++++++++++++++++++++++++++++++-
 drivers/net/mlx5/rte_pmd_mlx5.h |  66 +++++++
 drivers/net/mlx5/version.map    |   2 +
 5 files changed, 409 insertions(+), 4 deletions(-)
  

Patch

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index ee2df66e77..f232efad36 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -612,6 +612,14 @@  Limitations
 - When using DV/verbs flow engine (``dv_flow_en`` = 1/0 respectively), Match on SPI field
   in ESP header for group 0 needs MLNX_OFED 5.6+.
 
+- During live migration, when a new process sets its flow engine to standby mode,
+  the user should only program flow rules in group 0 (``fdb_def_rule_en=0``).
+  Live migration is only supported under SWS (``dv_flow_en=1``).
+  Flow group 0 is shared between DPDK processes,
+  while the other flow groups are limited to the current process.
+  The flow engine of a process cannot move from active to standby mode
+  if rules from a preceding active application are still present, and vice versa.
+
 
 Statistics
 ----------
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bea1f62ea8..e16ff288af 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -33,6 +33,7 @@ 
 #include "mlx5_utils.h"
 #include "mlx5_os.h"
 #include "mlx5_autoconf.h"
+#include "rte_pmd_mlx5.h"
 #if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
 #ifndef RTE_EXEC_ENV_WINDOWS
 #define HAVE_MLX5_HWS_SUPPORT 1
@@ -1656,6 +1657,28 @@  struct mlx5_hw_ctrl_flow {
 	struct rte_flow *flow;
 };
 
+/*
+ * Flow rule structure for flow engine mode control, focus on group 0.
+ * Apply to all supported domains.
+ */
+struct mlx5_dv_flow_info {
+	LIST_ENTRY(mlx5_dv_flow_info) next;
+	uint32_t orig_prio; /* prio set by user */
+	uint32_t flow_idx_high_prio;
+	/* Flow index owned by standby mode. Priority is lower unless DUP flags are set. */
+	uint32_t flow_idx_low_prio;
+	struct rte_flow_item *items;
+	struct rte_flow_action *actions;
+	struct rte_flow_attr attr;
+};
+
+struct mlx5_flow_engine_mode_info {
+	enum mlx5_flow_engine_mode mode;
+	uint32_t mode_flag;
+	/* The list is maintained in insertion order. */
+	LIST_HEAD(hot_up_info, mlx5_dv_flow_info) hot_upgrade;
+};
+
 struct mlx5_flow_hw_ctrl_rx;
 
 struct mlx5_priv {
@@ -1763,6 +1786,7 @@  struct mlx5_priv {
 	uint32_t nb_queue; /* HW steering queue number. */
 	struct mlx5_hws_cnt_pool *hws_cpool; /* HW steering's counter pool. */
 	uint32_t hws_mark_refcnt; /* HWS mark action reference counter. */
+	struct mlx5_flow_engine_mode_info mode_info; /* Process set flow engine info. */
 #if defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_INFINIBAND_VERBS_H)
 	/* Item template list. */
 	LIST_HEAD(flow_hw_itt, rte_flow_pattern_template) flow_hw_itt;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 73e6d3b486..360d7a9dbb 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -164,6 +164,16 @@  mlx5_flow_expand_rss_adjust_node(const struct rte_flow_item *pattern,
 		const struct mlx5_flow_expand_node graph[],
 		const struct mlx5_flow_expand_node *node);
 
+static __rte_always_inline int
+mlx5_need_cache_flow(const struct mlx5_priv *priv,
+		     const struct rte_flow_attr *attr)
+{
+	return priv->isolated && priv->sh->config.dv_flow_en == 1 &&
+		(attr ? !attr->group : true) &&
+		priv->mode_info.mode == MLX5_FLOW_ENGINE_MODE_STANDBY &&
+		(!priv->sh->config.dv_esw_en || !priv->sh->config.fdb_def_rule);
+}
+
 static bool
 mlx5_flow_is_rss_expandable_item(const struct rte_flow_item *item)
 {
@@ -7477,6 +7487,252 @@  mlx5_flow_validate(struct rte_eth_dev *dev,
 	return ret;
 }
 
+static int
+mlx5_flow_cache_flow_info(struct rte_eth_dev *dev,
+			  const struct rte_flow_attr *attr,
+			  const uint32_t orig_prio,
+			  const struct rte_flow_item *items,
+			  const struct rte_flow_action *actions,
+			  uint32_t flow_idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+	struct mlx5_dv_flow_info *flow_info, *tmp_info;
+	struct rte_flow_error error;
+	int len, ret;
+
+	flow_info = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*flow_info), 0, SOCKET_ID_ANY);
+	if (!flow_info) {
+		DRV_LOG(ERR, "Not enough memory for flow_info caching.");
+		return -1;
+	}
+	flow_info->orig_prio = orig_prio;
+	flow_info->attr = *attr;
+	/* A standby mode rule is always saved in the low priority entry. */
+	flow_info->flow_idx_low_prio = flow_idx;
+
+	/* Store matching items. */
+	ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, NULL, 0, items, &error);
+	if (ret <= 0) {
+		DRV_LOG(ERR, "Can't get items length.");
+		goto end;
+	}
+	len = RTE_ALIGN(ret, 16);
+	flow_info->items = mlx5_malloc(MLX5_MEM_ZERO, len, 0, SOCKET_ID_ANY);
+	if (!flow_info->items) {
+		DRV_LOG(ERR, "Not enough memory for items caching.");
+		goto end;
+	}
+	ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN, flow_info->items, ret, items, &error);
+	if (ret <= 0) {
+		DRV_LOG(ERR, "Can't duplicate items.");
+		goto end;
+	}
+
+	/* Store flow actions. */
+	ret = rte_flow_conv(RTE_FLOW_CONV_OP_ACTIONS, NULL, 0, actions, &error);
+	if (ret <= 0) {
+		DRV_LOG(ERR, "Can't get actions length.");
+		goto end;
+	}
+	len = RTE_ALIGN(ret, 16);
+	flow_info->actions = mlx5_malloc(MLX5_MEM_ZERO, len, 0, SOCKET_ID_ANY);
+	if (!flow_info->actions) {
+		DRV_LOG(ERR, "Not enough memory for actions caching.");
+		goto end;
+	}
+	ret = rte_flow_conv(RTE_FLOW_CONV_OP_ACTIONS, flow_info->actions, ret, actions, &error);
+	if (ret <= 0) {
+		DRV_LOG(ERR, "Can't duplicate actions.");
+		goto end;
+	}
+
+	/* Insert to the list end. */
+	if (LIST_EMPTY(&mode_info->hot_upgrade)) {
+		LIST_INSERT_HEAD(&mode_info->hot_upgrade, flow_info, next);
+	} else {
+		tmp_info = LIST_FIRST(&mode_info->hot_upgrade);
+		while (LIST_NEXT(tmp_info, next))
+			tmp_info = LIST_NEXT(tmp_info, next);
+		LIST_INSERT_AFTER(tmp_info, flow_info, next);
+	}
+	return 0;
+end:
+	if (flow_info->items)
+		mlx5_free(flow_info->items);
+	if (flow_info->actions)
+		mlx5_free(flow_info->actions);
+	mlx5_free(flow_info);
+	return -1;
+}
+
+static int
+mlx5_flow_cache_flow_toggle(struct rte_eth_dev *dev, bool orig_prio)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+	struct mlx5_dv_flow_info *flow_info;
+	struct rte_flow_attr attr;
+	struct rte_flow_error error;
+	struct rte_flow *high, *low;
+
+	flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+	while (flow_info) {
+		/* DUP flow may have the same priority. */
+		if (flow_info->orig_prio != flow_info->attr.priority) {
+			attr = flow_info->attr;
+			if (orig_prio)
+				attr.priority = flow_info->orig_prio;
+			flow_info->flow_idx_high_prio = flow_list_create(dev, MLX5_FLOW_TYPE_GEN,
+					&attr, flow_info->items, flow_info->actions,
+					true, &error);
+			if (!flow_info->flow_idx_high_prio) {
+				DRV_LOG(ERR, "Priority toggle failed internally.");
+				goto err;
+			}
+		}
+		flow_info = LIST_NEXT(flow_info, next);
+	}
+	/* Delete the low priority rules and swap the flow handle. */
+	flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+	while (flow_info) {
+		MLX5_ASSERT(flow_info->flow_idx_low_prio);
+		if (flow_info->orig_prio != flow_info->attr.priority) {
+			high = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
+					flow_info->flow_idx_high_prio);
+			low = mlx5_ipool_get(priv->flows[MLX5_FLOW_TYPE_GEN],
+					flow_info->flow_idx_low_prio);
+			if (high && low) {
+				RTE_SWAP(*low, *high);
+				flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN, flow_info->flow_idx_low_prio);
+				flow_info->flow_idx_high_prio = 0;
+			}
+		}
+		flow_info = LIST_NEXT(flow_info, next);
+	}
+	return 0;
+err:
+	/* Destroy preceding successful high priority rules. */
+	flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+	while (flow_info) {
+		if (flow_info->orig_prio != flow_info->attr.priority) {
+			if (flow_info->flow_idx_high_prio)
+				flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN, flow_info->flow_idx_high_prio);
+			else
+				break;
+			flow_info->flow_idx_high_prio = 0;
+		}
+		flow_info = LIST_NEXT(flow_info, next);
+	}
+	return -1;
+}
+
+/**
+ * Set the mode of the flow engine of a process to active or standby during live migration.
+ *
+ * @param[in] mode
+ *   MLX5 flow engine mode, @see `enum mlx5_flow_engine_mode`.
+ * @param[in] flags
+ *   Flow engine mode specific flags.
+ *
+ * @return
+ *   Positive value (number of switched devices) on success, negative on error.
+ */
+int
+rte_pmd_mlx5_flow_engine_set_mode(enum mlx5_flow_engine_mode mode, uint32_t flags)
+{
+	struct mlx5_priv *priv;
+	struct mlx5_flow_engine_mode_info *mode_info;
+	struct mlx5_dv_flow_info *flow_info, *tmp_info;
+	uint16_t port, port_id;
+	uint16_t toggle_num = 0;
+	struct rte_eth_dev *dev;
+	enum mlx5_flow_engine_mode orig_mode;
+	uint32_t orig_flags;
+	bool need_toggle = false;
+
+	/* Check if flags combinations are supported. */
+	if (flags && flags != MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS) {
+		DRV_LOG(ERR, "Unsupported flags %u", flags);
+		return -1;
+	}
+	MLX5_ETH_FOREACH_DEV(port, NULL) {
+		dev = &rte_eth_devices[port];
+		priv = dev->data->dev_private;
+		mode_info = &priv->mode_info;
+		/* No mode change. Assume all devices hold the same mode. */
+		if (mode_info->mode == mode) {
+			DRV_LOG(INFO, "Process flow engine is already in mode %u", mode);
+			if (mode_info->mode_flag != flags && !LIST_EMPTY(&mode_info->hot_upgrade)) {
+				DRV_LOG(ERR, "Port %u has cached rules with a different flag %u",
+						port, mode_info->mode_flag);
+				orig_mode = mode_info->mode;
+				orig_flags = mode_info->mode_flag;
+				goto err;
+			}
+			mode_info->mode_flag = flags;
+			toggle_num++;
+			continue;
+		}
+		/* Active -> standby. */
+		if (mode == MLX5_FLOW_ENGINE_MODE_STANDBY) {
+			if (!LIST_EMPTY(&mode_info->hot_upgrade)) {
+				DRV_LOG(ERR, "Cached rules exist");
+				orig_mode = mode_info->mode;
+				orig_flags = mode_info->mode_flag;
+				goto err;
+			}
+			mode_info->mode_flag = flags;
+			mode_info->mode = mode;
+			toggle_num++;
+		/* Standby -> active. */
+		} else if (mode == MLX5_FLOW_ENGINE_MODE_ACTIVE) {
+			if (LIST_EMPTY(&mode_info->hot_upgrade)) {
+				DRV_LOG(INFO, "No cached rules exist");
+			} else {
+				if (mlx5_flow_cache_flow_toggle(dev, true)) {
+					orig_mode = mode_info->mode;
+					orig_flags = mode_info->mode_flag;
+					need_toggle = true;
+					goto err;
+				}
+			}
+			toggle_num++;
+		}
+	}
+	if (mode == MLX5_FLOW_ENGINE_MODE_ACTIVE) {
+		/* Clear cache flow rules. */
+		MLX5_ETH_FOREACH_DEV(port, NULL) {
+			priv = rte_eth_devices[port].data->dev_private;
+			mode_info = &priv->mode_info;
+			flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+			while (flow_info) {
+				tmp_info = LIST_NEXT(flow_info, next);
+				LIST_REMOVE(flow_info, next);
+				mlx5_free(flow_info->actions);
+				mlx5_free(flow_info->items);
+				mlx5_free(flow_info);
+				flow_info = tmp_info;
+			}
+			MLX5_ASSERT(LIST_EMPTY(&mode_info->hot_upgrade));
+		}
+	}
+	return toggle_num;
+err:
+	/* Rollback all preceding successful ports. */
+	MLX5_ETH_FOREACH_DEV(port_id, NULL) {
+		if (port_id == port)
+			break;
+		priv = rte_eth_devices[port_id].data->dev_private;
+		mode_info = &priv->mode_info;
+		if (need_toggle && !LIST_EMPTY(&mode_info->hot_upgrade) &&
+		    mlx5_flow_cache_flow_toggle(&rte_eth_devices[port_id], false))
+			return -EPERM;
+		mode_info->mode = orig_mode;
+		mode_info->mode_flag = orig_flags;
+	}
+	return -EINVAL;
+}
 /**
  * Create a flow.
  *
@@ -7491,6 +7747,9 @@  mlx5_flow_create(struct rte_eth_dev *dev,
 		 struct rte_flow_error *error)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct rte_flow_attr *new_attr = (void *)(uintptr_t)attr;
+	uint32_t prio = attr->priority;
+	uint32_t flow_idx;
 
 	if (priv->sh->config.dv_flow_en == 2) {
 		rte_flow_error_set(error, ENOTSUP,
@@ -7513,10 +7772,22 @@  mlx5_flow_create(struct rte_eth_dev *dev,
 				   "port not started");
 		return NULL;
 	}
-
-	return (void *)(uintptr_t)flow_list_create(dev, MLX5_FLOW_TYPE_GEN,
-						   attr, items, actions,
-						   true, error);
+	if (unlikely(mlx5_need_cache_flow(priv, attr))) {
+		if (attr->transfer ||
+		    (attr->ingress &&
+		    !(priv->mode_info.mode_flag & MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS)))
+			new_attr->priority += 1;
+	}
+	flow_idx = flow_list_create(dev, MLX5_FLOW_TYPE_GEN, attr, items, actions, true, error);
+	if (!flow_idx)
+		return NULL;
+	if (unlikely(mlx5_need_cache_flow(priv, attr))) {
+		if (mlx5_flow_cache_flow_info(dev, attr, prio, items, actions, flow_idx)) {
+			flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN, flow_idx);
+			flow_idx = 0;
+		}
+	}
+	return (void *)(uintptr_t)flow_idx;
 }
 
 /**
@@ -7573,6 +7844,8 @@  mlx5_flow_list_flush(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 	struct mlx5_priv *priv = dev->data->dev_private;
 	uint32_t num_flushed = 0, fidx = 1;
 	struct rte_flow *flow;
+	struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+	struct mlx5_dv_flow_info *flow_info;
 
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	if (priv->sh->config.dv_flow_en == 2 &&
@@ -7584,6 +7857,21 @@  mlx5_flow_list_flush(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 
 	MLX5_IPOOL_FOREACH(priv->flows[type], fidx, flow) {
 		flow_list_destroy(dev, type, fidx);
+		if (unlikely(mlx5_need_cache_flow(priv, NULL) && type == MLX5_FLOW_TYPE_GEN)) {
+			flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+			while (flow_info) {
+				/* Remove the cached flow info. */
+				if (flow_info->flow_idx_low_prio == (uint32_t)(uintptr_t)fidx) {
+					MLX5_ASSERT(!flow_info->flow_idx_high_prio);
+					LIST_REMOVE(flow_info, next);
+					mlx5_free(flow_info->items);
+					mlx5_free(flow_info->actions);
+					mlx5_free(flow_info);
+					break;
+				}
+				flow_info = LIST_NEXT(flow_info, next);
+			}
+		}
 		num_flushed++;
 	}
 	if (active) {
@@ -8032,6 +8320,8 @@  mlx5_flow_destroy(struct rte_eth_dev *dev,
 		  struct rte_flow_error *error __rte_unused)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_engine_mode_info *mode_info = &priv->mode_info;
+	struct mlx5_dv_flow_info *flow_info;
 
 	if (priv->sh->config.dv_flow_en == 2)
 		return rte_flow_error_set(error, ENOTSUP,
@@ -8040,6 +8330,21 @@  mlx5_flow_destroy(struct rte_eth_dev *dev,
 			  "Flow non-Q destruction not supported");
 	flow_list_destroy(dev, MLX5_FLOW_TYPE_GEN,
 				(uintptr_t)(void *)flow);
+	if (unlikely(mlx5_need_cache_flow(priv, NULL))) {
+		flow_info = LIST_FIRST(&mode_info->hot_upgrade);
+		while (flow_info) {
+			/* Remove the cached flow info. */
+			if (flow_info->flow_idx_low_prio == (uint32_t)(uintptr_t)flow) {
+				MLX5_ASSERT(!flow_info->flow_idx_high_prio);
+				LIST_REMOVE(flow_info, next);
+				mlx5_free(flow_info->items);
+				mlx5_free(flow_info->actions);
+				mlx5_free(flow_info);
+				break;
+			}
+			flow_info = LIST_NEXT(flow_info, next);
+		}
+	}
 	return 0;
 }
 
diff --git a/drivers/net/mlx5/rte_pmd_mlx5.h b/drivers/net/mlx5/rte_pmd_mlx5.h
index b71a291256..446235426e 100644
--- a/drivers/net/mlx5/rte_pmd_mlx5.h
+++ b/drivers/net/mlx5/rte_pmd_mlx5.h
@@ -158,6 +158,72 @@  int rte_pmd_mlx5_host_shaper_config(int port_id, uint8_t rate, uint32_t flags);
 __rte_experimental
 int rte_pmd_mlx5_external_sq_enable(uint16_t port_id, uint32_t sq_num);
 
+/* MLX5 flow engine mode definition for live migration. */
+enum mlx5_flow_engine_mode {
+	MLX5_FLOW_ENGINE_MODE_ACTIVE, /* active means high priority, effective in HW. */
+	MLX5_FLOW_ENGINE_MODE_STANDBY, /* standby mode with lower priority flow rules. */
+};
+
+/**
+ * When set on the flow engine of a standby process, ingress flow rules will be effective
+ * in active and standby processes, so the ingress traffic may be duplicated.
+ */
+#define MLX5_FLOW_ENGINE_FLAG_STANDBY_DUP_INGRESS      RTE_BIT32(0)
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Set the flow engine mode of the process to active or standby,
+ * affecting network traffic handling.
+ *
+ * If one device does not support this operation or fails,
+ * the whole operation fails and is rolled back.
+ *
+ * It is forbidden to have multiple flow engines with the same mode
+ * unless only one of them is configured to handle the traffic.
+ *
+ * The application's flow engine is active by default.
+ * The configuration from the active flow engine is effective immediately
+ * while the configuration from the standby flow engine is queued by hardware.
+ * When configuring the device from a standby flow engine,
+ * it has no effect except in the situations below:
+ *   - traffic not handled by the active flow engine configuration
+ *   - no active flow engine
+ *
+ * When the flow engine of a process is changed from standby to active mode,
+ * all preceding configurations that are queued by hardware
+ * should become effective immediately.
+ * Before mode transition, all the traffic handling configurations
+ * set by the active flow engine should be flushed first.
+ *
+ * In summary, the operations are expected to happen in this order
+ * in "old" and "new" applications:
+ *   device: already configured by the old application
+ *   new:    start as active
+ *   new:    probe the same device
+ *   new:    set as standby
+ *   new:    configure the device
+ *   device: has configurations from old and new applications
+ *   old:    clear its device configuration
+ *   device: has only 1 configuration from new application
+ *   new:    set as active
+ *   device: downtime for connecting all to the new application
+ *   old:    shutdown
+ *
+ * @param mode
+ *   The desired mode, @see `enum mlx5_flow_engine_mode`.
+ * @param flags
+ *   Mode specific flags.
+ * @return
+ *   Positive value on success, negative errno value on error:
+ *   - (> 0) Number of switched devices.
+ *   - (-EINVAL) if an error happened and changes were rolled back internally.
+ *   - (-EPERM) if the operation failed and cannot be recovered.
+ */
+__rte_experimental
+int rte_pmd_mlx5_flow_engine_set_mode(enum mlx5_flow_engine_mode mode, uint32_t flags);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/drivers/net/mlx5/version.map b/drivers/net/mlx5/version.map
index 848270da13..7ef598027b 100644
--- a/drivers/net/mlx5/version.map
+++ b/drivers/net/mlx5/version.map
@@ -15,4 +15,6 @@  EXPERIMENTAL {
 	# added in 22.07
 	rte_pmd_mlx5_host_shaper_config;
 	rte_pmd_mlx5_external_sq_enable;
+	# added in 23.03
+	rte_pmd_mlx5_flow_engine_set_mode;
 };
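
For completeness, a minimal sketch of the old application's side of the
handover, assuming a hypothetical wait_for_new_app_standby() IPC helper;
the inter-process signaling is not part of this patch.

    #include <stdint.h>
    #include <rte_flow.h>

    /* Hypothetical IPC helper: blocks until the new process reports it is
     * in standby mode and fully configured. Not part of this patch. */
    static void wait_for_new_app_standby(void) { /* application-specific */ }

    static void
    migrate_out(uint16_t port_id)
    {
            struct rte_flow_error error;

            wait_for_new_app_standby();
            /*
             * Flush this process's rules. The new process's queued group 0
             * rules become effective once no active engine remains, and it
             * can then switch its own flow engine to active mode.
             */
            rte_flow_flush(port_id, &error);
    }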