[dpdk-dev,v5,07/11] net/mlx5: support tunnel RSS level

Message ID 20180420122340.113348-8-xuemingl@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply patch file failure

Commit Message

Xueming Li April 20, 2018, 12:23 p.m. UTC
  The tunnel RSS level of the flow RSS action lets the user choose whether
the RSS hash is calculated on inner or outer RSS fields. Testpmd flow command examples:

GRE tunnel flow inner RSS:
  flow create 0 ingress pattern eth / ipv4 proto is 47 / gre / end
actions rss queues 1 2 end level 1 / end

GRE tunnel flow outer RSS:
  flow create 0 ingress pattern eth  / ipv4 proto is 47 / gre / end
actions rss queues 1 2 end level 0 / end

Signed-off-by: Xueming Li <xuemingl@mellanox.com>
Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/Makefile    |   2 +-
 drivers/net/mlx5/mlx5_flow.c | 257 +++++++++++++++++++++++++++----------------
 drivers/net/mlx5/mlx5_glue.c |  16 +++
 drivers/net/mlx5/mlx5_glue.h |   8 ++
 drivers/net/mlx5/mlx5_rxq.c  |  58 +++++++++-
 drivers/net/mlx5/mlx5_rxtx.h |   5 +-
 6 files changed, 240 insertions(+), 106 deletions(-)
  

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index b710a10f5..d9447ace9 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -8,7 +8,7 @@  include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_pmd_mlx5.a
 LIB_GLUE = $(LIB_GLUE_BASE).$(LIB_GLUE_VERSION)
 LIB_GLUE_BASE = librte_pmd_mlx5_glue.so
-LIB_GLUE_VERSION = 18.02.0
+LIB_GLUE_VERSION = 18.05.0
 
 # Sources.
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5.c
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index c2e57094e..174f2ba6e 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -117,6 +117,7 @@  enum hash_rxq_type {
 	HASH_RXQ_UDPV6,
 	HASH_RXQ_IPV6,
 	HASH_RXQ_ETH,
+	HASH_RXQ_TUNNEL,
 };
 
 /* Initialization data for hash RX queue. */
@@ -455,6 +456,7 @@  struct mlx5_flow_parse {
 	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< Queues indexes to use. */
 	uint8_t rss_key[40]; /**< copy of the RSS key. */
 	enum hash_rxq_type layer; /**< Last pattern layer detected. */
+	enum hash_rxq_type out_layer; /**< Last outer pattern layer detected. */
 	uint32_t tunnel; /**< Tunnel type of RTE_PTYPE_TUNNEL_XXX. */
 	struct ibv_counter_set *cs; /**< Holds the counter set for the rule */
 	struct {
@@ -462,6 +464,7 @@  struct mlx5_flow_parse {
 		/**< Pointer to Verbs attributes. */
 		unsigned int offset;
 		/**< Current position or total size of the attribute. */
+		uint64_t hash_fields; /**< Verbs hash fields. */
 	} queue[RTE_DIM(hash_rxq_init)];
 };
 
@@ -697,7 +700,8 @@  mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " function is Toeplitz");
 				return -rte_errno;
 			}
-			if (rss->level) {
+#ifndef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+			if (parser->rss_conf.level > 1) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
 						   actions,
@@ -705,6 +709,15 @@  mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 						   " level is not supported");
 				return -rte_errno;
 			}
+#endif
+			if (parser->rss_conf.level > 2) {
+				rte_flow_error_set(error, EINVAL,
+						   RTE_FLOW_ERROR_TYPE_ACTION,
+						   actions,
+						   "RSS encapsulation level"
+						   " > 1 is not supported");
+				return -rte_errno;
+			}
 			if (rss->types & MLX5_RSS_HF_MASK) {
 				rte_flow_error_set(error, EINVAL,
 						   RTE_FLOW_ERROR_TYPE_ACTION,
@@ -755,7 +768,7 @@  mlx5_flow_convert_actions(struct rte_eth_dev *dev,
 			}
 			parser->rss_conf = (struct rte_flow_action_rss){
 				.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
-				.level = 0,
+				.level = rss->level,
 				.types = rss->types,
 				.key_len = rss_key_len,
 				.queue_num = rss->queue_num,
@@ -839,10 +852,12 @@  mlx5_flow_convert_actions(struct rte_eth_dev *dev,
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx5_flow_convert_items_validate(const struct rte_flow_item items[],
+mlx5_flow_convert_items_validate(struct rte_eth_dev *dev,
+				 const struct rte_flow_item items[],
 				 struct rte_flow_error *error,
 				 struct mlx5_flow_parse *parser)
 {
+	struct priv *priv = dev->data->dev_private;
 	const struct mlx5_flow_items *cur_item = mlx5_flow_items;
 	unsigned int i;
 	int ret = 0;
@@ -882,6 +897,14 @@  mlx5_flow_convert_items_validate(const struct rte_flow_item items[],
 						   " tunnel encapsulations.");
 				return -rte_errno;
 			}
+			if (!priv->config.tunnel_en &&
+			    parser->rss_conf.level > 1) {
+				rte_flow_error_set(error, ENOTSUP,
+					RTE_FLOW_ERROR_TYPE_ITEM,
+					items,
+					"RSS on tunnel is not supported");
+				return -rte_errno;
+			}
 			parser->inner = IBV_FLOW_SPEC_INNER;
 			parser->tunnel = flow_ptype[items->type];
 		}
@@ -1001,7 +1024,11 @@  static void
 mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 {
 	unsigned int i;
+	uint32_t inner = parser->inner;
 
+	/* Don't create extra flows for outer RSS. */
+	if (parser->tunnel && parser->rss_conf.level < 2)
+		return;
 	/*
 	 * Fill missing layers in verbs specifications, or compute the correct
 	 * offset to allocate the memory space for the attributes and
@@ -1012,23 +1039,25 @@  mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 			struct ibv_flow_spec_ipv4_ext ipv4;
 			struct ibv_flow_spec_ipv6 ipv6;
 			struct ibv_flow_spec_tcp_udp udp_tcp;
+			struct ibv_flow_spec_eth eth;
 		} specs;
 		void *dst;
 		uint16_t size;
 
 		if (i == parser->layer)
 			continue;
-		if (parser->layer == HASH_RXQ_ETH) {
+		if (parser->layer == HASH_RXQ_ETH ||
+		    parser->layer == HASH_RXQ_TUNNEL) {
 			if (hash_rxq_init[i].ip_version == MLX5_IPV4) {
 				size = sizeof(struct ibv_flow_spec_ipv4_ext);
 				specs.ipv4 = (struct ibv_flow_spec_ipv4_ext){
-					.type = IBV_FLOW_SPEC_IPV4_EXT,
+					.type = inner | IBV_FLOW_SPEC_IPV4_EXT,
 					.size = size,
 				};
 			} else {
 				size = sizeof(struct ibv_flow_spec_ipv6);
 				specs.ipv6 = (struct ibv_flow_spec_ipv6){
-					.type = IBV_FLOW_SPEC_IPV6,
+					.type = inner | IBV_FLOW_SPEC_IPV6,
 					.size = size,
 				};
 			}
@@ -1045,7 +1074,7 @@  mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 		    (i == HASH_RXQ_UDPV6) || (i == HASH_RXQ_TCPV6)) {
 			size = sizeof(struct ibv_flow_spec_tcp_udp);
 			specs.udp_tcp = (struct ibv_flow_spec_tcp_udp) {
-				.type = ((i == HASH_RXQ_UDPV4 ||
+				.type = inner | ((i == HASH_RXQ_UDPV4 ||
 					  i == HASH_RXQ_UDPV6) ?
 					 IBV_FLOW_SPEC_UDP :
 					 IBV_FLOW_SPEC_TCP),
@@ -1075,50 +1104,93 @@  mlx5_flow_convert_finalise(struct mlx5_flow_parse *parser)
 static int
 mlx5_flow_convert_rss(struct mlx5_flow_parse *parser)
 {
-	const unsigned int ipv4 =
-		hash_rxq_init[parser->layer].ip_version == MLX5_IPV4;
-	const enum hash_rxq_type hmin = ipv4 ? HASH_RXQ_TCPV4 : HASH_RXQ_TCPV6;
-	const enum hash_rxq_type hmax = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6;
-	const enum hash_rxq_type ohmin = ipv4 ? HASH_RXQ_TCPV6 : HASH_RXQ_TCPV4;
-	const enum hash_rxq_type ohmax = ipv4 ? HASH_RXQ_IPV6 : HASH_RXQ_IPV4;
-	const enum hash_rxq_type ip = ipv4 ? HASH_RXQ_IPV4 : HASH_RXQ_IPV6;
 	unsigned int i;
-
-	/* Remove any other flow not matching the pattern. */
-	if (parser->rss_conf.queue_num == 1 && !parser->rss_conf.types) {
-		for (i = 0; i != hash_rxq_init_n; ++i) {
-			if (i == HASH_RXQ_ETH)
+	enum hash_rxq_type start;
+	enum hash_rxq_type layer;
+	int outer = parser->tunnel && parser->rss_conf.level < 2;
+	uint64_t rss = parser->rss_conf.types;
+
+	/* Default to outer RSS. */
+	if (!parser->rss_conf.level)
+		parser->rss_conf.level = 1;
+	layer = outer ? parser->out_layer : parser->layer;
+	if (layer == HASH_RXQ_TUNNEL)
+		layer = HASH_RXQ_ETH;
+	if (outer) {
+		/* Only one hash type for outer RSS. */
+		if (rss && layer == HASH_RXQ_ETH) {
+			start = HASH_RXQ_TCPV4;
+		} else if (rss && layer != HASH_RXQ_ETH &&
+			   !(rss & hash_rxq_init[layer].dpdk_rss_hf)) {
+			/* If RSS not match L4 pattern, try L3 RSS. */
+			if (layer < HASH_RXQ_IPV4)
+				layer = HASH_RXQ_IPV4;
+			else if (layer > HASH_RXQ_IPV4 && layer < HASH_RXQ_IPV6)
+				layer = HASH_RXQ_IPV6;
+			start = layer;
+		} else {
+			start = layer;
+		}
+		/* Scan first valid hash type. */
+		for (i = start; rss && i <= layer; ++i) {
+			if (!parser->queue[i].ibv_attr)
 				continue;
-			rte_free(parser->queue[i].ibv_attr);
-			parser->queue[i].ibv_attr = NULL;
+			if (hash_rxq_init[i].dpdk_rss_hf & rss)
+				break;
 		}
-		return 0;
-	}
-	if (parser->layer == HASH_RXQ_ETH)
-		return 0;
-	/* This layer becomes useless as the pattern define under layers. */
-	rte_free(parser->queue[HASH_RXQ_ETH].ibv_attr);
-	parser->queue[HASH_RXQ_ETH].ibv_attr = NULL;
-	/* Remove opposite kind of layer e.g. IPv6 if the pattern is IPv4. */
-	for (i = ohmin; i != (ohmax + 1); ++i) {
-		if (!parser->queue[i].ibv_attr)
-			continue;
-		rte_free(parser->queue[i].ibv_attr);
-		parser->queue[i].ibv_attr = NULL;
-	}
-	/* Remove impossible flow according to the RSS configuration. */
-	if (hash_rxq_init[parser->layer].dpdk_rss_hf &
-	    parser->rss_conf.types) {
-		/* Remove any other flow. */
-		for (i = hmin; i != (hmax + 1); ++i) {
-			if (i == parser->layer || !parser->queue[i].ibv_attr)
+		if (rss && i <= layer)
+			parser->queue[layer].hash_fields =
+					hash_rxq_init[i].hash_fields;
+		/* Trim unused hash types. */
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (parser->queue[i].ibv_attr && i != layer) {
+				rte_free(parser->queue[i].ibv_attr);
+				parser->queue[i].ibv_attr = NULL;
+			}
+		}
+	} else {
+		/* Expand for inner or normal RSS. */
+		if (rss && (layer == HASH_RXQ_ETH || layer == HASH_RXQ_IPV4))
+			start = HASH_RXQ_TCPV4;
+		else if (rss && layer == HASH_RXQ_IPV6)
+			start = HASH_RXQ_TCPV6;
+		else
+			start = layer;
+		/* For L4 pattern, try L3 RSS if no L4 RSS. */
+		/* Trim unused hash types. */
+		for (i = 0; i != hash_rxq_init_n; ++i) {
+			if (!parser->queue[i].ibv_attr)
 				continue;
-			rte_free(parser->queue[i].ibv_attr);
-			parser->queue[i].ibv_attr = NULL;
+			if (i < start || i > layer) {
+				rte_free(parser->queue[i].ibv_attr);
+				parser->queue[i].ibv_attr = NULL;
+				continue;
+			}
+			if (!rss)
+				continue;
+			if (hash_rxq_init[i].dpdk_rss_hf & rss) {
+				parser->queue[i].hash_fields =
+						hash_rxq_init[i].hash_fields;
+			} else if (i != layer) {
+				/* Remove unused RSS expansion. */
+				rte_free(parser->queue[i].ibv_attr);
+				parser->queue[i].ibv_attr = NULL;
+			} else if (layer < HASH_RXQ_IPV4 &&
+				   (hash_rxq_init[HASH_RXQ_IPV4].dpdk_rss_hf &
+				    rss)) {
+				/* Allow IPv4 RSS on L4 pattern. */
+				parser->queue[i].hash_fields =
+					hash_rxq_init[HASH_RXQ_IPV4]
+						.hash_fields;
+			} else if (i > HASH_RXQ_IPV4 && i < HASH_RXQ_IPV6 &&
+				   (hash_rxq_init[HASH_RXQ_IPV6].dpdk_rss_hf &
+				    rss)) {
+				/* Allow IPv6 RSS on L4 pattern. */
+				parser->queue[i].hash_fields =
+					hash_rxq_init[HASH_RXQ_IPV6]
+						.hash_fields;
+			}
 		}
-	} else if (!parser->queue[ip].ibv_attr) {
-		/* no RSS possible with the current configuration. */
-		parser->rss_conf.queue_num = 1;
 	}
 	return 0;
 }
@@ -1166,7 +1238,7 @@  mlx5_flow_convert(struct rte_eth_dev *dev,
 	ret = mlx5_flow_convert_actions(dev, actions, error, parser);
 	if (ret)
 		return ret;
-	ret = mlx5_flow_convert_items_validate(items, error, parser);
+	ret = mlx5_flow_convert_items_validate(dev, items, error, parser);
 	if (ret)
 		return ret;
 	mlx5_flow_convert_finalise(parser);
@@ -1187,10 +1259,6 @@  mlx5_flow_convert(struct rte_eth_dev *dev,
 		for (i = 0; i != hash_rxq_init_n; ++i) {
 			unsigned int offset;
 
-			if (!(parser->rss_conf.types &
-			      hash_rxq_init[i].dpdk_rss_hf) &&
-			    (i != HASH_RXQ_ETH))
-				continue;
 			offset = parser->queue[i].offset;
 			parser->queue[i].ibv_attr =
 				mlx5_flow_convert_allocate(offset, error);
@@ -1202,6 +1270,7 @@  mlx5_flow_convert(struct rte_eth_dev *dev,
 	/* Third step. Conversion parse, fill the specifications. */
 	parser->inner = 0;
 	parser->tunnel = 0;
+	parser->layer = HASH_RXQ_ETH;
 	for (; items->type != RTE_FLOW_ITEM_TYPE_END; ++items) {
 		struct mlx5_flow_data data = {
 			.dev = dev,
@@ -1282,17 +1351,11 @@  mlx5_flow_create_copy(struct mlx5_flow_parse *parser, void *src,
 	for (i = 0; i != hash_rxq_init_n; ++i) {
 		if (!parser->queue[i].ibv_attr)
 			continue;
-		/* Specification must be the same l3 type or none. */
-		if (parser->layer == HASH_RXQ_ETH ||
-		    (hash_rxq_init[parser->layer].ip_version ==
-		     hash_rxq_init[i].ip_version) ||
-		    (hash_rxq_init[i].ip_version == 0)) {
-			dst = (void *)((uintptr_t)parser->queue[i].ibv_attr +
-					parser->queue[i].offset);
-			memcpy(dst, src, size);
-			++parser->queue[i].ibv_attr->num_of_specs;
-			parser->queue[i].offset += size;
-		}
+		dst = (void *)((uintptr_t)parser->queue[i].ibv_attr +
+				parser->queue[i].offset);
+		memcpy(dst, src, size);
+		++parser->queue[i].ibv_attr->num_of_specs;
+		parser->queue[i].offset += size;
 	}
 }
 
@@ -1323,9 +1386,7 @@  mlx5_flow_create_eth(const struct rte_flow_item *item,
 		.size = eth_size,
 	};
 
-	/* Don't update layer for the inner pattern. */
-	if (!parser->inner)
-		parser->layer = HASH_RXQ_ETH;
+	parser->layer = HASH_RXQ_ETH;
 	if (spec) {
 		unsigned int i;
 
@@ -1446,9 +1507,7 @@  mlx5_flow_create_ipv4(const struct rte_flow_item *item,
 					  "L3 VXLAN not enabled by device"
 					  " parameter and/or not configured"
 					  " in firmware");
-	/* Don't update layer for the inner pattern. */
-	if (!parser->inner)
-		parser->layer = HASH_RXQ_IPV4;
+	parser->layer = HASH_RXQ_IPV4;
 	if (spec) {
 		if (!mask)
 			mask = default_mask;
@@ -1511,9 +1570,7 @@  mlx5_flow_create_ipv6(const struct rte_flow_item *item,
 					  "L3 VXLAN not enabled by device"
 					  " parameter and/or not configured"
 					  " in firmware");
-	/* Don't update layer for the inner pattern. */
-	if (!parser->inner)
-		parser->layer = HASH_RXQ_IPV6;
+	parser->layer = HASH_RXQ_IPV6;
 	if (spec) {
 		unsigned int i;
 		uint32_t vtc_flow_val;
@@ -1586,13 +1643,10 @@  mlx5_flow_create_udp(const struct rte_flow_item *item,
 		.size = udp_size,
 	};
 
-	/* Don't update layer for the inner pattern. */
-	if (!parser->inner) {
-		if (parser->layer == HASH_RXQ_IPV4)
-			parser->layer = HASH_RXQ_UDPV4;
-		else
-			parser->layer = HASH_RXQ_UDPV6;
-	}
+	if (parser->layer == HASH_RXQ_IPV4)
+		parser->layer = HASH_RXQ_UDPV4;
+	else
+		parser->layer = HASH_RXQ_UDPV6;
 	if (spec) {
 		if (!mask)
 			mask = default_mask;
@@ -1635,13 +1689,10 @@  mlx5_flow_create_tcp(const struct rte_flow_item *item,
 		.size = tcp_size,
 	};
 
-	/* Don't update layer for the inner pattern. */
-	if (!parser->inner) {
-		if (parser->layer == HASH_RXQ_IPV4)
-			parser->layer = HASH_RXQ_TCPV4;
-		else
-			parser->layer = HASH_RXQ_TCPV6;
-	}
+	if (parser->layer == HASH_RXQ_IPV4)
+		parser->layer = HASH_RXQ_TCPV4;
+	else
+		parser->layer = HASH_RXQ_TCPV6;
 	if (spec) {
 		if (!mask)
 			mask = default_mask;
@@ -1691,6 +1742,11 @@  mlx5_flow_create_vxlan(const struct rte_flow_item *item,
 	id.vni[0] = 0;
 	parser->inner = IBV_FLOW_SPEC_INNER;
 	parser->tunnel = ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_VXLAN)];
+	parser->out_layer = parser->layer;
+	parser->layer = HASH_RXQ_TUNNEL;
+	/* Default VXLAN to outer RSS. */
+	if (!parser->rss_conf.level)
+		parser->rss_conf.level = 1;
 	if (spec) {
 		if (!mask)
 			mask = default_mask;
@@ -1748,6 +1804,11 @@  mlx5_flow_create_gre(const struct rte_flow_item *item __rte_unused,
 
 	parser->inner = IBV_FLOW_SPEC_INNER;
 	parser->tunnel = ptype_ext[PTYPE_IDX(RTE_PTYPE_TUNNEL_GRE)];
+	parser->out_layer = parser->layer;
+	parser->layer = HASH_RXQ_TUNNEL;
+	/* Default GRE to inner RSS. */
+	if (!parser->rss_conf.level)
+		parser->rss_conf.level = 2;
 	/* Update encapsulation IP layer protocol. */
 	for (i = 0; i != hash_rxq_init_n; ++i) {
 		if (!parser->queue[i].ibv_attr)
@@ -1939,33 +2000,33 @@  mlx5_flow_create_action_queue_rss(struct rte_eth_dev *dev,
 	unsigned int i;
 
 	for (i = 0; i != hash_rxq_init_n; ++i) {
-		uint64_t hash_fields;
-
 		if (!parser->queue[i].ibv_attr)
 			continue;
 		flow->frxq[i].ibv_attr = parser->queue[i].ibv_attr;
 		parser->queue[i].ibv_attr = NULL;
-		hash_fields = hash_rxq_init[i].hash_fields;
+		flow->frxq[i].hash_fields = parser->queue[i].hash_fields;
 		if (!priv->dev->data->dev_started)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_get(dev,
 				      parser->rss_conf.key,
 				      parser->rss_conf.key_len,
-				      hash_fields,
+				      flow->frxq[i].hash_fields,
 				      parser->rss_conf.queue,
 				      parser->rss_conf.queue_num,
-				      parser->tunnel);
+				      parser->tunnel,
+				      parser->rss_conf.level);
 		if (flow->frxq[i].hrxq)
 			continue;
 		flow->frxq[i].hrxq =
 			mlx5_hrxq_new(dev,
 				      parser->rss_conf.key,
 				      parser->rss_conf.key_len,
-				      hash_fields,
+				      flow->frxq[i].hash_fields,
 				      parser->rss_conf.queue,
 				      parser->rss_conf.queue_num,
-				      parser->tunnel);
+				      parser->tunnel,
+				      parser->rss_conf.level);
 		if (!flow->frxq[i].hrxq) {
 			return rte_flow_error_set(error, ENOMEM,
 						  RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -2070,7 +2131,7 @@  mlx5_flow_create_action_queue(struct rte_eth_dev *dev,
 		DRV_LOG(DEBUG, "port %u %p type %d QP %p ibv_flow %p",
 			dev->data->port_id,
 			(void *)flow, i,
-			(void *)flow->frxq[i].hrxq,
+			(void *)flow->frxq[i].hrxq->qp,
 			(void *)flow->frxq[i].ibv_flow);
 	}
 	if (!flows_n) {
@@ -2598,19 +2659,21 @@  mlx5_flow_start(struct rte_eth_dev *dev, struct mlx5_flows *list)
 			flow->frxq[i].hrxq =
 				mlx5_hrxq_get(dev, flow->rss_conf.key,
 					      flow->rss_conf.key_len,
-					      hash_rxq_init[i].hash_fields,
+					      flow->frxq[i].hash_fields,
 					      flow->rss_conf.queue,
 					      flow->rss_conf.queue_num,
-					      flow->tunnel);
+					      flow->tunnel,
+					      flow->rss_conf.level);
 			if (flow->frxq[i].hrxq)
 				goto flow_create;
 			flow->frxq[i].hrxq =
 				mlx5_hrxq_new(dev, flow->rss_conf.key,
 					      flow->rss_conf.key_len,
-					      hash_rxq_init[i].hash_fields,
+					      flow->frxq[i].hash_fields,
 					      flow->rss_conf.queue,
 					      flow->rss_conf.queue_num,
-					      flow->tunnel);
+					      flow->tunnel,
+					      flow->rss_conf.level);
 			if (!flow->frxq[i].hrxq) {
 				DRV_LOG(DEBUG,
 					"port %u flow %p cannot be applied",
diff --git a/drivers/net/mlx5/mlx5_glue.c b/drivers/net/mlx5/mlx5_glue.c
index a771ac4c7..cd2716352 100644
--- a/drivers/net/mlx5/mlx5_glue.c
+++ b/drivers/net/mlx5/mlx5_glue.c
@@ -313,6 +313,21 @@  mlx5_glue_dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
 	return mlx5dv_init_obj(obj, obj_type);
 }
 
+static struct ibv_qp *
+mlx5_glue_dv_create_qp(struct ibv_context *context,
+		       struct ibv_qp_init_attr_ex *qp_init_attr_ex,
+		       struct mlx5dv_qp_init_attr *dv_qp_init_attr)
+{
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	return mlx5dv_create_qp(context, qp_init_attr_ex, dv_qp_init_attr);
+#else
+	(void)context;
+	(void)qp_init_attr_ex;
+	(void)dv_qp_init_attr;
+	return NULL;
+#endif
+}
+
 const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){
 	.version = MLX5_GLUE_VERSION,
 	.fork_init = mlx5_glue_fork_init,
@@ -356,4 +371,5 @@  const struct mlx5_glue *mlx5_glue = &(const struct mlx5_glue){
 	.dv_query_device = mlx5_glue_dv_query_device,
 	.dv_set_context_attr = mlx5_glue_dv_set_context_attr,
 	.dv_init_obj = mlx5_glue_dv_init_obj,
+	.dv_create_qp = mlx5_glue_dv_create_qp,
 };
diff --git a/drivers/net/mlx5/mlx5_glue.h b/drivers/net/mlx5/mlx5_glue.h
index 33385d226..9f36af81a 100644
--- a/drivers/net/mlx5/mlx5_glue.h
+++ b/drivers/net/mlx5/mlx5_glue.h
@@ -31,6 +31,10 @@  struct ibv_counter_set_init_attr;
 struct ibv_query_counter_set_attr;
 #endif
 
+#ifndef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+struct mlx5dv_qp_init_attr;
+#endif
+
 /* LIB_GLUE_VERSION must be updated every time this structure is modified. */
 struct mlx5_glue {
 	const char *version;
@@ -106,6 +110,10 @@  struct mlx5_glue {
 				   enum mlx5dv_set_ctx_attr_type type,
 				   void *attr);
 	int (*dv_init_obj)(struct mlx5dv_obj *obj, uint64_t obj_type);
+	struct ibv_qp *(*dv_create_qp)
+		(struct ibv_context *context,
+		 struct ibv_qp_init_attr_ex *qp_init_attr_ex,
+		 struct mlx5dv_qp_init_attr *dv_qp_init_attr);
 };
 
 const struct mlx5_glue *mlx5_glue;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 6756f25fa..58403b5b6 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1385,7 +1385,9 @@  mlx5_ind_table_ibv_verify(struct rte_eth_dev *dev)
  * @param queues_n
  *   Number of queues.
  * @param tunnel
- *   Tunnel type.
+ *   Tunnel type, implies tunnel offloading like inner checksum if available.
+ * @param rss_level
+ *   RSS hash on tunnel level.
  *
  * @return
  *   The Verbs object initialised, NULL otherwise and rte_errno is set.
@@ -1394,13 +1396,17 @@  struct mlx5_hrxq *
 mlx5_hrxq_new(struct rte_eth_dev *dev,
 	      const uint8_t *rss_key, uint32_t rss_key_len,
 	      uint64_t hash_fields,
-	      const uint16_t *queues, uint32_t queues_n, uint32_t tunnel)
+	      const uint16_t *queues, uint32_t queues_n,
+	      uint32_t tunnel, uint32_t rss_level)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
 	struct mlx5_ind_table_ibv *ind_tbl;
 	struct ibv_qp *qp;
 	int err;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	struct mlx5dv_qp_init_attr qp_init_attr = {0};
+#endif
 
 	queues_n = hash_fields ? queues_n : 1;
 	ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n);
@@ -1410,6 +1416,36 @@  mlx5_hrxq_new(struct rte_eth_dev *dev,
 		rte_errno = ENOMEM;
 		return NULL;
 	}
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	if (tunnel) {
+		qp_init_attr.comp_mask =
+				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+		qp_init_attr.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS;
+	}
+	qp = mlx5_glue->dv_create_qp(
+		priv->ctx,
+		&(struct ibv_qp_init_attr_ex){
+			.qp_type = IBV_QPT_RAW_PACKET,
+			.comp_mask =
+				IBV_QP_INIT_ATTR_PD |
+				IBV_QP_INIT_ATTR_IND_TABLE |
+				IBV_QP_INIT_ATTR_RX_HASH,
+			.rx_hash_conf = (struct ibv_rx_hash_conf){
+				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+				.rx_hash_key_len = rss_key_len ? rss_key_len :
+						   rss_hash_default_key_len,
+				.rx_hash_key = rss_key ?
+					       (void *)(uintptr_t)rss_key :
+					       rss_hash_default_key,
+				.rx_hash_fields_mask = hash_fields |
+					(tunnel && rss_level > 1 ?
+					(uint32_t)IBV_RX_HASH_INNER : 0),
+			},
+			.rwq_ind_tbl = ind_tbl->ind_table,
+			.pd = priv->pd,
+		},
+		&qp_init_attr);
+#else
 	qp = mlx5_glue->create_qp_ex
 		(priv->ctx,
 		 &(struct ibv_qp_init_attr_ex){
@@ -1420,13 +1456,17 @@  mlx5_hrxq_new(struct rte_eth_dev *dev,
 				IBV_QP_INIT_ATTR_RX_HASH,
 			.rx_hash_conf = (struct ibv_rx_hash_conf){
 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
-				.rx_hash_key_len = rss_key_len,
-				.rx_hash_key = (void *)(uintptr_t)rss_key,
+				.rx_hash_key_len = rss_key_len ? rss_key_len :
+						   rss_hash_default_key_len,
+				.rx_hash_key = rss_key ?
+					       (void *)(uintptr_t)rss_key :
+					       rss_hash_default_key,
 				.rx_hash_fields_mask = hash_fields,
 			},
 			.rwq_ind_tbl = ind_tbl->ind_table,
 			.pd = priv->pd,
 		 });
+#endif
 	if (!qp) {
 		rte_errno = errno;
 		goto error;
@@ -1439,6 +1479,7 @@  mlx5_hrxq_new(struct rte_eth_dev *dev,
 	hrxq->rss_key_len = rss_key_len;
 	hrxq->hash_fields = hash_fields;
 	hrxq->tunnel = tunnel;
+	hrxq->rss_level = rss_level;
 	memcpy(hrxq->rss_key, rss_key, rss_key_len);
 	rte_atomic32_inc(&hrxq->refcnt);
 	LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
@@ -1468,7 +1509,9 @@  mlx5_hrxq_new(struct rte_eth_dev *dev,
  * @param queues_n
  *   Number of queues.
  * @param tunnel
- *   Tunnel type.
+ *   Tunnel type, implies tunnel offloading like inner checksum if available.
+ * @param rss_level
+ *   RSS hash on tunnel level.
  *
  * @return
  *   An hash Rx queue on success.
@@ -1477,7 +1520,8 @@  struct mlx5_hrxq *
 mlx5_hrxq_get(struct rte_eth_dev *dev,
 	      const uint8_t *rss_key, uint32_t rss_key_len,
 	      uint64_t hash_fields,
-	      const uint16_t *queues, uint32_t queues_n, uint32_t tunnel)
+	      const uint16_t *queues, uint32_t queues_n,
+	      uint32_t tunnel, uint32_t rss_level)
 {
 	struct priv *priv = dev->data->dev_private;
 	struct mlx5_hrxq *hrxq;
@@ -1494,6 +1538,8 @@  mlx5_hrxq_get(struct rte_eth_dev *dev,
 			continue;
 		if (hrxq->tunnel != tunnel)
 			continue;
+		if (hrxq->rss_level != rss_level)
+			continue;
 		ind_tbl = mlx5_ind_table_ibv_get(dev, queues, queues_n);
 		if (!ind_tbl)
 			continue;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 188fd65c5..07b3adfae 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -147,6 +147,7 @@  struct mlx5_hrxq {
 	struct ibv_qp *qp; /* Verbs queue pair. */
 	uint64_t hash_fields; /* Verbs Hash fields. */
 	uint32_t tunnel; /* Tunnel type. */
+	uint32_t rss_level; /* RSS on tunnel level. */
 	uint32_t rss_key_len; /* Hash key length in bytes. */
 	uint8_t rss_key[]; /* Hash key. */
 };
@@ -251,12 +252,12 @@  struct mlx5_hrxq *mlx5_hrxq_new(struct rte_eth_dev *dev,
 				const uint8_t *rss_key, uint32_t rss_key_len,
 				uint64_t hash_fields,
 				const uint16_t *queues, uint32_t queues_n,
-				uint32_t tunnel);
+				uint32_t tunnel, uint32_t rss_level);
 struct mlx5_hrxq *mlx5_hrxq_get(struct rte_eth_dev *dev,
 				const uint8_t *rss_key, uint32_t rss_key_len,
 				uint64_t hash_fields,
 				const uint16_t *queues, uint32_t queues_n,
-				uint32_t tunnel);
+				uint32_t tunnel, uint32_t rss_level);
 int mlx5_hrxq_release(struct rte_eth_dev *dev, struct mlx5_hrxq *hxrq);
 int mlx5_hrxq_ibv_verify(struct rte_eth_dev *dev);
 uint64_t mlx5_get_rx_port_offloads(void);