diff mbox series

[4/5] net/mlx5: e-switch VXLAN flow translation routine

Message ID 1538461807-37507-4-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers show
Series None | expand

Checks

Context Check Description
ci/Intel-compilation success Compilation OK

Commit Message

Slava Ovsiienko Oct. 2, 2018, 6:30 a.m. UTC
This part of patchset adds support of VXLAN-related items and
actions to the flow translation routine. If some of them are
specified in the rule, the extra space for tunnel description
structure is allocated. Later some tunnel types, other than VXLAN
can be addedd (GRE). No VTEP devices are created at this point,
the flow rule is just translated, not applied yet.

Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_flow_tcf.c | 671 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 591 insertions(+), 80 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
index 97451bd..dfffc50 100644
--- a/drivers/net/mlx5/mlx5_flow_tcf.c
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -1597,7 +1597,7 @@  struct flow_tcf_ptoi {
 
 	size += SZ_NLATTR_STRZ_OF("flower") +
 		SZ_NLATTR_NEST + /* TCA_OPTIONS. */
-		SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
+		SZ_NLATTR_TYPE_OF_UINT32; /* TCA_CLS_FLAGS_SKIP_SW. */
 	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
 		switch (items->type) {
 		case RTE_FLOW_ITEM_TYPE_VOID:
@@ -1605,45 +1605,49 @@  struct flow_tcf_ptoi {
 		case RTE_FLOW_ITEM_TYPE_PORT_ID:
 			break;
 		case RTE_FLOW_ITEM_TYPE_ETH:
-			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
+			size += SZ_NLATTR_TYPE_OF_UINT16 + /* Ether type. */
 				SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
 				/* dst/src MAC addr and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L2;
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
-			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
-				SZ_NLATTR_TYPE_OF(uint16_t) +
+			size += SZ_NLATTR_TYPE_OF_UINT16 + /* Ether type. */
+				SZ_NLATTR_TYPE_OF_UINT16 +
 				/* VLAN Ether type. */
-				SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
-				SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
+				SZ_NLATTR_TYPE_OF_UINT8 + /* VLAN prio. */
+				SZ_NLATTR_TYPE_OF_UINT16; /* VLAN ID. */
 			flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
-			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
-				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
-				SZ_NLATTR_TYPE_OF(uint32_t) * 4;
+			size += SZ_NLATTR_TYPE_OF_UINT16 + /* Ether type. */
+				SZ_NLATTR_TYPE_OF_UINT8 + /* IP proto. */
+				SZ_NLATTR_TYPE_OF_UINT32 * 4;
 				/* dst/src IP addr and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV6:
-			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
-				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
+			size += SZ_NLATTR_TYPE_OF_UINT16 + /* Ether type. */
+				SZ_NLATTR_TYPE_OF_UINT8 + /* IP proto. */
 				SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
 				/* dst/src IP addr and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
 			break;
 		case RTE_FLOW_ITEM_TYPE_UDP:
-			size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
-				SZ_NLATTR_TYPE_OF(uint16_t) * 4;
+			size += SZ_NLATTR_TYPE_OF_UINT8 + /* IP proto. */
+				SZ_NLATTR_TYPE_OF_UINT16 * 4;
 				/* dst/src port and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
 			break;
 		case RTE_FLOW_ITEM_TYPE_TCP:
-			size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
-				SZ_NLATTR_TYPE_OF(uint16_t) * 4;
+			size += SZ_NLATTR_TYPE_OF_UINT8 + /* IP proto. */
+				SZ_NLATTR_TYPE_OF_UINT16 * 4;
 				/* dst/src port and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
 			break;
+		case RTE_FLOW_ITEM_TYPE_VXLAN:
+			size += SZ_NLATTR_TYPE_OF_UINT32;
+			flags |= MLX5_FLOW_LAYER_VXLAN;
+			break;
 		default:
 			DRV_LOG(WARNING,
 				"unsupported item %p type %d,"
@@ -1657,6 +1661,265 @@  struct flow_tcf_ptoi {
 }
 
 /**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
+ * in the encapsulation parameters structure. The item must be prevalidated,
+ * no any validation checks performed by function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
+ * @param[in] mask
+ *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
+ * @param[out] encap
+ *   Structure to fill the gathered MAC address data.
+ *
+ * @return
+ *   The size needed the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
+			       const struct rte_flow_item_eth *mask,
+			       struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	if (!mask || !memcmp(&mask->dst,
+			     &rte_flow_item_eth_mask.dst,
+			     sizeof(rte_flow_item_eth_mask.dst))) {
+		/*
+		 * Ethernet addresses are not supported by
+		 * tc as tunnel_key parameters. Destination
+		 * address is needed to form encap packet
+		 * header and retrieved by kernel from
+		 * implicit sources (ARP table, etc),
+		 * address masks are not supported at all.
+		 */
+		encap->eth.dst = spec->dst;
+		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_DST;
+	}
+	if (!mask || !memcmp(&mask->src,
+			     &rte_flow_item_eth_mask.src,
+			     sizeof(rte_flow_item_eth_mask.src))) {
+		/*
+		 * Ethernet addresses are not supported by
+		 * tc as tunnel_key parameters. Source ethernet
+		 * address is ignored anyway.
+		 */
+		encap->eth.src = spec->src;
+		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_SRC;
+	}
+	/*
+	 * No space allocated for ethernet addresses within Netlink
+	 * message tunnel_key record - these ones are not
+	 * supported by tc.
+	 */
+	return 0;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
+ * in the encapsulation parameters structure. The item must be prevalidated,
+ * no any validation checks performed by function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
+ * @param[out] encap
+ *   Structure to fill the gathered IPV4 address data.
+ *
+ * @return
+ *   The size needed the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
+				struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	encap->ipv4.dst = spec->hdr.dst_addr;
+	encap->ipv4.src = spec->hdr.src_addr;
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC |
+		       MLX5_FLOW_TCF_ENCAP_IPV4_DST;
+	return SZ_NLATTR_TYPE_OF_UINT32 * 2;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
+ * in the encapsulation parameters structure. The item must be prevalidated,
+ * no any validation checks performed by function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
+ * @param[out] encap
+ *   Structure to fill the gathered IPV6 address data.
+ *
+ * @return
+ *   The size needed the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
+				struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	memcpy(encap->ipv6.dst, spec->hdr.dst_addr, sizeof(encap->ipv6.dst));
+	memcpy(encap->ipv6.src, spec->hdr.src_addr, sizeof(encap->ipv6.src));
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV6_SRC |
+		       MLX5_FLOW_TCF_ENCAP_IPV6_DST;
+	return SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 2;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
+ * in the encapsulation parameters structure. The item must be prevalidated,
+ * no any validation checks performed by function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
+ * @param[in] mask
+ *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
+ * @param[out] encap
+ *   Structure to fill the gathered UDP port data.
+ *
+ * @return
+ *   The size needed the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
+			       const struct rte_flow_item_udp *mask,
+			       struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	int size = SZ_NLATTR_TYPE_OF_UINT16;
+
+	assert(spec);
+	encap->udp.dst = spec->hdr.dst_port;
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_UDP_DST;
+	if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
+		encap->udp.src = spec->hdr.src_port;
+		size += SZ_NLATTR_TYPE_OF_UINT16;
+		encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC;
+	}
+	return size;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
+ * in the encapsulation parameters structure. The item must be prevalidated,
+ * no any validation checks performed by function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
+ * @param[out] encap
+ *   Structure to fill the gathered VNI address data.
+ *
+ * @return
+ *   The size needed the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
+			       struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. Do not redundant checks. */
+	assert(spec);
+	memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_VXLAN_VNI;
+	return SZ_NLATTR_TYPE_OF_UINT32;
+}
+
+/**
+ * Populate consolidated encapsulation object from list of pattern items.
+ *
+ * Helper function to process configuration of action such as
+ * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
+ * validated, there is no way to return an meaningful error.
+ *
+ * @param[in] action
+ *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
+ *   List of pattern items to gather data from.
+ * @param[out] src
+ *   Structure to fill gathered data.
+ *
+ * @return
+ *   The size the part of Netlink message buffer to store the item
+ *   attributes on success, zero otherwise. The mask field in
+ *   result structure reflects correctly parsed items.
+ */
+static int
+flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
+			   struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	union {
+		const struct rte_flow_item_eth *eth;
+		const struct rte_flow_item_ipv4 *ipv4;
+		const struct rte_flow_item_ipv6 *ipv6;
+		const struct rte_flow_item_udp *udp;
+		const struct rte_flow_item_vxlan *vxlan;
+	} spec, mask;
+	const struct rte_flow_item *items;
+	int size = 0;
+
+	assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
+	assert(action->conf);
+
+	items = ((const struct rte_flow_action_vxlan_encap *)
+					action->conf)->definition;
+	assert(items);
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			mask.eth = items->mask;
+			spec.eth = items->spec;
+			size += flow_tcf_parse_vxlan_encap_eth(spec.eth,
+							       mask.eth,
+							       encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			spec.ipv4 = items->spec;
+			size += flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4,
+								encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6:
+			spec.ipv6 = items->spec;
+			size += flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6,
+								encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_UDP:
+			mask.udp = items->mask;
+			spec.udp = items->spec;
+			size += flow_tcf_parse_vxlan_encap_udp(spec.udp,
+							       mask.udp,
+							       encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_VXLAN:
+			spec.vxlan = items->spec;
+			size += flow_tcf_parse_vxlan_encap_vni(spec.vxlan,
+							       encap);
+			break;
+		default:
+			assert(false);
+			DRV_LOG(WARNING,
+				"unsupported item %p type %d,"
+				" items must be validated"
+				" before flow creation",
+				(const void *)items, items->type);
+			encap->mask = 0;
+			return 0;
+		}
+	}
+	return size;
+}
+
+/**
  * Calculate maximum size of memory for flow actions of Linux TC flower and
  * extract specified actions.
  *
@@ -1664,13 +1927,16 @@  struct flow_tcf_ptoi {
  *   Pointer to the list of actions.
  * @param[out] action_flags
  *   Pointer to the detected actions.
+ * @param[out] tunnel
+ *   Pointer to tunnel encapsulation parameters structure to fill.
  *
  * @return
  *   Maximum size of memory for actions.
  */
 static int
 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
-			      uint64_t *action_flags)
+			      uint64_t *action_flags,
+			      void *tunnel)
 {
 	int size = 0;
 	uint64_t flags = 0;
@@ -1684,14 +1950,14 @@  struct flow_tcf_ptoi {
 			size += SZ_NLATTR_NEST + /* na_act_index. */
 				SZ_NLATTR_STRZ_OF("mirred") +
 				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
-				SZ_NLATTR_TYPE_OF(struct tc_mirred);
+				SZ_NLATTR_TYPE_OF_STRUCT(tc_mirred);
 			flags |= MLX5_ACTION_PORT_ID;
 			break;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			size += SZ_NLATTR_NEST + /* na_act_index. */
 				SZ_NLATTR_STRZ_OF("gact") +
 				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
-				SZ_NLATTR_TYPE_OF(struct tc_gact);
+				SZ_NLATTR_TYPE_OF_STRUCT(tc_gact);
 			flags |= MLX5_ACTION_DROP;
 			break;
 		case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
@@ -1710,11 +1976,34 @@  struct flow_tcf_ptoi {
 			size += SZ_NLATTR_NEST + /* na_act_index. */
 				SZ_NLATTR_STRZ_OF("vlan") +
 				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
-				SZ_NLATTR_TYPE_OF(struct tc_vlan) +
-				SZ_NLATTR_TYPE_OF(uint16_t) +
+				SZ_NLATTR_TYPE_OF_STRUCT(tc_vlan) +
+				SZ_NLATTR_TYPE_OF_UINT16 +
 				/* VLAN protocol. */
-				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
-				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
+				SZ_NLATTR_TYPE_OF_UINT16 + /* VLAN ID. */
+				SZ_NLATTR_TYPE_OF_UINT8; /* VLAN prio. */
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("tunnel_key") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF_UINT8 + /* no UDP sum */
+				SZ_NLATTR_TYPE_OF_STRUCT(tc_tunnel_key) +
+				flow_tcf_vxlan_encap_parse(actions, tunnel) +
+				RTE_ALIGN_CEIL /* preceding encap params. */
+				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
+				MNL_ALIGNTO);
+			flags |= MLX5_ACTION_VXLAN_ENCAP;
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("tunnel_key") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF_UINT8 + /* no UDP sum */
+				SZ_NLATTR_TYPE_OF_STRUCT(tc_tunnel_key) +
+				RTE_ALIGN_CEIL /* preceding decap params. */
+				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
+				MNL_ALIGNTO);
+			flags |= MLX5_ACTION_VXLAN_DECAP;
 			break;
 		default:
 			DRV_LOG(WARNING,
@@ -1750,6 +2039,26 @@  struct flow_tcf_ptoi {
 }
 
 /**
+ * Convert VXLAN VNI to 32-bit integer.
+ *
+ * @param[in] vni
+ *   VXLAN VNI in 24-bit wire format.
+ *
+ * @return
+ *   VXLAN VNI as a 32-bit integer value in network endian.
+ */
+static rte_be32_t
+vxlan_vni_as_be32(const uint8_t vni[3])
+{
+	rte_be32_t ret;
+
+	ret = vni[0];
+	ret = (ret << 8) | vni[1];
+	ret = (ret << 8) | vni[2];
+	return RTE_BE32(ret);
+}
+
+/**
  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
  * memory required, allocates the memory, initializes Netlink message headers
  * and set unique TC message handle.
@@ -1784,22 +2093,54 @@  struct flow_tcf_ptoi {
 	struct mlx5_flow *dev_flow;
 	struct nlmsghdr *nlh;
 	struct tcmsg *tcm;
+	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
+	uint8_t *sp, *tun = NULL;
 
 	size += flow_tcf_get_items_and_size(items, item_flags);
-	size += flow_tcf_get_actions_and_size(actions, action_flags);
-	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
+	size += flow_tcf_get_actions_and_size(actions, action_flags, &encap);
+	dev_flow = rte_zmalloc(__func__, size,
+			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
+				(size_t)MNL_ALIGNTO));
 	if (!dev_flow) {
 		rte_flow_error_set(error, ENOMEM,
 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 				   "not enough memory to create E-Switch flow");
 		return NULL;
 	}
-	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
+	sp = (uint8_t *)(dev_flow + 1);
+	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
+		tun = sp;
+		sp += RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
+			MNL_ALIGNTO);
+		size -= RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
+			MNL_ALIGNTO);
+		encap.hdr.type = MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
+		memcpy(tun, &encap,
+		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
+	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
+		tun = sp;
+		sp += RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
+			MNL_ALIGNTO);
+		size -= RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
+			MNL_ALIGNTO);
+		encap.hdr.type = MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
+		memcpy(tun, &encap,
+		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
+	}
+	nlh = mnl_nlmsg_put_header(sp);
 	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
 	*dev_flow = (struct mlx5_flow){
 		.tcf = (struct mlx5_flow_tcf){
+			.nlsize = size,
 			.nlh = nlh,
 			.tcm = tcm,
+			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
+			.item_flags = *item_flags,
+			.action_flags = *action_flags,
 		},
 	};
 	/*
@@ -1853,6 +2194,7 @@  struct flow_tcf_ptoi {
 		const struct rte_flow_item_ipv6 *ipv6;
 		const struct rte_flow_item_tcp *tcp;
 		const struct rte_flow_item_udp *udp;
+		const struct rte_flow_item_vxlan *vxlan;
 	} spec, mask;
 	union {
 		const struct rte_flow_action_port_id *port_id;
@@ -1862,6 +2204,14 @@  struct flow_tcf_ptoi {
 		const struct rte_flow_action_of_set_vlan_pcp *
 			of_set_vlan_pcp;
 	} conf;
+	union {
+		struct mlx5_flow_tcf_tunnel_hdr *hdr;
+		struct mlx5_flow_tcf_vxlan_decap *vxlan;
+	} decap;
+	union {
+		struct mlx5_flow_tcf_tunnel_hdr *hdr;
+		struct mlx5_flow_tcf_vxlan_encap *vxlan;
+	} encap;
 	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
 	struct nlmsghdr *nlh = dev_flow->tcf.nlh;
 	struct tcmsg *tcm = dev_flow->tcf.tcm;
@@ -1877,6 +2227,12 @@  struct flow_tcf_ptoi {
 
 	claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
 						PTOI_TABLE_SZ_MAX(dev)));
+	encap.hdr = NULL;
+	decap.hdr = NULL;
+	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_ENCAP)
+		encap.vxlan = dev_flow->tcf.vxlan_encap;
+	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_DECAP)
+		decap.vxlan = dev_flow->tcf.vxlan_decap;
 	nlh = dev_flow->tcf.nlh;
 	tcm = dev_flow->tcf.tcm;
 	/* Prepare API must have been called beforehand. */
@@ -1892,7 +2248,6 @@  struct flow_tcf_ptoi {
 				  RTE_BE16(ETH_P_ALL));
 	mnl_attr_put_strz(nlh, TCA_KIND, "flower");
 	na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
-	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
 	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
 		unsigned int i;
 
@@ -1935,6 +2290,12 @@  struct flow_tcf_ptoi {
 						 spec.eth->type);
 				eth_type_set = 1;
 			}
+			/*
+			 * Send L2 addresses/masks anyway, including
+			 * VXLAN encap/decap cases, sometimes kernel
+			 * returns an error if no L2 address provided
+			 * and skip_sw flag is set
+			 */
 			if (!is_zero_ether_addr(&mask.eth->dst)) {
 				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
 					     ETHER_ADDR_LEN,
@@ -1951,8 +2312,19 @@  struct flow_tcf_ptoi {
 					     ETHER_ADDR_LEN,
 					     mask.eth->src.addr_bytes);
 			}
+			if (decap.hdr) {
+				DRV_LOG(WARNING,
+				"ethernet addresses are treated "
+				"as inner ones for tunnel decapsulation");
+			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
+			if (encap.hdr || decap.hdr)
+				return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ITEM, NULL,
+					  "outer VLAN is not "
+					  "supported for tunnels");
 			mask.vlan = flow_tcf_item_mask
 				(items, &rte_flow_item_vlan_mask,
 				 &flow_tcf_mask_supported.vlan,
@@ -1983,6 +2355,7 @@  struct flow_tcf_ptoi {
 						 rte_be_to_cpu_16
 						 (spec.vlan->tci &
 						  RTE_BE16(0x0fff)));
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
 			mask.ipv4 = flow_tcf_item_mask
@@ -1992,36 +2365,53 @@  struct flow_tcf_ptoi {
 				 sizeof(flow_tcf_mask_supported.ipv4),
 				 error);
 			assert(mask.ipv4);
-			if (!eth_type_set || !vlan_eth_type_set)
-				mnl_attr_put_u16(nlh,
-						 vlan_present ?
-						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
-						 TCA_FLOWER_KEY_ETH_TYPE,
-						 RTE_BE16(ETH_P_IP));
-			eth_type_set = 1;
-			vlan_eth_type_set = 1;
-			if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
-				break;
 			spec.ipv4 = items->spec;
-			if (mask.ipv4->hdr.next_proto_id) {
-				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
+			if (!decap.vxlan) {
+				if (!eth_type_set || !vlan_eth_type_set) {
+					mnl_attr_put_u16(nlh,
+						vlan_present ?
+						TCA_FLOWER_KEY_VLAN_ETH_TYPE :
+						TCA_FLOWER_KEY_ETH_TYPE,
+						RTE_BE16(ETH_P_IP));
+				}
+				eth_type_set = 1;
+				vlan_eth_type_set = 1;
+				if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
+					break;
+				if (mask.ipv4->hdr.next_proto_id) {
+					mnl_attr_put_u8
+						(nlh, TCA_FLOWER_KEY_IP_PROTO,
 						spec.ipv4->hdr.next_proto_id);
-				ip_proto_set = 1;
+					ip_proto_set = 1;
+				}
+			} else {
+				assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
 			}
 			if (mask.ipv4->hdr.src_addr) {
-				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
-						 spec.ipv4->hdr.src_addr);
-				mnl_attr_put_u32(nlh,
-						 TCA_FLOWER_KEY_IPV4_SRC_MASK,
-						 mask.ipv4->hdr.src_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_SRC :
+					 TCA_FLOWER_KEY_IPV4_SRC,
+					 spec.ipv4->hdr.src_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
+					 TCA_FLOWER_KEY_IPV4_SRC_MASK,
+					 mask.ipv4->hdr.src_addr);
 			}
 			if (mask.ipv4->hdr.dst_addr) {
-				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
-						 spec.ipv4->hdr.dst_addr);
-				mnl_attr_put_u32(nlh,
-						 TCA_FLOWER_KEY_IPV4_DST_MASK,
-						 mask.ipv4->hdr.dst_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_DST :
+					 TCA_FLOWER_KEY_IPV4_DST,
+					 spec.ipv4->hdr.dst_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
+					 TCA_FLOWER_KEY_IPV4_DST_MASK,
+					 mask.ipv4->hdr.dst_addr);
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV6:
 			mask.ipv6 = flow_tcf_item_mask
@@ -2031,38 +2421,53 @@  struct flow_tcf_ptoi {
 				 sizeof(flow_tcf_mask_supported.ipv6),
 				 error);
 			assert(mask.ipv6);
-			if (!eth_type_set || !vlan_eth_type_set)
-				mnl_attr_put_u16(nlh,
+			spec.ipv6 = items->spec;
+			if (!decap.vxlan) {
+				if (!eth_type_set || !vlan_eth_type_set) {
+					mnl_attr_put_u16(nlh,
 						 vlan_present ?
 						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
 						 TCA_FLOWER_KEY_ETH_TYPE,
 						 RTE_BE16(ETH_P_IPV6));
-			eth_type_set = 1;
-			vlan_eth_type_set = 1;
-			if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
-				break;
-			spec.ipv6 = items->spec;
-			if (mask.ipv6->hdr.proto) {
-				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
-						spec.ipv6->hdr.proto);
-				ip_proto_set = 1;
+				}
+				eth_type_set = 1;
+				vlan_eth_type_set = 1;
+				if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
+					break;
+				if (mask.ipv6->hdr.proto) {
+					mnl_attr_put_u8
+						(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						 spec.ipv6->hdr.proto);
+					ip_proto_set = 1;
+				}
+			} else {
+				assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_SRC :
+					     TCA_FLOWER_KEY_IPV6_SRC,
 					     sizeof(spec.ipv6->hdr.src_addr),
 					     spec.ipv6->hdr.src_addr);
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
+					     TCA_FLOWER_KEY_IPV6_SRC_MASK,
 					     sizeof(mask.ipv6->hdr.src_addr),
 					     mask.ipv6->hdr.src_addr);
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_DST :
+					     TCA_FLOWER_KEY_IPV6_DST,
 					     sizeof(spec.ipv6->hdr.dst_addr),
 					     spec.ipv6->hdr.dst_addr);
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
+					     TCA_FLOWER_KEY_IPV6_DST_MASK,
 					     sizeof(mask.ipv6->hdr.dst_addr),
 					     mask.ipv6->hdr.dst_addr);
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_UDP:
 			mask.udp = flow_tcf_item_mask
@@ -2072,27 +2477,45 @@  struct flow_tcf_ptoi {
 				 sizeof(flow_tcf_mask_supported.udp),
 				 error);
 			assert(mask.udp);
-			if (!ip_proto_set)
-				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
-						IPPROTO_UDP);
-			if (mask.udp == &flow_tcf_mask_empty.udp)
-				break;
 			spec.udp = items->spec;
+			if (!decap.vxlan) {
+				if (!ip_proto_set)
+					mnl_attr_put_u8
+						(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						IPPROTO_UDP);
+				if (mask.udp == &flow_tcf_mask_empty.udp)
+					break;
+			} else {
+				assert(mask.udp != &flow_tcf_mask_empty.udp);
+				decap.vxlan->udp_port
+					= RTE_BE16(spec.udp->hdr.dst_port);
+			}
 			if (mask.udp->hdr.src_port) {
-				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
-						 spec.udp->hdr.src_port);
-				mnl_attr_put_u16(nlh,
-						 TCA_FLOWER_KEY_UDP_SRC_MASK,
-						 mask.udp->hdr.src_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
+					 TCA_FLOWER_KEY_UDP_SRC,
+					 spec.udp->hdr.src_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
+					 TCA_FLOWER_KEY_UDP_SRC_MASK,
+					 mask.udp->hdr.src_port);
 			}
 			if (mask.udp->hdr.dst_port) {
-				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
-						 spec.udp->hdr.dst_port);
-				mnl_attr_put_u16(nlh,
-						 TCA_FLOWER_KEY_UDP_DST_MASK,
-						 mask.udp->hdr.dst_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
+					 TCA_FLOWER_KEY_UDP_DST,
+					 spec.udp->hdr.dst_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
+					 TCA_FLOWER_KEY_UDP_DST_MASK,
+					 mask.udp->hdr.dst_port);
 			}
-			break;
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+		break;
 		case RTE_FLOW_ITEM_TYPE_TCP:
 			mask.tcp = flow_tcf_item_mask
 				(items, &rte_flow_item_tcp_mask,
@@ -2121,6 +2544,15 @@  struct flow_tcf_ptoi {
 						 TCA_FLOWER_KEY_TCP_DST_MASK,
 						 mask.tcp->hdr.dst_port);
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
+		case RTE_FLOW_ITEM_TYPE_VXLAN:
+			assert(decap.vxlan);
+			spec.vxlan = items->spec;
+			mnl_attr_put_u32(nlh,
+					 TCA_FLOWER_KEY_ENC_KEY_ID,
+					 vxlan_vni_as_be32(spec.vxlan->vni));
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
@@ -2154,6 +2586,14 @@  struct flow_tcf_ptoi {
 			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
 			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
 			assert(na_act);
+			if (encap.hdr) {
+				assert(dev_flow->tcf.tunnel);
+				dev_flow->tcf.tunnel->ifindex_ptr =
+					&((struct tc_mirred *)
+					mnl_attr_get_payload
+					(mnl_nlmsg_get_payload_tail
+						(nlh)))->ifindex;
+			}
 			mnl_attr_put(nlh, TCA_MIRRED_PARMS,
 				     sizeof(struct tc_mirred),
 				     &(struct tc_mirred){
@@ -2163,6 +2603,7 @@  struct flow_tcf_ptoi {
 				     });
 			mnl_attr_nest_end(nlh, na_act);
 			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			na_act_index =
@@ -2243,6 +2684,74 @@  struct flow_tcf_ptoi {
 					(na_vlan_priority) =
 					conf.of_set_vlan_pcp->vlan_pcp;
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+			assert(decap.vxlan);
+			assert(dev_flow->tcf.tunnel);
+			dev_flow->tcf.tunnel->ifindex_ptr
+				= (unsigned int *)&tcm->tcm_ifindex;
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
+				sizeof(struct tc_tunnel_key),
+				&(struct tc_tunnel_key){
+					.action = TC_ACT_PIPE,
+					.t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
+					});
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+			assert(encap.vxlan);
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
+				sizeof(struct tc_tunnel_key),
+				&(struct tc_tunnel_key){
+					.action = TC_ACT_PIPE,
+					.t_action = TCA_TUNNEL_KEY_ACT_SET,
+					});
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_UDP_DST)
+				mnl_attr_put_u16(nlh,
+					 TCA_TUNNEL_KEY_ENC_DST_PORT,
+					 encap.vxlan->udp.dst);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
+				mnl_attr_put_u32(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+					 encap.vxlan->ipv4.src);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
+				mnl_attr_put_u32(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
+					 encap.vxlan->ipv4.dst);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
+				mnl_attr_put(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+					 sizeof(encap.vxlan->ipv6.src),
+					 &encap.vxlan->ipv6.src);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
+				mnl_attr_put(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
+					 sizeof(encap.vxlan->ipv6.dst),
+					 &encap.vxlan->ipv6.dst);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
+				mnl_attr_put_u32(nlh,
+					 TCA_TUNNEL_KEY_ENC_KEY_ID,
+					 vxlan_vni_as_be32
+						(encap.vxlan->vxlan.vni));
+			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
@@ -2254,7 +2763,9 @@  struct flow_tcf_ptoi {
 	assert(na_flower);
 	assert(na_flower_act);
 	mnl_attr_nest_end(nlh, na_flower_act);
+	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
 	mnl_attr_nest_end(nlh, na_flower);
+	assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 	return 0;
 }