diff mbox series

[v2,3/7] net/mlx5: e-switch VXLAN flow translation routine

Message ID 1539612815-47199-4-git-send-email-viacheslavo@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers show
Series net/mlx5: e-switch VXLAN encap/decap hardware offload | expand

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch warning coding style issues

Commit Message

Slava Ovsiienko Oct. 15, 2018, 2:13 p.m. UTC
This part of the patchset adds support of VXLAN-related items and
actions to the flow translation routine. If some of them are
specified in the rule, the extra space for the tunnel description
structure is allocated. Later, some tunnel types other than
VXLAN can be added (GRE). No VTEP devices are created at this
point, the flow rule is just translated, not applied yet.

Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_flow_tcf.c | 641 +++++++++++++++++++++++++++++++++++----
 1 file changed, 578 insertions(+), 63 deletions(-)

Comments

Yongseok Koh Oct. 23, 2018, 10:06 a.m. UTC | #1
On Mon, Oct 15, 2018 at 02:13:31PM +0000, Viacheslav Ovsiienko wrote:
> This part of patchset adds support of VXLAN-related items and
> actions to the flow translation routine. If some of them are
> specified in the rule, the extra space for tunnel description
> structure is allocated. Later some tunnel types, other than
> VXLAN can be addedd (GRE). No VTEP devices are created at this
> point, the flow rule is just translated, not applied yet.
> 
> Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_flow_tcf.c | 641 +++++++++++++++++++++++++++++++++++----
>  1 file changed, 578 insertions(+), 63 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index 0055417..660d45e 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -2094,6 +2094,265 @@ struct pedit_parser {
>  }
>  
>  /**
> + * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
> + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
> + * in the encapsulation parameters structure. The item must be prevalidated,
> + * no any validation checks performed by function.
> + *
> + * @param[in] spec
> + *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
> + * @param[in] mask
> + *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
> + * @param[out] encap
> + *   Structure to fill the gathered MAC address data.
> + *
> + * @return
> + *   The size needed the Netlink message tunnel_key
> + *   parameter buffer to store the item attributes.
> + */
> +static int
> +flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
> +			       const struct rte_flow_item_eth *mask,
> +			       struct mlx5_flow_tcf_vxlan_encap *encap)
> +{
> +	/* Item must be validated before. No redundant checks. */
> +	assert(spec);
> +	if (!mask || !memcmp(&mask->dst,
> +			     &rte_flow_item_eth_mask.dst,
> +			     sizeof(rte_flow_item_eth_mask.dst))) {
> +		/*
> +		 * Ethernet addresses are not supported by
> +		 * tc as tunnel_key parameters. Destination
> +		 * address is needed to form encap packet
> +		 * header and retrieved by kernel from
> +		 * implicit sources (ARP table, etc),
> +		 * address masks are not supported at all.
> +		 */
> +		encap->eth.dst = spec->dst;
> +		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_DST;
> +	}
> +	if (!mask || !memcmp(&mask->src,
> +			     &rte_flow_item_eth_mask.src,
> +			     sizeof(rte_flow_item_eth_mask.src))) {
> +		/*
> +		 * Ethernet addresses are not supported by
> +		 * tc as tunnel_key parameters. Source ethernet
> +		 * address is ignored anyway.
> +		 */
> +		encap->eth.src = spec->src;
> +		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_SRC;
> +	}
> +	/*
> +	 * No space allocated for ethernet addresses within Netlink
> +	 * message tunnel_key record - these ones are not
> +	 * supported by tc.
> +	 */
> +	return 0;
> +}
> +
> +/**
> + * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
> + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
> + * in the encapsulation parameters structure. The item must be prevalidated,
> + * no any validation checks performed by function.
> + *
> + * @param[in] spec
> + *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
> + * @param[out] encap
> + *   Structure to fill the gathered IPV4 address data.
> + *
> + * @return
> + *   The size needed the Netlink message tunnel_key
> + *   parameter buffer to store the item attributes.
> + */
> +static int
> +flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
> +				struct mlx5_flow_tcf_vxlan_encap *encap)
> +{
> +	/* Item must be validated before. No redundant checks. */
> +	assert(spec);
> +	encap->ipv4.dst = spec->hdr.dst_addr;
> +	encap->ipv4.src = spec->hdr.src_addr;
> +	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC |
> +		       MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> +	return 2 * SZ_NLATTR_TYPE_OF(uint32_t);
> +}
> +
> +/**
> + * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
> + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
> + * in the encapsulation parameters structure. The item must be prevalidated,
> + * no any validation checks performed by function.
> + *
> + * @param[in] spec
> + *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
> + * @param[out] encap
> + *   Structure to fill the gathered IPV6 address data.
> + *
> + * @return
> + *   The size needed the Netlink message tunnel_key
> + *   parameter buffer to store the item attributes.
> + */
> +static int
> +flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
> +				struct mlx5_flow_tcf_vxlan_encap *encap)
> +{
> +	/* Item must be validated before. No redundant checks. */
> +	assert(spec);
> +	memcpy(encap->ipv6.dst, spec->hdr.dst_addr, sizeof(encap->ipv6.dst));
> +	memcpy(encap->ipv6.src, spec->hdr.src_addr, sizeof(encap->ipv6.src));
> +	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV6_SRC |
> +		       MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> +	return SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
> +}
> +
> +/**
> + * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
> + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
> + * in the encapsulation parameters structure. The item must be prevalidated,
> + * no any validation checks performed by function.
> + *
> + * @param[in] spec
> + *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
> + * @param[in] mask
> + *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
> + * @param[out] encap
> + *   Structure to fill the gathered UDP port data.
> + *
> + * @return
> + *   The size needed the Netlink message tunnel_key
> + *   parameter buffer to store the item attributes.
> + */
> +static int
> +flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
> +			       const struct rte_flow_item_udp *mask,
> +			       struct mlx5_flow_tcf_vxlan_encap *encap)
> +{
> +	int size = SZ_NLATTR_TYPE_OF(uint16_t);
> +
> +	assert(spec);
> +	encap->udp.dst = spec->hdr.dst_port;
> +	encap->mask |= MLX5_FLOW_TCF_ENCAP_UDP_DST;
> +	if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
> +		encap->udp.src = spec->hdr.src_port;
> +		size += SZ_NLATTR_TYPE_OF(uint16_t);
> +		encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC;
> +	}
> +	return size;
> +}
> +
> +/**
> + * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
> + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
> + * in the encapsulation parameters structure. The item must be prevalidated,
> + * no any validation checks performed by function.
> + *
> + * @param[in] spec
> + *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
> + * @param[out] encap
> + *   Structure to fill the gathered VNI address data.
> + *
> + * @return
> + *   The size needed the Netlink message tunnel_key
> + *   parameter buffer to store the item attributes.
> + */
> +static int
> +flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
> +			       struct mlx5_flow_tcf_vxlan_encap *encap)
> +{
> +	/* Item must be validated before. Do not redundant checks. */
> +	assert(spec);
> +	memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
> +	encap->mask |= MLX5_FLOW_TCF_ENCAP_VXLAN_VNI;
> +	return SZ_NLATTR_TYPE_OF(uint32_t);
> +}
> +
> +/**
> + * Populate consolidated encapsulation object from list of pattern items.
> + *
> + * Helper function to process configuration of action such as
> + * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
> + * validated, there is no way to return an meaningful error.
> + *
> + * @param[in] action
> + *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
> + *   List of pattern items to gather data from.
> + * @param[out] src
> + *   Structure to fill gathered data.
> + *
> + * @return
> + *   The size the part of Netlink message buffer to store the item
> + *   attributes on success, zero otherwise. The mask field in
> + *   result structure reflects correctly parsed items.
> + */
> +static int
> +flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
> +			   struct mlx5_flow_tcf_vxlan_encap *encap)
> +{
> +	union {
> +		const struct rte_flow_item_eth *eth;
> +		const struct rte_flow_item_ipv4 *ipv4;
> +		const struct rte_flow_item_ipv6 *ipv6;
> +		const struct rte_flow_item_udp *udp;
> +		const struct rte_flow_item_vxlan *vxlan;
> +	} spec, mask;
> +	const struct rte_flow_item *items;
> +	int size = 0;
> +
> +	assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
> +	assert(action->conf);
> +
> +	items = ((const struct rte_flow_action_vxlan_encap *)
> +					action->conf)->definition;
> +	assert(items);
> +	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
> +		switch (items->type) {
> +		case RTE_FLOW_ITEM_TYPE_VOID:
> +			break;
> +		case RTE_FLOW_ITEM_TYPE_ETH:
> +			mask.eth = items->mask;
> +			spec.eth = items->spec;
> +			size += flow_tcf_parse_vxlan_encap_eth(spec.eth,
> +							       mask.eth,
> +							       encap);
> +			break;
> +		case RTE_FLOW_ITEM_TYPE_IPV4:
> +			spec.ipv4 = items->spec;
> +			size += flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4,
> +								encap);
> +			break;
> +		case RTE_FLOW_ITEM_TYPE_IPV6:
> +			spec.ipv6 = items->spec;
> +			size += flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6,
> +								encap);
> +			break;
> +		case RTE_FLOW_ITEM_TYPE_UDP:
> +			mask.udp = items->mask;
> +			spec.udp = items->spec;
> +			size += flow_tcf_parse_vxlan_encap_udp(spec.udp,
> +							       mask.udp,
> +							       encap);
> +			break;
> +		case RTE_FLOW_ITEM_TYPE_VXLAN:
> +			spec.vxlan = items->spec;
> +			size += flow_tcf_parse_vxlan_encap_vni(spec.vxlan,
> +							       encap);
> +			break;
> +		default:
> +			assert(false);
> +			DRV_LOG(WARNING,
> +				"unsupported item %p type %d,"
> +				" items must be validated"
> +				" before flow creation",
> +				(const void *)items, items->type);
> +			encap->mask = 0;
> +			return 0;
> +		}
> +	}
> +	return size;
> +}
> +
> +/**
>   * Calculate maximum size of memory for flow items of Linux TC flower and
>   * extract specified items.
>   *
> @@ -2148,7 +2407,7 @@ struct pedit_parser {
>  		case RTE_FLOW_ITEM_TYPE_IPV6:
>  			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
>  				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
> -				SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
> +				SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
>  				/* dst/src IP addr and mask. */
>  			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
>  			break;
> @@ -2164,6 +2423,10 @@ struct pedit_parser {
>  				/* dst/src port and mask. */
>  			flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
>  			break;
> +		case RTE_FLOW_ITEM_TYPE_VXLAN:
> +			size += SZ_NLATTR_TYPE_OF(uint32_t);
> +			flags |= MLX5_FLOW_LAYER_VXLAN;
> +			break;
>  		default:
>  			DRV_LOG(WARNING,
>  				"unsupported item %p type %d,"
> @@ -2184,13 +2447,16 @@ struct pedit_parser {
>   *   Pointer to the list of actions.
>   * @param[out] action_flags
>   *   Pointer to the detected actions.
> + * @param[out] tunnel
> + *   Pointer to tunnel encapsulation parameters structure to fill.
>   *
>   * @return
>   *   Maximum size of memory for actions.
>   */
>  static int
>  flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
> -			      uint64_t *action_flags)
> +			      uint64_t *action_flags,
> +			      void *tunnel)

This func is to get actions and size but you are parsing and filling tunnel info
here. It would be better to move parsing to translate() because it anyway has
multiple if conditions (same as switch/case) to set TCA_TUNNEL_KEY_ENC_* there.

>  {
>  	int size = 0;
>  	uint64_t flags = 0;
> @@ -2246,6 +2512,29 @@ struct pedit_parser {
>  				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
>  				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
>  			break;
> +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> +			size += SZ_NLATTR_NEST + /* na_act_index. */
> +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
> +				SZ_NLATTR_TYPE_OF(uint8_t);
> +			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> +			size +=	flow_tcf_vxlan_encap_parse(actions, tunnel) +
> +				RTE_ALIGN_CEIL /* preceding encap params. */
> +				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> +				MNL_ALIGNTO);

Is it different from SZ_NLATTR_TYPE_OF(struct mlx5_flow_tcf_vxlan_encap)? Or,
use __rte_aligned(MNL_ALIGNTO) instead.

> +			flags |= MLX5_ACTION_VXLAN_ENCAP;
> +			break;
> +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> +			size += SZ_NLATTR_NEST + /* na_act_index. */
> +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
> +				SZ_NLATTR_TYPE_OF(uint8_t);
> +			size +=	SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> +			size +=	RTE_ALIGN_CEIL /* preceding decap params. */
> +				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> +				MNL_ALIGNTO);

Same here.

> +			flags |= MLX5_ACTION_VXLAN_DECAP;
> +			break;
>  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
>  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
>  		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> @@ -2289,6 +2578,26 @@ struct pedit_parser {
>  }
>  
>  /**
> + * Convert VXLAN VNI to 32-bit integer.
> + *
> + * @param[in] vni
> + *   VXLAN VNI in 24-bit wire format.
> + *
> + * @return
> + *   VXLAN VNI as a 32-bit integer value in network endian.
> + */
> +static rte_be32_t

make it inline.

> +vxlan_vni_as_be32(const uint8_t vni[3])
> +{
> +	rte_be32_t ret;

Defining ret as rte_be32_t? The return value of this func which is bswap(ret) is
also rte_be32_t??

> +
> +	ret = vni[0];
> +	ret = (ret << 8) | vni[1];
> +	ret = (ret << 8) | vni[2];
> +	return RTE_BE32(ret);

Use rte_cpu_to_be_*() instead. But I still don't understand why you shuffle
bytes twice. One with shift and or and other by bswap().

{
	union {
		uint8_t vni[4];
		rte_be32_t dword;
	} ret = {
		.vni = { 0, vni[0], vni[1], vni[2] },
	};
	return ret.dword;
}

This will have the same result without extra cost.

> +}
> +
> +/**
>   * Prepare a flow object for Linux TC flower. It calculates the maximum size of
>   * memory required, allocates the memory, initializes Netlink message headers
>   * and set unique TC message handle.
> @@ -2323,22 +2632,54 @@ struct pedit_parser {
>  	struct mlx5_flow *dev_flow;
>  	struct nlmsghdr *nlh;
>  	struct tcmsg *tcm;
> +	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
> +	uint8_t *sp, *tun = NULL;
>  
>  	size += flow_tcf_get_items_and_size(attr, items, item_flags);
> -	size += flow_tcf_get_actions_and_size(actions, action_flags);
> -	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
> +	size += flow_tcf_get_actions_and_size(actions, action_flags, &encap);
> +	dev_flow = rte_zmalloc(__func__, size,
> +			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
> +				(size_t)MNL_ALIGNTO));

Why RTE_MAX between the two? Note that it is alignment for start address of the
memory and the minimum alignment is cacheline size. On x86, non-zero value less
than 64 will have same result as 64.

>  	if (!dev_flow) {
>  		rte_flow_error_set(error, ENOMEM,
>  				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
>  				   "not enough memory to create E-Switch flow");
>  		return NULL;
>  	}
> -	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
> +	sp = (uint8_t *)(dev_flow + 1);
> +	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
> +		tun = sp;
> +		sp += RTE_ALIGN_CEIL
> +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> +			MNL_ALIGNTO);

And why should it be aligned? As the size of dev_flow might not be aligned, it
is meaningless, isn't it? If you think it must be aligned for better
performance (not much anyway), you can use __rte_aligned(MNL_ALIGNTO) on the
struct definition but not for mlx5_flow (it's not only for tcf, have to do it
manually).

> +		size -= RTE_ALIGN_CEIL
> +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> +			MNL_ALIGNTO);

Don't you have to subtract sizeof(struct mlx5_flow) as well? But like I
mentioned, if '.nlsize' below isn't needed, you don't need to have this
calculation either.

> +		encap.hdr.type = MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
> +		memcpy(tun, &encap,
> +		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
> +	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
> +		tun = sp;
> +		sp += RTE_ALIGN_CEIL
> +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> +			MNL_ALIGNTO);
> +		size -= RTE_ALIGN_CEIL
> +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> +			MNL_ALIGNTO);
> +		encap.hdr.type = MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
> +		memcpy(tun, &encap,
> +		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
> +	}
> +	nlh = mnl_nlmsg_put_header(sp);
>  	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
>  	*dev_flow = (struct mlx5_flow){
>  		.tcf = (struct mlx5_flow_tcf){
> +			.nlsize = size,
>  			.nlh = nlh,
>  			.tcm = tcm,
> +			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
> +			.item_flags = *item_flags,
> +			.action_flags = *action_flags,
>  		},
>  	};
>  	/*
> @@ -2392,6 +2733,7 @@ struct pedit_parser {
>  		const struct rte_flow_item_ipv6 *ipv6;
>  		const struct rte_flow_item_tcp *tcp;
>  		const struct rte_flow_item_udp *udp;
> +		const struct rte_flow_item_vxlan *vxlan;
>  	} spec, mask;
>  	union {
>  		const struct rte_flow_action_port_id *port_id;
> @@ -2402,6 +2744,14 @@ struct pedit_parser {
>  		const struct rte_flow_action_of_set_vlan_pcp *
>  			of_set_vlan_pcp;
>  	} conf;
> +	union {
> +		struct mlx5_flow_tcf_tunnel_hdr *hdr;
> +		struct mlx5_flow_tcf_vxlan_decap *vxlan;
> +	} decap;
> +	union {
> +		struct mlx5_flow_tcf_tunnel_hdr *hdr;
> +		struct mlx5_flow_tcf_vxlan_encap *vxlan;
> +	} encap;
>  	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
>  	struct nlmsghdr *nlh = dev_flow->tcf.nlh;
>  	struct tcmsg *tcm = dev_flow->tcf.tcm;
> @@ -2418,6 +2768,12 @@ struct pedit_parser {
>  
>  	claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
>  						PTOI_TABLE_SZ_MAX(dev)));
> +	encap.hdr = NULL;
> +	decap.hdr = NULL;
> +	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_ENCAP)

dev_flow->flow->actions already has it.

> +		encap.vxlan = dev_flow->tcf.vxlan_encap;
> +	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_DECAP)
> +		decap.vxlan = dev_flow->tcf.vxlan_decap;
>  	nlh = dev_flow->tcf.nlh;
>  	tcm = dev_flow->tcf.tcm;
>  	/* Prepare API must have been called beforehand. */
> @@ -2435,7 +2791,6 @@ struct pedit_parser {
>  		mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
>  	mnl_attr_put_strz(nlh, TCA_KIND, "flower");
>  	na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
> -	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);

Why do you move it? You anyway can know if it is vxlan decap at this point.

>  	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
>  		unsigned int i;
>  
> @@ -2479,6 +2834,12 @@ struct pedit_parser {
>  						 spec.eth->type);
>  				eth_type_set = 1;
>  			}
> +			/*
> +			 * L2 addresses/masks should  be sent anyway,
> +			 * including VXLAN encap/decap cases, sometimes

"sometimes" sounds like a bug. Did you figure out why it is inconsistent?

> +			 * kernel returns an error if no L2 address
> +			 * provided and skip_sw flag is set
> +			 */
>  			if (!is_zero_ether_addr(&mask.eth->dst)) {
>  				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
>  					     ETHER_ADDR_LEN,
> @@ -2495,8 +2856,19 @@ struct pedit_parser {
>  					     ETHER_ADDR_LEN,
>  					     mask.eth->src.addr_bytes);
>  			}
> -			break;
> +			if (decap.hdr) {
> +				DRV_LOG(INFO,
> +				"ethernet addresses are treated "
> +				"as inner ones for tunnel decapsulation");
> +			}

I know there's no enc_[src|dst]_mac in tc flower command but can you clarify
more about this? Let me take an example.

  flow create 1 ingress transfer
    pattern eth src is 66:77:88:99:aa:bb
      dst is 00:11:22:33:44:55 / ipv4 src is 2.2.2.2 dst is 1.1.1.1 /
      udp src is 4789 dst is 4242 / vxlan vni is 0x112233 / end
    actions vxlan_decap / port_id id 2 / end

In this case, will the mac addrs specified above be regarded as inner mac addrs?
That sounds very weird. If inner mac addrs have to be specified it should be:

  flow create 1 ingress transfer
    pattern eth / ipv4 src is 2.2.2.2 dst is 1.1.1.1 /
      udp src is 4789 dst is 4242 / vxlan vni is 0x112233 /
      eth src is 66:77:88:99:aa:bb dst is 00:11:22:33:44:55 / end
    actions vxlan_decap / port_id id 2 / end

Isn't it?

Also, I hope it to be in validate() as well.

> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> +		break;
>  		case RTE_FLOW_ITEM_TYPE_VLAN:
> +			if (encap.hdr || decap.hdr)
> +				return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ITEM, NULL,
> +					  "outer VLAN is not "
> +					  "supported for tunnels");

This should be moved to validate(). And the error message sounds like inner vlan
is allowed, doesn't it? Even if it is moved to validate(), there's no way to
distinguish between outer vlan and inner vlan in your code. A bit confusing.
Please clarify.

>  			item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
>  			mask.vlan = flow_tcf_item_mask
>  				(items, &rte_flow_item_vlan_mask,
> @@ -2528,6 +2900,7 @@ struct pedit_parser {
>  						 rte_be_to_cpu_16
>  						 (spec.vlan->tci &
>  						  RTE_BE16(0x0fff)));
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_IPV4:
>  			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
> @@ -2538,36 +2911,53 @@ struct pedit_parser {
>  				 sizeof(flow_tcf_mask_supported.ipv4),
>  				 error);
>  			assert(mask.ipv4);
> -						 vlan_present ?
> -						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> -						 TCA_FLOWER_KEY_ETH_TYPE,
> -						 RTE_BE16(ETH_P_IP));
> -			eth_type_set = 1;
> -			vlan_eth_type_set = 1;
> -			if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
> -				break;
>  			spec.ipv4 = items->spec;
> -			if (mask.ipv4->hdr.next_proto_id) {
> -				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
> +			if (!decap.vxlan) {
> +				if (!eth_type_set || !vlan_eth_type_set) {
> +					mnl_attr_put_u16(nlh,
> +						vlan_present ?
> +						TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> +						TCA_FLOWER_KEY_ETH_TYPE,
> +						RTE_BE16(ETH_P_IP));
> +				}
> +				eth_type_set = 1;
> +				vlan_eth_type_set = 1;
> +				if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
> +					break;
> +				if (mask.ipv4->hdr.next_proto_id) {
> +					mnl_attr_put_u8
> +						(nlh, TCA_FLOWER_KEY_IP_PROTO,
>  						spec.ipv4->hdr.next_proto_id);
> -				ip_proto_set = 1;
> +					ip_proto_set = 1;
> +				}
> +			} else {
> +				assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
>  			}
>  			if (mask.ipv4->hdr.src_addr) {
> -				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
> -						 spec.ipv4->hdr.src_addr);
> -				mnl_attr_put_u32(nlh,
> -						 TCA_FLOWER_KEY_IPV4_SRC_MASK,
> -						 mask.ipv4->hdr.src_addr);
> +				mnl_attr_put_u32
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_IPV4_SRC :
> +					 TCA_FLOWER_KEY_IPV4_SRC,
> +					 spec.ipv4->hdr.src_addr);
> +				mnl_attr_put_u32
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
> +					 TCA_FLOWER_KEY_IPV4_SRC_MASK,
> +					 mask.ipv4->hdr.src_addr);
>  			}
>  			if (mask.ipv4->hdr.dst_addr) {
> -				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
> -						 spec.ipv4->hdr.dst_addr);
> -				mnl_attr_put_u32(nlh,
> -						 TCA_FLOWER_KEY_IPV4_DST_MASK,
> -						 mask.ipv4->hdr.dst_addr);
> +				mnl_attr_put_u32
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_IPV4_DST :
> +					 TCA_FLOWER_KEY_IPV4_DST,
> +					 spec.ipv4->hdr.dst_addr);
> +				mnl_attr_put_u32
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
> +					 TCA_FLOWER_KEY_IPV4_DST_MASK,
> +					 mask.ipv4->hdr.dst_addr);
>  			}
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_IPV6:
>  			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
> @@ -2578,38 +2968,53 @@ struct pedit_parser {
>  				 sizeof(flow_tcf_mask_supported.ipv6),
>  				 error);
>  			assert(mask.ipv6);
> -			if (!eth_type_set || !vlan_eth_type_set)
> -				mnl_attr_put_u16(nlh,
> -						 vlan_present ?
> -						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> -						 TCA_FLOWER_KEY_ETH_TYPE,
> -						 RTE_BE16(ETH_P_IPV6));
> -			eth_type_set = 1;
> -			vlan_eth_type_set = 1;
> -			if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
> -				break;
>  			spec.ipv6 = items->spec;
> -			if (mask.ipv6->hdr.proto) {
> -				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
> -						spec.ipv6->hdr.proto);
> -				ip_proto_set = 1;
> +			if (!decap.vxlan) {
> +				if (!eth_type_set || !vlan_eth_type_set) {
> +					mnl_attr_put_u16(nlh,
> +						vlan_present ?
> +						TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> +						TCA_FLOWER_KEY_ETH_TYPE,
> +						RTE_BE16(ETH_P_IPV6));
> +				}
> +				eth_type_set = 1;
> +				vlan_eth_type_set = 1;
> +				if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
> +					break;
> +				if (mask.ipv6->hdr.proto) {
> +					mnl_attr_put_u8
> +						(nlh, TCA_FLOWER_KEY_IP_PROTO,
> +						 spec.ipv6->hdr.proto);
> +					ip_proto_set = 1;
> +				}
> +			} else {
> +				assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
>  			}
>  			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
> -				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
> +				mnl_attr_put(nlh, decap.vxlan ?
> +					     TCA_FLOWER_KEY_ENC_IPV6_SRC :
> +					     TCA_FLOWER_KEY_IPV6_SRC,
>  					     sizeof(spec.ipv6->hdr.src_addr),
>  					     spec.ipv6->hdr.src_addr);
> -				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
> +				mnl_attr_put(nlh, decap.vxlan ?
> +					     TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
> +					     TCA_FLOWER_KEY_IPV6_SRC_MASK,
>  					     sizeof(mask.ipv6->hdr.src_addr),
>  					     mask.ipv6->hdr.src_addr);
>  			}
>  			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
> -				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
> +				mnl_attr_put(nlh, decap.vxlan ?
> +					     TCA_FLOWER_KEY_ENC_IPV6_DST :
> +					     TCA_FLOWER_KEY_IPV6_DST,
>  					     sizeof(spec.ipv6->hdr.dst_addr),
>  					     spec.ipv6->hdr.dst_addr);
> -				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
> +				mnl_attr_put(nlh, decap.vxlan ?
> +					     TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
> +					     TCA_FLOWER_KEY_IPV6_DST_MASK,
>  					     sizeof(mask.ipv6->hdr.dst_addr),
>  					     mask.ipv6->hdr.dst_addr);
>  			}
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_UDP:
>  			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
> @@ -2620,26 +3025,44 @@ struct pedit_parser {
>  				 sizeof(flow_tcf_mask_supported.udp),
>  				 error);
>  			assert(mask.udp);
> -			if (!ip_proto_set)
> -				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
> -						IPPROTO_UDP);
> -			if (mask.udp == &flow_tcf_mask_empty.udp)
> -				break;
>  			spec.udp = items->spec;
> +			if (!decap.vxlan) {
> +				if (!ip_proto_set)
> +					mnl_attr_put_u8
> +						(nlh, TCA_FLOWER_KEY_IP_PROTO,
> +						IPPROTO_UDP);
> +				if (mask.udp == &flow_tcf_mask_empty.udp)
> +					break;
> +			} else {
> +				assert(mask.udp != &flow_tcf_mask_empty.udp);
> +				decap.vxlan->udp_port
> +					= RTE_BE16(spec.udp->hdr.dst_port);

Use rte_cpu_to_be_*() instead. And (=) sign has to be moved up.

> +			}
>  			if (mask.udp->hdr.src_port) {
> -				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
> -						 spec.udp->hdr.src_port);
> -				mnl_attr_put_u16(nlh,
> -						 TCA_FLOWER_KEY_UDP_SRC_MASK,
> -						 mask.udp->hdr.src_port);
> +				mnl_attr_put_u16
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
> +					 TCA_FLOWER_KEY_UDP_SRC,
> +					 spec.udp->hdr.src_port);
> +				mnl_attr_put_u16
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
> +					 TCA_FLOWER_KEY_UDP_SRC_MASK,
> +					 mask.udp->hdr.src_port);
>  			}
>  			if (mask.udp->hdr.dst_port) {
> -				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
> -						 spec.udp->hdr.dst_port);
> -				mnl_attr_put_u16(nlh,
> -						 TCA_FLOWER_KEY_UDP_DST_MASK,
> -						 mask.udp->hdr.dst_port);
> +				mnl_attr_put_u16
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
> +					 TCA_FLOWER_KEY_UDP_DST,
> +					 spec.udp->hdr.dst_port);
> +				mnl_attr_put_u16
> +					(nlh, decap.vxlan ?
> +					 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
> +					 TCA_FLOWER_KEY_UDP_DST_MASK,
> +					 mask.udp->hdr.dst_port);
>  			}
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ITEM_TYPE_TCP:
>  			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
> @@ -2682,7 +3105,15 @@ struct pedit_parser {
>  					 rte_cpu_to_be_16
>  						(mask.tcp->hdr.tcp_flags));
>  			}
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
> +		case RTE_FLOW_ITEM_TYPE_VXLAN:
> +			assert(decap.vxlan);
> +			spec.vxlan = items->spec;
> +			mnl_attr_put_u32(nlh,
> +					 TCA_FLOWER_KEY_ENC_KEY_ID,
> +					 vxlan_vni_as_be32(spec.vxlan->vni));
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  		default:
>  			return rte_flow_error_set(error, ENOTSUP,
>  						  RTE_FLOW_ERROR_TYPE_ITEM,
> @@ -2715,6 +3146,14 @@ struct pedit_parser {
>  			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
>  			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
>  			assert(na_act);
> +			if (encap.hdr) {
> +				assert(dev_flow->tcf.tunnel);
> +				dev_flow->tcf.tunnel->ifindex_ptr =
> +					&((struct tc_mirred *)
> +					mnl_attr_get_payload
> +					(mnl_nlmsg_get_payload_tail
> +						(nlh)))->ifindex;
> +			}
>  			mnl_attr_put(nlh, TCA_MIRRED_PARMS,
>  				     sizeof(struct tc_mirred),
>  				     &(struct tc_mirred){
> @@ -2724,6 +3163,7 @@ struct pedit_parser {
>  				     });
>  			mnl_attr_nest_end(nlh, na_act);
>  			mnl_attr_nest_end(nlh, na_act_index);
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ACTION_TYPE_JUMP:
>  			conf.jump = actions->conf;
> @@ -2741,6 +3181,7 @@ struct pedit_parser {
>  				     });
>  			mnl_attr_nest_end(nlh, na_act);
>  			mnl_attr_nest_end(nlh, na_act_index);
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ACTION_TYPE_DROP:
>  			na_act_index =
> @@ -2827,6 +3268,76 @@ struct pedit_parser {
>  					(na_vlan_priority) =
>  					conf.of_set_vlan_pcp->vlan_pcp;
>  			}
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> +			break;
> +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> +			assert(decap.vxlan);
> +			assert(dev_flow->tcf.tunnel);
> +			dev_flow->tcf.tunnel->ifindex_ptr
> +				= (unsigned int *)&tcm->tcm_ifindex;
> +			na_act_index =
> +				mnl_attr_nest_start(nlh, na_act_index_cur++);
> +			assert(na_act_index);
> +			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
> +			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
> +			assert(na_act);
> +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> +				sizeof(struct tc_tunnel_key),
> +				&(struct tc_tunnel_key){
> +					.action = TC_ACT_PIPE,
> +					.t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
> +					});
> +			mnl_attr_nest_end(nlh, na_act);
> +			mnl_attr_nest_end(nlh, na_act_index);
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> +			break;
> +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> +			assert(encap.vxlan);
> +			na_act_index =
> +				mnl_attr_nest_start(nlh, na_act_index_cur++);
> +			assert(na_act_index);
> +			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
> +			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
> +			assert(na_act);
> +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> +				sizeof(struct tc_tunnel_key),
> +				&(struct tc_tunnel_key){
> +					.action = TC_ACT_PIPE,
> +					.t_action = TCA_TUNNEL_KEY_ACT_SET,
> +					});
> +			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_UDP_DST)
> +				mnl_attr_put_u16(nlh,
> +					 TCA_TUNNEL_KEY_ENC_DST_PORT,
> +					 encap.vxlan->udp.dst);
> +			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
> +				mnl_attr_put_u32(nlh,
> +					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
> +					 encap.vxlan->ipv4.src);
> +			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> +				mnl_attr_put_u32(nlh,
> +					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
> +					 encap.vxlan->ipv4.dst);
> +			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
> +				mnl_attr_put(nlh,
> +					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
> +					 sizeof(encap.vxlan->ipv6.src),
> +					 &encap.vxlan->ipv6.src);
> +			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> +				mnl_attr_put(nlh,
> +					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
> +					 sizeof(encap.vxlan->ipv6.dst),
> +					 &encap.vxlan->ipv6.dst);
> +			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
> +				mnl_attr_put_u32(nlh,
> +					 TCA_TUNNEL_KEY_ENC_KEY_ID,
> +					 vxlan_vni_as_be32
> +						(encap.vxlan->vxlan.vni));
> +#ifdef TCA_TUNNEL_KEY_NO_CSUM
> +			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
> +#endif

TCA_TUNNEL_KEY_NO_CSUM is defined like the others anyway, so why do you treat it
differently with #ifdef/#endif?

> +			mnl_attr_nest_end(nlh, na_act);
> +			mnl_attr_nest_end(nlh, na_act_index);
> +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  			break;
>  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
>  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> @@ -2850,7 +3361,11 @@ struct pedit_parser {
>  	assert(na_flower);
>  	assert(na_flower_act);
>  	mnl_attr_nest_end(nlh, na_flower_act);
> +	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS,
> +			 dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_DECAP
> +			 ? 0 : TCA_CLS_FLAGS_SKIP_SW);
>  	mnl_attr_nest_end(nlh, na_flower);
> +	assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
>  	return 0;
>  }
Slava Ovsiienko Oct. 25, 2018, 2:37 p.m. UTC | #2
> -----Original Message-----
> From: Yongseok Koh
> Sent: Tuesday, October 23, 2018 13:06
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> Subject: Re: [PATCH v2 3/7] net/mlx5: e-switch VXLAN flow translation
> routine
> 
> On Mon, Oct 15, 2018 at 02:13:31PM +0000, Viacheslav Ovsiienko wrote:
> > This part of patchset adds support of VXLAN-related items and actions
> > to the flow translation routine. If some of them are specified in the
> > rule, the extra space for tunnel description structure is allocated.
> > Later some tunnel types, other than VXLAN can be addedd (GRE). No VTEP
> > devices are created at this point, the flow rule is just translated,
> > not applied yet.
> >
> > Suggested-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5_flow_tcf.c | 641
> > +++++++++++++++++++++++++++++++++++----
> >  1 file changed, 578 insertions(+), 63 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c
> > b/drivers/net/mlx5/mlx5_flow_tcf.c
> > index 0055417..660d45e 100644
> > --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> > +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> > @@ -2094,6 +2094,265 @@ struct pedit_parser {  }
> >
> >  /**
> > + * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in
> > +configuration
> > + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC
> address
> > +fields
> > + * in the encapsulation parameters structure. The item must be
> > +prevalidated,
> > + * no any validation checks performed by function.
> > + *
> > + * @param[in] spec
> > + *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
> > + * @param[in] mask
> > + *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
> > + * @param[out] encap
> > + *   Structure to fill the gathered MAC address data.
> > + *
> > + * @return
> > + *   The size needed the Netlink message tunnel_key
> > + *   parameter buffer to store the item attributes.
> > + */
> > +static int
> > +flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
> > +			       const struct rte_flow_item_eth *mask,
> > +			       struct mlx5_flow_tcf_vxlan_encap *encap) {
> > +	/* Item must be validated before. No redundant checks. */
> > +	assert(spec);
> > +	if (!mask || !memcmp(&mask->dst,
> > +			     &rte_flow_item_eth_mask.dst,
> > +			     sizeof(rte_flow_item_eth_mask.dst))) {
> > +		/*
> > +		 * Ethernet addresses are not supported by
> > +		 * tc as tunnel_key parameters. Destination
> > +		 * address is needed to form encap packet
> > +		 * header and retrieved by kernel from
> > +		 * implicit sources (ARP table, etc),
> > +		 * address masks are not supported at all.
> > +		 */
> > +		encap->eth.dst = spec->dst;
> > +		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_DST;
> > +	}
> > +	if (!mask || !memcmp(&mask->src,
> > +			     &rte_flow_item_eth_mask.src,
> > +			     sizeof(rte_flow_item_eth_mask.src))) {
> > +		/*
> > +		 * Ethernet addresses are not supported by
> > +		 * tc as tunnel_key parameters. Source ethernet
> > +		 * address is ignored anyway.
> > +		 */
> > +		encap->eth.src = spec->src;
> > +		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_SRC;
> > +	}
> > +	/*
> > +	 * No space allocated for ethernet addresses within Netlink
> > +	 * message tunnel_key record - these ones are not
> > +	 * supported by tc.
> > +	 */
> > +	return 0;
> > +}
> > +
> > +/**
> > + * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in
> > +configuration
> > + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4
> address
> > +fields
> > + * in the encapsulation parameters structure. The item must be
> > +prevalidated,
> > + * no any validation checks performed by function.
> > + *
> > + * @param[in] spec
> > + *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
> > + * @param[out] encap
> > + *   Structure to fill the gathered IPV4 address data.
> > + *
> > + * @return
> > + *   The size needed the Netlink message tunnel_key
> > + *   parameter buffer to store the item attributes.
> > + */
> > +static int
> > +flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
> > +				struct mlx5_flow_tcf_vxlan_encap *encap) {
> > +	/* Item must be validated before. No redundant checks. */
> > +	assert(spec);
> > +	encap->ipv4.dst = spec->hdr.dst_addr;
> > +	encap->ipv4.src = spec->hdr.src_addr;
> > +	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC |
> > +		       MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> > +	return 2 * SZ_NLATTR_TYPE_OF(uint32_t); }
> > +
> > +/**
> > + * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in
> > +configuration
> > + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6
> address
> > +fields
> > + * in the encapsulation parameters structure. The item must be
> > +prevalidated,
> > + * no any validation checks performed by function.
> > + *
> > + * @param[in] spec
> > + *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
> > + * @param[out] encap
> > + *   Structure to fill the gathered IPV6 address data.
> > + *
> > + * @return
> > + *   The size needed the Netlink message tunnel_key
> > + *   parameter buffer to store the item attributes.
> > + */
> > +static int
> > +flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
> > +				struct mlx5_flow_tcf_vxlan_encap *encap) {
> > +	/* Item must be validated before. No redundant checks. */
> > +	assert(spec);
> > +	memcpy(encap->ipv6.dst, spec->hdr.dst_addr, sizeof(encap-
> >ipv6.dst));
> > +	memcpy(encap->ipv6.src, spec->hdr.src_addr, sizeof(encap-
> >ipv6.src));
> > +	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV6_SRC |
> > +		       MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> > +	return SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2; }
> > +
> > +/**
> > + * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in
> > +configuration
> > + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port
> > +fields
> > + * in the encapsulation parameters structure. The item must be
> > +prevalidated,
> > + * no any validation checks performed by function.
> > + *
> > + * @param[in] spec
> > + *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
> > + * @param[in] mask
> > + *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
> > + * @param[out] encap
> > + *   Structure to fill the gathered UDP port data.
> > + *
> > + * @return
> > + *   The size needed the Netlink message tunnel_key
> > + *   parameter buffer to store the item attributes.
> > + */
> > +static int
> > +flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
> > +			       const struct rte_flow_item_udp *mask,
> > +			       struct mlx5_flow_tcf_vxlan_encap *encap) {
> > +	int size = SZ_NLATTR_TYPE_OF(uint16_t);
> > +
> > +	assert(spec);
> > +	encap->udp.dst = spec->hdr.dst_port;
> > +	encap->mask |= MLX5_FLOW_TCF_ENCAP_UDP_DST;
> > +	if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
> > +		encap->udp.src = spec->hdr.src_port;
> > +		size += SZ_NLATTR_TYPE_OF(uint16_t);
> > +		encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC;
> > +	}
> > +	return size;
> > +}
> > +
> > +/**
> > + * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in
> > +configuration
> > + * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
> > + * in the encapsulation parameters structure. The item must be
> > +prevalidated,
> > + * no any validation checks performed by function.
> > + *
> > + * @param[in] spec
> > + *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
> > + * @param[out] encap
> > + *   Structure to fill the gathered VNI address data.
> > + *
> > + * @return
> > + *   The size needed the Netlink message tunnel_key
> > + *   parameter buffer to store the item attributes.
> > + */
> > +static int
> > +flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
> > +			       struct mlx5_flow_tcf_vxlan_encap *encap) {
> > +	/* Item must be validated before. Do not redundant checks. */
> > +	assert(spec);
> > +	memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
> > +	encap->mask |= MLX5_FLOW_TCF_ENCAP_VXLAN_VNI;
> > +	return SZ_NLATTR_TYPE_OF(uint32_t);
> > +}
> > +
> > +/**
> > + * Populate consolidated encapsulation object from list of pattern items.
> > + *
> > + * Helper function to process configuration of action such as
> > + * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
> > + * validated, there is no way to return an meaningful error.
> > + *
> > + * @param[in] action
> > + *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
> > + *   List of pattern items to gather data from.
> > + * @param[out] src
> > + *   Structure to fill gathered data.
> > + *
> > + * @return
> > + *   The size the part of Netlink message buffer to store the item
> > + *   attributes on success, zero otherwise. The mask field in
> > + *   result structure reflects correctly parsed items.
> > + */
> > +static int
> > +flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
> > +			   struct mlx5_flow_tcf_vxlan_encap *encap) {
> > +	union {
> > +		const struct rte_flow_item_eth *eth;
> > +		const struct rte_flow_item_ipv4 *ipv4;
> > +		const struct rte_flow_item_ipv6 *ipv6;
> > +		const struct rte_flow_item_udp *udp;
> > +		const struct rte_flow_item_vxlan *vxlan;
> > +	} spec, mask;
> > +	const struct rte_flow_item *items;
> > +	int size = 0;
> > +
> > +	assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
> > +	assert(action->conf);
> > +
> > +	items = ((const struct rte_flow_action_vxlan_encap *)
> > +					action->conf)->definition;
> > +	assert(items);
> > +	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
> > +		switch (items->type) {
> > +		case RTE_FLOW_ITEM_TYPE_VOID:
> > +			break;
> > +		case RTE_FLOW_ITEM_TYPE_ETH:
> > +			mask.eth = items->mask;
> > +			spec.eth = items->spec;
> > +			size += flow_tcf_parse_vxlan_encap_eth(spec.eth,
> > +							       mask.eth,
> > +							       encap);
> > +			break;
> > +		case RTE_FLOW_ITEM_TYPE_IPV4:
> > +			spec.ipv4 = items->spec;
> > +			size += flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4,
> > +								encap);
> > +			break;
> > +		case RTE_FLOW_ITEM_TYPE_IPV6:
> > +			spec.ipv6 = items->spec;
> > +			size += flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6,
> > +								encap);
> > +			break;
> > +		case RTE_FLOW_ITEM_TYPE_UDP:
> > +			mask.udp = items->mask;
> > +			spec.udp = items->spec;
> > +			size += flow_tcf_parse_vxlan_encap_udp(spec.udp,
> > +							       mask.udp,
> > +							       encap);
> > +			break;
> > +		case RTE_FLOW_ITEM_TYPE_VXLAN:
> > +			spec.vxlan = items->spec;
> > +			size += flow_tcf_parse_vxlan_encap_vni(spec.vxlan,
> > +							       encap);
> > +			break;
> > +		default:
> > +			assert(false);
> > +			DRV_LOG(WARNING,
> > +				"unsupported item %p type %d,"
> > +				" items must be validated"
> > +				" before flow creation",
> > +				(const void *)items, items->type);
> > +			encap->mask = 0;
> > +			return 0;
> > +		}
> > +	}
> > +	return size;
> > +}
> > +
> > +/**
> >   * Calculate maximum size of memory for flow items of Linux TC flower
> and
> >   * extract specified items.
> >   *
> > @@ -2148,7 +2407,7 @@ struct pedit_parser {
> >  		case RTE_FLOW_ITEM_TYPE_IPV6:
> >  			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether
> type. */
> >  				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto.
> */
> > -				SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
> > +				SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
> >  				/* dst/src IP addr and mask. */
> >  			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
> >  			break;
> > @@ -2164,6 +2423,10 @@ struct pedit_parser {
> >  				/* dst/src port and mask. */
> >  			flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
> >  			break;
> > +		case RTE_FLOW_ITEM_TYPE_VXLAN:
> > +			size += SZ_NLATTR_TYPE_OF(uint32_t);
> > +			flags |= MLX5_FLOW_LAYER_VXLAN;
> > +			break;
> >  		default:
> >  			DRV_LOG(WARNING,
> >  				"unsupported item %p type %d,"
> > @@ -2184,13 +2447,16 @@ struct pedit_parser {
> >   *   Pointer to the list of actions.
> >   * @param[out] action_flags
> >   *   Pointer to the detected actions.
> > + * @param[out] tunnel
> > + *   Pointer to tunnel encapsulation parameters structure to fill.
> >   *
> >   * @return
> >   *   Maximum size of memory for actions.
> >   */
> >  static int
> >  flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
> > -			      uint64_t *action_flags)
> > +			      uint64_t *action_flags,
> > +			      void *tunnel)
> 
> This func is to get actions and size but you are parsing and filling tunnel info
> here. It would be better to move parsing to translate() because it anyway has
> multiple if conditions (same as switch/case) to set TCA_TUNNEL_KEY_ENC_*
> there.
Do you mean the call of flow_tcf_vxlan_encap_parse(actions, tunnel)?
OK, let's move it to the translate stage. Anyway, we need to keep the encap structure
for the local/neigh rules.

> 
> >  {
> >  	int size = 0;
> >  	uint64_t flags = 0;
> > @@ -2246,6 +2512,29 @@ struct pedit_parser {
> >  				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID.
> */
> >  				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio.
> */
> >  			break;
> > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> */
> > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > +			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > +			size +=	flow_tcf_vxlan_encap_parse(actions, tunnel)
> +
> > +				RTE_ALIGN_CEIL /* preceding encap params.
> */
> > +				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > +				MNL_ALIGNTO);
> 
> Is it different from SZ_NLATTR_TYPE_OF(struct
> mlx5_flow_tcf_vxlan_encap)? Or, use __rte_aligned(MNL_ALIGNTO) instead.

It is written intentionally in this form. It means that there is a struct mlx5_flow_tcf_vxlan_encap
at the beginning of the buffer. This is not an NL attribute, so the use of SZ_NLATTR_TYPE_OF is
not relevant here. Alignment is needed for the Netlink message that follows.
> 
> > +			flags |= MLX5_ACTION_VXLAN_ENCAP;
> > +			break;
> > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> */
> > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > +			size +=	SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > +			size +=	RTE_ALIGN_CEIL /* preceding decap params.
> */
> > +				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > +				MNL_ALIGNTO);
> 
> Same here.
> 
> > +			flags |= MLX5_ACTION_VXLAN_DECAP;
> > +			break;
> >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> >  		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> > @@ -2289,6 +2578,26 @@ struct pedit_parser {  }
> >
> >  /**
> > + * Convert VXLAN VNI to 32-bit integer.
> > + *
> > + * @param[in] vni
> > + *   VXLAN VNI in 24-bit wire format.
> > + *
> > + * @return
> > + *   VXLAN VNI as a 32-bit integer value in network endian.
> > + */
> > +static rte_be32_t
> 
> make it inline.
OK. I missed that point.

> 
> > +vxlan_vni_as_be32(const uint8_t vni[3]) {
> > +	rte_be32_t ret;
> 
> Defining ret as rte_be32_t? The return value of this func which is bswap(ret)
> is also rte_be32_t??
Yes. And it is directly stored in the network-endian NL attribute.
I've compiled and checked the listing of the function you proposed. It seems to be the best option, so I'll take it.

> 
> > +
> > +	ret = vni[0];
> > +	ret = (ret << 8) | vni[1];
> > +	ret = (ret << 8) | vni[2];
> > +	return RTE_BE32(ret);
> 
> Use rte_cpu_to_be_*() instead. But I still don't understand why you shuffle
> bytes twice. One with shift and or and other by bswap().
And it works. The three VNI bytes are stored in a rather bizarre order in the NL attribute - 0, vni[0], vni[1], vni[2].

> 
> {
> 	union {
> 		uint8_t vni[4];
> 		rte_be32_t dword;
> 	} ret = {
> 		.vni = { 0, vni[0], vni[1], vni[2] },
> 	};
> 	return ret.dword;
> }
> 
> This will have the same result without extra cost.

OK. I investigated; it is the best option for x86-64. I'm also going to test it on 32-bit ARM
with various compilers, just out of curiosity.

> 
> > +}
> > +
> > +/**
> >   * Prepare a flow object for Linux TC flower. It calculates the maximum
> size of
> >   * memory required, allocates the memory, initializes Netlink message
> headers
> >   * and set unique TC message handle.
> > @@ -2323,22 +2632,54 @@ struct pedit_parser {
> >  	struct mlx5_flow *dev_flow;
> >  	struct nlmsghdr *nlh;
> >  	struct tcmsg *tcm;
> > +	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
> > +	uint8_t *sp, *tun = NULL;
> >
> >  	size += flow_tcf_get_items_and_size(attr, items, item_flags);
> > -	size += flow_tcf_get_actions_and_size(actions, action_flags);
> > -	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
> > +	size += flow_tcf_get_actions_and_size(actions, action_flags,
> &encap);
> > +	dev_flow = rte_zmalloc(__func__, size,
> > +			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
> > +				(size_t)MNL_ALIGNTO));
> 
> Why RTE_MAX between the two? Note that it is alignment for start address
> of the memory and the minimum alignment is cacheline size. On x86, non-
> zero value less than 64 will have same result as 64.

OK. Thanks for the note.
The structure alignments are not expected to exceed the cache line size.
So? Should I just specify zero?
> 
> >  	if (!dev_flow) {
> >  		rte_flow_error_set(error, ENOMEM,
> >  				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> NULL,
> >  				   "not enough memory to create E-Switch
> flow");
> >  		return NULL;
> >  	}
> > -	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
> > +	sp = (uint8_t *)(dev_flow + 1);
> > +	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
> > +		tun = sp;
> > +		sp += RTE_ALIGN_CEIL
> > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > +			MNL_ALIGNTO);
> 
> And why should it be aligned? 

The Netlink message should be aligned. It follows the mlx5_flow_tcf_vxlan_encap structure,
which is why the pointer is aligned.

> As the size of dev_flow might not be aligned, it
> is meaningless, isn't it? If you think it must be aligned for better performance
> (not much anyway), you can use __rte_aligned(MNL_ALIGNTO) on the struct
Hm. Where can we use __rte_aligned? Could you clarify, please?

> definition but not for mlx5_flow (it's not only for tcf, have to do it manually).
> 
> > +		size -= RTE_ALIGN_CEIL
> > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > +			MNL_ALIGNTO);
> 
> Don't you have to subtract sizeof(struct mlx5_flow) as well? But like I
> mentioned, if '.nlsize' below isn't needed, you don't need to have this
> calculation either.
Yes, it is a bug. It should be fixed. Thank you.
Let's discuss whether we can keep the nlsize check under the NDEBUG switch.

> 
> > +		encap.hdr.type =
> MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
> > +		memcpy(tun, &encap,
> > +		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
> > +	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
> > +		tun = sp;
> > +		sp += RTE_ALIGN_CEIL
> > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > +			MNL_ALIGNTO);
> > +		size -= RTE_ALIGN_CEIL
> > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > +			MNL_ALIGNTO);
> > +		encap.hdr.type =
> MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
> > +		memcpy(tun, &encap,
> > +		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
> > +	}
> > +	nlh = mnl_nlmsg_put_header(sp);
> >  	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
> >  	*dev_flow = (struct mlx5_flow){
> >  		.tcf = (struct mlx5_flow_tcf){
> > +			.nlsize = size,
> >  			.nlh = nlh,
> >  			.tcm = tcm,
> > +			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
> > +			.item_flags = *item_flags,
> > +			.action_flags = *action_flags,
> >  		},
> >  	};
> >  	/*
> > @@ -2392,6 +2733,7 @@ struct pedit_parser {
> >  		const struct rte_flow_item_ipv6 *ipv6;
> >  		const struct rte_flow_item_tcp *tcp;
> >  		const struct rte_flow_item_udp *udp;
> > +		const struct rte_flow_item_vxlan *vxlan;
> >  	} spec, mask;
> >  	union {
> >  		const struct rte_flow_action_port_id *port_id; @@ -2402,6
> +2744,14
> > @@ struct pedit_parser {
> >  		const struct rte_flow_action_of_set_vlan_pcp *
> >  			of_set_vlan_pcp;
> >  	} conf;
> > +	union {
> > +		struct mlx5_flow_tcf_tunnel_hdr *hdr;
> > +		struct mlx5_flow_tcf_vxlan_decap *vxlan;
> > +	} decap;
> > +	union {
> > +		struct mlx5_flow_tcf_tunnel_hdr *hdr;
> > +		struct mlx5_flow_tcf_vxlan_encap *vxlan;
> > +	} encap;
> >  	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
> >  	struct nlmsghdr *nlh = dev_flow->tcf.nlh;
> >  	struct tcmsg *tcm = dev_flow->tcf.tcm; @@ -2418,6 +2768,12 @@
> struct
> > pedit_parser {
> >
> >  	claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
> >  						PTOI_TABLE_SZ_MAX(dev)));
> > +	encap.hdr = NULL;
> > +	decap.hdr = NULL;
> > +	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_ENCAP)
> 
> dev_flow->flow->actions already has it.
> 
> > +		encap.vxlan = dev_flow->tcf.vxlan_encap;
> > +	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_DECAP)
> > +		decap.vxlan = dev_flow->tcf.vxlan_decap;
> >  	nlh = dev_flow->tcf.nlh;
> >  	tcm = dev_flow->tcf.tcm;
> >  	/* Prepare API must have been called beforehand. */ @@ -2435,7
> > +2791,6 @@ struct pedit_parser {
> >  		mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
> >  	mnl_attr_put_strz(nlh, TCA_KIND, "flower");
> >  	na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
> > -	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS,
> TCA_CLS_FLAGS_SKIP_SW);
> 
> Why do you move it? You anyway can know if it is vxlan decap at this point.
> 
> >  	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
> >  		unsigned int i;
> >
> > @@ -2479,6 +2834,12 @@ struct pedit_parser {
> >  						 spec.eth->type);
> >  				eth_type_set = 1;
> >  			}
> > +			/*
> > +			 * L2 addresses/masks should  be sent anyway,
> > +			 * including VXLAN encap/decap cases, sometimes
> 
> "sometimes" sounds like a bug. Did you figure out why it is inconsistent?
> 
> > +			 * kernel returns an error if no L2 address
> > +			 * provided and skip_sw flag is set
> > +			 */
> >  			if (!is_zero_ether_addr(&mask.eth->dst)) {
> >  				mnl_attr_put(nlh,
> TCA_FLOWER_KEY_ETH_DST,
> >  					     ETHER_ADDR_LEN,
> > @@ -2495,8 +2856,19 @@ struct pedit_parser {
> >  					     ETHER_ADDR_LEN,
> >  					     mask.eth->src.addr_bytes);
> >  			}
> > -			break;
> > +			if (decap.hdr) {
> > +				DRV_LOG(INFO,
> > +				"ethernet addresses are treated "
> > +				"as inner ones for tunnel decapsulation");
> > +			}
> 
> I know there's no enc_[src|dst]_mac in tc flower command but can you
> clarify more about this? Let me take an example.
> 
>   flow create 1 ingress transfer
>     pattern eth src is 66:77:88:99:aa:bb
>       dst is 00:11:22:33:44:55 / ipv4 src is 2.2.2.2 dst is 1.1.1.1 /
>       udp src is 4789 dst is 4242 / vxlan vni is 0x112233 / end
>     actions vxlan_decap / port_id id 2 / end
> 
> In this case, will the mac addrs specified above be regarded as inner mac
> addrs?
> That sounds very weird. If inner mac addrs have to be specified it should be:

> 
>   flow create 1 ingress transfer
>     pattern eth / ipv4 src is 2.2.2.2 dst is 1.1.1.1 /
>       udp src is 4789 dst is 4242 / vxlan vni is 0x112233 /
>       eth src is 66:77:88:99:aa:bb dst is 00:11:22:33:44:55 / end
>     actions vxlan_decap / port_id id 2 / end
> 
> Isn't it?

Hm. I inherited Adrien's approach, but now it seems it is not correct.
I think we should change vxlan_decap to match on inner MACs.
 
> Also, I hope it to be in validate() as well.
> 
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > +		break;
> >  		case RTE_FLOW_ITEM_TYPE_VLAN:
> > +			if (encap.hdr || decap.hdr)
> > +				return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ITEM,
> NULL,
> > +					  "outer VLAN is not "
> > +					  "supported for tunnels");
> 
> This should be moved to validate(). And the error message sounds like inner
> vlan is allowed, doesn't it? Even if it is moved to validate(), there's no way to
> distinguish between outer vlan and inner vlan in your code. A bit confusing.
> Please clarify.
Check for tunnel flags? OK.
> 
> >  			item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
> >  			mask.vlan = flow_tcf_item_mask
> >  				(items, &rte_flow_item_vlan_mask, @@ -
> 2528,6 +2900,7 @@ struct
> > pedit_parser {
> >  						 rte_be_to_cpu_16
> >  						 (spec.vlan->tci &
> >  						  RTE_BE16(0x0fff)));
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ITEM_TYPE_IPV4:
> >  			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
> @@ -2538,36 +2911,53
> > @@ struct pedit_parser {
> >  				 sizeof(flow_tcf_mask_supported.ipv4),
> >  				 error);
> >  			assert(mask.ipv4);
> > -						 vlan_present ?
> > -
> TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> > -
> TCA_FLOWER_KEY_ETH_TYPE,
> > -						 RTE_BE16(ETH_P_IP));
> > -			eth_type_set = 1;
> > -			vlan_eth_type_set = 1;
> > -			if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
> > -				break;
> >  			spec.ipv4 = items->spec;
> > -			if (mask.ipv4->hdr.next_proto_id) {
> > -				mnl_attr_put_u8(nlh,
> TCA_FLOWER_KEY_IP_PROTO,
> > +			if (!decap.vxlan) {
> > +				if (!eth_type_set || !vlan_eth_type_set) {
> > +					mnl_attr_put_u16(nlh,
> > +						vlan_present ?
> > +
> 	TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> > +
> 	TCA_FLOWER_KEY_ETH_TYPE,
> > +						RTE_BE16(ETH_P_IP));
> > +				}
> > +				eth_type_set = 1;
> > +				vlan_eth_type_set = 1;
> > +				if (mask.ipv4 ==
> &flow_tcf_mask_empty.ipv4)
> > +					break;
> > +				if (mask.ipv4->hdr.next_proto_id) {
> > +					mnl_attr_put_u8
> > +						(nlh,
> TCA_FLOWER_KEY_IP_PROTO,
> >  						spec.ipv4-
> >hdr.next_proto_id);
> > -				ip_proto_set = 1;
> > +					ip_proto_set = 1;
> > +				}
> > +			} else {
> > +				assert(mask.ipv4 !=
> &flow_tcf_mask_empty.ipv4);
> >  			}
> >  			if (mask.ipv4->hdr.src_addr) {
> > -				mnl_attr_put_u32(nlh,
> TCA_FLOWER_KEY_IPV4_SRC,
> > -						 spec.ipv4->hdr.src_addr);
> > -				mnl_attr_put_u32(nlh,
> > -
> TCA_FLOWER_KEY_IPV4_SRC_MASK,
> > -						 mask.ipv4->hdr.src_addr);
> > +				mnl_attr_put_u32
> > +					(nlh, decap.vxlan ?
> > +					 TCA_FLOWER_KEY_ENC_IPV4_SRC :
> > +					 TCA_FLOWER_KEY_IPV4_SRC,
> > +					 spec.ipv4->hdr.src_addr);
> > +				mnl_attr_put_u32
> > +					(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
> > +
> TCA_FLOWER_KEY_IPV4_SRC_MASK,
> > +					 mask.ipv4->hdr.src_addr);
> >  			}
> >  			if (mask.ipv4->hdr.dst_addr) {
> > -				mnl_attr_put_u32(nlh,
> TCA_FLOWER_KEY_IPV4_DST,
> > -						 spec.ipv4->hdr.dst_addr);
> > -				mnl_attr_put_u32(nlh,
> > -
> TCA_FLOWER_KEY_IPV4_DST_MASK,
> > -						 mask.ipv4->hdr.dst_addr);
> > +				mnl_attr_put_u32
> > +					(nlh, decap.vxlan ?
> > +					 TCA_FLOWER_KEY_ENC_IPV4_DST :
> > +					 TCA_FLOWER_KEY_IPV4_DST,
> > +					 spec.ipv4->hdr.dst_addr);
> > +				mnl_attr_put_u32
> > +					(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
> > +
> TCA_FLOWER_KEY_IPV4_DST_MASK,
> > +					 mask.ipv4->hdr.dst_addr);
> >  			}
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ITEM_TYPE_IPV6:
> >  			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
> @@ -2578,38 +2968,53
> > @@ struct pedit_parser {
> >  				 sizeof(flow_tcf_mask_supported.ipv6),
> >  				 error);
> >  			assert(mask.ipv6);
> > -			if (!eth_type_set || !vlan_eth_type_set)
> > -				mnl_attr_put_u16(nlh,
> > -						 vlan_present ?
> > -
> TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> > -
> TCA_FLOWER_KEY_ETH_TYPE,
> > -						 RTE_BE16(ETH_P_IPV6));
> > -			eth_type_set = 1;
> > -			vlan_eth_type_set = 1;
> > -			if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
> > -				break;
> >  			spec.ipv6 = items->spec;
> > -			if (mask.ipv6->hdr.proto) {
> > -				mnl_attr_put_u8(nlh,
> TCA_FLOWER_KEY_IP_PROTO,
> > -						spec.ipv6->hdr.proto);
> > -				ip_proto_set = 1;
> > +			if (!decap.vxlan) {
> > +				if (!eth_type_set || !vlan_eth_type_set) {
> > +					mnl_attr_put_u16(nlh,
> > +						vlan_present ?
> > +
> 	TCA_FLOWER_KEY_VLAN_ETH_TYPE :
> > +
> 	TCA_FLOWER_KEY_ETH_TYPE,
> > +						RTE_BE16(ETH_P_IPV6));
> > +				}
> > +				eth_type_set = 1;
> > +				vlan_eth_type_set = 1;
> > +				if (mask.ipv6 ==
> &flow_tcf_mask_empty.ipv6)
> > +					break;
> > +				if (mask.ipv6->hdr.proto) {
> > +					mnl_attr_put_u8
> > +						(nlh,
> TCA_FLOWER_KEY_IP_PROTO,
> > +						 spec.ipv6->hdr.proto);
> > +					ip_proto_set = 1;
> > +				}
> > +			} else {
> > +				assert(mask.ipv6 !=
> &flow_tcf_mask_empty.ipv6);
> >  			}
> >  			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6-
> >hdr.src_addr)) {
> > -				mnl_attr_put(nlh,
> TCA_FLOWER_KEY_IPV6_SRC,
> > +				mnl_attr_put(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_IPV6_SRC :
> > +					     TCA_FLOWER_KEY_IPV6_SRC,
> >  					     sizeof(spec.ipv6->hdr.src_addr),
> >  					     spec.ipv6->hdr.src_addr);
> > -				mnl_attr_put(nlh,
> TCA_FLOWER_KEY_IPV6_SRC_MASK,
> > +				mnl_attr_put(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
> > +
> TCA_FLOWER_KEY_IPV6_SRC_MASK,
> >  					     sizeof(mask.ipv6->hdr.src_addr),
> >  					     mask.ipv6->hdr.src_addr);
> >  			}
> >  			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6-
> >hdr.dst_addr)) {
> > -				mnl_attr_put(nlh,
> TCA_FLOWER_KEY_IPV6_DST,
> > +				mnl_attr_put(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_IPV6_DST :
> > +					     TCA_FLOWER_KEY_IPV6_DST,
> >  					     sizeof(spec.ipv6->hdr.dst_addr),
> >  					     spec.ipv6->hdr.dst_addr);
> > -				mnl_attr_put(nlh,
> TCA_FLOWER_KEY_IPV6_DST_MASK,
> > +				mnl_attr_put(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
> > +
> TCA_FLOWER_KEY_IPV6_DST_MASK,
> >  					     sizeof(mask.ipv6->hdr.dst_addr),
> >  					     mask.ipv6->hdr.dst_addr);
> >  			}
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ITEM_TYPE_UDP:
> >  			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
> @@ -2620,26 +3025,44
> > @@ struct pedit_parser {
> >  				 sizeof(flow_tcf_mask_supported.udp),
> >  				 error);
> >  			assert(mask.udp);
> > -			if (!ip_proto_set)
> > -				mnl_attr_put_u8(nlh,
> TCA_FLOWER_KEY_IP_PROTO,
> > -						IPPROTO_UDP);
> > -			if (mask.udp == &flow_tcf_mask_empty.udp)
> > -				break;
> >  			spec.udp = items->spec;
> > +			if (!decap.vxlan) {
> > +				if (!ip_proto_set)
> > +					mnl_attr_put_u8
> > +						(nlh,
> TCA_FLOWER_KEY_IP_PROTO,
> > +						IPPROTO_UDP);
> > +				if (mask.udp == &flow_tcf_mask_empty.udp)
> > +					break;
> > +			} else {
> > +				assert(mask.udp !=
> &flow_tcf_mask_empty.udp);
> > +				decap.vxlan->udp_port
> > +					= RTE_BE16(spec.udp->hdr.dst_port);
> 
> Use rte_cpu_to_be_*() instead. And (=) sign has to be moved up.
> 
> > +			}
> >  			if (mask.udp->hdr.src_port) {
> > -				mnl_attr_put_u16(nlh,
> TCA_FLOWER_KEY_UDP_SRC,
> > -						 spec.udp->hdr.src_port);
> > -				mnl_attr_put_u16(nlh,
> > -
> TCA_FLOWER_KEY_UDP_SRC_MASK,
> > -						 mask.udp->hdr.src_port);
> > +				mnl_attr_put_u16
> > +					(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
> > +					 TCA_FLOWER_KEY_UDP_SRC,
> > +					 spec.udp->hdr.src_port);
> > +				mnl_attr_put_u16
> > +					(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
> > +
> TCA_FLOWER_KEY_UDP_SRC_MASK,
> > +					 mask.udp->hdr.src_port);
> >  			}
> >  			if (mask.udp->hdr.dst_port) {
> > -				mnl_attr_put_u16(nlh,
> TCA_FLOWER_KEY_UDP_DST,
> > -						 spec.udp->hdr.dst_port);
> > -				mnl_attr_put_u16(nlh,
> > -
> TCA_FLOWER_KEY_UDP_DST_MASK,
> > -						 mask.udp->hdr.dst_port);
> > +				mnl_attr_put_u16
> > +					(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
> > +					 TCA_FLOWER_KEY_UDP_DST,
> > +					 spec.udp->hdr.dst_port);
> > +				mnl_attr_put_u16
> > +					(nlh, decap.vxlan ?
> > +
> TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
> > +
> TCA_FLOWER_KEY_UDP_DST_MASK,
> > +					 mask.udp->hdr.dst_port);
> >  			}
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ITEM_TYPE_TCP:
> >  			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
> @@ -2682,7 +3105,15 @@
> > struct pedit_parser {
> >  					 rte_cpu_to_be_16
> >  						(mask.tcp->hdr.tcp_flags));
> >  			}
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> > +		case RTE_FLOW_ITEM_TYPE_VXLAN:
> > +			assert(decap.vxlan);
> > +			spec.vxlan = items->spec;
> > +			mnl_attr_put_u32(nlh,
> > +					 TCA_FLOWER_KEY_ENC_KEY_ID,
> > +					 vxlan_vni_as_be32(spec.vxlan-
> >vni));
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  		default:
> >  			return rte_flow_error_set(error, ENOTSUP,
> >
> RTE_FLOW_ERROR_TYPE_ITEM,
> > @@ -2715,6 +3146,14 @@ struct pedit_parser {
> >  			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
> >  			na_act = mnl_attr_nest_start(nlh,
> TCA_ACT_OPTIONS);
> >  			assert(na_act);
> > +			if (encap.hdr) {
> > +				assert(dev_flow->tcf.tunnel);
> > +				dev_flow->tcf.tunnel->ifindex_ptr =
> > +					&((struct tc_mirred *)
> > +					mnl_attr_get_payload
> > +					(mnl_nlmsg_get_payload_tail
> > +						(nlh)))->ifindex;
> > +			}
> >  			mnl_attr_put(nlh, TCA_MIRRED_PARMS,
> >  				     sizeof(struct tc_mirred),
> >  				     &(struct tc_mirred){
> > @@ -2724,6 +3163,7 @@ struct pedit_parser {
> >  				     });
> >  			mnl_attr_nest_end(nlh, na_act);
> >  			mnl_attr_nest_end(nlh, na_act_index);
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ACTION_TYPE_JUMP:
> >  			conf.jump = actions->conf;
> > @@ -2741,6 +3181,7 @@ struct pedit_parser {
> >  				     });
> >  			mnl_attr_nest_end(nlh, na_act);
> >  			mnl_attr_nest_end(nlh, na_act_index);
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ACTION_TYPE_DROP:
> >  			na_act_index =
> > @@ -2827,6 +3268,76 @@ struct pedit_parser {
> >  					(na_vlan_priority) =
> >  					conf.of_set_vlan_pcp->vlan_pcp;
> >  			}
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > +			break;
> > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > +			assert(decap.vxlan);
> > +			assert(dev_flow->tcf.tunnel);
> > +			dev_flow->tcf.tunnel->ifindex_ptr
> > +				= (unsigned int *)&tcm->tcm_ifindex;
> > +			na_act_index =
> > +				mnl_attr_nest_start(nlh,
> na_act_index_cur++);
> > +			assert(na_act_index);
> > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> "tunnel_key");
> > +			na_act = mnl_attr_nest_start(nlh,
> TCA_ACT_OPTIONS);
> > +			assert(na_act);
> > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > +				sizeof(struct tc_tunnel_key),
> > +				&(struct tc_tunnel_key){
> > +					.action = TC_ACT_PIPE,
> > +					.t_action =
> TCA_TUNNEL_KEY_ACT_RELEASE,
> > +					});
> > +			mnl_attr_nest_end(nlh, na_act);
> > +			mnl_attr_nest_end(nlh, na_act_index);
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > +			break;
> > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > +			assert(encap.vxlan);
> > +			na_act_index =
> > +				mnl_attr_nest_start(nlh,
> na_act_index_cur++);
> > +			assert(na_act_index);
> > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> "tunnel_key");
> > +			na_act = mnl_attr_nest_start(nlh,
> TCA_ACT_OPTIONS);
> > +			assert(na_act);
> > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > +				sizeof(struct tc_tunnel_key),
> > +				&(struct tc_tunnel_key){
> > +					.action = TC_ACT_PIPE,
> > +					.t_action =
> TCA_TUNNEL_KEY_ACT_SET,
> > +					});
> > +			if (encap.vxlan->mask &
> MLX5_FLOW_TCF_ENCAP_UDP_DST)
> > +				mnl_attr_put_u16(nlh,
> > +					 TCA_TUNNEL_KEY_ENC_DST_PORT,
> > +					 encap.vxlan->udp.dst);
> > +			if (encap.vxlan->mask &
> MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
> > +				mnl_attr_put_u32(nlh,
> > +					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
> > +					 encap.vxlan->ipv4.src);
> > +			if (encap.vxlan->mask &
> MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> > +				mnl_attr_put_u32(nlh,
> > +					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
> > +					 encap.vxlan->ipv4.dst);
> > +			if (encap.vxlan->mask &
> MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
> > +				mnl_attr_put(nlh,
> > +					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
> > +					 sizeof(encap.vxlan->ipv6.src),
> > +					 &encap.vxlan->ipv6.src);
> > +			if (encap.vxlan->mask &
> MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> > +				mnl_attr_put(nlh,
> > +					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
> > +					 sizeof(encap.vxlan->ipv6.dst),
> > +					 &encap.vxlan->ipv6.dst);
> > +			if (encap.vxlan->mask &
> MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
> > +				mnl_attr_put_u32(nlh,
> > +					 TCA_TUNNEL_KEY_ENC_KEY_ID,
> > +					 vxlan_vni_as_be32
> > +						(encap.vxlan->vxlan.vni));
> > +#ifdef TCA_TUNNEL_KEY_NO_CSUM
> > +			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM,
> 0); #endif
> 
> TCA_TUNNEL_KEY_NO_CSUM is anyway defined like others, then why do
> you treat it differently with #ifdef/#endif?

It was found that it is not defined on old kernels; compilation errors
occurred on some of our CI machines.
 
WBR, Slava

> > +			mnl_attr_nest_end(nlh, na_act);
> > +			mnl_attr_nest_end(nlh, na_act_index);
> > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  			break;
> >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> > @@ -2850,7 +3361,11 @@ struct pedit_parser {
> >  	assert(na_flower);
> >  	assert(na_flower_act);
> >  	mnl_attr_nest_end(nlh, na_flower_act);
> > +	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS,
> > +			 dev_flow->tcf.action_flags &
> MLX5_ACTION_VXLAN_DECAP
> > +			 ? 0 : TCA_CLS_FLAGS_SKIP_SW);
> >  	mnl_attr_nest_end(nlh, na_flower);
> > +	assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> >  	return 0;
> >  }
Yongseok Koh Oct. 26, 2018, 4:22 a.m. UTC | #3
On Thu, Oct 25, 2018 at 07:37:56AM -0700, Slava Ovsiienko wrote:
> > -----Original Message-----
> > From: Yongseok Koh
> > Sent: Tuesday, October 23, 2018 13:06
> > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> > Subject: Re: [PATCH v2 3/7] net/mlx5: e-switch VXLAN flow translation
> > routine
> > 
> > On Mon, Oct 15, 2018 at 02:13:31PM +0000, Viacheslav Ovsiienko wrote:
[...]
> > > @@ -2184,13 +2447,16 @@ struct pedit_parser {
> > >   *   Pointer to the list of actions.
> > >   * @param[out] action_flags
> > >   *   Pointer to the detected actions.
> > > + * @param[out] tunnel
> > > + *   Pointer to tunnel encapsulation parameters structure to fill.
> > >   *
> > >   * @return
> > >   *   Maximum size of memory for actions.
> > >   */
> > >  static int
> > >  flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
> > > -			      uint64_t *action_flags)
> > > +			      uint64_t *action_flags,
> > > +			      void *tunnel)
> > 
> > This func is to get actions and size but you are parsing and filling tunnel info
> > here. It would be better to move parsing to translate() because it anyway has
> > multiple if conditions (same as switch/case) to set TCA_TUNNEL_KEY_ENC_*
> > there.
> Do you mean call of flow_tcf_vxlan_encap_parse(actions, tunnel)?

Yes.

> OK, let's move it to translate stage. Anyway, we need to keep encap structure
> for local/neigh rules.
> 
> > 
> > >  {
> > >  	int size = 0;
> > >  	uint64_t flags = 0;
> > > @@ -2246,6 +2512,29 @@ struct pedit_parser {
> > >  				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID.
> > */
> > >  				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio.
> > */
> > >  			break;
> > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> > */
> > > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > > +			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > > +			size +=	flow_tcf_vxlan_encap_parse(actions, tunnel)
> > +
> > > +				RTE_ALIGN_CEIL /* preceding encap params.
> > */
> > > +				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > +				MNL_ALIGNTO);
> > 
> > Is it different from SZ_NLATTR_TYPE_OF(struct
> > mlx5_flow_tcf_vxlan_encap)? Or, use __rte_aligned(MNL_ALIGNTO) instead.
> 
> It is written intentionally in this form. It means that there is struct mlx5_flow_tcf_vxlan_encap 
> at the beginning of buffer. This is not the NL attribute, usage of SZ_NLATTR_TYPE_OF is
> not relevant here. Alignment is needed for the following Netlink message.

Good point. Understood.

> > 
> > > +			flags |= MLX5_ACTION_VXLAN_ENCAP;
> > > +			break;
> > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> > */
> > > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > > +			size +=	SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > > +			size +=	RTE_ALIGN_CEIL /* preceding decap params.
> > */
> > > +				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > +				MNL_ALIGNTO);
> > 
> > Same here.
> > 
> > > +			flags |= MLX5_ACTION_VXLAN_DECAP;
> > > +			break;
> > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> > > @@ -2289,6 +2578,26 @@ struct pedit_parser {  }
> > >
> > >  /**
> > > + * Convert VXLAN VNI to 32-bit integer.
> > > + *
> > > + * @param[in] vni
> > > + *   VXLAN VNI in 24-bit wire format.
> > > + *
> > > + * @return
> > > + *   VXLAN VNI as a 32-bit integer value in network endian.
> > > + */
> > > +static rte_be32_t
> > 
> > make it inline.
> OK. Missed point.
> 
> > 
> > > +vxlan_vni_as_be32(const uint8_t vni[3]) {
> > > +	rte_be32_t ret;
> > 
> > Defining ret as rte_be32_t? The return value of this func which is bswap(ret)
> > is also rte_be32_t??
> Yes. And it is directly stored in the net-endian NL attribute. 
> I've compiled and checked the listing of the function you proposed. It seems to be best, I'll take it.
> 
> > 
> > > +
> > > +	ret = vni[0];
> > > +	ret = (ret << 8) | vni[1];
> > > +	ret = (ret << 8) | vni[2];
> > > +	return RTE_BE32(ret);
> > 
> > Use rte_cpu_to_be_*() instead. But I still don't understand why you shuffle
> > bytes twice. One with shift and or and other by bswap().
> And it works. There are three bytes in very bizarre order (in NL attribute) - 0, vni[0], vni[1], vni[2].
> 
> > 
> > {
> > 	union {
> > 		uint8_t vni[4];
> > 		rte_be32_t dword;
> > 	} ret = {
> > 		.vni = { 0, vni[0], vni[1], vni[2] },
> > 	};
> > 	return ret.dword;
> > }
> > 
> > This will have the same result without extra cost.
> 
> OK. Investigated, it is the best for x86-64. Also I'm going to test it on the ARM 32,
> with various compilers, just curious.
> 
> > 
> > > +}
> > > +
> > > +/**
> > >   * Prepare a flow object for Linux TC flower. It calculates the maximum
> > size of
> > >   * memory required, allocates the memory, initializes Netlink message
> > headers
> > >   * and set unique TC message handle.
> > > @@ -2323,22 +2632,54 @@ struct pedit_parser {
> > >  	struct mlx5_flow *dev_flow;
> > >  	struct nlmsghdr *nlh;
> > >  	struct tcmsg *tcm;
> > > +	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
> > > +	uint8_t *sp, *tun = NULL;
> > >
> > >  	size += flow_tcf_get_items_and_size(attr, items, item_flags);
> > > -	size += flow_tcf_get_actions_and_size(actions, action_flags);
> > > -	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
> > > +	size += flow_tcf_get_actions_and_size(actions, action_flags,
> > &encap);
> > > +	dev_flow = rte_zmalloc(__func__, size,
> > > +			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
> > > +				(size_t)MNL_ALIGNTO));
> > 
> > Why RTE_MAX between the two? Note that it is alignment for start address
> > of the memory and the minimum alignment is cacheline size. On x86, non-
> > zero value less than 64 will have same result as 64.
> 
> OK. Thanks for note.
> It is not expected the structure alignments exceed the cache line size.
> So? Just specify zero?
> > 
> > >  	if (!dev_flow) {
> > >  		rte_flow_error_set(error, ENOMEM,
> > >  				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> > NULL,
> > >  				   "not enough memory to create E-Switch
> > flow");
> > >  		return NULL;
> > >  	}
> > > -	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
> > > +	sp = (uint8_t *)(dev_flow + 1);
> > > +	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
> > > +		tun = sp;
> > > +		sp += RTE_ALIGN_CEIL
> > > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > +			MNL_ALIGNTO);
> > 
> > And why should it be aligned? 
> 
> Netlink message should be aligned. It follows the mlx5_flow_tcf_vxlan_encap,
> that's why the pointer is aligned.

Not true. There's no requirement for nl msg buffer alignment. MNL_ALIGNTO is for
mainly size alignment. For example, checkout the source code of
mnl_nlmsg_put_header(void *buf). There's no requirement of aligning the start
address of buf. But, size of any entries (hdr, attr ...) should be aligned to
MNL_ALIGNTO(4).

> 
> As the size of dev_flow might not be aligned, it
> > is meaningless, isn't it? If you think it must be aligned for better performance
> > (not much anyway), you can use __rte_aligned(MNL_ALIGNTO) on the struct
> Hm. Where we can use __rte_aligned? Could you clarify, please.

For example,

struct mlx5_flow_tcf_tunnel_hdr {
	uint32_t type; /**< Tunnel action type. */
	unsigned int ifindex_tun; /**< Tunnel endpoint interface. */
	unsigned int ifindex_org; /**< Original dst/src interface */
	unsigned int *ifindex_ptr; /**< Interface ptr in message. */
} __rte_aligned(MNL_ALIGNTO);

A good example is the struct rte_mbuf. If this attribute is used, the size of
the struct will be aligned to the value.

If you still want to make the nl msg aligned,

	dev_flow = rte_zmalloc(..., MNL_ALIGNTO); /* anyway cacheline aligned. */
	tun = RTE_PTR_ALIGN(dev_flow + 1, MNL_ALIGNTO);
	nlh = mnl_nlmsg_put_header(tun);

with adding '__rte_aligned(MNL_ALIGNTO)' to struct mlx5_flow_tcf_vxlan_encap/decap.

Then, nlh will be aligned. You should make sure size is correctly calculated.

> 
> > definition but not for mlx5_flow (it's not only for tcf, have to do it manually).
> > 
> > > +		size -= RTE_ALIGN_CEIL
> > > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > +			MNL_ALIGNTO);
> > 
> > Don't you have to subtract sizeof(struct mlx5_flow) as well? But like I
> > mentioned, if '.nlsize' below isn't needed, you don't need to have this
> > calculation either.
> Yes, it is a bug. Should be fixed. Thank you.
> Let's discuss whether we can keep the nlsize under NDEBUG switch.

I agreed on using NDEBUG for it.

> 
> > 
> > > +		encap.hdr.type =
> > MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
> > > +		memcpy(tun, &encap,
> > > +		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
> > > +	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
> > > +		tun = sp;
> > > +		sp += RTE_ALIGN_CEIL
> > > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > +			MNL_ALIGNTO);
> > > +		size -= RTE_ALIGN_CEIL
> > > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > +			MNL_ALIGNTO);
> > > +		encap.hdr.type =
> > MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
> > > +		memcpy(tun, &encap,
> > > +		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
> > > +	}
> > > +	nlh = mnl_nlmsg_put_header(sp);
> > >  	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
> > >  	*dev_flow = (struct mlx5_flow){
> > >  		.tcf = (struct mlx5_flow_tcf){
> > > +			.nlsize = size,
> > >  			.nlh = nlh,
> > >  			.tcm = tcm,
> > > +			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
> > > +			.item_flags = *item_flags,
> > > +			.action_flags = *action_flags,
> > >  		},
> > >  	};
> > >  	/*
[...]
> > > @@ -2827,6 +3268,76 @@ struct pedit_parser {
> > >  					(na_vlan_priority) =
> > >  					conf.of_set_vlan_pcp->vlan_pcp;
> > >  			}
> > > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > > +			break;
> > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > > +			assert(decap.vxlan);
> > > +			assert(dev_flow->tcf.tunnel);
> > > +			dev_flow->tcf.tunnel->ifindex_ptr
> > > +				= (unsigned int *)&tcm->tcm_ifindex;
> > > +			na_act_index =
> > > +				mnl_attr_nest_start(nlh,
> > na_act_index_cur++);
> > > +			assert(na_act_index);
> > > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> > "tunnel_key");
> > > +			na_act = mnl_attr_nest_start(nlh,
> > TCA_ACT_OPTIONS);
> > > +			assert(na_act);
> > > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > > +				sizeof(struct tc_tunnel_key),
> > > +				&(struct tc_tunnel_key){
> > > +					.action = TC_ACT_PIPE,
> > > +					.t_action =
> > TCA_TUNNEL_KEY_ACT_RELEASE,
> > > +					});
> > > +			mnl_attr_nest_end(nlh, na_act);
> > > +			mnl_attr_nest_end(nlh, na_act_index);
> > > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > > +			break;
> > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > > +			assert(encap.vxlan);
> > > +			na_act_index =
> > > +				mnl_attr_nest_start(nlh,
> > na_act_index_cur++);
> > > +			assert(na_act_index);
> > > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> > "tunnel_key");
> > > +			na_act = mnl_attr_nest_start(nlh,
> > TCA_ACT_OPTIONS);
> > > +			assert(na_act);
> > > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > > +				sizeof(struct tc_tunnel_key),
> > > +				&(struct tc_tunnel_key){
> > > +					.action = TC_ACT_PIPE,
> > > +					.t_action =
> > TCA_TUNNEL_KEY_ACT_SET,
> > > +					});
> > > +			if (encap.vxlan->mask &
> > MLX5_FLOW_TCF_ENCAP_UDP_DST)
> > > +				mnl_attr_put_u16(nlh,
> > > +					 TCA_TUNNEL_KEY_ENC_DST_PORT,
> > > +					 encap.vxlan->udp.dst);
> > > +			if (encap.vxlan->mask &
> > MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
> > > +				mnl_attr_put_u32(nlh,
> > > +					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
> > > +					 encap.vxlan->ipv4.src);
> > > +			if (encap.vxlan->mask &
> > MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> > > +				mnl_attr_put_u32(nlh,
> > > +					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
> > > +					 encap.vxlan->ipv4.dst);
> > > +			if (encap.vxlan->mask &
> > MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
> > > +				mnl_attr_put(nlh,
> > > +					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
> > > +					 sizeof(encap.vxlan->ipv6.src),
> > > +					 &encap.vxlan->ipv6.src);
> > > +			if (encap.vxlan->mask &
> > MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> > > +				mnl_attr_put(nlh,
> > > +					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
> > > +					 sizeof(encap.vxlan->ipv6.dst),
> > > +					 &encap.vxlan->ipv6.dst);
> > > +			if (encap.vxlan->mask &
> > MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
> > > +				mnl_attr_put_u32(nlh,
> > > +					 TCA_TUNNEL_KEY_ENC_KEY_ID,
> > > +					 vxlan_vni_as_be32
> > > +						(encap.vxlan->vxlan.vni));
> > > +#ifdef TCA_TUNNEL_KEY_NO_CSUM
> > > +			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM,
> > 0); #endif
> > 
> > TCA_TUNNEL_KEY_NO_CSUM is anyway defined like others, then why do
> > you treat it differently with #ifdef/#endif?
> 
> As it was found it is not defined on old kernels, on some our CI machines
> compilation errors occurred.

In your first patch, TCA_TUNNEL_KEY_NO_CSUM is defined if there isn't
HAVE_TC_ACT_TUNNEL_KEY. Actually I'm wondering why it is different from
HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT. It looks like the following is needed -
HAVE_TCA_TUNNEL_KEY_NO_CSUM ??


	#ifdef HAVE_TC_ACT_TUNNEL_KEY

	#include <linux/tc_act/tc_tunnel_key.h>

	#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
	#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
	#endif

	#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
	#define TCA_TUNNEL_KEY_NO_CSUM 10
	#endif

	#else /* HAVE_TC_ACT_TUNNEL_KEY */


Thanks,
Yongseok
Slava Ovsiienko Oct. 26, 2018, 9:06 a.m. UTC | #4
> -----Original Message-----
> From: Yongseok Koh
> Sent: Friday, October 26, 2018 7:22
> To: Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> Subject: Re: [PATCH v2 3/7] net/mlx5: e-switch VXLAN flow translation
> routine
> 
> On Thu, Oct 25, 2018 at 07:37:56AM -0700, Slava Ovsiienko wrote:
> > > -----Original Message-----
> > > From: Yongseok Koh
> > > Sent: Tuesday, October 23, 2018 13:06
> > > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > > Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> > > Subject: Re: [PATCH v2 3/7] net/mlx5: e-switch VXLAN flow
> > > translation routine
> > >
> > > On Mon, Oct 15, 2018 at 02:13:31PM +0000, Viacheslav Ovsiienko wrote:
> [...]
> > > > @@ -2184,13 +2447,16 @@ struct pedit_parser {
> > > >   *   Pointer to the list of actions.
> > > >   * @param[out] action_flags
> > > >   *   Pointer to the detected actions.
> > > > + * @param[out] tunnel
> > > > + *   Pointer to tunnel encapsulation parameters structure to fill.
> > > >   *
> > > >   * @return
> > > >   *   Maximum size of memory for actions.
> > > >   */
> > > >  static int
> > > >  flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
> > > > -			      uint64_t *action_flags)
> > > > +			      uint64_t *action_flags,
> > > > +			      void *tunnel)
> > >
> > > This func is to get actions and size but you are parsing and filling
> > > tunnel info here. It would be better to move parsing to translate()
> > > because it anyway has multiple if conditions (same as switch/case)
> > > to set TCA_TUNNEL_KEY_ENC_* there.
> > Do you mean call of flow_tcf_vxlan_encap_parse(actions, tunnel)?
> 
> Yes.
> 
> > OK, let's move it to translate stage. Anyway, we need to keep encap
> > structure for local/neigh rules.
> >
> > >
> > > >  {
> > > >  	int size = 0;
> > > >  	uint64_t flags = 0;
> > > > @@ -2246,6 +2512,29 @@ struct pedit_parser {
> > > >  				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID.
> > > */
> > > >  				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio.
> > > */
> > > >  			break;
> > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > > > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > > > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > > > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> > > */
> > > > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > > > +			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > > > +			size +=	flow_tcf_vxlan_encap_parse(actions, tunnel)
> > > +
> > > > +				RTE_ALIGN_CEIL /* preceding encap params.
> > > */
> > > > +				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > > +				MNL_ALIGNTO);
> > >
> > > Is it different from SZ_NLATTR_TYPE_OF(struct
> > > mlx5_flow_tcf_vxlan_encap)? Or, use __rte_aligned(MNL_ALIGNTO)
> instead.
> >
> > It is written intentionally in this form. It means that there is
> > struct mlx5_flow_tcf_vxlan_encap at the beginning of buffer. This is
> > not the NL attribute, usage of SZ_NLATTR_TYPE_OF is not relevant here.
> Alignment is needed for the following Netlink message.
> 
> Good point. Understood.
> 
> > >
> > > > +			flags |= MLX5_ACTION_VXLAN_ENCAP;
> > > > +			break;
> > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > > > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > > > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > > > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> > > */
> > > > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > > > +			size +=	SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > > > +			size +=	RTE_ALIGN_CEIL /* preceding decap params.
> > > */
> > > > +				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > > +				MNL_ALIGNTO);
> > >
> > > Same here.
> > >
> > > > +			flags |= MLX5_ACTION_VXLAN_DECAP;
> > > > +			break;
> > > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> > > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> > > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> > > > @@ -2289,6 +2578,26 @@ struct pedit_parser {  }
> > > >
> > > >  /**
> > > > + * Convert VXLAN VNI to 32-bit integer.
> > > > + *
> > > > + * @param[in] vni
> > > > + *   VXLAN VNI in 24-bit wire format.
> > > > + *
> > > > + * @return
> > > > + *   VXLAN VNI as a 32-bit integer value in network endian.
> > > > + */
> > > > +static rte_be32_t
> > >
> > > make it inline.
> > OK. Missed point.
> >
> > >
> > > > +vxlan_vni_as_be32(const uint8_t vni[3]) {
> > > > +	rte_be32_t ret;
> > >
> > > Defining ret as rte_be32_t? The return value of this func which is
> > > bswap(ret) is also rte_be32_t??
> > Yes. And it is directly stored in the net-endian NL attribute.
> > I've compiled and checked the listing of the function you proposed. It
> seems to be best, I'll take it.
> >
> > >
> > > > +
> > > > +	ret = vni[0];
> > > > +	ret = (ret << 8) | vni[1];
> > > > +	ret = (ret << 8) | vni[2];
> > > > +	return RTE_BE32(ret);
> > >
> > > Use rte_cpu_to_be_*() instead. But I still don't understand why you
> > > shuffle bytes twice. One with shift and or and other by bswap().
> > And it works. There are three bytes in very bizarre order (in NL attribute) -
> 0, vni[0], vni[1], vni[2].
> >
> > >
> > > {
> > > 	union {
> > > 		uint8_t vni[4];
> > > 		rte_be32_t dword;
> > > 	} ret = {
> > > 		.vni = { 0, vni[0], vni[1], vni[2] },
> > > 	};
> > > 	return ret.dword;
> > > }
> > >
> > > This will have the same result without extra cost.
> >
> > OK. Investigated, it is the best for x86-64. Also I'm going to test it
> > on the ARM 32, with various compilers, just curious.
> >
> > >
> > > > +}
> > > > +
> > > > +/**
> > > >   * Prepare a flow object for Linux TC flower. It calculates the
> > > > maximum
> > > size of
> > > >   * memory required, allocates the memory, initializes Netlink
> > > > message
> > > headers
> > > >   * and set unique TC message handle.
> > > > @@ -2323,22 +2632,54 @@ struct pedit_parser {
> > > >  	struct mlx5_flow *dev_flow;
> > > >  	struct nlmsghdr *nlh;
> > > >  	struct tcmsg *tcm;
> > > > +	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
> > > > +	uint8_t *sp, *tun = NULL;
> > > >
> > > >  	size += flow_tcf_get_items_and_size(attr, items, item_flags);
> > > > -	size += flow_tcf_get_actions_and_size(actions, action_flags);
> > > > -	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
> > > > +	size += flow_tcf_get_actions_and_size(actions, action_flags,
> > > &encap);
> > > > +	dev_flow = rte_zmalloc(__func__, size,
> > > > +			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
> > > > +				(size_t)MNL_ALIGNTO));
> > >
> > > Why RTE_MAX between the two? Note that it is alignment for start
> > > address of the memory and the minimum alignment is cacheline size.
> > > On x86, non- zero value less than 64 will have same result as 64.
> >
> > OK. Thanks for note.
> > It is not expected the structure alignments exceed the cache line size.
> > So? Just specify zero?
> > >
> > > >  	if (!dev_flow) {
> > > >  		rte_flow_error_set(error, ENOMEM,
> > > >  				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> > > NULL,
> > > >  				   "not enough memory to create E-Switch
> > > flow");
> > > >  		return NULL;
> > > >  	}
> > > > -	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
> > > > +	sp = (uint8_t *)(dev_flow + 1);
> > > > +	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
> > > > +		tun = sp;
> > > > +		sp += RTE_ALIGN_CEIL
> > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > > +			MNL_ALIGNTO);
> > >
> > > And why should it be aligned?
> >
> > Netlink message should be aligned. It follows the
> > mlx5_flow_tcf_vxlan_encap, that's why the pointer is aligned.
> 
> Not true. There's no requirement for nl msg buffer alignment.
> MNL_ALIGNTO is for mainly size alignment. For example, checkout the
> source code of mnl_nlmsg_put_header(void *buf). There's no requirement of
> aligning the start address of buf. But, size of any entries (hdr, attr ...) should
> be aligned to MNL_ALIGNTO(4).

Formally speaking, yes. There is no explicit requirement for the header
alignment, and the entire message goes to send(), which does not care about alignment.
But not aligning the entire structure does not look like good practice.
(I lived for a long time on embedded systems with the Alignment Check
feature activated and unaligned-access compiler flags turned off;
it never took long to get a punishing exception.)

> 
> >
> > As the size of dev_flow might not be aligned, it
> > > is meaningless, isn't it? If you think it must be aligned for better
> > > performance (not much anyway), you can use
> > > __rte_aligned(MNL_ALIGNTO) on the struct
> > Hm. Where we can use __rte_aligned? Could you clarify, please.
> 
> For example,
> 
> struct mlx5_flow_tcf_tunnel_hdr {
> 	uint32_t type; /**< Tunnel action type. */
> 	unsigned int ifindex_tun; /**< Tunnel endpoint interface. */
> 	unsigned int ifindex_org; /**< Original dst/src interface */
> 	unsigned int *ifindex_ptr; /**< Interface ptr in message. */ }
> __rte_aligned(MNL_ALIGNTO);

No. tunnel_hdr should not know anything about the NL message.
It just happens that the NL message follows the tunnel_hdr
in our memory buffer. What if we would like to add some other
object after tunnel_hdr in the buffer — not an NL message?
The alignment of objects is the duty of the code which places objects into the buffer.
Objects can be very different, with different alignment requirements,
and, generally speaking, placed in arbitrary order. Why, while
declaring the tunnel_hdr structure, should we make the assumption that
it is always followed by an NL message? __rte_aligned(MNL_ALIGNTO) at the end
of tunnel_hdr is exactly an example of that inappropriate assumption.

> 
> A good example is the struct rte_mbuf. If this attribute is used, the size of the
> struct will be aligned to the value.
> 
> If you still want to make the nl msg aligned,
> 
> 	dev_flow = rte_zmalloc(..., MNL_ALIGNTO); /* anyway cacheline
> aligned. */
> 	tun = RTE_PTR_ALIGN(dev_flow + 1, MNL_ALIGNTO);
> 	nlh = mnl_nlmsg_put_header(tun);
> 
> with adding '__rte_aligned(MNL_ALIGNTO)' to struct
> mlx5_flow_tcf_vxlan_encap/decap.
> 
> Then, nlh will be aligned. You should make sure size is correctly calculated.
> 
> >
> > > definition but not for mlx5_flow (it's not only for tcf, have to do it
> manually).
> > >
> > > > +		size -= RTE_ALIGN_CEIL
> > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > > +			MNL_ALIGNTO);
> > >
> > > Don't you have to subtract sizeof(struct mlx5_flow) as well? But
> > > like I mentioned, if '.nlsize' below isn't needed, you don't need to
> > > have this calculation either.
> > Yes, it is a bug. Should be fixed. Thank you.
> > Let's discuss whether we can keep the nlsize under NDEBUG switch.
> 
> I agreed on using NDEBUG for it.
> 
> >
> > >
> > > > +		encap.hdr.type =
> > > MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
> > > > +		memcpy(tun, &encap,
> > > > +		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
> > > > +	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
> > > > +		tun = sp;
> > > > +		sp += RTE_ALIGN_CEIL
> > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > > +			MNL_ALIGNTO);
> > > > +		size -= RTE_ALIGN_CEIL
> > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > > +			MNL_ALIGNTO);
> > > > +		encap.hdr.type =
> > > MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
> > > > +		memcpy(tun, &encap,
> > > > +		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
> > > > +	}
> > > > +	nlh = mnl_nlmsg_put_header(sp);
> > > >  	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
> > > >  	*dev_flow = (struct mlx5_flow){
> > > >  		.tcf = (struct mlx5_flow_tcf){
> > > > +			.nlsize = size,
> > > >  			.nlh = nlh,
> > > >  			.tcm = tcm,
> > > > +			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
> > > > +			.item_flags = *item_flags,
> > > > +			.action_flags = *action_flags,
> > > >  		},
> > > >  	};
> > > >  	/*
> [...]
> > > > @@ -2827,6 +3268,76 @@ struct pedit_parser {
> > > >  					(na_vlan_priority) =
> > > >  					conf.of_set_vlan_pcp->vlan_pcp;
> > > >  			}
> > > > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > > > +			break;
> > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > > > +			assert(decap.vxlan);
> > > > +			assert(dev_flow->tcf.tunnel);
> > > > +			dev_flow->tcf.tunnel->ifindex_ptr
> > > > +				= (unsigned int *)&tcm->tcm_ifindex;
> > > > +			na_act_index =
> > > > +				mnl_attr_nest_start(nlh,
> > > na_act_index_cur++);
> > > > +			assert(na_act_index);
> > > > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> > > "tunnel_key");
> > > > +			na_act = mnl_attr_nest_start(nlh,
> > > TCA_ACT_OPTIONS);
> > > > +			assert(na_act);
> > > > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > > > +				sizeof(struct tc_tunnel_key),
> > > > +				&(struct tc_tunnel_key){
> > > > +					.action = TC_ACT_PIPE,
> > > > +					.t_action =
> > > TCA_TUNNEL_KEY_ACT_RELEASE,
> > > > +					});
> > > > +			mnl_attr_nest_end(nlh, na_act);
> > > > +			mnl_attr_nest_end(nlh, na_act_index);
> > > > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > > > +			break;
> > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > > > +			assert(encap.vxlan);
> > > > +			na_act_index =
> > > > +				mnl_attr_nest_start(nlh,
> > > na_act_index_cur++);
> > > > +			assert(na_act_index);
> > > > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> > > "tunnel_key");
> > > > +			na_act = mnl_attr_nest_start(nlh,
> > > TCA_ACT_OPTIONS);
> > > > +			assert(na_act);
> > > > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > > > +				sizeof(struct tc_tunnel_key),
> > > > +				&(struct tc_tunnel_key){
> > > > +					.action = TC_ACT_PIPE,
> > > > +					.t_action =
> > > TCA_TUNNEL_KEY_ACT_SET,
> > > > +					});
> > > > +			if (encap.vxlan->mask &
> > > MLX5_FLOW_TCF_ENCAP_UDP_DST)
> > > > +				mnl_attr_put_u16(nlh,
> > > > +					 TCA_TUNNEL_KEY_ENC_DST_PORT,
> > > > +					 encap.vxlan->udp.dst);
> > > > +			if (encap.vxlan->mask &
> > > MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
> > > > +				mnl_attr_put_u32(nlh,
> > > > +					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
> > > > +					 encap.vxlan->ipv4.src);
> > > > +			if (encap.vxlan->mask &
> > > MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> > > > +				mnl_attr_put_u32(nlh,
> > > > +					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
> > > > +					 encap.vxlan->ipv4.dst);
> > > > +			if (encap.vxlan->mask &
> > > MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
> > > > +				mnl_attr_put(nlh,
> > > > +					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
> > > > +					 sizeof(encap.vxlan->ipv6.src),
> > > > +					 &encap.vxlan->ipv6.src);
> > > > +			if (encap.vxlan->mask &
> > > MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> > > > +				mnl_attr_put(nlh,
> > > > +					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
> > > > +					 sizeof(encap.vxlan->ipv6.dst),
> > > > +					 &encap.vxlan->ipv6.dst);
> > > > +			if (encap.vxlan->mask &
> > > MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
> > > > +				mnl_attr_put_u32(nlh,
> > > > +					 TCA_TUNNEL_KEY_ENC_KEY_ID,
> > > > +					 vxlan_vni_as_be32
> > > > +						(encap.vxlan->vxlan.vni));
> > > > +#ifdef TCA_TUNNEL_KEY_NO_CSUM
> > > > +			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM,
> > > 0); #endif
> > >
> > > TCA_TUNNEL_KEY_NO_CSUM is anyway defined like others, then why do
> > > you treat it differently with #ifdef/#endif?
> >
> > As it was found it is not defined on old kernels, on some our CI
> > machines compilation errors occurred.
> 
> In your first patch, TCA_TUNNEL_KEY_NO_CSUM is defined if there isn't
> HAVE_TC_ACT_TUNNEL_KEY. Actually I'm wondering why it is different from
> HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT. It looks like the following is
> needed - HAVE_TCA_TUNNEL_KEY_NO_CSUM ??
> 
> 
> 	#ifdef HAVE_TC_ACT_TUNNEL_KEY
> 
> 	#include <linux/tc_act/tc_tunnel_key.h>
> 
> 	#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
> 	#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
> 	#endif
> 
> 	#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
> 	#define TCA_TUNNEL_KEY_NO_CSUM 10
> 	#endif

I think it is subject to check. Yes, we can define the "missing"
macros, but it seems the old kernel just does not know these
keys. Would a rule with these keys be accepted by the kernel?
I did not check (I have no host with an old setup to check), so
I'd prefer to exclude this not-very-significant key to lower the
risk of rule rejection.

> 
> 	#else /* HAVE_TC_ACT_TUNNEL_KEY */
> 
> 
> Thanks,
> Yongseok
Yongseok Koh Oct. 26, 2018, 10:10 p.m. UTC | #5
On Fri, Oct 26, 2018 at 02:06:53AM -0700, Slava Ovsiienko wrote:
> > -----Original Message-----
> > From: Yongseok Koh
> > Sent: Friday, October 26, 2018 7:22
> > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> > Subject: Re: [PATCH v2 3/7] net/mlx5: e-switch VXLAN flow translation
> > routine
> > 
> > On Thu, Oct 25, 2018 at 07:37:56AM -0700, Slava Ovsiienko wrote:
> > > > -----Original Message-----
> > > > From: Yongseok Koh
> > > > Sent: Tuesday, October 23, 2018 13:06
> > > > To: Slava Ovsiienko <viacheslavo@mellanox.com>
> > > > Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org
> > > > Subject: Re: [PATCH v2 3/7] net/mlx5: e-switch VXLAN flow
> > > > translation routine
> > > >
> > > > On Mon, Oct 15, 2018 at 02:13:31PM +0000, Viacheslav Ovsiienko wrote:
> > [...]
> > > > > @@ -2184,13 +2447,16 @@ struct pedit_parser {
> > > > >   *   Pointer to the list of actions.
> > > > >   * @param[out] action_flags
> > > > >   *   Pointer to the detected actions.
> > > > > + * @param[out] tunnel
> > > > > + *   Pointer to tunnel encapsulation parameters structure to fill.
> > > > >   *
> > > > >   * @return
> > > > >   *   Maximum size of memory for actions.
> > > > >   */
> > > > >  static int
> > > > >  flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
> > > > > -			      uint64_t *action_flags)
> > > > > +			      uint64_t *action_flags,
> > > > > +			      void *tunnel)
> > > >
> > > > This func is to get actions and size but you are parsing and filling
> > > > tunnel info here. It would be better to move parsing to translate()
> > > > because it anyway has multiple if conditions (same as switch/case)
> > > > to set TCA_TUNNEL_KEY_ENC_* there.
> > > Do you mean call of flow_tcf_vxlan_encap_parse(actions, tunnel)?
> > 
> > Yes.
> > 
> > > OK, let's move it to translate stage. Anyway, we need to keep encap
> > > structure for local/neigh rules.
> > >
> > > >
> > > > >  {
> > > > >  	int size = 0;
> > > > >  	uint64_t flags = 0;
> > > > > @@ -2246,6 +2512,29 @@ struct pedit_parser {
> > > > >  				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID.
> > > > */
> > > > >  				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio.
> > > > */
> > > > >  			break;
> > > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > > > > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > > > > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > > > > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> > > > */
> > > > > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > > > > +			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > > > > +			size +=	flow_tcf_vxlan_encap_parse(actions, tunnel)
> > > > +
> > > > > +				RTE_ALIGN_CEIL /* preceding encap params.
> > > > */
> > > > > +				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > > > +				MNL_ALIGNTO);
> > > >
> > > > Is it different from SZ_NLATTR_TYPE_OF(struct
> > > > mlx5_flow_tcf_vxlan_encap)? Or, use __rte_aligned(MNL_ALIGNTO)
> > instead.
> > >
> > > It is written intentionally in this form. It means that there is
> > > struct mlx5_flow_tcf_vxlan_encap at the beginning of buffer. This is
> > > not the NL attribute, usage of SZ_NLATTR_TYPE_OF is not relevant here.
> > Alignment is needed for the following Netlink message.
> > 
> > Good point. Understood.
> > 
> > > >
> > > > > +			flags |= MLX5_ACTION_VXLAN_ENCAP;
> > > > > +			break;
> > > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > > > > +			size += SZ_NLATTR_NEST + /* na_act_index. */
> > > > > +				SZ_NLATTR_STRZ_OF("tunnel_key") +
> > > > > +				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS.
> > > > */
> > > > > +				SZ_NLATTR_TYPE_OF(uint8_t);
> > > > > +			size +=	SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
> > > > > +			size +=	RTE_ALIGN_CEIL /* preceding decap params.
> > > > */
> > > > > +				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > > > +				MNL_ALIGNTO);
> > > >
> > > > Same here.
> > > >
> > > > > +			flags |= MLX5_ACTION_VXLAN_DECAP;
> > > > > +			break;
> > > > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> > > > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> > > > >  		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> > > > > @@ -2289,6 +2578,26 @@ struct pedit_parser {  }
> > > > >
> > > > >  /**
> > > > > + * Convert VXLAN VNI to 32-bit integer.
> > > > > + *
> > > > > + * @param[in] vni
> > > > > + *   VXLAN VNI in 24-bit wire format.
> > > > > + *
> > > > > + * @return
> > > > > + *   VXLAN VNI as a 32-bit integer value in network endian.
> > > > > + */
> > > > > +static rte_be32_t
> > > >
> > > > make it inline.
> > > OK. Missed point.
> > >
> > > >
> > > > > +vxlan_vni_as_be32(const uint8_t vni[3]) {
> > > > > +	rte_be32_t ret;
> > > >
> > > > Defining ret as rte_be32_t? The return value of this func which is
> > > > bswap(ret) is also rte_be32_t??
> > > Yes. And it is directly stored in the net-endian NL attribute.
> > > I've compiled and checked the listing of the function you proposed. It
> > seems to be best, I'll take it.
> > >
> > > >
> > > > > +
> > > > > +	ret = vni[0];
> > > > > +	ret = (ret << 8) | vni[1];
> > > > > +	ret = (ret << 8) | vni[2];
> > > > > +	return RTE_BE32(ret);
> > > >
> > > > Use rte_cpu_to_be_*() instead. But I still don't understand why you
> > > > shuffle bytes twice. One with shift and or and other by bswap().
> > > And it works. There are three bytes in very bizarre order (in NL attribute) -
> > 0, vni[0], vni[1], vni[2].
> > >
> > > >
> > > > {
> > > > 	union {
> > > > 		uint8_t vni[4];
> > > > 		rte_be32_t dword;
> > > > 	} ret = {
> > > > 		.vni = { 0, vni[0], vni[1], vni[2] },
> > > > 	};
> > > > 	return ret.dword;
> > > > }
> > > >
> > > > This will have the same result without extra cost.
> > >
> > > OK. Investigated, it is the best for x86-64. Also I'm going to test it
> > > on the ARM 32, with various compilers, just curious.
> > >
> > > >
> > > > > +}
> > > > > +
> > > > > +/**
> > > > >   * Prepare a flow object for Linux TC flower. It calculates the
> > > > > maximum
> > > > size of
> > > > >   * memory required, allocates the memory, initializes Netlink
> > > > > message
> > > > headers
> > > > >   * and set unique TC message handle.
> > > > > @@ -2323,22 +2632,54 @@ struct pedit_parser {
> > > > >  	struct mlx5_flow *dev_flow;
> > > > >  	struct nlmsghdr *nlh;
> > > > >  	struct tcmsg *tcm;
> > > > > +	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
> > > > > +	uint8_t *sp, *tun = NULL;
> > > > >
> > > > >  	size += flow_tcf_get_items_and_size(attr, items, item_flags);
> > > > > -	size += flow_tcf_get_actions_and_size(actions, action_flags);
> > > > > -	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
> > > > > +	size += flow_tcf_get_actions_and_size(actions, action_flags,
> > > > &encap);
> > > > > +	dev_flow = rte_zmalloc(__func__, size,
> > > > > +			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
> > > > > +				(size_t)MNL_ALIGNTO));
> > > >
> > > > Why RTE_MAX between the two? Note that it is alignment for start
> > > > address of the memory and the minimum alignment is cacheline size.
> > > > On x86, non- zero value less than 64 will have same result as 64.
> > >
> > > OK. Thanks for note.
> > > It is not expected the structure alignments exceed the cache line size.
> > > So? Just specify zero?
> > > >
> > > > >  	if (!dev_flow) {
> > > > >  		rte_flow_error_set(error, ENOMEM,
> > > > >  				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> > > > NULL,
> > > > >  				   "not enough memory to create E-Switch
> > > > flow");
> > > > >  		return NULL;
> > > > >  	}
> > > > > -	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
> > > > > +	sp = (uint8_t *)(dev_flow + 1);
> > > > > +	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
> > > > > +		tun = sp;
> > > > > +		sp += RTE_ALIGN_CEIL
> > > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > > > +			MNL_ALIGNTO);
> > > >
> > > > And why should it be aligned?
> > >
> > > Netlink message should be aligned. It follows the
> > > mlx5_flow_tcf_vxlan_encap, that's why the pointer is aligned.
> > 
> > Not true. There's no requirement for nl msg buffer alignment.
> > MNL_ALIGNTO is for mainly size alignment. For example, checkout the
> > source code of mnl_nlmsg_put_header(void *buf). There's no requirement of
> > aligning the start address of buf. But, size of any entries (hdr, attr ...) should
> > be aligned to MNL_ALIGNTO(4).
> 
> Formally speaking, yes. There is no explicit requirement for the header
> alignment. And the entire message goes to the send(), it does not care about alignment.
> But not aligning the entire structure does not look as a good practice.
> ( I had been living for a long time on embedded systems with activated
> Alignment Check feature and off unaligned access compiler flags. 
> There was not very long waiting time to get punishing exception. )

Like mentioned before, I don't have any objection for the alignment.

> > > As the size of dev_flow might not be aligned, it
> > > > is meaningless, isn't it? If you think it must be aligned for better
> > > > performance (not much anyway), you can use
> > > > __rte_aligned(MNL_ALIGNTO) on the struct
> > > Hm. Where we can use __rte_aligned? Could you clarify, please.
> > 
> > For example,
> > 
> > struct mlx5_flow_tcf_tunnel_hdr {
> > 	uint32_t type; /**< Tunnel action type. */
> > 	unsigned int ifindex_tun; /**< Tunnel endpoint interface. */
> > 	unsigned int ifindex_org; /**< Original dst/src interface */
> > 	unsigned int *ifindex_ptr; /**< Interface ptr in message. */ }
> > __rte_aligned(MNL_ALIGNTO);
> 
> No. tunnel_hdr should not know anything about NL message.
> It happens, we have the NL message follows the tunnel_hdr 
> in some our memory buf. What if we would like to add some other
> object after tunnel_hdr in buffer? Not NL message? 
> The aligment of objects is  duty of code which places objects into buffer,
> Objects can be very different, with different alignment requirements,
> and, generally speaking, placed  in arbitrary order. Why, while
> declaring the tunnel_hdr  structure,  we should make an assumption
> it is always followed by NL message? _rte_aligned(MNL_ALIGNTO) at the end
> of tunnel_hdr - is exactly an example of  that unapropriate assumption.

Yeah, I agree. That was just an example.

And my point was the original code isn't enough to achieve the alignment as the
size of dev_flow isn't aligned. You should've done something like:

	tun = RTE_PTR_ALIGN(dev_flow + 1, MNL_ALIGNTO);

In summary, if you want to make it aligned, please fix the issue I raised and
improve readability of the code.

> > A good example is the struct rte_mbuf. If this attribute is used, the size of the
> > struct will be aligned to the value.
> > 
> > If you still want to make the nl msg aligned,
> > 
> > 	dev_flow = rte_zmalloc(..., MNL_ALIGNTO); /* anyway cacheline
> > aligned. */
> > 	tun = RTE_PTR_ALIGN(dev_flow + 1, MNL_ALIGNTO);
> > 	nlh = mnl_nlmsg_put_header(tun);
> > 
> > with adding '__rte_aligned(MNL_ALIGNTO)' to struct
> > mlx5_flow_tcf_vxlan_encap/decap.
> > 
> > Then, nlh will be aligned. You should make sure size is correctly calculated.
> > 
> > >
> > > > definition but not for mlx5_flow (it's not only for tcf, have to do it
> > manually).
> > > >
> > > > > +		size -= RTE_ALIGN_CEIL
> > > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
> > > > > +			MNL_ALIGNTO);
> > > >
> > > > Don't you have to subtract sizeof(struct mlx5_flow) as well? But
> > > > like I mentioned, if '.nlsize' below isn't needed, you don't need to
> > > > have this calculation either.
> > > Yes, it is a bug. Should be fixed. Thank you.
> > > Let's discuss whether we can keep the nlsize under NDEBUG switch.
> > 
> > I agreed on using NDEBUG for it.
> > 
> > >
> > > >
> > > > > +		encap.hdr.type =
> > > > MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
> > > > > +		memcpy(tun, &encap,
> > > > > +		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
> > > > > +	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
> > > > > +		tun = sp;
> > > > > +		sp += RTE_ALIGN_CEIL
> > > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > > > +			MNL_ALIGNTO);
> > > > > +		size -= RTE_ALIGN_CEIL
> > > > > +			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
> > > > > +			MNL_ALIGNTO);
> > > > > +		encap.hdr.type =
> > > > MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
> > > > > +		memcpy(tun, &encap,
> > > > > +		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
> > > > > +	}
> > > > > +	nlh = mnl_nlmsg_put_header(sp);
> > > > >  	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
> > > > >  	*dev_flow = (struct mlx5_flow){
> > > > >  		.tcf = (struct mlx5_flow_tcf){
> > > > > +			.nlsize = size,
> > > > >  			.nlh = nlh,
> > > > >  			.tcm = tcm,
> > > > > +			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
> > > > > +			.item_flags = *item_flags,
> > > > > +			.action_flags = *action_flags,
> > > > >  		},
> > > > >  	};
> > > > >  	/*
> > [...]
> > > > > @@ -2827,6 +3268,76 @@ struct pedit_parser {
> > > > >  					(na_vlan_priority) =
> > > > >  					conf.of_set_vlan_pcp->vlan_pcp;
> > > > >  			}
> > > > > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > > > > +			break;
> > > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> > > > > +			assert(decap.vxlan);
> > > > > +			assert(dev_flow->tcf.tunnel);
> > > > > +			dev_flow->tcf.tunnel->ifindex_ptr
> > > > > +				= (unsigned int *)&tcm->tcm_ifindex;
> > > > > +			na_act_index =
> > > > > +				mnl_attr_nest_start(nlh,
> > > > na_act_index_cur++);
> > > > > +			assert(na_act_index);
> > > > > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> > > > "tunnel_key");
> > > > > +			na_act = mnl_attr_nest_start(nlh,
> > > > TCA_ACT_OPTIONS);
> > > > > +			assert(na_act);
> > > > > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > > > > +				sizeof(struct tc_tunnel_key),
> > > > > +				&(struct tc_tunnel_key){
> > > > > +					.action = TC_ACT_PIPE,
> > > > > +					.t_action =
> > > > TCA_TUNNEL_KEY_ACT_RELEASE,
> > > > > +					});
> > > > > +			mnl_attr_nest_end(nlh, na_act);
> > > > > +			mnl_attr_nest_end(nlh, na_act_index);
> > > > > +			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
> > > > > +			break;
> > > > > +		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> > > > > +			assert(encap.vxlan);
> > > > > +			na_act_index =
> > > > > +				mnl_attr_nest_start(nlh,
> > > > na_act_index_cur++);
> > > > > +			assert(na_act_index);
> > > > > +			mnl_attr_put_strz(nlh, TCA_ACT_KIND,
> > > > "tunnel_key");
> > > > > +			na_act = mnl_attr_nest_start(nlh,
> > > > TCA_ACT_OPTIONS);
> > > > > +			assert(na_act);
> > > > > +			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
> > > > > +				sizeof(struct tc_tunnel_key),
> > > > > +				&(struct tc_tunnel_key){
> > > > > +					.action = TC_ACT_PIPE,
> > > > > +					.t_action =
> > > > TCA_TUNNEL_KEY_ACT_SET,
> > > > > +					});
> > > > > +			if (encap.vxlan->mask &
> > > > MLX5_FLOW_TCF_ENCAP_UDP_DST)
> > > > > +				mnl_attr_put_u16(nlh,
> > > > > +					 TCA_TUNNEL_KEY_ENC_DST_PORT,
> > > > > +					 encap.vxlan->udp.dst);
> > > > > +			if (encap.vxlan->mask &
> > > > MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
> > > > > +				mnl_attr_put_u32(nlh,
> > > > > +					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
> > > > > +					 encap.vxlan->ipv4.src);
> > > > > +			if (encap.vxlan->mask &
> > > > MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> > > > > +				mnl_attr_put_u32(nlh,
> > > > > +					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
> > > > > +					 encap.vxlan->ipv4.dst);
> > > > > +			if (encap.vxlan->mask &
> > > > MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
> > > > > +				mnl_attr_put(nlh,
> > > > > +					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
> > > > > +					 sizeof(encap.vxlan->ipv6.src),
> > > > > +					 &encap.vxlan->ipv6.src);
> > > > > +			if (encap.vxlan->mask &
> > > > MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> > > > > +				mnl_attr_put(nlh,
> > > > > +					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
> > > > > +					 sizeof(encap.vxlan->ipv6.dst),
> > > > > +					 &encap.vxlan->ipv6.dst);
> > > > > +			if (encap.vxlan->mask &
> > > > MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
> > > > > +				mnl_attr_put_u32(nlh,
> > > > > +					 TCA_TUNNEL_KEY_ENC_KEY_ID,
> > > > > +					 vxlan_vni_as_be32
> > > > > +						(encap.vxlan->vxlan.vni));
> > > > > +#ifdef TCA_TUNNEL_KEY_NO_CSUM
> > > > > +			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM,
> > > > 0); #endif
> > > >
> > > > TCA_TUNNEL_KEY_NO_CSUM is anyway defined like others, then why do
> > > > you treat it differently with #ifdef/#endif?
> > >
> > > As it was found it is not defined on old kernels, on some our CI
> > > machines compilation errors occurred.
> > 
> > In your first patch, TCA_TUNNEL_KEY_NO_CSUM is defined if there isn't
> > HAVE_TC_ACT_TUNNEL_KEY. Actually I'm wondering why it is different from
> > HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT. It looks like the following is
> > needed - HAVE_TCA_TUNNEL_KEY_NO_CSUM ??
> > 
> > 
> > 	#ifdef HAVE_TC_ACT_TUNNEL_KEY
> > 
> > 	#include <linux/tc_act/tc_tunnel_key.h>
> > 
> > 	#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
> > 	#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
> > 	#endif
> > 
> > 	#ifndef HAVE_TCA_TUNNEL_KEY_NO_CSUM
> > 	#define TCA_TUNNEL_KEY_NO_CSUM 10
> > 	#endif
> 
> I think it is subject to check. Yes, we can define the "missing"
> macros, but it seems the old kernel just does not know these
> keys. Whether the rule with these keys is accepted by kernel?
> I did not check (have no host with old setup to check),
> I'd prefer to exclude not very significant key to lower the
> rule rejection risk. 

My question is that why the two missing macros (TCA_TUNNEL_KEY_ENC_DST_PORT and
TCA_TUNNEL_KEY_NO_CSUM) are treated differently? AFAIK, the reason for defining
it manually for old kernel is that it can be run on a different host from the
compile host. Even though the compile machine doesn't have the feature, but it
can be supported on the machine it runs on. If kernel doesn't understand it on
an old machine, an error will be returned, which is fine.

Thanks,
Yongseok

> > 
> > 	#else /* HAVE_TC_ACT_TUNNEL_KEY */
> > 
> >
diff mbox series

Patch

diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
index 0055417..660d45e 100644
--- a/drivers/net/mlx5/mlx5_flow_tcf.c
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -2094,6 +2094,265 @@  struct pedit_parser {
 }
 
 /**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_ETH entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the MAC address fields
+ * in the encapsulation parameters structure. The item must be prevalidated;
+ * no validation checks are performed by this function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_ETH entry specification.
+ * @param[in] mask
+ *   RTE_FLOW_ITEM_TYPE_ETH entry mask.
+ * @param[out] encap
+ *   Structure to fill the gathered MAC address data.
+ *
+ * @return
+ *   The size needed in the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_eth(const struct rte_flow_item_eth *spec,
+			       const struct rte_flow_item_eth *mask,
+			       struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	if (!mask || !memcmp(&mask->dst,
+			     &rte_flow_item_eth_mask.dst,
+			     sizeof(rte_flow_item_eth_mask.dst))) {
+		/*
+		 * Ethernet addresses are not supported by
+		 * tc as tunnel_key parameters. Destination
+		 * address is needed to form encap packet
+		 * header and retrieved by kernel from
+		 * implicit sources (ARP table, etc),
+		 * address masks are not supported at all.
+		 */
+		encap->eth.dst = spec->dst;
+		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_DST;
+	}
+	if (!mask || !memcmp(&mask->src,
+			     &rte_flow_item_eth_mask.src,
+			     sizeof(rte_flow_item_eth_mask.src))) {
+		/*
+		 * Ethernet addresses are not supported by
+		 * tc as tunnel_key parameters. Source ethernet
+		 * address is ignored anyway.
+		 */
+		encap->eth.src = spec->src;
+		encap->mask |= MLX5_FLOW_TCF_ENCAP_ETH_SRC;
+	}
+	/*
+	 * No space allocated for ethernet addresses within Netlink
+	 * message tunnel_key record - these ones are not
+	 * supported by tc.
+	 */
+	return 0;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_IPV4 entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV4 address fields
+ * in the encapsulation parameters structure. The item must be prevalidated;
+ * no validation checks are performed by this function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_IPV4 entry specification.
+ * @param[out] encap
+ *   Structure to fill the gathered IPV4 address data.
+ *
+ * @return
+ *   The size needed in the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_ipv4(const struct rte_flow_item_ipv4 *spec,
+				struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	encap->ipv4.dst = spec->hdr.dst_addr;
+	encap->ipv4.src = spec->hdr.src_addr;
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV4_SRC |
+		       MLX5_FLOW_TCF_ENCAP_IPV4_DST;
+	return 2 * SZ_NLATTR_TYPE_OF(uint32_t);
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_IPV6 entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the IPV6 address fields
+ * in the encapsulation parameters structure. The item must be prevalidated;
+ * no validation checks are performed by this function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_IPV6 entry specification.
+ * @param[out] encap
+ *   Structure to fill the gathered IPV6 address data.
+ *
+ * @return
+ *   The size needed in the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_ipv6(const struct rte_flow_item_ipv6 *spec,
+				struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	memcpy(encap->ipv6.dst, spec->hdr.dst_addr, sizeof(encap->ipv6.dst));
+	memcpy(encap->ipv6.src, spec->hdr.src_addr, sizeof(encap->ipv6.src));
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_IPV6_SRC |
+		       MLX5_FLOW_TCF_ENCAP_IPV6_DST;
+	return SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 2;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_UDP entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the UDP port fields
+ * in the encapsulation parameters structure. The item must be prevalidated;
+ * no validation checks are performed by this function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_UDP entry specification.
+ * @param[in] mask
+ *   RTE_FLOW_ITEM_TYPE_UDP entry mask.
+ * @param[out] encap
+ *   Structure to fill the gathered UDP port data.
+ *
+ * @return
+ *   The size needed in the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_udp(const struct rte_flow_item_udp *spec,
+			       const struct rte_flow_item_udp *mask,
+			       struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	int size = SZ_NLATTR_TYPE_OF(uint16_t);
+
+	assert(spec);
+	encap->udp.dst = spec->hdr.dst_port;
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_UDP_DST;
+	if (!mask || mask->hdr.src_port != RTE_BE16(0x0000)) {
+		encap->udp.src = spec->hdr.src_port;
+		size += SZ_NLATTR_TYPE_OF(uint16_t);
+		encap->mask |= MLX5_FLOW_TCF_ENCAP_UDP_SRC; /* was IPV4_SRC */
+	}
+	return size;
+}
+
+/**
+ * Helper function to process RTE_FLOW_ITEM_TYPE_VXLAN entry in configuration
+ * of action RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. Fills the VNI fields
+ * in the encapsulation parameters structure. The item must be prevalidated;
+ * no validation checks are performed by this function.
+ *
+ * @param[in] spec
+ *   RTE_FLOW_ITEM_TYPE_VXLAN entry specification.
+ * @param[out] encap
+ *   Structure to fill the gathered VNI address data.
+ *
+ * @return
+ *   The size needed in the Netlink message tunnel_key
+ *   parameter buffer to store the item attributes.
+ */
+static int
+flow_tcf_parse_vxlan_encap_vni(const struct rte_flow_item_vxlan *spec,
+			       struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	/* Item must be validated before. No redundant checks. */
+	assert(spec);
+	memcpy(encap->vxlan.vni, spec->vni, sizeof(encap->vxlan.vni));
+	encap->mask |= MLX5_FLOW_TCF_ENCAP_VXLAN_VNI;
+	return SZ_NLATTR_TYPE_OF(uint32_t);
+}
+
+/**
+ * Populate consolidated encapsulation object from list of pattern items.
+ *
+ * Helper function to process configuration of action such as
+ * RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP. The item list should be
+ * validated beforehand; there is no way to return a meaningful error.
+ *
+ * @param[in] action
+ *   RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP action object.
+ *   List of pattern items to gather data from.
+ * @param[out] src
+ *   Structure to fill gathered data.
+ *
+ * @return
+ *   The size of the part of the Netlink message buffer to store the item
+ *   attributes on success, zero otherwise. The mask field in
+ *   result structure reflects correctly parsed items.
+ */
+static int
+flow_tcf_vxlan_encap_parse(const struct rte_flow_action *action,
+			   struct mlx5_flow_tcf_vxlan_encap *encap)
+{
+	union {
+		const struct rte_flow_item_eth *eth;
+		const struct rte_flow_item_ipv4 *ipv4;
+		const struct rte_flow_item_ipv6 *ipv6;
+		const struct rte_flow_item_udp *udp;
+		const struct rte_flow_item_vxlan *vxlan;
+	} spec, mask;
+	const struct rte_flow_item *items;
+	int size = 0;
+
+	assert(action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP);
+	assert(action->conf);
+
+	items = ((const struct rte_flow_action_vxlan_encap *)
+					action->conf)->definition;
+	assert(items);
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			mask.eth = items->mask;
+			spec.eth = items->spec;
+			size += flow_tcf_parse_vxlan_encap_eth(spec.eth,
+							       mask.eth,
+							       encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			spec.ipv4 = items->spec;
+			size += flow_tcf_parse_vxlan_encap_ipv4(spec.ipv4,
+								encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6:
+			spec.ipv6 = items->spec;
+			size += flow_tcf_parse_vxlan_encap_ipv6(spec.ipv6,
+								encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_UDP:
+			mask.udp = items->mask;
+			spec.udp = items->spec;
+			size += flow_tcf_parse_vxlan_encap_udp(spec.udp,
+							       mask.udp,
+							       encap);
+			break;
+		case RTE_FLOW_ITEM_TYPE_VXLAN:
+			spec.vxlan = items->spec;
+			size += flow_tcf_parse_vxlan_encap_vni(spec.vxlan,
+							       encap);
+			break;
+		default:
+			assert(false);
+			DRV_LOG(WARNING,
+				"unsupported item %p type %d,"
+				" items must be validated"
+				" before flow creation",
+				(const void *)items, items->type);
+			encap->mask = 0;
+			return 0;
+		}
+	}
+	return size;
+}
+
+/**
  * Calculate maximum size of memory for flow items of Linux TC flower and
  * extract specified items.
  *
@@ -2148,7 +2407,7 @@  struct pedit_parser {
 		case RTE_FLOW_ITEM_TYPE_IPV6:
 			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
 				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
-				SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
+				SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) * 4;
 				/* dst/src IP addr and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
 			break;
@@ -2164,6 +2423,10 @@  struct pedit_parser {
 				/* dst/src port and mask. */
 			flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
 			break;
+		case RTE_FLOW_ITEM_TYPE_VXLAN:
+			size += SZ_NLATTR_TYPE_OF(uint32_t);
+			flags |= MLX5_FLOW_LAYER_VXLAN;
+			break;
 		default:
 			DRV_LOG(WARNING,
 				"unsupported item %p type %d,"
@@ -2184,13 +2447,16 @@  struct pedit_parser {
  *   Pointer to the list of actions.
  * @param[out] action_flags
  *   Pointer to the detected actions.
+ * @param[out] tunnel
+ *   Pointer to tunnel encapsulation parameters structure to fill.
  *
  * @return
  *   Maximum size of memory for actions.
  */
 static int
 flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
-			      uint64_t *action_flags)
+			      uint64_t *action_flags,
+			      void *tunnel)
 {
 	int size = 0;
 	uint64_t flags = 0;
@@ -2246,6 +2512,29 @@  struct pedit_parser {
 				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
 				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
 			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("tunnel_key") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF(uint8_t);
+			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
+			size += flow_tcf_vxlan_encap_parse(actions, tunnel) +
+				RTE_ALIGN_CEIL /* preceding encap params. */
+				(sizeof(struct mlx5_flow_tcf_vxlan_encap),
+				MNL_ALIGNTO);
+			flags |= MLX5_ACTION_VXLAN_ENCAP;
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("tunnel_key") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF(uint8_t);
+			size += SZ_NLATTR_TYPE_OF(struct tc_tunnel_key);
+			size += RTE_ALIGN_CEIL /* preceding decap params. */
+				(sizeof(struct mlx5_flow_tcf_vxlan_decap),
+				MNL_ALIGNTO);
+			flags |= MLX5_ACTION_VXLAN_DECAP;
+			break;
 		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
 		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
 		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
@@ -2289,6 +2578,26 @@  struct pedit_parser {
 }
 
 /**
+ * Convert VXLAN VNI to 32-bit integer.
+ *
+ * @param[in] vni
+ *   VXLAN VNI in 24-bit wire format.
+ *
+ * @return
+ *   VXLAN VNI as a 32-bit integer value in network endian.
+ */
+static rte_be32_t
+vxlan_vni_as_be32(const uint8_t vni[3])
+{
+	rte_be32_t ret;
+
+	ret = vni[0];
+	ret = (ret << 8) | vni[1];
+	ret = (ret << 8) | vni[2];
+	return RTE_BE32(ret);
+}
+
+/**
  * Prepare a flow object for Linux TC flower. It calculates the maximum size of
  * memory required, allocates the memory, initializes Netlink message headers
  * and set unique TC message handle.
@@ -2323,22 +2632,54 @@  struct pedit_parser {
 	struct mlx5_flow *dev_flow;
 	struct nlmsghdr *nlh;
 	struct tcmsg *tcm;
+	struct mlx5_flow_tcf_vxlan_encap encap = {.mask = 0};
+	uint8_t *sp, *tun = NULL;
 
 	size += flow_tcf_get_items_and_size(attr, items, item_flags);
-	size += flow_tcf_get_actions_and_size(actions, action_flags);
-	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
+	size += flow_tcf_get_actions_and_size(actions, action_flags, &encap);
+	dev_flow = rte_zmalloc(__func__, size,
+			RTE_MAX(alignof(struct mlx5_flow_tcf_tunnel_hdr),
+				(size_t)MNL_ALIGNTO));
 	if (!dev_flow) {
 		rte_flow_error_set(error, ENOMEM,
 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 				   "not enough memory to create E-Switch flow");
 		return NULL;
 	}
-	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
+	sp = (uint8_t *)(dev_flow + 1);
+	if (*action_flags & MLX5_ACTION_VXLAN_ENCAP) {
+		tun = sp;
+		sp += RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
+			MNL_ALIGNTO);
+		size -= RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_encap),
+			MNL_ALIGNTO);
+		encap.hdr.type = MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP;
+		memcpy(tun, &encap,
+		       sizeof(struct mlx5_flow_tcf_vxlan_encap));
+	} else if (*action_flags & MLX5_ACTION_VXLAN_DECAP) {
+		tun = sp;
+		sp += RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
+			MNL_ALIGNTO);
+		size -= RTE_ALIGN_CEIL
+			(sizeof(struct mlx5_flow_tcf_vxlan_decap),
+			MNL_ALIGNTO);
+		encap.hdr.type = MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP;
+		memcpy(tun, &encap,
+		       sizeof(struct mlx5_flow_tcf_vxlan_decap));
+	}
+	nlh = mnl_nlmsg_put_header(sp);
 	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
 	*dev_flow = (struct mlx5_flow){
 		.tcf = (struct mlx5_flow_tcf){
+			.nlsize = size,
 			.nlh = nlh,
 			.tcm = tcm,
+			.tunnel = (struct mlx5_flow_tcf_tunnel_hdr *)tun,
+			.item_flags = *item_flags,
+			.action_flags = *action_flags,
 		},
 	};
 	/*
@@ -2392,6 +2733,7 @@  struct pedit_parser {
 		const struct rte_flow_item_ipv6 *ipv6;
 		const struct rte_flow_item_tcp *tcp;
 		const struct rte_flow_item_udp *udp;
+		const struct rte_flow_item_vxlan *vxlan;
 	} spec, mask;
 	union {
 		const struct rte_flow_action_port_id *port_id;
@@ -2402,6 +2744,14 @@  struct pedit_parser {
 		const struct rte_flow_action_of_set_vlan_pcp *
 			of_set_vlan_pcp;
 	} conf;
+	union {
+		struct mlx5_flow_tcf_tunnel_hdr *hdr;
+		struct mlx5_flow_tcf_vxlan_decap *vxlan;
+	} decap;
+	union {
+		struct mlx5_flow_tcf_tunnel_hdr *hdr;
+		struct mlx5_flow_tcf_vxlan_encap *vxlan;
+	} encap;
 	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
 	struct nlmsghdr *nlh = dev_flow->tcf.nlh;
 	struct tcmsg *tcm = dev_flow->tcf.tcm;
@@ -2418,6 +2768,12 @@  struct pedit_parser {
 
 	claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
 						PTOI_TABLE_SZ_MAX(dev)));
+	encap.hdr = NULL;
+	decap.hdr = NULL;
+	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_ENCAP)
+		encap.vxlan = dev_flow->tcf.vxlan_encap;
+	if (dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_DECAP)
+		decap.vxlan = dev_flow->tcf.vxlan_decap;
 	nlh = dev_flow->tcf.nlh;
 	tcm = dev_flow->tcf.tcm;
 	/* Prepare API must have been called beforehand. */
@@ -2435,7 +2791,6 @@  struct pedit_parser {
 		mnl_attr_put_u32(nlh, TCA_CHAIN, attr->group);
 	mnl_attr_put_strz(nlh, TCA_KIND, "flower");
 	na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
-	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
 	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
 		unsigned int i;
 
@@ -2479,6 +2834,12 @@  struct pedit_parser {
 						 spec.eth->type);
 				eth_type_set = 1;
 			}
+			/*
+			 * L2 addresses/masks should be sent anyway,
+			 * including the VXLAN encap/decap cases, because
+			 * the kernel sometimes returns an error if no L2
+			 * address is provided and the skip_sw flag is set.
+			 */
 			if (!is_zero_ether_addr(&mask.eth->dst)) {
 				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
 					     ETHER_ADDR_LEN,
@@ -2495,8 +2856,19 @@  struct pedit_parser {
 					     ETHER_ADDR_LEN,
 					     mask.eth->src.addr_bytes);
 			}
-			break;
+			if (decap.hdr) {
+				DRV_LOG(INFO,
+				"ethernet addresses are treated "
+				"as inner ones for tunnel decapsulation");
+			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
 		case RTE_FLOW_ITEM_TYPE_VLAN:
+			if (encap.hdr || decap.hdr)
+				return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ITEM, NULL,
+					  "outer VLAN is not "
+					  "supported for tunnels");
 			item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
 			mask.vlan = flow_tcf_item_mask
 				(items, &rte_flow_item_vlan_mask,
@@ -2528,6 +2900,7 @@  struct pedit_parser {
 						 rte_be_to_cpu_16
 						 (spec.vlan->tci &
 						  RTE_BE16(0x0fff)));
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV4:
 			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
@@ -2538,36 +2911,53 @@  struct pedit_parser {
 				 sizeof(flow_tcf_mask_supported.ipv4),
 				 error);
 			assert(mask.ipv4);
-			if (!eth_type_set || !vlan_eth_type_set)
-				mnl_attr_put_u16(nlh,
-						 vlan_present ?
-						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
-						 TCA_FLOWER_KEY_ETH_TYPE,
-						 RTE_BE16(ETH_P_IP));
-			eth_type_set = 1;
-			vlan_eth_type_set = 1;
-			if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
-				break;
 			spec.ipv4 = items->spec;
-			if (mask.ipv4->hdr.next_proto_id) {
-				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
+			if (!decap.vxlan) {
+				if (!eth_type_set || !vlan_eth_type_set) {
+					mnl_attr_put_u16(nlh,
+						vlan_present ?
+						TCA_FLOWER_KEY_VLAN_ETH_TYPE :
+						TCA_FLOWER_KEY_ETH_TYPE,
+						RTE_BE16(ETH_P_IP));
+				}
+				eth_type_set = 1;
+				vlan_eth_type_set = 1;
+				if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
+					break;
+				if (mask.ipv4->hdr.next_proto_id) {
+					mnl_attr_put_u8
+						(nlh, TCA_FLOWER_KEY_IP_PROTO,
 						spec.ipv4->hdr.next_proto_id);
-				ip_proto_set = 1;
+					ip_proto_set = 1;
+				}
+			} else {
+				assert(mask.ipv4 != &flow_tcf_mask_empty.ipv4);
 			}
 			if (mask.ipv4->hdr.src_addr) {
-				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
-						 spec.ipv4->hdr.src_addr);
-				mnl_attr_put_u32(nlh,
-						 TCA_FLOWER_KEY_IPV4_SRC_MASK,
-						 mask.ipv4->hdr.src_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_SRC :
+					 TCA_FLOWER_KEY_IPV4_SRC,
+					 spec.ipv4->hdr.src_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK :
+					 TCA_FLOWER_KEY_IPV4_SRC_MASK,
+					 mask.ipv4->hdr.src_addr);
 			}
 			if (mask.ipv4->hdr.dst_addr) {
-				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
-						 spec.ipv4->hdr.dst_addr);
-				mnl_attr_put_u32(nlh,
-						 TCA_FLOWER_KEY_IPV4_DST_MASK,
-						 mask.ipv4->hdr.dst_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_DST :
+					 TCA_FLOWER_KEY_IPV4_DST,
+					 spec.ipv4->hdr.dst_addr);
+				mnl_attr_put_u32
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_IPV4_DST_MASK :
+					 TCA_FLOWER_KEY_IPV4_DST_MASK,
+					 mask.ipv4->hdr.dst_addr);
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_IPV6:
 			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
@@ -2578,38 +2968,53 @@  struct pedit_parser {
 				 sizeof(flow_tcf_mask_supported.ipv6),
 				 error);
 			assert(mask.ipv6);
-			if (!eth_type_set || !vlan_eth_type_set)
-				mnl_attr_put_u16(nlh,
-						 vlan_present ?
-						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
-						 TCA_FLOWER_KEY_ETH_TYPE,
-						 RTE_BE16(ETH_P_IPV6));
-			eth_type_set = 1;
-			vlan_eth_type_set = 1;
-			if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
-				break;
 			spec.ipv6 = items->spec;
-			if (mask.ipv6->hdr.proto) {
-				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
-						spec.ipv6->hdr.proto);
-				ip_proto_set = 1;
+			if (!decap.vxlan) {
+				if (!eth_type_set || !vlan_eth_type_set) {
+					mnl_attr_put_u16(nlh,
+						vlan_present ?
+						TCA_FLOWER_KEY_VLAN_ETH_TYPE :
+						TCA_FLOWER_KEY_ETH_TYPE,
+						RTE_BE16(ETH_P_IPV6));
+				}
+				eth_type_set = 1;
+				vlan_eth_type_set = 1;
+				if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
+					break;
+				if (mask.ipv6->hdr.proto) {
+					mnl_attr_put_u8
+						(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						 spec.ipv6->hdr.proto);
+					ip_proto_set = 1;
+				}
+			} else {
+				assert(mask.ipv6 != &flow_tcf_mask_empty.ipv6);
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_SRC :
+					     TCA_FLOWER_KEY_IPV6_SRC,
 					     sizeof(spec.ipv6->hdr.src_addr),
 					     spec.ipv6->hdr.src_addr);
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK :
+					     TCA_FLOWER_KEY_IPV6_SRC_MASK,
 					     sizeof(mask.ipv6->hdr.src_addr),
 					     mask.ipv6->hdr.src_addr);
 			}
 			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_DST :
+					     TCA_FLOWER_KEY_IPV6_DST,
 					     sizeof(spec.ipv6->hdr.dst_addr),
 					     spec.ipv6->hdr.dst_addr);
-				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+				mnl_attr_put(nlh, decap.vxlan ?
+					     TCA_FLOWER_KEY_ENC_IPV6_DST_MASK :
+					     TCA_FLOWER_KEY_IPV6_DST_MASK,
 					     sizeof(mask.ipv6->hdr.dst_addr),
 					     mask.ipv6->hdr.dst_addr);
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_UDP:
 			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
@@ -2620,26 +3025,44 @@  struct pedit_parser {
 				 sizeof(flow_tcf_mask_supported.udp),
 				 error);
 			assert(mask.udp);
-			if (!ip_proto_set)
-				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
-						IPPROTO_UDP);
-			if (mask.udp == &flow_tcf_mask_empty.udp)
-				break;
 			spec.udp = items->spec;
+			if (!decap.vxlan) {
+				if (!ip_proto_set)
+					mnl_attr_put_u8
+						(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						IPPROTO_UDP);
+				if (mask.udp == &flow_tcf_mask_empty.udp)
+					break;
+			} else {
+				assert(mask.udp != &flow_tcf_mask_empty.udp);
+				decap.vxlan->udp_port
+					= RTE_BE16(spec.udp->hdr.dst_port);
+			}
 			if (mask.udp->hdr.src_port) {
-				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
-						 spec.udp->hdr.src_port);
-				mnl_attr_put_u16(nlh,
-						 TCA_FLOWER_KEY_UDP_SRC_MASK,
-						 mask.udp->hdr.src_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT :
+					 TCA_FLOWER_KEY_UDP_SRC,
+					 spec.udp->hdr.src_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK :
+					 TCA_FLOWER_KEY_UDP_SRC_MASK,
+					 mask.udp->hdr.src_port);
 			}
 			if (mask.udp->hdr.dst_port) {
-				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
-						 spec.udp->hdr.dst_port);
-				mnl_attr_put_u16(nlh,
-						 TCA_FLOWER_KEY_UDP_DST_MASK,
-						 mask.udp->hdr.dst_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_DST_PORT :
+					 TCA_FLOWER_KEY_UDP_DST,
+					 spec.udp->hdr.dst_port);
+				mnl_attr_put_u16
+					(nlh, decap.vxlan ?
+					 TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK :
+					 TCA_FLOWER_KEY_UDP_DST_MASK,
+					 mask.udp->hdr.dst_port);
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ITEM_TYPE_TCP:
 			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
@@ -2682,7 +3105,17 @@  struct pedit_parser {
 					 rte_cpu_to_be_16
 						(mask.tcp->hdr.tcp_flags));
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
+		case RTE_FLOW_ITEM_TYPE_VXLAN:
+			assert(decap.vxlan);
+			item_flags |= MLX5_FLOW_LAYER_VXLAN;
+			spec.vxlan = items->spec;
+			mnl_attr_put_u32(nlh,
+					 TCA_FLOWER_KEY_ENC_KEY_ID,
+					 vxlan_vni_as_be32(spec.vxlan->vni));
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ITEM,
@@ -2715,6 +3146,14 @@  struct pedit_parser {
 			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
 			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
 			assert(na_act);
+			if (encap.hdr) {
+				assert(dev_flow->tcf.tunnel);
+				dev_flow->tcf.tunnel->ifindex_ptr =
+					&((struct tc_mirred *)
+					mnl_attr_get_payload
+					(mnl_nlmsg_get_payload_tail
+						(nlh)))->ifindex;
+			}
 			mnl_attr_put(nlh, TCA_MIRRED_PARMS,
 				     sizeof(struct tc_mirred),
 				     &(struct tc_mirred){
@@ -2724,6 +3163,7 @@  struct pedit_parser {
 				     });
 			mnl_attr_nest_end(nlh, na_act);
 			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ACTION_TYPE_JUMP:
 			conf.jump = actions->conf;
@@ -2741,6 +3181,7 @@  struct pedit_parser {
 				     });
 			mnl_attr_nest_end(nlh, na_act);
 			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			na_act_index =
@@ -2827,6 +3268,76 @@  struct pedit_parser {
 					(na_vlan_priority) =
 					conf.of_set_vlan_pcp->vlan_pcp;
 			}
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+			assert(decap.vxlan);
+			assert(dev_flow->tcf.tunnel);
+			dev_flow->tcf.tunnel->ifindex_ptr
+				= (unsigned int *)&tcm->tcm_ifindex;
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
+				sizeof(struct tc_tunnel_key),
+				&(struct tc_tunnel_key){
+					.action = TC_ACT_PIPE,
+					.t_action = TCA_TUNNEL_KEY_ACT_RELEASE,
+					});
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
+			break;
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+			assert(encap.vxlan);
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "tunnel_key");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_TUNNEL_KEY_PARMS,
+				sizeof(struct tc_tunnel_key),
+				&(struct tc_tunnel_key){
+					.action = TC_ACT_PIPE,
+					.t_action = TCA_TUNNEL_KEY_ACT_SET,
+					});
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_UDP_DST)
+				mnl_attr_put_u16(nlh,
+					 TCA_TUNNEL_KEY_ENC_DST_PORT,
+					 encap.vxlan->udp.dst);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC)
+				mnl_attr_put_u32(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+					 encap.vxlan->ipv4.src);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
+				mnl_attr_put_u32(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV4_DST,
+					 encap.vxlan->ipv4.dst);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC)
+				mnl_attr_put(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+					 sizeof(encap.vxlan->ipv6.src),
+					 &encap.vxlan->ipv6.src);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
+				mnl_attr_put(nlh,
+					 TCA_TUNNEL_KEY_ENC_IPV6_DST,
+					 sizeof(encap.vxlan->ipv6.dst),
+					 &encap.vxlan->ipv6.dst);
+			if (encap.vxlan->mask & MLX5_FLOW_TCF_ENCAP_VXLAN_VNI)
+				mnl_attr_put_u32(nlh,
+					 TCA_TUNNEL_KEY_ENC_KEY_ID,
+					 vxlan_vni_as_be32
+						(encap.vxlan->vxlan.vni));
+#ifdef TCA_TUNNEL_KEY_NO_CSUM
+			mnl_attr_put_u8(nlh, TCA_TUNNEL_KEY_NO_CSUM, 0);
+#endif
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 			break;
 		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
 		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
@@ -2850,7 +3361,11 @@  struct pedit_parser {
 	assert(na_flower);
 	assert(na_flower_act);
 	mnl_attr_nest_end(nlh, na_flower_act);
+	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS,
+			 dev_flow->tcf.action_flags & MLX5_ACTION_VXLAN_DECAP
+			 ? 0 : TCA_CLS_FLAGS_SKIP_SW);
 	mnl_attr_nest_end(nlh, na_flower);
+	assert(dev_flow->tcf.nlsize >= nlh->nlmsg_len);
 	return 0;
 }